adds support for multiple directory paths

pnnl · Jun 22, 2018 · ad61564 · ad61564
1 parent 69df713
commit ad61564
Show file tree

Hide file tree

Showing 4 changed files with 45 additions and 26 deletions.
diff --git a/docs/usage.rst b/docs/usage.rst
@@ -26,21 +26,31 @@ we can annotate across SILVA using:
         --reference-database silva \
         mothur_sop_data
 
-.. tip::
-    The data directory can optionally be a pattern containing a wildcard,
-    such as::
 
-        hundo annotate \
-            --filter-adapters qc_references/adapters.fa.gz \
-            --filter-contaminants qc_references/phix174.fa.gz \
-            --out-dir mothur_sop_silva \
-            --database-dir annotation_references \
-            --reference-database silva \
-            'mothur_sop_data/F3D14*S20*.fastq.gz'
+The data directory can optionally be a pattern containing a wildcard,
+such as::
 
-    The string must be contained between single quotes so it isn't expanded
-    into a space delimited list.
+    hundo annotate \
+        --filter-adapters qc_references/adapters.fa.gz \
+        --filter-contaminants qc_references/phix174.fa.gz \
+        --out-dir mothur_sop_silva \
+        --database-dir annotation_references \
+        --reference-database silva \
+        'mothur_sop_data/F3D14*S20*.fastq.gz'
+
+The pattern must be enclosed in single quotes so the shell does not expand it
+into a space-delimited list.
+
+When data is spread across multiple directories, you can pass a combination
+of paths and patterns as a comma-separated list, like::
 
+    hundo annotate \
+        --filter-adapters qc_references/adapters.fa.gz \
+        --filter-contaminants qc_references/phix174.fa.gz \
+        --out-dir mothur_sop_silva \
+        --database-dir annotation_references \
+        --reference-database silva \
+        'collection1/LM_*.fastq.gz,collection2/rawdata'
 
 Dependencies are installed by default in the results directory defined
 on the command line as ``--out-dir``. If you want to re-use dependencies

diff --git a/hundo/Snakefile b/hundo/Snakefile
@@ -41,31 +41,36 @@ def count_sequences(filename):
     return sum(buf.count(b'\n') for buf in f_gen) / 4
 
 
-def get_sample_files(fastq_dir, prefilter_file_size):
-    if not fastq_dir:
+def get_sample_files(fastq_dirs, prefilter_file_size):
+    if not fastq_dirs:
         logger.error(
             (
                 "'fastq_dir' has not been set -- this directory "
                 "should contain your input FASTQs"
             )
         )
         sys.exit(1)
-    if os.path.isfile(fastq_dir):
+    if os.path.isfile(fastq_dirs):
         logger.error(
             (
-                "'fastq_dir' must be a directory or a file pattern, "
+                "'fastq_dir' must be a directory, a file pattern, or a comma "
+                "separated list of both of those things "
                 "e.g. '*.fastq', with single quotes surrounding the pattern."
             )
         )
         sys.exit(1)
-    if "*" in fastq_dir:
-        from glob import glob
-
-        logger.info("Finding samples matching %s" % fastq_dir)
-        raw_fastq_files = glob(fastq_dir)
-    else:
-        logger.info("Finding samples in %s" % fastq_dir)
-        raw_fastq_files = [os.path.join(fastq_dir, i) for i in os.listdir(fastq_dir)]
+    # grab all of the possible fastq file paths
+    # user is permitted to send a single path, a single pattern, or a
+    # comma separated list of paths and patterns
+    raw_fastq_files = list()
+    for fastq_dir in fastq_dirs.split(","):
+        if "*" in fastq_dir:
+            from glob import glob
+            logger.info("Finding samples matching %s" % fastq_dir)
+            raw_fastq_files.extend(glob(fastq_dir))
+        else:
+            logger.info("Finding samples in %s" % fastq_dir)
+            raw_fastq_files.extend([os.path.join(fastq_dir, i) for i in os.listdir(fastq_dir)])
     samples = dict()
     seen = set()
     omitted = dict()

diff --git a/hundo/__init__.py b/hundo/__init__.py
@@ -1 +1 @@
-__version__ = "1.1.15"
+__version__ = "1.1.16"
diff --git a/hundo/hundo.py b/hundo/hundo.py
@@ -523,6 +523,10 @@ def run_annotate(
         \b
         https://hundo.rtfd.io
     """
+    fq_dir = list()
+    for input_dir in fastq_dir.replace(" ", "").split(","):
+        fq_dir.append(os.path.realpath(input_dir))
+    fq_dir = ",".join(fq_dir)
     database_dir = os.path.realpath(database_dir)
     filter_adapters = os.path.realpath(filter_adapters) if filter_adapters else ""
     filter_contaminants = os.path.realpath(
@@ -565,7 +569,7 @@ def run_annotate(
         jobs=jobs,
         conda="" if no_conda else "--use-conda",
         dryrun="--dryrun" if dryrun else "",
-        fq_dir=os.path.realpath(fastq_dir),
+        fq_dir=fq_dir,
         author=author,
         threads=threads,
         database_dir=database_dir,