handle regex for include / exclude

eastgenomics · Nov 7, 2023 · 5626a97 · 5626a97
1 parent 0283dc7
commit 5626a97
Show file tree

Hide file tree

Showing 2 changed files with 5 additions and 19 deletions.
diff --git a/README.md b/README.md
@@ -16,8 +16,8 @@ Runs the Illumina TSO500 local analysis app.
 - `analysis_options` (`str`) -  a string which can be passed to the ruo command line
 - `isNovaSeq` (`bool`; default: true) - passes the `-isNovaSeq` flag to the TSO500 local app for running on NovaSeq data
 - `scatter_instance` (`str`): DNAnexus instance type to use for the per sample analysis (default: `mem1_ssd1_v2_x36`)
-- `include_samples` (`str`) - comma separated string of samples to run analyses for (mutually exclusive with `exclude_samples`)
-- `exclude_samples` (`str`) - comma separated string of samples to NOT run analyses for (mutually exclusive with `include_samples`)
+- `include_samples` (`str`) - comma-separated string of sample names / regex patterns to run analyses for (mutually exclusive with `exclude_samples`)
+- `exclude_samples` (`str`) - comma-separated string of sample names / regex patterns to NOT run analyses for (mutually exclusive with `include_samples`)
 - `n_samples` (`int`) - maximum number of samples from samplesheet to run analysis on (this will take the first n sample rows from the samplesheet)
 
 ## How does this app work?
@@ -291,7 +291,7 @@ output_folder/
 - Samplesheet input is optional, if not specified the analysis app looks for the samplesheet in top level of runfolder
 - When running in scatter / gather mode, demultiplexing via the local app must always be first performed (i.e it can't be started from previously demultiplexed data and reusing fastqs) To achieve the equivalent, `-iupload_demultiplex_output=false` may be specified to not upload the output of demultiplexing from the job. When used in conjunction with `-iinclude_samples="sample1"`, this would effectively just run and output data for the given sample(s) as if the local app were just run from fastqs for a single sample
 - Jobs are launched per sample parsed from the 'Pair_ID' column, this means if the samplesheet is formatted for running in paired analysis mode, the fastqs for all samples for the given pair ID will be used for the analysis
-- samples specified to `-iinclude_samples` or `-iexclude_samples` should be specified as given in the Pair_ID column of the samplesheet
+- sample names passed to `-iinclude_samples` or `-iexclude_samples` should be given as they appear in the Pair_ID column of the samplesheet, or as regex pattern(s) which will be matched against the Pair_ID column
 - Intermediary genome vcfs found in `scatter/` are compressed before uploading to save on storage
 - All log files are gathered up before uploading and combined into tar files, there is one tar file per file from the scatter step and one from the gather step. This is to reduce the total number of files uploaded at the end.
 

diff --git a/src/code.sh b/src/code.sh
@@ -181,30 +181,16 @@ _modify_samplesheet() {
         dx-jobutil-report-error "Invalid options specified"
     fi
 
-    if [[ "$include_samples" ]] || [[ "$exclude_samples" ]]; then
-        # first ensure any samples specified to include or exclude are
-        # valid sample names from our samplesheet
-        samples=$(sed -e 's/^,\|,$//' <<< "${include_samples},${exclude_samples}")
-
-        invalid=$(while read -d',' -r line; do \
-            [[ "$sample_list" =~ $line ]] || echo "${line} "; done <<< "${samples},")
-
-        if [[ "${invalid}" ]]; then
-            echo "One or more samplenames provided to include/exclude are invalid: ${invalid}"
-            dx-jobutil-report-error "Invalid samplename specified"
-        fi
-    fi
-
     if [[ "$include_samples" ]]; then
-        # retaining rows containing only those specified for given samples
+        # retaining rows containing only those specified for given samples / patterns
         echo "-iinclude_samples specified: ${include_samples}"
         include=$(sed 's/,/|/g' <<< "$include_samples")
         sample_rows=$(awk '/'"$include"'/ {print $1}' <<< "$sample_rows")
         sample_list=$(sed -e  "s/\r//g" <<< "$sample_rows" | cut -d, -f $pair_id_col | sort | uniq)
     fi
 
     if [[ "$exclude_samples" ]]; then
-        # exclude rows containing only those specified for given samples
+        # exclude rows containing only those specified for given samples / patterns
         echo -e "-iexclude_samples specified: ${exclude_samples}"
         exclude=$(sed 's/,/|/g' <<< "$exclude_samples")
         sample_rows=$(awk '!/'"$exclude"'/ {print $1}' <<< "$sample_rows")