feat: add download with sratoolkit/fasterq-dump

SPAAM-community · Dec 18, 2023 · 54b01f2
1 parent 758ea6b
commit 54b01f2
Show file tree

Hide file tree

Showing 5 changed files with 101 additions and 67 deletions.
diff --git a/AMDirT/__init__.py b/AMDirT/__init__.py
@@ -1 +1 @@
-__version__ = "1.4.6"
+__version__ = "1.5.0"
diff --git a/AMDirT/cli.py b/AMDirT/cli.py
@@ -139,14 +139,19 @@ def viewer(ctx, no_args_is_help=True, **kwargs):
     help="Generate bash script with Aspera-based download commands for all libraries of samples in input table",
 )
 @click.option(
-    "--eager",
+    "--fetchngs",
     is_flag=True,
-    help="Convert filtered samples and libraries tables to eager input tables",
+    help="Convert filtered samples and libraries tables to nf-core/fetchngs input tables",
 )
 @click.option(
-    "--fetchngs",
+    "--sratoolkit",
     is_flag=True,
-    help="Convert filtered samples and libraries tables to nf-core/fetchngs input tables",
+    help="Generate bash script with SRA Toolkit fasterq-dump based download commands for all libraries of samples in input table",
+)
+@click.option(
+    "--eager",
+    is_flag=True,
+    help="Convert filtered samples and libraries tables to eager input tables",
 )
 @click.option(
     "--ameta",
@@ -178,26 +183,27 @@ def convert(ctx, no_args_is_help=True, **kwargs):
 # Autofill tool #
 #################
 
+
 @cli.command()
 @click.argument("accession", type=str, nargs=-1)
 @click.option(
     "-n",
-    "--table_name", 
+    "--table_name",
     type=click.Choice(get_table_list()),
-    default='ancientmetagenome-hostassociated',
-    show_default=True
+    default="ancientmetagenome-hostassociated",
+    show_default=True,
 )
 @click.option(
     "-l",
     "--library_output",
     type=click.Path(writable=True),
-    help="path to library output table file"
+    help="path to library output table file",
 )
 @click.option(
     "-s",
     "--sample_output",
     type=click.Path(writable=True),
-    help="path to sample output table file"
+    help="path to sample output table file",
 )
 @click.pass_context
 def autofill(ctx, no_args_is_help=True, **kwargs):
@@ -219,31 +225,26 @@ def autofill(ctx, no_args_is_help=True, **kwargs):
 @click.argument("dataset", type=click.Path(exists=True))
 @click.option(
     "-n",
-    "--table_name", 
+    "--table_name",
     type=click.Choice(get_table_list()),
-    default='ancientmetagenome-hostassociated',
-    show_default=True
+    default="ancientmetagenome-hostassociated",
+    show_default=True,
 )
 @click.option(
     "-t",
-    "--table_type", 
-    type=click.Choice(['samples', 'libraries']),
-    default='libraries',
-    show_default=True
-)
-@click.option(
-    "-m", 
-    "--markdown", 
-    is_flag=True, 
-    help="Output is in markdown format"
+    "--table_type",
+    type=click.Choice(["samples", "libraries"]),
+    default="libraries",
+    show_default=True,
 )
+@click.option("-m", "--markdown", is_flag=True, help="Output is in markdown format")
 @click.option(
     "-o",
     "--outdir",
     type=click.Path(writable=True),
     default=".",
     show_default=True,
-    help="path to sample output table file"
+    help="path to sample output table file",
 )
 @click.pass_context
 def merge(ctx, no_args_is_help=True, **kwargs):
@@ -255,5 +256,6 @@ def merge(ctx, no_args_is_help=True, **kwargs):
     """
     merge_new_df(**kwargs, **ctx.obj)
 
+
 if __name__ == "__main__":
     cli()
diff --git a/AMDirT/convert/__init__.py b/AMDirT/convert/__init__.py
@@ -28,6 +28,7 @@ def run_convert(
     aspera=False,
     eager=False,
     fetchngs=False,
+    sratoolkit=False,
     ameta=False,
     taxprofiler=False,
     mag=False,
@@ -65,13 +66,21 @@ def run_convert(
         supported_archives=supported_archives,
     )
 
+    accession_table = prepare_accession_table(
+        samples=samples,
+        libraries=selected_libraries,
+        table_name=table_name,
+        supported_archives=supported_archives,
+    )
+
     logger.warning(
         "We provide no warranty to the accuracy of the generated input sheets."
     )
 
     if bibliography == True:
-        logger.info("Preparing Bibtex citation file")
-        with open(f"{output}/AncientMetagenomeDir_bibliography.bib", "w") as fw:
+        bibfile = f"{output}/AncientMetagenomeDir_bibliography.bib"
+        logger.info(f"Writing Bibtex citation file to {bibfile}")
+        with open(bibfile, "w") as fw:
             fw.write(prepare_bibtex_file(samples))
 
     if table_name in ["ancientmetagenome-environmental"]:
@@ -80,86 +89,78 @@ def run_convert(
         col_drop = ["archive_accession", "sample_host"]
 
     if librarymetadata == True:
-        logger.info("Writing filtered libraries table")
+        tbl_file = f"{output}/AncientMetagenomeDir_filtered_libraries.tsv"
+        logger.info(f"Writing filtered libraries table to {tbl_file}")
         librarymetadata = selected_libraries.drop(col_drop, axis=1)
         librarymetadata.to_csv(
-            f"{output}/AncientMetagenomeDir_filtered_libraries.tsv",
+            tbl_file,
             sep="\t",
             index=False,
         )
 
     if curl == True:
-        logger.info("Writing curl download script")
-        accession_table = prepare_accession_table(
-            samples=samples,
-            libraries=selected_libraries,
-            table_name=table_name,
-            supported_archives=supported_archives,
-        )
-        with open(f"{output}/AncientMetagenomeDir_curl_download_script.sh", "w") as fw:
+        dl_file = f"{output}/AncientMetagenomeDir_curl_download_script.sh"
+        logger.info(f"Writing curl download script to {dl_file}")
+        with open(dl_file, "w") as fw:
             fw.write(accession_table["curl_script"])
 
     if aspera == True:
-        logger.info("Writing Aspera download script")
+        dl_file = f"{output}/AncientMetagenomeDir_aspera_download_script.sh"
+        logger.info(f"Writing Aspera download script to {dl_file}")
         logger.warning(
             "You will need to set the ${ASPERA_PATH} environment variable. See https://amdirt.readthedocs.io for more information."
         )
-        accession_table = prepare_accession_table(
-            samples=samples,
-            libraries=selected_libraries,
-            table_name=table_name,
-            supported_archives=supported_archives,
-        )
-        with open(
-            f"{output}/AncientMetagenomeDir_aspera_download_script.sh", "w"
-        ) as fw:
+        with open(dl_file, "w") as fw:
             fw.write(accession_table["aspera_script"])
 
     if fetchngs == True:
-        logger.info("Preparing nf-core/fetchngs table")
-        accession_table = prepare_accession_table(
-            samples=samples,
-            libraries=selected_libraries,
-            table_name=table_name,
-            supported_archives=supported_archives,
-        )
+        dl_file = f"{output}/AncientMetagenomeDir_nf_core_fetchngs_download_script.sh"
+        logger.info(f"Writing nf-core/fetchngs table to {dl_file}")
         accession_table["df"]["archive_data_accession"].to_csv(
-            f"{output}/AncientMetagenomeDir_nf_core_fetchngs_input_table.tsv",
+            dl_file,
             sep="\t",
             header=False,
             index=False,
         )
+    if sratoolkit == True:
+        dl_file = f"{output}/AncientMetagenomeDir_sratoolkit_download_script.sh"
+        logger.info(f"Writing sratoolkit/fasterq-dump download script to {dl_file}")
+        with open(dl_file, "w") as fw:
+            fw.write(accession_table["fasterq_dump_script"])
 
     if eager == True:
-        logger.info("Preparing nf-core/eager table")
+        tbl_file = f"{output}/AncientMetagenomeDir_nf_core_eager_input_table.tsv"
+        logger.info(f"Writing nf-core/eager table to {tbl_file}")
         eager_table = prepare_eager_table(
             samples=samples,
             libraries=selected_libraries,
             table_name=table_name,
             supported_archives=supported_archives,
         )
         eager_table.to_csv(
-            f"{output}/AncientMetagenomeDir_nf_core_eager_input_table.tsv",
+            tbl_file,
             sep="\t",
             index=False,
         )
 
     if taxprofiler == True:
-        logger.info("Preparing nf-core/taxprofiler table")
-        accession_table = prepare_taxprofiler_table(
+        tbl_file = f"{output}/AncientMetagenomeDir_nf_core_taxprofiler_input_table.csv"
+        logger.info(f"Writing nf-core/taxprofiler table to {tbl_file}")
+        taxprofiler_table = prepare_taxprofiler_table(
             samples=samples,
             libraries=selected_libraries,
             table_name=table_name,
             supported_archives=supported_archives,
         )
-        accession_table.to_csv(
-            f"{output}/AncientMetagenomeDir_nf_core_taxprofiler_input_table.csv",
+        taxprofiler_table.to_csv(
+            tbl_file,
             header=False,
             index=False,
         )
 
     if ameta == True:
-        logger.info("Preparing aMeta table")
+        tbl_file = f"{output}/AncientMetagenomeDir_aMeta_input_table.tsv"
+        logger.info(f"Writing aMeta table to {tbl_file}")
         logger.warning(
             "aMeta does not support pairs. You must manually merge pair-end data before using samplesheet."
         )
@@ -169,27 +170,40 @@ def run_convert(
             table_name=table_name,
             supported_archives=supported_archives,
         )
+
         aMeta_table.to_csv(
-            f"{output}/AncientMetagenomeDir_aMeta_input_table.tsv",
+            tbl_file,
             sep="\t",
             index=False,
         )
 
     if mag == True:
-        logger.info("Preparing nf-core/mag table")
+        logger.info("Preparing nf-core/mag table(s)")
         mag_table_single, mag_table_paired = prepare_mag_table(
             samples=samples,
             libraries=selected_libraries,
             table_name=table_name,
             supported_archives=supported_archives,
         )
         if not mag_table_single.empty:
+            mag_tbl_single_file = (
+                f"{output}/AncientMetagenomeDir_nf_core_mag_input_single_table.csv"
+            )
+            logger.info(
+                f"Writing nf-core/mag single-end table to {mag_tbl_single_file}"
+            )
             mag_table_single.to_csv(
-                f"{output}/AncientMetagenomeDir_nf_core_mag_input_single_table.csv",
+                mag_tbl_single_file,
                 index=False,
             )
         if not mag_table_paired.empty:
+            mag_tbl_paired_file = (
+                f"{output}/AncientMetagenomeDir_nf_core_mag_input_paired_table.csv"
+            )
+            logger.info(
+                f"Writing nf-core/mag paired-end table to {mag_tbl_paired_file}"
+            )
             mag_table_paired.to_csv(
-                f"{output}/AncientMetagenomeDir_nf_core_mag_input_paired_table.csv",
+                mag_tbl_paired_file,
                 index=False,
             )
diff --git a/AMDirT/core/__init__.py b/AMDirT/core/__init__.py
@@ -302,6 +302,7 @@ def prepare_accession_table(
 
     # Downloading with curl or aspera instead of fetchngs
     urls = set(libraries["download_links"])
+    accessions = set(libraries["archive_data_accession"])
     links = set()
     for u in urls:
         for s in u.split(";"):
@@ -321,11 +322,15 @@ def prepare_accession_table(
         )
         + "\n"
     )
+    fasterq_dump_script = (
+        "\n".join([f"fasterq-dump --split-files -p {a}" for a in accessions]) + "\n"
+    )
 
     return {
         "df": libraries[["archive_data_accession", "download_sizes"]].drop_duplicates(),
         "curl_script": dl_script_header + curl_script,
         "aspera_script": dl_script_header + aspera_script,
+        "fasterq_dump_script": dl_script_header + fasterq_dump_script,
     }
 
 

diff --git a/AMDirT/viewer/streamlit.py b/AMDirT/viewer/streamlit.py
@@ -77,10 +77,11 @@ def parse_args():
     options = ["No table selected"] + list(samples.keys())
     st.session_state.table_name = st.selectbox(label="Select a table", options=options)
     st.session_state.height = st.selectbox(
-        "Number of rows to display", (10, 20, 50, 100, 200), index=2
+        "Number of rows to display", (10, 20, 50, 100, 200), index=1
     )
     st.session_state.dl_method = st.selectbox(
-        label="Data download method", options=["curl", "nf-core/fetchngs", "aspera"]
+        label="Data download method",
+        options=["curl", "nf-core/fetchngs", "aspera", "sratookit"],
     )
     if st.session_state.dl_method == "aspera":
         st.warning(
@@ -319,6 +320,18 @@ def parse_args():
                             )["aspera_script"],
                             file_name="AncientMetagenomeDir_aspera_download_script.sh",
                         )
+                    elif st.session_state.dl_method == "sratookit":
+                        st.download_button(
+                            label="Download SRAtoolkit/fasterq-dump sample download script",
+                            help=f"approx. {total_size_str} of sequencing data selected",
+                            data=prepare_accession_table(
+                                pd.DataFrame(df_mod["selected_rows"]),
+                                lib_mod,
+                                st.session_state.table_name,
+                                supported_archives,
+                            )["fasterq_dump_script"],
+                            file_name="AncientMetagenomeDir_sratoolkit_download_script.sh",
+                        )
                     else:
                         st.download_button(
                             label="Download Curl sample download script",