From 54b01f211a143dee911615571fd37ccae19b2b6a Mon Sep 17 00:00:00 2001 From: maxibor Date: Mon, 18 Dec 2023 13:58:58 +0100 Subject: [PATCH] feat: add download with sratoolkit/fasterq-dump --- AMDirT/__init__.py | 2 +- AMDirT/cli.py | 48 ++++++++++--------- AMDirT/convert/__init__.py | 96 ++++++++++++++++++++++---------------- AMDirT/core/__init__.py | 5 ++ AMDirT/viewer/streamlit.py | 17 ++++++- 5 files changed, 101 insertions(+), 67 deletions(-) diff --git a/AMDirT/__init__.py b/AMDirT/__init__.py index bde0031..5b60188 100644 --- a/AMDirT/__init__.py +++ b/AMDirT/__init__.py @@ -1 +1 @@ -__version__ = "1.4.6" +__version__ = "1.5.0" diff --git a/AMDirT/cli.py b/AMDirT/cli.py index 74562f4..f543579 100644 --- a/AMDirT/cli.py +++ b/AMDirT/cli.py @@ -139,14 +139,19 @@ def viewer(ctx, no_args_is_help=True, **kwargs): help="Generate bash script with Aspera-based download commands for all libraries of samples in input table", ) @click.option( - "--eager", + "--fetchngs", is_flag=True, - help="Convert filtered samples and libraries tables to eager input tables", + help="Convert filtered samples and libraries tables to nf-core/fetchngs input tables", ) @click.option( - "--fetchngs", + "--sratoolkit", is_flag=True, - help="Convert filtered samples and libraries tables to nf-core/fetchngs input tables", + help="Generate bash script with SRA Toolkit fasterq-dump based download commands for all libraries of samples in input table", +) +@click.option( + "--eager", + is_flag=True, + help="Convert filtered samples and libraries tables to eager input tables", ) @click.option( "--ameta", @@ -178,26 +183,27 @@ def convert(ctx, no_args_is_help=True, **kwargs): # Autofill tool # ################# + @cli.command() @click.argument("accession", type=str, nargs=-1) @click.option( "-n", - "--table_name", + "--table_name", type=click.Choice(get_table_list()), - default='ancientmetagenome-hostassociated', - show_default=True + default="ancientmetagenome-hostassociated", + show_default=True, 
) @click.option( "-l", "--library_output", type=click.Path(writable=True), - help="path to library output table file" + help="path to library output table file", ) @click.option( "-s", "--sample_output", type=click.Path(writable=True), - help="path to sample output table file" + help="path to sample output table file", ) @click.pass_context def autofill(ctx, no_args_is_help=True, **kwargs): @@ -219,31 +225,26 @@ def autofill(ctx, no_args_is_help=True, **kwargs): @click.argument("dataset", type=click.Path(exists=True)) @click.option( "-n", - "--table_name", + "--table_name", type=click.Choice(get_table_list()), - default='ancientmetagenome-hostassociated', - show_default=True + default="ancientmetagenome-hostassociated", + show_default=True, ) @click.option( "-t", - "--table_type", - type=click.Choice(['samples', 'libraries']), - default='libraries', - show_default=True -) -@click.option( - "-m", - "--markdown", - is_flag=True, - help="Output is in markdown format" + "--table_type", + type=click.Choice(["samples", "libraries"]), + default="libraries", + show_default=True, ) +@click.option("-m", "--markdown", is_flag=True, help="Output is in markdown format") @click.option( "-o", "--outdir", type=click.Path(writable=True), default=".", show_default=True, - help="path to sample output table file" + help="path to sample output table file", ) @click.pass_context def merge(ctx, no_args_is_help=True, **kwargs): @@ -255,5 +256,6 @@ def merge(ctx, no_args_is_help=True, **kwargs): """ merge_new_df(**kwargs, **ctx.obj) + if __name__ == "__main__": cli() diff --git a/AMDirT/convert/__init__.py b/AMDirT/convert/__init__.py index 4ca4565..71b11e2 100644 --- a/AMDirT/convert/__init__.py +++ b/AMDirT/convert/__init__.py @@ -28,6 +28,7 @@ def run_convert( aspera=False, eager=False, fetchngs=False, + sratoolkit=False, ameta=False, taxprofiler=False, mag=False, @@ -65,13 +66,21 @@ def run_convert( supported_archives=supported_archives, ) + accession_table = prepare_accession_table( + 
samples=samples, + libraries=selected_libraries, + table_name=table_name, + supported_archives=supported_archives, + ) + logger.warning( "We provide no warranty to the accuracy of the generated input sheets." ) if bibliography == True: - logger.info("Preparing Bibtex citation file") - with open(f"{output}/AncientMetagenomeDir_bibliography.bib", "w") as fw: + bibfile = f"{output}/AncientMetagenomeDir_bibliography.bib" + logger.info(f"Writing Bibtex citation file to {bibfile}") + with open(bibfile, "w") as fw: fw.write(prepare_bibtex_file(samples)) if table_name in ["ancientmetagenome-environmental"]: @@ -80,58 +89,48 @@ def run_convert( col_drop = ["archive_accession", "sample_host"] if librarymetadata == True: - logger.info("Writing filtered libraries table") + tbl_file = f"{output}/AncientMetagenomeDir_filtered_libraries.tsv" + logger.info(f"Writing filtered libraries table to {tbl_file}") librarymetadata = selected_libraries.drop(col_drop, axis=1) librarymetadata.to_csv( - f"{output}/AncientMetagenomeDir_filtered_libraries.tsv", + tbl_file, sep="\t", index=False, ) if curl == True: - logger.info("Writing curl download script") - accession_table = prepare_accession_table( - samples=samples, - libraries=selected_libraries, - table_name=table_name, - supported_archives=supported_archives, - ) - with open(f"{output}/AncientMetagenomeDir_curl_download_script.sh", "w") as fw: + dl_file = f"{output}/AncientMetagenomeDir_curl_download_script.sh" + logger.info(f"Writing curl download script to {dl_file}") + with open(dl_file, "w") as fw: fw.write(accession_table["curl_script"]) if aspera == True: - logger.info("Writing Aspera download script") + dl_file = f"{output}/AncientMetagenomeDir_aspera_download_script.sh" + logger.info(f"Writing Aspera download script to {dl_file}") logger.warning( "You will need to set the ${ASPERA_PATH} environment variable. See https://amdirt.readthedocs.io for more information." 
) - accession_table = prepare_accession_table( - samples=samples, - libraries=selected_libraries, - table_name=table_name, - supported_archives=supported_archives, - ) - with open( - f"{output}/AncientMetagenomeDir_aspera_download_script.sh", "w" - ) as fw: + with open(dl_file, "w") as fw: fw.write(accession_table["aspera_script"]) if fetchngs == True: - logger.info("Preparing nf-core/fetchngs table") - accession_table = prepare_accession_table( - samples=samples, - libraries=selected_libraries, - table_name=table_name, - supported_archives=supported_archives, - ) + dl_file = f"{output}/AncientMetagenomeDir_nf_core_fetchngs_input_table.tsv" + logger.info(f"Writing nf-core/fetchngs table to {dl_file}") accession_table["df"]["archive_data_accession"].to_csv( - f"{output}/AncientMetagenomeDir_nf_core_fetchngs_input_table.tsv", + dl_file, sep="\t", header=False, index=False, ) + if sratoolkit == True: + dl_file = f"{output}/AncientMetagenomeDir_sratoolkit_download_script.sh" + logger.info(f"Writing sratoolkit/fasterq-dump download script to {dl_file}") + with open(dl_file, "w") as fw: + fw.write(accession_table["fasterq_dump_script"]) if eager == True: - logger.info("Preparing nf-core/eager table") + tbl_file = f"{output}/AncientMetagenomeDir_nf_core_eager_input_table.tsv" + logger.info(f"Writing nf-core/eager table to {tbl_file}") eager_table = prepare_eager_table( samples=samples, libraries=selected_libraries, @@ -139,27 +138,29 @@ def run_convert( supported_archives=supported_archives, ) eager_table.to_csv( - f"{output}/AncientMetagenomeDir_nf_core_eager_input_table.tsv", + tbl_file, sep="\t", index=False, ) if taxprofiler == True: - logger.info("Preparing nf-core/taxprofiler table") - accession_table = prepare_taxprofiler_table( + tbl_file = f"{output}/AncientMetagenomeDir_nf_core_taxprofiler_input_table.csv" + logger.info(f"Writing nf-core/taxprofiler table to {tbl_file}") + taxprofiler_table = prepare_taxprofiler_table( samples=samples, 
libraries=selected_libraries, table_name=table_name, supported_archives=supported_archives, ) - accession_table.to_csv( - f"{output}/AncientMetagenomeDir_nf_core_taxprofiler_input_table.csv", + taxprofiler_table.to_csv( + tbl_file, header=False, index=False, ) if ameta == True: - logger.info("Preparing aMeta table") + tbl_file = f"{output}/AncientMetagenomeDir_aMeta_input_table.tsv" + logger.info(f"Writing aMeta table to {tbl_file}") logger.warning( "aMeta does not support pairs. You must manually merge pair-end data before using samplesheet." ) @@ -169,14 +170,15 @@ def run_convert( table_name=table_name, supported_archives=supported_archives, ) + aMeta_table.to_csv( - f"{output}/AncientMetagenomeDir_aMeta_input_table.tsv", + tbl_file, sep="\t", index=False, ) if mag == True: - logger.info("Preparing nf-core/mag table") + logger.info("Preparing nf-core/mag table(s)") mag_table_single, mag_table_paired = prepare_mag_table( samples=samples, libraries=selected_libraries, @@ -184,12 +186,24 @@ def run_convert( supported_archives=supported_archives, ) if not mag_table_single.empty: + mag_tbl_single_file = ( + f"{output}/AncientMetagenomeDir_nf_core_mag_input_single_table.csv" + ) + logger.info( + f"Writing nf-core/mag single-end table to {mag_tbl_single_file}" + ) mag_table_single.to_csv( - f"{output}/AncientMetagenomeDir_nf_core_mag_input_single_table.csv", + mag_tbl_single_file, index=False, ) if not mag_table_paired.empty: + mag_tbl_paired_file = ( + f"{output}/AncientMetagenomeDir_nf_core_mag_input_paired_table.csv" + ) + logger.info( + f"Writing nf-core/mag paired-end table to {mag_tbl_paired_file}" + ) mag_table_paired.to_csv( - f"{output}/AncientMetagenomeDir_nf_core_mag_input_paired_table.csv", + mag_tbl_paired_file, index=False, ) diff --git a/AMDirT/core/__init__.py b/AMDirT/core/__init__.py index 528fbb2..f2e74e6 100644 --- a/AMDirT/core/__init__.py +++ b/AMDirT/core/__init__.py @@ -302,6 +302,7 @@ def prepare_accession_table( # Downloading with curl or 
aspera instead of fetchngs urls = set(libraries["download_links"]) + accessions = set(libraries["archive_data_accession"]) links = set() for u in urls: for s in u.split(";"): @@ -321,11 +322,15 @@ def prepare_accession_table( ) + "\n" ) + fasterq_dump_script = ( + "\n".join([f"fasterq-dump --split-files -p {a}" for a in accessions]) + "\n" + ) return { "df": libraries[["archive_data_accession", "download_sizes"]].drop_duplicates(), "curl_script": dl_script_header + curl_script, "aspera_script": dl_script_header + aspera_script, + "fasterq_dump_script": dl_script_header + fasterq_dump_script, } diff --git a/AMDirT/viewer/streamlit.py b/AMDirT/viewer/streamlit.py index 2a83be5..38c3fbd 100644 --- a/AMDirT/viewer/streamlit.py +++ b/AMDirT/viewer/streamlit.py @@ -77,10 +77,11 @@ def parse_args(): options = ["No table selected"] + list(samples.keys()) st.session_state.table_name = st.selectbox(label="Select a table", options=options) st.session_state.height = st.selectbox( - "Number of rows to display", (10, 20, 50, 100, 200), index=2 + "Number of rows to display", (10, 20, 50, 100, 200), index=1 ) st.session_state.dl_method = st.selectbox( - label="Data download method", options=["curl", "nf-core/fetchngs", "aspera"] + label="Data download method", + options=["curl", "nf-core/fetchngs", "aspera", "sratoolkit"], ) if st.session_state.dl_method == "aspera": st.warning( @@ -319,6 +320,18 @@ def parse_args(): )["aspera_script"], file_name="AncientMetagenomeDir_aspera_download_script.sh", ) + elif st.session_state.dl_method == "sratoolkit": + st.download_button( + label="Download SRAtoolkit/fasterq-dump sample download script", + help=f"approx. 
{total_size_str} of sequencing data selected", + data=prepare_accession_table( + pd.DataFrame(df_mod["selected_rows"]), + lib_mod, + st.session_state.table_name, + supported_archives, + )["fasterq_dump_script"], + file_name="AncientMetagenomeDir_sratoolkit_download_script.sh", + ) else: st.download_button( label="Download Curl sample download script",