diff --git a/.gitignore b/.gitignore index ead8095..d87db8d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ docs/build/ +.DS_Store # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/.gitpod.yml b/.gitpod.yml new file mode 100644 index 0000000..6fba074 --- /dev/null +++ b/.gitpod.yml @@ -0,0 +1,10 @@ +# This configuration file was automatically generated by Gitpod. +# Please adjust to your needs (see https://www.gitpod.io/docs/introduction/learn-gitpod/gitpod-yaml) +# and commit this file to your remote git repository to share the goodness with others. + +# Learn more from ready-to-use templates: https://www.gitpod.io/docs/introduction/getting-started/quickstart + +tasks: + - init: pip install . + + diff --git a/AMDirT/__init__.py b/AMDirT/__init__.py index e992399..f49459c 100644 --- a/AMDirT/__init__.py +++ b/AMDirT/__init__.py @@ -1 +1 @@ -__version__ = "1.4" +__version__ = "1.6.1" diff --git a/AMDirT/autofill/__init__.py b/AMDirT/autofill/__init__.py index ef1fab0..12479d5 100644 --- a/AMDirT/autofill/__init__.py +++ b/AMDirT/autofill/__init__.py @@ -108,6 +108,8 @@ def run_autofill(accession, table_name=None, schema=None, dataset=None, sample_o lib_out[col] = None lib_out = lib_out[libraries_df.columns] lib_out = lib_out.loc[:,~lib_out.columns.duplicated()].copy() + lib_out['read_count'] = lib_out['read_count'].str.replace(",", "", + regex=False) lib_out = lib_out.astype(libraries_df.dtypes.to_dict()) if library_output: diff --git a/AMDirT/cli.py b/AMDirT/cli.py index 74562f4..d6374fd 100644 --- a/AMDirT/cli.py +++ b/AMDirT/cli.py @@ -4,12 +4,38 @@ from AMDirT.validate import run_validation from AMDirT.viewer import run_app from AMDirT.convert import run_convert -from AMDirT.core import get_json_path +from AMDirT.core import get_json_path, get_amdir_tags, get_latest_tag from AMDirT.autofill import run_autofill from AMDirT.merge import merge_new_df +from AMDirT.download import download as download_amdir from json import load +class 
MutuallyExclusiveOption(click.Option): + # Credits goes to Stan Chang for this code snippet + # https://gist.github.com/stanchan/bce1c2d030c76fe9223b5ff6ad0f03db + + def __init__(self, *args, **kwargs): + self.mutually_exclusive = set(kwargs.pop("mutually_exclusive", [])) + help = kwargs.get("help", "") + if self.mutually_exclusive: + ex_str = ", ".join(self.mutually_exclusive) + kwargs["help"] = help + ( + " NOTE: This argument is mutually exclusive with " + " arguments: [" + ex_str + "]." + ) + super(MutuallyExclusiveOption, self).__init__(*args, **kwargs) + + def handle_parse_result(self, ctx, opts, args): + if self.mutually_exclusive.intersection(opts) and self.name in opts: + raise click.UsageError( + "Illegal usage: `{}` is mutually exclusive with " + "arguments `{}`.".format(self.name, ", ".join(self.mutually_exclusive)) + ) + + return super(MutuallyExclusiveOption, self).handle_parse_result(ctx, opts, args) + + def get_table_list(): json_path = get_json_path() with open(json_path, "r") as f: @@ -110,6 +136,20 @@ def viewer(ctx, no_args_is_help=True, **kwargs): type=click.Path(exists=True), help="(Optional) JSON file listing AncientMetagenomeDir tables", ) +@click.option( + "--libraries", + type=click.Path(readable=True, file_okay=True, dir_okay=False, exists=True), + help=("(Optional) Path to a pre-filtered libraries table"), + cls=MutuallyExclusiveOption, + mutually_exclusive=["librarymetadata"], +) +@click.option( + "--librarymetadata", + is_flag=True, + help="Generate AncientMetagenomeDir libraries table of all samples in input table", + cls=MutuallyExclusiveOption, + mutually_exclusive=["libraries"], +) @click.option( "-o", "--output", @@ -123,11 +163,6 @@ def viewer(ctx, no_args_is_help=True, **kwargs): is_flag=True, help="Generate BibTeX file of all publications in input table", ) -@click.option( - "--librarymetadata", - is_flag=True, - help="Generate AncientMetagenomeDir libraries table of all samples in input table", -) @click.option( "--curl", 
is_flag=True, @@ -139,14 +174,19 @@ def viewer(ctx, no_args_is_help=True, **kwargs): help="Generate bash script with Aspera-based download commands for all libraries of samples in input table", ) @click.option( - "--eager", + "--fetchngs", is_flag=True, - help="Convert filtered samples and libraries tables to eager input tables", + help="Convert filtered samples and libraries tables to nf-core/fetchngs input tables", ) @click.option( - "--fetchngs", + "--sratoolkit", is_flag=True, - help="Convert filtered samples and libraries tables to nf-core/fetchngs input tables", + help="Generate bash script with SRA Toolkit fasterq-dump based download commands for all libraries of samples in input table", +) +@click.option( + "--eager", + is_flag=True, + help="Convert filtered samples and libraries tables to eager input tables", ) @click.option( "--ameta", @@ -167,6 +207,9 @@ def viewer(ctx, no_args_is_help=True, **kwargs): def convert(ctx, no_args_is_help=True, **kwargs): """\b Converts filtered samples and libraries tables to eager, ameta, taxprofiler, and fetchNGS input tables + + Note: When supplying a pre-filtered libraries table with `--libraries`, the corresponding sample table is still required! 
+ \b SAMPLES: path to filtered AncientMetagenomeDir samples tsv file TABLE_NAME: name of table to convert @@ -178,26 +221,27 @@ def convert(ctx, no_args_is_help=True, **kwargs): # Autofill tool # ################# + @cli.command() @click.argument("accession", type=str, nargs=-1) @click.option( "-n", - "--table_name", + "--table_name", type=click.Choice(get_table_list()), - default='ancientmetagenome-hostassociated', - show_default=True + default="ancientmetagenome-hostassociated", + show_default=True, ) @click.option( "-l", "--library_output", type=click.Path(writable=True), - help="path to library output table file" + help="path to library output table file", ) @click.option( "-s", "--sample_output", type=click.Path(writable=True), - help="path to sample output table file" + help="path to sample output table file", ) @click.pass_context def autofill(ctx, no_args_is_help=True, **kwargs): @@ -219,31 +263,26 @@ def autofill(ctx, no_args_is_help=True, **kwargs): @click.argument("dataset", type=click.Path(exists=True)) @click.option( "-n", - "--table_name", + "--table_name", type=click.Choice(get_table_list()), - default='ancientmetagenome-hostassociated', - show_default=True + default="ancientmetagenome-hostassociated", + show_default=True, ) @click.option( "-t", - "--table_type", - type=click.Choice(['samples', 'libraries']), - default='libraries', - show_default=True -) -@click.option( - "-m", - "--markdown", - is_flag=True, - help="Output is in markdown format" + "--table_type", + type=click.Choice(["samples", "libraries"]), + default="libraries", + show_default=True, ) +@click.option("-m", "--markdown", is_flag=True, help="Output is in markdown format") @click.option( "-o", "--outdir", type=click.Path(writable=True), default=".", show_default=True, - help="path to sample output table file" + help="path to sample output table file", ) @click.pass_context def merge(ctx, no_args_is_help=True, **kwargs): @@ -255,5 +294,46 @@ def merge(ctx, no_args_is_help=True, 
**kwargs): """ merge_new_df(**kwargs, **ctx.obj) + +@cli.command() +@click.option( + "-t", + "--table", + help="AncientMetagenomeDir table to download", + type=click.Choice(get_table_list()), + default="ancientmetagenome-hostassociated", + show_default=True, +) +@click.option( + "-y", + "--table_type", + help="Type of table to download", + type=click.Choice(["samples", "libraries"]), + default="samples", + show_default=True, +) +@click.option( + "-r", + "--release", + help="Release tag to download", + type=click.Choice(get_amdir_tags()), + default=get_latest_tag(get_amdir_tags()), + show_default=True, +) +@click.option( + "-o", + "--output", + help="Output directory", + type=click.Path(writable=True), + default=".", + show_default=True, +) +def download(no_args_is_help=True, **kwargs): + """\b + Download a table from the AMDirT repository + """ + download_amdir(**kwargs) + + if __name__ == "__main__": cli() diff --git a/AMDirT/convert/__init__.py b/AMDirT/convert/__init__.py index e417e40..bab68df 100644 --- a/AMDirT/convert/__init__.py +++ b/AMDirT/convert/__init__.py @@ -9,8 +9,11 @@ is_merge_size_zero, prepare_taxprofiler_table, get_libraries, + get_remote_resources, + get_json_path, ) -from AMDirT.core import get_json_path +from AMDirT.validate import AMDirValidator +from AMDirT.validate.exceptions import DatasetValidationError from json import load from AMDirT.core import logger import pandas as pd @@ -19,6 +22,7 @@ def run_convert( samples, + libraries, table_name, tables=None, output=".", @@ -28,6 +32,7 @@ def run_convert( aspera=False, eager=False, fetchngs=False, + sratoolkit=False, ameta=False, taxprofiler=False, mag=False, @@ -36,9 +41,10 @@ def run_convert( """Run the AMDirT conversion application to input samplesheet tables for different pipelines Args: - tables (str): Path to JSON file listing tables samples (str): Path to AncientMetagenomeDir filtered samples tsv file + libraries(str): Optional path to AncientMetagenomeDir pre-filtered libraries tsv 
file table_name (str): Name of the table of the table to convert + tables (str): Path to JSON file listing tables output (str): Path to output table. Defaults to "." """ os.makedirs(output, exist_ok=True) @@ -46,23 +52,85 @@ def run_convert( if not verbose: warnings.filterwarnings("ignore") supported_archives = ["ENA", "SRA"] + + # Validate input table if tables is None: - table_path = get_json_path() + remote_resources = get_remote_resources() else: - table_path = tables - with open(table_path, "r") as f: - tables = load(f) - table_list = list(tables["samples"].keys()) - if table_name not in table_list: - logger.info(f"Table '{table_name}' not found in {table_list}") - samples = pd.read_csv(samples, sep="\t") - libraries = pd.read_csv(tables["libraries"][table_name], sep="\t") + with open(tables, "r") as f: + remote_resources = load(f) + + if table_name not in remote_resources["samples"]: + raise ValueError(f"{table_name} not found in AncientMetagenomeDir file") + if not verbose: + warnings.filterwarnings("ignore") - logger.warning("We provide no warranty to the accuracy of the generated input sheets.") + schema = remote_resources[f"samples_schema"][table_name] + dataset_valid = list() + v = AMDirValidator(schema, samples) + dataset_valid.append(v.parsing_ok) + if v.parsing_ok: + dataset_valid.append(v.validate_schema()) + dataset_valid.append(v.check_duplicate_rows()) + dataset_valid.append(v.check_columns()) + + dataset_valid = all(dataset_valid) + if dataset_valid is False: + v.to_rich() + raise DatasetValidationError("Input sample dataset is not valid") + else: + logger.info("Input sample dataset is valid") + samples = pd.read_csv(samples, sep="\t") + remote_libraries = pd.read_csv( + remote_resources["libraries"][table_name], sep="\t" + ) + + if not libraries: + selected_libraries = get_libraries( + samples=samples, + libraries=remote_libraries, + table_name=table_name, + supported_archives=supported_archives, + ) + else: + schema = 
remote_resources[f"libraries_schema"][table_name] + dataset_valid = list() + v = AMDirValidator(schema, libraries) + dataset_valid.append(v.parsing_ok) + if v.parsing_ok: + dataset_valid.append(v.validate_schema()) + dataset_valid.append(v.check_duplicate_rows()) + dataset_valid.append(v.check_columns()) + + dataset_valid = all(dataset_valid) + if dataset_valid is False: + v.to_rich() + raise DatasetValidationError("Input libraries dataset is not valid") + else: + logger.info("Input libraries dataset is valid") + libraries = pd.read_csv(libraries, sep="\t") + selected_libraries = get_libraries( + samples=samples, + libraries=libraries, + table_name=table_name, + supported_archives=supported_archives, + ) + + accession_table = prepare_accession_table( + samples=samples, + libraries=selected_libraries, + table_name=table_name, + supported_archives=supported_archives, + ) + + logger.warning( + "We provide no warranty to the accuracy of the generated input sheets." + ) if bibliography == True: - logger.info("Preparing Bibtex citation file") - with open(f"{output}/AncientMetagenomeDir_bibliography.bib", "w") as fw: + bibfile = f"{output}/AncientMetagenomeDir_bibliography.bib" + logger.info(f"Writing Bibtex citation file to {bibfile}") + with open(bibfile, "w") as fw: fw.write(prepare_bibtex_file(samples)) if table_name in ["ancientmetagenome-environmental"]: @@ -71,121 +139,121 @@ def run_convert( col_drop = ["archive_accession", "sample_host"] if librarymetadata == True: - logger.info("Writing filtered libraries table") - librarymetadata = get_libraries( - samples=samples, - libraries=libraries, - table_name=table_name, - supported_archives=supported_archives, - ).drop( - col_drop, axis=1 - ) + tbl_file = f"{output}/AncientMetagenomeDir_filtered_libraries.tsv" + logger.info(f"Writing filtered libraries table to {tbl_file}") + librarymetadata = selected_libraries.drop(col_drop, axis=1) librarymetadata.to_csv( - f"{output}/AncientMetagenomeDir_filtered_libraries.tsv", + 
tbl_file, sep="\t", index=False, ) if curl == True: - logger.info("Writing curl download script") - accession_table = prepare_accession_table( - samples=samples, - libraries=libraries, - table_name=table_name, - supported_archives=supported_archives, - ) - with open(f"{output}/AncientMetagenomeDir_curl_download_script.sh", "w") as fw: + dl_file = f"{output}/AncientMetagenomeDir_curl_download_script.sh" + logger.info(f"Writing curl download script to {dl_file}") + with open(dl_file, "w") as fw: fw.write(accession_table["curl_script"]) if aspera == True: - logger.info("Writing Aspera download script") + dl_file = f"{output}/AncientMetagenomeDir_aspera_download_script.sh" + logger.info(f"Writing Aspera download script to {dl_file}") logger.warning( "You will need to set the ${ASPERA_PATH} environment variable. See https://amdirt.readthedocs.io for more information." ) - accession_table = prepare_accession_table( - samples=samples, - libraries=libraries, - table_name=table_name, - supported_archives=supported_archives, - ) - with open( - f"{output}/AncientMetagenomeDir_aspera_download_script.sh", "w" - ) as fw: + with open(dl_file, "w") as fw: fw.write(accession_table["aspera_script"]) if fetchngs == True: - logger.info("Preparing nf-core/fetchngs table") - accession_table = prepare_accession_table( - samples=samples, - libraries=libraries, - table_name=table_name, - supported_archives=supported_archives, - ) - accession_table["df"]['archive_accession'].to_csv( - f"{output}/AncientMetagenomeDir_nf_core_fetchngs_input_table.tsv", + dl_file = f"{output}/AncientMetagenomeDir_nf_core_fetchngs_download_script.sh" + logger.info(f"Writing nf-core/fetchngs table to {dl_file}") + accession_table["df"]["archive_data_accession"].to_csv( + dl_file, sep="\t", header=False, index=False, ) + if sratoolkit == True: + dl_file = f"{output}/AncientMetagenomeDir_sratoolkit_download_script.sh" + logger.info(f"Writing sratoolkit/fasterq-dump download script to {dl_file}") + with 
open(dl_file, "w") as fw: + fw.write(accession_table["fasterq_dump_script"]) if eager == True: - logger.info("Preparing nf-core/eager table") + tbl_file = f"{output}/AncientMetagenomeDir_nf_core_eager_input_table.tsv" + logger.info(f"Writing nf-core/eager table to {tbl_file}") eager_table = prepare_eager_table( samples=samples, - libraries=libraries, + libraries=selected_libraries, table_name=table_name, supported_archives=supported_archives, ) eager_table.to_csv( - f"{output}/AncientMetagenomeDir_nf_core_eager_input_table.tsv", + tbl_file, sep="\t", index=False, ) if taxprofiler == True: - logger.info("Preparing nf-core/taxprofiler table") - accession_table = prepare_taxprofiler_table( + tbl_file = f"{output}/AncientMetagenomeDir_nf_core_taxprofiler_input_table.csv" + logger.info(f"Writing nf-core/taxprofiler table to {tbl_file}") + taxprofiler_table = prepare_taxprofiler_table( samples=samples, - libraries=libraries, + libraries=selected_libraries, table_name=table_name, supported_archives=supported_archives, ) - accession_table["df"].to_csv( - f"{output}/AncientMetagenomeDir_nf_core_taxprofiler_input_table.csv", + taxprofiler_table.to_csv( + tbl_file, header=False, index=False, ) if ameta == True: - logger.info("Preparing aMeta table") - logger.warning("aMeta does not support pairs. You must manually merge pair-end data before using samplesheet.") + tbl_file = f"{output}/AncientMetagenomeDir_aMeta_input_table.tsv" + logger.info(f"Writing aMeta table to {tbl_file}") + logger.warning( + "aMeta does not support pairs. You must manually merge pair-end data before using samplesheet." 
+ ) aMeta_table = prepare_aMeta_table( samples=samples, - libraries=libraries, + libraries=selected_libraries, table_name=table_name, supported_archives=supported_archives, ) + aMeta_table.to_csv( - f"{output}/AncientMetagenomeDir_aMeta_input_table.tsv", + tbl_file, sep="\t", index=False, ) if mag == True: - logger.info("Preparing nf-core/mag table") + logger.info("Preparing nf-core/mag table(s)") mag_table_single, mag_table_paired = prepare_mag_table( samples=samples, - libraries=libraries, + libraries=selected_libraries, table_name=table_name, supported_archives=supported_archives, ) if not mag_table_single.empty: + mag_tbl_single_file = ( + f"{output}/AncientMetagenomeDir_nf_core_mag_input_single_table.csv" + ) + logger.info( + f"Writing nf-core/mag single-end table to {mag_tbl_single_file}" + ) mag_table_single.to_csv( - f"{output}/AncientMetagenomeDir_nf_core_mag_input_single_table.csv", + mag_tbl_single_file, index=False, ) if not mag_table_paired.empty: + mag_tbl_paired_file = ( + f"{output}/AncientMetagenomeDir_nf_core_mag_input_paired_table.csv" + ) + logger.info( + f"Writing nf-core/mag paired-end table to {mag_tbl_paired_file}" + ) mag_table_paired.to_csv( - f"{output}/AncientMetagenomeDir_nf_core_mag_input_paired_table.csv", + mag_tbl_paired_file, index=False, ) diff --git a/AMDirT/core/__init__.py b/AMDirT/core/__init__.py index e38e288..e4545b4 100644 --- a/AMDirT/core/__init__.py +++ b/AMDirT/core/__init__.py @@ -5,28 +5,55 @@ import pandas as pd import streamlit as st from packaging import version +from packaging.version import InvalidVersion from importlib.resources import files as get_module_dir import os import logging import colorlog +from json import load + pd.options.mode.chained_assignment = None +logging.basicConfig(level=logging.INFO) handler = colorlog.StreamHandler() -handler.setFormatter(colorlog.ColoredFormatter( - '%(log_color)s%(name)s [%(levelname)s]: %(message)s')) +handler.setFormatter( + 
colorlog.ColoredFormatter("%(log_color)s%(name)s [%(levelname)s]: %(message)s") +) -logger = colorlog.getLogger('AMDirT') +logger = colorlog.getLogger("AMDirT") logger.addHandler(handler) logger.propagate = False +def monkeypatch_get_storage_manager(): + if st.runtime.exists(): + return st.runtime.get_instance().cache_storage_manager + else: + # When running in "raw mode", we can't access the CacheStorageManager, + # so we're falling back to InMemoryCache. + # https://github.com/streamlit/streamlit/issues/6620 + # _LOGGER.warning("No runtime found, using MemoryCacheStorageManager") + return ( + st.runtime.caching.storage.dummy_cache_storage.MemoryCacheStorageManager() + ) + + +st.runtime.caching._data_caches.get_storage_manager = monkeypatch_get_storage_manager + + def get_json_path(): - path = os.path.join(get_module_dir("AMDirT.assets"), "tables.json") + path = get_module_dir("AMDirT.assets").joinpath("tables.json") return path +def get_remote_resources(): + json_path = get_json_path() + with open(json_path, "r") as f: + return load(f) + + @st.cache_data def get_amdir_tags(): r = requests.get( @@ -39,7 +66,34 @@ def get_amdir_tags(): if version.parse(tag["name"]) >= version.parse("v22.09") ] else: - return [] + logger.warning( + "Could not fetch tags from AncientMetagenomeDir. Defaulting to master. Metadata may not yet be officially released." 
+ ) + return ["master"] + + +@st.cache_data +def get_latest_tag(tags): + try: + return sorted(tags, key=lambda x: version.Version(x))[-1] + except InvalidVersion: + if "master" in tags: + return "master" + else: + raise InvalidVersion("No valid tags found") + + +def check_allowed_values(ref: list, test: str): + """ + Check if test is in ref + Args: + ref(list): List of allowed values + test(str): value to check + """ + + if test in ref: + return True + return False def get_colour_chemistry(instrument: str) -> int: @@ -76,8 +130,14 @@ def doi2bib(doi: str) -> str: return r.text + @st.cache_data -def get_libraries(table_name: str, samples: pd.DataFrame, libraries: pd.DataFrame, supported_archives: Iterable[str]): +def get_libraries( + table_name: str, + samples: pd.DataFrame, + libraries: pd.DataFrame, + supported_archives: Iterable[str], +): """Get filtered libraries from samples and libraries tables Args: @@ -116,6 +176,7 @@ def get_libraries(table_name: str, samples: pd.DataFrame, libraries: pd.DataFram return selected_libraries + def get_filename(path_string: str, orientation: str) -> Tuple[str, str]: """ Get Fastq Filename from download_links column @@ -138,18 +199,16 @@ def get_filename(path_string: str, orientation: str) -> Tuple[str, str]: elif orientation == "rev": return rev -def parse_to_mag(selected_libraries): - selected_libraries["short_reads_1"] = selected_libraries["download_links"].apply( +def parse_to_mag(libraries): + libraries["short_reads_1"] = libraries["download_links"].apply( get_filename, orientation="fwd" ) - selected_libraries["short_reads_2"] = selected_libraries["download_links"].apply( + libraries["short_reads_2"] = libraries["download_links"].apply( get_filename, orientation="rev" ) - selected_libraries["short_reads_2"] = selected_libraries["short_reads_2"].replace( - "NA", "" - ) - selected_libraries["longs_reads"] = "" + libraries["short_reads_2"] = libraries["short_reads_2"].replace("NA", "") + libraries["longs_reads"] = "" col2keep 
= [ "archive_data_accession", "archive_sample_accession", @@ -157,13 +216,14 @@ def parse_to_mag(selected_libraries): "short_reads_2", "longs_reads", ] - selected_libraries = selected_libraries[col2keep].rename( + libraries = libraries[col2keep].rename( columns={ "archive_data_accession": "sample", "archive_sample_accession": "group", } ) - return selected_libraries + return libraries + @st.cache_data def prepare_eager_table( @@ -180,36 +240,24 @@ def prepare_eager_table( table_name (str): Name of the table supported_archives (list): list of supported archives """ - selected_libraries = get_libraries( - table_name=table_name, - samples=samples, - libraries=libraries, - supported_archives=supported_archives, - ) - selected_libraries["Colour_Chemistry"] = selected_libraries[ - "instrument_model" - ].apply(get_colour_chemistry) + libraries["Colour_Chemistry"] = libraries["instrument_model"].apply( + get_colour_chemistry + ) - selected_libraries[ - "UDG_Treatment" - ] = selected_libraries.library_treatment.str.split("-", expand=True)[0] + libraries["UDG_Treatment"] = libraries.library_treatment.str.split( + "-", expand=True + )[0] - selected_libraries["R1"] = selected_libraries["download_links"].apply( - get_filename, orientation="fwd" - ) + libraries["R1"] = libraries["download_links"].apply(get_filename, orientation="fwd") - selected_libraries["R2"] = selected_libraries["download_links"].apply( - get_filename, orientation="rev" - ) + libraries["R2"] = libraries["download_links"].apply(get_filename, orientation="rev") - selected_libraries["Lane"] = 0 - selected_libraries["SeqType"] = where( - selected_libraries["library_layout"] == "SINGLE", "SE", "PE" - ) - selected_libraries["BAM"] = "NA" + libraries["Lane"] = 0 + libraries["SeqType"] = where(libraries["library_layout"] == "SINGLE", "SE", "PE") + libraries["BAM"] = "NA" if table_name == "ancientmetagenome-environmental": - selected_libraries["sample_host"] = "environmental" + libraries["sample_host"] = 
"environmental" col2keep = [ "sample_name", "archive_data_accession", @@ -223,7 +271,7 @@ def prepare_eager_table( "R2", "BAM", ] - selected_libraries = selected_libraries[col2keep].rename( + libraries = libraries[col2keep].rename( columns={ "sample_name": "Sample_Name", "archive_data_accession": "Library_ID", @@ -232,7 +280,7 @@ def prepare_eager_table( } ) - return selected_libraries + return libraries @st.cache_data @@ -251,18 +299,11 @@ def prepare_mag_table( supported_archives (list): list of supported archives """ - selected_libraries = get_libraries( - table_name=table_name, - samples=samples, - libraries=libraries, - supported_archives=supported_archives, - ) - # Create a DataFrame for "SINGLE" values - single_libraries = selected_libraries[selected_libraries["library_layout"] == "SINGLE"] + single_libraries = libraries[libraries["library_layout"] == "SINGLE"] # Create a DataFrame for "PAIRED" values - paired_libraries = selected_libraries[selected_libraries["library_layout"] == "PAIRED"] + paired_libraries = libraries[libraries["library_layout"] == "PAIRED"] if not single_libraries.empty: single_libraries = parse_to_mag(single_libraries) @@ -271,6 +312,7 @@ def prepare_mag_table( return single_libraries, paired_libraries + @st.cache_data def prepare_accession_table( samples: pd.DataFrame, @@ -287,15 +329,16 @@ def prepare_accession_table( supported_archives (list): list of supported archives """ - selected_libraries = get_libraries( - table_name=table_name, - samples=samples, - libraries=libraries, - supported_archives=supported_archives, - ) + # libraries = get_libraries( + # table_name=table_name, + # samples=samples, + # libraries=libraries, + # supported_archives=supported_archives, + # ) # Downloading with curl or aspera instead of fetchngs - urls = set(selected_libraries["download_links"]) + urls = set(libraries["download_links"]) + accessions = set(libraries["archive_data_accession"]) links = set() for u in urls: for s in u.split(";"): @@ -315,15 
+358,18 @@ def prepare_accession_table( ) + "\n" ) + fasterq_dump_script = ( + "\n".join([f"fasterq-dump --split-files -p {a}" for a in accessions]) + "\n" + ) return { - "df": selected_libraries[ - ["archive_accession", "download_sizes"] - ].drop_duplicates(), + "df": libraries[["archive_data_accession", "download_sizes"]].drop_duplicates(), "curl_script": dl_script_header + curl_script, "aspera_script": dl_script_header + aspera_script, + "fasterq_dump_script": dl_script_header + fasterq_dump_script, } + @st.cache_data def prepare_taxprofiler_table( samples: pd.DataFrame, @@ -339,45 +385,60 @@ def prepare_taxprofiler_table( table_name (str): Name of the table supported_archives (list): list of supported archives """ - selected_libraries = get_libraries( - table_name=table_name, - samples=samples, - libraries=libraries, - supported_archives=supported_archives, - ) - selected_libraries["fastq_1"] = selected_libraries["download_links"].apply( + libraries["fastq_1"] = libraries["download_links"].apply( get_filename, orientation="fwd" ) - selected_libraries["fastq_2"] = selected_libraries["download_links"].apply( + libraries["fastq_2"] = libraries["download_links"].apply( get_filename, orientation="rev" ) - selected_libraries["fastq_2"] = selected_libraries["fastq_2"].replace( - "NA", "" + libraries["fastq_2"] = libraries["fastq_2"].replace("NA", "") + + libraries["fasta"] = "" + + libraries["instrument_model"] = where( + libraries["instrument_model"] + .str.lower() + .str.contains("illumina|nextseq|hiseq|miseq"), + "ILLUMINA", + where( + libraries["instrument_model"].str.lower().str.contains("torrent"), + "ION_TORRENT", + where( + libraries["instrument_model"].str.lower().str.contains("helicos"), + "HELICOS", + where( + libraries["instrument_model"].str.lower().str.contains("bgiseq"), + "BGISEQ", + where( + libraries["instrument_model"].str.lower().str.contains("454"), + "LS454", + libraries["instrument_model"], + ), + ), + ), + ), ) - selected_libraries["fasta"] = 
"" - - selected_libraries['instrument_model'] = where(selected_libraries['instrument_model'].str.lower().str.contains('illumina|nextseq|hiseq|miseq'), 'ILLUMINA', - where(selected_libraries['instrument_model'].str.lower().str.contains('torrent'), 'ION_TORRENT', - where(selected_libraries['instrument_model'].str.lower().str.contains('helicos'), 'HELICOS', - where(selected_libraries['instrument_model'].str.lower().str.contains('bgiseq'), 'BGISEQ', - where(selected_libraries['instrument_model'].str.lower().str.contains('454'), 'LS454', - selected_libraries['instrument_model'])))) - ) - - col2keep = ["sample_name", "library_name", "instrument_model", "fastq_1", "fastq_2", "fasta"] - selected_libraries = selected_libraries[col2keep].rename( + col2keep = [ + "sample_name", + "library_name", + "instrument_model", + "fastq_1", + "fastq_2", + "fasta", + ] + libraries = libraries[col2keep].rename( columns={ "sample_name": "sample", "library_name": "run_accession", - "instrument_model": "instrument_platform" + "instrument_model": "instrument_platform", } ) - return selected_libraries + return libraries @st.cache_data @@ -395,45 +456,33 @@ def prepare_aMeta_table( table_name (str): Name of the table supported_archives (list): list of supported archives """ - selected_libraries = get_libraries( - table_name=table_name, - samples=samples, - libraries=libraries, - supported_archives=supported_archives, - ) - selected_libraries["Colour_Chemistry"] = selected_libraries[ - "instrument_model" - ].apply(get_colour_chemistry) + libraries["Colour_Chemistry"] = libraries["instrument_model"].apply( + get_colour_chemistry + ) - selected_libraries[ - "UDG_Treatment" - ] = selected_libraries.library_treatment.str.split("-", expand=True)[0] + libraries["UDG_Treatment"] = libraries.library_treatment.str.split( + "-", expand=True + )[0] - selected_libraries["R1"] = selected_libraries["download_links"].apply( - get_filename, orientation="fwd" - ) + libraries["R1"] = 
libraries["download_links"].apply(get_filename, orientation="fwd") - selected_libraries["R2"] = selected_libraries["download_links"].apply( - get_filename, orientation="rev" - ) + libraries["R2"] = libraries["download_links"].apply(get_filename, orientation="rev") - selected_libraries["Lane"] = 0 - selected_libraries["SeqType"] = where( - selected_libraries["library_layout"] == "SINGLE", "SE", "PE" - ) - selected_libraries["BAM"] = "NA" + libraries["Lane"] = 0 + libraries["SeqType"] = where(libraries["library_layout"] == "SINGLE", "SE", "PE") + libraries["BAM"] = "NA" if table_name == "ancientmetagenome-environmental": - selected_libraries["sample_host"] = "environmental" + libraries["sample_host"] = "environmental" col2keep = ["archive_data_accession", "R1"] - selected_libraries = selected_libraries[col2keep].rename( + libraries = libraries[col2keep].rename( columns={ "archive_data_accession": "sample", "R1": "fastq", } ) - return selected_libraries + return libraries @st.cache_data diff --git a/AMDirT/download/__init__.py b/AMDirT/download/__init__.py new file mode 100644 index 0000000..22623ef --- /dev/null +++ b/AMDirT/download/__init__.py @@ -0,0 +1,62 @@ +from AMDirT.core import ( + logger, + get_amdir_tags, + get_remote_resources, + check_allowed_values, +) +import requests + + +def download(table: str, table_type: str, release: str, output: str = ".") -> str: + """ + Download a table from the AMDirT repository. + + Parameters + ---------- + table : str + The AncientMetagenomeDir table to download. + table_type : str + The type of table to download. Allowed values are ['samples', 'libraries']. + release : str + The release of the table to download. Must be a valid release tag. + output: str + The output directory to save the table. Default is the current directory. + + Returns + ------- + str: + The path to the downloaded table. + + Raises + ------ + ValueError + If an invalid table is provided. + ValueError + If an invalid table type is provided. 
+ ValueError + If an invalid release is provided. + """ + + resources = get_remote_resources() + tags = get_amdir_tags() + if tags != ["master"]: + if check_allowed_values(tags, release) is False: + raise ValueError(f"Invalid release: {release}. Allowed values are {tags}") + + tables = resources["samples"] + if check_allowed_values(tables, table) is False: + raise ValueError(f"Invalid table: {table}. Allowed values are {tables}") + + if check_allowed_values(["samples", "libraries"], table_type) is False: + raise ValueError( + f"Invalid table type: {table_type}. Allowed values are ['samples', 'libraries']" + ) + table_filename = f"{table}_{table_type}_{release}.tsv" + logger.info( + f"Downloading {table} {table_type} table from {release} release, saving to {output}/{table_filename}" + ) + t = requests.get(resources[table_type][table].replace("master", release)) + with open(table_filename, "w") as fh: + fh.write(t.text) + + return table_filename diff --git a/AMDirT/merge/__init__.py b/AMDirT/merge/__init__.py index 5a63996..5302ece 100644 --- a/AMDirT/merge/__init__.py +++ b/AMDirT/merge/__init__.py @@ -2,15 +2,9 @@ from AMDirT.validate.exceptions import DatasetValidationError import warnings import pandas as pd -from AMDirT.core import logger, get_json_path -from json import load +from AMDirT.core import logger, get_remote_resources from os.path import join -def get_remote_resources(): - json_path = get_json_path() - with open(json_path, "r") as f: - return load(f) - def merge_new_df( dataset, @@ -22,7 +16,7 @@ def merge_new_df( schema_check=True, line_dup=True, columns=True, -): +): """Merge a new dataset with the remote master dataset Args: @@ -40,10 +34,10 @@ def merge_new_df( ValueError: Table type must be either 'samples' or 'libraries' ValueError: Table name not found in AncientMetagenomeDir file DatasetValidationError: New dataset is not valid - """ + """ remote_resources = get_remote_resources() - if table_type not in ['samples', 'libraries']: + if 
table_type not in ["samples", "libraries"]: raise ValueError("table_type must be either 'samples' or 'libraries'") if table_name not in remote_resources[table_type]: raise ValueError("table_name not found in AncientMetagenomeDir file") @@ -68,13 +62,24 @@ def merge_new_df( else: v.to_rich() raise DatasetValidationError("New Dataset is not valid") - + else: - remote_dataset = pd.read_table(remote_resources[table_type][table_name], dtype=dict(v.dataset.dtypes)) + remote_dataset = pd.read_table( + remote_resources[table_type][table_name], dtype=dict(v.dataset.dtypes) + ) logger.info("New Dataset is valid") - logger.info(f"Merging new dataset with remote {table_name} {table_type} dataset") + logger.info( + f"Merging new dataset with remote {table_name} {table_type} dataset" + ) dataset = pd.concat([remote_dataset, v.dataset]) dataset.drop_duplicates(inplace=True) - dataset.to_csv(join(outdir,f"{table_name}_{table_type}.tsv"), sep="\t", na_rep= "NA", index=False) - logger.info(f"New {table_name} {table_type} dataset written to {join(outdir,f'{table_name}_{table_type}.tsv')}") + dataset.to_csv( + join(outdir, f"{table_name}_{table_type}.tsv"), + sep="\t", + na_rep="NA", + index=False, + ) + logger.info( + f"New {table_name} {table_type} dataset written to {join(outdir,f'{table_name}_{table_type}.tsv')}" + ) diff --git a/AMDirT/validate/domain/__init__.py b/AMDirT/validate/domain/__init__.py index 8af1cbe..2cb9d7b 100644 --- a/AMDirT/validate/domain/__init__.py +++ b/AMDirT/validate/domain/__init__.py @@ -316,19 +316,11 @@ def to_markdown(self) -> bool: Raises: SystemExit: If dataset is invalid """ - df = pd.DataFrame(columns=["Error", "Source", "Column", "Row", "Message"]) - for error in self.errors: - df = pd.concat( - [ - df, - pd.Series(error.to_dict()) - ], - ignore_index=True - ) - if len(df) > 0: + if len(self.errors) > 0: + df = pd.concat([pd.Series(error.to_dict()).to_frame().transpose() for error in self.errors]) raise SystemExit( f"Invalid dataset 
`{self.dataset_name}`\n\n{df.to_markdown(index=False)}" ) else: - logger.info(f"{self.dataset_name} is valid") + print(f"`{self.dataset_name}` is valid") return True diff --git a/AMDirT/viewer/requirements.txt b/AMDirT/viewer/requirements.txt index b7f7b52..24afd00 100644 --- a/AMDirT/viewer/requirements.txt +++ b/AMDirT/viewer/requirements.txt @@ -1,9 +1,11 @@ -click==8.1.2 -jsonschema==4.4.0 -numpy==1.22.3 -pandas==1.4.2 -requests==2.27.1 -rich==12.2.0 -streamlit==1.14.1 -watchdog==2.1.7 -streamlit-aggrid==0.3.3 +click +jsonschema +numpy +pandas +requests +rich +streamlit +watchdog +streamlit-aggrid +colorlog +amdirt==1.6.1 diff --git a/AMDirT/viewer/streamlit.py b/AMDirT/viewer/streamlit.py index b52d90c..38c3fbd 100644 --- a/AMDirT/viewer/streamlit.py +++ b/AMDirT/viewer/streamlit.py @@ -1,7 +1,7 @@ from numpy import ALLOW_THREADS import streamlit as st import pandas as pd - +import os from st_aggrid import GridOptionsBuilder, AgGrid, GridUpdateMode, DataReturnMode, JsCode import argparse import zipfile @@ -17,11 +17,11 @@ prepare_taxprofiler_table, is_merge_size_zero, get_amdir_tags, - get_libraries + get_libraries, + get_json_path, ) - st.set_page_config( page_title="AMDirT viewer", page_icon="https://raw.githubusercontent.com/SPAAM-community/AncientMetagenomeDir/master/assets/images/logos/spaam-AncientMetagenomeDir_logo_mini.png", @@ -32,15 +32,19 @@ if "compute" not in st.session_state: st.session_state.compute = False -if "force_validation" not in st.session_state: - st.session_state.force_validation = False +if "force_samp_validation" not in st.session_state: + st.session_state.force_samp_validation = False +if "force_lib_validation" not in st.session_state: + st.session_state.force_lib_validation = False if "table_name" not in st.session_state: st.session_state.table_name = None def parse_args(): parser = argparse.ArgumentParser("Run Streamlit app") - parser.add_argument("-c", "--config", help="json config file", required=True) + parser.add_argument( + 
"-c", "--config", help="json config file", default=get_json_path() + ) try: args = parser.parse_args() except SystemExit as e: @@ -73,10 +77,11 @@ def parse_args(): options = ["No table selected"] + list(samples.keys()) st.session_state.table_name = st.selectbox(label="Select a table", options=options) st.session_state.height = st.selectbox( - "Number of rows to display", (10, 20, 50, 100, 200), index=2 + "Number of rows to display", (10, 20, 50, 100, 200), index=1 ) st.session_state.dl_method = st.selectbox( - label="Data download method", options=["curl", "nf-core/fetchngs", "aspera"] + label="Data download method", + options=["curl", "nf-core/fetchngs", "aspera", "sratookit"], ) if st.session_state.dl_method == "aspera": st.warning( @@ -104,8 +109,32 @@ def parse_args(): lib_url, sep="\t", ) - gb = GridOptionsBuilder.from_dataframe(df) - gb.configure_default_column( + gbs = GridOptionsBuilder.from_dataframe(df) + gbs.configure_default_column( + groupable=True, + value=True, + enableRowGroup=True, + aggFunc="sum", + editable=False, + filterParams={"inRangeInclusive": "true"}, + ) + gbs.configure_selection(selection_mode="multiple", use_checkbox=True) + gbs.configure_grid_options(checkboxSelection=True) + + gbs.configure_pagination( + enabled=True, + paginationAutoPageSize=False, + paginationPageSize=st.session_state.height, + ) + gbs.configure_column( + "project_name", + headerCheckboxSelection=True, + headerCheckboxSelectionFilteredOnly=True, + ) + gridOptions_sample = gbs.build() + + gbl = GridOptionsBuilder.from_dataframe(library) + gbl.configure_default_column( groupable=True, value=True, enableRowGroup=True, @@ -113,30 +142,30 @@ def parse_args(): editable=False, filterParams={"inRangeInclusive": "true"}, ) - gb.configure_selection(selection_mode="multiple", use_checkbox=True) - gb.configure_grid_options(checkboxSelection=True) + gbl.configure_selection(selection_mode="multiple", use_checkbox=True) + gbl.configure_grid_options(checkboxSelection=True) - 
gb.configure_pagination( + gbl.configure_pagination( enabled=True, paginationAutoPageSize=False, paginationPageSize=st.session_state.height, ) - gb.configure_column( + gbl.configure_column( "project_name", headerCheckboxSelection=True, headerCheckboxSelectionFilteredOnly=True, ) - gridOptions = gb.build() + gridOptions_library = gbl.build() with st.form("Samples table") as f: st.markdown("Select samples to filter") df_mod = AgGrid( df, - gridOptions=gridOptions, + gridOptions=gridOptions_sample, data_return_mode="filtered", update_mode="selection_changed", ) - if st.form_submit_button("Validate selection", type="primary"): + if st.form_submit_button("Validate sample selection", type="primary"): if len(df_mod["selected_rows"]) == 0: st.error( "You didn't select any sample! Please select at least one sample." @@ -155,27 +184,64 @@ def parse_args(): ): nb_sel_samples = pd.DataFrame(df_mod["selected_rows"]).shape[0] st.write(f"{nb_sel_samples } sample{'s'[:nb_sel_samples^1]} selected") - st.session_state.force_validation = True + st.session_state.force_samp_validation = True + + placeholder_lib_table = st.empty() + with placeholder_lib_table.container(): + if st.session_state.force_samp_validation: + with st.form("Library table"): + st.markdown("Select libraries to filter") + libs = get_libraries( + table_name=st.session_state.table_name, + libraries=library, + samples=pd.DataFrame(df_mod["selected_rows"]), + supported_archives=supported_archives, + ) + lib_sel = AgGrid( + libs, + gridOptions=gridOptions_library, + data_return_mode="filtered", + update_mode="selection_changed", + ) + try: + lib_mod = pd.DataFrame(lib_sel["selected_rows"]).drop( + "_selectedRowNodeInfo", axis=1 + ) + except KeyError: + lib_mod = pd.DataFrame(lib_sel["selected_rows"]) + + if st.form_submit_button( + "Validate library selection", type="primary" + ): + if len(lib_mod) == 0: + st.error( + "You didn't select any library! Please select at least one library." 
+ ) + else: + st.session_state.force_lib_validation = True - placeholder = st.empty() + placeholder_buttons = st.empty() - with placeholder.container(): - + with placeholder_buttons.container(): ( + button_sample_table, button_libraries, - button_fastq, - button_samplesheet_eager, + button_fastq, + button_samplesheet_eager, button_samplesheet_mag, - button_samplesheet_taxprofiler, + button_samplesheet_taxprofiler, button_samplesheet_ameta, - button_bibtex - ) = st.columns(7) - - if st.session_state.force_validation: + button_bibtex, + ) = st.columns(8) + + if ( + st.session_state.force_samp_validation + and st.session_state.force_lib_validation + ): # Calculate the fastq file size of the selected libraries acc_table = prepare_accession_table( pd.DataFrame(df_mod["selected_rows"]), - library, + lib_mod, st.session_state.table_name, supported_archives, )["df"] @@ -190,6 +256,22 @@ def parse_args(): else: total_size_str = f"{total_size / 1e9:.2f}GB" + ################### + ## SAMPLE TABLE ## + ################### + + with button_sample_table: + st.download_button( + label="Download AncientMetagenomeDir Sample Table", + data=( + pd.DataFrame(df_mod["selected_rows"]) + .drop("_selectedRowNodeInfo", axis=1) + .to_csv(sep="\t", index=False) + .encode("utf-8") + ), + file_name="AncientMetagenomeDir_filtered_samples.tsv", + ) + ################### ## LIBRARY TABLE ## ################### @@ -202,17 +284,10 @@ def parse_args(): with button_libraries: st.download_button( label="Download AncientMetagenomeDir Library Table", - data=get_libraries( - table_name=st.session_state.table_name, - libraries=library, - samples=pd.DataFrame(df_mod["selected_rows"]), - supported_archives=supported_archives, - ).drop( - col_drop, axis=1 - ) - .to_csv(sep="\t", index=False) - .encode("utf-8"), - file_name="AncientMetagenomeDir_filtered_libraries.csv", + data=( + lib_mod.drop(col_drop, axis=1).to_csv(sep="\t", index=False) + ).encode("utf-8"), + 
file_name="AncientMetagenomeDir_filtered_libraries.tsv", ) ############################ @@ -225,10 +300,10 @@ def parse_args(): help=f"approx. {total_size_str} of sequencing data selected", data=prepare_accession_table( pd.DataFrame(df_mod["selected_rows"]), - library, + lib_mod, st.session_state.table_name, supported_archives, - )["df"]['archive_accession'] + )["df"]["archive_accession"] .to_csv(sep="\t", header=False, index=False) .encode("utf-8"), file_name="AncientMetagenomeDir_nf_core_fetchngs_input_table.tsv", @@ -239,19 +314,31 @@ def parse_args(): help=f"approx. {total_size_str} of sequencing data selected", data=prepare_accession_table( pd.DataFrame(df_mod["selected_rows"]), - library, + lib_mod, st.session_state.table_name, supported_archives, )["aspera_script"], file_name="AncientMetagenomeDir_aspera_download_script.sh", ) + elif st.session_state.dl_method == "sratookit": + st.download_button( + label="Download SRAtoolkit/fasterq-dump sample download script", + help=f"approx. {total_size_str} of sequencing data selected", + data=prepare_accession_table( + pd.DataFrame(df_mod["selected_rows"]), + lib_mod, + st.session_state.table_name, + supported_archives, + )["fasterq_dump_script"], + file_name="AncientMetagenomeDir_sratoolkit_download_script.sh", + ) else: st.download_button( label="Download Curl sample download script", help=f"approx. 
{total_size_str} of sequencing data selected", data=prepare_accession_table( pd.DataFrame(df_mod["selected_rows"]), - library, + lib_mod, st.session_state.table_name, supported_archives, )["curl_script"], @@ -266,7 +353,7 @@ def parse_args(): label="Download nf-core/eager input TSV", data=prepare_eager_table( pd.DataFrame(df_mod["selected_rows"]), - library, + lib_mod, st.session_state.table_name, supported_archives, ) @@ -275,40 +362,52 @@ def parse_args(): file_name="AncientMetagenomeDir_nf_core_eager_input_table.tsv", ) - ####################### - ## NF-CORE/MAG TABLE ## - ####################### - mag_table_single, mag_table_paired = prepare_mag_table( + ####################### + ## NF-CORE/MAG TABLE ## + ####################### + with button_samplesheet_mag: + mag_table_single, mag_table_paired = prepare_mag_table( pd.DataFrame(df_mod["selected_rows"]), - library, + lib_mod, st.session_state.table_name, supported_archives, ) - zip_file = zipfile.ZipFile( - 'ancientMetagenomeDir_mag_input.zip', mode='w') - if not mag_table_single.empty: - mag_table_single.to_csv( - "nf_core_mag_input_single_table.csv", index=False + zip_file = zipfile.ZipFile( + "AncientMetagenomeDir_nf_core_mag_input.zip", mode="w" + ) + if not mag_table_single.empty: + mag_table_single.to_csv( + "AncientMetagenomeDir_nf_core_mag_input_single_table.csv", + index=False, + ) + zip_file.write( + "AncientMetagenomeDir_nf_core_mag_input_single_table.csv" + ) + os.remove( + "AncientMetagenomeDir_nf_core_mag_input_single_table.csv" ) - zip_file.write( - 'nf_core_mag_input_single_table.csv' + if not mag_table_paired.empty: + mag_table_paired.to_csv( + "AncientMetagenomeDir_nf_core_mag_input_paired_table.csv", + index=False, ) - if not mag_table_paired.empty: - mag_table_paired.to_csv( - "nf_core_mag_input_paired_table.csv", index=False + zip_file.write( + "AncientMetagenomeDir_nf_core_mag_input_paired_table.csv" ) - zip_file.write( - 'nf_core_mag_input_paired_table.csv' + os.remove( + 
"AncientMetagenomeDir_nf_core_mag_input_paired_table.csv" ) - zip_file.close() - with open("ancientMetagenomeDir_mag_input.zip", "rb") as zip_file: - with button_samplesheet_mag: + zip_file.close() + with open( + "AncientMetagenomeDir_nf_core_mag_input.zip", "rb" + ) as zip_file: st.download_button( label="Download nf-core/mag input CSV", data=zip_file, file_name="AncientMetagenomeDir_nf_core_mag_input.zip", mime="application/zip", ) + os.remove("AncientMetagenomeDir_nf_core_mag_input.zip") ####################### ## TAXPROFILER TABLE ## @@ -318,7 +417,7 @@ def parse_args(): label="Download nf-core/taxprofiler input CSV", data=prepare_taxprofiler_table( pd.DataFrame(df_mod["selected_rows"]), - library, + lib_mod, st.session_state.table_name, supported_archives, ) @@ -335,7 +434,7 @@ def parse_args(): label="Download aMeta input TSV", data=prepare_aMeta_table( pd.DataFrame(df_mod["selected_rows"]), - library, + lib_mod, st.session_state.table_name, supported_archives, ) @@ -353,12 +452,18 @@ def parse_args(): data=prepare_bibtex_file(pd.DataFrame(df_mod["selected_rows"])), file_name="AncientMetagenomeDir_bibliography.bib", ) - - st.markdown("ℹ️ _By default all download scripts/inputs include ALL libraries of the selected samples. \n Review the AncientMetagenomeDir library table prior using any other table, to ensure usage of relevant libraries!_") - st.markdown("⚠️ _We provide no warranty to the accuracy of the generated input sheets._") + + st.markdown( + "ℹ️ _By default all download scripts/inputs include ALL libraries of the selected samples. 
\n Review the AncientMetagenomeDir library table prior using any other table, to ensure usage of relevant libraries!_" + ) + st.markdown( + "⚠️ _We provide no warranty to the accuracy of the generated input sheets._" + ) if st.button("Start New Selection", type="primary"): st.session_state.compute = False st.session_state.table_name = "No table selected" - st.session_state.force_validation = False - placeholder.empty() + st.session_state.force_samp_validation = False + st.session_state.force_lib_validation = False + placeholder_buttons.empty() + placeholder_lib_table.empty() diff --git a/README.md b/README.md index 098ca1f..b266d06 100644 --- a/README.md +++ b/README.md @@ -1,24 +1,37 @@ -[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.4003826.svg)](https://doi.org/10.5281/zenodo.4003826) [![PyPI version](https://badge.fury.io/py/AMDirT.svg)](https://pypi.org/project/AMDirT) [![Documentation Status](https://readthedocs.org/projects/amdirt/badge/?version=dev)](https://amdirt.readthedocs.io/en/dev/?badge=dev) [![AMDirT-CI](https://github.com/SPAAM-community/AMDirT/actions/workflows/ci_test.yml/badge.svg)](https://github.com/SPAAM-community/AMDirT/actions/workflows/ci_test.yml) +[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.4003825.svg)](https://doi.org/10.5281/zenodo.4003825) [![PyPI version](https://badge.fury.io/py/AMDirT.svg)](https://pypi.org/project/AMDirT) [![Documentation Status](https://readthedocs.org/projects/amdirt/badge/?version=dev)](https://amdirt.readthedocs.io/en/dev/?badge=dev) [![AMDirT-CI](https://github.com/SPAAM-community/AMDirT/actions/workflows/ci_test.yml/badge.svg)](https://github.com/SPAAM-community/AMDirT/actions/workflows/ci_test.yml) -# AMDirT + + AMDirT Logo + **AMDirT**: [**A**ncient**M**etagenome**Dir**](https://github.com/SPAAM-community/ancientmetagenomedir) **T**oolkit -AMDirT is a toolkit for interacting with the AncientMetagenomeDir metadata repository of ancient metagenomic samples and ancient microbial genomes. 
This tool provides ways to validate AncientMetagenomeDir submissions, explore and download sequencing data for ancient microbial and environmental (meta)genomes, and automatically prepare input samplesheets for a range of bioinformatic processing pipelines. +AMDirT is a toolkit for interacting with the AncientMetagenomeDir metadata repository of ancient metagenomic samples and ancient microbial genomes. -For documentation on using the tool, please see [How Tos](how_to/index), [Tutorials](/tutorials) and/or [Quick Reference](/reference). +This tool provides ways to explore and download sequencing data for ancient microbial and environmental (meta)genomes, automatically prepare input samplesheets for a range of bioinformatic processing pipelines, and to validate AncientMetagenomeDir submissions. + +For documentation on using the tool, please see [How Tos](https://amdirt.readthedocs.io/en/latest/how_to/index.html), [Tutorials](https://amdirt.readthedocs.io/en/latest/tutorials/index.html) and/or [Quick Reference](https://amdirt.readthedocs.io/en/latest/reference.html). ## Install -Before we release AMDirt on (bio)Conda, please follow the instructions below. +AMDirT has been tested on different Unix systems (macOS and Ubuntu) using Intel and AMD chips. If you suspect that AMDirT isn't working properly because you use a different hardware/OS, please open an [issue on GitHub](https://github.com/SPAAM-community/AMDirT/issues). -### 1. With pip +### 1. With [pip](https://pip.pypa.io/en/stable/getting-started/) -...upon release of v 1.4 +```bash +pip install amdirt +``` ### 2. 
With conda -...upon release of v 1.4 +Installing AMDirT in a dedicated [conda](https://docs.conda.io/projects/miniconda/en/latest/index.html) environment + +```bash +conda create -n amdirt -c bioconda amdirt #install amdirt in a dedicated conda environment +conda activate amdirt # activate the conda environment +# use amdirt +conda deactivate amdirt # deactivate the conda environment +``` ### The latest development version, directly from GitHub @@ -34,7 +47,7 @@ pip install --upgrade --force-reinstall git+https://github.com/SPAAM-community/A - Create the conda environment `conda env create -f environment.yml` - Activate the environment `conda activate amdirt` - Install amdirt in development mode `pip install -e .` - - In some cases you may need to force update streamlit with `pip install --upgrade steamlit` + - In some cases you may need to force update streamlit with `pip install --upgrade steamlit` To locally render documentation: diff --git a/assets/amdirt-square_text.png b/assets/amdirt-square_text.png new file mode 100644 index 0000000..630a4ba Binary files /dev/null and b/assets/amdirt-square_text.png differ diff --git a/assets/amdirt_square.png b/assets/amdirt_square.png new file mode 100644 index 0000000..124abdf Binary files /dev/null and b/assets/amdirt_square.png differ diff --git a/assets/amdirt_square_web.png b/assets/amdirt_square_web.png new file mode 100644 index 0000000..79694fc Binary files /dev/null and b/assets/amdirt_square_web.png differ diff --git a/assets/logo_rectangular.png b/assets/logo_rectangular.png new file mode 100644 index 0000000..88b9fb7 Binary files /dev/null and b/assets/logo_rectangular.png differ diff --git a/assets/logo_rectangular.svg b/assets/logo_rectangular.svg new file mode 100644 index 0000000..65b71db --- /dev/null +++ b/assets/logo_rectangular.svg @@ -0,0 +1,153 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + AMDirT + + + + + + diff --git a/assets/logo_rectangular_dark.png 
b/assets/logo_rectangular_dark.png new file mode 100644 index 0000000..2dd6492 Binary files /dev/null and b/assets/logo_rectangular_dark.png differ diff --git a/assets/logo_rectangular_dark.svg b/assets/logo_rectangular_dark.svg new file mode 100644 index 0000000..63cc57f --- /dev/null +++ b/assets/logo_rectangular_dark.svg @@ -0,0 +1,105 @@ + + + + + + + + + + + + + + + + + + + + AMDirT + + + + + diff --git a/assets/logo_rectangular_transparent.png b/assets/logo_rectangular_transparent.png new file mode 100644 index 0000000..ea4d442 Binary files /dev/null and b/assets/logo_rectangular_transparent.png differ diff --git a/assets/logo_rectangular_transparent.svg b/assets/logo_rectangular_transparent.svg new file mode 100644 index 0000000..16f60ef --- /dev/null +++ b/assets/logo_rectangular_transparent.svg @@ -0,0 +1,153 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + AMDirT + + + + + + diff --git a/assets/logo_rectangular_web.png b/assets/logo_rectangular_web.png new file mode 100644 index 0000000..080d524 Binary files /dev/null and b/assets/logo_rectangular_web.png differ diff --git a/docs/source/how_to/autofill.md b/docs/source/how_to/autofill.md index e1002a4..3effc20 100644 --- a/docs/source/how_to/autofill.md +++ b/docs/source/how_to/autofill.md @@ -8,6 +8,8 @@ The purpose of the `autofill` command is to help AncientMetagenomeDir contributo You should use these commands when you want to contribute to AncientMetagenomeDir, by adding a newly published dataset, if it's already available on a sequencing archive (ENA/SRA). +It is normally executed for you by a 'bot' on GitHub when you have opened a pull-request with a samplesheet, by leaving a comment of `@spaam-bot autofill `. Thus, you should also only run this command if you want to do your AncientMetagenomeDir _entirely_ locally. 
+ ## How ### `autofill` @@ -50,7 +52,7 @@ project_name publication_year data_publication_doi sample_name archive archive_p You will notice that some columns are missing information, especially in the sample metadata table (in this example, `samples.tsv`). Despite our best efforts, not all information is made available through ENA, and it will be up to you to fill these missing columns, from the original publication, its supplementary material, or elsewhere. You can do it in your favorite text editor, or table editor (like LibreOffice Calc, or Excel). -Please refer to the AncientMetagenomeDir wiki for information on this process: [https://github.com/SPAAM-community/AncientMetagenomeDir/wiki](github.com/SPAAM-community/AncientMetagenomeDir/wiki). +Please refer to the AncientMetagenomeDir wiki for information on this process: [https://github.com/SPAAM-community/AncientMetagenomeDir/wiki](https://github.com/SPAAM-community/AncientMetagenomeDir/wiki). > ⚠️ The sample and library names reported on sequencing archives (ENA, SRA, ...) might not be the same as the one list in the original article. Please double check before proceeding further. diff --git a/docs/source/how_to/convert.md b/docs/source/how_to/convert.md index 40792a1..e28c1f9 100644 --- a/docs/source/how_to/convert.md +++ b/docs/source/how_to/convert.md @@ -12,7 +12,7 @@ You typically will use `convert` if you are a command-line power user, and have ## How -The following description assumes that you have already have a AncientMetagenomeDir **samples** table that has been filtered to the samples you wish to run through a given pipeline(s). +The following description assumes that you have already have an AncientMetagenomeDir **samples** table that has been filtered to the samples you wish to run through a given pipeline(s). > ⚠️ _The header, and present columns etc. 
should match exactly that on AncientMetagenomeDir, only rows may be removed._ @@ -25,9 +25,19 @@ AMDirT convert ancientmetagenome-hostassociated_samples_warinnersamplesonly.tsv where you provide the filtered TSV, which AncientMetagenomeDir samples table the filtered table is derived from,then the output directory where the samplesheets should be saved into, and which tool to generate a samplesheet from. +Alternatively, if you only want specific libraries, and already have pre-filtered the associated AncientMetagenomeDir libraries table, you can also provide it. Here for example: + +- the filtered sample table is : `ancientmetagenome-hostassociated_samples_warinnersamplesonly.tsv` +- the matching filtered libraries table is: `ancientmetagenome-hostassociated_libraries_warinnerlibrariesonly.tsv` + +```bash +mkdir -p samplesheets/ +AMDirT convert --libraries ancientmetagenome-hostassociated_libraries_warinnerlibrariesonly.tsv ancientmetagenome-hostassociated_samples_warinnersamplesonly.tsv ancientmetagenome-hostassociated -o samplesheets/ -- +``` + See [Output](#output) for descriptions of all output files. -> ⚠️ _When using a **pipeline input samplesheet**, you should always double check the sheet is correctly configured. We cannot guarantee accuracy between metadata and sequencing files._ +> ⚠️ _When using a **pipeline input samplesheet**, you should always double check the sheet is correctly configured. 
We cannot guarantee accuracy between metadata and sequencing files._ Once you have validated it, you can directly supply it to the appropriate pipeline as follows (using nf-core/eager as an example): @@ -41,33 +51,33 @@ The **citations BibTex** file contains all the citation information of your sele ## Output -> ⚠️ _We highly recommend generating and reviewing `AncientMetagenomeDir_filtered_libraries.tsv` **before** downloading or running any pipelines to ensure you have in the download scripts and/or pipeline input sheets only the actual library types you wish to use (e.g. you may only want paired-end data, or non-UDG treated data)._ +> ⚠️ _We highly recommend generating and reviewing `AncientMetagenomeDir_filtered_libraries.tsv` **before** downloading or running any pipelines to ensure you have in the download scripts and/or pipeline input sheets only the actual library types you wish to use (e.g. you may only want paired-end data, or non-UDG treated data)._ -> ⚠️ _To use a **pipeline input samplesheet**, you should always double check the sheet is correctly configured. We cannot guarantee accuracy between metadata and sequencing files._ +> ⚠️ _To use a **pipeline input samplesheet**, you should always double check the sheet is correctly configured. We cannot guarantee accuracy between metadata and sequencing files._ All possible output is as follows: - ``: where all the pipeline samplesheets are placed (by default `.`) -- `AncientMetagenomeDir_bibliography.bib`: - - A BibTex format citation information file with all references (where available) present in the filtered sample table. -- `AncientMetagenomeDir_filtered_libraries.tsv`: - - The associated AncientMetagenomeDir curated metadata for all _libraries_ of the samples in the input table. +- `AncientMetagenomeDir_bibliography.bib`: + - A BibTex format citation information file with all references (where available) present in the filtered sample table. 
+- `AncientMetagenomeDir_filtered_libraries.tsv`: + - The associated AncientMetagenomeDir curated metadata for all _libraries_ of the samples in the input table. - `AncientMetagenomeDir_curl_download_script.sh`: - - A bash script containing curl commands for all libraries in the input samples list. + - A bash script containing curl commands for all libraries in the input samples list. - `AncientMetagenomeDir_aspera_download_script.sh`: - - A bash script containing Aspera commands for all libraries in the input samples list. See [How Tos](/how_to/miscellaneous) for Aspera configuration information. -- `AncientMetagenomeDir_nf_core_fetchngs_input_table.tsv`: - - An input sheet containing ERS/SRS accession numbers in a format compatible with the [nf-core/fetchngs](https://nf-co.re/fetchngs) input samplesheet. + - A bash script containing Aspera commands for all libraries in the input samples list. See [How Tos](/how_to/miscellaneous) for Aspera configuration information. +- `AncientMetagenomeDir_nf_core_fetchngs_input_table.tsv`: + - An input sheet containing ERS/SRS accession numbers in a format compatible with the [nf-core/fetchngs](https://nf-co.re/fetchngs) input samplesheet. - `AncientMetagenomeDir_nf_core_eager_input_table.tsv`: - - An input sheet with metadata in a format compatible with the [nf-core/eager](https://nf-co.re/eager) input samplesheet. - - Contained paths are relative to the directory output when using the `curl` and `aspera` download scripts (i.e., input sheet assumes files are in the same directory as the input sheet itself). -- `AncientMetagenomeDir_nf_core_taxprofiler_input_table.csv`: - - An input sheet with metadata in a format compatible with the [nf-core/taxprofiler](https://nf-co.re/eager) input samplesheet. - - Contained paths are relative to the directory output when using the `curl` and `aspera` download scripts (i.e., input sheet assumes files are in the same directory as the input sheet itself). 
+ - An input sheet with metadata in a format compatible with the [nf-core/eager](https://nf-co.re/eager) input samplesheet. + - Contained paths are relative to the directory output when using the `curl` and `aspera` download scripts (i.e., input sheet assumes files are in the same directory as the input sheet itself). +- `AncientMetagenomeDir_nf_core_taxprofiler_input_table.csv`: + - An input sheet with metadata in a format compatible with the [nf-core/taxprofiler](https://nf-co.re/eager) input samplesheet. + - Contained paths are relative to the directory output when using the `curl` and `aspera` download scripts (i.e., input sheet assumes files are in the same directory as the input sheet itself). - `AncientMetagenomeDir_aMeta_input_table.tsv`: - - An input sheet with metadata in a format compatible with the [aMeta](https://github.com/NBISweden/aMeta) input samplesheet. - - Contained paths are relative to the directory output when using the `curl` and `aspera` download scripts (i.e., input sheet assumes files are in the same directory as the input sheet itself). -- `AncientMetagenomeDir_nf_core_mag_input_{single,paired}_table.csv`: - - An input sheet with metadata in a format compatible with the [nf-core/mag](https://nf-co.re/eager) input samplesheet. - - Contained paths are relative to the directory output when using the `curl` and `aspera` download scripts (i.e., input sheet assumes files are in the same directory as the input sheet itself). - - nf-core/mag does not support paired- and single-end data in the same run, therefore two sheets will be generated if your selected samples contain both types of libraries. + - An input sheet with metadata in a format compatible with the [aMeta](https://github.com/NBISweden/aMeta) input samplesheet. + - Contained paths are relative to the directory output when using the `curl` and `aspera` download scripts (i.e., input sheet assumes files are in the same directory as the input sheet itself). 
+- `AncientMetagenomeDir_nf_core_mag_input_{single,paired}_table.csv`: + - An input sheet with metadata in a format compatible with the [nf-core/mag](https://nf-co.re/mag) input samplesheet. + - Contained paths are relative to the directory output when using the `curl` and `aspera` download scripts (i.e., input sheet assumes files are in the same directory as the input sheet itself). + - nf-core/mag does not support paired- and single-end data in the same run, therefore two sheets will be generated if your selected samples contain both types of libraries. diff --git a/docs/source/how_to/download.md new file mode 100644 index 0000000..68e16c2 --- /dev/null +++ b/docs/source/how_to/download.md @@ -0,0 +1,21 @@ +# download + +## What + +Download a copy of an AncientMetagenomeDir table. + +## When + +This command would be used when you want to download an AncientMetagenomeDir table locally. + +You typically do this if you're planning to use the `convert` command later. + +## How + +```bash +AMDirT download --table ancientsinglegenome-hostassociated --table_type samples -r v23.12.0 -o . 
+``` + +## Output + +This example command above will download the `ancientsinglegenome-hostassociated` `sample` table from the `v23.12.0` AncientMetagenomeDir release, and save it locally to `ancientmetagenome-hostassociated_samples_v23.12.0.tsv` diff --git a/docs/source/how_to/images/amdirt-filter-download-buttons.png b/docs/source/how_to/images/amdirt-filter-download-buttons.png new file mode 100644 index 0000000..afb2768 Binary files /dev/null and b/docs/source/how_to/images/amdirt-filter-download-buttons.png differ diff --git a/docs/source/how_to/images/amdirt-filter-libraries-table.png b/docs/source/how_to/images/amdirt-filter-libraries-table.png new file mode 100644 index 0000000..d9ab9f0 Binary files /dev/null and b/docs/source/how_to/images/amdirt-filter-libraries-table.png differ diff --git a/docs/source/how_to/index.rst b/docs/source/how_to/index.rst index ac3eb4d..f8f73d3 100644 --- a/docs/source/how_to/index.rst +++ b/docs/source/how_to/index.rst @@ -11,6 +11,8 @@ All 'How Tos' assume you have already installed AMDirT following the installatio viewer convert + download validate autofill + merge miscellaneous \ No newline at end of file diff --git a/docs/source/how_to/merge.md b/docs/source/how_to/merge.md new file mode 100644 index 0000000..8ce4fec --- /dev/null +++ b/docs/source/how_to/merge.md @@ -0,0 +1,33 @@ +# merge + +## What + +Merges a user-supplied metadata table with the latest AncientMetagenomeDir master metadata tables, with on-the-fly [validation](/how_to/validate). + +## When + +This command would be used when you have a local version of an AncientMetagenomeDir table (samples or libraries) of just the new samples or libraries to add, and want to append to the current master table before submitting a pull request. + +You typically only do this if preparing a pull request to the AncientMetagenomeDir repository entirely locally. 
+ +## How + +The following description assumes you have already prepared an AncientMetagenomeDir **samples** or **libraries** table whose rows only consist of the header and new samples to be added. + +> ⚠️ _The header, and present columns etc. should match exactly that on the corresponding AncientMetagenomeDir table_ + +Given a new samples table `samples_for_new_pr.tsv` to be added to the single genome samples table `ancientsinglegenome-hostassociated`, you can run the following command: + +```bash +AMDirT merge -n ancientsinglegenome-hostassociated -t samples samples_for_new_pr.tsv +``` + +Note that during merge `merge` will also perform schema validation to ensure the contents of the new rows are valid against the AncientMetagenomeDir schema. + +## Output + +The output of the `merge` command is a new table with the merged rows named after the table you merged the new rows onto, placed by default in the directory you ran the command from (customisable with `-o`). + +In the example above, the file result would be: `ancientsinglegenome-hostassociated_samples.tsv`. + +The contents of this file can then theoretically be used to submit a pull request to the AncientMetagenomeDir repository. diff --git a/docs/source/how_to/miscellaneous.md b/docs/source/how_to/miscellaneous.md index 9843282..9c8b0a7 100644 --- a/docs/source/how_to/miscellaneous.md +++ b/docs/source/how_to/miscellaneous.md @@ -71,9 +71,3 @@ The output from `AMDirT viewer`/`convert` will contain a list of accessions in a nextflow pull nf-core/fetchngs nextflow run nf-core/fetchngs --input AncientMetagenomeDir_nf_core_fetchngs_input_table.tsv` ``` - - diff --git a/docs/source/how_to/validate.md b/docs/source/how_to/validate.md index 1de7590..071621f 100644 --- a/docs/source/how_to/validate.md +++ b/docs/source/how_to/validate.md @@ -2,7 +2,7 @@ ## What -The purpose of the `validate` command is to check that a AncientMetagenomeDir metadata file confirms the specifications of the project. 
+The purpose of the `validate` command is to check that an AncientMetagenomeDir metadata file conforms to the specifications of the project. ## When diff --git a/docs/source/how_to/viewer.md index 2edf1b7..35a119a 100644 --- a/docs/source/how_to/viewer.md +++ b/docs/source/how_to/viewer.md @@ -13,6 +13,7 @@ The purpose of the `viewer` command is to provide a easy-to-use graphical-user i > ⚠️ _The `viewer` tool was previously named `filter`, and might still be referred as such in some parts of the documentation._ For video based walkthroughs please see [Tutorials](/tutorials/viewer.md). + ## When You should use this tool when you wish to find particular types of ancient metagenomic data, but wish to explore the dataset manually and interactively (i.e., don't know exactly what you're looking for yet), and/or if you do not wish to download the AncientMetagenomeDir full tables yourself and filter them within languages such as R or Python (with pandas). @@ -87,7 +88,15 @@ Now validate your selection! Press the 'Validate selection' button at the bottom > ⚠️ _If you wish to download the data, make sure you have already selected your 'Data download method' in the sidebar before pressing 'Validate selection'!_ -Once the select is validated, more buttons will appear allowing you to download different files +Once the selection is validated, the library filtering table will appear below + +![AMDirT libraries filtering table](images/amdirt-filter-libraries-table.png) + +You can proceed to select your libraries of choice (or all of them) in the same way as the samples table. Once happy, you can then click on 'Validate library selection' + +Your different download options will finally appear: + +![AMDirT output download options](images/amdirt-filter-download-buttons.png) In this case we suggest you press: @@ -97,8 +106,6 @@ In this case we suggest you press: See [Output](#output) for descriptions of all output possible files. 
-![AMDirT buttons after selection validation](images/amdirt-filter-validate-buttons.png) - To use the **download** script, you can simply run: ```bash diff --git a/docs/source/index.rst b/docs/source/index.rst index ade9480..7e9194a 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -5,17 +5,19 @@ Welcome to AMDirT's documentation! ==================================== -.. image:: https://github.com/SPAAM-community/AncientMetagenomeDir/raw/master/assets/images/logos/spaam-AncientMetagenomeDir_socialmedia.png +.. image:: https://github.com/SPAAM-community/AMDirT/raw/master/assets/logo_rectangular_web.png :width: 400 :alt: AncientMetagenomeDir Logo 🛠️ **AMDirT**: `AncientMetagenomeDir `_ Toolkit 🛠️ -AMDirT is a toolkit for interacting with the AncientMetagenomeDir metadata repository of ancient metagenomic samples and ancient microbial genomes. This tool provides ways to validate AncientMetagenomeDir submissions, explore and download sequencing data for ancient microbial and environmental (meta)genomes, and automatically prepare input samplesheets for a range of bioinformatic processing pipelines. +AMDirT is a toolkit for interacting with the AncientMetagenomeDir metadata repository of ancient metagenomic samples and ancient microbial genomes. -For more information on installation see: :doc:`README`. For how to use the toolkit (tutorials, how-toes, references) please see the table of contents. +This tool provides ways to explore and download sequencing data for ancient microbial and environmental (meta)genomes, automatically prepare input samplesheets for a range of bioinformatic processing pipelines, and to validate AncientMetagenomeDir submissions. -__ homepage_ +For more information on installation see: :doc:`README`. + +For how to use the toolkit (tutorials, how-toes, references) please see the table of contents. .. 
toctree:: :maxdepth: 1 diff --git a/docs/source/tutorials/convert.md new file mode 100644 index 0000000..78b8fe7 --- /dev/null +++ b/docs/source/tutorials/convert.md @@ -0,0 +1,93 @@ +# convert + +On this page we provide a brief tutorial on how you can use the AMDirT command-line-interface (CLI) of the `convert` command. + +This tutorial assumes you are on a UNIX based operating system and have internet access. It also assumes you have already installed `AMDirT`. + +We will show how, given a pre-filtered samples or libraries table (e.g. via command line tools or in an R session), in much the same way as the graphical-based `GUI` command, you can use the command-line interface to convert the table to various formats such as download scripts or prepared input sample sheets for ancient metagenomic pipelines. + +In this case we will want to download all metagenomic sequencing data of ancient dental calculus samples from Germany, and prepare a sample sheet for the nf-core/eager pipeline. + +## Data + +We will use one of the previous releases of AncientMetagenomeDir as an example dataset. You can download the dataset with the following commands. + +```bash +mkdir amdirt-convert-tutorial +cd amdirt-convert-tutorial +AMDirT download --table ancientmetagenome-hostassociated --table_type samples -r v23.09.0 +``` + +## Filter a sample metadata table + +Next we can filter the ancient metagenome 'host-associated' sample sheet for all dental calculus samples from Germany. + +```bash +cat ancientmetagenome-hostassociated_samples_v23.09.0.tsv | grep -e '^project_name' -e 'dental calculus' | grep -e '^project_name' -e 'Germany' > germany_dentalcalculus.tsv +``` + +> ⚠ _The command above is not robust and is only used for system portability and demonstration purposes. For example the `Germany` string could be in a site name. In practice, you should use more robust filtering methods such as more specific `grep` expressions or in R_. 
+ +Now we can use the `convert` command to provide a download script, a nf-core/eager samplesheet, the AncientMetagenomeDir library metadata, and a citations file. + +```bash +AMDirT convert --curl --eager --librarymetadata --bibliography germany_dentalcalculus.tsv ancientmetagenome-hostassociated +``` + +This will create the following files: + +- `AncientMetagenomeDir_bibliography.bib`: A BiBTeX file with the citations of the samples in the filtered table supplied to `convert` +- `AncientMetagenomeDir_curl_download_script.sh`: A curl download script to download all associated FASTQ files of the samples in the filtered table +- `AncientMetagenomeDir_filtered_libraries.tsv`: An AncientMetagenomeDir _library_ metadata table of the samples in the filtered table +- `AncientMetagenomeDir_nf_core_eager_input_table.tsv`: A nf-core/eager input table for the samples and FASTQ files downloaded from the `curl` script in the filtered table + +You could then run the `curl` script to download the FASTQ files, and then run the nf-core/eager pipeline with the input table. + +```bash +bash AncientMetagenomeDir_curl_download_script.sh +nextflow run nf-core/eager -profile docker --input AncientMetagenomeDir_nf_core_eager_input_table.tsv <...> +``` + +## Filter a library metadata table + +The `convert` command is not just for sample metadata! You can also use it to filter AncientMetagenomeDir library metadata tables. + +Let's say of the samples we just downloaded, you realised you only wanted to use the libraries that were sequenced with a paired-end sequencing kit. We can filter the previously downloaded library metadata table in the same manner as above. + +```bash + grep -e '^project_name' -e 'PAIRED' AncientMetagenomeDir_filtered_libraries.tsv > germany_dentalcalculus_libraries_pe.tsv +``` + +> ⚠ _The command above is not robust and is only used for system portability and demonstration purposes. For example the `PAIRED` string could also appear in other columns. 
In practice, you should use more robust filtering methods such as more specific `grep` expressions or in R_. + +We can then again use the `convert` command to provide an updated download script, nf-core/eager samplesheet, and a citations file. + +```bash +AMDirT convert --curl --eager --bibliography --libraries germany_dentalcalculus_libraries_pe.tsv germany_dentalcalculus.tsv ancientmetagenome-hostassociated +``` + +> ℹ _It's important to note that you still need a (full or filtered) AncientMetagenomeDir samples sheet even when supplying `--libraries`_. + +Once again you will have similar output as above (minus the libraries metadata table); you can then download in this case just the FASTQ files and run these through nf-core/eager: + +- `AncientMetagenomeDir_bibliography.bib`: A BiBTeX file with the citations of the samples in the filtered table supplied to `convert` +- `AncientMetagenomeDir_curl_download_script.sh`: A curl download script to download all associated FASTQ files of the samples in the filtered table +- `AncientMetagenomeDir_nf_core_eager_input_table.tsv`: A nf-core/eager input table for the samples and FASTQ files downloaded from the `curl` script in the filtered table + +You could then run the `curl` script to download the FASTQ files, and then run the nf-core/eager pipeline with the input table. + +```bash +bash AncientMetagenomeDir_curl_download_script.sh +nextflow run nf-core/eager -profile docker --input AncientMetagenomeDir_nf_core_eager_input_table.tsv <...> +``` + +## Clean up + +Once you've completed this tutorial, simply leave the `amdirt-convert-tutorial` directory and remove it. + +For example: + +```bash +cd .. 
+rm -r amdirt-convert-tutorial/ +``` diff --git a/docs/source/tutorials/index.rst index 3037871..8dc5326 100644 --- a/docs/source/tutorials/index.rst +++ b/docs/source/tutorials/index.rst @@ -1,14 +1,17 @@ Tutorials ============================================= -Introductory tutorials for each sub-command can be found below in text and/or video format. +Introductory tutorials for sub-commands can be found below in text and/or video format. All videos can be also found on the `SPAAM Community Youtube channel `_. All tutorials assume you have already installed AMDirT following the installation instructions (:doc:`/README`). +Note that tutorials currently exist for user-facing commands; sub-commands such as `validate` and `autofill` are generally not executed by users. For these please see the respective (:doc:`/how_to/index`) page. + .. toctree:: :maxdepth: 1 :caption: Tutorials: - viewer \ No newline at end of file + viewer + convert \ No newline at end of file diff --git a/docs/source/tutorials/viewer.md index 7f0b163..06c05a1 100644 --- a/docs/source/tutorials/viewer.md +++ b/docs/source/tutorials/viewer.md @@ -1,52 +1,8 @@ # viewer -On this page we provide step-by-step video tutorials on how to use the AMDirT graphical-user-interface (GUI) of the `viewer` command. - -> ℹ️ _The `viewer` subcommand was previously known as `filter`._ - -The [full walkthrough](#full-walkthrough) of all steps in a single video can be seen at the end of this page. - -## Loading AMDirT viewer - -This tutorial shows how to activate a [pre-created](README#install) conda environment from a command-line interface, and load the graphical-user interface (GUI) in your browser. - - - -## Selecting Table - -This tutorial shows you how to select which release of AncientMetagenomeDir you wish to explore, as well as how to select the given research-area specific AncientMetagenomeDir table of interest. 
- - - -## Row Selection - -This tutorial shows how to increase the number of rows in the tabular view of the GUI. - - - -## Column Options - -This tutorial shows you how to reorder columns, hide/display columns, and filter columns by the particular specifications of the user. - - - -## Selecting Rows - -This tutorial shows how to select different rows of interest. - - - -## Exporting Information - -This tutorial shows how to export the selected data for downstream use, in this case, download scripts and a input table for the [nf-core/eager](https://nf-co.re/eager) ancient DNA pipeline. +On this page we provide a step-by-step video tutorial on how to use the AMDirT graphical-user-interface (GUI) of the `viewer` command. For more detail about all possible output files, please see the corresponding pages on [How To](/how_to/index). - - -## Full walkthrough - -This tutorial has all the above steps in a single video to show the full procedure from beginning to end. - - + \ No newline at end of file diff --git a/environment.yml b/environment.yml deleted file mode 100644 index d5294bd..0000000 --- a/environment.yml +++ /dev/null @@ -1,23 +0,0 @@ -name: amdirt -channels: - - conda-forge -dependencies: - - click=8.0.* - - jsonschema=4.4.0 - - numpy=1.22.3 - - pandas=1.4.2 - - pip=22.0.4 - - python>=3.9 - - recommonmark=0.7.1 - - requests=2.27.1 - - rich=12.2.0 - - setuptools=62.0.0 - - sphinx=4.5.0 - - sphinx-click=3.1.0 - - sphinx_rtd_theme=1.0.0 - - streamlit=1.14.1 - - watchdog=2.1.7 - - tox - - colorlog>=6.7.0 - - pip: - - streamlit-aggrid==0.3.3 diff --git a/setup.py b/setup.py index f7e6df9..1653f52 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,7 @@ def get_version(rel_path): "tabulate", "jsonschema", "rich", - "streamlit", + "streamlit<=1.35.0", "streamlit-aggrid", "numpy", "requests", @@ -41,6 +41,6 @@ def get_version(rel_path): ], packages=find_packages(exclude="test"), entry_points={"console_scripts": ["AMDirT = AMDirT.cli:cli"]}, - include_package_data=True, 
- package_data={"": ["assets/tables.json"]}, + package_dir={"AMDirT": "AMDirT"}, + package_data={"AMDirT.assets": ["tables.json"]}, ) diff --git a/tests/data/data_test_convert.tsv b/tests/data/data_test_convert.tsv deleted file mode 100644 index d1470f6..0000000 --- a/tests/data/data_test_convert.tsv +++ /dev/null @@ -1,3 +0,0 @@ -project_name publication_year publication_doi site_name latitude longitude geo_loc_name study_primary_focus sequence_name depth sample_name sample_age sample_age_doi feature material sampling_date archive archive_project archive_accession -Ahmed2018 2018 10.1016/j.quascirev.2017.11.037 Hässeldala Port 56.16 15.01 Sweden microbial Sediment core #7.4 435 HA1.1 13900 10.3389/fevo.2019.00189 lake lake sediment 2015 ENA PRJNA378719 SRS2040659 -Ahmed2018 2018 10.1016/j.quascirev.2017.11.037 Hässeldala Port 56.16 15.01 Sweden microbial Sediment core #7.4 435 HA1.12 13900 \ No newline at end of file diff --git a/tests/data/libraries_schema.json b/tests/data/libraries_schema.json index 702618a..b23768d 100644 --- a/tests/data/libraries_schema.json +++ b/tests/data/libraries_schema.json @@ -38,13 +38,9 @@ "$id": "#/items/properties/project_name", "type": "string", "title": "AncientMetagenomeDir key of the publication", - "description": "Format: surnameYYYY (if duplicate key but different publication, add b,c,d etc. as necessary). Must match a AncientMetagenomeDir samples table entry", + "description": "Format: surnameYYYY (if duplicate key but different publication, add b,c,d etc. as necessary). 
Must match an AncientMetagenomeDir samples table entry", "pattern": "^[a-zA-Z]+\\d{4}[b-z]?$", - "examples": [ - "Warinner2014", - "Muhlemann2018", - "Muhlemann2018a" - ] + "examples": ["Warinner2014", "Muhlemann2018", "Muhlemann2018a"] }, "publication_year": { "$id": "#/items/properties/publication_year", @@ -53,28 +49,22 @@ "maximum": 2100, "title": "Year of publication", "description": "Format: YYYY", - "examples": [ - 2014 - ] + "examples": [2014] }, "publication_doi": { "$id": "#/items/properties/publication_doi", "type": "string", "pattern": "^10.\\d{4,9}\\/[^,]+$", "title": "Digital Object Identifier (DOI) of the publication.", - "description": "A valid DOI code (not as an URL). Must match a AncientMetagenomeDir samples table entry", - "examples": [ - "10.1038/ng.2906" - ] + "description": "A valid DOI code (not as an URL). Must match an AncientMetagenomeDir samples table entry", + "examples": ["10.1038/ng.2906"] }, "sample_name": { "$id": "#/items/properties/sample_name", "type": "string", "title": "Name of the sample", - "description": "In most cases this should be the name of the host individual. Must match a AncientMetagenomeDir samples table entry", - "examples": [ - "B61" - ] + "description": "In most cases this should be the name of the host individual. Must match an AncientMetagenomeDir samples table entry", + "examples": ["B61"] }, "archive": { "$id": "#/items/properties/archive", @@ -82,29 +72,22 @@ "title": "Archiving platform", "description": "Name of the nucleotide data archiving platform", "$ref": "https://spaam-community.github.io/AncientMetagenomeDir/assets/enums/archive.json", - "examples": [ - "ENA" - ] + "examples": ["ENA"] }, "archive_project": { "$id": "#/items/properties/archive_project", "type": "string", "title": "Archive project accession platform", - "description": "Name of the nucleotide data archiving platform. 
Must match a AncientMetagenomeDir samples table entry", - "examples": [ - "PRJNA438985", - "mgp13354" - ] + "description": "Name of the nucleotide data archiving platform. Must match an AncientMetagenomeDir samples table entry", + "examples": ["PRJNA438985", "mgp13354"] }, "archive_sample_accession": { "$id": "#/items/properties/archive_sample_accession", "type": "string", "pattern": "^[\\S]+$", "title": "Archive accession number", - "description": "Samples archive accession numbers, multiple records can be separated with commas. No spaces allowed. . Must match a AncientMetagenomeDir samples table entry", - "examples": [ - "SRS473742,SRS473743,SRS473744,SRS473745" - ] + "description": "Samples archive accession numbers, multiple records can be separated with commas. No spaces allowed. . Must match an AncientMetagenomeDir samples table entry", + "examples": ["SRS473742,SRS473743,SRS473744,SRS473745"] }, "library_name": { "$id": "#/items/properties/library_name", @@ -112,11 +95,7 @@ "pattern": "^[\\S]+$", "title": "Name of the sequencing library", "description": "Name of the sequencing library generated from the sample. Typically matches the corresponding FASTQ file name", - "examples": [ - "ElSidron1_12056", - "DRT001.A0301", - "L1" - ] + "examples": ["ElSidron1_12056", "DRT001.A0301", "L1"] }, "strand_type": { "$id": "#/items/properties/strand_type", @@ -125,10 +104,7 @@ "title": "Strandedness of library", "description": "Strandedness of the DNA in the library, i.e. 
whether single or double stranded.", "$ref": "https://spaam-community.github.io/AncientMetagenomeDir/assets/enums/strand_type.json", - "examples": [ - "single", - "double" - ] + "examples": ["single", "double"] }, "library_polymerase": { "$id": "#/items/properties/library_polymerase", @@ -145,36 +121,20 @@ }, "library_treatment": { "$id": "#/items/properties/library_treatment", - "type": [ - "string", - "null" - ], + "type": ["string", "null"], "pattern": "^[\\S]+$", "title": "Name of damage-removal treatment", "description": "Name of any established damage-removal treatment that may have been performed.", "$ref": "https://spaam-community.github.io/AncientMetagenomeDir/assets/enums/library_treatment.json", - "examples": [ - "none", - "half-udg", - "full-udg" - ] + "examples": ["none", "half-udg", "full-udg"] }, "library_concentration": { "$id": "#/items/properties/library_concentration", - "type": [ - "integer", - "null" - ], + "type": ["integer", "null"], "minimum": 0, "title": "Number of pre-amplification DNA copies of library per microlitre", "description": "Number of pre-amplification DNA copies of library per microlitre as measured by qPCR. Can be used for contamination estimation against blanks", - "examples": [ - "NA", - 0, - 9064000, - 15730000, - 1331500000 - ] + "examples": ["NA", 0, 9064000, 15730000, 1331500000] }, "instrument_model": { "$id": "#/items/properties/instrument_model", @@ -196,10 +156,7 @@ "title": "Sequencing layout of library", "description": "Sequencing layout of library, i.e. 
either single or paired end, based on ENA controlled vocabulary (https://www.ebi.ac.uk/ena/portal/api/controlledVocab?field=library_layout", "$ref": "https://spaam-community.github.io/AncientMetagenomeDir/assets/enums/library_layout.json", - "examples": [ - "PAIRED", - "SINGLE" - ] + "examples": ["PAIRED", "SINGLE"] }, "library_strategy": { "$id": "#/items/properties/library_strategy", @@ -207,27 +164,14 @@ "title": "Type of sequencing library", "description": "Type of sequencing library. Whether shotgun sequenced (WGS) or enriched using 'Target-Capture' protocols and similar.", "$ref": "https://spaam-community.github.io/AncientMetagenomeDir/assets/enums/library_strategy.json", - "examples": [ - "WGS", - "Target-Capture", - "depletion", - "Unknown" - ] + "examples": ["WGS", "Target-Capture", "depletion", "Unknown"] }, "read_count": { "$id": "#/items/properties/read_count", - "type": [ - "integer", - "null" - ], + "type": ["integer", "null"], "title": "Number of reads or pairs in library", "description": "Number of reads or pairs in library, i.e. how deep sequenced. For paired end, count pairs (should be same number for both directions)", - "examples": [ - "NA", - 10000, - 6900000, - 123982 - ] + "examples": ["NA", 10000, 6900000, 123982] }, "archive_data_accession": { "$id": "#/items/properties/archive_data_accession", @@ -235,12 +179,7 @@ "pattern": "^[\\S]+$", "title": "Archive run accession number", "description": "Run archive accession numbers, single run accession per line. 
TODO: improve regex", - "examples": [ - "SRR13263119", - "SRR13263120", - "ERR3003613", - "ERR3003661" - ] + "examples": ["SRR13263119", "SRR13263120", "ERR3003613", "ERR3003661"] }, "download_links": { "$id": "#/items/properties/download_links", @@ -266,18 +205,12 @@ }, "download_sizes": { "$id": "#/items/properties/download_sizes", - "type": [ - "string", - "null" - ], + "type": ["string", "null"], "pattern": "^[0-9]+;[0-9]+;[0-9]+$|^[0-9]+;[0-9]+$|^[0-9]+$", "title": "File sizes of downloads", "description": "File sizes of corresponding download files in bytes, to allow calculation of HDD usage. Can be semi-colon separated list for paired end. TODO: improve regex", - "examples": [ - "126181389", - "614385694;622383780" - ] + "examples": ["126181389", "614385694;622383780"] } } } -} \ No newline at end of file +} diff --git a/tests/data/libraries_test_convert.tsv b/tests/data/libraries_test_convert.tsv new file mode 100644 index 0000000..dc13a87 --- /dev/null +++ b/tests/data/libraries_test_convert.tsv @@ -0,0 +1,3 @@ +project_name publication_year data_publication_doi sample_name archive archive_project archive_sample_accession library_name strand_type library_polymerase library_treatment library_concentration instrument_model library_layout library_strategy read_count archive_data_accession download_links download_md5s download_sizes +Warinner2014 2014 10.1038/ng.2906 B61 SRA PRJNA216965 SRS473742 S1-Shot-B61-calc double Phusion HS II DNA none Illumina HiSeq 2000 SINGLE WGS 13228381 SRR957738 ftp.sra.ebi.ac.uk/vol1/fastq/SRR957/SRR957738/SRR957738.fastq.gz 9c40c43b5d455e760ae8db924347f0b2 953396663 +Weyrich2017 2017 10.1038/nature21674 ElSidron1 SRA PRJNA685265 SRS7890498 ElSidron1_12056 double Unknown none Illumina HiSeq 2500 PAIRED WGS 53186534 SRR13263123 ftp.sra.ebi.ac.uk/vol1/fastq/SRR132/023/SRR13263123/SRR13263123_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/SRR132/023/SRR13263123/SRR13263123_2.fastq.gz 
e70fb68658e6e66a60ae4d289666bc03;15a9cbc5c9a1234978c220df8e91c4e7 3533496739;4013394642 diff --git a/tests/data/samples_test_convert.tsv b/tests/data/samples_test_convert.tsv new file mode 100644 index 0000000..7e86465 --- /dev/null +++ b/tests/data/samples_test_convert.tsv @@ -0,0 +1,3 @@ +project_name publication_year publication_doi site_name latitude longitude geo_loc_name sample_name sample_host sample_age sample_age_doi community_type material archive archive_project archive_accession +Warinner2014 2014 10.1038/ng.2906 Dalheim 51.565 8.84 Germany B61 Homo sapiens 900 10.1038/ng.2906 oral dental calculus SRA PRJNA216965 SRS473742,SRS473743,SRS473744,SRS473745 +Weyrich2017 2017 10.1038/nature21674 El Sidrón Cave 43.386 -5.328 Spain ElSidron1 Homo sapiens neanderthalensis 49000 10.1038/nature21674 oral dental calculus SRA PRJNA685265 SRS7890498 diff --git a/tests/test_convert.py b/tests/test_convert.py index 49c13ec..8fa93f7 100644 --- a/tests/test_convert.py +++ b/tests/test_convert.py @@ -6,11 +6,28 @@ from AMDirT.convert import run_convert -def test_convert(test_data_dir): +def test_convert_only_sample_df(test_data_dir): assert ( run_convert( - samples=os.path.join(test_data_dir, "data_test_convert.tsv"), - table_name="ancientmetagenome-environmental", + samples=os.path.join(test_data_dir, "samples_test_convert.tsv"), + libraries=None, + table_name="ancientmetagenome-hostassociated", + eager=True, + fetchngs=True, + ameta=True, + output="test_files", + ) + is None + ) + shutil.rmtree("test_files") + + +def test_convert_only_libraries_df(test_data_dir): + assert ( + run_convert( + samples=os.path.join(test_data_dir, "samples_test_convert.tsv"), + libraries=os.path.join(test_data_dir, "libraries_test_convert.tsv"), + table_name="ancientmetagenome-hostassociated", eager=True, fetchngs=True, ameta=True, diff --git a/tests/test_core.py b/tests/test_core.py index 00d781a..34c71ab 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -14,7 +14,7 @@ def 
test_get_colour_chemistry(): def test_doi2bib(): - assert doi2bib("10.1038/nature14236")[:18] == "@article{Mnih_2015" + assert doi2bib("10.1038/nature14236")[:18] == " @article{Mnih_201" def test_ena_portal_status(): diff --git a/tests/test_download.py b/tests/test_download.py new file mode 100644 index 0000000..5923212 --- /dev/null +++ b/tests/test_download.py @@ -0,0 +1,10 @@ +from AMDirT.download import download + + +def test_download(): + table = "ancientmetagenome-hostassociated" + table_type = "samples" + release = "v23.12.0" + + d = download(table, table_type, release, output=".") + assert d == "ancientmetagenome-hostassociated_samples_v23.12.0.tsv"