From f1ae0cd7af2e06c117c5177635f915ef7e6c831a Mon Sep 17 00:00:00 2001 From: Thomas Cokelaer Date: Wed, 29 Jun 2022 21:44:12 +0200 Subject: [PATCH 1/5] Fixes #217 (kegg new keywords) --- src/bioservices/kegg.py | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/src/bioservices/kegg.py b/src/bioservices/kegg.py index 8d17969..b053a2e 100644 --- a/src/bioservices/kegg.py +++ b/src/bioservices/kegg.py @@ -1354,29 +1354,30 @@ def _parse(self, res): output[key] = data # strip only: expecting a single line (string) elif key in [ - "POSITION", + "ANNOTATION", + "CATEGORY", + "CLASS", + "COMPOSITION", + "CREATED", + "DATA_SOURCE", + "DEFINITION", "DESCRIPTION", "ENTRY", - "ORGANISM", - "CLASS", - "SYMBOL", + "EFFICACY", + "EQUATION", "FORMULA", "KEYWORDS", - "CATEGORY", - "ANNOTATION", - "DATA_SOURCE", + "HISTORY", "MASS", - "COMPOSITION", - "STRUCTURE", + "ORGANISM", "ORG_CODE", - "CREATED", - "DEFINITION", + "POSITION", + "RCLASS", "KO_PATHWAY", - "EQUATION", + "SYMBOL", + "STRUCTURE", "TYPE", - "RCLASS", "SYSNAME", - "HISTORY", "REL_PATHWAY", ]: # get rid of \n From 9079a1ab97ed7f7b3274da537c83247659a4d826 Mon Sep 17 00:00:00 2001 From: Thomas Cokelaer Date: Fri, 1 Jul 2022 15:09:56 +0200 Subject: [PATCH 2/5] update bioservices uniprot using new API --- src/bioservices/uniprot.py | 978 ++++++++++++++++--------------- test/webservices/test_uniprot.py | 50 +- 2 files changed, 521 insertions(+), 507 deletions(-) diff --git a/src/bioservices/uniprot.py b/src/bioservices/uniprot.py index 1cddbbe..b6797ba 100644 --- a/src/bioservices/uniprot.py +++ b/src/bioservices/uniprot.py @@ -41,410 +41,416 @@ """ import io import sys +import time +import urllib +import json + +import pandas as pd from bioservices.services import REST from bioservices import logger logger.name = __name__ -try: - import pandas as pd -except: - pass -__all__ = ["UniProt"] -# TODO:: falt files to get list of identifiers -# http://www.ebi.ac.uk/uniprot/database/download.html -# grep sp uniprot_sprot.fasta | grep HUMAN | awk '{print substr($1, 12, length($1))}' - -mapping = { - "UniProtKB AC/ID": "ACC+ID", - "UniProtKB": "ACC", - "UniProtKB": "ID", - "UniParc": "UPARC", - "UniRef50": "NF50", - "UniRef90": "NF90", - "UniRef100": "NF100", - "EMBL/GenBank/DDBJ": "EMBL_ID", - "EMBL/GenBank/DDBJ CDS": "EMBL", - "PIR": "PIR", - "UniGene": "UNIGENE_ID", - "Entrez Gene (GeneID)": "P_ENTREZGENEID", - "GI number*": "P_GI", - "IPI": "P_IPI", - "RefSeq Protein": "P_REFSEQ_AC", - "RefSeq Nucleotide": "REFSEQ_NT_ID", - "PDB": "PDB_ID", - "DisProt": "DISPROT_ID", - "HSSP": "HSSP_ID", - "DIP": "DIP_ID", - "MINT": "MINT_ID", - "Allergome": "ALLERGOME_ID", - "MEROPS": "MEROPS_ID", - "mycoCLAP": "MYCOCLAP_ID", - "PeroxiBase": "PEROXIBASE_ID", - "PptaseDB": "PPTASEDB_ID", - "REBASE": "REBASE_ID", - "TCDB": "TCDB_ID", - "PhosSite": "PHOSSITE_ID", - "DMDM": "DMDM_ID", - "Aarhus/Ghent-2DPAGE": "AARHUS_GHENT_2DPAGE_ID", - "World-2DPAGE": "WORLD_2DPAGE_ID", - "DNASU": "DNASU_ID", - "Ensembl": "ENSEMBL_ID", - "Ensembl Protein": "ENSEMBL_PRO_ID", - "Ensembl Transcript": "ENSEMBL_TRS_ID", - "Ensembl Genomes": "ENSEMBLGENOME_ID", - "Ensembl Genomes Protein": "ENSEMBLGENOME_PRO_ID", - "Ensembl Genomes Transcript": "ENSEMBLGENOME_TRS_ID", - "GeneID": "P_ENTREZGENEID", - "GenomeReviews": "GENOMEREVIEWS_ID", - "KEGG": "KEGG_ID", - "PATRIC": "PATRIC_ID", - "UCSC": "UCSC_ID", - "VectorBase": "VECTORBASE_ID", - "AGD": "AGD_ID", - "ArachnoServer": "ARACHNOSERVER_ID", - "CGD": "CGD", - "ConoServer": "CONOSERVER_ID", - 
"CYGD": "CYGD_ID", - "dictyBase": "DICTYBASE_ID", - "EchoBASE": "ECHOBASE_ID", - "EcoGene": "ECOGENE_ID", - "euHCVdb": "EUHCVDB_ID", - "EuPathDB": "EUPATHDB_ID", - "FlyBase": "FLYBASE_ID", - "GeneCards": "GENECARDS_ID", - "GeneFarm": "GENEFARM_ID", - "GenoList": "GENOLIST_ID", - "H-InvDB": "H_INVDB_ID", - "HGNC": "HGNC_ID", - "HPA": "HPA_ID", - "LegioList": "LEGIOLIST_ID", - "Leproma": "LEPROMA_ID", - "MaizeGDB": "MAIZEGDB_ID", - "MIM": "MIM_ID", - "MGI": "MGI_ID", - "neXtProt": "NEXTPROT_ID", - "Orphanet": "ORPHANET_ID", - "PharmGKB": "PHARMGKB_ID", - "PomBase": "POMBASE_ID", - "PseudoCAP": "PSEUDOCAP_ID", - "RGD": "RGD_ID", - "SGD": "SGD_ID", - "TAIR": "TAIR_ID", - "TubercuList": "TUBERCULIST_ID", - "WormBase": "WORMBASE_ID", - "WormBase Transcript": "WORMBASE_TRS_ID", - "WormBase Protein": "WORMBASE_PRO_ID", - "Xenbase": "XENBASE_ID", - "ZFIN": "ZFIN_ID", - "eggNOG": "EGGNOG_ID", - "GeneTree": "GENETREE_ID", - "HOGENOM": "HOGENOM_ID", - "HOVERGEN": "HOVERGEN_ID", - "KO": "KO_ID", - "OMA": "OMA_ID", - "OrthoDB": "ORTHODB_ID", - "ProtClustDB": "PROTCLUSTDB_ID", - "BioCyc": "BIOCYC_ID", - "Reactome": "REACTOME_ID", - "UniPathWay": "UNIPATHWAY_ID", - "CleanEx": "CLEANEX_ID", - "GermOnline": "GERMONLINE_ID", - "ChEMBL": "CHEMBL_ID", - "ChiTaRS": "CHITARS_ID", - "DrugBank": "DRUGBANK_ID", - "GenomeRNAi": "GENOMERNAI_ID", - "NextBio": "NEXTBIO_ID", -} +__all__ = ["UniProt"] class UniProt: """Interface to the `UniProt `_ service - .. rubric:: Identifiers mapping between databases: - :: >>> from bioservices import UniProt >>> u = UniProt(verbose=False) - >>> u.mapping("ACC", "KEGG_ID", query='P43403') + >>> u.mapping("UniProtKB_AC-ID", "KEGG", query='P43403') defaultdict(, {'P43403': ['hsa:7535']}) >>> res = u.search("P43403") # Returns sequence on the ZAP70_HUMAN accession Id >>> sequence = u.search("ZAP70_HUMAN", columns="sequence") + + .. versionchanged:: 1.10 + + Uniprot update its service in June 2022. Changes were made in the bioservices + API with small changes. User API is more or less the same. Main issues that may + be faced are related to change of output column names. Please see the + :attr:`_legacy_names` for corresponding changes. + + Some notes about searches. The *and* and *or* are now upper cases. 
+ The *organism* and *taxonomy* fields are now *organism_id* and *taxonomy_id* + + """ - _mapping = mapping.copy() - _url = "https://www.uniprot.org" - # _valid_columns = ['citation', 'clusters', 'comments', 'database', - # 'domains', 'domain', 'ec', 'id', 'entry name', 'existence', - # 'families', 'feature', 'features', 'genes', 'go', 'go-id', 'interpro', - # 'interactor', 'keywords', 'keyword-id', 'last-modified', - # 'length', 'organism', 'organism-id', 'pathway', 'protein names', - # 'reviewed', 'score', 'sequence', '3d', 'subcellular locations', - # 'taxonomy', 'tools', 'version', 'virus hosts', 'lineage-id', - # 'sequence-modified', 'proteome'] + # June 2022, API changes and these labels changed: + _legacy_names = { + 'id': 'accession', + 'entry name': 'id', + 'genes': 'gene_names', + 'genes(PREFERRED)': 'gene_primary', + 'genes(ALTERNATIVE)': 'gene_synonym', + 'genes(OLN)': 'gene_oln', + 'genes(ORF)': 'gene_orf', + 'organism': 'organism_name', + 'organism-id': 'organism_id', + 'protein names': 'protein_name', + 'proteome': 'xref_proteomes', + 'lineage(ALL)': 'lineage', + 'virus hosts': 'virus_hosts', + + 'comment(ALTERNATIVE PRODUCTS)': 'cc_alternative_products', + 'feature(ALTERNATIVE SEQUENCE)': 'ft_var_seq', + 'comment(ERRONEOUS GENE MODEL PREDICTION)': 'error_gmodel_pred', + 'fragment': 'fragment', + 'encodedon': 'organelle', + 'length': 'length', + 'mass': 'mass', + 'comment(MASS SPECTROMETRY)': 'cc_mass_spectrometry', + 'feature(NATURAL VARIANT)': 'ft_variant', + 'feature(NON ADJACENT RESIDUES)': 'ft_non_cons', + 'feature(NON STANDARD RESIDUE)': 'ft_non_std', + 'feature(NON TERMINAL RESIDUE)': 'ft_non_ter', + 'comment(POLYMORPHISM)': 'cc_polymorphism', + 'comment(RNA EDITING)': 'cc_rna_editing', + 'sequence': 'sequence', + 'comment(SEQUENCE CAUTION)': 'cc_sequence_caution', + 'feature(SEQUENCE CONFLICT)': 'ft_conflict', + 'feature(SEQUENCE UNCERTAINTY)': 'ft_unsure', + 'version(sequence)': 'sequence_version', + + # function + 'comment(ABSORPTION)': 'absorption', + 'feature(ACTIVE SITE)': 'ft_act_site', + 'comment(ACTIVITY REGULATION)': 'cc_activity_regulation', + 'feature(BINDING SITE)': 'ft_binding', + 'chebi': 'ft_ca_bind', + 'chebi(Catalytic activity)': 'cc_catalytic_activity', + 'chebi(Cofactor)': 'cc_cofactor', + 'feature(DNA BINDING)': 'ft_dna_bind', + 'ec': 'ec', + 'comment(FUNCTION)': 'cc_function', + 'comment(KINETICS)': 'kinetics', + 'feature(METAL BINDING)': 'ft_metal', + 'feature(NP BIND)': 'ft_np_bind', + 'comment(PATHWAY)': 'cc_pathway', + 'comment(PH DEPENDENCE)': 'ph_dependence', + 'comment(REDOX POTENTIAL)': 'redox_potential', + 'rhea-id': 'rhea_id', + 'feature(SITE)': 'ft_site', + 'comment(TEMPERATURE DEPENDENCE)': 'temp_dependence', + + # misc + 'annotation score': 'annotation_score', + 'comment(CAUTION)': 'cc_caution', + 'features': 'feature', + 'keyword-id': 'keywordid', + 'keywords': 'keyword', + 'comment(MISCELLANEOUS)': 'cc_miscellaneous', + 'existence': 'protein_existence', + 'reviewed': 'reviewed', + 'tools': 'tools', + 'uniparcid': 'uniparc_id', + + # Interaction ============================= + "interactor": "cc_interaction", + "comment(SUBUNIT)": "cc_subunit", + + # GO + "go": "go", + "go(biological process)": "go_p", + "go(cellular component)": "go_c", + "go(molecular function)": "go_f", + "go-id": "go_id", + + # Date of + "created": "date_created", + "last-modified": "date_modified", + "sequence-modified": "date_sequence_modified", + "version(entry)": "version", + # STRUCTURE + "3d": "structure_3d", + "feature(BETA STRAND)": "ft_strand", + 
"feature(HELIX)": "ft_helix", + "feature(TURN)": "ft_turn", + + # subcellular function + "comment(SUBCELLULAR LOCATION)":"cc_subcellular_location", + "feature(INTRAMEMBRANE)":"ft_intramem", + "feature(TOPOLOGICAL DOMAIN)":"ft_topo_dom", + "feature(TRANSMEMBRANE)": "ft_transmem", + + # Pathology + 'comment(ALLERGEN)': 'cc_allergen', + 'comment(BIOTECHNOLOGY)': 'cc_biotechnology', + 'comment(DISRUPTION PHENOTYPE)': 'cc_disruption_phenotype', + 'comment(DISEASE)': 'cc_disease', + 'feature(MUTAGENESIS)': 'ft_mutagen', + 'comment(PHARMACEUTICAL)': 'cc_pharmaceutical', + 'comment(TOXIC DOSE)': 'cc_toxic_dose', + + # PTM + 'feature(CHAIN)': 'ft_chain', + 'feature(CROSS LINK)': 'ft_crosslnk', + 'feature(DISULFIDE BOND)': 'ft_disulfid', + 'feature(GLYCOSYLATION)': 'ft_carbohyd', + 'feature(INITIATOR METHIONINE)': 'ft_init_met', + 'feature(LIPIDATION)': 'ft_lipid', + 'feature(MODIFIED RESIDUE)': 'ft_mod_res', + 'feature(PEPTIDE)': 'ft_peptide', + 'comment(PTM)': 'cc_ptm', + 'feature(PROPEPTIDE)': 'ft_propep', + 'feature(SIGNAL)': 'ft_signal', + 'feature(TRANSIT)': 'ft_transit', + + # Family domains + 'feature(COILED COIL)': 'ft_coiled', + 'feature(COMPOSITIONAL BIAS)': 'ft_compbias', + 'comment(DOMAIN)': 'cc_domain', + 'feature(DOMAIN EXTENT)': 'ft_domain', + 'feature(MOTIF)': 'ft_motif', + 'families': 'protein_families', + 'feature(REGION)': 'ft_region', + 'feature(REPEAT)': 'ft_repeat', + 'comment(SIMILARITY)': '', + 'feature(ZINC FINGER)': 'ft_zn_fing', + + } + _valid_columns = [ - # Names & Taxonomy + # Names & Taxonomy ================================================ + "accession", "id", - "entry name", - "genes", - "genes(PREFERRED)", - "genes(ALTERNATIVE)", - "genes(OLN)", - "genes(ORF)", - "organism", - "organism-id", - "protein names", - "proteome", - "lineage(ALL)", - "lineage-id", - "virus hosts", - # Sequences + "gene_names", + "gene_primary", + "gene_synonym", + "gene_oln", + "gene_orf", + "organism_name", + "organism_id", + "protein_name", + "xref_proteomes", + "lineage", + "virus_hosts", + + # Sequences ======================================================== "fragment", "sequence", "length", "mass", - "encodedon", - "comment(ALTERNATIVE PRODUCTS)", - "comment(ERRONEOUS GENE MODEL PREDICTION)", - "comment(ERRONEOUS INITIATION)", - "comment(ERRONEOUS TERMINATION)", - "comment(ERRONEOUS TRANSLATION)", - "comment(FRAMESHIFT)", - "comment(MASS SPECTROMETRY)", - "comment(POLYMORPHISM)", - "comment(RNA EDITING)", - "comment(SEQUENCE CAUTION)", - "feature(ALTERNATIVE SEQUENCE)", - "feature(NATURAL VARIANT)", - "feature(NON ADJACENT RESIDUES)", - "feature(NON STANDARD RESIDUE)", - "feature(NON TERMINAL RESIDUE)", - "feature(SEQUENCE CONFLICT)", - "feature(SEQUENCE UNCERTAINTY)", - "version(sequence)", - # Family and Domains - "domains", - "domain", - "comment(DOMAIN)", - "comment(SIMILARITY)", - "feature(COILED COIL)", - "feature(COMPOSITIONAL BIAS)", - "feature(DOMAIN EXTENT)", - "feature(MOTIF)", - "feature(REGION)", - "feature(REPEAT)", - "feature(ZINC FINGER)", - # Function - "ec", - "comment(ABSORPTION)", - "comment(CATALYTIC ACTIVITY)", - "comment(COFACTOR)", - "comment(ENZYME REGULATION)", - "comment(FUNCTION)", - "comment(KINETICS)", - "comment(PATHWAY)", - "comment(REDOX POTENTIAL)", - "comment(TEMPERATURE DEPENDENCE)", - "comment(PH DEPENDENCE)", - "feature(ACTIVE SITE)", - "feature(BINDING SITE)", - "feature(DNA BINDING)", - "feature(METAL BINDING)", - "feature(NP BIND)", - "feature(SITE)", - # Gene Ontologys + "organelle", + "cc_alternative_products", + "error_gmodel_pred", + 
"cc_mass_spectrometry", + "cc_polymorphism", + "cc_rna_editing", + "cc_sequence_caution", + "ft_var_seq", + "ft_variant", + "ft_non_cons", + "ft_non_std", + "ft_non_ter", + "ft_conflict", + "ft_unsure", + "sequence_version", + + # Family and Domains ======================================== + 'ft_coiled', + 'ft_compbias', + 'cc_domain', + 'ft_domain', + 'ft_motif', + 'protein_families', + 'ft_region', + 'ft_repeat', + 'ft_zn_fing', + + # Function =================================================== + 'absorption', + 'ft_act_site', + 'cc_activity_regulation', + 'ft_binding', + 'ft_ca_bind', + 'cc_catalytic_activity', + 'cc_cofactor', + 'ft_dna_bind', + 'ec', + 'cc_function', + 'kinetics', + 'ft_metal', + 'ft_np_bind', + 'cc_pathway', + 'ph_dependence', + 'redox_potential', + #'rhea_id', + 'ft_site', + 'temp_dependence', + + # Gene Ontology ================================== "go", - "go(biological process)", - "go(molecular function)", - "go(cellular component)", - "go-id", - # InterPro - "interpro", - # Interaction - "interactor", - "comment(SUBUNIT)", + "go_p", + "go_f", + "go_c", + "go_id", + + # Interaction ====================================== + "cc_interaction", + "cc_subunit", + + # EXPRESSION ======================================= + "cc_developmental_stage", + "cc_induction", + "cc_tissue_specificity", + # Publications - "citation", - "citationmapping", + "lit_pubmed_id", + # Date of - "created", - "last-modified", - "sequence-modified", - "version(entry)", + "date_created", + "date_modified", + "date_sequence_modified", + "version", + # Structure - "3d", - "feature(BETA STRAND)", - "feature(HELIX)", - "feature(TURN)", + "structure_3d", + "ft_strand", + "ft_helix", + "ft_turn", + # Subcellular location - "comment(SUBCELLULAR LOCATION)", - "feature(INTRAMEMBRANE)", - "feature(TOPOLOGICAL DOMAIN)", - "feature(TRANSMEMBRANE)", - # Miscellaneous - "annotation score", - "score", - "features", - "comment(CAUTION)", - "comment(TISSUE SPECIFICITY)", - "comment(GENERAL)", - "keywords", - "context", - "existence", + "cc_subcellular_location", + "ft_intramem", + "ft_topo_dom", + "ft_transmem", + + # Miscellaneous ========================== + "annotation_score", + "cc_caution", + "comment_count", + #"feature", + "feature_count", + "keyword", + "keywordid", + "cc_miscellaneous", + "protein_existence", "tools", "reviewed", - "feature", - "families", - "subcellular locations", - "taxonomy", - "version", - "clusters", - "comments", - "database", - "keyword-id", - "pathway", - "score", - # Pathology & Biotech - "comment(ALLERGEN)", - "comment(BIOTECHNOLOGY)", - "comment(DISRUPTION PHENOTYPE)", - "comment(DISEASE)", - "comment(PHARMACEUTICAL)", - "comment(TOXIC DOSE)", + "uniparc_id", + + # Pathology + 'cc_allergen', + 'cc_biotechnology', + 'cc_disruption_phenotype', + 'cc_disease', + 'ft_mutagen', + 'cc_pharmaceutical', + 'cc_toxic_dose', + # PTM / Processsing - "comment(PTM)", - "feature(CHAIN)", - "feature(CROSS LINK)", - "feature(DISULFIDE BOND)", - "feature(GLYCOSYLATION)", - "feature(INITIATOR METHIONINE)", - "feature(LIPIDATION)", - "feature(MODIFIED RESIDUE)", - "feature(PEPTIDE)", - "feature(PROPEPTIDE)", - "feature(SIGNAL)", - "feature(TRANSIT)", - # Taxonomic lineage - "lineage(all)", - "lineage(SUPERKINGDOM)", - "lineage(KINGDOM)", - "lineage(SUBKINGDOM)", - "lineage(SUPERPHYLUM)", - "lineage(PHYLUM)", - "lineage(SUBPHYLUM)", - "lineage(SUPERCLASS)", - "lineage(CLASS)", - "lineage(SUBCLASS)", - "lineage(INFRACLASS)", - "lineage(SUPERORDER)", - "lineage(ORDER)", - "lineage(SUBORDER)", - 
"lineage(INFRAORDER)", - "lineage(PARVORDER)", - "lineage(SUPERFAMILY)", - "lineage(FAMILY)", - "lineage(SUBFAMILY)", - "lineage(TRIBE)", - "lineage(SUBTRIBE)", - "lineage(GENUS)", - "lineage(SUBGENUS)", - "lineage(SPECIES GROUP)", - "lineage(SPECIES SUBGROUP)", - "lineage(SPECIES)", - "lineage(SUBSPECIES)", - "lineage(VARIETAS)", - "lineage(FORMA)", - # Taxonomic identifier - "lineage-id(all)", - "lineage-id(SUPERKINGDOM)", - "lineage-id(KINGDOM)", - "lineage-id(SUBKINGDOM)", - "lineage-id(SUPERPHYLUM)", - "lineage-id(PHYLUM)", - "lineage-id(SUBPHYLUM)", - "lineage-id(SUPERCLASS)", - "lineage-id(CLASS)", - "lineage-id(SUBCLASS)", - "lineage-id(INFRACLASS)", - "lineage-id(SUPERORDER)", - "lineage-id(ORDER)", - "lineage-id(SUBORDER)", - "lineage-id(INFRAORDER)", - "lineage-id(PARVORDER)", - "lineage-id(SUPERFAMILY)", - "lineage-id(FAMILY)", - "lineage-id(SUBFAMILY)", - "lineage-id(TRIBE)", - "lineage-id(SUBTRIBE)", - "lineage-id(GENUS)", - "lineage-id(SUBGENUS)", - "lineage-id(SPECIES GROUP)", - "lineage-id(SPECIES SUBGROUP)", - "lineage-id(SPECIES)", - "lineage-id(SUBSPECIES)", - "lineage-id(VARIETAS)", - "lineage-id(FORMA)", - # Cross-references - "database(db_abbrev)", - "database(EMBL)", + 'ft_chain', + 'ft_crosslnk', + 'ft_disulfid', + 'ft_carbohyd', + 'ft_init_met', + 'ft_lipid', + 'ft_mod_res', + 'ft_peptide', + 'cc_ptm', + 'ft_propep', + 'ft_signal', + 'ft_transit', + + # not documented + 'xref_pdb' ] + _url = "https://rest.uniprot.org" def __init__(self, verbose=False, cache=False): """**Constructor** :param verbose: set to False to prevent informative messages + :param cache: set to True to cache request """ - self.services = REST(name="UniProt", url=UniProt._url, verbose=verbose, cache=cache) + self.services = REST(name="UniProt", url=UniProt._url, verbose=verbose, cache=cache, url_defined_later=True) self.TIMEOUT = 100 - + self._valid_mapping = None self._database = "uniprot" - def _download_flat_files(self): + def _download_flat_files(self, output="uniprot_sprot.dat.gz"): #pragma: no cover """could be used to get all data in flat files (about compressed 500Mb )""" + # deprecated in v1.10 due to API change in uniprot url = "ftp://ftp.ebi.ac.uk/pub/databases/uniprot/knowledgebase/uniprot_sprot.dat.gz" self.services.logging.info("Downloading uniprot file from the web. May take some time.:") - import urllib - - urllib.urlretrieve(url, "uniprot_sprot.dat.gz") - - def mapping(self, fr="ID", to="KEGG_ID", query="P13368"): + urllib.request.urlretrieve(url, output) + + def _get_valid_mapping(self): + if not self._valid_mapping: + self._set_valid_mapping() + return self._valid_mapping + + def _set_valid_mapping(self): + fields = self.services.http_get("configure/idmapping/fields") + groups = fields["groups"] + rules = {} + for item in fields["rules"]: + ID = item['ruleId'] + rules[ID] = item + + # This is suppose to be a set of database name available in Uniprot + from_to = {} + for item in [x for group in groups for x in group['items']]: + # should be name, not DisplayName + name = item['name'] + if item['from']: + tos = rules[item['ruleId']]['tos'] + from_to[name] = tos + + self._valid_mapping = from_to + + valid_mapping = property(_get_valid_mapping, _set_valid_mapping) + + def mapping(self, fr="UniProtKB_AC-ID", to="KEGG", query="P13368", polling_interval_seconds=3, max_waiting_time=100): """This is an interface to the UniProt mapping service - :param fr: the source database identifier. See :attr:`_mapping`. - :param to: the targetted database identifier. See :attr:`_mapping`. 
+ :param fr: the source database identifier. See :attr:`valid_mapping`. + :param to: the targetted database identifier. See :attr:`valid_mapping`. :param query: a string containing one or more IDs separated by a space It can also be a list of strings. - :param format: The output being a dictionary, this parameter is - deprecated and not used anymore - :return: a list. The first element is the source database Id. The second - is the targetted source identifier. Following elements are alternate - of one the entry and its mapped Id. If a query has several mapped - Ids, the query is repeated (see example with PDB mapping here below) - e.g., ["From:ID", "to:PDB_ID", "P43403"] + :param polling_interval_seconds: the number of seconds between each status check of the current job + :param max_waiting_time: the maximum number of seconds to wait for the final answer. + :return: a dictionary with two possible keys. The first one is 'results' + with the from / to answers and the second one 'failedIds' with Ids that were not found :: - >>> u.mapping("ACC", "KEGG_ID", 'P43403') - defaultdict(, {'P43403': ['hsa:7535']}) - >>> u.mapping("ACC", "KEGG_ID", 'P43403 P00958') - defaultdict(, {'P00958': ['sce:YGR264C'], 'P43403': ['hsa:7535']}) - >>> u.mapping("ID", "PDB_ID", "P43403") - defaultdict(, {'P43403': ['1FBV', '1M61', '1U59', - '2CBL', '2OQ1', '2OZO', '2Y1N', '3ZNI', '4A4B', '4A4C', '4K2R']}) + >>> u.mapping("UniProtKB_AC-ID", "KEGG", 'P43403') + {'results': [{'from': 'P43403', 'to': 'hsa:7535'}]} + + The output is a dictionary. Identifiers that were not found are stored in the keys + 'failedIds'. Succesful queries are stored in the 'results' key that is a list + of dictionaries with two keys set to 'from' and 'to'. The 'from' key should be in your input list. + The 'to' key is the result. Here we have the KEGG identifier recognised by its prefix 'hsa:', which is for human. + Sometimes the output ('to') it is more complicated. Consider the following example:: + + u.mapping("UniParc", "UniProtKB", 'UPI0000000001,UPI0000000002') - There is a web page that gives the list of correct `database identifiers - `_. You can also look at the - :attr:`_mapping` attribute. + You will see that the UniParc results is more complex than just an identifier. - :URL: http://www.uniprot.org/mapping/ + See :attr:`valid_mapping` attribut for list of valid mapping identifiers. + + Note that according to Uniprot (June 2022), there are various limits on ID Mapping Job Submission: + + ========= ===================================================================================== + Limit Details + ========= ===================================================================================== + 100,000 Total number of ids allowed in comma separated param ids in /idmapping/run api + 500,000 Total number of "mapped to" ids allowed + 100,000 Total number of "mapped to" ids allowed to be enriched by UniProt data + 10,000 Total number of "mapped to" ids allowed with filtering + ========= ===================================================================================== .. versionchanged:: 1.1.1 to return a dictionary instaed of a list .. versionchanged:: 1.1.2 the values for each key is now made of a list @@ -453,130 +459,137 @@ def mapping(self, fr="ID", to="KEGG_ID", query="P13368"): instead of just a string .. versionchanged:: 1.3.1:: use http_post instead of http_get. This is 3 times faster and allows queries with more than 600 entries in one go. + .. 
version 1.10.0:: new API due to uniprot website update """ - url = "mapping/" # the slash matters - - query = self.services.devtools.list2string(query, sep=" ", space=False) - # if isinstance(query, list): - # query = " ".join(query) - params = {"from": fr, "to": to, "format": "tab", "query": query} - result = self.services.http_post(url, frmt="txt", data=params) - - # changes in version 1.1.1 returns a dictionary instead of list - try: - result = result.split() - del result[0] - del result[0] - except: - self.services.logging.warning("Results seems empty...returning empty dictionary.") - return {} - - if len(result) == 0: - return {} - else: - # bug fix based on ticket #19 version 1.1.2 - # the default dict set empty list for all keys by default - from collections import defaultdict - - result_dict = defaultdict(list) - keys = result[0::2] - values = result[1::2] - for i, key in enumerate(keys): - result_dict[key].append(values[i]) - return result_dict + if isinstance(query, (list, tuple)): + query = ",".join(query) + elif isinstance(query,str): + pass - def searchUniProtId(self, uniprot_id, frmt="xml"): - self.services.logging.warning("DEPRECATED SINCE VERSION 1.3.1. use retrieve instead") + # First, we call the real mapping request + params = {"from": fr, "to": to, "ids": query} - def retrieve(self, uniprot_id, frmt="xml", database="uniprot"): + job = self.services.http_post("idmapping/run", frmt="json", data=params) + try: + job_id = job['jobId'] + except TypeError: + logger.error(self.services.last_response.content.decode()) + return + + # the job id will tell us about the job status + results = None + waiting_time = 0 + while not results and waiting_time < max_waiting_time: + logger.info("Waiting for {job_id} to complete") + results = self.services.http_get(f"idmapping/status/{job_id}", frmt="json") + if results != 500 and 'results' in results: + return results + else: #pragma: no cover + time.sleep(polling_interval_seconds) + results = None + waiting_time += polling_interval_seconds + + def retrieve(self, uniprot_id, frmt="json", database="uniprot", include=False): """Search for a uniprot ID in UniProtKB database - :param str uniprot: a valid UniProtKB ID or a list of identifiers. + :param str uniprot: a valid UniProtKB ID, or uniref, uniparc or taxonomy. :param str frmt: expected output format amongst xml, txt, fasta, gff, rdf - :return: is a list of identifiers is provided, the output is also a list + :param str database: database name in (uniprot, uniparc, uniref, taxonomy) + :param bool include: include data with RDF format. + :return: if the parameter uniprot_id is string, the output will be a a list of identifiers is provided, the output is also a list otherwise, a string. The content of the string of items in the list depends on the value of **frmt**. :: >>> u = UniProt() - >>> res = u.retrieve("P09958", frmt="xml") - >>> fasta = u.retrieve([u'P29317', u'Q5BKX8', u'Q8TCD6'], frmt='fasta') + >>> res = u.retrieve("P09958", frmt="txt") + >>> fasta = u.retrieve(['P29317', 'Q5BKX8', 'Q8TCD6'], frmt='fasta') >>> print(fasta[0]) - """ - _valid_formats = ["txt", "xml", "rdf", "gff", "fasta"] - self.services.devtools.check_param_in_list(frmt, _valid_formats) - queries = self.services.devtools.to_list(uniprot_id) + .. versionchanged:: 1.10 the xml format is now returned as raw XML. It is not + interpreted anymore. The RDF has now an additional option to include data + from referenced data sets directly in the returned data (set include=True parameter). 
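        For example (an illustrative sketch based on the examples above; a single
        identifier returns one entry, a list of identifiers returns a list)::

            u = UniProt()
            entry = u.retrieve("P09958")               # JSON decoded into a dict
            raw = u.retrieve("P09958", frmt="xml")     # raw XML string, no longer parsed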
+ Default output format is now set to json. + """ + if database == 'uniprot': + if frmt not in ("txt", "xml", "rdf", "gff", "fasta", "json"):#pragma: no cover + self.services.logging.warning("frmt must be set to one of: txt, xml, rdf, gff, fasta, json.") + elif database == 'uniparc': + if frmt not in ( "xml", "rdf", "fasta", 'tsv', 'json'): #pragma: no cover + raise ValueError("frmt must be set to one of: tsv, xml, rdf, gff, fasta, json") + self.services.logging.warning("frmt must be set to one of: txt, xml, rdf, gff, fasta.") + elif database == 'uniref': + if frmt not in ("xml", "rdf", "fasta", 'tsv', 'json'): #pragma: no cover + self.services.logging.warning("frmt must be set to one of: xml, rdf, gff, fasta, json.") + elif database == "taxonomy": + pass + else: #pragma: no cover + self.services.logging.warning("database must be set to uniref, uniparc, uniprot or taxonomy") + + + if isinstance(uniprot_id, str): + queries = uniprot_id.split(",") + else: + queries = uniprot_id + #queries = self.services.devtools.to_list(uniprot_id) + # some magic here not documented on uniprot website...but multiple queries are possible url = [database + "/" + query + "." + frmt for query in queries] - res = self.services.http_get(url, frmt="txt") - if frmt == "xml": - res = [self.services.easyXML(x) for x in res] + + # the frmt=txt here is for the requests, nothing related to the uniprot format + res = self.services.http_get(url, frmt="txt", params={'include':include}) + if frmt == 'json': + for i, x in enumerate(res): + try: + res[i] = json.loads(x) + except: + pass + if isinstance(res, list) and len(res) == 1: res = res[0] return res - def get_fasta(self, id_): + def get_fasta(self, uniprot_id): """Returns FASTA string given a valid identifier + :param str uniprot_id: a valid identifier (e.g. P12345) - .. seealso:: :mod:`bioservices.apps.fasta` for dedicated tools to - manipulate FASTA - """ - from bioservices.apps.fasta import FASTA - - f = FASTA() - f.load_fasta(id_) - return f.fasta - - def get_fasta_sequence(self, id_): - """Returns FASTA sequence (Not FASTA) - - :param str id_: Should be the entry name - :return: returns fasta sequence (string) - - .. warning:: this is the sequence found in a fasta file, not the fasta - content itself. The difference is that the header is removed and the - formatting of end of lines every 60 characters is removed. + This is just an alias to :meth:`retrieve` when setting the format to 'fasta'. + Method kept for legacy. """ - from bioservices.apps.fasta import FASTA - - f = FASTA() - f.load_fasta(id_) - return f.sequence + res = self.retrieve(uniprot_id, frmt='fasta') + return res def search( self, query, - frmt="tab", + frmt="tsv", columns=None, - include=False, + include_isoforms=False, sort="score", compress=False, limit=None, offset=None, maxTrials=10, - database="uniprot", + database="uniprotkb", ): """Provide some interface to the uniprot search interface. :param str query: query must be a valid uniprot query. - See http://www.uniprot.org/help/text-search, http://www.uniprot.org/help/query-fields - See also example below + See https://www.uniprot.org/help/query-fields and examples below :param str frmt: a valid format amongst html, tab, xls, asta, gff, txt, xml, rdf, list, rss. If tab or xls, you can also provide the columns argument. (default is tab) :param str columns: comma-separated list of values. Works only if fomat - is tab or xls. For UnitProtKB, some possible columns are: - id, entry name, length, organism. 
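        Legacy column names can be translated with the class-level
        :attr:`_legacy_names` mapping (an illustrative sketch, not part of the
        public API)::

            legacy = "entry name,length,genes"
            fields = ",".join(UniProt._legacy_names.get(c.strip(), c.strip())
                              for c in legacy.split(","))
            # fields == "id,length,gene_names"
            u.search("ZAP70_HUMAN", frmt="tsv", limit=1, columns=fields)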
Some column name must be followed by - database name (e.g., "database(PDB)"). Again, see uniprot website - for more details. See also :attr:`~bioservices.uniprot.UniProt._valid_columns` - for the full list of column keyword. - :param bool include: include isoform sequences when the frmt + is tsv or xls. For UnitProtKB, some possible columns are: + id, entry name, length, organism. + See also :attr:`~bioservices.uniprot.UniProt.valid_mapping` + for the full list of column keywords. + :param bool include_isoform: include isoform sequences when the frmt parameter is fasta. Include description when frmt is rdf. :param str sort: by score by default. Set to None to bypass this behaviour :param bool compress: gzip the results @@ -589,9 +602,9 @@ def search( To obtain the list of uniprot ID returned by the search of zap70 can be retrieved as follows:: - >>> u.search('zap70+AND+organism:9606', frmt='list') - >>> u.search("zap70+and+taxonomy:9606", frmt="tab", limit=3, - ... columns="entry name,length,id, genes") + >>> u.search('zap70+AND+organism_id:9606') + >>> u.search("zap70+AND+taxonomy_id:9606", frmt="tsv", limit=3, + ... columns="entry_name,length,id, gene_names") Entry name Length Entry Gene names CBLB_HUMAN 982 Q13191 CBLB RNF56 Nbla00127 CBL_HUMAN 906 P22681 CBL CBL2 RNF55 @@ -599,24 +612,24 @@ def search( other examples:: - >>> u.search("ZAP70+AND+organism:9606", limit=3, columns="id,database(PDB)") + >>> u.search("ZAP70+AND+organism_id:9606", limit=3, columns="id,xref_pdb") You can also do a search on several keywords. This is especially useful if you have a list of known entry names.:: - >>> u.search("ZAP70_HUMAN+or+CBL_HUMAN", frmt="tab", limit=3, + >>> u.search("ZAP70_HUMAN+OR+CBL_HUMAN", frmt="tsv", limit=3, ... columns="entry name,length,id, genes") Entry name Length Entry Gene names - Finally, note that when search for a query, you may have several hits:: + Finally, note that when you search for a query, you may have several hits:: >>> u.search("P12345) - including the ID P12345 but also related entries. If you need only the - entry that perfectly match the query, use:: + including the ID P12345 but also related entries. If you + need only the entry that perfectly match the query, use:: - >>> u.search("id:P12345") + >>> u.search("accession:P12345") This was provided from a user issue that was solved here: https://github.com/cokelaer/bioservices/issues/122 @@ -625,16 +638,24 @@ def search( .. warning:: some columns although valid may not return anything, not even in the header: 'score', 'taxonomy', 'tools'. this is a uniprot feature, not bioservices. + + .. 
versionchanged:: 1.10 + + Due to uniprot API changes in June 2022: + + * parameter 'include' is not named 'include_isoform + * default parameter 'tab' is now 'tsv' but does not change the results + """ params = {} if frmt is not None: _valid_formats = [ - "tab", "xls", "fasta", "gff", "txt", + "tsv", "xml", "rss", "list", @@ -645,7 +666,7 @@ def search( params["format"] = frmt if columns is not None: - self.services.devtools.check_param_in_list(frmt, ["tab", "xls"]) + self.services.devtools.check_param_in_list(frmt, ["tsv", "xls"]) # remove unneeded spaces before/after commas if any if "," in columns: @@ -653,17 +674,12 @@ def search( else: columns = [columns] - for col in columns: - if col.startswith("database(") is True: - pass - else: - self.services.devtools.check_param_in_list(col, self._valid_columns) # convert back to a string as expected by uniprot - params["columns"] = ",".join([x.strip() for x in columns]) + params["fields"] = ",".join([x.strip() for x in columns]) - if include is True and frmt in ["fasta", "rdf"]: - params["include"] = "yes" + if include_isoforms is True and frmt in ["fasta", "rdf"]: + params["includeIsoform"] = "yes" if compress is True: params["compress"] = "yes" @@ -673,31 +689,35 @@ def search( params["sort"] = sort if offset is not None: - if isinstance(offset, int): - params["offset"] = offset + #if isinstance(offset, int): + params["cursor"] = offset if limit is not None: if isinstance(limit, int): - params["limit"] = limit + params["size"] = limit + + # + are interpreted and have a meaning. See arrayexpress module for details - # + are interpreted and have a meaning. - params["query"] = query.replace("+", " ") - # res = s.request("/uniprot/?query=zap70+AND+organism:9606&format=xml", params) - res = self.services.http_get(database + "/", frmt="txt", params=params) + query = query.replace("+", " ") + params['query'] = query + del params['sort'] + + res = self.services.http_get(f"{database}/search", frmt="txt", params=params) return res - def quick_search(self, query, include=False, sort="score", limit=None): + def quick_search(self, query, include_isoforms=False, sort="score", limit=None): """a specialised version of :meth:`search` This is equivalent to:: u = uniprot.UniProt() - u.search(query, frmt="tab", include=False, sor="score", limit=None) + u.search(query, frmt='tsv', sort="score", limit=None) :returns: a dictionary. """ - res = self.search(query, "tab", include=include, sort=sort, limit=limit) + res = self.search(query, "tsv", include_isoforms=include_isoforms, + sort=sort, limit=limit) # if empty result, nothing to do if res and len(res) == 0: @@ -722,33 +742,23 @@ def quick_search(self, query, include=False, sort="score", limit=None): def uniref(self, query): """Calls UniRef service - :return: if you have Pandas installed, returns a dataframe (see example) - + This is an alias to :meth:`retrieve` :: >>> u = UniProt() - >>> df = u.uniref("member:Q03063") # of just A03063 - >>> df.Size + >>> u.uniref("Q03063") Another example from https://github.com/cokelaer/bioservices/issues/121 is the combination of uniprot and uniref filters:: - u.uniref("uniprot:(ec:1.1.1.282 taxonomy:bacteria reviewed:yes) AND identity:0.5") + u.uniref("uniprot:(ec:1.1.1.282 taxonomy_name:bacteria reviewed:true)") + .. versionchanged:: 1.10 due to uniprot API changes in June 2022, + we now return a json instead of a pandas dataframe. 
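        If a dataframe is still needed, it can be rebuilt from the JSON, for
        instance (an illustrative sketch; key names such as 'goTerms' depend on
        the UniRef entry, see the unit tests)::

            import pandas as pd

            res = u.uniref("Q03063")                   # dict decoded from JSON
            df = pd.json_normalize(res.get("goTerms", []))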
""" - try: - import pandas as pd - except: - print("uniref method requires Pandas", file=sys.stderr) - return res = self.services.http_get( - "uniref/", params={"query": query, "format": "tab"}, frmt="txt" + f"uniref/UniRef90_{query}.json", frmt="json" ) - try: - # python 2.X - res = pd.read_csv(io.StringIO(unicode(res)), sep="\t") - except: - res = pd.read_csv(io.StringIO(str(res.strip())), sep="\t") return res def get_df(self, entries, nChunk=100, organism=None, limit=10): @@ -762,10 +772,6 @@ def get_df(self, entries, nChunk=100, organism=None, limit=10): set it to None to keep all entries but this will be very slow :return: dataframe with indices being the uniprot id (e.g. DIG1_YEAST) - .. todo:: cleanup the content of the data frame to replace strings - separated by ; into a list of strings. e.g. the Gene Ontology IDs - - .. warning:: requires pandas library """ if isinstance(entries, str): entries = [entries] @@ -786,18 +792,19 @@ def get_df(self, entries, nChunk=100, organism=None, limit=10): this_entries = entries[i * nChunk : (i + 1) * nChunk] if len(this_entries): self.services.logging.info("uniprot.get_df {}/{}".format(i + 1, N)) - query = "+or+".join(this_entries) + query = "+OR+".join(this_entries) if organism: - query += "+and+" + organism + query += f"+AND+{organism}" res = self.search( query, - frmt="tab", + frmt="tsv", columns=",".join(self._valid_columns), limit=limit, ) else: break + if len(res) == 0: self.services.logging.warning("some entries %s not found" % entries) else: @@ -806,6 +813,7 @@ def get_df(self, entries, nChunk=100, organism=None, limit=10): df = pd.read_csv(io.StringIO(unicode(res)), sep="\t") except: df = pd.read_csv(io.StringIO(str(res)), sep="\t") + if isinstance(output, type(None)): output = df.copy() else: @@ -813,22 +821,15 @@ def get_df(self, entries, nChunk=100, organism=None, limit=10): # you may end up with duplicated... output.drop_duplicates(inplace=True) - # you may have new entries... - # output = output[output.Entry.apply(lambda x: x in entries)] - # to transform into list: columns = [ - "PubMed ID", - "Comments", - u"Domains", - "Protein families", + "lit_pubmed_id", + "protein_families", "Gene names", - "Gene ontology (GO)", - "Gene ontology IDs", - "InterPro", - "Interacts with", - "Keywords", + "go", + "go_ids", + "interaction", + "keyword", ] - #'Subcellular location'] for col in columns: try: res = output[col].apply( @@ -841,7 +842,8 @@ def get_df(self, entries, nChunk=100, organism=None, limit=10): self.services.logging.warning("column could not be parsed. %s" % col) # Sequences are splitted into chunks of 10 characters. 
let us rmeove # the spaces: - output["Sequence"].fillna("", inplace=True) - output.Sequence = output["Sequence"].apply(lambda x: x.replace(" ", "")) + if "sequence" in output.columns: + output["sequence"].fillna("", inplace=True) + output.Sequence = output["sequence"].apply(lambda x: x.replace(" ", "")) return output diff --git a/test/webservices/test_uniprot.py b/test/webservices/test_uniprot.py index 03e8309..920cb76 100644 --- a/test/webservices/test_uniprot.py +++ b/test/webservices/test_uniprot.py @@ -9,43 +9,55 @@ def uniprot(): def test_mapping(uniprot): - res = uniprot.mapping(fr="ACC+ID", to="KEGG_ID", query='P43403') - assert res['P43403'] == ['hsa:7535'] + + assert "KEGG" in uniprot.valid_mapping + res = uniprot.mapping("UniProtKB_AC-ID", "KEGG", "P43403,P123456") + res = uniprot.mapping("UniProtKB_AC-ID", "KEGG", ["P43403","P123456"]) + assert len(res['results']) == 1 + assert len(res['failedIds']) == 1 + try: - res = uniprot.mapping(fr="AC", to="KEID", query='P434') + res = uniprot.mapping("UniProtKB_AC-ID", "KEGGDUMMY", "P43403,P123456") assert False except: assert True def test_retrieve(uniprot): - uniprot.retrieve("P09958", frmt="rdf") - uniprot.retrieve("P09958", frmt="xml") - uniprot.retrieve("P09958", frmt="txt") - uniprot.retrieve("P09958", frmt="fasta") - uniprot.retrieve("P09958", frmt="gff") - try: - uniprot.retrieve("P09958", frmt="dummy") - assert False - except: - assert True + + for frmt in ['rdf', 'xml', 'txt', 'fasta', 'gff', 'json']: + uniprot.retrieve("P09958", frmt=frmt) + + # test input parameters + assert uniprot.retrieve("P09958", frmt='json') == uniprot.retrieve(['P09958'], frmt='json') + + for frmt in ['rdf', 'xml', 'txt', 'fasta', 'gff', 'json']: + uniprot.retrieve("P09958", frmt=frmt, database='uniref') + + assert uniprot.retrieve("P09958", frmt='json', database='dummy') in [400, 404] + def test_search(uniprot): + # two strings, or list, or a single string + uniprot.search("P43403", columns="id") + + uniprot.search('zap70+AND+organism:9606', frmt='list') - uniprot.search("zap70+and+taxonomy:9606", frmt="tab", limit=3, + uniprot.search("zap70+and+taxonomy:9606", frmt="tsv", limit=3, columns="entry name,length,id, genes, genes(PREFERRED), interpro, interactor") - uniprot.search("zap70+and+taxonomy:9606", frmt="tab", limit=3, + uniprot.search("zap70+and+taxonomy:9606", frmt="tsv", limit=3, columns="entry name, go(biological process), comment(FUNCTION), comment(DOMAIN), lineage(all)") - uniprot.search("ZAP70_HUMAN", frmt="tab", columns="sequence", limit=1) + uniprot.search("ZAP70_HUMAN", frmt="tsv", columns="sequence", limit=1) + +def test_quick_search(uniprot): uniprot.quick_search("ZAP70") + uniprot.quick_search("ZAP70","ZAP70") def test_uniref(uniprot): - df = uniprot.uniref("member:Q03063") - df.Size + assert 'goTerms' in uniprot.uniref("Q03063") def test_get_df(uniprot): df = uniprot.get_df(["P43403"]) def test_fasta(uniprot): "Q9Y617" in uniprot.get_fasta(["Q9Y617-1"]) - "Q9Y617" not in uniprot.get_fasta_sequence(["Q9Y617-1"]) From 7808ab011884e702286dc3d9773d808081a7bae3 Mon Sep 17 00:00:00 2001 From: Thomas Cokelaer Date: Fri, 1 Jul 2022 15:12:07 +0200 Subject: [PATCH 3/5] Remove py2.X support --- src/bioservices/uniprot.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/bioservices/uniprot.py b/src/bioservices/uniprot.py index b6797ba..c8847d5 100644 --- a/src/bioservices/uniprot.py +++ b/src/bioservices/uniprot.py @@ -40,7 +40,6 @@ """ import io -import sys import time import urllib import json @@ -808,11 +807,7 @@ 
def get_df(self, entries, nChunk=100, organism=None, limit=10): if len(res) == 0: self.services.logging.warning("some entries %s not found" % entries) else: - try: - # python 2.X - df = pd.read_csv(io.StringIO(unicode(res)), sep="\t") - except: - df = pd.read_csv(io.StringIO(str(res)), sep="\t") + df = pd.read_csv(io.StringIO(str(res)), sep="\t") if isinstance(output, type(None)): output = df.copy() From 8236c2a6313a8ec74b3aa7c30e85d884cef46cd4 Mon Sep 17 00:00:00 2001 From: Thomas Cokelaer Date: Fri, 1 Jul 2022 15:34:23 +0200 Subject: [PATCH 4/5] update metadata --- README.rst | 1 + setup.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index b8d43e6..dc8092d 100644 --- a/README.rst +++ b/README.rst @@ -176,6 +176,7 @@ Changelog ========= ==================================================================== Version Description ========= ==================================================================== +1.10.0 * Update uniprot to use the new API (june 2022) 1.9.0 * Update unichem to reflect new API 1.8.4 * biomodels. Fix #208 * KEGG: fixed #204 #202 and #203 diff --git a/setup.py b/setup.py index 822ac66..76d1d25 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ _MAJOR = 1 -_MINOR = 9 +_MINOR = 10 _MICRO = 0 version = '%d.%d.%d' % (_MAJOR, _MINOR, _MICRO) release = '%d.%d' % (_MAJOR, _MINOR) From 5b3160f08d48820fb10c2bb1edac12d82e0d6442 Mon Sep 17 00:00:00 2001 From: Thomas Cokelaer Date: Fri, 1 Jul 2022 15:54:22 +0200 Subject: [PATCH 5/5] Fix peptides.py to reflect new uniprot api --- src/bioservices/apps/peptides.py | 5 +++-- test/test_apps_peptides.py | 1 - 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/bioservices/apps/peptides.py b/src/bioservices/apps/peptides.py index 7566a12..3fc6aab 100644 --- a/src/bioservices/apps/peptides.py +++ b/src/bioservices/apps/peptides.py @@ -8,7 +8,6 @@ class Peptides(object): :: >>> p = Peptides() - >>> p.get_fasta_sequence("Q8IYB3") >>> p.get_peptide_position("Q8IYB3", "VPKPEPIPEPKEPSPE") 189 @@ -25,9 +24,11 @@ def __init__(self, verbose=False): self.sequences = {} def get_fasta_sequence(self, uniprot_name): - seq = self.u.get_fasta_sequence(uniprot_name) + seq = self.u.get_fasta(uniprot_name) + seq = "".join(seq.split("\n")[1:]) return seq + def get_phosphosite_position(self, uniprot_name, peptide): if uniprot_name not in self.sequences.keys(): seq = self.get_fasta_sequence(uniprot_name) diff --git a/test/test_apps_peptides.py b/test/test_apps_peptides.py index 447528b..c36071d 100644 --- a/test/test_apps_peptides.py +++ b/test/test_apps_peptides.py @@ -6,5 +6,4 @@ def test_peptides(): p = Peptides() pos = p.get_phosphosite_position("Q8IYB3", "VPKPEPIPEPKEPSPE") assert pos == [740, 901] - p.get_fasta_sequence("Q8IYB3")
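
A quick way to sanity-check the updated Peptides helper against the new UniProt
API is to mirror the unit test above (an illustrative sketch; requires network
access)::

    from bioservices.apps.peptides import Peptides

    p = Peptides()
    # get_fasta_sequence() now strips the FASTA header returned by UniProt.get_fasta()
    print(len(p.get_fasta_sequence("Q8IYB3")) > 0)
    print(p.get_phosphosite_position("Q8IYB3", "VPKPEPIPEPKEPSPE"))   # expected: [740, 901]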