diff --git a/README.rst b/README.rst index 9465e01c..c6ad6b5c 100644 --- a/README.rst +++ b/README.rst @@ -20,6 +20,10 @@ BIOSERVICES: access to biological web services programmatically .. image:: https://static.pepy.tech/personalized-badge/bioservices?period=month&units=international_system&left_color=black&right_color=orange&left_text=Downloads :target: https://pepy.tech/project/bioservices +.. image:: https://anaconda.org/conda-forge/bioservices/badges/version.svg + :target: https://anaconda.org/conda-forge/bioservices + + |Codacy-Grade| diff --git a/doc/all_relations.png b/doc/all_relations.png new file mode 100644 index 00000000..15a508a5 Binary files /dev/null and b/doc/all_relations.png differ diff --git a/doc/external_references.rst b/doc/external_references.rst index 88f3e165..fdeb07e4 100644 --- a/doc/external_references.rst +++ b/doc/external_references.rst @@ -5,7 +5,6 @@ References to BioServices on the Web * Galaxy: See the Log Archive at `Galaxy log archive `_ * EBI: See `EBI programming web services `_ * WikiPathways: `references `_ -* GeneProf: `example python `_ * http://www.scoop.it/t/bioinformatique * http://bioinfo-fr.net/bioservices-module-python * http://devbio.eu/?p=resources&presel=bioinfo diff --git a/doc/geneprof_tutorial.rst b/doc/geneprof_tutorial.rst deleted file mode 100644 index 89697f5c..00000000 --- a/doc/geneprof_tutorial.rst +++ /dev/null @@ -1,226 +0,0 @@ -GeneProf tutorial -===================== - -.. topic:: GeneProf tutorial - - .. versionadded:: 1.2.0 - .. sectionauthor:: Thomas Cokelaer, Dec 2013 - -`GeneProf `_ is a web-based, graphical software suite that allows users to analyse data produced using high-throughput sequencing platforms (RNA-seq and ChIP-seq; "Next-Generation Sequencing" or NGS): Next-gen analysis for next-gen data - - -BioServices uses the GeneProf Web Services to enable programmatic access to the public data stored in GeneProf's databases via Python. - -.. note:: GeneProf services is quite versatile and contains many resources and examples. For any technical or scientific questions related to the service itself, please see `GeneProf About&Help `_. - - -Here below you will find a couple of examples related to GeneProf. - - - -Histogram expression data --------------------------------------- -:Reference: https://www.geneprof.org/GeneProf/media/bpsm-2013/ - - -In the example below, we use geneprof to - -#. search for Gene identifiers related to an organism (mouse) and keyword (nanog). -#. From the gene identifiers, retrieve the gene expression values for a given gene in all experiments -#. plot histogram of the log values found above. - - -.. .. plot:: - :include-source: - :width: 80% - -.. note:: broken example on June 2017. Service should be fix soon. Issue - reported to the author. - -:: - - >>> from bioservices import GeneProf - >>> g = GeneProf(verbose=True) - >>> res = g.search_gene_ids("nanog", "mouse") - >>> print(res) - {10090: [29640, 14899]} - >>> expr1 = g.get_expression("mouse", 29640)['values'] - >>> expr2 = g.get_expression("mouse", 14899)['values'] - - >>> import math - >>> values1 = [math.log(x["RPKM"]+1, 2.) for x in expr1] - >>> values2 = [math.log(x["RPKM"]+1, 2.) for x in expr2] - - >>> from pylab import clf, subplot, hist - >>> clf() - >>> subplot(2,1,1) - >>> hist(values1) - >>> subplot(2,1,2) - >>> hist(values2) - - -Transcription factor network of stem cells -------------------------------------------------------- - -:References: https://www.geneprof.org/GeneProf/media/recomb-2013/ - - -Another example, here below consists in retrieving -the binding targets of transcription factors (about 70) in mouse -embryonic stem cells, and generate a SIF network that could be open and visualised in Cytoscape. - -The example below can probably be simplified and make use of tools such as networkx to manipulate -and visualise the final network. Please use with care:: - - >>> # first import and create a GenProf instance - >>> from bioservices import GeneProf - >>> g = GeneProf(verbose=False) - >>> - >>> # find all pubic experimental mouse samples in geneprof - >>> samples = g.get_list_experiment_samples("mouse")['samples'] - >>> # look at entries that contains "Gene" - >>> graph = {} - >>> mapgene = {} - >>> for i, entry in enumerate(samples): - ... print("progress %s/%s" % (i+1, len(samples))) - - ... # keep only entries that have cell type "embryonic stem cell" in the celltype - ... if "Gene" in entry.keys() and "Cell_Type" in entry.keys() and entry["Cell_Type"]=="embryonic stem cell": - ... - ... # aliases - ... sampleId = entry['sample_id'] - ... gene = entry["Gene"] - - ... # get gene id and save mapping in a dictionary to be used later - ... geneId = g.get_gene_id("mouse", "C_NAME", gene)['ids'] - ... mapgene[geneId[0]] = gene - - ... # get targets and print them - ... targets = g.get_targets_by_experiment_sample("mouse", sampleId) - - ... # could be simplied inside the geneprof.py module - ... if 'targets' in targets.keys(): - ... targets = targets['targets'] - - ... # print the results - ... for x in targets: - ... print gene, geneId[0], " ", x['feature_id'] - ... graph[gene] = [x['feature_id'] for x in targets] - - >>> # The graph saved in the graph variables is quite large. Let us simplified keeping target that - >>> # are in the list of genes only - >>> simple_graph = {} - >>> for k, v in graph.iteritems(): - ... simple_graph[k] = [mapgene[x] for x in v if x in mapgene.keys()] - >>> len(simple_graph.keys()) - 72 - >>> sum([len(simple_graph[x]) for x in simple_graph.keys()]) - 2137 - - -Finally, you can look at the graph with your favorite tool such as Cytoscape, Gephi. - -Here below, I'm using a basic graph visualisation tool implemented in `CellNOpt `_, which is not dedicated -for Network visualisation but contains a small interface to graphviz useful in this context (it has a python interface):: - - >>> from cno import CNOGraph - >>> c = CNOGraph() - >>> for k in simple_graph.keys(): - ... for v in simple_graph[k]: - ... c.add_edge(k, v, link="+") - >>> c.centrality_degree() - >>> c.graph['graph'] = {"splines":"true", "size":(20,20), - "dpi":200, "fixedsize":True} - >>> c.graph['node'] = {"width":.01, "height":.01, - 'size':0.01, "fontsize":8} - >>> c.plotdot(prog="fdp", node_attribute="degree") - -.. image:: geneprof_network.png - - - -Integrating expression data in pathways -------------------------------------------------------- - -:References: https://www.geneprof.org/GeneProf/media/recomb-2013/ - - - -This is another example from the reference above but based on tools available in bioservices so as to overlaid highthroughput gene expression -onto pathways and models from KEGG database. - -Fold changes in lymphoma vs. kidney -on selected KEGG pathways - -:: - - >>> from bioservices import KEGG, GeneProf, UniProt - >>> import StringIO - >>> import pandas - >>> g = GeneProf() - >>> k = KEGG() - >>> u = UniProt() - - >>> # load ENCODE RNA-seq into a DataFrame for later - >>> data = g.get_data("11_683_28_1", "txt") - >>> rnaseq = pandas.read_csv(StringIO.StringIO(data), sep="\t") - >>> gene_names = rnaseq['Ensembl Gene ID'] - - >>> # get a pathway diagram for the KEGG path hsa05202 ("Transcriptional - >>> # misregulation in cancers") - >>> res = k.parse(k.get("hsa05202")) - >>> # extract KEGG identifiers corresponding to the genes found in the pathway - >>> keggids = ["hsa:"+x for x in res['GENE'].keys()] - - >>> # we need to map the KEGG Ids to Ensembl Ids. We will use KEGG mapping and uniprot mapping - >>> # for cases where the former does not have associated mapping. - >>> ensemblids = {} - >>> for id_ in keggids: - ... res = k.parse(k.get(id_))['DBLINKS'] - ... if 'Ensembl' in res.keys(): - ... print id_, res['Ensembl'] - ... ensemblids[id_] = res['Ensembl'] - ... else: - ... if "UniProt" in res.keys(): - ... ids = res['UniProt'].split()[0] - ... m = u.mapping("ACC", "ENSEMBL_ID", query=ids) - ... if len(m): ensemblids[id_] = m[ids][0] - ... pass # no links to ensembl DB found - - >>> # what are the KEGG id transformed into Ensembl Ids that are in the ENCODE data set ? - >>> found = [x for x in ensemblids.values() if x in [str(y) for y in gene_names]] - >>> indices = [i for i, x in enumerate(rnaseq['Ensembl Gene ID']) if x in found] - >>> - >>> # now, we can pick out the log2 fold change values for visualization: - >>> data = rnaseq.ix[indices][['Ensembl Gene ID', 'log2FC Lymphoma / EmbryonicKidney']] - >>> # and keep only those that have a negative or positive value - >>> mid = 1.5 - >>> low = data[data['log2FC Lymphoma / EmbryonicKidney']<-mid] - >>> geneid_low = list(low['Ensembl Gene ID']) - >>> up = data[data['log2FC Lymphoma / EmbryonicKidney']>mid] - >>> geneid_up = list(up['Ensembl Gene ID']) - >>> mid = data[abs(data['log2FC Lymphoma / EmbryonicKidney'])>> geneid_mid = list(mid['Ensembl Gene ID']) - - >>> # now that we have the genes (in ensembl format), we need the kegg id - >>> keggid_low = [this for this in keggids if ensemblids[this] in geneid_low] - >>> keggid_mid = [this for this in keggids if ensemblids[this] in geneid_mid] - >>> keggid_up = [this for this in keggids if ensemblids[this] in geneid_up] - >>> # it is now time to look at the expression on the diagram - >>> colors = {} - >>> for id_ in keggids: colors[id_[4:]] = "gray," - >>> for id_ in keggid_low: colors[id_[4:]] = "blue," - >>> for id_ in keggid_up: colors[id_[4:]] = "orange," - >>> for id_ in keggid_mid: colors[id_[4:]] = "yellow," - >>> k.show_pathway("hsa05202", dcolor="white", keggid=colors) - -The last command will popup the KEGG diagram with the expression data on top of the diagram, as shown in the following picture: - -.. image:: geneprof_kegg_expression.png - :width: 100% - - - - - - diff --git a/doc/pymol_app.py b/doc/pymol_app.py new file mode 100644 index 00000000..ab95a222 --- /dev/null +++ b/doc/pymol_app.py @@ -0,0 +1,33 @@ +import __main__ +__main__.pymol_argv = [ 'pymol', '-qc'] # Quiet and no GUI + +import os +if os.path.isfile("bioservices_pdb.png"): + os.remove("bioservices_pdb.png") + +# BioServices 1: obtain the PDB ID from a given uniprot ID (P43403 i.e. ZAP70) +from bioservices import * +print("Retrieving PDB ID") +u = UniProt(verbose=False) +res = u.mapping(fr="UniProtKB_AC-ID", to="PDB", query="P43403") +pdb_id = res['results']['P43403'][0] # e.g, "1FBV" + +# BioServices 2: Download the PDB file from the PDB Web Service +print("Fetching PDB file") +p = pdbe.PDBe() +res = p.get_files(pdb_id) + +# General: save the fetched file in a temporary file +import tempfile +fh = tempfile.NamedTemporaryFile() +fh.write(res) +sname = fh.name + +# THIS IS NOT BIOSERVICES ANYMORE but PYMOL +import pymol +pymol.finish_launching() +pymol.cmd.load(sname) +pymol.cmd.png("bioservices_pdb.png", width="15cm", height="15cm", dpi=140) +#pymol.cmd.png("my_image.png") +# Get out! +pymol.cmd.quit() diff --git a/doc/references.rst b/doc/references.rst index 9a6ad5a9..8c9ede0b 100644 --- a/doc/references.rst +++ b/doc/references.rst @@ -29,7 +29,7 @@ Services ArrayExpress =================== -.. .. automodule:: bioservices.arrayexpress +.. automodule:: bioservices.arrayexpress :members: :undoc-members: :synopsis: @@ -130,10 +130,6 @@ GeneProf Currently removed from the main API from version 1.6.0 onwards. You can still get the code in earlier version or in the github repository in the attic/ directory -.. .. automodule:: bioservices.geneprof - :members: - :undoc-members: - :synopsis: QuickGO ================ diff --git a/doc/tutorials.rst b/doc/tutorials.rst index 4fca6b72..1c5a2d82 100644 --- a/doc/tutorials.rst +++ b/doc/tutorials.rst @@ -21,4 +21,3 @@ welcomed. compound_tutorial.rst convertor_tutorial.rst biomart.rst - geneprof_tutorial.rst diff --git a/src/bioservices/chembl.py b/src/bioservices/chembl.py index 8b2eebe6..89771d03 100644 --- a/src/bioservices/chembl.py +++ b/src/bioservices/chembl.py @@ -694,15 +694,19 @@ def get_organism(self, query=None, limit=20, offset=0, filters=None): params = {"limit": limit, "offset": offset, "filters": filters} return self._get_this_service("organism", query, params=params) + """ + # subservices removed apparatently (march 2023) hence the commented code + def search_protein_class(self, query, limit=20, offset=0): params = {"limit": limit, "offset": offset} return self._search("protein_class", query, params=params) def get_protein_class(self, query=None, limit=20, offset=0, filters=None): - """Protein family classification of TargetComponents""" + "Protein family classification of TargetComponents" params = {"limit": limit, "offset": offset, "filters": filters} return self._get_this_service("protein_class", query, params=params) + """ def get_substructure(self, structure, limit=20, offset=0, filters=None): """Molecule substructure search @@ -906,10 +910,10 @@ def get_image(self, query, dimensions=500, format="png", save=True, view=True, e .. todo:: ignorecoords option """ # NOTE: not async requests here. - self.devtools.check_range(dimensions, 1, 500) - self.devtools.check_param_in_list(engine, ["rdkit", "indigo"]) - self.devtools.check_param_in_list(format, ["png", "svg"]) - queries = self.devtools.to_list(query) + self.services.devtools.check_range(dimensions, 1, 500) + self.services.devtools.check_param_in_list(engine, ["rdkit", "indigo"]) + self.services.devtools.check_param_in_list(format, ["png", "svg"]) + queries = self.services.devtools.to_list(query) res = {"filenames": [], "images": [], "chemblids": []} for query in queries: diff --git a/src/bioservices/psicquic.py b/src/bioservices/psicquic.py index aeddd034..55e572a4 100644 --- a/src/bioservices/psicquic.py +++ b/src/bioservices/psicquic.py @@ -462,7 +462,7 @@ def query( try: index = names.index(service) except ValueError: - self.logging.error("The service you gave (%s) is not registered. See self.registery_names" % service) + self.services.logging.error("The service you gave (%s) is not registered. See self.registery_names" % service) raise ValueError # get the base url according to the service requested @@ -525,7 +525,7 @@ def queryAll( raise ValueError("database %s not in active databases" % x) for name in databases: - self.logging.warning("Querying %s" % name), + self.services.logging.warning("Querying %s" % name), res = self.query( name, query, @@ -541,7 +541,7 @@ def queryAll( results[name] = copy.copy(res) for name in databases: - self.logging.info("Found %s in %s" % (len(results[name]), name)) + self.services.logging.info("Found %s in %s" % (len(results[name]), name)) return results def getInteractionCounter(self, query): @@ -587,7 +587,7 @@ def knownName(self, data): """ - self.logging.info("converting data into known names") + self.services.logging.info("converting data into known names") idsA = [x[0].replace('"', "") for x in data] idsB = [x[1].replace('"', "") for x in data] # extract the first and second ID but let us check if it is part of a @@ -613,10 +613,10 @@ def knownName(self, data): if len(valid_dbs) >= 1: idsA[i] = valid_dbs[0][0] + ":" + valid_dbs[0][1] else: - self.logging.debug("none of the DB for this entry (%s) are available" % (entry)) + self.services.logging.debug("none of the DB for this entry (%s) are available" % (entry)) idsA[i] = "?" + dbs[0] + ":" + IDs[0] except: - self.logging.info("Could not extract name from %s" % entry) + self.services.logging.info("Could not extract name from %s" % entry) idsA[i] = "??:" + entry # we add a : so that we are sure that a split(":") will work # the second ID for i, entry in enumerate(idsB): @@ -628,19 +628,19 @@ def knownName(self, data): if len(valid_dbs) >= 1: idsB[i] = valid_dbs[0][0] + ":" + valid_dbs[0][1] else: - self.logging.debug("none of the DB (%s) for this entry are available" % (entry)) + self.services.logging.debug("none of the DB (%s) for this entry are available" % (entry)) idsB[i] = "?" + dbs[0] + ":" + IDs[0] except: - self.logging.info("Could not extract name from %s" % entry) + self.services.logging.info("Could not extract name from %s" % entry) idsB[i] = "??:" + entry countA = len([x for x in idsA if x.startswith("?")]) countB = len([x for x in idsB if x.startswith("?")]) if countA + countB > 0: - self.logging.warning("%s ids out of %s were not identified" % (countA + countB, len(idsA) * 2)) + self.services.logging.warning("%s ids out of %s were not identified" % (countA + countB, len(idsA) * 2)) print(set([x.split(":")[0] for x in idsA if x.startswith("?")])) print(set([x.split(":")[0] for x in idsB if x.startswith("?")])) - self.logging.info("knownName done") + self.services.logging.info("knownName done") return idsA, idsB def preCleaning(self, data): @@ -656,7 +656,7 @@ def postCleaningAll(self, data, keep_only="HUMAN", flatten=True, verbose=True): """ results = {} for k in data.keys(): - self.logging.info("Post cleaning %s" % k) + self.services.logging.info("Post cleaning %s" % k) ret = self.postCleaning(data[k], keep_only="HUMAN", verbose=verbose) if len(ret): results[k] = ret @@ -708,12 +708,12 @@ def postCleaning( def convertAll(self, data): results = {} for k in data.keys(): - self.logging.info("Analysing %s" % k) + self.services.logging.info("Analysing %s" % k) results[k] = self.convert(data[k], db=k) return results def convert(self, data, db=None): - self.logging.debug("converting the database %s" % db) + self.services.logging.debug("converting the database %s" % db) idsA, idsB = self.knownName(data) mapping = self.mappingOneDB(data) results = [] @@ -739,7 +739,7 @@ def convert(self, data, db=None): def mappingOneDB(self, data): query = {} - self.logging.debug("converting IDs with proper DB name (knownName function)") + self.services.logging.debug("converting IDs with proper DB name (knownName function)") entriesA, entriesB = self.knownName(data) # idsA and B contains list of a single identifier of the form db:id # the db is known from _mapping.uniprot otherwise it is called "unknown" @@ -788,7 +788,7 @@ def mappingOneDB(self, data): DBname = self._mapping_uniprot[k] if DBname is not None: - self.logging.warning("Request sent to uniprot for %s database (%s/%s)" % (DBname, counter, N)) + self.services.logging.warning("Request sent to uniprot for %s database (%s/%s)" % (DBname, counter, N)) res = self.uniprot.mapping(fr=DBname, to="ID", query=" ".join(this_query)) for x in this_query: if x not in res: # was not found @@ -801,7 +801,7 @@ def mappingOneDB(self, data): if len(res[x]) == 1: mapping[x] = res[x][0] else: - self.logging.warning("psicquic mapping found more than 1 id. keep first one") + self.services.logging.warning("psicquic mapping found more than 1 id. keep first one") mapping[x] = res[x][0] else: for x in this_query: diff --git a/src/bioservices/unichem.py b/src/bioservices/unichem.py index 3c97343f..4476f60b 100644 --- a/src/bioservices/unichem.py +++ b/src/bioservices/unichem.py @@ -96,7 +96,7 @@ class UniChem: res['compounds'][0] - .. changed:: version 1.9. drop xml parser. + .. versionchanged:: version 1.9. drop xml parser. """ @@ -384,6 +384,8 @@ def get_images(self, uci, filename=None): .. plot:: + from bioservices import UniChem + u = UniChem() res = u.get_images('304698', filename='test.svg') """ diff --git a/test/webservices/test_chembl.py b/test/webservices/test_chembl.py index 60ee058f..b318a565 100644 --- a/test/webservices/test_chembl.py +++ b/test/webservices/test_chembl.py @@ -2,7 +2,6 @@ import pytest import os -skiptravis = pytest.mark.skipif("TRAVIS_PYTHON_VERSION" in os.environ, reason="On travis") SMILE = "CC(=O)Oc1ccccc1C(=O)O" @@ -95,7 +94,7 @@ def test_activity(chembl): "mechanism", "metabolism", "molecule_form", - "protein_class", + #"protein_class", "source", "target", "target_component", @@ -119,8 +118,7 @@ def test_ATC(chembl): # SEARCHES ------------------ -@skiptravis -def test_search_protein_class(chembl): +def _test_search_protein_class(chembl): res1715 = chembl.get_protein_class(1715) # no good example. This returns noting but at least calls the method chembl.search_protein_class("CAMK") diff --git a/test/webservices/test_kegg.py b/test/webservices/test_kegg.py index 679d3d63..582cfffe 100644 --- a/test/webservices/test_kegg.py +++ b/test/webservices/test_kegg.py @@ -83,32 +83,32 @@ def test_parse_kgml_pathway(kegg): @pytest.mark.flaky(max_runs=3, min_passes=1) def test_ids1(kegg): - assert kegg.enzymeIds[0].startswith("ec") + assert kegg.enzymeIds[0].startswith("1.") @pytest.mark.flaky(max_runs=3, min_passes=1) def test_ids2(kegg): - assert kegg.compoundIds[0].startswith("cpd") + assert kegg.compoundIds[0].startswith("C") @pytest.mark.flaky(max_runs=3, min_passes=1) def test_ids3(kegg): - assert kegg.glycanIds[0].startswith("gl") + assert kegg.glycanIds[0].startswith("G") @pytest.mark.flaky(max_runs=3, min_passes=1) def test_ids4(kegg): - assert kegg.reactionIds[0].startswith("rn") + assert kegg.reactionIds[0].startswith("R") @pytest.mark.flaky(max_runs=3, min_passes=1) def test_ids5(kegg): - assert kegg.drugIds[0].startswith("dr") + assert kegg.drugIds[0].startswith("D") @pytest.mark.flaky(max_runs=3, min_passes=1) def test_ids6(kegg): - assert kegg.koIds[0].startswith("ko") + assert kegg.koIds[0].startswith("K") @pytest.mark.flaky(max_runs=3, min_passes=1) diff --git a/test/webservices/test_pride.py b/test/webservices/test_pride.py index 2ea4c500..ecdd5be1 100644 --- a/test/webservices/test_pride.py +++ b/test/webservices/test_pride.py @@ -1,8 +1,6 @@ from bioservices import PRIDE -import pytest import os - -pytestmark = pytest.mark.skipif("TRAVIS_PYTHON_VERSION" in os.environ, reason="On travis") +import pytest p = PRIDE() @@ -43,7 +41,8 @@ def test_stats(): assert "SUBMISSIONS_PER_YEAR" in p.get_stats() p.get_stats("SUBMISSIONS_PER_YEAR") - +@pytest.mark.xfail(reason="too slow", method="thread") +@pytest.mark.timeout(30) def test_peptide(): res = p.get_peptide_evidence("PXD016700") assert res["_embedded"]