From 215dace8826ec0b224b5e31ae7c30b5aac5701ff Mon Sep 17 00:00:00 2001 From: richard gowers Date: Thu, 10 Aug 2023 13:40:28 +0100 Subject: [PATCH] use pooch for PLB files previous solution used io.StringIO which rdkit doesn't like (and wasn't documented as accepted) --- gufe/tests/conftest.py | 82 ++++++++++++----------------- gufe/tests/test_proteincomponent.py | 50 ++++++------------ 2 files changed, 50 insertions(+), 82 deletions(-) diff --git a/gufe/tests/conftest.py b/gufe/tests/conftest.py index 034fac04..8ca4d444 100644 --- a/gufe/tests/conftest.py +++ b/gufe/tests/conftest.py @@ -4,8 +4,8 @@ import importlib.resources import urllib.request from urllib.error import URLError -import io import functools +import pooch import pytest from rdkit import Chem from rdkit.Chem import AllChem @@ -20,58 +20,42 @@ else: HAS_INTERNET = True +PLB_files = pooch.create( + path=pooch.os_cache('pdbinf'), + base_url='https://github.com/openforcefield/protein-ligand-benchmark/raw/d3387602bbeb0167abf00dfb81753d8936775dd2/data/', + version=None, + registry={ + 'p38/01_protein/crd/protein.pdb': '3f0bf718644e7c29f5200cd3def4240ac25ef5fb1948b2e64deb5015d8a45aa4', + 'mcl1/01_protein/crd/protein.pdb': 'f80ff9dd93a5d9dd6e90091e9631a8ce7fe0dc931e16543e22c1f92009660306', + 'cdk2/01_protein/crd/protein.pdb': '15d1e509d7951ca45ea266d51a627d5f452dcf0bb5bd48751ae57eb29e28ab69', + 'shp2/01_protein/crd/protein.pdb': 'd6759cbd135aaddaa658446064df4095d978d3681c014a0528b542d60b2c8770', + 'pde2/01_protein/crd/protein.pdb': '3b7967c1717789215452cdf919520625602d5438a9d2a18620726b8b1b3a8ef0', + 'cmet/01_protein/crd/protein.pdb': '155ec32941a9082dbdbbfde460ff97c88d4fe7e100e9a9577edb5a9e7b6467ae', + 'ptp1b/01_protein/crd/protein.pdb': 'bfa0f9204e96aa463b80946b788c4153cd24701291007eb77638a16fd156634e', + 'thrombin/01_protein/crd/protein.pdb': 'eb4ea18bef9c4c71dcdc922616d6719ee918112be87a0bd6b274c856eff1dd59', + 'cdk8/01_protein/crd/protein.pdb': 'b058774526a19775d8f438b14e9d6da331b6de74e0ef9e96db575f6c0bb067b2', + 'pfkfb3/01_protein/crd/protein.pdb': '4367710db0dbf284cc715ae9a8dd82d06bd77dcc3fb0885678e16632a2732dcc', + 'tyk2/01_protein/crd/protein.pdb': '9090684f4bdae90afbe5f2698a14c778396c024c19ceb6333de4808d9e29fae6', + 'syk/01_protein/crd/protein.pdb': 'f6199d0c1818eb5bb24e164426789cf39cae7aa32c8ca2e98f5f44d299a6f82f', + 'tnks2/01_protein/crd/protein.pdb': 'fc7681a05dbf07590aa8de133f981b6d8ae9cebcc23d54addc2c4fe80be80299', + 'eg5/01_protein/crd/protein.pdb': 'f2964a785c922502dc86fb4e2e5295d32d41d5b68b8c3246e989de5234c3fd0f', + 'hif2a/01_protein/crd/protein.pdb': '5bbf520e7c102a65cc7ba0253fd66f43562f77284c82b3b9613e997b7ac76c93', + + }, +) -class URLFileLike: - def __init__(self, url, encoding='utf-8'): - self.url = url - self.encoding = encoding - self.data = None - def __call__(self): +@pytest.fixture(params=['p38', 'mcl1', 'cdk2', 'shp2', 'pde2', 'cmet', 'ptp1b', + 'thrombin', 'cdk8', 'pfkfb3', 'tyk2', 'syk', 'tnks2', + 'eg5', 'hif2a', '181l']) +def PDB_files(request): + if request.param == '181l': + with importlib.resources.path('gufe.tests.data', '181l.pdb') as file: + return str(file) + else: if not HAS_INTERNET: # pragma: no-cover pytest.skip("Skipping because internet seems faulty") - - if self.data is None: - req = urllib.request.urlopen(self.url) - self.data = req.read().decode(self.encoding) - - return io.StringIO(self.data) - - -def get_test_filename(filename): - with importlib.resources.path('gufe.tests.data', filename) as file: - return str(file) - - -_benchmark_pdb_names = [ - "cmet_protein", - "hif2a_protein", - "mcl1_protein", - "p38_protein", - "ptp1b_protein", - "syk_protein", - "thrombin_protein", - "tnsk2_protein", - "tyk2_protein", - ] - - -_pl_benchmark_url_pattern = ( - "https://github.com/OpenFreeEnergy/openfe-benchmarks/blob/main/openfe_benchmarks/data/{name}.pdb?raw=true" -) - - -PDB_BENCHMARK_LOADERS = { - name: URLFileLike(url=_pl_benchmark_url_pattern.format(name=name)) - for name in _benchmark_pdb_names -} - -PDB_FILE_LOADERS = { - name: lambda: get_test_filename(name) - for name in ["181l.pdb"] -} - -ALL_PDB_LOADERS = dict(**PDB_BENCHMARK_LOADERS, **PDB_FILE_LOADERS) + return PLB_files.fetch('{}/01_protein/crd/protein.pdb'.format(request.param)) @pytest.fixture diff --git a/gufe/tests/test_proteincomponent.py b/gufe/tests/test_proteincomponent.py index 934095a7..90161c4a 100644 --- a/gufe/tests/test_proteincomponent.py +++ b/gufe/tests/test_proteincomponent.py @@ -14,7 +14,7 @@ from openmm import unit from numpy.testing import assert_almost_equal -from .conftest import ALL_PDB_LOADERS +from .conftest import PLB_files @pytest.fixture @@ -94,11 +94,8 @@ class TestProteinComponent(GufeTokenizableTestsMixin): def instance(self, PDB_181L_path): return self.cls.from_pdb_file(PDB_181L_path, name="Steve") - # From - @pytest.mark.parametrize('in_pdb_path', ALL_PDB_LOADERS.keys()) - def test_from_pdb_file(self, in_pdb_path): - in_pdb_io = ALL_PDB_LOADERS[in_pdb_path]() - p = self.cls.from_pdb_file(in_pdb_io, name="Steve") + def test_from_pdb_file(self, PDB_files): + p = self.cls.from_pdb_file(PDB_files, name="Steve") assert isinstance(p, ProteinComponent) assert p.name == "Steve" @@ -177,21 +174,16 @@ def test_to_pdb_input_types(self, PDB_181L_OpenMMClean_path, tmp_path, output_func=p.to_pdb_file ) - @pytest.mark.parametrize('in_pdb_path', ALL_PDB_LOADERS.keys()) - def test_to_pdb_round_trip(self, in_pdb_path, tmp_path): - in_pdb_io = ALL_PDB_LOADERS[in_pdb_path]() - - p = self.cls.from_pdb_file(in_pdb_io, name="Wuff") - out_file_name = "tmp_"+in_pdb_path+".pdb" + def test_to_pdb_round_trip(self, PDB_files, tmp_path): + p = self.cls.from_pdb_file(PDB_files, name="Wuff") + out_file_name = "tmp_foo.pdb" out_file = tmp_path / out_file_name p.to_pdb_file(str(out_file)) - ref_in_pdb_io = ALL_PDB_LOADERS[in_pdb_path]() - # generate openMM reference file: - openmm_pdb = pdbfile.PDBFile(ref_in_pdb_io) - out_ref_file_name = "tmp_"+in_pdb_path+"_openmm_ref.pdb" + openmm_pdb = pdbfile.PDBFile(PDB_files) + out_ref_file_name = "tmp_foo_openmm_ref.pdb" out_ref_file = tmp_path / out_ref_file_name pdbfile.PDBFile.writeFile(openmm_pdb.topology, openmm_pdb.positions, file=open(str(out_ref_file), "w")) @@ -213,16 +205,11 @@ def test_dummy_from_dict(self, PDB_181L_OpenMMClean_path): assert p == p2 - # parametrize - @pytest.mark.parametrize('in_pdb_path', ALL_PDB_LOADERS.keys()) - def test_to_openmm_positions(self, in_pdb_path): - in_pdb_io = ALL_PDB_LOADERS[in_pdb_path]() - ref_in_pdb_io = ALL_PDB_LOADERS[in_pdb_path]() - - openmm_pdb = pdbfile.PDBFile(ref_in_pdb_io) + def test_to_openmm_positions(self, PDB_files): + openmm_pdb = pdbfile.PDBFile(PDB_files) openmm_pos = openmm_pdb.positions - p = self.cls.from_pdb_file(in_pdb_io, name="Bob") + p = self.cls.from_pdb_file(PDB_files, name="Bob") gufe_openmm_pos = p.to_openmm_positions() v1 = gufe_openmm_pos.value_in_unit(unit.nanometer) @@ -230,16 +217,11 @@ def test_to_openmm_positions(self, in_pdb_path): assert_almost_equal(actual=v1, desired=v2, decimal=6) - # parametrize - @pytest.mark.parametrize('in_pdb_path', ALL_PDB_LOADERS.keys()) - def test_to_openmm_topology(self, in_pdb_path): - in_pdb_io = ALL_PDB_LOADERS[in_pdb_path]() - ref_in_pdb_io = ALL_PDB_LOADERS[in_pdb_path]() - - openmm_pdb = pdbfile.PDBFile(ref_in_pdb_io) + def test_to_openmm_topology(self, PDB_files): + openmm_pdb = pdbfile.PDBFile(PDB_files) openmm_top = openmm_pdb.topology - p = self.cls.from_pdb_file(in_pdb_io, name="Bob") + p = self.cls.from_pdb_file(PDB_files, name="Bob") gufe_openmm_top = p.to_openmm_topology() assert_topology_equal(openmm_top, gufe_openmm_top) @@ -290,7 +272,9 @@ def test_protein_total_charge(self, PDB_181L_path): assert m1.total_charge == 7 def test_protein_total_charge_thromb(self): - m1 = self.cls.from_pdb_file(ALL_PDB_LOADERS["thrombin_protein"]()) + f = PLB_files.fetch('thrombin/01_protein/crd/protein.pdb') + + m1 = self.cls.from_pdb_file(f) assert m1.total_charge == 6