From 6ba6cd9e56eeeb1db976dc76ab71b7d82b8f4416 Mon Sep 17 00:00:00 2001 From: Waztom Date: Wed, 22 Jul 2020 15:11:36 +0100 Subject: [PATCH 1/3] Set van der Waals radius scale --- src/python/pipelines/xchem/xcos.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/python/pipelines/xchem/xcos.py b/src/python/pipelines/xchem/xcos.py index 6d82fba..3860e20 100644 --- a/src/python/pipelines/xchem/xcos.py +++ b/src/python/pipelines/xchem/xcos.py @@ -275,7 +275,8 @@ def getReverseScores(clustered_df, mols, frags, no_clustered_feats, rad_threshol # NB reverse SuCOS scoring fm_score = getFeatureMapScore(bit, frag_mol) fm_score = np.clip(fm_score, 0, 1) - protrude_dist = rdShapeHelpers.ShapeProtrudeDist(bit, frag_mol, allowReordering=False) + # Change van der Waals radius scale for stricter overlay + protrude_dist = rdShapeHelpers.ShapeProtrudeDist(bit, frag_mol, allowReordering=False, vdwScale=0.2) protrude_dist = np.clip(protrude_dist, 0, 1) reverse_SuCOS_score = 0.5 * fm_score + 0.5 * (1 - protrude_dist) From 93ebb1bb3c60674297a6def96eaaafda4a37a1a7 Mon Sep 17 00:00:00 2001 From: Alan Christie Date: Wed, 22 Jul 2020 16:04:42 +0100 Subject: [PATCH 2/3] - An attempt to fix pull-requests from foreign repos - Testing now done locally - Publishing (and docker login) now also requires a password --- .travis.yml | 66 +++++++++++++++++++++++++++++------------------------ 1 file changed, 36 insertions(+), 30 deletions(-) diff --git a/.travis.yml b/.travis.yml index 54732ce..f48c3c2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -10,78 +10,84 @@ # # DOCKER_USERNAME If PUBLISH_IMAGES is 'yes' # DOCKER_PASSWORD If PUBLISH_IMAGES is 'yes' +# +# ----------------- +# +# NOTE: Pull requests from foreign repositories will not +# result in encrupted variables being set. +# So, regardless of the state of PUBLISH_IMAGES, +# images will only be published if DOCKER_PASSWORD is defined. os: linux services: - docker stages: +- name: test - name: publish latest if: | branch = master \ - AND env(PUBLISH_IMAGES) = yes -- name: test latest - if: | - branch = master \ - AND env(PUBLISH_IMAGES) = yes + AND env(PUBLISH_IMAGES) = yes \ + AND env(DOCKER_PASSWORD) IS present - name: publish tag if: | tag IS present \ - AND env(PUBLISH_IMAGES) = yes + AND env(PUBLISH_IMAGES) = yes \ + AND env(DOCKER_PASSWORD) IS present - name: publish stable if: | tag IS present \ AND tag =~ ^([0-9]+\.){1,2}[0-9]+$ \ - AND env(PUBLISH_IMAGES) = yes - -before_script: -- docker login -u="$DOCKER_USERNAME" -p="$DOCKER_PASSWORD" + AND env(PUBLISH_IMAGES) = yes \ + AND env(DOCKER_PASSWORD) IS present jobs: include: + - stage: test + name: Test Local Image + script: + - docker build -t informaticsmatters/rdkit_pipelines:latest -f Dockerfile-rdkit . + - docker build -t squonk/rdkit-pipelines-sdposter:latest -f Dockerfile-sdposter . + - git clone https://github.com/InformaticsMatters/pipelines-utils.git + - cd pipelines-utils/src/groovy + - groovy PipelineTester.groovy -indocker + # Publish-stage jobs... - # Every successful master build results in a latest image (above) + # Every successful master build results in a latest image # and every tag results in a tagged image in Docker Hub. # Tags that match a RegEx are considered 'official' tags # and also result in a 'stable' image tag. - stage: publish latest - name: Latest Image + name: Publish Latest Image script: # Build and push the pipelines-rdkit image and its sd-poster - docker build -t informaticsmatters/rdkit_pipelines:latest -f Dockerfile-rdkit . - - docker push informaticsmatters/rdkit_pipelines:latest - docker build -t squonk/rdkit-pipelines-sdposter:latest -f Dockerfile-sdposter . + - docker login -u="$DOCKER_USERNAME" -p="$DOCKER_PASSWORD" + - docker push informaticsmatters/rdkit_pipelines:latest - docker push squonk/rdkit-pipelines-sdposter:latest - - stage: test latest - name: Test Latest Image - script: - # Pull the latest pipelines-rdkit image - # then clone the utils rep (containing the test engine) - # and then run the tests - - docker pull informaticsmatters/rdkit_pipelines:latest - - git clone https://github.com/InformaticsMatters/pipelines-utils.git - - cd pipelines-utils/src/groovy - - groovy PipelineTester.groovy -indocker - - stage: publish tag - name: Tagged Image + name: Publish Tagged Image script: # Build and push the pipelines-rdkit image and its sd-poster - docker build -t informaticsmatters/rdkit_pipelines:${TRAVIS_TAG} -f Dockerfile-rdkit . - - docker push informaticsmatters/rdkit_pipelines:${TRAVIS_TAG} - docker build -t squonk/rdkit-pipelines-sdposter:${TRAVIS_TAG} -f Dockerfile-sdposter . + - docker login -u="$DOCKER_USERNAME" -p="$DOCKER_PASSWORD" + - docker push informaticsmatters/rdkit_pipelines:${TRAVIS_TAG} - docker push squonk/rdkit-pipelines-sdposter:${TRAVIS_TAG} - stage: publish stable - name: Stable Image + name: Publish Stable Image script: - # Pull the corresponding pipelines-rdkit image tag and push it as 'stable' + # Pull the corresponding pipelines-rdkit image tag + # and push it again as 'stable' + - docker login -u="$DOCKER_USERNAME" -p="$DOCKER_PASSWORD" - docker pull informaticsmatters/rdkit_pipelines:${TRAVIS_TAG} - - docker tag informaticsmatters/rdkit_pipelines:${TRAVIS_TAG} informaticsmatters/rdkit_pipelines:stable - - docker push informaticsmatters/rdkit_pipelines:stable - docker pull squonk/rdkit-pipelines-sdposter:${TRAVIS_TAG} + - docker tag informaticsmatters/rdkit_pipelines:${TRAVIS_TAG} informaticsmatters/rdkit_pipelines:stable - docker tag squonk/rdkit-pipelines-sdposter:${TRAVIS_TAG} squonk/rdkit-pipelines-sdposter:stable + - docker push informaticsmatters/rdkit_pipelines:stable - docker push squonk/rdkit-pipelines-sdposter:stable From 7dffa0f116fad95ece2d97a777a1d7966fc547c7 Mon Sep 17 00:00:00 2001 From: Waztom Date: Fri, 24 Jul 2020 13:49:27 +0100 Subject: [PATCH 3/3] Addded MCs filter to XCOS and removed Scores 2-3 --- src/python/pipelines/xchem/xcos.py | 246 +++++------------------------ 1 file changed, 38 insertions(+), 208 deletions(-) diff --git a/src/python/pipelines/xchem/xcos.py b/src/python/pipelines/xchem/xcos.py index 6d82fba..590a70e 100644 --- a/src/python/pipelines/xchem/xcos.py +++ b/src/python/pipelines/xchem/xcos.py @@ -25,12 +25,12 @@ from rdkit.Chem.FeatMaps import FeatMaps from rdkit.Chem import AllChem, rdShapeHelpers from rdkit import RDConfig +from rdkit.Chem import rdFMCS import os, argparse import numpy as np import pandas as pd -from sklearn.neighbors import NearestNeighbors from datetime import datetime @@ -40,8 +40,6 @@ field_XCosRefMols = "XCos_RefMols" field_XCosNumHits = "XCos_NumHits" field_XCosScore1 = "XCos_Score1" -field_XCosScore2 = "XCos_Score2" -field_XCosScore3 = "XCos_Score3" date = datetime.today().strftime('%Y-%m-%d') @@ -101,168 +99,15 @@ def getFeatureMapScore(small_m, large_m, score_mode=FeatMaps.FeatMapScoreMode.Al except ZeroDivisionError: return 0 -def getNumberfeats(mol): - - featLists = [] - rawFeats = fdef.GetFeaturesForMol(mol) - # filter that list down to only include the ones we're intereted in - featLists.append([f for f in rawFeats if f.GetFamily() in keep]) - - return len(featLists) - - -def getFeatureMapXCOS(mol_list): - allFeats = [] - for m in mol_list: - - rawFeats = fdef.GetFeaturesForMol(m) - featDeats = [(f.GetType(), - f.GetPos().x, - f.GetPos().y, - f.GetPos().z) for f in rawFeats if f.GetFamily() in keep] - - allFeats.append(featDeats) - - - feature_map_df = pd.DataFrame([t for lst in allFeats for t in lst], - columns =['featType', 'x', 'y', 'z']) - - return feature_map_df - - -def getFeatureAgg(feature_map_df, rad_thresh): - - # Group data into unique feature types - grouped_df = feature_map_df.groupby('featType') - - data_to_add = [] - - for group_name, df_group in grouped_df: - - # Reset index df - df_group = df_group.reset_index() - - if len(df_group) == 1: - - data_to_add.append(df_group) - - if len(df_group) > 1: - - # Get feature name - feat_name = df_group.featType.unique()[0] - - # Use radius neighbours to find features within - # spere with radius thresh - neigh = NearestNeighbors(radius=rad_thresh) - - while len(df_group) > 0: - - neigh.fit(df_group[['x','y','z']]) - - # Get distances and indices of neigbours within radius threshold - rng = neigh.radius_neighbors() - neigh_dist = rng[0][0] - neigh_indices = rng[1][0] - - # Append the first index - NB clustering done relative to index 0 - neigh_indices = list(np.append(0, neigh_indices)) - - # Calculate average x,y,z coords for features in similar loc - x_avg = np.mean(df_group.iloc[neigh_indices].x) - y_avg = np.mean(df_group.iloc[neigh_indices].y) - z_avg = np.mean(df_group.iloc[neigh_indices].z) - - # Add feature with average x, y and z values - new_row = [(feat_name, x_avg, y_avg, z_avg)] - - cluster_df = pd.DataFrame(data=new_row, columns = ['featType', 'x', 'y', 'z']) - - data_to_add.append(cluster_df) - - # Remove indices of clustered neigbours - df_group = df_group.drop(df_group.index[neigh_indices]) - - # Create single DF from list of dfs - clustered_df = pd.concat(data_to_add) - - return clustered_df - # This is the main XCOS function -def getReverseScores(clustered_df, mols, frags, no_clustered_feats, rad_threshold, COS_threshold, writer): +def getReverseScores(mols, frags, COS_threshold, writer): for mol in mols: # Get the bits compound_bits = getBits(mol) - # We are going to include a feature mapping score, where the - # number of features of the compound matching the clustered feats - # within a threshold are found - - # Get feature map of compound bits as df - feature_map_bits = getFeatureMapXCOS(compound_bits) - - # Group data into unique feature types - grouped_df = feature_map_bits.groupby('featType') - - no_feats_matched = [] - dist_feats_matched = [] - - # Use radius neighbours to find features within - # sphere with radius thresh - neigh = NearestNeighbors(radius=rad_threshold) - - # Loop through grouped features - for group_name, df_group in grouped_df: - - # Get feat name - feat_name = df_group.featType.unique()[0] - - # Get similar feats from cluster df - cluster_test = clustered_df[clustered_df.featType == feat_name] - - # Reset index df - df_group = df_group.reset_index() - - if len(cluster_test) == 1: - - # Calculate distances - x1_sub_x2 = (cluster_test.iloc[0].x - df_group.iloc[0].x) ** 2 - y1_sub_y2 = (cluster_test.iloc[0].y - df_group.iloc[0].y) ** 2 - z1_sub_z2 = (cluster_test.iloc[0].z - df_group.iloc[0].z) ** 2 - - diff_sum = x1_sub_x2 + y1_sub_y2 + z1_sub_z2 - - dist = diff_sum ** 0.5 - - if dist < rad_threshold: - # Let's get the number of feats matched - no_feats_matched.append(1) - - # Let's get the distance of the feats matched - dist_feats_matched.append([dist]) - - if len(cluster_test) > 1: - neigh.fit(cluster_test[['x', 'y', 'z']]) - - while len(df_group) > 0: - # Get distances and indices of neigbours within radius threshold - feat_coords = [[df_group.iloc[0].x, df_group.iloc[0].y, df_group.iloc[0].z]] - rng = neigh.radius_neighbors(feat_coords) - - neigh_dist = rng[0][0] - neigh_indices = rng[1][0] - - # Let's get the number of feats matched - no_feats_matched.append(len(neigh_indices)) - - # Remove index 0 of df_group - df_group = df_group.drop(df_group.index[0]) - - # Get total number of feat matches - no_feats = np.sum(no_feats_matched) - all_scores = [] for bit in compound_bits: @@ -271,52 +116,55 @@ def getReverseScores(clustered_df, mols, frags, no_clustered_feats, rad_threshol no_bit_atoms = bit.GetNumAtoms() scores = [] + for frag_mol in frags: - # NB reverse SuCOS scoring - fm_score = getFeatureMapScore(bit, frag_mol) - fm_score = np.clip(fm_score, 0, 1) - protrude_dist = rdShapeHelpers.ShapeProtrudeDist(bit, frag_mol, allowReordering=False) - protrude_dist = np.clip(protrude_dist, 0, 1) - reverse_SuCOS_score = 0.5 * fm_score + 0.5 * (1 - protrude_dist) + # Get frag name for linking to score + frag_name = frag_mol.GetProp('_Name').strip('Mpro-') + + # Check if MCS yield > 0 atoms + mcs_match = rdFMCS.FindMCS([bit,frag_mol],ringMatchesRingOnly=True,matchValences=True) + + # Get number of atoms in MCS match found + no_mcs_atoms = Chem.MolFromSmarts(mcs_match.smartsString).GetNumAtoms() - # Get number of feats from bit for scaling score - no_bit_feats = getNumberfeats(bit) + if no_mcs_atoms == 0: - # Get some info and append to list - frag_name = frag_mol.GetProp('_Name').strip('Mpro-') + scores.append((frag_name, 0, no_bit_atoms)) + + if no_mcs_atoms > 0: + + # NB reverse SuCOS scoring + fm_score = getFeatureMapScore(bit, frag_mol) + fm_score = np.clip(fm_score, 0, 1) + + # Change van der Waals radius scale for stricter overlay + protrude_dist = rdShapeHelpers.ShapeProtrudeDist(bit, frag_mol, + allowReordering=False, + vdwScale=0.2) + protrude_dist = np.clip(protrude_dist, 0, 1) + + reverse_SuCOS_score = 0.5 * fm_score + 0.5 * (1 - protrude_dist) - scores.append((frag_name, reverse_SuCOS_score, no_bit_atoms, no_bit_feats)) + scores.append((frag_name, reverse_SuCOS_score, no_bit_atoms)) all_scores.append(scores) list_dfs = [] + for score in all_scores: - df = pd.DataFrame(data=score, columns=['Fragment', 'Score', 'No_bit_atoms', 'No_bit_feats']) + + df = pd.DataFrame(data=score, columns=['Fragment', 'Score', 'No_bit_atoms']) + # Get maximum scoring fragment for bit match df = df[df['Score'] == df['Score'].max()] list_dfs.append(df) final_df = pd.concat(list_dfs) - # Get total bit score and some denominator terms - bits_score = (final_df.No_bit_atoms * final_df.Score).sum() - total_atoms = final_df.No_bit_atoms.sum() - feat_match_fraction = no_feats / no_clustered_feats - # Score 1: the score is scaled by the number of bit atoms - score_1 = bits_score - - # Score 2: the score is scaled by the number of bit atoms - # penalised by the fraction of feats matched - # the to total number feats clustered - score_2 = score_1 * feat_match_fraction - - # Score 3: the score is determined by the fraction of matching - # features to the clustered features within a threshold. This - # should yield similar values to Tim's Featurestein method? - score_3 = feat_match_fraction - + score_1 = (final_df.No_bit_atoms * final_df.Score).sum() + # Let's only get frags above a threshold final_df = final_df[final_df.Score > COS_threshold] @@ -330,8 +178,6 @@ def getReverseScores(clustered_df, mols, frags, no_clustered_feats, rad_threshol mol.SetProp(field_XCosRefMols, ','.join(all_frags)) mol.SetIntProp(field_XCosNumHits, len(all_frags)) mol.SetProp(field_XCosScore1, "{:.4f}".format(score_1)) - mol.SetProp(field_XCosScore2, "{:.4f}".format(score_2)) - mol.SetProp(field_XCosScore3, "{:.4f}".format(score_3)) # Write to file writer.write(mol) @@ -351,19 +197,8 @@ def process(molecules, fragments, writer): else: utils.log('Using', len(frag_mol_list), 'fragments. No errors') - feature_map_df = getFeatureMapXCOS(frag_mol_list) - utils.log('Feature map dataframe shape:', feature_map_df.shape) - - # Set radius threshold - rad_thresh = 1.5 - - # Aggregate features using nearest neigbours algo - clustered_df = getFeatureAgg(feature_map_df, rad_thresh=rad_thresh) - utils.log('Clustered dataframe shape:', clustered_df.shape) - no_clustered_feats = len(clustered_df) - - #clustered_df, mols, rad_threshold, COS_threshold, writer - getReverseScores(clustered_df, molecules, frag_mol_list, no_clustered_feats, 1.0, 0.50, writer) + #mols, frags, COS_threshold, writer + getReverseScores(molecules, frag_mol_list, 0.40, writer) def main(): @@ -380,7 +215,6 @@ def main(): parser.add_argument('--no-gzip', action='store_true', help='Do not compress the output (STDOUT is never compressed') parser.add_argument('--metrics', action='store_true', help='Write metrics') - args = parser.parse_args() utils.log("XCos Args: ", args) @@ -393,15 +227,11 @@ def main(): clsMappings[field_XCosRefMols] = "java.lang.String" clsMappings[field_XCosNumHits] = "java.lang.Integer" clsMappings[field_XCosScore1] = "java.lang.Float" - clsMappings[field_XCosScore2] = "java.lang.Float" - clsMappings[field_XCosScore3] = "java.lang.Float" + fieldMetaProps.append({"fieldName":field_XCosRefMols, "values": {"source":source, "description":"XCos reference fragments"}}) fieldMetaProps.append({"fieldName":field_XCosNumHits, "values": {"source":source, "description":"XCos number of hits"}}) fieldMetaProps.append({"fieldName":field_XCosScore1, "values": {"source":source, "description":"XCos score 1"}}) - fieldMetaProps.append({"fieldName":field_XCosScore2, "values": {"source":source, "description":"XCos score 2"}}) - fieldMetaProps.append({"fieldName":field_XCosScore3, "values": {"source":source, "description":"XCos score 3"}}) - - + frags_input,frags_suppl = rdkit_utils.default_open_input(args.fragments, args.fragments_format) inputs_file, inputs_supplr = rdkit_utils.default_open_input(args.input, args.informat)