From 6ba6cd9e56eeeb1db976dc76ab71b7d82b8f4416 Mon Sep 17 00:00:00 2001
From: Waztom <waztom@gmail.com>
Date: Wed, 22 Jul 2020 15:11:36 +0100
Subject: [PATCH 1/3] Set van der Waals radius scale

---
 src/python/pipelines/xchem/xcos.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/python/pipelines/xchem/xcos.py b/src/python/pipelines/xchem/xcos.py
index 6d82fba..3860e20 100644
--- a/src/python/pipelines/xchem/xcos.py
+++ b/src/python/pipelines/xchem/xcos.py
@@ -275,7 +275,8 @@ def getReverseScores(clustered_df, mols, frags, no_clustered_feats, rad_threshol
                 # NB reverse SuCOS scoring
                 fm_score = getFeatureMapScore(bit, frag_mol)
                 fm_score = np.clip(fm_score, 0, 1)
-                protrude_dist = rdShapeHelpers.ShapeProtrudeDist(bit, frag_mol, allowReordering=False)
+                # Change van der Waals radius scale for stricter overlay
+                protrude_dist = rdShapeHelpers.ShapeProtrudeDist(bit, frag_mol, allowReordering=False, vdwScale=0.2)
                 protrude_dist = np.clip(protrude_dist, 0, 1)
 
                 reverse_SuCOS_score = 0.5 * fm_score + 0.5 * (1 - protrude_dist)

From 93ebb1bb3c60674297a6def96eaaafda4a37a1a7 Mon Sep 17 00:00:00 2001
From: Alan Christie <alan.christie@matildapeak.com>
Date: Wed, 22 Jul 2020 16:04:42 +0100
Subject: [PATCH 2/3] - An attempt to fix pull-requests from foreign repos

- Testing now done locally
- Publishing (and docker login) now also requires a password
---
 .travis.yml | 66 +++++++++++++++++++++++++++++------------------------
 1 file changed, 36 insertions(+), 30 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 54732ce..f48c3c2 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -10,78 +10,84 @@
 #
 # DOCKER_USERNAME       If PUBLISH_IMAGES is 'yes'
 # DOCKER_PASSWORD       If PUBLISH_IMAGES is 'yes'
+#
+# -----------------
+#
+# NOTE: Pull requests from foreign repositories will not
+#       result in encrypted variables being set.
+#       So, regardless of the state of PUBLISH_IMAGES,
+#       images will only be published if DOCKER_PASSWORD is defined.
 
 os: linux
 services:
 - docker
 
 stages:
+- name: test
 - name: publish latest
   if: |
     branch = master \
-    AND env(PUBLISH_IMAGES) = yes
-- name: test latest
-  if: |
-    branch = master \
-    AND env(PUBLISH_IMAGES) = yes
+    AND env(PUBLISH_IMAGES) = yes \
+    AND env(DOCKER_PASSWORD) IS present
 - name: publish tag
   if: |
     tag IS present \
-    AND env(PUBLISH_IMAGES) = yes
+    AND env(PUBLISH_IMAGES) = yes \
+    AND env(DOCKER_PASSWORD) IS present
 - name: publish stable
   if: |
     tag IS present \
     AND tag =~ ^([0-9]+\.){1,2}[0-9]+$ \
-    AND env(PUBLISH_IMAGES) = yes
-
-before_script:
-- docker login -u="$DOCKER_USERNAME" -p="$DOCKER_PASSWORD"
+    AND env(PUBLISH_IMAGES) = yes \
+    AND env(DOCKER_PASSWORD) IS present
 
 jobs:
   include:
 
+  - stage: test
+    name: Test Local Image
+    script:
+    - docker build -t informaticsmatters/rdkit_pipelines:latest -f Dockerfile-rdkit .
+    - docker build -t squonk/rdkit-pipelines-sdposter:latest -f Dockerfile-sdposter .
+    - git clone https://github.com/InformaticsMatters/pipelines-utils.git
+    - cd pipelines-utils/src/groovy
+    - groovy PipelineTester.groovy -indocker
+
   # Publish-stage jobs...
-  # Every successful master build results in a latest image (above)
+  # Every successful master build results in a latest image
   # and every tag results in a tagged image in Docker Hub.
   # Tags that match a RegEx are considered 'official' tags
   # and also result in a 'stable' image tag.
 
   - stage: publish latest
-    name: Latest Image
+    name: Publish Latest Image
     script:
     # Build and push the pipelines-rdkit image and its sd-poster
     - docker build -t informaticsmatters/rdkit_pipelines:latest -f Dockerfile-rdkit .
-    - docker push informaticsmatters/rdkit_pipelines:latest
     - docker build -t squonk/rdkit-pipelines-sdposter:latest -f Dockerfile-sdposter .
+    - docker login -u="$DOCKER_USERNAME" -p="$DOCKER_PASSWORD"
+    - docker push informaticsmatters/rdkit_pipelines:latest
     - docker push squonk/rdkit-pipelines-sdposter:latest
 
-  - stage: test latest
-    name: Test Latest Image
-    script:
-    # Pull the latest pipelines-rdkit image
-    # then clone the utils rep (containing the test engine)
-    # and then run the tests
-    - docker pull informaticsmatters/rdkit_pipelines:latest
-    - git clone https://github.com/InformaticsMatters/pipelines-utils.git
-    - cd pipelines-utils/src/groovy
-    - groovy PipelineTester.groovy -indocker
-
   - stage: publish tag
-    name: Tagged Image
+    name: Publish Tagged Image
     script:
     # Build and push the pipelines-rdkit image and its sd-poster
     - docker build -t informaticsmatters/rdkit_pipelines:${TRAVIS_TAG} -f Dockerfile-rdkit .
-    - docker push informaticsmatters/rdkit_pipelines:${TRAVIS_TAG}
     - docker build -t squonk/rdkit-pipelines-sdposter:${TRAVIS_TAG} -f Dockerfile-sdposter .
+    - docker login -u="$DOCKER_USERNAME" -p="$DOCKER_PASSWORD"
+    - docker push informaticsmatters/rdkit_pipelines:${TRAVIS_TAG}
     - docker push squonk/rdkit-pipelines-sdposter:${TRAVIS_TAG}
 
   - stage: publish stable
-    name: Stable Image
+    name: Publish Stable Image
     script:
-    # Pull the corresponding pipelines-rdkit image tag and push it as 'stable'
+    # Pull the corresponding pipelines-rdkit image tag
+    # and push it again as 'stable'
+    - docker login -u="$DOCKER_USERNAME" -p="$DOCKER_PASSWORD"
     - docker pull informaticsmatters/rdkit_pipelines:${TRAVIS_TAG}
-    - docker tag informaticsmatters/rdkit_pipelines:${TRAVIS_TAG} informaticsmatters/rdkit_pipelines:stable
-    - docker push informaticsmatters/rdkit_pipelines:stable
     - docker pull squonk/rdkit-pipelines-sdposter:${TRAVIS_TAG}
+    - docker tag informaticsmatters/rdkit_pipelines:${TRAVIS_TAG} informaticsmatters/rdkit_pipelines:stable
     - docker tag squonk/rdkit-pipelines-sdposter:${TRAVIS_TAG} squonk/rdkit-pipelines-sdposter:stable
+    - docker push informaticsmatters/rdkit_pipelines:stable
     - docker push squonk/rdkit-pipelines-sdposter:stable

From 7dffa0f116fad95ece2d97a777a1d7966fc547c7 Mon Sep 17 00:00:00 2001
From: Waztom <waztom@gmail.com>
Date: Fri, 24 Jul 2020 13:49:27 +0100
Subject: [PATCH 3/3] Added MCS filter to XCOS and removed Scores 2-3

---
 src/python/pipelines/xchem/xcos.py | 246 +++++------------------------
 1 file changed, 38 insertions(+), 208 deletions(-)

diff --git a/src/python/pipelines/xchem/xcos.py b/src/python/pipelines/xchem/xcos.py
index 6d82fba..590a70e 100644
--- a/src/python/pipelines/xchem/xcos.py
+++ b/src/python/pipelines/xchem/xcos.py
@@ -25,12 +25,12 @@
 from rdkit.Chem.FeatMaps import FeatMaps
 from rdkit.Chem import AllChem, rdShapeHelpers
 from rdkit import RDConfig
+from rdkit.Chem import rdFMCS
 
 import os, argparse
 
 import numpy as np
 import pandas as pd
-from sklearn.neighbors import NearestNeighbors
 
 from datetime import datetime
 
@@ -40,8 +40,6 @@
 field_XCosRefMols = "XCos_RefMols"
 field_XCosNumHits = "XCos_NumHits"
 field_XCosScore1 = "XCos_Score1"
-field_XCosScore2 = "XCos_Score2"
-field_XCosScore3 = "XCos_Score3"
 
 
 date = datetime.today().strftime('%Y-%m-%d')
@@ -101,168 +99,15 @@ def getFeatureMapScore(small_m, large_m, score_mode=FeatMaps.FeatMapScoreMode.Al
     except ZeroDivisionError:
         return 0
 
-def getNumberfeats(mol):
-
-    featLists = []
-    rawFeats = fdef.GetFeaturesForMol(mol)
-    # filter that list down to only include the ones we're intereted in
-    featLists.append([f for f in rawFeats if f.GetFamily() in keep])
-
-    return len(featLists)
-
-
-def getFeatureMapXCOS(mol_list):
-    allFeats = []
-    for m in mol_list:
-
-        rawFeats = fdef.GetFeaturesForMol(m)
-        featDeats = [(f.GetType(),
-                      f.GetPos().x,
-                      f.GetPos().y,
-                      f.GetPos().z) for f in rawFeats if f.GetFamily() in keep]
-
-        allFeats.append(featDeats)
-
-
-    feature_map_df = pd.DataFrame([t for lst in allFeats for t in lst],
-                                  columns =['featType', 'x', 'y', 'z'])
-
-    return feature_map_df
-
-
-def getFeatureAgg(feature_map_df, rad_thresh):
-
-    # Group data into unique feature types
-    grouped_df = feature_map_df.groupby('featType')
-
-    data_to_add = []
-
-    for group_name, df_group in grouped_df:
-
-        # Reset index df
-        df_group = df_group.reset_index()
-
-        if len(df_group) == 1:
-
-            data_to_add.append(df_group)
-
-        if len(df_group) > 1:
-
-            # Get feature name
-            feat_name = df_group.featType.unique()[0]
-
-            # Use radius neighbours to find features within
-            # spere with radius thresh
-            neigh = NearestNeighbors(radius=rad_thresh)
-
-            while len(df_group) > 0:
-
-                neigh.fit(df_group[['x','y','z']])
-
-                # Get distances and indices of neigbours within radius threshold
-                rng = neigh.radius_neighbors()
-                neigh_dist = rng[0][0]
-                neigh_indices = rng[1][0]
-
-                # Append the first index - NB clustering done relative to index 0
-                neigh_indices = list(np.append(0, neigh_indices))
-
-                # Calculate average x,y,z coords for features in similar loc
-                x_avg = np.mean(df_group.iloc[neigh_indices].x)
-                y_avg = np.mean(df_group.iloc[neigh_indices].y)
-                z_avg = np.mean(df_group.iloc[neigh_indices].z)
-
-                # Add feature with average x, y and z values
-                new_row = [(feat_name, x_avg, y_avg, z_avg)]
-
-                cluster_df = pd.DataFrame(data=new_row, columns = ['featType', 'x', 'y', 'z'])
-
-                data_to_add.append(cluster_df)
-
-                # Remove indices of clustered neigbours
-                df_group = df_group.drop(df_group.index[neigh_indices])
-
-    # Create single DF from list of dfs
-    clustered_df = pd.concat(data_to_add)
-
-    return clustered_df
-
 
 # This is the main XCOS function
-def getReverseScores(clustered_df, mols, frags, no_clustered_feats, rad_threshold, COS_threshold, writer):
+def getReverseScores(mols, frags, COS_threshold, writer):
 
     for mol in mols:
 
         # Get the bits
         compound_bits = getBits(mol)
 
-        # We are going to include a feature mapping score, where the
-        # number of features of the compound matching the clustered feats
-        # within a threshold are found
-
-        # Get feature map of compound bits as df
-        feature_map_bits = getFeatureMapXCOS(compound_bits)
-
-        # Group data into unique feature types
-        grouped_df = feature_map_bits.groupby('featType')
-
-        no_feats_matched = []
-        dist_feats_matched = []
-
-        # Use radius neighbours to find features within
-        # sphere with radius thresh
-        neigh = NearestNeighbors(radius=rad_threshold)
-
-        # Loop through grouped features
-        for group_name, df_group in grouped_df:
-
-            # Get feat name
-            feat_name = df_group.featType.unique()[0]
-
-            # Get similar feats from cluster df
-            cluster_test = clustered_df[clustered_df.featType == feat_name]
-
-            # Reset index df
-            df_group = df_group.reset_index()
-
-            if len(cluster_test) == 1:
-
-                # Calculate distances
-                x1_sub_x2 = (cluster_test.iloc[0].x - df_group.iloc[0].x) ** 2
-                y1_sub_y2 = (cluster_test.iloc[0].y - df_group.iloc[0].y) ** 2
-                z1_sub_z2 = (cluster_test.iloc[0].z - df_group.iloc[0].z) ** 2
-
-                diff_sum = x1_sub_x2 + y1_sub_y2 + z1_sub_z2
-
-                dist = diff_sum ** 0.5
-
-                if dist < rad_threshold:
-                    # Let's get the number of feats matched
-                    no_feats_matched.append(1)
-
-                    # Let's get the distance of the feats matched
-                    dist_feats_matched.append([dist])
-
-            if len(cluster_test) > 1:
-                neigh.fit(cluster_test[['x', 'y', 'z']])
-
-                while len(df_group) > 0:
-                    # Get distances and indices of neigbours within radius threshold
-                    feat_coords = [[df_group.iloc[0].x, df_group.iloc[0].y, df_group.iloc[0].z]]
-                    rng = neigh.radius_neighbors(feat_coords)
-
-                    neigh_dist = rng[0][0]
-                    neigh_indices = rng[1][0]
-
-                    # Let's get the number of feats matched
-                    no_feats_matched.append(len(neigh_indices))
-
-                    # Remove index 0 of df_group
-                    df_group = df_group.drop(df_group.index[0])
-
-        # Get total number of feat matches
-        no_feats = np.sum(no_feats_matched)
-
         all_scores = []
 
         for bit in compound_bits:
@@ -271,52 +116,55 @@ def getReverseScores(clustered_df, mols, frags, no_clustered_feats, rad_threshol
             no_bit_atoms = bit.GetNumAtoms()
 
             scores = []
+
             for frag_mol in frags:
-                # NB reverse SuCOS scoring
-                fm_score = getFeatureMapScore(bit, frag_mol)
-                fm_score = np.clip(fm_score, 0, 1)
-                protrude_dist = rdShapeHelpers.ShapeProtrudeDist(bit, frag_mol, allowReordering=False)
-                protrude_dist = np.clip(protrude_dist, 0, 1)
 
-                reverse_SuCOS_score = 0.5 * fm_score + 0.5 * (1 - protrude_dist)
+                # Get frag name for linking to score
+                frag_name = frag_mol.GetProp('_Name').strip('Mpro-')
+                
+                # Check whether the MCS yields > 0 atoms
+                mcs_match = rdFMCS.FindMCS([bit,frag_mol],ringMatchesRingOnly=True,matchValences=True)
+                
+                # Get number of atoms in MCS match found
+                no_mcs_atoms = Chem.MolFromSmarts(mcs_match.smartsString).GetNumAtoms()
 
-                # Get number of feats from bit for scaling score
-                no_bit_feats = getNumberfeats(bit)
+                if no_mcs_atoms == 0:
 
-                # Get some info and append to list
-                frag_name = frag_mol.GetProp('_Name').strip('Mpro-')
+                    scores.append((frag_name, 0, no_bit_atoms))
+                
+                if no_mcs_atoms > 0:
+
+                    # NB reverse SuCOS scoring
+                    fm_score = getFeatureMapScore(bit, frag_mol)
+                    fm_score = np.clip(fm_score, 0, 1)
+
+                    # Change van der Waals radius scale for stricter overlay                     
+                    protrude_dist = rdShapeHelpers.ShapeProtrudeDist(bit, frag_mol,
+                                                                     allowReordering=False,
+                                                                     vdwScale=0.2)
+                    protrude_dist = np.clip(protrude_dist, 0, 1)
+
+                    reverse_SuCOS_score = 0.5 * fm_score + 0.5 * (1 - protrude_dist)
 
-                scores.append((frag_name, reverse_SuCOS_score, no_bit_atoms, no_bit_feats))
+                    scores.append((frag_name, reverse_SuCOS_score, no_bit_atoms))
 
             all_scores.append(scores)
 
             list_dfs = []
+
             for score in all_scores:
-                df = pd.DataFrame(data=score, columns=['Fragment', 'Score', 'No_bit_atoms', 'No_bit_feats'])
+
+                df = pd.DataFrame(data=score, columns=['Fragment', 'Score', 'No_bit_atoms'])
+                
                 # Get maximum scoring fragment for bit match
                 df = df[df['Score'] == df['Score'].max()]
                 list_dfs.append(df)
 
             final_df = pd.concat(list_dfs)
 
-            # Get total bit score and some denominator terms
-            bits_score = (final_df.No_bit_atoms * final_df.Score).sum()
-            total_atoms = final_df.No_bit_atoms.sum()
-            feat_match_fraction = no_feats / no_clustered_feats
-
             # Score 1: the score is scaled by the number of bit atoms
-            score_1 = bits_score
-
-            # Score 2: the score is scaled by the number of bit atoms
-            # penalised by the fraction of feats matched
-            # the to total number feats clustered
-            score_2 = score_1 * feat_match_fraction
-
-            # Score 3: the score is determined by the fraction of matching
-            # features to the clustered features within a threshold. This
-            # should yield similar values to Tim's Featurestein method?
-            score_3 = feat_match_fraction
-
+            score_1 = (final_df.No_bit_atoms * final_df.Score).sum()
+           
             # Let's only get frags above a threshold
             final_df = final_df[final_df.Score > COS_threshold]
 
@@ -330,8 +178,6 @@ def getReverseScores(clustered_df, mols, frags, no_clustered_feats, rad_threshol
         mol.SetProp(field_XCosRefMols, ','.join(all_frags))
         mol.SetIntProp(field_XCosNumHits, len(all_frags))
         mol.SetProp(field_XCosScore1, "{:.4f}".format(score_1))
-        mol.SetProp(field_XCosScore2, "{:.4f}".format(score_2))
-        mol.SetProp(field_XCosScore3, "{:.4f}".format(score_3))
 
         # Write to file
         writer.write(mol)
@@ -351,19 +197,8 @@ def process(molecules, fragments, writer):
     else:
         utils.log('Using', len(frag_mol_list), 'fragments. No errors')
 
-    feature_map_df =  getFeatureMapXCOS(frag_mol_list)
-    utils.log('Feature map dataframe shape:', feature_map_df.shape)
-
-    # Set radius threshold
-    rad_thresh = 1.5
-
-    # Aggregate features using nearest neigbours algo
-    clustered_df = getFeatureAgg(feature_map_df, rad_thresh=rad_thresh)
-    utils.log('Clustered dataframe shape:', clustered_df.shape)
-    no_clustered_feats = len(clustered_df)
-
-    #clustered_df, mols, rad_threshold, COS_threshold, writer
-    getReverseScores(clustered_df, molecules, frag_mol_list, no_clustered_feats, 1.0, 0.50, writer)
+    #mols, frags, COS_threshold, writer
+    getReverseScores(molecules, frag_mol_list, 0.40, writer)
 
 
 def main():
@@ -380,7 +215,6 @@ def main():
     parser.add_argument('--no-gzip', action='store_true', help='Do not compress the output (STDOUT is never compressed')
     parser.add_argument('--metrics', action='store_true', help='Write metrics')
 
-
     args = parser.parse_args()
     utils.log("XCos Args: ", args)
 
@@ -393,15 +227,11 @@ def main():
     clsMappings[field_XCosRefMols] = "java.lang.String"
     clsMappings[field_XCosNumHits] = "java.lang.Integer"
     clsMappings[field_XCosScore1] = "java.lang.Float"
-    clsMappings[field_XCosScore2] = "java.lang.Float"
-    clsMappings[field_XCosScore3] = "java.lang.Float"
+
     fieldMetaProps.append({"fieldName":field_XCosRefMols,   "values": {"source":source, "description":"XCos reference fragments"}})
     fieldMetaProps.append({"fieldName":field_XCosNumHits,   "values": {"source":source, "description":"XCos number of hits"}})
     fieldMetaProps.append({"fieldName":field_XCosScore1,   "values": {"source":source, "description":"XCos score 1"}})
-    fieldMetaProps.append({"fieldName":field_XCosScore2,   "values": {"source":source, "description":"XCos score 2"}})
-    fieldMetaProps.append({"fieldName":field_XCosScore3,   "values": {"source":source, "description":"XCos score 3"}})
-
-
+    
     frags_input,frags_suppl = rdkit_utils.default_open_input(args.fragments, args.fragments_format)
 
     inputs_file, inputs_supplr = rdkit_utils.default_open_input(args.input, args.informat)