Skip to content

Commit

Permalink
Merge pull request #40 from Waztom/XCOS2
Browse files Browse the repository at this point in the history
Addded MCs filter to XCOS and removed Scores 2-3
  • Loading branch information
tdudgeon authored Jul 24, 2020
2 parents 93ebb1b + ceabf2d commit dbb5608
Showing 1 changed file with 39 additions and 203 deletions.
242 changes: 39 additions & 203 deletions src/python/pipelines/xchem/xcos.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,12 @@
from rdkit.Chem.FeatMaps import FeatMaps
from rdkit.Chem import AllChem, rdShapeHelpers
from rdkit import RDConfig
from rdkit.Chem import rdFMCS

import os, argparse

import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors

from datetime import datetime

Expand All @@ -40,8 +40,6 @@
field_XCosRefMols = "XCos_RefMols"
field_XCosNumHits = "XCos_NumHits"
field_XCosScore1 = "XCos_Score1"
field_XCosScore2 = "XCos_Score2"
field_XCosScore3 = "XCos_Score3"


date = datetime.today().strftime('%Y-%m-%d')
Expand Down Expand Up @@ -101,168 +99,15 @@ def getFeatureMapScore(small_m, large_m, score_mode=FeatMaps.FeatMapScoreMode.Al
except ZeroDivisionError:
return 0

def getNumberfeats(mol):

featLists = []
rawFeats = fdef.GetFeaturesForMol(mol)
# filter that list down to only include the ones we're intereted in
featLists.append([f for f in rawFeats if f.GetFamily() in keep])

return len(featLists)


def getFeatureMapXCOS(mol_list):
allFeats = []
for m in mol_list:

rawFeats = fdef.GetFeaturesForMol(m)
featDeats = [(f.GetType(),
f.GetPos().x,
f.GetPos().y,
f.GetPos().z) for f in rawFeats if f.GetFamily() in keep]

allFeats.append(featDeats)


feature_map_df = pd.DataFrame([t for lst in allFeats for t in lst],
columns =['featType', 'x', 'y', 'z'])

return feature_map_df


def getFeatureAgg(feature_map_df, rad_thresh):

# Group data into unique feature types
grouped_df = feature_map_df.groupby('featType')

data_to_add = []

for group_name, df_group in grouped_df:

# Reset index df
df_group = df_group.reset_index()

if len(df_group) == 1:

data_to_add.append(df_group)

if len(df_group) > 1:

# Get feature name
feat_name = df_group.featType.unique()[0]

# Use radius neighbours to find features within
# spere with radius thresh
neigh = NearestNeighbors(radius=rad_thresh)

while len(df_group) > 0:

neigh.fit(df_group[['x','y','z']])

# Get distances and indices of neigbours within radius threshold
rng = neigh.radius_neighbors()
neigh_dist = rng[0][0]
neigh_indices = rng[1][0]

# Append the first index - NB clustering done relative to index 0
neigh_indices = list(np.append(0, neigh_indices))

# Calculate average x,y,z coords for features in similar loc
x_avg = np.mean(df_group.iloc[neigh_indices].x)
y_avg = np.mean(df_group.iloc[neigh_indices].y)
z_avg = np.mean(df_group.iloc[neigh_indices].z)

# Add feature with average x, y and z values
new_row = [(feat_name, x_avg, y_avg, z_avg)]

cluster_df = pd.DataFrame(data=new_row, columns = ['featType', 'x', 'y', 'z'])

data_to_add.append(cluster_df)

# Remove indices of clustered neigbours
df_group = df_group.drop(df_group.index[neigh_indices])

# Create single DF from list of dfs
clustered_df = pd.concat(data_to_add)

return clustered_df


# This is the main XCOS function
def getReverseScores(clustered_df, mols, frags, no_clustered_feats, rad_threshold, COS_threshold, writer):
def getReverseScores(mols, frags, COS_threshold, writer):

for mol in mols:

# Get the bits
compound_bits = getBits(mol)

# We are going to include a feature mapping score, where the
# number of features of the compound matching the clustered feats
# within a threshold are found

# Get feature map of compound bits as df
feature_map_bits = getFeatureMapXCOS(compound_bits)

# Group data into unique feature types
grouped_df = feature_map_bits.groupby('featType')

no_feats_matched = []
dist_feats_matched = []

# Use radius neighbours to find features within
# sphere with radius thresh
neigh = NearestNeighbors(radius=rad_threshold)

# Loop through grouped features
for group_name, df_group in grouped_df:

# Get feat name
feat_name = df_group.featType.unique()[0]

# Get similar feats from cluster df
cluster_test = clustered_df[clustered_df.featType == feat_name]

# Reset index df
df_group = df_group.reset_index()

if len(cluster_test) == 1:

# Calculate distances
x1_sub_x2 = (cluster_test.iloc[0].x - df_group.iloc[0].x) ** 2
y1_sub_y2 = (cluster_test.iloc[0].y - df_group.iloc[0].y) ** 2
z1_sub_z2 = (cluster_test.iloc[0].z - df_group.iloc[0].z) ** 2

diff_sum = x1_sub_x2 + y1_sub_y2 + z1_sub_z2

dist = diff_sum ** 0.5

if dist < rad_threshold:
# Let's get the number of feats matched
no_feats_matched.append(1)

# Let's get the distance of the feats matched
dist_feats_matched.append([dist])

if len(cluster_test) > 1:
neigh.fit(cluster_test[['x', 'y', 'z']])

while len(df_group) > 0:
# Get distances and indices of neigbours within radius threshold
feat_coords = [[df_group.iloc[0].x, df_group.iloc[0].y, df_group.iloc[0].z]]
rng = neigh.radius_neighbors(feat_coords)

neigh_dist = rng[0][0]
neigh_indices = rng[1][0]

# Let's get the number of feats matched
no_feats_matched.append(len(neigh_indices))

# Remove index 0 of df_group
df_group = df_group.drop(df_group.index[0])

# Get total number of feat matches
no_feats = np.sum(no_feats_matched)

all_scores = []

for bit in compound_bits:
Expand All @@ -271,53 +116,62 @@ def getReverseScores(clustered_df, mols, frags, no_clustered_feats, rad_threshol
no_bit_atoms = bit.GetNumAtoms()

scores = []

for frag_mol in frags:

# NB reverse SuCOS scoring
fm_score = getFeatureMapScore(bit, frag_mol)
fm_score = np.clip(fm_score, 0, 1)
# Change van der Waals radius scale for stricter overlay
protrude_dist = rdShapeHelpers.ShapeProtrudeDist(bit, frag_mol, allowReordering=False, vdwScale=0.2)
protrude_dist = np.clip(protrude_dist, 0, 1)

reverse_SuCOS_score = 0.5 * fm_score + 0.5 * (1 - protrude_dist)
# Get frag name for linking to score
frag_name = frag_mol.GetProp('_Name').strip('Mpro-')

# Check if MCS yield > 0 atoms
mcs_match = rdFMCS.FindMCS([bit,frag_mol],ringMatchesRingOnly=True,matchValences=True)

# Get number of atoms in MCS match found
no_mcs_atoms = Chem.MolFromSmarts(mcs_match.smartsString).GetNumAtoms()

# Get number of feats from bit for scaling score
no_bit_feats = getNumberfeats(bit)
if no_mcs_atoms == 0:

# Get some info and append to list
frag_name = frag_mol.GetProp('_Name').strip('Mpro-')
scores.append((frag_name, 0, no_bit_atoms))

if no_mcs_atoms > 0:

# NB reverse SuCOS scoring
fm_score = getFeatureMapScore(bit, frag_mol)
fm_score = np.clip(fm_score, 0, 1)

# Change van der Waals radius scale for stricter overlay
protrude_dist = rdShapeHelpers.ShapeProtrudeDist(bit, frag_mol,
allowReordering=False,
vdwScale=0.2)
protrude_dist = np.clip(protrude_dist, 0, 1)

reverse_SuCOS_score = 0.5 * fm_score + 0.5 * (1 - protrude_dist)

scores.append((frag_name, reverse_SuCOS_score, no_bit_atoms, no_bit_feats))
scores.append((frag_name, reverse_SuCOS_score, no_bit_atoms))

all_scores.append(scores)

list_dfs = []

for score in all_scores:
df = pd.DataFrame(data=score, columns=['Fragment', 'Score', 'No_bit_atoms', 'No_bit_feats'])

df = pd.DataFrame(data=score, columns=['Fragment', 'Score', 'No_bit_atoms'])

# Get maximum scoring fragment for bit match
df = df[df['Score'] == df['Score'].max()]
list_dfs.append(df)

final_df = pd.concat(list_dfs)

# Get total bit score and some denominator terms
bits_score = (final_df.No_bit_atoms * final_df.Score).sum()
total_atoms = final_df.No_bit_atoms.sum()
feat_match_fraction = no_feats / no_clustered_feats

# Score 1: the score is scaled by the number of bit atoms
score_1 = bits_score

# Score 2: the score is scaled by the number of bit atoms
# penalised by the fraction of feats matched
# the to total number feats clustered
score_2 = score_1 * feat_match_fraction

# Score 3: the score is determined by the fraction of matching
# features to the clustered features within a threshold. This
# should yield similar values to Tim's Featurestein method?
score_3 = feat_match_fraction

score_1 = (final_df.No_bit_atoms * final_df.Score).sum()

# Let's only get frags above a threshold
final_df = final_df[final_df.Score > COS_threshold]

Expand All @@ -331,8 +185,6 @@ def getReverseScores(clustered_df, mols, frags, no_clustered_feats, rad_threshol
mol.SetProp(field_XCosRefMols, ','.join(all_frags))
mol.SetIntProp(field_XCosNumHits, len(all_frags))
mol.SetProp(field_XCosScore1, "{:.4f}".format(score_1))
mol.SetProp(field_XCosScore2, "{:.4f}".format(score_2))
mol.SetProp(field_XCosScore3, "{:.4f}".format(score_3))

# Write to file
writer.write(mol)
Expand All @@ -352,19 +204,8 @@ def process(molecules, fragments, writer):
else:
utils.log('Using', len(frag_mol_list), 'fragments. No errors')

feature_map_df = getFeatureMapXCOS(frag_mol_list)
utils.log('Feature map dataframe shape:', feature_map_df.shape)

# Set radius threshold
rad_thresh = 1.5

# Aggregate features using nearest neigbours algo
clustered_df = getFeatureAgg(feature_map_df, rad_thresh=rad_thresh)
utils.log('Clustered dataframe shape:', clustered_df.shape)
no_clustered_feats = len(clustered_df)

#clustered_df, mols, rad_threshold, COS_threshold, writer
getReverseScores(clustered_df, molecules, frag_mol_list, no_clustered_feats, 1.0, 0.50, writer)
#mols, frags, COS_threshold, writer
getReverseScores(molecules, frag_mol_list, 0.40, writer)


def main():
Expand All @@ -381,7 +222,6 @@ def main():
parser.add_argument('--no-gzip', action='store_true', help='Do not compress the output (STDOUT is never compressed')
parser.add_argument('--metrics', action='store_true', help='Write metrics')


args = parser.parse_args()
utils.log("XCos Args: ", args)

Expand All @@ -394,15 +234,11 @@ def main():
clsMappings[field_XCosRefMols] = "java.lang.String"
clsMappings[field_XCosNumHits] = "java.lang.Integer"
clsMappings[field_XCosScore1] = "java.lang.Float"
clsMappings[field_XCosScore2] = "java.lang.Float"
clsMappings[field_XCosScore3] = "java.lang.Float"

fieldMetaProps.append({"fieldName":field_XCosRefMols, "values": {"source":source, "description":"XCos reference fragments"}})
fieldMetaProps.append({"fieldName":field_XCosNumHits, "values": {"source":source, "description":"XCos number of hits"}})
fieldMetaProps.append({"fieldName":field_XCosScore1, "values": {"source":source, "description":"XCos score 1"}})
fieldMetaProps.append({"fieldName":field_XCosScore2, "values": {"source":source, "description":"XCos score 2"}})
fieldMetaProps.append({"fieldName":field_XCosScore3, "values": {"source":source, "description":"XCos score 3"}})



frags_input,frags_suppl = rdkit_utils.default_open_input(args.fragments, args.fragments_format)

inputs_file, inputs_supplr = rdkit_utils.default_open_input(args.input, args.informat)
Expand Down

0 comments on commit dbb5608

Please sign in to comment.