Skip to content

Commit

Permalink
featurestein and xcos scoring
Browse files Browse the repository at this point in the history
  • Loading branch information
tdudgeon committed Jul 7, 2020
1 parent 7dbe6ca commit 131d4fa
Show file tree
Hide file tree
Showing 7 changed files with 797 additions and 0 deletions.
Binary file added data/mpro/hits-17.sdf.gz
Binary file not shown.
Binary file added data/mpro/poses.sdf.gz
Binary file not shown.
2 changes: 2 additions & 0 deletions requirements-rdkit.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,5 @@ im-pipelines-utils-rdkit==1.5.*
matplotlib==2.2.*
molvs==0.1.1
standardiser==0.1.9
pandas==1.0.1
scikit-learn==0.22.1
58 changes: 58 additions & 0 deletions src/nextflow/rdkit/xcos.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#!/usr/bin/env nextflow

// Default values for the workflow parameters.
// NOTE(review): the data files added with this workflow appear to be gzipped
// (poses.sdf.gz, hits-17.sdf.gz) - confirm that these un-gzipped defaults resolve.
// Molecules (poses) to be scored; fed to the splitter process.
params.inputs = "data/mpro/poses.sdf"
// Fragment molecules passed to the xcos process with -f.
params.fragments = "data/mpro/hits-17.sdf"
// Passed to the splitter's filter command as -c (presumably molecules per chunk - confirm).
params.chunk = 500
// Passed to the filter command as -l (presumably 0 means no limit - confirm).
params.limit = 0
// Passed to the filter command as -d (presumably digits used to number the chunk files - confirm).
params.digits = 4

// Resolve the parameter values to file objects used as process inputs.
inputs = file(params.inputs)
fragments = file(params.fragments)

// Splits the input molecules into chunk files (inputs_part*.sdf.gz) and emits
// each chunk as a separate item on the inputs_parts channel.
process splitter {

    container 'informaticsmatters/rdkit_pipelines:latest'

    input:
    file inputs

    output:
    file 'inputs_part*.sdf.gz' into inputs_parts mode flatten

    script:
    """
    python -m pipelines_utils_rdkit.filter -i '${inputs}' -c ${params.chunk} -l ${params.limit} -d ${params.digits} -o 'inputs_part_' -of sdf
    """
}

// Scores one chunk of molecules against the fragments using the xcos module
// and emits the un-gzipped scored chunk on the scored_parts channel.
process xcos {

    container 'informaticsmatters/rdkit_pipelines:latest'

    input:
    file part from inputs_parts
    file fragments

    output:
    file 'scored_part*.sdf' into scored_parts

    script:
    // inputs_part_NNNN.sdf.gz -> scored_part_NNNN (the trailing 7 characters '.sdf.gz' are dropped)
    def scoredBase = part.name.replace('inputs', 'scored')[0..-8]
    """
    python -m pipelines.rdkit.xcos -i '${part}' -f '${fragments}' -o '${scoredBase}' -of sdf --no-gzip
    """
}

// Concatenates all the scored chunks into a single gzipped SD file which is
// moved into the current directory.
process joiner {

    container 'informaticsmatters/rdkit_pipelines:latest'

    publishDir ".", mode: 'move'

    input:
    file parts from scored_parts.collect()

    output:
    file 'xcos_scored.sdf.gz'

    script:
    // $parts expands to a space separated list of file names, so it must NOT be
    // quoted: quoting makes cat look for one file whose name is the whole list,
    // which fails as soon as there is more than one chunk.
    """
    cat $parts | gzip > xcos_scored.sdf.gz
    """
}
177 changes: 177 additions & 0 deletions src/python/pipelines/rdkit/featurestein-generate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
#!/usr/bin/env python

# Copyright 2020 Informatics Matters Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Ligand pose scoring using 'FeatureStein'.
This module generates a merged feature map from a set of 3D ligands.
The output is a pickle of the merged feature map that can be read by the featurestein-score.py module to
generate scores.
"""

from __future__ import print_function
import argparse, os, sys, gzip, pickle

from rdkit import Chem, rdBase, RDConfig
from rdkit.Chem import AllChem, rdShapeHelpers
from rdkit.Chem.FeatMaps import FeatMaps
from rdkit.Chem.FeatMaps.FeatMapUtils import CombineFeatMaps

from pipelines_utils import parameter_utils, utils
from pipelines_utils_rdkit import rdkit_utils


### start function definitions #########################################

# Feature factory built from the BaseFeatures.fdef definitions in RDKit's data directory.
ffact = AllChem.BuildFeatureFactory(os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef'))

# One default FeatMapParams instance for each feature family the factory knows about.
fmParams = {family: FeatMaps.FeatMapParams() for family in ffact.GetFeatureFamilies()}

# Feature families to leave out of the feature maps (nothing is excluded by default).
exclude = ()

def filterFeature(f):
    """Filter predicate: return None if the feature's family is in ``exclude``, otherwise the feature itself."""
    return None if f.GetFamily() in exclude else f

def getRawFeatures(mol):
    """Return the list of features the factory finds for the molecule, keeping only those that pass filterFeature."""
    return [feat for feat in ffact.GetFeaturesForMol(mol) if filterFeature(feat)]

def getFeatureMap(mol):
    """Build a feature map for the molecule from its filtered features, giving every feature a weight of 1."""
    features = getRawFeatures(mol)
    unit_weights = [1] * len(features)
    return FeatMaps.FeatMap(feats=features, weights=unit_weights, params=fmParams)

def score_featmaps(fm1, fm2):
    """Generate the score for 2 feature maps.

    The features of fm2 are scored against fm1 and the result is normalised by
    the number of features in fm1.

    :param fm1: the feature map that does the scoring
    :param fm2: the feature map whose features are scored
    :return: the normalised score, or 0 if fm1 has no features
    """
    num_feats = fm1.GetNumFeatures()
    # A map with no features cannot be normalised. Return 0 rather than raising
    # ZeroDivisionError, which is also what featurestein-score.py does.
    if num_feats == 0:
        return 0
    return fm1.ScoreFeats(fm2.GetFeatures()) / num_feats

def build_feat_data(mols):
    """Build the feature maps and do the all vs. all comparison.

    :param mols: the molecules to generate feature maps for
    :return: tuple of (list of feature maps, square matrix of scores) where
             scores[i][j] is score_featmaps(fmaps[i], fmaps[j])
    """
    # Build each feature map exactly once and reuse it for every comparison
    # instead of regenerating the second map for every pair.
    fmaps = [getFeatureMap(mol) for mol in mols]
    scores = [[score_featmaps(fm1, fm2) for fm2 in fmaps] for fm1 in fmaps]
    return fmaps, scores

def find_closest(scores):
    """Find the highest off-diagonal score in a square score matrix.

    :param scores: square matrix (list of lists) of pairwise scores
    :return: tuple of (best score, row index, column index). When several
             pairs share the best score the first one encountered is returned.
    :raises ValueError: if there are no off-diagonal entries to compare
    """
    best = None
    for i, row in enumerate(scores):
        for j, score in enumerate(row):
            if i == j:
                continue
            # Always accept the first pair so that a result exists even when
            # every score is zero (e.g. maps that do not overlap at all).
            if best is None or score > best[0]:
                best = (score, i, j)
    if best is None:
        raise ValueError('find_closest needs at least 2 feature maps to compare')
    return best

def merge_feat_maps(fmaps, scores):
    """Merge the 2 closest feature maps, remove them from the data and replace with the merged feature map.

    Both arguments are modified in place: the two source maps (and their rows
    and columns in the score matrix) are removed and the merged map (with a
    new row and column of scores) is appended.

    :param fmaps: list of feature maps
    :param scores: square matrix of pairwise scores matching fmaps
    """
    top_score, row_idx, col_idx = find_closest(scores)
    first = fmaps[row_idx]
    second = fmaps[col_idx]
    utils.log('Merging', row_idx, 'and', col_idx, 'with score', top_score, '#features:', first.GetNumFeatures(), second.GetNumFeatures())
    combined = CombineFeatMaps(first, second, mergeMetric=1, mergeTol=1.5, dirMergeMode=0)

    # Delete the larger index first so that the smaller index stays valid.
    deletion_order = (max(row_idx, col_idx), min(row_idx, col_idx))
    for idx in deletion_order:
        del fmaps[idx]
        del scores[idx]
    for remaining_row in scores:
        for idx in deletion_order:
            del remaining_row[idx]

    # Score each surviving map against the merged map (new column) and the
    # merged map against each surviving map (new row).
    combined_row = []
    for survivor, survivor_row in zip(fmaps, scores):
        survivor_row.append(score_featmaps(survivor, combined))
        combined_row.append(score_featmaps(combined, survivor))

    fmaps.append(combined)
    combined_row.append(score_featmaps(combined, combined))
    scores.append(combined_row)


def process(inputs, fname):
    """Generate the merged feature map for the input molecules and pickle it.

    The feature maps of the molecules are merged pairwise, closest pair first,
    until a single feature map remains.

    :param inputs: iterable of molecules; falsy entries (e.g. None) are skipped
    :param fname: name of the pickle file to write the merged feature map to
    :return: tuple of (number of molecules used, number of features in the merged map)
    :raises ValueError: if the input contains no usable molecules
    """
    mols = [m for m in inputs if m]
    if not mols:
        raise ValueError('No valid molecules found in the input - cannot generate a feature map')

    fmaps, scores = build_feat_data(mols)
    # list() rather than list.copy() so that this also works on Python 2
    merged_fmaps = list(fmaps)
    utils.log('Processing', len(fmaps), 'molecules')
    while len(merged_fmaps) > 1:
        merge_feat_maps(merged_fmaps, scores)
    merged_fmap = merged_fmaps[0]

    # Use a context manager so the pickle file is flushed and closed reliably.
    with open(fname, 'wb') as pkl_file:
        pickle.dump(merged_fmap, pkl_file)
    utils.log('Wrote merged feature map with', merged_fmap.GetNumFeatures(), 'features as pickle to', fname)

    return len(mols), merged_fmap.GetNumFeatures()

### start main execution #########################################

def main():
    """Command line entry point: read the molecules and write the merged feature map pickle.

    With --metrics a metrics file is also written. This module has no output
    option, so the base name for the metrics is the name of the pickle file
    without its extension.
    """
    parser = argparse.ArgumentParser(description='FeatureStein generation with RDKit')
    parameter_utils.add_default_input_args(parser)
    parser.add_argument('-f', '--feat-map', default='featurestein.p', help='Name of pickle to generate')
    parser.add_argument('--metrics', action='store_true', help='Write metrics')

    args = parser.parse_args()
    utils.log("FeatureStein Args: ", args)

    inputs_file, inputs_supplr = rdkit_utils.default_open_input(args.input, args.informat)

    try:
        # this does the processing
        num_mols, num_feats = process(inputs_supplr, args.feat_map)
    finally:
        # close the input even if the processing fails
        inputs_file.close()

    if args.metrics:
        output_base = os.path.splitext(args.feat_map)[0]
        # the counts are ints so must be converted before being concatenated
        status = 'Generated ' + str(num_feats) + ' from ' + str(num_mols) + ' molecules'
        utils.write_metrics(output_base, {'__StatusMessage__': status,
                                          '__InputCount__': num_mols,
                                          'RDKitFeatureMap': num_mols})


if __name__ == "__main__":
    main()
141 changes: 141 additions & 0 deletions src/python/pipelines/rdkit/featurestein-score.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
#!/usr/bin/env python

# Copyright 2020 Informatics Matters Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Ligand pose scoring using 'FeatureStein'.
FeatureStein is a merged RDKit feature map that estimates the overlap of a ligand with a set of ligands (e.g. fragment
screening hits) based on the RDKit feature maps.
See featurestein-generate.py for how to generate the merged feature maps.
"""

from __future__ import print_function
import argparse, os, sys, gzip, pickle, traceback
from rdkit import Chem, rdBase, RDConfig
from rdkit.Chem import AllChem, rdShapeHelpers
from rdkit.Chem.FeatMaps import FeatMaps
from pipelines_utils import parameter_utils, utils
from pipelines_utils_rdkit import rdkit_utils


### start function definitions #########################################

# Name of the molecule property that the score is written to.
field_FeatureSteinScore = "FeatureStein_Score"

# Setting up the features to use in FeatureMap
ffact = AllChem.BuildFeatureFactory(os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef'))
# The merged feature map to score against. None until main() loads it from the pickle.
fmaps = None


def filter_feature(f):
    """Return True if the feature's family is one of the families in the loaded feature map's params."""
    return f.GetFamily() in fmaps.params

def get_raw_features(mol):
    """Return the list of features the factory finds for the molecule, keeping only those accepted by filter_feature."""
    return [feat for feat in ffact.GetFeaturesForMol(mol) if filter_feature(feat)]

def create_feature_map(mol):
    """Build a feature map for the molecule using the loaded feature map's params and a weight of 1 for every feature."""
    features = get_raw_features(mol)
    unit_weights = [1] * len(features)
    return FeatMaps.FeatMap(feats=features, weights=unit_weights, params=fmaps.params)

def score_featmaps(fm1):
    """Score the features of the loaded merged feature map against fm1.

    The score is normalised by the number of features in fm1.
    Returns 0 if fm1 has no features.
    """
    feature_count = fm1.GetNumFeatures()
    if feature_count == 0:
        return 0
    return fm1.ScoreFeats(fmaps.GetFeatures()) / feature_count

def get_fmap_score(mol):
    """Build the feature map for the molecule and return its score against the loaded merged feature map."""
    return score_featmaps(create_feature_map(mol))

def process(inputs, writer):
    """Score each molecule and write the scored molecules to the writer.

    The score is set on the molecule as the FeatureStein_Score property.
    Molecules that are None, or that fail to be scored or written, are counted
    as errors and are not written.

    :param inputs: iterable of molecules (entries may be None)
    :param writer: writer whose write() method is called with each scored molecule
    :return: tuple of (total molecules read, number written, number of errors)
    """
    total = 0
    success = 0
    errors = 0
    for mol in inputs:
        total += 1
        if mol is None:
            errors += 1
            continue
        try:
            score = get_fmap_score(mol)
            if total % 1000 == 0:
                utils.log('Processed molecule', total, '...')
            mol.SetDoubleProp(field_FeatureSteinScore, score)
            writer.write(mol)
            success += 1
        except Exception:
            # Exception rather than a bare except so that KeyboardInterrupt and
            # SystemExit still stop the run instead of being counted as errors.
            utils.log("Error scoring molecule", sys.exc_info()[0])
            traceback.print_exc()
            errors += 1

    return total, success, errors

### start main execution #########################################

def main():
    """Command line entry point: load the merged feature map and score the input molecules.

    The feature map is loaded from the pickle given by -f/--feat-map into the
    module level ``fmaps`` variable that the scoring functions read. With
    --metrics a metrics file is also written.
    """
    global fmaps

    parser = argparse.ArgumentParser(description='FeatureStein scoring with RDKit')
    parameter_utils.add_default_io_args(parser)
    parser.add_argument('-f', '--feat-map', help='Feature Map pickle to score with')
    parser.add_argument('--metrics', action='store_true', help='Write metrics')

    args = parser.parse_args()
    utils.log("FeatureStein Args: ", args)

    # Report a usage error instead of failing later when trying to open None.
    if not args.feat_map:
        parser.error('A feature map pickle must be specified using -f/--feat-map')

    source = "featurestein-score.py"
    datasetMetaProps = {"source":source, "description": "FeatureStein scoring using RDKit " + rdBase.rdkitVersion}

    clsMappings = {}
    fieldMetaProps = []
    clsMappings[field_FeatureSteinScore] = "java.lang.Float"
    fieldMetaProps.append({"fieldName":field_FeatureSteinScore, "values": {"source":source, "description":"FeatureStein score"}})

    # NOTE: unpickling can execute arbitrary code - only load feature maps from a trusted source.
    with open(args.feat_map, 'rb') as pkl_file:
        fmaps = pickle.load(pkl_file)
    utils.log('FeatureMap has', fmaps.GetNumFeatures(), "features")

    inputs_file, output, inputs_supplr, writer, output_base = rdkit_utils. \
        default_open_input_output(args.input, args.informat, args.output,
                                  'featurestein', args.outformat,
                                  valueClassMappings=clsMappings,
                                  datasetMetaProps=datasetMetaProps,
                                  fieldMetaProps=fieldMetaProps)

    try:
        # this does the processing
        total, success, errors = process(inputs_supplr, writer)
    finally:
        # release the input and output even if the processing is interrupted
        inputs_file.close()
        writer.flush()
        writer.close()
        output.close()

    if args.metrics:
        utils.write_metrics(output_base, {'__InputCount__':total, '__OutputCount__':success, '__ErrorCount__':errors, 'RDKitFeatureMap':success})


if __name__ == "__main__":
    main()
Loading

0 comments on commit 131d4fa

Please sign in to comment.