featurestein and xcos scoring

InformaticsMatters · Jul 7, 2020 · 131d4fa · 131d4fa
1 parent 7dbe6ca
commit 131d4fa
Show file tree

Hide file tree

Showing 7 changed files with 797 additions and 0 deletions.
diff --git a/data/mpro/hits-17.sdf.gz b/data/mpro/hits-17.sdf.gz
diff --git a/data/mpro/poses.sdf.gz b/data/mpro/poses.sdf.gz
diff --git a/requirements-rdkit.txt b/requirements-rdkit.txt
@@ -3,3 +3,5 @@ im-pipelines-utils-rdkit==1.5.*
 matplotlib==2.2.*
 molvs==0.1.1
 standardiser==0.1.9
+pandas==1.0.1
+scikit-learn==0.22.1
diff --git a/src/nextflow/rdkit/xcos.nf b/src/nextflow/rdkit/xcos.nf
@@ -0,0 +1,58 @@
+#!/usr/bin/env nextflow
+
+// Default pipeline parameters; each can be overridden on the nextflow command line.
+// NOTE(review): the data files added alongside this script are named *.sdf.gz,
+// while these defaults end in .sdf - confirm the intended default paths.
+params.inputs = "data/mpro/poses.sdf"
+params.fragments = "data/mpro/hits-17.sdf"
+// chunk, limit and digits are passed to the splitter process as the -c, -l and -d options.
+params.chunk = 500
+params.limit = 0
+params.digits = 4
+
+// File handles staged into the processes below.
+inputs = file(params.inputs)
+fragments = file(params.fragments)
+
+// Runs pipelines_utils_rdkit.filter over the input file and emits every
+// 'inputs_part*.sdf.gz' file it produces as a separate item on inputs_parts.
+process splitter {
+
+    container 'informaticsmatters/rdkit_pipelines:latest'
+
+    input:
+    file inputs
+
+    output:
+    // 'mode flatten' emits each matching file individually instead of as one list
+    file 'inputs_part*.sdf.gz' into inputs_parts mode flatten
+
+    // -c/-l/-d receive params.chunk/limit/digits; presumably chunk size, record limit
+    // and digits in the part number - confirm against the filter module's options.
+    """
+    python -m pipelines_utils_rdkit.filter -i '$inputs' -c $params.chunk -l $params.limit -d $params.digits -o 'inputs_part_' -of sdf
+    """
+}
+
+// Runs pipelines.rdkit.xcos on one input part together with the fragments file
+// and emits the resulting 'scored_part*.sdf' file on scored_parts.
+process xcos {
+
+    container 'informaticsmatters/rdkit_pipelines:latest'
+
+	input:
+    file part from inputs_parts
+    file fragments
+
+    output:
+    file 'scored_part*.sdf' into scored_parts
+
+    // The -o base name is the part name with 'inputs' replaced by 'scored' and its
+    // last 7 characters (the '.sdf.gz' suffix written by splitter) removed by [0..-8].
+    """
+    python -m pipelines.rdkit.xcos -i '$part' -f '$fragments' -o '${part.name.replace('inputs', 'scored')[0..-8]}' -of sdf --no-gzip
+    """
+}
+
+// Concatenates all scored parts into a single gzipped SDF and moves it into the
+// launch directory.
+process joiner {
+
+    container 'informaticsmatters/rdkit_pipelines:latest'
+
+    publishDir ".", mode: 'move'
+
+    input:
+    file parts from scored_parts.collect()
+
+    output:
+    file 'xcos_scored.sdf.gz'
+
+    // The parts variable is deliberately left unquoted: a collected list of files is
+    // rendered as space-separated names, so quoting it would hand cat one single
+    // (non-existent) file name as soon as there is more than one part.
+    """
+    cat $parts | gzip > xcos_scored.sdf.gz
+    """
+}
diff --git a/src/python/pipelines/rdkit/featurestein-generate.py b/src/python/pipelines/rdkit/featurestein-generate.py
@@ -0,0 +1,177 @@
+#!/usr/bin/env python
+
+# Copyright 2020 Informatics Matters Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Ligand pose scoring using 'FeatureStein'.
+This module generates a merged feature map from a set of 3D ligands.
+The output is a pickle of the merged feature map that can be read by the featurestein-score.py module to
+generate scores.
+"""
+
+from __future__ import print_function
+import argparse, os, sys, gzip, pickle
+
+from rdkit import Chem, rdBase, RDConfig
+from rdkit.Chem import AllChem, rdShapeHelpers
+from rdkit.Chem.FeatMaps import FeatMaps
+from rdkit.Chem.FeatMaps.FeatMapUtils import CombineFeatMaps
+
+from pipelines_utils import parameter_utils, utils
+from pipelines_utils_rdkit import rdkit_utils
+
+
+### start function definitions #########################################
+
+# Feature factory built from the BaseFeatures.fdef definitions in the RDKit data directory.
+ffact = AllChem.BuildFeatureFactory(os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef'))
+
+# One default FeatMapParams instance for each feature family the factory knows about.
+fmParams = {}
+for k in ffact.GetFeatureFamilies():
+    fparams = FeatMaps.FeatMapParams()
+    fmParams[k] = fparams
+
+# Feature families that filterFeature() drops; empty, so no family is excluded.
+exclude = ()
+
+def filterFeature(f):
+    if f.GetFamily() in exclude:
+        return None
+    else:
+        return f
+
+def getRawFeatures(mol):
+    rawFeats = ffact.GetFeaturesForMol(mol)
+    # filter that list down to only include the ones we're interested in
+    filtered = list(filter(filterFeature, rawFeats))
+    return filtered
+
+def getFeatureMap(mol):
+    feats = getRawFeatures(mol)
+    return FeatMaps.FeatMap(feats=feats, weights=[1]*len(feats),params=fmParams)
+
+def score_featmaps(fm1, fm2):
+    "Generate the score for 2 feature maps"
+    return fm1.ScoreFeats(fm2.GetFeatures()) / fm1.GetNumFeatures()
+
+def build_feat_data(mols):
+    "Build the feature maps and do the all vs. all comparison"
+    fmaps = []
+    scores = []
+    for mol1 in mols:
+        fm1 = getFeatureMap(mol1)
+        fmaps.append(fm1)
+        row = []
+        for mol2 in mols:
+            fm2 = getFeatureMap(mol2)
+            score = score_featmaps(fm1, fm2)
+            row.append(score)
+            #print(len(data), len(row), score)
+        scores.append(row)
+    return fmaps, scores
+
+def find_closest(scores):
+    #print('Find closest for', len(scores), len(scores[0]))
+    best_score = 0
+    for i in range(len(scores)):
+        for j in range(len(scores)):
+            if i == j:
+                continue
+            score = scores[i][j]
+            if score > best_score:
+                best_score = score
+                best_row = i
+                best_col = j
+    return best_score, best_row, best_col
+
+def merge_feat_maps(fmaps, scores):
+    "Merge the 2 closest feature maps, remove them form the data and replace with the merged feature map"
+    best_score, best_row, best_col = find_closest(scores)
+    #print(best_score, best_row, best_col)
+    feat1 = fmaps[best_row]
+    feat2 = fmaps[best_col]
+    utils.log('Merging', best_row, 'and', best_col, 'with score', best_score, '#features:', feat1.GetNumFeatures(), feat2.GetNumFeatures())
+    merged = CombineFeatMaps(feat1, feat2, mergeMetric=1, mergeTol=1.5, dirMergeMode=0)
+    # need to make sure we delete the biggest index first to avoid changing the smaller index
+    if best_row > best_col:
+        a = best_row
+        b = best_col
+    else:
+        a = best_col
+        b = best_row
+
+    #print('Initial:', len(fmaps), len(scores), ','.join([str(len(x)) for x in scores]))
+    del fmaps[a]
+    del fmaps[b]
+    del scores[a]
+    del scores[b]
+    for row in scores:
+        del row[a]
+        del row[b]
+
+    merged_scores = []
+    for i in range(len(fmaps)):
+        fmap = fmaps[i]
+        score1 = score_featmaps(fmap, merged)
+        score2 = score_featmaps(merged, fmap)
+        scores[i].append(score1)
+        merged_scores.append(score2)
+
+    fmaps.append(merged)
+    merged_scores.append(score_featmaps(merged, merged))
+    scores.append(merged_scores)
+
+
+def process(inputs, fname):
+
+    mols = [m for m in inputs if m]
+    fmaps, scores = build_feat_data(mols)
+    merged_fmaps = fmaps.copy()
+    utils.log('Processing', len(fmaps), 'molecules')
+    while len(merged_fmaps) > 1:
+        merge_feat_maps(merged_fmaps, scores)
+    merged_fmap = merged_fmaps[0]
+    pickle.dump(merged_fmap, open(fname, "wb" ))
+    utils.log('Wrote merged feature map with', merged_fmap.GetNumFeatures(), 'features as pickle to', fname)
+
+    return len(mols), merged_fmap.GetNumFeatures()
+
+### start main execution #########################################
+
+def main():
+
+    global fmaps
+
+    parser = argparse.ArgumentParser(description='FeatureStein generation with RDKit')
+    parameter_utils.add_default_input_args(parser)
+    parser.add_argument('-f', '--feat-map', default='featurestein.p', help='Name of pickle to generate')
+    parser.add_argument('--metrics', action='store_true', help='Write metrics')
+
+    args = parser.parse_args()
+    utils.log("FeatureStein Args: ", args)
+
+    inputs_file, inputs_supplr = rdkit_utils. \
+        default_open_input(args.input, args.informat)
+
+    # this does the processing
+    num_mols, num_feats = process(inputs_supplr, args.feat_map)
+
+    inputs_file.close()
+
+    if args.metrics:
+        utils.write_metrics(output_base, {'__StatusMessage__': 'Generated ' + num_feats + ' from ' + num_mols + ' molecules',
+                                          '__InputCount__':num_mols, 'RDKitFeatureMap':num_mols})
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/python/pipelines/rdkit/featurestein-score.py b/src/python/pipelines/rdkit/featurestein-score.py
@@ -0,0 +1,141 @@
+#!/usr/bin/env python
+
+# Copyright 2020 Informatics Matters Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Ligand pose scoring using 'FeatureStein'.
+FeatureStein is a merged RDKit feature map that estimates the overlap of a ligand with a set of ligands (e.g. fragment
+screening hits) based on the RDKit feature maps.
+See featurestein-generate.py for how to generate the merged feature maps.
+"""
+
+from __future__ import print_function
+import argparse, os, sys, gzip, pickle, traceback
+from rdkit import Chem, rdBase, RDConfig
+from rdkit.Chem import AllChem, rdShapeHelpers
+from rdkit.Chem.FeatMaps import FeatMaps
+from pipelines_utils import parameter_utils, utils
+from pipelines_utils_rdkit import rdkit_utils
+
+
+### start function definitions #########################################
+
+# Name of the molecule property that the score is written to.
+field_FeatureSteinScore = "FeatureStein_Score"
+
+# Setting up the features to use in FeatureMap
+ffact = AllChem.BuildFeatureFactory(os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef'))
+# The merged feature map to score against; stays None until main() loads it from the pickle.
+fmaps = None
+
+
+def filter_feature(f):
+    result = f.GetFamily() in fmaps.params.keys()
+    return result
+
+def get_raw_features(mol):
+    rawFeats = ffact.GetFeaturesForMol(mol)
+    # filter that list down to only include the ones we're interested in
+    filtered = list(filter(filter_feature, rawFeats))
+    return filtered
+
+def create_feature_map(mol):
+    feats = get_raw_features(mol)
+    return FeatMaps.FeatMap(feats=feats, weights=[1]*len(feats),params=fmaps.params)
+
+def score_featmaps(fm1):
+    "Generate the score for 2 feature maps"
+    if fm1.GetNumFeatures() == 0:
+        return 0
+    else:
+        score = fm1.ScoreFeats(fmaps.GetFeatures())
+        #utils.log(score, fm1.GetNumFeatures())
+        return score / fm1.GetNumFeatures()
+
+def get_fmap_score(mol):
+    featMap = create_feature_map(mol)
+    score = score_featmaps(featMap)
+    return score
+
+def process(inputs, writer):
+    total = 0
+    success = 0
+    errors = 0
+    for mol in inputs:
+        total += 1
+        if mol is None:
+            errors += 1
+            continue
+        try:
+            score = get_fmap_score(mol)
+            # utils.log('Score:', score)
+            if total % 1000 == 0:
+                utils.log('Processed molecule', total, '...')
+            mol.SetDoubleProp(field_FeatureSteinScore, score)
+            writer.write(mol)
+            success += 1
+        except:
+            utils.log("Error scoring molecule", sys.exc_info()[0])
+            traceback.print_exc()
+            errors += 1
+
+    return total, success, errors
+
+### start main execution #########################################
+
+def main():
+
+    global fmaps
+
+    parser = argparse.ArgumentParser(description='FeatureStein scoring with RDKit')
+    parameter_utils.add_default_io_args(parser)
+    parser.add_argument('-f', '--feat-map', help='Feature Map pickle to score with')
+    parser.add_argument('--metrics', action='store_true', help='Write metrics')
+
+
+    args = parser.parse_args()
+    utils.log("FeatureStein Args: ", args)
+
+    source = "featurestein-score.py"
+    datasetMetaProps = {"source":source, "description": "FeatureStein scoring using RDKit " + rdBase.rdkitVersion}
+
+    clsMappings = {}
+    fieldMetaProps = []
+    clsMappings[field_FeatureSteinScore] = "java.lang.Float"
+    fieldMetaProps.append({"fieldName":field_FeatureSteinScore,   "values": {"source":source, "description":"FeatureStein score"}})
+
+    pkl_file = open(args.feat_map, 'rb')
+    fmaps = pickle.load(pkl_file)
+    utils.log('FeatureMap has', fmaps.GetNumFeatures(), "features")
+
+    inputs_file,output,inputs_supplr,writer,output_base = rdkit_utils. \
+        default_open_input_output(args.input, args.informat, args.output,
+                                  'featurestein', args.outformat,
+                                  valueClassMappings=clsMappings,
+                                  datasetMetaProps=datasetMetaProps,
+                                  fieldMetaProps=fieldMetaProps)
+
+    # this does the processing
+    total, success, errors = process(inputs_supplr, writer)
+
+    inputs_file.close()
+    writer.flush()
+    writer.close()
+    output.close()
+
+    if args.metrics:
+        utils.write_metrics(output_base, {'__InputCount__':total, '__OutputCount__':success, '__ErrorCount__':errors, 'RDKitFeatureMap':success})
+
+
+if __name__ == "__main__":
+    main()