Skip to content

Commit

Permalink
further improvements to charge and taut/stereo enumerators
Browse files Browse the repository at this point in the history
  • Loading branch information
tdudgeon committed Dec 17, 2019
1 parent ee89e17 commit 915dcef
Show file tree
Hide file tree
Showing 4 changed files with 53 additions and 20 deletions.
6 changes: 3 additions & 3 deletions src/python/pipelines/dimorphite/enumerate_charges.dsd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -66,9 +66,9 @@ serviceConfig:
- mw
defaultValue: hac
visible: true
executorClassName: org.squonk.execution.steps.impl.ThinDatasetDockerExecutorStep
thinDescriptors:
- input: input
executorClassName: org.squonk.execution.steps.impl.DefaultDockerExecutorStep
#thinDescriptors:
#- input: input
inputRoutes:
- route: FILE
outputRoutes:
Expand Down
19 changes: 11 additions & 8 deletions src/python/pipelines/dimorphite/enumerate_charges.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,9 @@ def writeEnumeratedMols(src_mol, enum_mols, writer, index):

def add_src_mol_ref(src_mol, target_mol, index):
"""
Add the ID of the source molecule to the enumerated molecule as the field named EnumChargeSrcMol.
The ID is taken from the uuid field if it exists, if not from the _Name field if it exists and finally
from the index parameter (the index of the source molecule in the input) if neither of those fields is found.
Add the ID of the source molecule to the enumerated molecule as the field named EnumChargesSrcMolUUID.
The ID is taken from the uuid field if it exists, otherwise from the _Name field if it exists.
The EnumChargesSrcMolIdx field is always set to the index of the source molecule in the input.
:param src_mol:
:param target_mol:
:param index:
Expand All @@ -59,11 +59,11 @@ def add_src_mol_ref(src_mol, target_mol, index):
parent = src_mol.GetProp('uuid')
elif src_mol.HasProp('_name_'):
parent = src_mol.GetProp('_Name')
else:
parent = str(index)

if parent:
target_mol.SetProp('EnumChargeSrcMol', parent)
target_mol.SetProp('EnumChargesSrcMolUUID', parent)

target_mol.SetIntProp('EnumChargesSrcMolIdx', index)

### start main execution #########################################

Expand All @@ -88,9 +88,12 @@ def main():
source = "enumerate_charges.py"
datasetMetaProps = {"source":source, "description": "Enumerate charges using Dimorphite-dl"}
clsMappings = {
"EnumChargeSrcMol": "java.lang.String"}
"EnumChargesSrcMolUUID": "java.lang.String",
"EnumChargesSrcMolIdx": "java.lang.Integer"
}
fieldMetaProps = [
{"fieldName":"EnumChargeSrcMol", "values": {"source":source, "description":"ID of source molecule"}}
{"fieldName":"EnumChargesSrcMolUUID", "values": {"source":source, "description":"UUID of source molecule"}},
{"fieldName":"EnumChargesSrcMolIdx", "values": {"source":source, "description":"Index of source molecule"}}
]

oformat = utils.determine_output_format(args.outformat)
Expand Down
44 changes: 38 additions & 6 deletions src/python/pipelines/rdkit/sanifier.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

# Copyright 2017 Informatics Matters Ltd.
# Copyright 2019 Informatics Matters Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -14,7 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

### Use MolVS to do tautomer enumeration, sterochemistry enumeration, charge neutralisation.
### Use MolVS to do tautomer enumeration, stereochemistry enumeration, charge neutralisation.

import sys, argparse

Expand Down Expand Up @@ -73,7 +73,28 @@ def main():
if args.standardize:
getStandardMolecule = STANDARD_MOL_METHODS[args.standardize_method]

input ,output ,suppl ,writer ,output_base = rdkit_utils.default_open_input_output(args.input, args.informat, args.output, 'sanify', args.outformat)
# handle metadata
source = "sanifier.py"
datasetMetaProps = {"source":source, "description": "Enumerate tautomers and stereoisomers"}
clsMappings = {
"EnumTautIsoSourceMolUUID": "java.lang.String",
"EnumTautIsoSourceMolIdx": "java.lang.Integer"
}
fieldMetaProps = [
{"fieldName":"EnumTautIsoSourceMolUUID", "values": {"source":source, "description":"UUID of source molecule"}},
{"fieldName":"EnumTautIsoSourceMolIdx", "values": {"source":source, "description":"Index of source molecule"}}
]

oformat = utils.determine_output_format(args.outformat)

input,output,suppl,writer,output_base = rdkit_utils. \
default_open_input_output(args.input, args.informat, args.output,
'sanifier', args.outformat,
thinOutput=False, valueClassMappings=clsMappings,
datasetMetaProps=datasetMetaProps,
fieldMetaProps=fieldMetaProps)


i=0
count=0
errors=0
Expand Down Expand Up @@ -110,11 +131,13 @@ def main():
parentUuid = None

results = []
results.append(mol)


if args.enumerate_tauts:
utils.log("Enumerating tautomers")
results = enumerateTautomers(mol)
else:
results.append(mol)

if args.enumerate_stereo:
utils.log("Enumerating steroisomers")
Expand All @@ -125,10 +148,14 @@ def main():
results.extend(enumerated)

for m in results:
# copy the src mol props
for name in mol.GetPropNames():
m.SetProp(name, mol.GetProp(name))
# add our new props
m.ClearProp("uuid")
m.SetIntProp("SourceMolNum", i)
m.SetIntProp("EnumTautIsoSourceMolIdx", i)
if parentUuid:
m.SetProp("SourceMolUUID", parentUuid)
m.SetProp("EnumTautIsoSourceMolUUID", parentUuid)

count = write_out(results,count,writer,args.mol_format,args.outformat)

Expand All @@ -139,6 +166,11 @@ def main():
input.close()
output.close()

# re-write the metadata as we now know the size
if oformat == 'json':
utils.write_squonk_datasetmetadata(output_base, False, clsMappings, datasetMetaProps, fieldMetaProps, size=count)


if args.meta:
utils.write_metrics(output_base, {'__InputCount__':i, '__OutputCount__':count, '__ErrorCount__':errors , 'RDKitSanify':count })

Expand Down
4 changes: 1 addition & 3 deletions src/python/pipelines/rdkit/sanifier_enumerator.dsd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,7 @@ serviceConfig:
minValues: 1
maxValues: 1
visible: true
executorClassName: org.squonk.execution.steps.impl.ThinDatasetDockerExecutorStep
thinDescriptors:
- input: input
executorClassName: org.squonk.execution.steps.impl.DefaultDockerExecutorStep
inputRoutes:
- route: FILE
outputRoutes:
Expand Down

0 comments on commit 915dcef

Please sign in to comment.