Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

NER #9

Open
wants to merge 32 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
a0f40e6
added OGER
hrshdhgd Jul 15, 2021
1cc3e99
sample settings file
hrshdhgd Jul 15, 2021
a8f71f9
standard English stopwords
hrshdhgd Jul 15, 2021
979758b
ENVO termlist
hrshdhgd Jul 15, 2021
fc76d1f
dynamically create settings file #3
hrshdhgd Jul 15, 2021
cc1a48c
minor edit #3
hrshdhgd Jul 15, 2021
40dcf21
doc update
hrshdhgd Jul 15, 2021
6b530f8
update documentation
hrshdhgd Jul 15, 2021
6a9a6db
OGER <-> runner
hrshdhgd Jul 15, 2021
ff986c2
enter steps to convert ont.json - ont_termlist.tsv
hrshdhgd Jul 15, 2021
0bd7c0e
added requirements to makefile
hrshdhgd Jul 16, 2021
ff02969
updated gitignore
hrshdhgd Jul 22, 2021
f2ab344
updated code for NER
hrshdhgd Jul 22, 2021
f6fb1f9
first stab at testing ner
hrshdhgd Jul 22, 2021
50cb577
added input and expected info for testing
hrshdhgd Jul 22, 2021
18a76da
pickled termlist
hrshdhgd Jul 22, 2021
e0444b4
another stab at tests
hrshdhgd Jul 23, 2021
44bb428
made settings more flex with JSON
hrshdhgd Jul 23, 2021
d23c00b
minor edit
hrshdhgd Jul 27, 2021
fe25710
removed settings.ini from vrsn ctrl :autogenerated
hrshdhgd Jul 28, 2021
71b631f
updated gitignore
hrshdhgd Jul 28, 2021
ad656de
updated gitignore
hrshdhgd Jul 28, 2021
e3838c7
coded up perform_text_mining
hrshdhgd Jul 28, 2021
58c1aee
changed test
hrshdhgd Jul 28, 2021
b975494
typo
hrshdhgd Jul 28, 2021
e19ca26
updated tests and code
hrshdhgd Jul 28, 2021
30acdde
clean-up
hrshdhgd Jul 28, 2021
efdc72a
temp change of spelling
hrshdhgd Jul 28, 2021
0d330ee
added more elements in o/p for testing
hrshdhgd Jul 28, 2021
fc7197a
added code to assert expected == actual
hrshdhgd Jul 28, 2021
6f1588f
removed debugger
hrshdhgd Jul 30, 2021
9191db7
minor correction
hrshdhgd Aug 26, 2021
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -98,4 +98,7 @@ tests/test_config.ini

# Don't lock
Pipfile.lock

sphinx/_build/*
sample_annotator/text_mining/settings.ini
sample_annotator/text_mining/input/*
11 changes: 11 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,14 @@ $(SAMPLE_SCHEMA_JSON): $(SAMPLE_SCHEMA_YAML)
test:
pipenv install --dev
pipenv run python -m unittest

# Lock requirements
requirements.txt:
pipenv lock --requirements

# NER files
text_mining/input/%_nodes.tsv: text_mining/input/%.json
kgx transform $< --input-format obojson --output $@ --output-format tsv

text_mining/terms/%_termlist.tsv: text_mining/input/%_nodes.tsv
python -m runner.runner prepare-termlist -i $< -o $@
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -102,4 +102,5 @@ typing-extensions==3.10.0.0
urllib3==1.26.6; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'
watchdog==2.1.3; python_version >= '3.6'
wrapt==1.12.1
zipp==3.5.0; python_version >= '3.6'
zipp==3.5.0; python_version >= '3.6'
hrshdhgd marked this conversation as resolved.
Show resolved Hide resolved
runner@git+https://github.com/monarch-initiative/runner.git
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. We need a pypi release (for which I need to decide on name)
  2. if Pipfile is the source of truth (SOT) for dependencies, then this needs to go in Pipfile

38 changes: 33 additions & 5 deletions sample_annotator/sample_annotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,17 @@
import logging
import pandas as pd
import bioregistry
import os


from nmdc_schema.nmdc import Biosample, GeolocationValue, QuantityValue
from nmdc_schema.nmdc import Biosample, GeolocationValue, QuantityValue, OntologyClass
from nmdc_schema.nmdc import slots as nmdc_slots

from .geolocation.geotools import GeoEngine
from .measurements.measurements import MeasurementEngine
from .metadata.sample_schema import SampleSchema, underscore
from .report_model import AnnotationReport, Message, PackageCombo, AnnotationMultiSampleReport, Category, SAMPLE, STUDY


from sample_annotator.text_mining.TextMining import SETTINGS_FILENAME, TextMining
from linkml_runtime.linkml_model.meta import ClassDefinition, SchemaDefinition, SlotDefinition, Definition

KEY_ENV_PACKAGE = nmdc_slots.env_package.name
Expand Down Expand Up @@ -207,8 +207,36 @@ def perform_text_mining(self, sample: SAMPLE, report: AnnotationReport):
"""
Performs text mining
"""
# TODO: Mark and Harshad to populate
...
keys_of_interest = ['env_broad_scale', 'env_local_scale', 'env_medium']
PWD = os.path.dirname(os.path.realpath(__file__))
TEXT_MINING_DIR = os.path.join(PWD,'text_mining')
NER_INPUT_FILE = os.path.join(TEXT_MINING_DIR,'input/input.tsv')
NER_OUTPUT_FILE = os.path.join(TEXT_MINING_DIR, 'output/runNER_Output.tsv')

sample_of_interest = {key: sample[key] for key in keys_of_interest if key in sample.keys() and sample[key] is not None}
if not sample_of_interest:
report.add_message('Nothing to NER.')
else:
sample_df = pd.DataFrame.from_dict(sample_of_interest, orient='index')\
.reset_index()\
.rename(columns={'index':'id', 0:'text'})

sample_df.to_csv(NER_INPUT_FILE, index=None, sep='\t')



# Steps that lead to NER
text_miner = TextMining()
text_miner.create_settings_file(path=TEXT_MINING_DIR)
text_miner.mine(os.path.join(TEXT_MINING_DIR, SETTINGS_FILENAME))

# Post-process NER
ner_result_df = pd.read_csv(NER_OUTPUT_FILE, sep='\t', low_memory=False)

for key in sample_of_interest.keys():
match = ner_result_df.loc[ner_result_df['PREFERRED FORM'] == sample[key]]['ENTITY ID']
if len(match) > 0:
sample[key] = match[match.index[0]]

def perform_geolocation_inference(self, sample: SAMPLE, report: AnnotationReport):
"""
Expand Down
84 changes: 82 additions & 2 deletions sample_annotator/text_mining/TextMining.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,92 @@
from typing import Optional, List, Set, Any
from dataclasses import dataclass
import logging
from unittest import runner
from nmdc_schema.nmdc import QuantityValue
import re
import os
import configparser
from runner import runner
import json

PWD = os.path.dirname(os.path.realpath(__file__))
SETTINGS_JSON = 'settings.json'
SETTINGS_FILENAME = 'settings.ini'


@dataclass
class TextMining():
    """
    Text mining class: builds the OGER ``settings.ini`` from the JSON
    template (``settings.json``) and invokes the OGER runner on it.
    """

    def create_settings_file(self, path: str = PWD,
                             ontList: Optional[List[str]] = None) -> None:
        """
        Dynamically creates the settings.ini file for OGER to get parameters.

        :param path: Path of the 'nlp' folder
        :param ontList: The ontology to be used as dictionary
            e.g. ['ENVO', 'CHEBI']; defaults to ['ENVO'] when omitted.
        :return: None.

        - Include header: Output file will have column names

        - The 'Shared' section declares global variables that can be used in other sections
          e.g. Data root.
          root = location of the working directory
          accessed in other sections using => ${Shared:root}/

        - Input formats accepted:
          txt, txt_json, bioc_xml, bioc_json, conll, pubmed,
          pxml, pxml.gz, pmc, nxml, pubtator, pubtator_fbk,
          becalmabstracts, becalmpatents

        - Two iter-modes available: [collection or document]
          document:- 'n' input files = 'n' output files
          (provided every file has ontology terms)
          collection:- n input files = 1 output file

        - Export formats possible:
          tsv, txt, text_tsv, xml, text_xml, bioc_xml,
          bioc_json, bionlp, bionlp.ann, brat, brat.ann,
          conll, pubtator, pubanno_json, pubtator, pubtator_fbk,
          europepmc, europepmc.zip, odin, becalm_tsv, becalm_json
          These can be passed as a list for multiple outputs too.

        - Multiple Termlists can be declared in separate sections
          e.g. [Termlist1], [Termlist2] ...[Termlistn] with each having
          their own paths

        - Normalization used: lowercase and stem-Porter
        """
        # Apply the default here instead of using a mutable default argument.
        if ontList is None:
            ontList = ['ENVO']

        config = configparser.ConfigParser()
        config['Section'] = {}
        config['Shared'] = {}

        # Settings required by OGER, loaded from the JSON template.
        with open(os.path.join(path, SETTINGS_JSON)) as stream:
            self.object = json.load(stream)
        config['Main'] = self.object['Main']
        # Resolve the template's relative paths against the working directory.
        config.set('Main', 'input-directory',
                   os.path.join(path, self.object['Relative-Path']['input-dir']))
        config.set('Main', 'output-directory',
                   os.path.join(path, self.object['Relative-Path']['output-dir']))
        config.set('Main', 'termlist_stopwords',
                   os.path.join(path, self.object['Relative-Path']['stopwords']))

        # Register one termlist path per requested ontology (keys are 1-based).
        for idx, ont in enumerate(ontList, start=1):
            termlist_path = os.path.join(path, 'terms', f'{ont.lower()}_termlist.tsv')
            config.set('Main', f'termlist{idx}_path', termlist_path)

        # Persist the assembled configuration for OGER to consume.
        with open(os.path.join(path, SETTINGS_FILENAME), 'w') as settings_file:
            config.write(settings_file)

    def mine(self, setting_file):
        """Run OGER with the given settings file."""
        runner.run_oger(settings=setting_file)


if __name__ == '__main__':
    # Build the settings file for ENVO and run the miner on it.
    miner = TextMining()
    miner.create_settings_file(path=PWD, ontList=['ENVO'])
    miner.mine(setting_file=os.path.join(PWD, SETTINGS_FILENAME))
18 changes: 18 additions & 0 deletions sample_annotator/text_mining/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"Main": {
"include_header" : true,
"pointer-type" : "glob",
"pointers" : "*.tsv",
"iter-mode" : "collection",
"article-format" : "txt_tsv",
"export_format": "tsv",
"termlist_normalize": "lowercase stem-Porter"

},
"Relative-Path": {
"input-dir": "input",
"output-dir": "output",
"stopwords": "stopwords/stopWords.txt"
}

}
69 changes: 69 additions & 0 deletions sample_annotator/text_mining/settings_sample.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
[Section]
# OGER settings.ini file for argument loading
[Shared]

[Main]
include_header = True
input-directory = input
output-directory = output
pointer-type = glob
pointers = *.tsv
# options: collection OR document
iter-mode = collection
# article-format options:
# 'txt_directory',
# 'txt_id',
# 'txt_collection',
# 'txt_json',
# 'txt_tar',
# 'txt_tsv',
# 'conll',
# 'pubtator',
# 'pubtator_fbk',
# 'pxmlgz',
# 'pxml_directory',
# 'pxml_id',
# 'bioc_xml',
# 'bioc_json',
# 'download_pubmed',
# 'download_pmc',
# 'download_bad_pmc',
# 'download_fictious_pmc',
# 'download_random_pmc',
article-format = txt_tsv
# export_format options:
# 'tsv'
# 'txt'
# 'xml'
# 'text_xml'
# 'bioc'
# 'bioc_xml'
# 'bioc_json'
# 'odin':
# 'bionlp'
# 'bionlp.ann'
# 'brat'
# 'brat.ann'
# 'conll'
# 'becalm_tsv'
# 'becalm_json'
# 'pubanno_json'
# 'pubanno_json.tgz'
# 'pubtator'
# 'pubtator_fbk'
# 'europepmc'
# 'europepmc.zip'
export_format = tsv
# Multiple termlists can be added by giving them numbers
# as shown below. Each termlist could have a separate stopword list
# Initially there's just one for all.
termlist1_path = terms/envo_termlist.tsv
# termlist2_path = terms/ncbitaxon_termlist.tsv
# termlist3_path = terms/po_termlist.tsv
# termlist4_path = terms/ecocore_termlist.tsv
# termlist5_path = terms/go_termlist.tsv
# termlist6_path = terms/obi_termlist.tsv
# termlist7_path = terms/ncit_termlist.tsv

termlist_stopwords = stopwords/stopWords.txt
termlist_normalize = lowercase stem-Porter
Loading