microbiomedata · hrshdhgd · Jul 15, 2021 · Jul 15, 2021 · Jul 15, 2021 · Jul 15, 2021
diff --git a/requirements.txt b/requirements.txt
@@ -102,4 +102,5 @@ typing-extensions==3.10.0.0
 urllib3==1.26.6; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'
 watchdog==2.1.3; python_version >= '3.6'
 wrapt==1.12.1
-zipp==3.5.0; python_version >= '3.6'
+zipp==3.5.0; python_version >= '3.6'
+oger@git+https://github.com/OntoGene/OGER.git
diff --git a/sample_annotator/text_mining/TextMining.py b/sample_annotator/text_mining/TextMining.py
@@ -4,10 +4,85 @@
 import logging
 from nmdc_schema.nmdc import QuantityValue
 import re
+import os
+import configparser
+
+SETTINGS_FILENAME = 'settings.ini'
+PATH = '.'
 
 @dataclass
 class TextMining():
-    ...
+    """
+    Text mining Class
+    """
+
+
+    def create_settings_file(self, path: str = PATH, ontList: List = ['ENVO']) -> None: 
+        """
+        Dynamically creates the settings.ini file for OGER to get parameters.
+
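+        :param path: Path of the 'nlp' folder, i.e. the base directory that holds the
+            'input', 'output' and 'stopwords' sub-directories and receives settings.ini
+        :param ontList: List of ontologies to be used as dictionaries, e.g. ['ENVO', 'CHEBI']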
+        :return: None.
+
+        -   Include header: Output file will have column names
+
+        -   The 'Shared' section declares global variables that can be used in other sections
+            e.g. Data root.
+            root = location of the working directory
+            accessed in other sections using => ${Shared:root}/
+
+        -   Input formats accepted:
+            txt, txt_json, bioc_xml, bioc_json, conll, pubmed,
+            pxml, pxml.gz, pmc, nxml, pubtator, pubtator_fbk,
+            becalmabstracts, becalmpatents
+
+        -   Two iter-modes available: [collection or document]
+            document:- 'n' input files = 'n' output files
+            (provided every file has ontology terms)
+            collection:- n input files = 1 output file
+
+        -   Export formats possible:
+            tsv, txt, text_tsv, xml, text_xml, bioc_xml,
+            bioc_json, bionlp, bionlp.ann, brat, brat.ann,
+            conll, pubtator, pubanno_json, pubtator_fbk,
+            europepmc, europepmc.zip, odin, becalm_tsv, becalm_json
+            These can be passed as a list for multiple outputs too.
+
+        -   Multiple Termlists can be declared in separate sections
+            e.g. [Termlist1], [Termlist2] ...[Termlistn] with each having
+            their own paths
+
+        -   Normalization used: lowercase and stem-Porter
+        """
+
+        config = configparser.ConfigParser()
+        config['Section'] = {}
+        config['Shared'] = {}
+
+        # Settings required by OGER
+        config['Main'] = {
+            'include_header' : True,
+            'input-directory' : os.path.join(path,'input'),
+            'output-directory' : os.path.join(path,'output'),
+            'pointer-type' : 'glob',
+            'pointers' : '*.tsv',
+            'iter-mode' : 'collection',
+            'article-format' : 'txt_tsv',
+            'export_format': 'tsv',
+            'termlist_stopwords': os.path.join(path,'stopwords','stopWords.txt'),
 MAIN_MODEL_DIR = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'model') 
 MAIN_SCHEMA_DIR = os.path.join(MAIN_MODEL_DIR, 'schema') 
 MIXS_SCHEMA = os.path.join(MAIN_SCHEMA_DIR, 'mixs.json') 
 MAIN_MODEL_DIR = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'model') 
 MAIN_SCHEMA_DIR = os.path.join(MAIN_MODEL_DIR, 'schema') 
 MIXS_SCHEMA = os.path.join(MAIN_SCHEMA_DIR, 'mixs.json') 
+            'termlist_normalize': 'lowercase stem-Porter'
+        }
 
-    # TODO: bring in Mark/Harshad's code
+        # Iterate through ontList to register the paths of the corresponding termlists
+        for idx, ont in enumerate(ontList):
+            termlist_path = 'terms/'+ont.lower()+'_termlist.tsv'
+            config.set('Main','termlist'+str(idx+1)+'_filename', termlist_path)
+
+        # Write the assembled configuration to <path>/settings.ini
+        with open(os.path.join(path, SETTINGS_FILENAME), 'w') as settings_file:
+            config.write(settings_file)
 
+if __name__ == '__main__':
+    text_mining = TextMining()
+    text_mining.create_settings_file('.', ['ENVO'])
diff --git a/sample_annotator/text_mining/settings.ini b/sample_annotator/text_mining/settings.ini
@@ -0,0 +1,19 @@
+[Section]
+
+[Shared]
+
+[Main]
+include_header = True
+input-directory = ./input
+output-directory = ./output
+pointer-type = glob
+pointers = *.tsv
+iter-mode = collection
+article-format = txt_tsv
+export_format = tsv
+termlist_stopwords = ./stopwords/stopWords.txt
+termlist_normalize = lowercase stem-Porter
+termlist1_filename = terms/envo_termlist.tsv
+termlist2_filename = terms/chebi_termlist.tsv
+termlist3_filename = terms/abc_termlist.tsv
+
diff --git a/sample_annotator/text_mining/settings_sample.ini b/sample_annotator/text_mining/settings_sample.ini
@@ -0,0 +1,69 @@
+[Section]
+# OGER settings.ini file for argument loading
+[Shared]
+
+[Main]
+include_header = True
+input-directory = input
+output-directory = output
+pointer-type = glob
+pointers = *.tsv
+# options: collection OR document
+iter-mode = collection
+# article-format options:
+#    'txt_directory',
+#    'txt_id',
+#    'txt_collection',
+#    'txt_json',
+#    'txt_tar',
+#    'txt_tsv',
+#    'conll',
+#    'pubtator',
+#    'pubtator_fbk',
+#    'pxmlgz',
+#    'pxml_directory',
+#    'pxml_id',
+#    'bioc_xml',
+#    'bioc_json',
+#    'download_pubmed',
+#    'download_pmc',
+#    'download_bad_pmc',
+#    'download_fictious_pmc',
+#    'download_random_pmc',
+article-format = txt_tsv
+# export_format options:
+#    'tsv'
+#    'txt'
+#    'xml'
+#    'text_xml'
+#    'bioc'
+#    'bioc_xml'
+#    'bioc_json'
+#    'odin'
+#    'bionlp'
+#    'bionlp.ann'
+#    'brat'
+#    'brat.ann'
+#    'conll'
+#    'becalm_tsv'
+#    'becalm_json'
+#    'pubanno_json'
+#    'pubanno_json.tgz'
+#    'pubtator'
+#    'pubtator_fbk'
+#    'europepmc'
+#    'europepmc.zip'
+export_format = tsv
+# Multiple termlists can be added by giving them numbers
+# as shown below. Each termlist could have a separate stopword list;
+# initially there is just one shared by all.
+termlist1_path = terms/envo_termlist.tsv
+# termlist2_path = terms/ncbitaxon_termlist.tsv
+# termlist3_path = terms/po_termlist.tsv
+# termlist4_path = terms/ecocore_termlist.tsv
+# termlist5_path = terms/go_termlist.tsv
+# termlist6_path = terms/obi_termlist.tsv
+# termlist7_path = terms/ncit_termlist.tsv
+
+termlist_stopwords = stopwords/stopWords.txt
+termlist_normalize = lowercase stem-Porter