Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

NER #9

Open
wants to merge 32 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
a0f40e6
added OGER
hrshdhgd Jul 15, 2021
1cc3e99
sample settings file
hrshdhgd Jul 15, 2021
a8f71f9
standard English stopwords
hrshdhgd Jul 15, 2021
979758b
ENVO termlist
hrshdhgd Jul 15, 2021
fc76d1f
dynamically create settings file #3
hrshdhgd Jul 15, 2021
cc1a48c
minor edit #3
hrshdhgd Jul 15, 2021
40dcf21
doc update
hrshdhgd Jul 15, 2021
6b530f8
update documentation
hrshdhgd Jul 15, 2021
6a9a6db
OGER <-> runner
hrshdhgd Jul 15, 2021
ff986c2
enter steps to convert ont.json - ont_termlist.tsv
hrshdhgd Jul 15, 2021
0bd7c0e
added requirements to makefile
hrshdhgd Jul 16, 2021
ff02969
updated gitignore
hrshdhgd Jul 22, 2021
f2ab344
updated code for NER
hrshdhgd Jul 22, 2021
f6fb1f9
first stab at testing ner
hrshdhgd Jul 22, 2021
50cb577
added input and expected info for testing
hrshdhgd Jul 22, 2021
18a76da
pickled termlist
hrshdhgd Jul 22, 2021
e0444b4
another stab at tests
hrshdhgd Jul 23, 2021
44bb428
made settings more flex with JSON
hrshdhgd Jul 23, 2021
d23c00b
minor edit
hrshdhgd Jul 27, 2021
fe25710
removed settings.ini from vrsn ctrl :autogenerated
hrshdhgd Jul 28, 2021
71b631f
updated gitignore
hrshdhgd Jul 28, 2021
ad656de
updated gitignore
hrshdhgd Jul 28, 2021
e3838c7
coded up perform_text_mining
hrshdhgd Jul 28, 2021
58c1aee
changed test
hrshdhgd Jul 28, 2021
b975494
typo
hrshdhgd Jul 28, 2021
e19ca26
updated tests and code
hrshdhgd Jul 28, 2021
30acdde
clean-up
hrshdhgd Jul 28, 2021
efdc72a
temp change of spelling
hrshdhgd Jul 28, 2021
0d330ee
added more elements in o/p for testing
hrshdhgd Jul 28, 2021
fc7197a
added code to assert expected == actual
hrshdhgd Jul 28, 2021
6f1588f
removed debugger
hrshdhgd Jul 30, 2021
9191db7
minor correction
hrshdhgd Aug 26, 2021
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -98,4 +98,7 @@ tests/test_config.ini

# Don't lock
Pipfile.lock

sphinx/_build/*
sample_annotator/text_mining/settings.ini
sample_annotator/text_mining/input/*
11 changes: 11 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,14 @@ $(SAMPLE_SCHEMA_JSON): $(SAMPLE_SCHEMA_YAML)
test:
pipenv install --dev
pipenv run python -m unittest

# Lock requirements
requirements.txt:
pipenv lock --requirements

# NER files
text_mining/input/%_nodes.tsv: text_mining/input/%.json
kgx transform $< --input-format obojson --output $@ --output-format tsv

text_mining/terms/%_termlist.tsv: text_mining/input/%_nodes.tsv
python -m runner.runner prepare-termlist -i $< -o $@
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -102,4 +102,5 @@ typing-extensions==3.10.0.0
urllib3==1.26.6; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'
watchdog==2.1.3; python_version >= '3.6'
wrapt==1.12.1
zipp==3.5.0; python_version >= '3.6'
zipp==3.5.0; python_version >= '3.6'
hrshdhgd marked this conversation as resolved.
Show resolved Hide resolved
runner@git+https://github.com/monarch-initiative/runner.git
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. We need a pypi release (for which I need to decide on name)
  2. if Pipfile is the source of truth (SOT) for dependencies, then this needs to go in Pipfile

38 changes: 33 additions & 5 deletions sample_annotator/sample_annotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,17 @@
import logging
import pandas as pd
import bioregistry
import os


from nmdc_schema.nmdc import Biosample, GeolocationValue, QuantityValue
from nmdc_schema.nmdc import Biosample, GeolocationValue, QuantityValue, OntologyClass
from nmdc_schema.nmdc import slots as nmdc_slots

from .geolocation.geotools import GeoEngine
from .measurements.measurements import MeasurementEngine
from .metadata.sample_schema import SampleSchema, underscore
from .report_model import AnnotationReport, Message, PackageCombo, AnnotationMultiSampleReport, Category, SAMPLE, STUDY


from sample_annotator.text_mining.TextMining import SETTINGS_FILENAME, TextMining
from linkml_runtime.linkml_model.meta import ClassDefinition, SchemaDefinition, SlotDefinition, Definition

KEY_ENV_PACKAGE = nmdc_slots.env_package.name
Expand Down Expand Up @@ -207,8 +207,36 @@ def perform_text_mining(self, sample: SAMPLE, report: AnnotationReport):
"""
Performs text mining
"""
# TODO: Mark and Harshad to populate
...
keys_of_interest = ['env_broad_scale', 'env_local_scale', 'env_medium']
PWD = os.path.dirname(os.path.realpath(__file__))
TEXT_MINING_DIR = os.path.join(PWD,'text_mining')
NER_INPUT_FILE = os.path.join(TEXT_MINING_DIR,'input/input.tsv')
NER_OUTPUT_FILE = os.path.join(TEXT_MINING_DIR, 'output/runNER_Output.tsv')

sample_of_interest = {key: sample[key] for key in keys_of_interest if key in sample.keys() and sample[key] is not None}
if not sample_of_interest:
report.add_message('Nothing to NER.')
else:
sample_df = pd.DataFrame.from_dict(sample_of_interest, orient='index')\
.reset_index()\
.rename(columns={'index':'id', 0:'text'})

sample_df.to_csv(NER_INPUT_FILE, index=None, sep='\t')



# Steps that lead to NER
text_miner = TextMining()
text_miner.create_settings_file(path=TEXT_MINING_DIR)
text_miner.mine(os.path.join(TEXT_MINING_DIR, SETTINGS_FILENAME))

# Post-process NER
ner_result_df = pd.read_csv(NER_OUTPUT_FILE, sep='\t', low_memory=False)

for key in sample_of_interest.keys():
match = ner_result_df.loc[ner_result_df['PREFERRED FORM'] == sample[key]]['ENTITY ID']
if len(match) > 0:
sample[key] = match[match.index[0]]

def perform_geolocation_inference(self, sample: SAMPLE, report: AnnotationReport):
"""
Expand Down
84 changes: 82 additions & 2 deletions sample_annotator/text_mining/TextMining.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,92 @@
from typing import Optional, List, Set, Any
from dataclasses import dataclass
import logging
from unittest import runner
from nmdc_schema.nmdc import QuantityValue
import re
import os
import configparser
from runner import runner
import json

PWD = os.path.dirname(os.path.realpath(__file__))
SETTINGS_JSON = 'settings.json'
SETTINGS_FILENAME = 'settings.ini'


@dataclass
class TextMining():
    """
    Text mining class: builds the OGER ``settings.ini`` from the JSON
    template (``settings.json``) and invokes the OGER runner on it.
    """

    def create_settings_file(self, path: str = PWD,
                             ontList: Optional[List[str]] = None) -> None:
        """
        Dynamically creates the settings.ini file for OGER to get parameters.

        :param path: Path of the 'nlp' folder
        :param ontList: The ontology to be used as dictionary
            e.g. ['ENVO', 'CHEBI']; defaults to ['ENVO'] when omitted.
        :return: None.

        - Include header: Output file will have column names

        - The 'Shared' section declares global variables that can be used in other sections
          e.g. Data root.
          root = location of the working directory
          accessed in other sections using => ${Shared:root}/

        - Input formats accepted:
          txt, txt_json, bioc_xml, bioc_json, conll, pubmed,
          pxml, pxml.gz, pmc, nxml, pubtator, pubtator_fbk,
          becalmabstracts, becalmpatents

        - Two iter-modes available: [collection or document]
          document:- 'n' input files = 'n' output files
          (provided every file has ontology terms)
          collection:- n input files = 1 output file

        - Export formats possible:
          tsv, txt, text_tsv, xml, text_xml, bioc_xml,
          bioc_json, bionlp, bionlp.ann, brat, brat.ann,
          conll, pubtator, pubanno_json, pubtator, pubtator_fbk,
          europepmc, europepmc.zip, odin, becalm_tsv, becalm_json
          These can be passed as a list for multiple outputs too.

        - Multiple Termlists can be declared in separate sections
          e.g. [Termlist1], [Termlist2] ...[Termlistn] with each having
          their own paths

        - Normalization used: lowercase and stem-Porter
        """
        # Apply the default here instead of using a mutable default argument.
        if ontList is None:
            ontList = ['ENVO']

        config = configparser.ConfigParser()
        config['Section'] = {}
        config['Shared'] = {}

        # Settings required by OGER, loaded from the JSON template.
        with open(os.path.join(path, SETTINGS_JSON)) as stream:
            self.object = json.load(stream)
        config['Main'] = self.object['Main']
        # Resolve the template's relative paths against the working directory.
        config.set('Main', 'input-directory',
                   os.path.join(path, self.object['Relative-Path']['input-dir']))
        config.set('Main', 'output-directory',
                   os.path.join(path, self.object['Relative-Path']['output-dir']))
        config.set('Main', 'termlist_stopwords',
                   os.path.join(path, self.object['Relative-Path']['stopwords']))

        # Register one termlist path per requested ontology (keys are 1-based).
        for idx, ont in enumerate(ontList, start=1):
            termlist_path = os.path.join(path, 'terms', f'{ont.lower()}_termlist.tsv')
            config.set('Main', f'termlist{idx}_path', termlist_path)

        # Persist the assembled configuration for OGER to consume.
        with open(os.path.join(path, SETTINGS_FILENAME), 'w') as settings_file:
            config.write(settings_file)

    def mine(self, setting_file):
        """Run OGER with the given settings file."""
        runner.run_oger(settings=setting_file)


if __name__ == '__main__':
    # Build the settings file for ENVO and run the miner on it.
    miner = TextMining()
    miner.create_settings_file(path=PWD, ontList=['ENVO'])
    miner.mine(setting_file=os.path.join(PWD, SETTINGS_FILENAME))
18 changes: 18 additions & 0 deletions sample_annotator/text_mining/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"Main": {
"include_header" : true,
"pointer-type" : "glob",
"pointers" : "*.tsv",
"iter-mode" : "collection",
"article-format" : "txt_tsv",
"export_format": "tsv",
"termlist_normalize": "lowercase stem-Porter"

},
"Relative-Path": {
"input-dir": "input",
"output-dir": "output",
"stopwords": "stopwords/stopWords.txt"
}

}
69 changes: 69 additions & 0 deletions sample_annotator/text_mining/settings_sample.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
[Section]
# OGER settings.ini file for argument loading
[Shared]

[Main]
include_header = True
input-directory = input
output-directory = output
pointer-type = glob
pointers = *.tsv
# options: collection OR document
iter-mode = collection
# article-format options:
# 'txt_directory',
# 'txt_id',
# 'txt_collection',
# 'txt_json',
# 'txt_tar',
# 'txt_tsv',
# 'conll',
# 'pubtator',
# 'pubtator_fbk',
# 'pxmlgz',
# 'pxml_directory',
# 'pxml_id',
# 'bioc_xml',
# 'bioc_json',
# 'download_pubmed',
# 'download_pmc',
# 'download_bad_pmc',
# 'download_fictious_pmc',
# 'download_random_pmc',
article-format = txt_tsv
# export_format options:
# 'tsv'
# 'txt'
# 'xml'
# 'text_xml'
# 'bioc'
# 'bioc_xml'
# 'bioc_json'
# 'odin':
# 'bionlp'
# 'bionlp.ann'
# 'brat'
# 'brat.ann'
# 'conll'
# 'becalm_tsv'
# 'becalm_json'
# 'pubanno_json'
# 'pubanno_json.tgz'
# 'pubtator'
# 'pubtator_fbk'
# 'europepmc'
# 'europepmc.zip'
export_format = tsv
# Multiple termlists can be added by giving them numbers
# as shown below. Each termlist could have a separate stopword list
# Initially there's just one for all.
termlist1_path = terms/envo_termlist.tsv
# termlist2_path = terms/ncbitaxon_termlist.tsv
# termlist3_path = terms/po_termlist.tsv
# termlist4_path = terms/ecocore_termlist.tsv
# termlist5_path = terms/go_termlist.tsv
# termlist6_path = terms/obi_termlist.tsv
# termlist7_path = terms/ncit_termlist.tsv

termlist_stopwords = stopwords/stopWords.txt
termlist_normalize = lowercase stem-Porter
Loading