-
Notifications
You must be signed in to change notification settings - Fork 84
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #71 from nipunsadvilkar/npn-abbr-refactor
- Loading branch information
Showing
14 changed files
with
164 additions
and
51 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
import blingfire | ||
import nltk | ||
import pysbd | ||
import spacy | ||
import stanza | ||
|
||
from syntok.tokenizer import Tokenizer | ||
import syntok.segmenter as syntok_segmenter | ||
|
||
# pySBD segmenter: raw text in, plain sentence strings out (no char spans).
pysbd_segmenter = pysbd.Segmenter(language="en", clean=False, char_span=False)

# spaCy, two flavours: a blank pipeline carrying only the rule-based
# sentencizer, and a full English model whose parse informs sentence splits
# (NER disabled — it is not needed for segmentation).
nlp = spacy.blank('en')
nlp.add_pipe(nlp.create_pipe("sentencizer"))
nlp_dep = spacy.load('en_core_web_sm', disable=["ner"])

# Stanza neural pipeline restricted to the tokenize processor.
#stanza.download('en')
stanza_nlp = stanza.Pipeline(lang='en', processors='tokenize')

# syntok word tokenizer; its token stream feeds syntok's segmenter below.
syntok_tokenizer = Tokenizer()
|
||
def blingfire_tokenize(text):
    """Segment *text* with BlingFire; it emits one sentence per line."""
    sentence_block = blingfire.text_to_sentences(text)
    return sentence_block.split('\n')
|
||
def nltk_tokenize(text):
    """Segment *text* into sentences via NLTK's ``sent_tokenize``."""
    return nltk.sent_tokenize(text)
|
||
def pysbd_tokenize(text):
    """Segment *text* with pySBD, trimming surrounding whitespace."""
    return [segment.strip() for segment in pysbd_segmenter.segment(text)]
|
||
def spacy_tokenize(text):
    """Segment *text* with the blank spaCy sentencizer pipeline."""
    doc = nlp(text)
    return [sentence.text.strip("\n") for sentence in doc.sents]
|
||
def spacy_dep_tokenize(text):
    """Segment *text* with the full spaCy model (parser-driven splits)."""
    doc = nlp_dep(text)
    return [sentence.text.strip("\n") for sentence in doc.sents]
|
||
def stanza_tokenize(text):
    """Segment *text* with the Stanza tokenize pipeline."""
    return [sentence.text for sentence in stanza_nlp(text).sentences]
|
||
def make_sentences(segmented_tokens):
    """Yield one trimmed sentence string per token sequence.

    Each element of *segmented_tokens* is an iterable of tokens whose
    ``str()`` forms are concatenated, then stripped of leading/trailing
    whitespace.
    """
    for token_seq in segmented_tokens:
        pieces = [str(tok) for tok in token_seq]
        yield "".join(pieces).strip()
|
||
def syntok_tokenize(text):
    """Tokenize *text* with syntok, segment the stream, and rejoin tokens."""
    token_stream = syntok_tokenizer.split(text)
    segmented = syntok_segmenter.split(iter(token_stream))
    return list(make_sentences(segmented))
|
||
def speed_benchmark(big_text, tokenize_func):
    """Apply *tokenize_func* to *big_text* and return the resulting segments."""
    return tokenize_func(big_text)
|
||
if __name__ == "__main__":
    import time

    libraries = (
        blingfire_tokenize,
        nltk_tokenize,
        pysbd_tokenize,
        spacy_tokenize,
        spacy_dep_tokenize,
        stanza_tokenize,
        syntok_tokenize,
    )

    # Read the corpus ONCE, outside the timed region. The original re-opened
    # and re-read the file inside the loop, after starting the timer, so every
    # measurement included disk I/O on top of segmentation time.
    # wget http://www.gutenberg.org/files/1661/1661-0.txt -P benchmarks/
    with open('benchmarks/1661-0.txt') as bigfile:
        big_text = bigfile.read()

    for tokenize_func in libraries:
        # perf_counter is monotonic and high-resolution — the documented
        # choice for interval timing (time.time can jump with clock changes).
        t = time.perf_counter()
        sentences = speed_benchmark(big_text, tokenize_func)
        time_taken = time.perf_counter() - t

        print()
        print(tokenize_func.__name__)
        print('Speed : {:>20.2f} ms'.format(time_taken * 1000))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.