-
Notifications
You must be signed in to change notification settings - Fork 0
/
OpenNLP.java
125 lines (92 loc) · 3.31 KB
/
OpenNLP.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
package us.fourfrontdev.OpenNLP;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.Span;
/**
 * Minimal Apache OpenNLP demonstration: splits a hard-coded sample text into
 * sentences, tokenizes every sentence, and prints the person names found in
 * the tokens.
 *
 * <p>The three pre-trained models ({@code en-sent.bin}, {@code en-token.bin},
 * {@code en-ner-person.bin}) are loaded from the root of the classpath. A
 * model that is missing or unreadable is reported on standard error and the
 * affected stage yields an empty result, so later stages simply have nothing
 * to do instead of failing on a {@code null}.
 */
public class OpenNLP {

    /** Classpath location of the sentence-detection model. */
    private static final String SENTENCE_MODEL = "/en-sent.bin";

    /** Classpath location of the tokenizer model. */
    private static final String TOKEN_MODEL = "/en-token.bin";

    /** Classpath location of the person-name finder model. */
    private static final String PERSON_MODEL = "/en-ner-person.bin";

    public OpenNLP() {}

    /**
     * Program entry point; runs the demo pipeline on the built-in sample text.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        processText();
    }

    /**
     * Runs the pipeline on a fixed sample text.
     *
     * <p>See https://opennlp.apache.org/docs/1.9.4/manual/opennlp.html. To find
     * names in raw text, the text must first be segmented into sentences and
     * then into tokens:
     * <ol>
     *   <li>detect sentences</li>
     *   <li>tokenize each sentence</li>
     *   <li>find names in the tokens</li>
     * </ol>
     */
    private static void processText() {
        String text = "Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov. 29. Mr. Vinken is chairman of Elsevier N.V., the Dutch publishing group. Robert Steven Scavilla is the CEO of FourFront, LLC. Rudolph Agnew, 55 years old and former chairman of Consolidated Gold Fields PLC, was named a director of this British industrial conglomerate.";

        // 1. Detect sentences
        String[] sentences = sentenceDetector(text);
        // 2. Tokenize text
        List<String[]> tokenizedSentences = tokenizeSentences(sentences);
        // 3. Find names
        findNames(tokenizedSentences);
    }

    /**
     * Prints every person name found in the given tokenized sentences to
     * standard output, one per line, formatted as {@code <type>: <name>}.
     *
     * @param ts tokenized sentences, one token array per sentence
     */
    private static void findNames(List<String[]> ts) {
        try (InputStream in = openModel(PERSON_MODEL)) {
            NameFinderME nameFinder = new NameFinderME(new TokenNameFinderModel(in));
            for (String[] tokens : ts) {
                for (Span span : nameFinder.find(tokens)) {
                    System.out.println(span.getType() + ": " + getCoveredText(span, tokens));
                }
            }
            // All sentences belong to one document, so the adaptive data is
            // cleared once, after the last sentence has been processed.
            nameFinder.clearAdaptiveData();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Joins the tokens covered by a span into a single space-separated string.
     *
     * <p>{@code Span} has its own {@code getCoveredText} method, but it takes a
     * {@code CharSequence}; the spans handled here index into the token array
     * (see the loop bounds below), so that method presumably does not apply to
     * them and the tokens are joined by hand instead.
     *
     * @param s    span whose start (inclusive) and end (exclusive) are token indices
     * @param docs tokens of the sentence the span was found in
     * @return the covered tokens separated by single spaces, without a trailing space
     */
    private static String getCoveredText(Span s, String[] docs) {
        StringBuilder entity = new StringBuilder();
        for (int i = s.getStart(); i < s.getEnd(); i++) {
            if (entity.length() > 0) {
                entity.append(' ');
            }
            entity.append(docs[i]);
        }
        return entity.toString();
    }

    /**
     * Tokenizes every sentence with the English tokenizer model.
     *
     * @param sentenceText sentences to tokenize; may be {@code null} or empty
     * @return one token array per sentence, in input order; empty (never
     *         {@code null}) when there is no input or the model cannot be loaded
     */
    private static List<String[]> tokenizeSentences(String[] sentenceText) {
        List<String[]> tokenized = new ArrayList<>();
        if (sentenceText == null || sentenceText.length == 0) {
            return tokenized;
        }
        try (InputStream in = openModel(TOKEN_MODEL)) {
            Tokenizer tokenizer = new TokenizerME(new TokenizerModel(in));
            for (String sentence : sentenceText) {
                tokenized.add(tokenizer.tokenize(sentence));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return tokenized;
    }

    /**
     * Splits raw text into sentences with the English sentence model.
     *
     * @param text raw text to segment
     * @return the detected sentences; an empty array (never {@code null}) when
     *         the model cannot be loaded
     */
    private static String[] sentenceDetector(String text) {
        try (InputStream in = openModel(SENTENCE_MODEL)) {
            SentenceDetectorME detector = new SentenceDetectorME(new SentenceModel(in));
            return detector.sentDetect(text);
        } catch (IOException e) {
            e.printStackTrace();
            return new String[0];
        }
    }

    /**
     * Opens a model file from the classpath.
     *
     * @param resource absolute classpath name of the model, e.g. {@code /en-sent.bin}
     * @return an open stream positioned at the start of the model; the caller must close it
     * @throws IOException if no such resource exists on the classpath
     */
    private static InputStream openModel(String resource) throws IOException {
        InputStream in = OpenNLP.class.getResourceAsStream(resource);
        if (in == null) {
            // getResourceAsStream signals "not found" with null, not an exception.
            throw new IOException("Model resource not found on classpath: " + resource);
        }
        return in;
    }
}