Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added 2 features: word segment and number conversion #264

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
package zemberek.examples.normalization;

import zemberek.morphology.TurkishMorphology;
import zemberek.normalization.NumberTextConverter;

public class NumberConversion {
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In examples, prefer command-like names, e.g. "ConvertNumbers".


/**
 * Demo entry point: prints each sample sentence followed by the result of
 * {@code NumberTextConverter.replaceTextualNumberWithNumerically} for it.
 *
 * @param args ignored
 */
public static void main(String[] args) {
  NumberTextConverter numberTextConverter =
      new NumberTextConverter(TurkishMorphology.createWithDefaults());
  String[] examples = {
      "yirmi 4 milyon, yüz 80 iki bin altmış 3 kişi geldi",
      "yirmi 4 milyon yüz 80 iki bin altmış 3 ekmek aldım",
      "sekiz yüz elli 1 buçuk"
  };

  System.out.println("Convert textual numbers to numerically values");
  for (int idx = 0; idx < examples.length; idx++) {
    String input = examples[idx];
    System.out.println("Example: " + input);
    System.out.println(
        "Response: " + numberTextConverter.replaceTextualNumberWithNumerically(input));
  }
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
package zemberek.examples.normalization;

import zemberek.morphology.TurkishMorphology;
import zemberek.normalization.WordSegmenter;

import java.io.IOException;
import java.util.List;

public class WordSegment {
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"SegmentConnectedWords" or such.


/**
 * Demo entry point: for every sample of run-together text, prints the input and then up to
 * four segmentation suggestions returned by {@code WordSegmenter.wordBreak}, one per line.
 *
 * @param args ignored
 * @throws IOException declared by the original signature
 */
public static void main(String[] args) throws IOException {
  WordSegmenter segmenter = new WordSegmenter(TurkishMorphology.createWithDefaults());
  String[] examples = {"istanbulyağmurluolacak", "benimlegelirmisin"};

  for (int idx = 0; idx < examples.length; idx++) {
    String input = examples[idx];
    System.out.println("Example is : " + input);
    List<String> suggestions = segmenter.wordBreak(input, 4);
    for (String suggestion : suggestions) {
      System.out.println(suggestion);
    }
    System.out.println("---------------------------");
  }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
package zemberek.normalization;

import zemberek.core.turkish.PrimaryPos;
import zemberek.core.turkish.SecondaryPos;
import zemberek.morphology.TurkishMorphology;
import zemberek.morphology.analysis.SentenceAnalysis;
import zemberek.morphology.analysis.SingleAnalysis;
import zemberek.morphology.analysis.WordAnalysis;
import zemberek.morphology.analysis.tr.TurkishNumbers;
import zemberek.tokenization.Token;
import zemberek.tokenization.TurkishTokenizer;

import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class NumberTextConverter {
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add small doc, what it does exactly.


/** Morphological analyzer used to analyze and disambiguate the input sentences. */
private final TurkishMorphology morphology;
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can be final

/**
 * Regex source for a whitespace-delimited number immediately followed by the word "buçuk":
 * either a run of digits or one of the listed Turkish number words. Both "atmış" and "altmış"
 * are listed; presumably the former is a colloquial spelling (TODO confirm).
 */
private final String patternToFind = "(\\s\\d+\\s(buçuk)\\s)|(\\s(bir|iki|üç|dört|beş|altı|yedi|sekiz|dokuz|on|yirmi|otuz|kırk|elli|atmış|altmış|yetmiş|seksen|doksan|yüz|bin|milyon|milyar)\\s(buçuk)\\s)";
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can be final.

/** Compiled form of {@code patternToFind}; assigned once in the constructor. */
private final Pattern halfNumberPatternT2N;

/**
 * Creates a converter backed by the given morphology.
 *
 * @param morphology analyzer used for sentence analysis and disambiguation
 */
public NumberTextConverter(TurkishMorphology morphology) {
  // NOTE(review): the reviewer suggested making this pattern a static final class constant
  // instead of compiling it per instance.
  this.halfNumberPatternT2N = Pattern.compile(patternToFind);
  this.morphology = morphology;
}

public String replaceTextualNumberWithNumerically(String sentence) {
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Document.

List<List<String>> numbers = collectNumberStrings(sentence);
String result = convertNumberListToNumericalString(sentence, numbers);
return result;
}

public String replaceNumericallyWithTextualNumber(String sentence) {
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

document

String result = convertNumberListToTextualString(sentence);
return result;
}

public String concatHalfNumberPair(String text) {
String sentence = " " + text + " ";
Matcher match = halfNumberPatternT2N.matcher(sentence);

StringBuilder spacedSentence = new StringBuilder();
int lastIndex = 0;
DecimalFormat df = new DecimalFormat("#");
df.setMaximumFractionDigits(4);
while (match.find()) {
String matchedPart = sentence.substring(match.start(), match.end());
String[] split = matchedPart.split(" ");
String newString = "";
if(split.length > 1) {
Double bucukNumber = 0.0;
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"halfNumber"?

if(TurkishNumbers.hasNumber(split[1])){
Long number = Long.parseLong(split[1]);
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Prefer primitive values as much as possible. long, double etc.

bucukNumber = number + 0.5;
} else {
bucukNumber = TurkishNumbers.convertToNumber(split[1]) + 0.5;
}

newString = matchedPart.substring(0, matchedPart.indexOf(split[1])) + " " + df.format(bucukNumber);
} else {
newString = split[0] + " " + matchedPart.substring(split[0].length()) + " ";
}
spacedSentence.append(sentence.substring(lastIndex, match.start())).append(" ").append(newString).append(" ");
lastIndex = match.end();
}

if(lastIndex != 0 && lastIndex != sentence.length()) {
spacedSentence.append(sentence.substring(lastIndex)).append(" ");
}

if (spacedSentence.toString().isEmpty()) {
return text;
} else {
return spacedSentence.toString().trim().replaceAll(" +", " ");
}
}

/**
 * Collects the runs of consecutive cardinal numerals found in the sentence.
 *
 * <p>The sentence is analyzed and disambiguated; each word whose best analysis is a cardinal
 * numeral (other than "buçuk") contributes to the current run. Word lemmas are added as they
 * are, while digit-only lemmas are first spelled out and added word by word. Any other word
 * closes the current run.
 *
 * @param sentence input sentence
 * @return one list of number words per run, in sentence order
 */
private List<List<String>> collectNumberStrings(String sentence) {
  List<WordAnalysis> wordAnalyses = morphology.analyzeSentence(sentence);
  SentenceAnalysis disambiguated = morphology.disambiguate(sentence, wordAnalyses);
  List<SingleAnalysis> best = disambiguated.bestAnalysis();

  List<List<String>> runs = new ArrayList<>();
  List<String> currentRun = new ArrayList<>();
  for (int idx = 0; idx < wordAnalyses.size(); idx++) {
    SingleAnalysis analysis = best.get(idx);
    String lemma = analysis.getDictionaryItem().lemma;
    boolean cardinal = analysis.getDictionaryItem().primaryPos == PrimaryPos.Numeral
        && analysis.getDictionaryItem().secondaryPos == SecondaryPos.Cardinal;
    boolean partOfNumber = cardinal && !lemma.equals("buçuk");

    if (!partOfNumber) {
      // A non-number word ends the run that was being built, if any.
      if (!currentRun.isEmpty()) {
        runs.add(currentRun);
        currentRun = new ArrayList<>();
      }
      continue;
    }

    if (TurkishNumbers.hasOnlyNumber(lemma)) {
      String spelledOut = TurkishNumbers.convertNumberToString(lemma);
      String[] words = spelledOut.trim().replaceAll(" +", " ").split(" ");
      currentRun.addAll(Arrays.asList(words));
    } else {
      currentRun.add(lemma);
    }
  }

  if (!currentRun.isEmpty()) {
    runs.add(currentRun);
  }
  return runs;
}

/**
 * Rebuilds the sentence token by token, replacing every digit-only token with the text
 * returned by {@code TurkishNumbers.convertNumberToString}. Single-space tokens and all other
 * tokens are copied as they are.
 *
 * @param sentence input sentence
 * @return the rebuilt sentence
 */
private String convertNumberListToTextualString(String sentence) {
  StringBuilder rebuilt = new StringBuilder();
  for (Token token : TurkishTokenizer.ALL.tokenize(sentence)) {
    String text = token.getText();
    boolean digitsOnly = !text.equals(" ") && TurkishNumbers.hasOnlyNumber(text);
    rebuilt.append(digitsOnly ? TurkishNumbers.convertNumberToString(text) : text);
  }
  return rebuilt.toString();
}

private String convertNumberListToNumericalString(String sentence, List<List<String>> numbers){
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is rather complex, I will check later. If you add some tests, it is fine.

String resultText = sentence;
for (List<String> list : numbers) {
List<List<String>> multipleSubNumbers = getMultipleSubNumbers(list);
for (List<String> subNumbers : multipleSubNumbers) {
Long numberValue = TurkishNumbers.convertToNumber(subNumbers.toArray(new String[subNumbers.size()]));

List<Token> tokens = TurkishTokenizer.DEFAULT.tokenize(resultText);
String combineWords = "";
for (int i = 0; i < tokens.size(); i++) {
long l = TurkishNumbers.convertToNumber(subNumbers.get(0));
if(subNumbers.get(0).contains(tokens.get(i).getText()) || (l+"").contains(tokens.get(i).getText())) {
int subCounter = 0;
boolean isMatch = true;
if(i + subNumbers.size() < tokens.size()) {
for (int j = i; j < i + subNumbers.size(); j++) {
String numberStr = "UNK_NUMBER_FORMAT";
try {
numberStr = TurkishNumbers.convertNumberToString(tokens.get(j).getText());
} catch (Exception e){}
if(!(tokens.get(j).getText().equals(subNumbers.get(subCounter)) || numberStr.equals(subNumbers.get(subCounter)))) {
isMatch = false;
break;
}
subCounter++;
}
} else {
isMatch = false;
}
if(isMatch) {
combineWords += numberValue + " ";
i = i + subNumbers.size() - 1;
} else {
combineWords += tokens.get(i).getText() + " ";
}
} else {
combineWords += tokens.get(i).getText() + " ";
}
}
resultText = combineWords.trim();
}
}
resultText = concatHalfNumberPair(resultText);
return resultText;
}

/**
 * Splits a run of number words into consecutive sub-sequences that each convert to a number.
 *
 * <p>Starting at the first word, the longest window that converts (result other than -1) is
 * taken, and the search restarts right after it. The loop stops when the whole run is
 * consumed, when not even a single word at the current position converts, or after more than
 * 500 iterations.
 *
 * @param textualNumbers run of number words
 * @return the sub-sequences found, as views of {@code textualNumbers}
 */
private static List<List<String>> getMultipleSubNumbers(List<String> textualNumbers) {
  final int total = textualNumbers.size();
  List<List<String>> parts = new ArrayList<>();

  int from = 0;
  int to = total;
  int iterations = 0;
  do {
    iterations++;
    List<String> window = textualNumbers.subList(from, to);
    long value = window.isEmpty()
        ? -1L
        : TurkishNumbers.convertToNumber(window.toArray(new String[0]));

    if (value == -1L) {
      // Window does not form a number: retry with one word less.
      to--;
    } else {
      parts.add(window);
      from = to;
      to = total;
    }
  } while (from < total && from != to && iterations <= 500);

  return parts;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
package zemberek.normalization;

import java.io.IOException;
import java.util.*;
import java.util.stream.Collectors;

import zemberek.morphology.TurkishMorphology;
import zemberek.morphology.analysis.WordAnalysis;

public class WordSegmenter {
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add small doc. To class or public methods. If you add some tests it is fine, I will check algortihm when I have time.


private TurkishMorphology morphology;
private int topSuggestionCount = 10;

public WordSegmenter(TurkishMorphology morphology) {
this.morphology = morphology;
}
public WordSegmenter(TurkishMorphology morphology, int topSuggestionCount) {
this(morphology);
this.topSuggestionCount = topSuggestionCount;
}

public List<String> wordBreak(String s) {
return wordBreaker(s, this.topSuggestionCount);
}
public List<String> wordBreak(String s, int suggestCount) {
return wordBreaker(s, suggestCount);
}

private List<String> wordBreaker(String s, int topSuggestionCount) {
ArrayList<String> [] pos = new ArrayList[s.length()+1];
pos[0]=new ArrayList<String>();

int lastIndex = 0;
for(int i=0; i<s.length(); i++){
if(pos[i]!=null){
for(int j=i+1; j<=s.length(); j++){
String sub = s.substring(i,j);
if(sub.length() > 2) {
WordAnalysis analyze = morphology.analyze(sub);
if(analyze.isCorrect()){
if(pos[j]==null){
ArrayList<String> list = new ArrayList<String>();
list.add(sub);
pos[j]=list;
}else{
pos[j].add(sub);
}
lastIndex = j;
}
}
}
}
}

if(pos[lastIndex]==null){
return new ArrayList<String>();
}else{
LinkedHashSet<String> resultSet = new LinkedHashSet<String>();
dfs(pos, resultSet, "", s.length());
List<String> result = resultSet.stream().collect(Collectors.toList());
if(!result.isEmpty() && result.get(0).isEmpty()) {
result.clear();
}
if(result.size() < topSuggestionCount){
return result;
} else {
return result.subList(0, topSuggestionCount);
}
}
}

private void dfs(ArrayList<String> [] pos, Set<String> result, String curr, int i){
if(i==0){
result.add(curr.trim());
return;
}

if(pos[i] == null){
dfs(pos, result, curr, i-1);
} else {
for(String s: pos[i]){
String combined = s + " "+ curr;
dfs(pos, result, combined.trim().replaceAll("\\s+", " "), i-s.length());
}
}
}
}