diff --git a/examples/src/main/java/zemberek/examples/normalization/NumberConversion.java b/examples/src/main/java/zemberek/examples/normalization/NumberConversion.java
new file mode 100644
index 00000000..0eabc7f5
--- /dev/null
+++ b/examples/src/main/java/zemberek/examples/normalization/NumberConversion.java
@@ -0,0 +1,29 @@
+package zemberek.examples.normalization;
+
+import zemberek.morphology.TurkishMorphology;
+import zemberek.normalization.NumberTextConverter;
+
+/**
+ * Example: rewrites number words in Turkish sentences as digits using
+ * {@link NumberTextConverter}.
+ */
+public class NumberConversion {
+
+  public static void main(String[] args) {
+    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
+    NumberTextConverter numberTextConverter = new NumberTextConverter(morphology);
+    // Inputs mix number words with digits; the last one also contains a "buçuk" pair.
+    String[] examples = {
+        "yirmi 4 milyon, yüz 80 iki bin altmış 3 kişi geldi",
+        "yirmi 4 milyon yüz 80 iki bin altmış 3 ekmek aldım",
+        "sekiz yüz elli 1 buçuk"
+    };
+    System.out.println("Convert textual numbers to numerically values");
+    for (String example : examples) {
+      System.out.println("Example: " + example);
+      String s = numberTextConverter.replaceTextualNumberWithNumerically(example);
+      System.out.println("Response: " + s);
+    }
+  }
+
+}
diff --git a/examples/src/main/java/zemberek/examples/normalization/WordSegment.java b/examples/src/main/java/zemberek/examples/normalization/WordSegment.java
new file mode 100644
index 00000000..e41f708b
--- /dev/null
+++ b/examples/src/main/java/zemberek/examples/normalization/WordSegment.java
@@ -0,0 +1,28 @@
+package zemberek.examples.normalization;
+
+import zemberek.morphology.TurkishMorphology;
+import zemberek.normalization.WordSegmenter;
+
+import java.io.IOException;
+import java.util.List;
+
+/**
+ * Example: splits run-together Turkish text into words using {@link WordSegmenter}.
+ */
+public class WordSegment {
+
+  public static void main(String[] args) throws IOException {
+    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
+
+    WordSegmenter segmenter = new WordSegmenter(morphology);
+    String[] examples = {"istanbulyağmurluolacak", "benimlegelirmisin"};
+    for (String example : examples) {
+      System.out.println("Example is : " + example);
+      // Ask for at most 4 candidate segmentations per input.
+      List<String> wordBreak = segmenter.wordBreak(example, 4);
+      wordBreak.forEach(System.out::println);
+      System.out.println("---------------------------");
+    }
+
+  }
+}
diff --git a/normalization/src/main/java/zemberek/normalization/NumberTextConverter.java b/normalization/src/main/java/zemberek/normalization/NumberTextConverter.java
new file mode 100644
index 00000000..75b60c9f
--- /dev/null
+++ b/normalization/src/main/java/zemberek/normalization/NumberTextConverter.java
@@ -0,0 +1,260 @@
+package zemberek.normalization;
+
+import zemberek.core.turkish.PrimaryPos;
+import zemberek.core.turkish.SecondaryPos;
+import zemberek.morphology.TurkishMorphology;
+import zemberek.morphology.analysis.SentenceAnalysis;
+import zemberek.morphology.analysis.SingleAnalysis;
+import zemberek.morphology.analysis.WordAnalysis;
+import zemberek.morphology.analysis.tr.TurkishNumbers;
+import zemberek.tokenization.Token;
+import zemberek.tokenization.TurkishTokenizer;
+
+import java.text.DecimalFormat;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Converts between number words and digits in Turkish sentences.
+ */
+public class NumberTextConverter {
+
+  /**
+   * Matches a whitespace-delimited digit string or single number word followed by "buçuk".
+   * Compiled once and shared by all instances.
+   */
+  private static final Pattern HALF_NUMBER_PATTERN = Pattern.compile(
+      "(\\s\\d+\\s(buçuk)\\s)"
+          + "|(\\s(bir|iki|üç|dört|beş|altı|yedi|sekiz|dokuz|on"
+          + "|yirmi|otuz|kırk|elli|atmış|altmış|yetmiş|seksen|doksan"
+          + "|yüz|bin|milyon|milyar)\\s(buçuk)\\s)");
+
+  /** Upper bound on loop iterations in {@link #getMultipleSubNumbers(List)}. */
+  private static final int MAX_SPLIT_ITERATIONS = 500;
+
+  /** Stand-in that never equals a number word; used when a token cannot be spelled out. */
+  private static final String UNKNOWN_NUMBER = "UNK_NUMBER_FORMAT";
+
+  private final TurkishMorphology morphology;
+
+  public NumberTextConverter(TurkishMorphology morphology) {
+    this.morphology = morphology;
+  }
+
+  /**
+   * Replaces runs of cardinal number words (and digits mixed into them) with one digit string
+   * per number, then merges "N buçuk" pairs via {@link #concatHalfNumberPair(String)}.
+   *
+   * @param sentence text to convert
+   * @return the converted text
+   */
+  public String replaceTextualNumberWithNumerically(String sentence) {
+    List<List<String>> numbers = collectNumberStrings(sentence);
+    return convertNumberListToNumericalString(sentence, numbers);
+  }
+
+  /**
+   * Replaces every all-digit token in {@code sentence} with its spelled-out form.
+   *
+   * @param sentence text to convert
+   * @return the text with digit tokens spelled out; other tokens are copied unchanged
+   */
+  public String replaceNumericallyWithTextualNumber(String sentence) {
+    return convertNumberListToTextualString(sentence);
+  }
+
+  /**
+   * Rewrites each "N buçuk" pair (N a digit string or a single number word) as N + 0.5.
+   *
+   * @param text text to scan
+   * @return {@code text} unchanged when nothing matches; otherwise the rewritten text, trimmed
+   *     and with runs of spaces collapsed to one
+   */
+  public String concatHalfNumberPair(String text) {
+    // Pad both ends so a pair at the start or end still has the whitespace the pattern needs.
+    String sentence = " " + text + " ";
+    Matcher match = HALF_NUMBER_PATTERN.matcher(sentence);
+
+    StringBuilder spacedSentence = new StringBuilder();
+    int lastIndex = 0;
+    DecimalFormat df = new DecimalFormat("#");
+    df.setMaximumFractionDigits(4);
+    while (match.find()) {
+      String matchedPart = match.group();
+      // The match begins with whitespace, so for a plain space split[0] is "" and split[1] is N.
+      String[] split = matchedPart.split(" ");
+      String newString;
+      if (split.length > 1) {
+        double bucukNumber;
+        if (TurkishNumbers.hasNumber(split[1])) {
+          bucukNumber = Long.parseLong(split[1]) + 0.5;
+        } else {
+          bucukNumber = TurkishNumbers.convertToNumber(split[1]) + 0.5;
+        }
+        newString = matchedPart.substring(0, matchedPart.indexOf(split[1]))
+            + " " + df.format(bucukNumber);
+      } else {
+        newString = split[0] + " " + matchedPart.substring(split[0].length()) + " ";
+      }
+      spacedSentence.append(sentence, lastIndex, match.start())
+          .append(" ").append(newString).append(" ");
+      lastIndex = match.end();
+    }
+
+    if (lastIndex != 0 && lastIndex != sentence.length()) {
+      spacedSentence.append(sentence.substring(lastIndex)).append(" ");
+    }
+
+    if (spacedSentence.length() == 0) {
+      return text;
+    }
+    return spacedSentence.toString().trim().replaceAll(" +", " ");
+  }
+
+  /**
+   * Groups consecutive cardinal numerals of {@code sentence} into runs of number words.
+   * Digit lemmas are spelled out first, so every run holds words only; "buçuk" ends a run.
+   */
+  private List<List<String>> collectNumberStrings(String sentence) {
+    List<WordAnalysis> analyses = morphology.analyzeSentence(sentence);
+    SentenceAnalysis result = morphology.disambiguate(sentence, analyses);
+    List<SingleAnalysis> bestAnalysis = result.bestAnalysis();
+    List<List<String>> numbers = new ArrayList<>();
+    List<String> number = new ArrayList<>();
+    for (int i = 0; i < analyses.size(); i++) {
+      SingleAnalysis sa = bestAnalysis.get(i);
+      String lemma = sa.getDictionaryItem().lemma;
+      boolean cardinal = sa.getDictionaryItem().primaryPos == PrimaryPos.Numeral
+          && sa.getDictionaryItem().secondaryPos == SecondaryPos.Cardinal;
+
+      if (cardinal && !lemma.equals("buçuk")) {
+        if (!TurkishNumbers.hasOnlyNumber(lemma)) {
+          number.add(lemma);
+        } else {
+          String words = TurkishNumbers.convertNumberToString(lemma);
+          number.addAll(Arrays.asList(words.trim().replaceAll(" +", " ").split(" ")));
+        }
+      } else if (!number.isEmpty()) {
+        // Any other word closes the run collected so far.
+        numbers.add(number);
+        number = new ArrayList<>();
+      }
+    }
+    if (!number.isEmpty()) {
+      numbers.add(number);
+    }
+    return numbers;
+  }
+
+  /** Spells out every all-digit token of {@code sentence}; other tokens are copied as-is. */
+  private String convertNumberListToTextualString(String sentence) {
+    List<Token> tokens = TurkishTokenizer.ALL.tokenize(sentence);
+    StringBuilder resultText = new StringBuilder();
+    for (Token token : tokens) {
+      String text = token.getText();
+      if (TurkishNumbers.hasOnlyNumber(text)) {
+        resultText.append(TurkishNumbers.convertNumberToString(text));
+      } else {
+        resultText.append(text);
+      }
+    }
+    return resultText.toString();
+  }
+
+  /**
+   * Replaces each number run found by {@link #collectNumberStrings(String)} with its value.
+   * The text is re-tokenized for every sub-run because earlier replacements change it.
+   */
+  private String convertNumberListToNumericalString(String sentence, List<List<String>> numbers) {
+    String resultText = sentence;
+    for (List<String> list : numbers) {
+      for (List<String> subNumbers : getMultipleSubNumbers(list)) {
+        long numberValue = TurkishNumbers.convertToNumber(subNumbers.toArray(new String[0]));
+        // Cheap pre-filter: the token must occur in the run's first word or in its digit form.
+        String firstWord = subNumbers.get(0);
+        String firstWordDigits = String.valueOf(TurkishNumbers.convertToNumber(firstWord));
+
+        List<Token> tokens = TurkishTokenizer.DEFAULT.tokenize(resultText);
+        StringBuilder combineWords = new StringBuilder();
+        for (int i = 0; i < tokens.size(); i++) {
+          String tokenText = tokens.get(i).getText();
+          boolean candidate =
+              firstWord.contains(tokenText) || firstWordDigits.contains(tokenText);
+          if (candidate && matchesAt(tokens, i, subNumbers)) {
+            combineWords.append(numberValue).append(' ');
+            i += subNumbers.size() - 1;
+          } else {
+            combineWords.append(tokenText).append(' ');
+          }
+        }
+        resultText = combineWords.toString().trim();
+      }
+    }
+    return concatHalfNumberPair(resultText);
+  }
+
+  /**
+   * Tells whether the tokens starting at {@code start} spell {@code words}, one token per word.
+   * A token matches a word when it equals the word or when its spelled-out form does.
+   */
+  private static boolean matchesAt(List<Token> tokens, int start, List<String> words) {
+    // A run may end on the very last token. The earlier strict "<" bound rejected that case,
+    // so a number that closed the sentence was never converted.
+    if (start + words.size() > tokens.size()) {
+      return false;
+    }
+    for (int k = 0; k < words.size(); k++) {
+      String tokenText = tokens.get(start + k).getText();
+      String word = words.get(k);
+      if (!tokenText.equals(word) && !spellOut(tokenText).equals(word)) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /** Spells out a digit token, or returns {@link #UNKNOWN_NUMBER} if conversion fails. */
+  private static String spellOut(String tokenText) {
+    try {
+      return TurkishNumbers.convertNumberToString(tokenText);
+    } catch (RuntimeException ignored) {
+      // Expected for ordinary words: the token simply is not a number.
+      return UNKNOWN_NUMBER;
+    }
+  }
+
+  /**
+   * Splits a run of number words into consecutive sub-runs that each convert to a number.
+   * Tries the longest remaining prefix first and drops one trailing word per failed attempt.
+   */
+  private static List<List<String>> getMultipleSubNumbers(List<String> textualNumbers) {
+    List<List<String>> multipleNumbers = new ArrayList<>();
+    int startIndex = 0;
+    int endIndex = textualNumbers.size();
+    int iterations = 0;
+    boolean isEnd = false;
+    while (!isEnd) {
+      iterations++;
+      List<String> candidate = textualNumbers.subList(startIndex, endIndex);
+      // -1 is treated as "not a number"; an empty candidate is handled the same way.
+      long numberValue = candidate.isEmpty()
+          ? -1L
+          : TurkishNumbers.convertToNumber(candidate.toArray(new String[0]));
+
+      if (numberValue == -1) {
+        endIndex--;
+      } else {
+        multipleNumbers.add(candidate);
+        startIndex = endIndex;
+        endIndex = textualNumbers.size();
+      }
+      isEnd = startIndex >= textualNumbers.size()
+          || startIndex == endIndex
+          || iterations > MAX_SPLIT_ITERATIONS;
+    }
+    return multipleNumbers;
+  }
+}
diff --git a/normalization/src/main/java/zemberek/normalization/WordSegmenter.java b/normalization/src/main/java/zemberek/normalization/WordSegmenter.java
new file mode 100644
index 00000000..4fc2164a
--- /dev/null
+++ b/normalization/src/main/java/zemberek/normalization/WordSegmenter.java
@@ -0,0 +1,109 @@
+package zemberek.normalization;
+
+import java.io.IOException;
+import java.util.*;
+import java.util.stream.Collectors;
+
+import zemberek.morphology.TurkishMorphology;
+import zemberek.morphology.analysis.WordAnalysis;
+
+/**
+ * Splits text written without spaces into sequences of words that the morphological analyzer
+ * accepts.
+ */
+public class WordSegmenter {
+
+  private static final int DEFAULT_SUGGESTION_COUNT = 10;
+
+  private final TurkishMorphology morphology;
+  private final int topSuggestionCount;
+
+  public WordSegmenter(TurkishMorphology morphology) {
+    this(morphology, DEFAULT_SUGGESTION_COUNT);
+  }
+
+  public WordSegmenter(TurkishMorphology morphology, int topSuggestionCount) {
+    this.morphology = morphology;
+    this.topSuggestionCount = topSuggestionCount;
+  }
+
+  /** Segments {@code s}, returning at most the configured number of suggestions. */
+  public List<String> wordBreak(String s) {
+    return wordBreaker(s, this.topSuggestionCount);
+  }
+
+  /** Segments {@code s}, returning at most {@code suggestCount} suggestions. */
+  public List<String> wordBreak(String s, int suggestCount) {
+    return wordBreaker(s, suggestCount);
+  }
+
+  /**
+   * Finds space-separated segmentations of {@code s}.
+   *
+   * @return up to {@code topSuggestionCount} segmentations; empty when no word is found
+   */
+  private List<String> wordBreaker(String s, int topSuggestionCount) {
+    // pos.get(j) lists the accepted words that end at offset j; null means none does.
+    List<List<String>> pos =
+        new ArrayList<>(Collections.<List<String>>nCopies(s.length() + 1, null));
+    pos.set(0, new ArrayList<>());
+
+    int lastIndex = 0;
+    // NOTE(review): the two loop headers and the substring line were missing from the reviewed
+    // copy of this file and are reconstructed from the surviving body - confirm before merging.
+    for (int i = 0; i < s.length(); i++) {
+      if (pos.get(i) != null) {
+        for (int j = i + 1; j <= s.length(); j++) {
+          String sub = s.substring(i, j);
+          if (sub.length() > 2) {
+            WordAnalysis analyze = morphology.analyze(sub);
+            if (analyze.isCorrect()) {
+              if (pos.get(j) == null) {
+                pos.set(j, new ArrayList<>());
+              }
+              pos.get(j).add(sub);
+              lastIndex = j;
+            }
+          }
+        }
+      }
+    }
+
+    if (pos.get(lastIndex) == null) {
+      return new ArrayList<>();
+    }
+    Set<String> resultSet = new LinkedHashSet<>();
+    dfs(pos, resultSet, "", s.length());
+    // toCollection guarantees a mutable list; Collectors.toList() makes no such promise.
+    List<String> result = resultSet.stream().collect(Collectors.toCollection(ArrayList::new));
+    // A leading empty string means the search reached offset 0 without using any word.
+    if (!result.isEmpty() && result.get(0).isEmpty()) {
+      result.clear();
+    }
+    if (result.size() < topSuggestionCount) {
+      return result;
+    }
+    return result.subList(0, topSuggestionCount);
+  }
+
+  /**
+   * Walks backwards from offset {@code i}, prepending a word ending there to {@code curr} and
+   * recording the phrase once offset 0 is reached. Offsets where no word ends are skipped.
+   */
+  private void dfs(List<List<String>> pos, Set<String> result, String curr, int i) {
+    if (i == 0) {
+      result.add(curr.trim());
+      return;
+    }
+
+    List<String> endingHere = pos.get(i);
+    if (endingHere == null) {
+      dfs(pos, result, curr, i - 1);
+      return;
+    }
+    for (String s : endingHere) {
+      String combined = s + " " + curr;
+      dfs(pos, result, combined.trim().replaceAll("\\s+", " "), i - s.length());
+    }
+  }
+}