diff --git a/.idea/checkstyle-idea.xml b/.idea/checkstyle-idea.xml
new file mode 100644
index 00000000..760d8299
--- /dev/null
+++ b/.idea/checkstyle-idea.xml
@@ -0,0 +1,16 @@
+
+
+
+ 10.9.3
+ JavaOnly
+ true
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/compiler.xml b/.idea/compiler.xml
new file mode 100644
index 00000000..1339a282
--- /dev/null
+++ b/.idea/compiler.xml
@@ -0,0 +1,25 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/encodings.xml b/.idea/encodings.xml
new file mode 100644
index 00000000..c3e3257d
--- /dev/null
+++ b/.idea/encodings.xml
@@ -0,0 +1,33 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/jarRepositories.xml b/.idea/jarRepositories.xml
new file mode 100644
index 00000000..c3ddb503
--- /dev/null
+++ b/.idea/jarRepositories.xml
@@ -0,0 +1,25 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 00000000..9b865bc2
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,13 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 00000000..94a25f7f
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/workspace.xml b/.idea/workspace.xml
new file mode 100644
index 00000000..f182fe2e
--- /dev/null
+++ b/.idea/workspace.xml
@@ -0,0 +1,70 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 1680791954209
+
+
+ 1680791954209
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/apps/src/main/java/zemberek/apps/ApplicationRunner.java b/apps/src/main/java/zemberek/apps/ApplicationRunner.java
index e637f19b..59cc10ea 100644
--- a/apps/src/main/java/zemberek/apps/ApplicationRunner.java
+++ b/apps/src/main/java/zemberek/apps/ApplicationRunner.java
@@ -39,7 +39,8 @@ private static void listApplications(List apps) {
String simpleName = app.getClass().getSimpleName();
System.out.println(simpleName);
System.out.println(Strings.repeat("-", simpleName.length()));
- String wrapped = wrap(app.description(), 80);
+ int wrappedDescriptionLengthLimit = 80;
+ String wrapped = wrap(app.description(), wrappedDescriptionLengthLimit);
System.out.println(wrapped);
System.out.println();
}
diff --git a/apps/src/main/java/zemberek/apps/fasttext/ClassificationConsole.java b/apps/src/main/java/zemberek/apps/fasttext/ClassificationConsole.java
index 2cb356b0..ca73db01 100644
--- a/apps/src/main/java/zemberek/apps/fasttext/ClassificationConsole.java
+++ b/apps/src/main/java/zemberek/apps/fasttext/ClassificationConsole.java
@@ -119,15 +119,9 @@ private String replaceWordsWithLemma(String sentence) {
private String removeNonWords(String sentence) {
List docTokens = TurkishTokenizer.DEFAULT.tokenize(sentence);
List reduced = new ArrayList<>(docTokens.size());
+
for (Token token : docTokens) {
- if (
- token.getType() == Type.PercentNumeral ||
- token.getType() == Type.Number ||
- token.getType() == Type.Punctuation ||
- token.getType() == Type.RomanNumeral ||
- token.getType() == Type.Time ||
- token.getType() == Type.UnknownWord ||
- token.getType() == Type.Unknown) {
+ if (isTokenNonWord(token)) {
if (!token.getText().contains("__")) {
continue;
}
@@ -138,6 +132,16 @@ private String removeNonWords(String sentence) {
return String.join(" ", reduced);
}
+ private boolean isTokenNonWord(Token token) {
+ return token.getType() == Type.PercentNumeral ||
+ token.getType() == Type.Number ||
+ token.getType() == Type.Punctuation ||
+ token.getType() == Type.RomanNumeral ||
+ token.getType() == Type.Time ||
+ token.getType() == Type.UnknownWord ||
+ token.getType() == Type.Unknown;
+ }
+
public static void main(String[] args) {
new ClassificationConsole().execute(args);
}
diff --git a/core/target/classes/zemberek/core/syllable/accepted-syllable-prefixes b/core/target/classes/zemberek/core/syllable/accepted-syllable-prefixes
new file mode 100644
index 00000000..b49b0e80
--- /dev/null
+++ b/core/target/classes/zemberek/core/syllable/accepted-syllable-prefixes
@@ -0,0 +1,62 @@
+bl
+br
+ch
+cl
+cr
+cy
+dj
+dr
+dz
+fl
+fr
+gh
+gl
+gr
+gy
+hr
+hy
+kh
+kl
+kn
+kr
+ks
+ky
+ll
+ly
+mb
+mc
+mn
+my
+ph
+pl
+pn
+pr
+ps
+pt
+rh
+sc
+sf
+sh
+sk
+sl
+sm
+sn
+sp
+sr
+st
+sv
+sw
+sy
+şl
+şn
+şv
+th
+tr
+ts
+tw
+ty
+vl
+wh
+zh
+zl
+zw
diff --git a/core/target/classes/zemberek/core/text/html-char-map-common.txt b/core/target/classes/zemberek/core/text/html-char-map-common.txt
new file mode 100644
index 00000000..e26c6eb1
--- /dev/null
+++ b/core/target/classes/zemberek/core/text/html-char-map-common.txt
@@ -0,0 +1,168 @@
+#287:ğ
+#286:Ğ
+#304:İ
+#305:ı
+#351:ş
+#350:Ş
+quot:"
+#34:"
+amp:&
+#38:&
+apos:'
+#39:'
+lt:<
+#60:<
+gt:>
+#62:>
+nbsp:
+#160:
+cent:¢
+#162:¢
+pound:£
+#163:£
+acute:´
+#180:´
+cedil:¸
+#184:¸
+raquo:»
+#187:»
+laquo:«
+#171:«
+Agrave:À
+#192:À
+Aacute:Á
+#193:Á
+Acirc:Â
+#194:Â
+Atilde:Ã
+#195:Ã
+Auml:Ä
+#196:Ä
+Aring:Å
+#197:Å
+Ccedil:Ç
+#199:Ç
+Egrave:È
+#200:È
+Eacute:É
+#201:É
+Ecirc:Ê
+#202:Ê
+Euml:Ë
+#203:Ë
+Igrave:Ì
+#204:Ì
+Iacute:Í
+#205:Í
+Icirc:Î
+#206:Î
+Iuml:Ï
+#207:Ï
+Ntilde:Ñ
+#209:Ñ
+Ograve:Ò
+#210:Ò
+Oacute:Ó
+#211:Ó
+Ocirc:Ô
+#212:Ô
+Otilde:Õ
+#213:Õ
+Ouml:Ö
+#214:Ö
+Ugrave:Ù
+#217:Ù
+Uacute:Ú
+#218:Ú
+Ucirc:Û
+#219:Û
+Uuml:Ü
+#220:Ü
+Yacute:Ý
+#221:Ý
+THORN:Þ
+#222:Þ
+szlig:ß
+#223:ß
+agrave:à
+#224:à
+aacute:á
+#225:á
+acirc:â
+#226:â
+atilde:ã
+#227:ã
+auml:ä
+#228:ä
+aring:å
+#229:å
+aelig:æ
+#230:æ
+ccedil:ç
+#231:ç
+egrave:è
+#232:è
+eacute:é
+#233:é
+ecirc:ê
+#234:ê
+euml:ë
+#235:ë
+igrave:ì
+#236:ì
+iacute:í
+#237:í
+icirc:î
+#238:î
+iuml:ï
+#239:ï
+eth:ð
+#240:ð
+ntilde:ñ
+#241:ñ
+ograve:ò
+#242:ò
+oacute:ó
+#243:ó
+ocirc:ô
+#244:ô
+otilde:õ
+#245:õ
+ouml:ö
+#246:ö
+ugrave:ù
+#249:ù
+uacute:ú
+#250:ú
+ucirc:û
+#251:û
+uuml:ü
+#252:ü
+lsquo:‘
+#8216:‘
+rsquo:’
+#8217:’
+sbquo:‚
+#8218:‚
+ldquo:“
+#8220:“
+rdquo:”
+#8221:”
+bdquo:„
+#8222:„
+hellip:…
+#8230:…
+prime:′
+#8242:′
+Prime:″
+#8243:″
+lsaquo:‹
+#8249:‹
+rsaquo:›
+#8250:›
+oline:‾
+#8254:‾
+frasl:⁄
+#8260:⁄
+euro:€
+#8364:€
\ No newline at end of file
diff --git a/core/target/classes/zemberek/core/text/html-char-map-full.txt b/core/target/classes/zemberek/core/text/html-char-map-full.txt
new file mode 100644
index 00000000..1ebbcbe0
--- /dev/null
+++ b/core/target/classes/zemberek/core/text/html-char-map-full.txt
@@ -0,0 +1,517 @@
+#287:ğ
+#286:Ğ
+#304:İ
+#305:ı
+#351:ş
+#350:Ş
+#145:'
+#146:'
+#147:"
+#148:"
+#151:-
+am:&
+#38:&
+apos:'
+#39:'
+quot:"
+#34:"
+lt:<
+#60:<
+gt:>
+#62:>
+nbsp:
+#160:
+iexcl:¡
+#161:¡
+cent:¢
+#162:¢
+pound:£
+#163:£
+curren:¤
+#164:¤
+yen:¥
+#165:¥
+brvbar:¦
+#166:¦
+sect:§
+#167:§
+uml:¨
+#168:¨
+copy:©
+#169:©
+ordf:ª
+#170:ª
+laquo:«
+#171:«
+not:¬
+#172:¬
+shy:
+#173:
+reg:®
+#174:®
+macr:¯
+#175:¯
+deg:°
+#176:°
+plusmn:±
+#177:±
+sup2:²
+#178:²
+sup3:³
+#179:³
+acute:´
+#180:´
+micro:µ
+#181:µ
+para:¶
+#182:¶
+middot:·
+#183:·
+cedil:¸
+#184:¸
+sup1:¹
+#185:¹
+ordm:º
+#186:º
+raquo:»
+#187:»
+frac14:¼
+#188:¼
+frac12:½
+#189:½
+frac34:¾
+#190:¾
+iquest:¿
+#191:¿
+Agrave:À
+#192:À
+Aacute:Á
+#193:Á
+Acirc:Â
+#194:Â
+Atilde:Ã
+#195:Ã
+Auml:Ä
+#196:Ä
+Aring:Å
+#197:Å
+AElig:Æ
+#198:Æ
+Ccedil:Ç
+#199:Ç
+Egrave:È
+#200:È
+Eacute:É
+#201:É
+Ecirc:Ê
+#202:Ê
+Euml:Ë
+#203:Ë
+Igrave:Ì
+#204:Ì
+Iacute:Í
+#205:Í
+Icirc:Î
+#206:Î
+Iuml:Ï
+#207:Ï
+ETH:Ð
+#208:Ð
+Ntilde:Ñ
+#209:Ñ
+Ograve:Ò
+#210:Ò
+Oacute:Ó
+#211:Ó
+Ocirc:Ô
+#212:Ô
+Otilde:Õ
+#213:Õ
+Ouml:Ö
+#214:Ö
+times:×
+#215:×
+Oslash:Ø
+#216:Ø
+Ugrave:Ù
+#217:Ù
+Uacute:Ú
+#218:Ú
+Ucirc:Û
+#219:Û
+Uuml:Ü
+#220:Ü
+Yacute:Ý
+#221:Ý
+THORN:Þ
+#222:Þ
+szlig:ß
+#223:ß
+agrave:à
+#224:à
+aacute:á
+#225:á
+acirc:â
+#226:â
+atilde:ã
+#227:ã
+auml:ä
+#228:ä
+aring:å
+#229:å
+aelig:æ
+#230:æ
+ccedil:ç
+#231:ç
+egrave:è
+#232:è
+eacute:é
+#233:é
+ecirc:ê
+#234:ê
+euml:ë
+#235:ë
+igrave:ì
+#236:ì
+iacute:í
+#237:í
+icirc:î
+#238:î
+iuml:ï
+#239:ï
+eth:ð
+#240:ð
+ntilde:ñ
+#241:ñ
+ograve:ò
+#242:ò
+oacute:ó
+#243:ó
+ocirc:ô
+#244:ô
+otilde:õ
+#245:õ
+ouml:ö
+#246:ö
+divide:÷
+#247:÷
+oslash:ø
+#248:ø
+ugrave:ù
+#249:ù
+uacute:ú
+#250:ú
+ucirc:û
+#251:û
+uuml:ü
+#252:ü
+yacute:ý
+#253:ý
+thorn:þ
+#254:þ
+yuml:ÿ
+#255:ÿ
+OElig:Œ
+#338:Œ
+oelig:œ
+#339:œ
+Scaron:Š
+#352:Š
+scaron:š
+#353:š
+Yuml:Ÿ
+#376:Ÿ
+fnof:ƒ
+#402:ƒ
+circ:ˆ
+#710:ˆ
+tilde:˜
+#732:˜
+Alpha:Α
+#913:Α
+Beta:Β
+#914:Β
+Gamma:Γ
+#915:Γ
+Delta:Δ
+#916:Δ
+Epsilon:Ε
+#917:Ε
+Zeta:Ζ
+#918:Ζ
+Eta:Η
+#919:Η
+Theta:Θ
+#920:Θ
+Iota:Ι
+#921:Ι
+Kappa:Κ
+#922:Κ
+Lambda:Λ
+#923:Λ
+Mu:Μ
+#924:Μ
+Nu:Ν
+#925:Ν
+Xi:Ξ
+#926:Ξ
+Omicron:Ο
+#927:Ο
+Pi:Π
+#928:Π
+Rho:Ρ
+#929:Ρ
+Sigma:Σ
+#931:Σ
+Tau:Τ
+#932:Τ
+Upsilon:Υ
+#933:Υ
+Phi:Φ
+#934:Φ
+Chi:Χ
+#935:Χ
+Psi:Ψ
+#936:Ψ
+Omega:Ω
+#937:Ω
+alpha:α
+#945:α
+beta:β
+#946:β
+gamma:γ
+#947:γ
+delta:δ
+#948:δ
+epsilon:ε
+#949:ε
+zeta:ζ
+#950:ζ
+eta:η
+#951:η
+theta:θ
+#952:θ
+iota:ι
+#953:ι
+kappa:κ
+#954:κ
+lambda:λ
+#955:λ
+mu:μ
+#956:μ
+nu:ν
+#957:ν
+xi:ξ
+#958:ξ
+omicron:ο
+#959:ο
+pi:π
+#960:π
+rho:ρ
+#961:ρ
+sigmaf:ς
+#962:ς
+sigma:σ
+#963:σ
+tau:τ
+#964:τ
+upsilon:υ
+#965:υ
+phi:φ
+#966:φ
+chi:χ
+#967:χ
+psi:ψ
+#968:ψ
+omega:ω
+#969:ω
+thetasym:ϑ
+#977:ϑ
+upsih:ϒ
+#978:ϒ
+piv:ϖ
+#982:ϖ
+ensp:
+#8194:
+emsp:
+#8195:
+thinsp:
+#8201:
+#zwnj:
+#8204:
+#zwj:
+#8205:
+#lrm:
+#8206:
+#rlm:
+#8207:
+ndash:–
+#8211:–
+mdash:—
+#8212:—
+lsquo:‘
+#8216:‘
+rsquo:’
+#8217:’
+sbquo:‚
+#8218:‚
+ldquo:“
+#8220:“
+rdquo:”
+#8221:”
+bdquo:„
+#8222:„
+dagger:†
+#8224:†
+Dagger:‡
+#8225:‡
+bull:•
+#8226:•
+hellip:…
+#8230:…
+permil:‰
+#8240:‰
+prime:′
+#8242:′
+Prime:″
+#8243:″
+lsaquo:‹
+#8249:‹
+rsaquo:›
+#8250:›
+oline:‾
+#8254:‾
+frasl:⁄
+#8260:⁄
+euro:€
+#8364:€
+image:ℑ
+#8465:ℑ
+weierp:℘
+#8472:℘
+real:ℜ
+#8476:ℜ
+trade:™
+#8482:™
+alefsym:ℵ
+#8501:ℵ
+larr:←
+#8592:←
+uarr:↑
+#8593:↑
+rarr:→
+#8594:→
+darr:↓
+#8595:↓
+harr:↔
+#8596:↔
+crarr:↵
+#8629:↵
+lArr:⇐
+#8656:⇐
+uArr:⇑
+#8657:⇑
+rArr:⇒
+#8658:⇒
+dArr:⇓
+#8659:⇓
+hArr:⇔
+#8660:⇔
+forall:∀
+#8704:∀
+part:∂
+#8706:∂
+exist:∃
+#8707:∃
+empty:∅
+#8709:∅
+nabla:∇
+#8711:∇
+isin:∈
+#8712:∈
+notin:∉
+#8713:∉
+ni:∋
+#8715:∋
+prod:∏
+#8719:∏
+sum:∑
+#8721:∑
+minus:−
+#8722:−
+lowast:∗
+#8727:∗
+radic:√
+#8730:√
+prop:∝
+#8733:∝
+infin:∞
+#8734:∞
+ang:∠
+#8736:∠
+and:∧
+#8743:∧
+or:∨
+#8744:∨
+cap:∩
+#8745:∩
+cup:∪
+#8746:∪
+int:∫
+#8747:∫
+there4:∴
+#8756:∴
+sim:∼
+#8764:∼
+cong:≅
+#8773:≅
+asymp:≈
+#8776:≈
+ne:≠
+#8800:≠
+equiv:≡
+#8801:≡
+le:≤
+#8804:≤
+ge:≥
+#8805:≥
+sub:⊂
+#8834:⊂
+sup:⊃
+#8835:⊃
+nsub:⊄
+#8836:⊄
+sube:⊆
+#8838:⊆
+supe:⊇
+#8839:⊇
+oplus:⊕
+#8853:⊕
+otimes:⊗
+#8855:⊗
+perp:⊥
+#8869:⊥
+sdot:⋅
+#8901:⋅
+lceil:⌈
+#8968:⌈
+rceil:⌉
+#8969:⌉
+lfloor:⌊
+#8970:⌊
+rfloor:⌋
+#8971:⌋
+lang:〈
+#9001:〈
+rang:〉
+#9002:〉
+loz:◊
+#9674:◊
+spades:♠
+#9824:♠
+clubs:♣
+#9827:♣
+hearts:♥
+#9829:♥
+diams:♦
+#9830:♦
\ No newline at end of file
diff --git a/core/target/classes/zemberek/core/text/special-char-to-simple-char.txt b/core/target/classes/zemberek/core/text/special-char-to-simple-char.txt
new file mode 100644
index 00000000..7d3c61fb
--- /dev/null
+++ b/core/target/classes/zemberek/core/text/special-char-to-simple-char.txt
@@ -0,0 +1,54 @@
+À:A
+Á:A
+Â:A
+Ã:A
+Ä:A
+Å:A
+È:E
+É:E
+Ê:E
+Ë:E
+Ì:I
+Í:I
+Î:İ
+Ï:I
+Ñ:N
+Ò:O
+Ó:O
+Ô:O
+Õ:O
+Ù:U
+Ú:U
+Û:U
+à:a
+á:a
+â:a
+ã:a
+ä:a
+å:a
+è:e
+é:e
+ê:e
+ë:e
+ì:i
+í:i
+î:i
+ï:i
+ñ:n
+ò:o
+ó:o
+ô:o
+õ:o
+ù:u
+ú:u
+û:u
+‘:'
+’:'
+“:"
+”:"
+…:...
+′:'
+″:"
+´:'
+»:"
+«:"
\ No newline at end of file
diff --git a/normalization/src/main/java/zemberek/normalization/SingleWordSpellChecker.java b/normalization/src/main/java/zemberek/normalization/SingleWordSpellChecker.java
index 6c9613f4..28a1692c 100644
--- a/normalization/src/main/java/zemberek/normalization/SingleWordSpellChecker.java
+++ b/normalization/src/main/java/zemberek/normalization/SingleWordSpellChecker.java
@@ -524,10 +524,10 @@ public boolean equals(Object o) {
@Override
public int hashCode() {
int result;
- long temp;
+ long penaltyBits;
result = node.hashCode();
- temp = Double.doubleToLongBits(penalty);
- result = 31 * result + (int) (temp ^ (temp >>> 32));
+ penaltyBits = Double.doubleToLongBits(penalty);
+ result = 31 * result + (int) (penaltyBits ^ (penaltyBits >>> 32));
result = 31 * result + index;
return result;
}