diff --git a/.idea/checkstyle-idea.xml b/.idea/checkstyle-idea.xml new file mode 100644 index 00000000..760d8299 --- /dev/null +++ b/.idea/checkstyle-idea.xml @@ -0,0 +1,16 @@ + + + + 10.9.3 + JavaOnly + true + + + \ No newline at end of file diff --git a/.idea/compiler.xml b/.idea/compiler.xml new file mode 100644 index 00000000..1339a282 --- /dev/null +++ b/.idea/compiler.xml @@ -0,0 +1,25 @@ + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/encodings.xml b/.idea/encodings.xml new file mode 100644 index 00000000..c3e3257d --- /dev/null +++ b/.idea/encodings.xml @@ -0,0 +1,33 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/jarRepositories.xml b/.idea/jarRepositories.xml new file mode 100644 index 00000000..c3ddb503 --- /dev/null +++ b/.idea/jarRepositories.xml @@ -0,0 +1,25 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 00000000..9b865bc2 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,13 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 00000000..94a25f7f --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.idea/workspace.xml b/.idea/workspace.xml new file mode 100644 index 00000000..f182fe2e --- /dev/null +++ b/.idea/workspace.xml @@ -0,0 +1,70 @@ + + + + + + + + + + + + + + + + + + + + + + + + + 1680791954209 + + + + + + + + + + + \ No newline at end of file diff --git a/apps/src/main/java/zemberek/apps/ApplicationRunner.java b/apps/src/main/java/zemberek/apps/ApplicationRunner.java index e637f19b..59cc10ea 100644 --- a/apps/src/main/java/zemberek/apps/ApplicationRunner.java +++ b/apps/src/main/java/zemberek/apps/ApplicationRunner.java @@ -39,7 +39,8 @@ private static void listApplications(List apps) { String simpleName = app.getClass().getSimpleName(); System.out.println(simpleName); System.out.println(Strings.repeat("-", simpleName.length())); - String wrapped = wrap(app.description(), 80); + int wrappedDescriptionLengthLimit = 80; + String wrapped = wrap(app.description(), wrappedDescriptionLengthLimit); System.out.println(wrapped); System.out.println(); } diff --git a/apps/src/main/java/zemberek/apps/fasttext/ClassificationConsole.java b/apps/src/main/java/zemberek/apps/fasttext/ClassificationConsole.java index 2cb356b0..ca73db01 100644 --- a/apps/src/main/java/zemberek/apps/fasttext/ClassificationConsole.java +++ b/apps/src/main/java/zemberek/apps/fasttext/ClassificationConsole.java @@ -119,15 +119,9 @@ private String replaceWordsWithLemma(String sentence) { private String removeNonWords(String sentence) { List docTokens = TurkishTokenizer.DEFAULT.tokenize(sentence); List reduced = new ArrayList<>(docTokens.size()); + for (Token token : docTokens) { - if ( - token.getType() == Type.PercentNumeral || - token.getType() == Type.Number || - token.getType() == Type.Punctuation || - token.getType() == Type.RomanNumeral || - token.getType() == Type.Time || - token.getType() == Type.UnknownWord || - token.getType() == Type.Unknown) { + if (isTokenNonWord(token)) { if (!token.getText().contains("__")) { continue; } @@ -138,6 +132,16 @@ private String removeNonWords(String sentence) { return String.join(" ", reduced); } + private Boolean isTokenNonWord(Token token) { + return token.getType() == Type.PercentNumeral || + token.getType() == Type.Number || + token.getType() == Type.Punctuation || + token.getType() == Type.RomanNumeral || + token.getType() == Type.Time || + token.getType() == Type.UnknownWord || + token.getType() == Type.Unknown; + } + public static void main(String[] args) { new ClassificationConsole().execute(args); } diff --git a/core/target/classes/zemberek/core/syllable/accepted-syllable-prefixes b/core/target/classes/zemberek/core/syllable/accepted-syllable-prefixes new file mode 100644 index 00000000..b49b0e80 --- /dev/null +++ b/core/target/classes/zemberek/core/syllable/accepted-syllable-prefixes @@ -0,0 +1,62 @@ +bl +br +ch +cl +cr +cy +dj +dr +dz +fl +fr +gh +gl +gr +gy +hr +hy +kh +kl +kn +kr +ks +ky +ll +ly +mb +mc +mn +my +ph +pl +pn +pr +ps +pt +rh +sc +sf +sh +sk +sl +sm +sn +sp +sr +st +sv +sw +sy +şl +şn +şv +th +tr +ts +tw +ty +vl +wh +zh +zl +zw diff --git a/core/target/classes/zemberek/core/text/html-char-map-common.txt b/core/target/classes/zemberek/core/text/html-char-map-common.txt new file mode 100644 index 00000000..e26c6eb1 --- /dev/null +++ b/core/target/classes/zemberek/core/text/html-char-map-common.txt @@ -0,0 +1,168 @@ +#287:ğ +#286:Ğ +#304:İ +#305:ı +#351:ş +#350:Ş +quot:" +#34:" +amp:& +#38:& +apos:' +#39:' +lt:< +#60:< +gt:> +#62:> +nbsp: +#160: +cent:¢ +#162:¢ +pound:£ +#163:£ +acute:´ +#180:´ +cedil:¸ +#184:¸ +raquo:» +#187:» +laquo:« +#171:« +Agrave:À +#192:À +Aacute:Á +#193:Á +Acirc: +#194: +Atilde:à +#195:à +Auml:Ä +#196:Ä +Aring:Å +#197:Å +Ccedil:Ç +#199:Ç +Egrave:È +#200:È +Eacute:É +#201:É +Ecirc:Ê +#202:Ê +Euml:Ë +#203:Ë +Igrave:Ì +#204:Ì +Iacute:Í +#205:Í +Icirc:Î +#206:Î +Iuml:Ï +#207:Ï +Ntilde:Ñ +#209:Ñ +Ograve:Ò +#210:Ò +Oacute:Ó +#211:Ó +Ocirc:Ô +#212:Ô +Otilde:Õ +#213:Õ +Ouml:Ö +#214:Ö +Ugrave:Ù +#217:Ù +Uacute:Ú +#218:Ú +Ucirc:Û +#219:Û +Uuml:Ü +#220:Ü +Yacute:Ý +#221:Ý +THORN:Þ +#222:Þ +szlig:ß +#223:ß +agrave:à +#224:à +aacute:á +#225:á +acirc:â +#226:â +atilde:ã +#227:ã +auml:ä +#228:ä +aring:å +#229:å +aelig:æ +#230:æ +ccedil:ç +#231:ç +egrave:è +#232:è +eacute:é +#233:é +ecirc:ê +#234:ê +euml:ë +#235:ë +igrave:ì +#236:ì +iacute:í +#237:í +icirc:î +#238:î +iuml:ï +#239:ï +eth:ð +#240:ð +ntilde:ñ +#241:ñ +ograve:ò +#242:ò +oacute:ó +#243:ó +ocirc:ô +#244:ô +otilde:õ +#245:õ +ouml:ö +#246:ö +ugrave:ù +#249:ù +uacute:ú +#250:ú +ucirc:û +#251:û +uuml:ü +#252:ü +lsquo:‘ +#8216:‘ +rsquo:’ +#8217:’ +sbquo:‚ +#8218:‚ +ldquo:“ +#8220:“ +rdquo:” +#8221:” +bdquo:„ +#8222:„ +hellip:… +#8230:… +prime:′ +#8242:′ +Prime:″ +#8243:″ +lsaquo:‹ +#8249:‹ +rsaquo:› +#8250:› +oline:‾ +#8254:‾ +frasl:⁄ +#8260:⁄ +euro:€ +#8364:€ \ No newline at end of file diff --git a/core/target/classes/zemberek/core/text/html-char-map-full.txt b/core/target/classes/zemberek/core/text/html-char-map-full.txt new file mode 100644 index 00000000..1ebbcbe0 --- /dev/null +++ b/core/target/classes/zemberek/core/text/html-char-map-full.txt @@ -0,0 +1,517 @@ +#287:ğ +#286:Ğ +#304:İ +#305:ı +#351:ş +#350:Ş +#145:' +#146:' +#147:" +#148:" +#151:- +am:& +#38:& +apos:' +#39:' +quot:" +#34:" +lt:< +#60:< +gt:> +#62:> +nbsp: +#160: +iexcl:¡ +#161:¡ +cent:¢ +#162:¢ +pound:£ +#163:£ +curren:¤ +#164:¤ +yen:¥ +#165:¥ +brvbar:¦ +#166:¦ +sect:§ +#167:§ +uml:¨ +#168:¨ +copy:© +#169:© +ordf:ª +#170:ª +laquo:« +#171:« +not:¬ +#172:¬ +shy: +#173: +reg:® +#174:® +macr:¯ +#175:¯ +deg:° +#176:° +plusmn:± +#177:± +sup2:² +#178:² +sup3:³ +#179:³ +acute:´ +#180:´ +micro:µ +#181:µ +para:¶ +#182:¶ +middot:· +#183:· +cedil:¸ +#184:¸ +sup1:¹ +#185:¹ +ordm:º +#186:º +raquo:» +#187:» +frac14:¼ +#188:¼ +frac12:½ +#189:½ +frac34:¾ +#190:¾ +iquest:¿ +#191:¿ +Agrave:À +#192:À +Aacute:Á +#193:Á +Acirc: +#194: +Atilde:à +#195:à +Auml:Ä +#196:Ä +Aring:Å +#197:Å +AElig:Æ +#198:Æ +Ccedil:Ç +#199:Ç +Egrave:È +#200:È +Eacute:É +#201:É +Ecirc:Ê +#202:Ê +Euml:Ë +#203:Ë +Igrave:Ì +#204:Ì +Iacute:Í +#205:Í +Icirc:Î +#206:Î +Iuml:Ï +#207:Ï +ETH:Ð +#208:Ð +Ntilde:Ñ +#209:Ñ +Ograve:Ò +#210:Ò +Oacute:Ó +#211:Ó +Ocirc:Ô +#212:Ô +Otilde:Õ +#213:Õ +Ouml:Ö +#214:Ö +times:× +#215:× +Oslash:Ø +#216:Ø +Ugrave:Ù +#217:Ù +Uacute:Ú +#218:Ú +Ucirc:Û +#219:Û +Uuml:Ü +#220:Ü +Yacute:Ý +#221:Ý +THORN:Þ +#222:Þ +szlig:ß +#223:ß +agrave:à +#224:à +aacute:á +#225:á +acirc:â +#226:â +atilde:ã +#227:ã +auml:ä +#228:ä +aring:å +#229:å +aelig:æ +#230:æ +ccedil:ç +#231:ç +egrave:è +#232:è +eacute:é +#233:é +ecirc:ê +#234:ê +euml:ë +#235:ë +igrave:ì +#236:ì +iacute:í +#237:í +icirc:î +#238:î +iuml:ï +#239:ï +eth:ð +#240:ð +ntilde:ñ +#241:ñ +ograve:ò +#242:ò +oacute:ó +#243:ó +ocirc:ô +#244:ô +otilde:õ +#245:õ +ouml:ö +#246:ö +divide:÷ +#247:÷ +oslash:ø +#248:ø +ugrave:ù +#249:ù +uacute:ú +#250:ú +ucirc:û +#251:û +uuml:ü +#252:ü +yacute:ý +#253:ý +thorn:þ +#254:þ +yuml:ÿ +#255:ÿ +OElig:Œ +#338:Œ +oelig:œ +#339:œ +Scaron:Š +#352:Š +scaron:š +#353:š +Yuml:Ÿ +#376:Ÿ +fnof:ƒ +#402:ƒ +circ:ˆ +#710:ˆ +tilde:˜ +#732:˜ +Alpha:Α +#913:Α +Beta:Β +#914:Β +Gamma:Γ +#915:Γ +Delta:Δ +#916:Δ +Epsilon:Ε +#917:Ε +Zeta:Ζ +#918:Ζ +Eta:Η +#919:Η +Theta:Θ +#920:Θ +Iota:Ι +#921:Ι +Kappa:Κ +#922:Κ +Lambda:Λ +#923:Λ +Mu:Μ +#924:Μ +Nu:Ν +#925:Ν +Xi:Ξ +#926:Ξ +Omicron:Ο +#927:Ο +Pi:Π +#928:Π +Rho:Ρ +#929:Ρ +Sigma:Σ +#931:Σ +Tau:Τ +#932:Τ +Upsilon:Υ +#933:Υ +Phi:Φ +#934:Φ +Chi:Χ +#935:Χ +Psi:Ψ +#936:Ψ +Omega:Ω +#937:Ω +alpha:α +#945:α +beta:β +#946:β +gamma:γ +#947:γ +delta:δ +#948:δ +epsilon:ε +#949:ε +zeta:ζ +#950:ζ +eta:η +#951:η +theta:θ +#952:θ +iota:ι +#953:ι +kappa:κ +#954:κ +lambda:λ +#955:λ +mu:μ +#956:μ +nu:ν +#957:ν +xi:ξ +#958:ξ +omicron:ο +#959:ο +pi:π +#960:π +rho:ρ +#961:ρ +sigmaf:ς +#962:ς +sigma:σ +#963:σ +tau:τ +#964:τ +upsilon:υ +#965:υ +phi:φ +#966:φ +chi:χ +#967:χ +psi:ψ +#968:ψ +omega:ω +#969:ω +thetasym:ϑ +#977:ϑ +upsih:ϒ +#978:ϒ +piv:ϖ +#982:ϖ +ensp:  +#8194:  +emsp:  +#8195:  +thinsp:  +#8201:  +#zwnj:  +#8204:  +#zwj:  +#8205:  +#lrm:  +#8206:  +#rlm:  +#8207:  +ndash:– +#8211:– +mdash:— +#8212:— +lsquo:‘ +#8216:‘ +rsquo:’ +#8217:’ +sbquo:‚ +#8218:‚ +ldquo:“ +#8220:“ +rdquo:” +#8221:” +bdquo:„ +#8222:„ +dagger:† +#8224:† +Dagger:‡ +#8225:‡ +bull:• +#8226:• +hellip:… +#8230:… +permil:‰ +#8240:‰ +prime:′ +#8242:′ +Prime:″ +#8243:″ +lsaquo:‹ +#8249:‹ +rsaquo:› +#8250:› +oline:‾ +#8254:‾ +frasl:⁄ +#8260:⁄ +euro:€ +#8364:€ +image:ℑ +#8465:ℑ +weierp:℘ +#8472:℘ +real:ℜ +#8476:ℜ +trade:™ +#8482:™ +alefsym:ℵ +#8501:ℵ +larr:← +#8592:← +uarr:↑ +#8593:↑ +rarr:→ +#8594:→ +darr:↓ +#8595:↓ +harr:↔ +#8596:↔ +crarr:↵ +#8629:↵ +lArr:⇐ +#8656:⇐ +uArr:⇑ +#8657:⇑ +rArr:⇒ +#8658:⇒ +dArr:⇓ +#8659:⇓ +hArr:⇔ +#8660:⇔ +forall:∀ +#8704:∀ +part:∂ +#8706:∂ +exist:∃ +#8707:∃ +empty:∅ +#8709:∅ +nabla:∇ +#8711:∇ +isin:∈ +#8712:∈ +notin:∉ +#8713:∉ +ni:∋ +#8715:∋ +prod:∏ +#8719:∏ +sum:∑ +#8721:∑ +minus:− +#8722:− +lowast:∗ +#8727:∗ +radic:√ +#8730:√ +prop:∝ +#8733:∝ +infin:∞ +#8734:∞ +ang:∠ +#8736:∠ +and:∧ +#8743:∧ +or:∨ +#8744:∨ +cap:∩ +#8745:∩ +cup:∪ +#8746:∪ +int:∫ +#8747:∫ +there4:∴ +#8756:∴ +sim:∼ +#8764:∼ +cong:≅ +#8773:≅ +asymp:≈ +#8776:≈ +ne:≠ +#8800:≠ +equiv:≡ +#8801:≡ +le:≤ +#8804:≤ +ge:≥ +#8805:≥ +sub:⊂ +#8834:⊂ +sup:⊃ +#8835:⊃ +nsub:⊄ +#8836:⊄ +sube:⊆ +#8838:⊆ +supe:⊇ +#8839:⊇ +oplus:⊕ +#8853:⊕ +otimes:⊗ +#8855:⊗ +perp:⊥ +#8869:⊥ +sdot:⋅ +#8901:⋅ +lceil:⌈ +#8968:⌈ +rceil:⌉ +#8969:⌉ +lfloor:⌊ +#8970:⌊ +rfloor:⌋ +#8971:⌋ +lang:〈 +#9001:〈 +rang:〉 +#9002:〉 +loz:◊ +#9674:◊ +spades:♠ +#9824:♠ +clubs:♣ +#9827:♣ +hearts:♥ +#9829:♥ +diams:♦ +#9830:♦ \ No newline at end of file diff --git a/core/target/classes/zemberek/core/text/special-char-to-simple-char.txt b/core/target/classes/zemberek/core/text/special-char-to-simple-char.txt new file mode 100644 index 00000000..7d3c61fb --- /dev/null +++ b/core/target/classes/zemberek/core/text/special-char-to-simple-char.txt @@ -0,0 +1,54 @@ +À:A +Á:A +Â:A +Ã:A +Ä:A +Å:A +È:E +É:E +Ê:E +Ë:E +Ì:I +Í:I +Î:İ +Ï:I +Ñ:N +Ò:O +Ó:O +Ô:O +Õ:O +Ù:U +Ú:U +Û:U +à:a +á:a +â:a +ã:a +ä:a +å:a +è:e +é:e +ê:e +ë:e +ì:i +í:i +î:i +ï:i +ñ:n +ò:o +ó:o +ô:o +õ:o +ù:u +ú:u +û:u +‘:' +’:' +“:" +”:" +…:... +′:' +″:" +´:' +»:" +«:" \ No newline at end of file diff --git a/normalization/src/main/java/zemberek/normalization/SingleWordSpellChecker.java b/normalization/src/main/java/zemberek/normalization/SingleWordSpellChecker.java index 6c9613f4..28a1692c 100644 --- a/normalization/src/main/java/zemberek/normalization/SingleWordSpellChecker.java +++ b/normalization/src/main/java/zemberek/normalization/SingleWordSpellChecker.java @@ -524,10 +524,10 @@ public boolean equals(Object o) { @Override public int hashCode() { int result; - long temp; + long penaltyAsLongVariable; result = node.hashCode(); - temp = Double.doubleToLongBits(penalty); - result = 31 * result + (int) (temp ^ (temp >>> 32)); + penaltyAsLongVariable = Double.doubleToLongBits(penalty); + result = 31 * result + (int) (penaltyAsLongVariable ^ (penaltyAsLongVariable >>> 32)); result = 31 * result + index; return result; }