Skip to content

Commit

Permalink
add support for multilingual detection with Coptic script
Browse files Browse the repository at this point in the history
  • Loading branch information
umputun committed Aug 2, 2024
1 parent 7dca4f6 commit c7a2b87
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 1 deletion.
6 changes: 5 additions & 1 deletion lib/tgspam/detector.go
Original file line number Diff line number Diff line change
Expand Up @@ -614,6 +614,8 @@ func (d *Detector) isMultiLang(msg string) spamcheck.Response {
scripts["Georgian"] = true
case r == 'ї':
scripts["Ukrainian"] = true
case unicode.In(r, unicode.Coptic):
scripts["Coptic"] = true
default:
// check for mathematical alphanumeric symbols and letterlike symbols
if unicode.In(r, unicode.Other_Math, unicode.Other_Alphabetic) ||
Expand All @@ -630,7 +632,9 @@ func (d *Detector) isMultiLang(msg string) spamcheck.Response {
}

count := 0
words := strings.Fields(msg)
words := strings.FieldsFunc(msg, func(r rune) bool {
return unicode.IsSpace(r) || r == '-'
})
for _, word := range words {
if isMultiLingual(word) {
count++
Expand Down
1 change: 1 addition & 0 deletions lib/tgspam/detector_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -633,6 +633,7 @@ func TestDetector_CheckMultiLang(t *testing.T) {
{"WithCyrillic real example 3", "Всем привет, есть простая шабашка, подойдет любому. Даю 15 тысяч. Накину на проезд, сигареты, обед. ", 0, false},
{"WithCyrillic and i", "Привет мiр", 0, false},
{"strange with cyrillic", "𝐇айди и𝐇𝐓и𝐦𝐇ы𝐞 ф𝐨𝐓𝐤и лю𝐛𝐨й д𝐞𝐁𝐲ш𝐤и ч𝐞𝐩𝐞𝟑 𝐛𝐨𝐓а", 7, true},
{"coptic capital leter", "✔️ⲠⲢⲞⳜⲈЙ-ⲖЮⳜⲨЮ-ⲆⲈⲂⲨⲰⲔⲨ...\n\nⲎⲀЙⲆⳘ ⲤⲔⲢЫⲦⲈ ⲂⳘⲆⲞⲤЫ-ⲪⲞⲦⲞⳠⲔⳘ ⳘⲎⲦⳘⲘⲎⲞⲄⲞ-ⲬⲀⲢⲀⲔⲦⲈⲢⲀ..\n@INTIM0CHKI110DE\n\n", 5, true},
}

for _, tt := range tests {
Expand Down

0 comments on commit c7a2b87

Please sign in to comment.