Skip to content

Commit

Permalink
universal detection of scripts for multi-lingual check
Browse files — browse the repository at this point in the history
  • Loading branch information
umputun committed Aug 4, 2024
1 parent 5424f66 commit 68dcc02
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 38 deletions.
64 changes: 28 additions & 36 deletions lib/tgspam/detector.go
Original file line number Diff line number Diff line change
Expand Up @@ -583,50 +583,42 @@ func (d *Detector) isMultiLang(msg string) spamcheck.Response {
isMultiLingual := func(word string) bool {
scripts := make(map[string]bool)
for _, r := range word {
switch {
case r == 'i': // skip 'i' because it's used in many languages
if r == 'i' || unicode.IsSpace(r) { // skip 'i' (common in many langs) and spaces
continue
case unicode.Is(unicode.Latin, r) || unicode.In(r, unicode.Number):
scripts["Latin"] = true
case unicode.Is(unicode.Cyrillic, r):
scripts["Cyrillic"] = true
case unicode.Is(unicode.Greek, r):
scripts["Greek"] = true
case unicode.Is(unicode.Han, r):
scripts["Han"] = true
case unicode.Is(unicode.Arabic, r):
scripts["Arabic"] = true
case unicode.Is(unicode.Hebrew, r):
scripts["Hebrew"] = true
case unicode.Is(unicode.Devanagari, r):
scripts["Devanagari"] = true
case unicode.Is(unicode.Thai, r):
scripts["Thai"] = true
case unicode.Is(unicode.Hiragana, r) || unicode.Is(unicode.Katakana, r):
scripts["Japanese"] = true
case unicode.Is(unicode.Hangul, r):
scripts["Korean"] = true
case unicode.Is(unicode.Bengali, r):
scripts["Bengali"] = true
case unicode.Is(unicode.Armenian, r):
scripts["Armenian"] = true
case unicode.Is(unicode.Georgian, r):
scripts["Georgian"] = true
case r == 'ї':
scripts["Ukrainian"] = true
case unicode.In(r, unicode.Coptic):
scripts["Coptic"] = true
default:
}

scriptFound := false
for name, table := range unicode.Scripts {
if unicode.Is(table, r) {
if name != "Common" && name != "Inherited" {
scripts[name] = true
if len(scripts) > 1 {
return true
}
scriptFound = true
}
break
}
}

// if no specific script was found, it might be a symbol or punctuation
if !scriptFound {
// check for mathematical alphanumeric symbols and letterlike symbols
if unicode.In(r, unicode.Other_Math, unicode.Other_Alphabetic) ||
(r >= '\U0001D400' && r <= '\U0001D7FF') || // Mathematical Alphanumeric Symbols
(r >= '\u2100' && r <= '\u214F') { // Letterlike Symbols
scripts["Mathematical"] = true
if len(scripts) > 1 {
return true
}
} else if !unicode.IsPunct(r) && !unicode.IsSymbol(r) {
// if it's not punctuation or a symbol, count it as "Other"
scripts["Other"] = true
if len(scripts) > 1 {
return true
}
}
}
if len(scripts) > 1 {
return true
}
}
return false
}
Expand Down
5 changes: 3 additions & 2 deletions lib/tgspam/detector_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -622,7 +622,7 @@ func TestDetector_CheckMultiLang(t *testing.T) {
count int
spam bool
}{
{"No MultiLang", "Hello, world!", 0, false},
{"No MultiLang", "Hello, world!\n 12345-980! _", 0, false},
{"One MultiLang", "Hi therе", 1, false},
{"Two MultiLang", "Gооd moфning", 2, true},
{"WithCyrillic no MultiLang", "Привет мир", 0, false},
Expand All @@ -633,7 +633,8 @@ func TestDetector_CheckMultiLang(t *testing.T) {
{"WithCyrillic real example 3", "Всем привет, есть простая шабашка, подойдет любому. Даю 15 тысяч. Накину на проезд, сигареты, обед. ", 0, false},
{"WithCyrillic and i", "Привет мiр", 0, false},
{"strange with cyrillic", "𝐇айди и𝐇𝐓и𝐦𝐇ы𝐞 ф𝐨𝐓𝐤и лю𝐛𝐨й д𝐞𝐁𝐲ш𝐤и ч𝐞𝐩𝐞𝟑 𝐛𝐨𝐓а", 7, true},
{"coptic capital leter", "✔️ⲠⲢⲞⳜⲈЙ-ⲖЮⳜⲨЮ-ⲆⲈⲂⲨⲰⲔⲨ...\n\nⲎⲀЙⲆⳘ ⲤⲔⲢЫⲦⲈ ⲂⳘⲆⲞⲤЫ-ⲪⲞⲦⲞⳠⲔⳘ ⳘⲎⲦⳘⲘⲎⲞⲄⲞ-ⲬⲀⲢⲀⲔⲦⲈⲢⲀ..\n@INTIM0CHKI110DE\n\n", 5, true},
{"coptic capital leter", "✔️ⲠⲢⲞⳜⲈЙ-ⲖЮⳜⲨЮ-ⲆⲈⲂⲨⲰⲔⲨ...\n\nⲎⲀЙⲆⳘ ⲤⲔⲢЫⲦⲈ ⲂⳘⲆⲞⲤЫ-ⲪⲞⲦⲞⳠⲔⳘ ⳘⲎⲦⳘⲘⲎⲞⲄⲞ-ⲬⲀⲢⲀⲔⲦⲈⲢⲀ..\n@INTIM0CHKI110DE\n\n", 6, true},
{"mix with gothic, cyrillic and greek", "𐌿РОВЕРЬ ЛЮБУЮ НА НАЛИЧИЕ ПОШЛЫХ ΦΟͲΟ-ΒͶДξΟ, 🍑НАБЕРИ В Т𐌲 𐌿ОИСКЕ СЛОВО: 30GRL", 5, true},
}

for _, tt := range tests {
Expand Down

0 comments on commit 68dcc02

Please sign in to comment.