From c7a2b8793592b3f1378f439df77d6346ac8242a6 Mon Sep 17 00:00:00 2001 From: Umputun Date: Fri, 2 Aug 2024 11:26:53 -0500 Subject: [PATCH] add support for multilingual detection with Coptic script --- lib/tgspam/detector.go | 6 +++++- lib/tgspam/detector_test.go | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/lib/tgspam/detector.go b/lib/tgspam/detector.go index 4c8e6d9..c9f229f 100644 --- a/lib/tgspam/detector.go +++ b/lib/tgspam/detector.go @@ -614,6 +614,8 @@ func (d *Detector) isMultiLang(msg string) spamcheck.Response { scripts["Georgian"] = true case r == 'ї': scripts["Ukrainian"] = true + case unicode.In(r, unicode.Coptic): + scripts["Coptic"] = true default: // check for mathematical alphanumeric symbols and letterlike symbols if unicode.In(r, unicode.Other_Math, unicode.Other_Alphabetic) || @@ -630,7 +632,9 @@ func (d *Detector) isMultiLang(msg string) spamcheck.Response { } count := 0 - words := strings.Fields(msg) + words := strings.FieldsFunc(msg, func(r rune) bool { + return unicode.IsSpace(r) || r == '-' + }) for _, word := range words { if isMultiLingual(word) { count++ diff --git a/lib/tgspam/detector_test.go b/lib/tgspam/detector_test.go index e4b8a64..6ec9688 100644 --- a/lib/tgspam/detector_test.go +++ b/lib/tgspam/detector_test.go @@ -633,6 +633,7 @@ func TestDetector_CheckMultiLang(t *testing.T) { {"WithCyrillic real example 3", "Всем привет, есть простая шабашка, подойдет любому. Даю 15 тысяч. Накину на проезд, сигареты, обед. ", 0, false}, {"WithCyrillic and i", "Привет мiр", 0, false}, {"strange with cyrillic", "𝐇айди и𝐇𝐓и𝐦𝐇ы𝐞 ф𝐨𝐓𝐤и лю𝐛𝐨й д𝐞𝐁𝐲ш𝐤и ч𝐞𝐩𝐞𝟑 𝐛𝐨𝐓а", 7, true}, + {"coptic capital leter", "✔️ⲠⲢⲞⳜⲈЙ-ⲖЮⳜⲨЮ-ⲆⲈⲂⲨⲰⲔⲨ...\n\nⲎⲀЙⲆⳘ ⲤⲔⲢЫⲦⲈ ⲂⳘⲆⲞⲤЫ-ⲪⲞⲦⲞⳠⲔⳘ ⳘⲎⲦⳘⲘⲎⲞⲄⲞ-ⲬⲀⲢⲀⲔⲦⲈⲢⲀ..\n@INTIM0CHKI110DE\n\n", 5, true}, } for _, tt := range tests {