Implement multi-language words spam detection (#94)

* Implement multi-language words spam detection This update introduces a feature to detect spam messages that use words mixing characters from multiple languages. The feature is turned off by default and can be enabled with the `--multi-lang=` command line parameter. Tests have been added to confirm the new functionality and the README has been updated with its description. * add test with special symbols * add more real examples to tests
umputun · Jun 23, 2024 · 71637ab · 71637ab
1 parent 182ebe3
commit 71637ab
Show file tree

Hide file tree

Showing 5 changed files with 102 additions and 0 deletions.
diff --git a/README.md b/README.md
@@ -121,6 +121,11 @@ This option is disabled by default. If set to `true`, the bot will check the mes
 
 This option is disabled by default. If set to `true`, the bot will check the message for the presence of any image. If the message contains images but no text, it will be marked as spam.
 
+**Multi-language words**
+
+Using words that mix characters from multiple languages is a common spam technique. To detect such messages, the bot can check the message for the presence of such words. This option is disabled by default and can be enabled with the `--multi-lang=, [$MULTI_LANG]` parameter. Setting it to a number above `0` will enable this check, and the bot will mark the message as spam if the number of words containing characters from more than one language is equal to or greater than the specified number.
+
+
 ### Admin chat/group
 
 Optionally, user can specify the admin chat/group name/id. In this case, the bot will send a message to the admin chat as soon as a spammer is detected. Admin can see all the spam and all banned users and could also unban the user, confirm the ban or get results of spam checks by clicking a button directly on the message.
@@ -233,6 +238,7 @@ Success! The new status is: DISABLED. /help
  --min-msg-len= min message length to check (default: 50) [$MIN_MSG_LEN]
  --max-emoji= max emoji count in message, -1 to disable check (default: 2) [$MAX_EMOJI]
  --min-probability= min spam probability percent to ban (default: 50) [$MIN_PROBABILITY]
+ --multi-lang= number of words in different languages to consider as spam, 0 to disable (default: 0) [$MULTI_LANG]
  --paranoid paranoid mode, check all messages [$PARANOID]
  --first-messages-count= number of first messages to check (default: 1) [$FIRST_MESSAGES_COUNT]
  --training training mode, passive spam detection only [$TRAINING]

diff --git a/app/main.go b/app/main.go
@@ -90,6 +90,7 @@ type options struct {
  MinMsgLen int `long:"min-msg-len" env:"MIN_MSG_LEN" default:"50" description:"min message length to check"`
  MaxEmoji int `long:"max-emoji" env:"MAX_EMOJI" default:"2" description:"max emoji count in message, -1 to disable check"`
  MinSpamProbability float64 `long:"min-probability" env:"MIN_PROBABILITY" default:"50" description:"min spam probability percent to ban"`
+ MultiLangWords int `long:"multi-lang" env:"MULTI_LANG" default:"0" description:"number of words in different languages to consider as spam"`
 
  ParanoidMode bool `long:"paranoid" env:"PARANOID" description:"paranoid mode, check all messages"`
  FirstMessagesCount int `long:"first-messages-count" env:"FIRST_MESSAGES_COUNT" default:"1" description:"number of first messages to check"`
@@ -355,6 +356,7 @@ func activateServer(ctx context.Context, opts options, sf *bot.SpamFilter, loc *
  MetaLinksLimit: opts.Meta.LinksLimit,
  MetaLinksOnly: opts.Meta.LinksOnly,
  MetaImageOnly: opts.Meta.ImageOnly,
+ MultiLangLimit: opts.MultiLangWords,
  OpenAIEnabled: opts.OpenAI.Token != "",
  SamplesDataPath: opts.Files.SamplesDataPath,
  DynamicDataPath: opts.Files.DynamicDataPath,
@@ -402,6 +404,7 @@ func makeDetector(opts options) *tgspam.Detector {
  FirstMessageOnly: !opts.ParanoidMode,
  FirstMessagesCount: opts.FirstMessagesCount,
  OpenAIVeto: opts.OpenAI.Veto,
+ MultiLangWords: opts.MultiLangWords,
  }
 
  // FirstMessagesCount and ParanoidMode are mutually exclusive.

diff --git a/app/webapi/webapi.go b/app/webapi/webapi.go
@@ -69,6 +69,7 @@ type Settings struct {
  MetaLinksLimit int `json:"meta_links_limit"`
  MetaLinksOnly bool `json:"meta_links_only"`
  MetaImageOnly bool `json:"meta_image_only"`
+ MultiLangLimit int `json:"multi_lang_limit"`
  OpenAIEnabled bool `json:"openai_enabled"`
  SamplesDataPath string `json:"samples_data_path"`
  DynamicDataPath string `json:"dynamic_data_path"`

diff --git a/lib/tgspam/detector.go b/lib/tgspam/detector.go
@@ -14,6 +14,7 @@ import (
  "strings"
  "sync"
  "time"
+ "unicode"
 
  "github.com/umputun/tg-spam/lib/approved"
  "github.com/umputun/tg-spam/lib/spamcheck"
@@ -54,6 +55,7 @@ type Config struct {
  HTTPClient HTTPClient // http client to use for requests
  MinSpamProbability float64 // minimum spam probability to consider a message spam with classifier, if 0 - ignored
  OpenAIVeto bool // if true, openai will be used to veto spam messages, otherwise it will be used to veto ham messages
+ MultiLangWords int // number of multi-lingual words needed to consider a message spam, if 0 - check is disabled
 }
 
 // SampleUpdater is an interface for updating spam/ham samples on the fly.
@@ -141,6 +143,10 @@ func (d *Detector) Check(req spamcheck.Request) (spam bool, cr []spamcheck.Respo
  cr = append(cr, d.isCasSpam(req.UserID))
  }
 
+ if d.MultiLangWords > 0 {
+ cr = append(cr, d.isMultiLang(req.Msg))
+ }
+
  // check for message length exceed the minimum size, if min message length is set.
  // the check is done after first simple checks, because stop words and emojis can be triggered by short messages as well.
  if len([]rune(req.Msg)) < d.MinMsgLen {
@@ -571,3 +577,58 @@ func (d *Detector) isManyEmojis(msg string) spamcheck.Response {
  count := countEmoji(msg)
  return spamcheck.Response{Name: "emoji", Spam: count > d.MaxAllowedEmoji, Details: fmt.Sprintf("%d/%d", count, d.MaxAllowedEmoji)}
 }
+
+// isMultiLang counts the whitespace-separated words of msg that mix characters from more than one
+// of the scripts recognized below and reports spam if that count is equal to or greater than MultiLangWords.
+// The Details field of the returned response is formatted as "<count>/<MultiLangWords>".
+func (d *Detector) isMultiLang(msg string) spamcheck.Response {
+ // isMultiLingual reports whether a single word contains runes from at least two of the listed scripts.
+ isMultiLingual := func(word string) bool {
+ scripts := make(map[string]bool) // set of script names seen so far in this word
+ for _, r := range word {
+ // runes matching none of the cases below add nothing to the set and are effectively ignored
+ switch {
+ case unicode.Is(unicode.Latin, r):
+ scripts["Latin"] = true
+ case unicode.Is(unicode.Cyrillic, r):
+ scripts["Cyrillic"] = true
+ case unicode.Is(unicode.Greek, r):
+ scripts["Greek"] = true
+ case unicode.Is(unicode.Han, r):
+ scripts["Han"] = true
+ case unicode.Is(unicode.Arabic, r):
+ scripts["Arabic"] = true
+ case unicode.Is(unicode.Hebrew, r):
+ scripts["Hebrew"] = true
+ case unicode.Is(unicode.Devanagari, r):
+ scripts["Devanagari"] = true
+ case unicode.Is(unicode.Thai, r):
+ scripts["Thai"] = true
+ case unicode.Is(unicode.Hiragana, r) || unicode.Is(unicode.Katakana, r): // both kana scripts share one key
+ scripts["Japanese"] = true
+ case unicode.Is(unicode.Hangul, r):
+ scripts["Korean"] = true
+ case unicode.Is(unicode.Bengali, r):
+ scripts["Bengali"] = true
+ case unicode.Is(unicode.Armenian, r):
+ scripts["Armenian"] = true
+ case unicode.Is(unicode.Georgian, r):
+ scripts["Georgian"] = true
+ case r == 'ї': // NOTE(review): 'ї' looks like it is already matched by the unicode.Cyrillic case above, making this branch unreachable - confirm
+ scripts["Ukrainian"] = true
+ }
+ if len(scripts) > 1 { // a second script was seen, no need to scan the rest of the word
+ return true
+ }
+ }
+ return false
+ }
+
+ count := 0
+ words := strings.Fields(msg) // split on runs of whitespace
+ for _, word := range words {
+ if isMultiLingual(word) {
+ count++
+ }
+ }
+ if count >= d.MultiLangWords { // threshold is inclusive: exactly MultiLangWords mixed words is already spam
+ return spamcheck.Response{Name: "multi-lingual", Spam: true, Details: fmt.Sprintf("%d/%d", count, d.MultiLangWords)}
+ }
+ return spamcheck.Response{Name: "multi-lingual", Spam: false, Details: fmt.Sprintf("%d/%d", count, d.MultiLangWords)}
+}
diff --git a/lib/tgspam/detector_test.go b/lib/tgspam/detector_test.go
@@ -582,6 +582,37 @@ func TestDetector_CheckWithMeta(t *testing.T) {
  })
 }
 
+// TestDetector_CheckMultiLang runs Detector.Check with MultiLangWords set to 2 and verifies, for each input,
+// the spam verdict and the "<count>/2" details reported by the "multi-lingual" check.
+func TestDetector_CheckMultiLang(t *testing.T) {
+ // MaxAllowedEmoji: -1 is passed alongside the limit; presumably it turns the emoji check off so that
+ // the multi-lingual response is the only one returned - see require.Len below
+ d := NewDetector(Config{MultiLangWords: 2, MaxAllowedEmoji: -1})
+ tests := []struct {
+ name string
+ input string
+ count int // expected number of multi-lingual words, compared via the Details string
+ spam bool
+ }{
+ {"No MultiLang", "Hello, world!", 0, false},
+ {"One MultiLang", "Hi therе", 1, false},
+ {"Two MultiLang", "Gооd moфning", 2, true},
+ {"WithCyrillic no MultiLang", "Привет мир", 0, false},
+ {"WithCyrillic two MultiLang", "Привеt мip", 2, true},
+ {"WithCyrillic and special symbols", "Привет мир -@#$%^&*(_", 0, false},
+ {"WithCyrillic real example 1", "Ищем заинтeрeсoвaнных в зaрaбoткe нa кpиптoвaлютe. Всeгдa хотeли пoпpoбовать сeбя в этом, нo нe знали с чeго нaчaть? Тогдa вaм кo мнe 3aнимаемся aрбuтражeм, зaрабaтывaeм на paзницe курсов с минимaльныmи pискaми 💲Рынok oчень волатильный и нам это выгoднo, пo этoмe пишиte @vitalgoescra и зapaбaтывaйтe сo мнoй ", 31, true},
+ {"WithCyrillic real example 2", "В поuске паpтнеров, заuнтересованных в пассuвном дoходе с затpатой мuнuмум лuчного временu. Все деталu в лс", 10, true},
+ {"WithCyrillic real example 3", "Всем привет, есть простая шабашка, подойдет любому. Даю 15 тысяч. Накину на проезд, сигареты, обед. ", 0, false},
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ spam, cr := d.Check(spamcheck.Request{Msg: tt.input})
+ assert.Equal(t, tt.spam, spam)
+ require.Len(t, cr, 1) // exactly one check response is expected for this configuration
+ assert.Equal(t, "multi-lingual", cr[0].Name)
+ assert.Equal(t, tt.spam, cr[0].Spam)
+ assert.Equal(t, fmt.Sprintf("%d/2", tt.count), cr[0].Details)
+ })
+ }
+}
+
 func TestDetector_UpdateSpam(t *testing.T) {
  upd := &mocks.SampleUpdaterMock{
  AppendFunc: func(msg string) error {