Implement multi-language words spam detection

This update introduces a feature to detect spam messages that use words mixing characters from multiple languages. The feature is turned off by default and can be enabled with the `--multi-lang=` command line parameter. Tests have been added to confirm the new functionality and the README has been updated with its description.
umputun · Jun 23, 2024 · 3a1375d · 3a1375d
1 parent 182ebe3
commit 3a1375d
Show file tree

Hide file tree

Showing 5 changed files with 98 additions and 0 deletions.
diff --git a/README.md b/README.md
@@ -121,6 +121,11 @@ This option is disabled by default. If set to `true`, the bot will check the mes
 
 This option is disabled by default. If set to `true`, the bot will check the message for the presence of any image. If the message contains images but no text, it will be marked as spam.
 
+**Multi-language words**
+
+Using words that mix characters from multiple languages is a common spam technique. To detect such messages, the bot can count the words that mix characters from more than one language. This option is disabled by default and can be enabled with the `--multi-lang=, [$MULTI_LANG]` parameter. Setting it to a number above `0` enables the check, and the bot will mark the message as spam if it contains at least the specified number of such words.
+
+
 ### Admin chat/group
 
 Optionally, user can specify the admin chat/group name/id. In this case, the bot will send a message to the admin chat as soon as a spammer is detected. Admin can see all the spam and all banned users and could also unban the user, confirm the ban or get results of spam checks by clicking a button directly on the message.
@@ -233,6 +238,7 @@ Success! The new status is: DISABLED. /help
  --min-msg-len= min message length to check (default: 50) [$MIN_MSG_LEN]
  --max-emoji= max emoji count in message, -1 to disable check (default: 2) [$MAX_EMOJI]
  --min-probability= min spam probability percent to ban (default: 50) [$MIN_PROBABILITY]
+ --multi-lang= number of words in different languages to consider as spam, 0 to disable (default: 0) [$MULTI_LANG]
  --paranoid paranoid mode, check all messages [$PARANOID]
  --first-messages-count= number of first messages to check (default: 1) [$FIRST_MESSAGES_COUNT]
  --training training mode, passive spam detection only [$TRAINING]

diff --git a/app/main.go b/app/main.go
@@ -90,6 +90,7 @@ type options struct {
  MinMsgLen int `long:"min-msg-len" env:"MIN_MSG_LEN" default:"50" description:"min message length to check"`
  MaxEmoji int `long:"max-emoji" env:"MAX_EMOJI" default:"2" description:"max emoji count in message, -1 to disable check"`
  MinSpamProbability float64 `long:"min-probability" env:"MIN_PROBABILITY" default:"50" description:"min spam probability percent to ban"`
+ MultiLangWords int `long:"multi-lang" env:"MULTI_LANG" default:"0" description:"number of words in different languages to consider as spam"`
 
  ParanoidMode bool `long:"paranoid" env:"PARANOID" description:"paranoid mode, check all messages"`
  FirstMessagesCount int `long:"first-messages-count" env:"FIRST_MESSAGES_COUNT" default:"1" description:"number of first messages to check"`
@@ -355,6 +356,7 @@ func activateServer(ctx context.Context, opts options, sf *bot.SpamFilter, loc *
  MetaLinksLimit: opts.Meta.LinksLimit,
  MetaLinksOnly: opts.Meta.LinksOnly,
  MetaImageOnly: opts.Meta.ImageOnly,
+ MultiLangLimit: opts.MultiLangWords,
  OpenAIEnabled: opts.OpenAI.Token != "",
  SamplesDataPath: opts.Files.SamplesDataPath,
  DynamicDataPath: opts.Files.DynamicDataPath,
@@ -402,6 +404,7 @@ func makeDetector(opts options) *tgspam.Detector {
  FirstMessageOnly: !opts.ParanoidMode,
  FirstMessagesCount: opts.FirstMessagesCount,
  OpenAIVeto: opts.OpenAI.Veto,
+ MultiLangWords: opts.MultiLangWords,
  }
 
  // FirstMessagesCount and ParanoidMode are mutually exclusive.

diff --git a/app/webapi/webapi.go b/app/webapi/webapi.go
@@ -69,6 +69,7 @@ type Settings struct {
  MetaLinksLimit int `json:"meta_links_limit"`
  MetaLinksOnly bool `json:"meta_links_only"`
  MetaImageOnly bool `json:"meta_image_only"`
+ MultiLangLimit int `json:"multi_lang_limit"`
  OpenAIEnabled bool `json:"openai_enabled"`
  SamplesDataPath string `json:"samples_data_path"`
  DynamicDataPath string `json:"dynamic_data_path"`

diff --git a/lib/tgspam/detector.go b/lib/tgspam/detector.go
@@ -14,6 +14,7 @@ import (
  "strings"
  "sync"
  "time"
+ "unicode"
 
  "github.com/umputun/tg-spam/lib/approved"
  "github.com/umputun/tg-spam/lib/spamcheck"
@@ -54,6 +55,7 @@ type Config struct {
  HTTPClient HTTPClient // http client to use for requests
  MinSpamProbability float64 // minimum spam probability to consider a message spam with classifier, if 0 - ignored
  OpenAIVeto bool // if true, openai will be used to veto spam messages, otherwise it will be used to veto ham messages
+ MultiLangWords int // if > 0, a message with at least this number of multi-lingual words is considered spam; 0 disables the check
 }
 
 // SampleUpdater is an interface for updating spam/ham samples on the fly.
@@ -141,6 +143,10 @@ func (d *Detector) Check(req spamcheck.Request) (spam bool, cr []spamcheck.Respo
  cr = append(cr, d.isCasSpam(req.UserID))
  }
 
+ if d.MultiLangWords > 0 {
+ cr = append(cr, d.isMultiLang(req.Msg))
+ }
+
  // check for message length exceed the minimum size, if min message length is set.
  // the check is done after first simple checks, because stop words and emojis can be triggered by short messages as well.
  if len([]rune(req.Msg)) < d.MinMsgLen {
@@ -571,3 +577,58 @@ func (d *Detector) isManyEmojis(msg string) spamcheck.Response {
  count := countEmoji(msg)
  return spamcheck.Response{Name: "emoji", Spam: count > d.MaxAllowedEmoji, Details: fmt.Sprintf("%d/%d", count, d.MaxAllowedEmoji)}
 }
+
+// isMultiLang checks if a given message contains at least MultiLangWords multi-lingual words,
+// i.e. whitespace-separated words mixing letters from more than one of the tracked scripts.
+// Runes outside the tracked scripts (digits, punctuation, emoji and so on) are ignored.
+// The response details are formatted as "<found>/<limit>". A non-positive MultiLangWords
+// means the check is disabled, so the message is never reported as spam in this case.
+func (d *Detector) isMultiLang(msg string) spamcheck.Response {
+	// scriptOf maps a rune to the name of its script, or "" if the script is not tracked.
+	// Hiragana and Katakana share the "Japanese" name, so mixing them does not count.
+	// There is no separate case for Ukrainian letters such as 'ї': they belong to unicode.Cyrillic.
+	scriptOf := func(r rune) string {
+		switch {
+		case unicode.Is(unicode.Latin, r):
+			return "Latin"
+		case unicode.Is(unicode.Cyrillic, r):
+			return "Cyrillic"
+		case unicode.Is(unicode.Greek, r):
+			return "Greek"
+		case unicode.Is(unicode.Han, r):
+			return "Han"
+		case unicode.Is(unicode.Arabic, r):
+			return "Arabic"
+		case unicode.Is(unicode.Hebrew, r):
+			return "Hebrew"
+		case unicode.Is(unicode.Devanagari, r):
+			return "Devanagari"
+		case unicode.Is(unicode.Thai, r):
+			return "Thai"
+		case unicode.Is(unicode.Hiragana, r) || unicode.Is(unicode.Katakana, r):
+			return "Japanese"
+		case unicode.Is(unicode.Hangul, r):
+			return "Korean"
+		case unicode.Is(unicode.Bengali, r):
+			return "Bengali"
+		case unicode.Is(unicode.Armenian, r):
+			return "Armenian"
+		case unicode.Is(unicode.Georgian, r):
+			return "Georgian"
+		}
+		return ""
+	}
+
+	// isMultiLingual reports whether the word contains runes from at least two tracked scripts.
+	// It remembers the first script seen and stops at the first rune from a different one,
+	// which avoids allocating a set per word.
+	isMultiLingual := func(word string) bool {
+		first := ""
+		for _, r := range word {
+			script := scriptOf(r)
+			switch {
+			case script == "":
+				continue
+			case first == "":
+				first = script
+			case script != first:
+				return true
+			}
+		}
+		return false
+	}
+
+	count := 0
+	for _, word := range strings.Fields(msg) {
+		if isMultiLingual(word) {
+			count++
+		}
+	}
+
+	return spamcheck.Response{
+		Name:    "multi-lingual",
+		Spam:    d.MultiLangWords > 0 && count >= d.MultiLangWords,
+		Details: fmt.Sprintf("%d/%d", count, d.MultiLangWords),
+	}
+}
diff --git a/lib/tgspam/detector_test.go b/lib/tgspam/detector_test.go
@@ -582,6 +582,33 @@ func TestDetector_CheckWithMeta(t *testing.T) {
  })
 }
 
+// TestDetector_CheckMultiLang runs Check with the multi-lingual limit set to 2 and verifies
+// the verdict, the name of the single reported check and the "<found>/2" details.
+func TestDetector_CheckMultiLang(t *testing.T) {
+	// the emoji limit is set to -1 so that the multi-lingual check is the only one expected in the result
+	detector := NewDetector(Config{MultiLangWords: 2, MaxAllowedEmoji: -1})
+
+	// some inputs deliberately mix look-alike Latin and Cyrillic letters inside a word
+	cases := []struct {
+		name  string
+		input string
+		count int
+		spam  bool
+	}{
+		{name: "No MultiLang", input: "Hello, world!", count: 0, spam: false},
+		{name: "One MultiLang", input: "Hi therе", count: 1, spam: false},
+		{name: "Two MultiLang", input: "Gооd moфning", count: 2, spam: true},
+		{name: "WithCyrillic no MultiLang", input: "Привет мир", count: 0, spam: false},
+		{name: "WithCyrillic two MultiLang", input: "Привеt мip", count: 2, spam: true},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			isSpam, responses := detector.Check(spamcheck.Request{Msg: tc.input})
+			assert.Equal(t, tc.spam, isSpam)
+			require.Len(t, responses, 1)
+
+			resp := responses[0]
+			assert.Equal(t, "multi-lingual", resp.Name)
+			assert.Equal(t, tc.spam, resp.Spam)
+			assert.Equal(t, fmt.Sprintf("%d/2", tc.count), resp.Details)
+		})
+	}
+}
+
 func TestDetector_UpdateSpam(t *testing.T) {
  upd := &mocks.SampleUpdaterMock{
  AppendFunc: func(msg string) error {