Skip to content

Commit

Permalink
Implement multi-language words spam detection (#94)
Browse files Browse the repository at this point in the history
* Implement multi-language words spam detection

This update introduces a feature to detect spam messages that use words mixing characters from multiple languages. The feature is turned off by default and can be enabled with the `--multi-lang=` command line parameter. Tests have been added to confirm the new functionality and the README has been updated with its description.

* add test with special symbols

* add more real examples to tests
  • Loading branch information
umputun committed Jun 23, 2024
1 parent 182ebe3 commit 71637ab
Show file tree
Hide file tree
Showing 5 changed files with 102 additions and 0 deletions.
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,11 @@ This option is disabled by default. If set to `true`, the bot will check the mes

This option is disabled by default. If set to `true`, the bot will check the message for the presence of any image. If the message contains images but no text, it will be marked as spam.

**Multi-language words**

Using words that mix characters from multiple languages is a common spam technique. To detect such messages, the bot can count the words that combine characters from more than one language (script). This option is disabled by default and can be enabled with the `--multi-lang=` parameter (or the `$MULTI_LANG` environment variable). Setting it to a number above `0` enables the check, and the bot will mark the message as spam if at least that many words contain characters from more than one language.


### Admin chat/group

Optionally, user can specify the admin chat/group name/id. In this case, the bot will send a message to the admin chat as soon as a spammer is detected. Admin can see all the spam and all banned users and could also unban the user, confirm the ban or get results of spam checks by clicking a button directly on the message.
Expand Down Expand Up @@ -233,6 +238,7 @@ Success! The new status is: DISABLED. /help
--min-msg-len= min message length to check (default: 50) [$MIN_MSG_LEN]
--max-emoji= max emoji count in message, -1 to disable check (default: 2) [$MAX_EMOJI]
--min-probability= min spam probability percent to ban (default: 50) [$MIN_PROBABILITY]
--multi-lang= number of words in different languages to consider as spam, 0 to disable (default: 0) [$MULTI_LANG]
--paranoid paranoid mode, check all messages [$PARANOID]
--first-messages-count= number of first messages to check (default: 1) [$FIRST_MESSAGES_COUNT]
--training training mode, passive spam detection only [$TRAINING]
Expand Down
3 changes: 3 additions & 0 deletions app/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ type options struct {
MinMsgLen int `long:"min-msg-len" env:"MIN_MSG_LEN" default:"50" description:"min message length to check"`
MaxEmoji int `long:"max-emoji" env:"MAX_EMOJI" default:"2" description:"max emoji count in message, -1 to disable check"`
MinSpamProbability float64 `long:"min-probability" env:"MIN_PROBABILITY" default:"50" description:"min spam probability percent to ban"`
MultiLangWords int `long:"multi-lang" env:"MULTI_LANG" default:"0" description:"number of words in different languages to consider as spam"`

ParanoidMode bool `long:"paranoid" env:"PARANOID" description:"paranoid mode, check all messages"`
FirstMessagesCount int `long:"first-messages-count" env:"FIRST_MESSAGES_COUNT" default:"1" description:"number of first messages to check"`
Expand Down Expand Up @@ -355,6 +356,7 @@ func activateServer(ctx context.Context, opts options, sf *bot.SpamFilter, loc *
MetaLinksLimit: opts.Meta.LinksLimit,
MetaLinksOnly: opts.Meta.LinksOnly,
MetaImageOnly: opts.Meta.ImageOnly,
MultiLangLimit: opts.MultiLangWords,
OpenAIEnabled: opts.OpenAI.Token != "",
SamplesDataPath: opts.Files.SamplesDataPath,
DynamicDataPath: opts.Files.DynamicDataPath,
Expand Down Expand Up @@ -402,6 +404,7 @@ func makeDetector(opts options) *tgspam.Detector {
FirstMessageOnly: !opts.ParanoidMode,
FirstMessagesCount: opts.FirstMessagesCount,
OpenAIVeto: opts.OpenAI.Veto,
MultiLangWords: opts.MultiLangWords,
}

// FirstMessagesCount and ParanoidMode are mutually exclusive.
Expand Down
1 change: 1 addition & 0 deletions app/webapi/webapi.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ type Settings struct {
MetaLinksLimit int `json:"meta_links_limit"`
MetaLinksOnly bool `json:"meta_links_only"`
MetaImageOnly bool `json:"meta_image_only"`
MultiLangLimit int `json:"multi_lang_limit"`
OpenAIEnabled bool `json:"openai_enabled"`
SamplesDataPath string `json:"samples_data_path"`
DynamicDataPath string `json:"dynamic_data_path"`
Expand Down
61 changes: 61 additions & 0 deletions lib/tgspam/detector.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"strings"
"sync"
"time"
"unicode"

"github.com/umputun/tg-spam/lib/approved"
"github.com/umputun/tg-spam/lib/spamcheck"
Expand Down Expand Up @@ -54,6 +55,7 @@ type Config struct {
HTTPClient HTTPClient // http client to use for requests
MinSpamProbability float64 // minimum spam probability to consider a message spam with classifier, if 0 - ignored
OpenAIVeto bool // if true, openai will be used to veto spam messages, otherwise it will be used to veto ham messages
MultiLangWords int // if true, check for number of multi-lingual words
}

// SampleUpdater is an interface for updating spam/ham samples on the fly.
Expand Down Expand Up @@ -141,6 +143,10 @@ func (d *Detector) Check(req spamcheck.Request) (spam bool, cr []spamcheck.Respo
cr = append(cr, d.isCasSpam(req.UserID))
}

if d.MultiLangWords > 0 {
cr = append(cr, d.isMultiLang(req.Msg))
}

// check for message length exceed the minimum size, if min message length is set.
// the check is done after first simple checks, because stop words and emojis can be triggered by short messages as well.
if len([]rune(req.Msg)) < d.MinMsgLen {
Expand Down Expand Up @@ -571,3 +577,58 @@ func (d *Detector) isManyEmojis(msg string) spamcheck.Response {
count := countEmoji(msg)
return spamcheck.Response{Name: "emoji", Spam: count > d.MaxAllowedEmoji, Details: fmt.Sprintf("%d/%d", count, d.MaxAllowedEmoji)}
}

// isMultiLang reports whether msg contains at least MultiLangWords words that mix
// characters from more than one script (for example Latin letters inside a Cyrillic word).
//
// Words are split on whitespace. Runes that belong to none of the tracked scripts
// (digits, punctuation, emoji, etc.) are ignored and never make a word multi-lingual.
// The returned Details has the form "<multi-lingual words found>/<MultiLangWords>".
//
// Note the comparison is inclusive (count >= MultiLangWords), so with a threshold of 0
// every message would be reported as spam; Check only calls this when MultiLangWords > 0.
func (d *Detector) isMultiLang(msg string) spamcheck.Response {
	// scriptOf returns the script group of r, or "" if r is not in any tracked script.
	// Hiragana and Katakana are deliberately treated as one group so that ordinary
	// Japanese words mixing both are not flagged.
	scriptOf := func(r rune) string {
		switch {
		case unicode.Is(unicode.Latin, r):
			return "Latin"
		case unicode.Is(unicode.Cyrillic, r):
			// covers Ukrainian letters such as 'ї' as well, no separate case is needed
			return "Cyrillic"
		case unicode.Is(unicode.Greek, r):
			return "Greek"
		case unicode.Is(unicode.Han, r):
			return "Han"
		case unicode.Is(unicode.Arabic, r):
			return "Arabic"
		case unicode.Is(unicode.Hebrew, r):
			return "Hebrew"
		case unicode.Is(unicode.Devanagari, r):
			return "Devanagari"
		case unicode.Is(unicode.Thai, r):
			return "Thai"
		case unicode.Is(unicode.Hiragana, r) || unicode.Is(unicode.Katakana, r):
			return "Japanese"
		case unicode.Is(unicode.Hangul, r):
			return "Korean"
		case unicode.Is(unicode.Bengali, r):
			return "Bengali"
		case unicode.Is(unicode.Armenian, r):
			return "Armenian"
		case unicode.Is(unicode.Georgian, r):
			return "Georgian"
		}
		return ""
	}

	// isMultiLingual reports whether word contains runes from two different script groups.
	// Only the first script seen has to be remembered: any later rune from another
	// tracked script settles the answer.
	isMultiLingual := func(word string) bool {
		first := ""
		for _, r := range word {
			script := scriptOf(r)
			if script == "" {
				continue
			}
			if first == "" {
				first = script
				continue
			}
			if script != first {
				return true
			}
		}
		return false
	}

	count := 0
	for _, word := range strings.Fields(msg) {
		if isMultiLingual(word) {
			count++
		}
	}

	return spamcheck.Response{
		Name:    "multi-lingual",
		Spam:    count >= d.MultiLangWords,
		Details: fmt.Sprintf("%d/%d", count, d.MultiLangWords),
	}
}
31 changes: 31 additions & 0 deletions lib/tgspam/detector_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -582,6 +582,37 @@ func TestDetector_CheckWithMeta(t *testing.T) {
})
}

// TestDetector_CheckMultiLang feeds sample messages to a detector configured with a
// multi-lingual threshold of 2 and the emoji check turned off (-1), and verifies that
// exactly one check result named "multi-lingual" comes back with the expected verdict
// and "<found>/2" details.
func TestDetector_CheckMultiLang(t *testing.T) {
	type testCase struct {
		title    string // sub-test name
		msg      string // message passed to the detector
		mixed    int    // expected number of multi-lingual words
		wantSpam bool   // expected verdict
	}

	cases := []testCase{
		{"No MultiLang", "Hello, world!", 0, false},
		{"One MultiLang", "Hi therе", 1, false},
		{"Two MultiLang", "Gооd moфning", 2, true},
		{"WithCyrillic no MultiLang", "Привет мир", 0, false},
		{"WithCyrillic two MultiLang", "Привеt мip", 2, true},
		{"WithCyrillic and special symbols", "Привет мир -@#$%^&*(_", 0, false},
		{"WithCyrillic real example 1", "Ищем заинтeрeсoвaнных в зaрaбoткe нa кpиптoвaлютe. Всeгдa хотeли пoпpoбовать сeбя в этом, нo нe знали с чeго нaчaть? Тогдa вaм кo мнe 3aнимаемся aрбuтражeм, зaрабaтывaeм на paзницe курсов с минимaльныmи pискaми 💲Рынok oчень волатильный и нам это выгoднo, пo этoмe пишиte @vitalgoescra и зapaбaтывaйтe сo мнoй ", 31, true},
		{"WithCyrillic real example 2", "В поuске паpтнеров, заuнтересованных в пассuвном дoходе с затpатой мuнuмум лuчного временu. Все деталu в лс", 10, true},
		{"WithCyrillic real example 3", "Всем привет, есть простая шабашка, подойдет любому. Даю 15 тысяч. Накину на проезд, сигареты, обед. ", 0, false},
	}

	detector := NewDetector(Config{MultiLangWords: 2, MaxAllowedEmoji: -1})

	for _, tc := range cases {
		t.Run(tc.title, func(t *testing.T) {
			wantDetails := fmt.Sprintf("%d/2", tc.mixed)

			isSpam, results := detector.Check(spamcheck.Request{Msg: tc.msg})
			assert.Equal(t, tc.wantSpam, isSpam)

			// stop here if the result set is not exactly one entry, indexing below relies on it
			require.Len(t, results, 1)
			only := results[0]
			assert.Equal(t, "multi-lingual", only.Name)
			assert.Equal(t, tc.wantSpam, only.Spam)
			assert.Equal(t, wantDetails, only.Details)
		})
	}
}

func TestDetector_UpdateSpam(t *testing.T) {
upd := &mocks.SampleUpdaterMock{
AppendFunc: func(msg string) error {
Expand Down

0 comments on commit 71637ab

Please sign in to comment.