Skip to content

Commit

Permalink
Implement multi-language words spam detection
Browse files Browse the repository at this point in the history
This update introduces a feature to detect spam messages that use words mixing characters from multiple languages. The feature is turned off by default and can be enabled with the `--multi-lang=` command line parameter. Tests have been added to confirm the new functionality and the README has been updated with its description.
  • Loading branch information
umputun committed Jun 23, 2024
1 parent 182ebe3 commit 3a1375d
Show file tree
Hide file tree
Showing 5 changed files with 98 additions and 0 deletions.
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,11 @@ This option is disabled by default. If set to `true`, the bot will check the mes

This option is disabled by default. If set to `true`, the bot will check the message for the presence of any image. If the message contains images but no text, it will be marked as spam.

**Multi-language words**

Using words that mix characters from multiple languages is a common spam technique. To detect such messages, the bot can count the words that combine characters from more than one script (for example, Latin and Cyrillic letters in the same word). This option is disabled by default and can be enabled with the `--multi-lang=, [$MULTI_LANG]` parameter. Setting it to a number above `0` enables the check, and the bot will mark the message as spam if at least that many of its words mix characters from more than one language.


### Admin chat/group

Optionally, user can specify the admin chat/group name/id. In this case, the bot will send a message to the admin chat as soon as a spammer is detected. Admin can see all the spam and all banned users and could also unban the user, confirm the ban or get results of spam checks by clicking a button directly on the message.
Expand Down Expand Up @@ -233,6 +238,7 @@ Success! The new status is: DISABLED. /help
--min-msg-len= min message length to check (default: 50) [$MIN_MSG_LEN]
--max-emoji= max emoji count in message, -1 to disable check (default: 2) [$MAX_EMOJI]
--min-probability= min spam probability percent to ban (default: 50) [$MIN_PROBABILITY]
--multi-lang= number of words in different languages to consider as spam, 0 to disable (default: 0) [$MULTI_LANG]
--paranoid paranoid mode, check all messages [$PARANOID]
--first-messages-count= number of first messages to check (default: 1) [$FIRST_MESSAGES_COUNT]
--training training mode, passive spam detection only [$TRAINING]
Expand Down
3 changes: 3 additions & 0 deletions app/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ type options struct {
MinMsgLen int `long:"min-msg-len" env:"MIN_MSG_LEN" default:"50" description:"min message length to check"`
MaxEmoji int `long:"max-emoji" env:"MAX_EMOJI" default:"2" description:"max emoji count in message, -1 to disable check"`
MinSpamProbability float64 `long:"min-probability" env:"MIN_PROBABILITY" default:"50" description:"min spam probability percent to ban"`
MultiLangWords int `long:"multi-lang" env:"MULTI_LANG" default:"0" description:"number of words in different languages to consider as spam"`

ParanoidMode bool `long:"paranoid" env:"PARANOID" description:"paranoid mode, check all messages"`
FirstMessagesCount int `long:"first-messages-count" env:"FIRST_MESSAGES_COUNT" default:"1" description:"number of first messages to check"`
Expand Down Expand Up @@ -355,6 +356,7 @@ func activateServer(ctx context.Context, opts options, sf *bot.SpamFilter, loc *
MetaLinksLimit: opts.Meta.LinksLimit,
MetaLinksOnly: opts.Meta.LinksOnly,
MetaImageOnly: opts.Meta.ImageOnly,
MultiLangLimit: opts.MultiLangWords,
OpenAIEnabled: opts.OpenAI.Token != "",
SamplesDataPath: opts.Files.SamplesDataPath,
DynamicDataPath: opts.Files.DynamicDataPath,
Expand Down Expand Up @@ -402,6 +404,7 @@ func makeDetector(opts options) *tgspam.Detector {
FirstMessageOnly: !opts.ParanoidMode,
FirstMessagesCount: opts.FirstMessagesCount,
OpenAIVeto: opts.OpenAI.Veto,
MultiLangWords: opts.MultiLangWords,
}

// FirstMessagesCount and ParanoidMode are mutually exclusive.
Expand Down
1 change: 1 addition & 0 deletions app/webapi/webapi.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ type Settings struct {
MetaLinksLimit int `json:"meta_links_limit"`
MetaLinksOnly bool `json:"meta_links_only"`
MetaImageOnly bool `json:"meta_image_only"`
MultiLangLimit int `json:"multi_lang_limit"`
OpenAIEnabled bool `json:"openai_enabled"`
SamplesDataPath string `json:"samples_data_path"`
DynamicDataPath string `json:"dynamic_data_path"`
Expand Down
61 changes: 61 additions & 0 deletions lib/tgspam/detector.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"strings"
"sync"
"time"
"unicode"

"github.com/umputun/tg-spam/lib/approved"
"github.com/umputun/tg-spam/lib/spamcheck"
Expand Down Expand Up @@ -54,6 +55,7 @@ type Config struct {
HTTPClient HTTPClient // http client to use for requests
MinSpamProbability float64 // minimum spam probability to consider a message spam with classifier, if 0 - ignored
OpenAIVeto bool // if true, openai will be used to veto spam messages, otherwise it will be used to veto ham messages
MultiLangWords int // if true, check for number of multi-lingual words
}

// SampleUpdater is an interface for updating spam/ham samples on the fly.
Expand Down Expand Up @@ -141,6 +143,10 @@ func (d *Detector) Check(req spamcheck.Request) (spam bool, cr []spamcheck.Respo
cr = append(cr, d.isCasSpam(req.UserID))
}

if d.MultiLangWords > 0 {
cr = append(cr, d.isMultiLang(req.Msg))
}

// check for message length exceed the minimum size, if min message length is set.
// the check is done after first simple checks, because stop words and emojis can be triggered by short messages as well.
if len([]rune(req.Msg)) < d.MinMsgLen {
Expand Down Expand Up @@ -571,3 +577,58 @@ func (d *Detector) isManyEmojis(msg string) spamcheck.Response {
count := countEmoji(msg)
return spamcheck.Response{Name: "emoji", Spam: count > d.MaxAllowedEmoji, Details: fmt.Sprintf("%d/%d", count, d.MaxAllowedEmoji)}
}

// isMultiLang reports whether msg contains at least MultiLangWords words that mix
// characters from more than one script (for example Latin and Cyrillic letters in
// the same word). The message is split into words on whitespace. Runes that do not
// belong to any of the tracked scripts (digits, punctuation, symbols, etc.) are
// ignored and never make a word multi-lingual on their own.
//
// The returned response is named "multi-lingual"; Details has the form
// "<found>/<limit>", where found is the number of mixed-script words and limit
// is MultiLangWords.
func (d *Detector) isMultiLang(msg string) spamcheck.Response {
	// scriptOf maps a rune to the name of the tracked script it belongs to,
	// or returns an empty string if the rune is not part of any tracked script.
	// Ukrainian-specific letters such as 'ї' are part of the Cyrillic script,
	// so they need no dedicated case and do not count as a separate language.
	scriptOf := func(r rune) string {
		switch {
		case unicode.Is(unicode.Latin, r):
			return "Latin"
		case unicode.Is(unicode.Cyrillic, r):
			return "Cyrillic"
		case unicode.Is(unicode.Greek, r):
			return "Greek"
		case unicode.Is(unicode.Han, r):
			return "Han"
		case unicode.Is(unicode.Arabic, r):
			return "Arabic"
		case unicode.Is(unicode.Hebrew, r):
			return "Hebrew"
		case unicode.Is(unicode.Devanagari, r):
			return "Devanagari"
		case unicode.Is(unicode.Thai, r):
			return "Thai"
		case unicode.Is(unicode.Hiragana, r) || unicode.Is(unicode.Katakana, r):
			// both kana scripts are treated as one language
			return "Japanese"
		case unicode.Is(unicode.Hangul, r):
			return "Korean"
		case unicode.Is(unicode.Bengali, r):
			return "Bengali"
		case unicode.Is(unicode.Armenian, r):
			return "Armenian"
		case unicode.Is(unicode.Georgian, r):
			return "Georgian"
		}
		return ""
	}

	// isMultiLingual reports whether word contains runes from two or more
	// tracked scripts. Remembering the first script seen is enough: the word
	// is mixed as soon as any later rune belongs to a different script.
	isMultiLingual := func(word string) bool {
		first := ""
		for _, r := range word {
			script := scriptOf(r)
			switch {
			case script == "":
				// not a tracked script, ignore
			case first == "":
				first = script
			case script != first:
				return true
			}
		}
		return false
	}

	count := 0
	for _, word := range strings.Fields(msg) {
		if isMultiLingual(word) {
			count++
		}
	}

	return spamcheck.Response{
		Name:    "multi-lingual",
		Spam:    count >= d.MultiLangWords, // the limit itself already counts as spam
		Details: fmt.Sprintf("%d/%d", count, d.MultiLangWords),
	}
}
27 changes: 27 additions & 0 deletions lib/tgspam/detector_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -582,6 +582,33 @@ func TestDetector_CheckWithMeta(t *testing.T) {
})
}

// TestDetector_CheckMultiLang runs the detector with only the multi-lingual check
// producing a response (limit of 2 words, emoji check disabled) and verifies, for
// each sample message, the spam verdict and the "<found>/<limit>" details string.
func TestDetector_CheckMultiLang(t *testing.T) {
	const limit = 2
	detector := NewDetector(Config{MultiLangWords: limit, MaxAllowedEmoji: -1})

	// several inputs contain look-alike letters from another script on purpose;
	// count is the expected number of mixed-script words in input.
	cases := []struct {
		name  string
		input string
		count int
		spam  bool
	}{
		{name: "No MultiLang", input: "Hello, world!", count: 0, spam: false},
		{name: "One MultiLang", input: "Hi therе", count: 1, spam: false},
		{name: "Two MultiLang", input: "Gооd moфning", count: 2, spam: true},
		{name: "WithCyrillic no MultiLang", input: "Привет мир", count: 0, spam: false},
		{name: "WithCyrillic two MultiLang", input: "Привеt мip", count: 2, spam: true},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			request := spamcheck.Request{Msg: tc.input}
			isSpam, responses := detector.Check(request)
			assert.Equal(t, tc.spam, isSpam)

			// only the multi-lingual check is expected to report
			require.Len(t, responses, 1)
			resp := responses[0]
			assert.Equal(t, "multi-lingual", resp.Name)
			assert.Equal(t, tc.spam, resp.Spam)

			wantDetails := fmt.Sprintf("%d/%d", tc.count, limit)
			assert.Equal(t, wantDetails, resp.Details)
		})
	}
}

func TestDetector_UpdateSpam(t *testing.T) {
upd := &mocks.SampleUpdaterMock{
AppendFunc: func(msg string) error {
Expand Down

0 comments on commit 3a1375d

Please sign in to comment.