retrieve_domains.sh
#!/bin/bash
# Retrieves domains from various sources, processes them and outputs a raw file
# that contains the cumulative domains from all sources over time.
readonly FUNCTION='bash scripts/tools.sh'
readonly RAW='data/raw.txt'
readonly RAW_LIGHT='data/raw_light.txt'
readonly SEARCH_TERMS='config/search_terms.csv'
readonly WHITELIST='config/whitelist.txt'
readonly BLACKLIST='config/blacklist.txt'
readonly ROOT_DOMAINS='data/root_domains.txt'
readonly SUBDOMAINS='data/subdomains.txt'
readonly SUBDOMAINS_TO_REMOVE='config/subdomains.txt'
readonly DEAD_DOMAINS='data/dead_domains.txt'
readonly PARKED_DOMAINS='data/parked_domains.txt'
readonly PHISHING_TARGETS='config/phishing_targets.csv'
readonly SOURCE_LOG='config/source_log.csv'
readonly DOMAIN_LOG='config/domain_log.csv'
TIMESTAMP="$(date -u +"%H:%M:%S %d-%m-%y")"
readonly TIMESTAMP
# Matches example.com, example[.]com, 1.1.1.1
readonly DOMAIN_REGEX='[[:alnum:].-]+\[?\.\]?[[:alnum:]-]+'
# Matches example-com, 1.1.1.1
# https://github.com/jarelllama/Scam-Blocklist/issues/349
readonly DOMAIN_DASH_REGEX='[[:alnum:].-]+-[[:alnum:]-]+'
# Only matches domains
readonly STRICT_DOMAIN_REGEX='[[:alnum:].-]+\.[[:alnum:]-]*[a-z]{2,}[[:alnum:]-]*'
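# For illustration, the strict regex matches 'example.com' and 'xn--e1awd7f.com'
# but rejects bare IPv4 addresses such as '1.1.1.1', since the label after the
# last dot must contain at least two consecutive letters.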
readonly -a SOURCES=(
source_aa419
source_dnstwist
source_emerging_threats
source_fakewebsitebuster
source_guntab
source_jeroengui_phishing
source_jeroengui_scam
source_manual
source_petscams
source_phishstats
source_phishstats_nrd
source_regex
source_scamadviser
source_scamdirectory
source_stopgunscams
source_google_search
)
# Function 'source' calls on the respective functions of each source to
# retrieve results. The results are then passed to the 'process_source'
# function for further processing.
source() {
# Check whether to use existing retrieved results
if [[ -d data/pending ]]; then
printf "\nUsing existing lists of retrieved results.\n"
readonly USE_EXISTING=true
fi
mkdir -p data/pending
# Download dependencies here to not bias the processing time of
# the sources (done in parallel):
# Install idn (requires sudo) (note -qq does not seem to work here)
# Call shell wrapper to download toplist
# Download NRD feed
{ command -v idn &> /dev/null || sudo apt-get install idn > /dev/null; } \
& $FUNCTION --download-toplist \
& { [[ "$USE_EXISTING" != true ]] && download_nrd_feed; }
wait
# Declare default values and run each source function
for SOURCE in "${SOURCES[@]}"; do
# Skip commented out sources
[[ "$SOURCE" == \#* ]] && continue
local ignore_from_light=false
local rate_limited=false
local query_count=''
local execution_time
execution_time="$(date +%s)"
$SOURCE
[[ "$USE_EXISTING" == true ]] && continue
# The Google Search source is processed by individual search terms, not
# as one source
[[ "$source" == 'Google Search' ]] && continue
process_source
done
}
# Function 'filter' logs the given entries and removes them from the results
# file.
# Input:
# $1: entries to remove passed in a variable
# $2: tag given to entries
# --no-log: do not log entries into the domain log
# --preserve: save entries for manual review and for rerun
# Output:
# Number of entries that were passed
filter() {
local entries="$1"
local tag="$2"
# Return with 0 entries if no entries passed
[[ -z "$entries" ]] && { printf "0"; return; }
# Remove entries from results file
comm -23 "$results_file" <(printf "%s" "$entries") > results.tmp
mv results.tmp "$results_file"
if [[ "$3" != '--no-log' ]]; then
log_domains "$entries" "$tag"
fi
if [[ "$3" == '--preserve' ]]; then
# Save entries for manual review and for rerun
mawk -v tag="$tag" '{print $0 " (" tag ")"}' <<< "$entries" \
>> manual_review.tmp
printf "%s\n" "$entries" >> "${results_file}.tmp"
fi
# Return number of entries
# Note wc -w is used here because wc -l counts empty variables as 1 line
wc -w <<< "$entries"
}
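# Illustrative usage (hypothetical variable contents):
#   dead="$(comm -12 <(sort "$DEAD_DOMAINS") "$results_file")"
#   dead_count="$(filter "$dead" dead --no-log)"  # prints the number of entries removed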
# Function 'process_source' filters the results retrieved from the caller
# source. The output is a cumulative filtered domains file of all filtered
# domains from all sources in this run.
process_source() {
[[ ! -f "$results_file" ]] && return
# Remove http(s):, slashes, and square brackets (this is done here once
# instead of multiple times in the source functions)
# Note that this still allows invalid entries to get through so they can be
# flagged later on.
sed -i 's/https\?://; s/[/]//g; s/\[//; s/\]//' "$results_file"
# Convert Unicode to Punycode
# '--no-tld' to fix 'idn: tld_check_4z: Missing input' error
idn --no-tld < "$results_file" > results.tmp
mv results.tmp "$results_file"
$FUNCTION --format "$results_file"
# Count number of unfiltered domains pending
raw_count="$(wc -l < "$results_file")"
# Remove known dead domains (includes subdomains)
dead="$(comm -12 <(sort "$DEAD_DOMAINS") "$results_file")"
dead_count="$(filter "$dead" dead --no-log)"
# Logging disabled as it inflated log size
# Remove known parked domains (includes subdomains)
parked="$(comm -12 <(sort "$PARKED_DOMAINS") "$results_file")"
parked_count="$(filter "$parked" parked --no-log)"
# Logging disabled as it inflated log size
# Strip away subdomains
while read -r subdomain; do # Loop through common subdomains
subdomains="$(mawk "/^${subdomain}\./" "$results_file")" || continue
# Strip subdomains down to their root domains
sed -i "s/^${subdomain}\.//" "$results_file"
# Save subdomains and root domains to be filtered later
printf "%s\n" "$subdomains" >> subdomains.tmp
printf "%s\n" "$subdomains" | sed "s/^${subdomain}\.//" >> root_domains.tmp
# Log subdomains excluding 'www' (too many of them)
log_domains "$(mawk '!/^www\./' <<< "$subdomains")" subdomain
done < "$SUBDOMAINS_TO_REMOVE"
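# For example, if 'mail' is a listed subdomain, 'mail.example.com' is reduced to
# 'example.com' in the results file, while 'mail.example.com' is collected in
# subdomains.tmp and 'example.com' in root_domains.tmp for later filtering.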
sort -u "$results_file" -o "$results_file"
# Remove domains already in raw file
comm -23 "$results_file" "$RAW" > results.tmp
mv results.tmp "$results_file"
# Log blacklisted domains
log_domains "$(comm -12 "$BLACKLIST" "$results_file")" blacklist
# Remove whitelisted domains excluding blacklisted domains
# Note whitelist matching uses regex
whitelisted="$(grep -Ef "$WHITELIST" "$results_file" | grep -vxFf "$BLACKLIST")"
whitelisted_count="$(filter "$whitelisted" whitelist)"
# Remove domains with whitelisted TLDs
# mawk does not work with this expression so grep is intentionally chosen
# over awk. The same applies for the invalid check below.
whitelisted_tld="$(grep -E '\.(gov|edu|mil)(\.[a-z]{2})?$' "$results_file")"
whitelisted_tld_count="$(filter "$whitelisted_tld" tld)"
# Remove non-domain entries including IP addresses excluding Punycode
invalid="$(grep -vE "^${STRICT_DOMAIN_REGEX}$" "$results_file")"
# Note invalid entries are not counted
filter "$invalid" invalid --preserve > /dev/null
# Remove domains in toplist excluding blacklisted domains
# Note the toplist does not include subdomains
in_toplist="$(comm -12 toplist.tmp "$results_file" | grep -vxFf "$BLACKLIST")"
in_toplist_count="$(filter "$in_toplist" toplist --preserve)"
# Collate filtered domains
cat "$results_file" >> retrieved_domains.tmp
if [[ "$ignore_from_light" != true ]]; then
# Collate filtered domains from light sources
cat "$results_file" >> retrieved_light_domains.tmp
fi
log_domains "$results_file" unsaved
log_source
rm "$results_file"
if [[ -f "${results_file}.tmp" ]]; then
# Save entries that are pending manual review for rerun
mv "${results_file}.tmp" "$results_file"
$FUNCTION --format "$results_file"
fi
}
# Function 'build' appends the filtered domains into the raw files and presents
# some basic numbers to the user.
build() {
# For telegram message
workflow_url='https://github.com/jarelllama/Scam-Blocklist/actions/workflows/build_deploy.yml'
if [[ -f manual_review.tmp ]]; then
# Print domains requiring manual review
printf "\n\e[1mEntries requiring manual review:\e[0m\n"
sed 's/(/(\o033[31m/; s/)/\o033[0m)/' manual_review.tmp
# Send telegram notification
$FUNCTION --send-telegram \
"Entries requiring manual review:\n$(<manual_review.tmp)"
printf "\nTelegram notification sent.\n"
fi
$FUNCTION --format retrieved_domains.tmp
# Return if no new domains to add
if [[ ! -s retrieved_domains.tmp ]]; then
printf "\n\e[1mNo new domains to add.\e[0m\n"
[[ "$USE_EXISTING" == true ]] && return
# Send Telegram update if not using existing results
$FUNCTION --send-telegram \
"Run completed. No new domains added.\n${workflow_url}"
return
fi
# Collate only filtered subdomains and root domains into the subdomains
# file and root domains file
if [[ -f root_domains.tmp ]]; then
# Find root domains (subdomains stripped off) in the filtered domains
root_domains="$(comm -12 <(sort root_domains.tmp) retrieved_domains.tmp)"
# Check if any filtered root domains are found to avoid appending an
# empty line
if [[ -n "$root_domains" ]]; then
# Collate filtered root domains to exclude from dead check
printf "%s\n" "$root_domains" >> "$ROOT_DOMAINS"
sort -u "$ROOT_DOMAINS" -o "$ROOT_DOMAINS"
# Collate filtered subdomains for dead check
# grep is used here as mawk does not interpret variables with
# multiple lines well when matching.
grep "\.${root_domains}$" subdomains.tmp >> "$SUBDOMAINS"
sort -u "$SUBDOMAINS" -o "$SUBDOMAINS"
fi
fi
count_before="$(wc -l < "$RAW")"
# Add domains to raw file
sort -u retrieved_domains.tmp "$RAW" -o "$RAW"
if [[ -f retrieved_light_domains.tmp ]]; then
# Add domains to raw light file
cat retrieved_light_domains.tmp >> "$RAW_LIGHT"
$FUNCTION --format "$RAW_LIGHT"
fi
count_after="$(wc -l < "$RAW")"
count_added="$(( count_after - count_before ))"
printf "\nAdded new domains to blocklist.\nBefore: %s Added: %s After: %s\n" \
"$count_before" "$count_added" "$count_after"
# Mark sources/events as saved in the log files
sed -i "/${TIMESTAMP}/s/,unsaved/,saved/" "$SOURCE_LOG"
sed -i "/${TIMESTAMP}/s/,unsaved/,saved/" "$DOMAIN_LOG"
[[ "$USE_EXISTING" == true ]] && return
# Send Telegram update if not using existing results
$FUNCTION --send-telegram \
"Run completed. Retrieved ${count_added} domains.\n${workflow_url}"
}
# Function 'log_source' prints and logs statistics for each source using the
# variables declared in the 'process_source' function.
log_source() {
local item
local status='unsaved'
if [[ "$source" == 'Google Search' ]]; then
item="\"${search_term:0:100}...\""
fi
# Check for errors to log
if [[ "$rate_limited" == true ]]; then
status='error: rate_limited'
elif (( raw_count == 0 )); then
status='error: empty'
fi
final_count="$(wc -l < "$results_file")"
total_whitelisted_count="$(( whitelisted_count + whitelisted_tld_count ))"
excluded_count="$(( dead_count + parked_count ))"
echo "${TIMESTAMP},${source},${item},${raw_count},${final_count},\
${total_whitelisted_count},${dead_count},${parked_count},${in_toplist_count},\
${query_count},${status}" >> "$SOURCE_LOG"
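# An entry written to the source log therefore looks roughly like this
# (hypothetical values):
#   00:00:00 01-01-25,guntab.com,,50,32,3,10,4,1,,unsaved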
[[ "$rate_limited" == true ]] && return
printf "\n\e[1mSource: %s\e[0m\n" "${item:-$source}"
if [[ "$status" == 'error: empty' ]]; then
printf "\e[1;31mNo results retrieved. Potential error occurred.\e[0m\n"
# Send telegram notification
$FUNCTION --send-telegram \
"Source '$source' retrieved no results. Potential error occurred."
else
printf "Raw:%4s Final:%4s Whitelisted:%4s Excluded:%4s Toplist:%4s\n" \
"$raw_count" "$final_count" "$total_whitelisted_count" \
"$excluded_count" "$in_toplist_count"
fi
printf "Processing time: %s second(s)\n" "$(( $(date +%s) - execution_time ))"
echo "----------------------------------------------------------------------"
}
# Function 'log_domains' calls a shell wrapper to log domain processing events
# into the domain log.
# $1: domains to log either in a file or variable
# $2: event type (dead, whitelisted, etc.)
log_domains() {
$FUNCTION --log-domains "$1" "$2" "$source" "$TIMESTAMP"
}
# Function 'download_nrd_feed' calls a shell wrapper to download the NRD feed.
# Output:
# nrd.tmp
download_nrd_feed() {
$FUNCTION --download-nrd-feed
# Remove already processed domains to save processing time
comm -23 nrd.tmp <(sort "$RAW" "$DEAD_DOMAINS" "$PARKED_DOMAINS") > temp
mv temp nrd.tmp
}
cleanup() {
# Remove the pending directory if it is empty (no domains saved for rerun)
find data/pending -type d -empty -delete
find . -maxdepth 1 -type f -name "*.tmp" -delete
}
# The 'source_<source>' functions are to retrieve results from the respective
# sources.
# Input:
# $source: name of the source to use in the console and logs
# $ignore_from_light: if true, the results are not included in the light
# version (default is false)
# $results_file: file path to save retrieved results to be used for
# further processing
# $USE_EXISTING: if true, skip the retrieval process and use the
# existing results files (if found)
# Output:
# $results_file (if results retrieved)
#
# Note the output results can be in URL form without subfolders.
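# A typical source function follows this shape (illustrative sketch only; the
# source name and URL below are hypothetical):
#   source_example() {
#       source='example.com'
#       results_file="data/pending/domains_${source}.tmp"
#       [[ "$USE_EXISTING" == true ]] && { process_source; return; }
#       curl -sS 'https://example.com/scams.txt' > "$results_file"
#   }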
source_google_search() {
source='Google Search'
if [[ "$USE_EXISTING" == true ]]; then
# Use existing retrieved results
# Loop through the results from each search term
for results_file in data/pending/domains_google_search_*.tmp; do
[[ ! -f "$results_file" ]] && return
# Set execution time for each individual search term
execution_time="$(date +%s)"
# Remove header from file name
search_term="${results_file#data/pending/domains_google_search_}"
# Remove file extension from file name to get search term
search_term="${search_term%.tmp}"
process_source
done
return
fi
# Retrieve new results
local url='https://customsearch.googleapis.com/customsearch/v1'
local search_id="$GOOGLE_SEARCH_ID"
local search_api_key="$GOOGLE_SEARCH_API_KEY"
# Install csvkit
command -v csvgrep &> /dev/null || pip install -q csvkit
# Install jq
command -v jq &> /dev/null || apt-get install -qq jq
# Get active search terms
# csvkit has to be used here as the search terms may contain commas which
# makes using mawk complicated.
search_terms="$(csvgrep -c 2 -m 'y' -i "$SEARCH_TERMS" | csvcut -c 1 \
| tail -n +2)"
# Loop through search terms
while read -r search_term; do
# Stop if rate limited
if [[ "$rate_limited" == true ]]; then
printf "\n\e[1;31mBoth Google Search API keys are rate limited.\e[0m\n"
return
fi
search_google "$search_term"
done <<< "$search_terms"
}
search_google() {
search_term="${1//\"/}" # Remove quotes before encoding
# Replace non-alphanumeric characters with URL-encoded spaces (%20)
encoded_search_term="${search_term//[^[:alnum:]]/%20}"
results_file="data/pending/domains_google_search_${search_term:0:100}.tmp"
query_count=0
# Set execution time for each individual search term
execution_time="$(date +%s)"
touch "$results_file" # Create results file to ensure proper logging
# Loop through each page of results
for start in {1..100..10}; do
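# '{1..100..10}' expands to the start indices 1, 11, 21, ..., 91, i.e. up to
# 10 pages of 10 results each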
# Indentation intentionally lacking here
# Restrict to results from the last 30 days
params="cx=${search_id}&key=${search_api_key}&exactTerms=${encoded_search_term}&dateRestrict=m1&sort=date&start=${start}&filter=0"
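# The assembled request looks roughly like this (ID and key are hypothetical):
#   https://customsearch.googleapis.com/customsearch/v1?cx=abc123&key=xyz789&exactTerms=fake%20shop&dateRestrict=m1&sort=date&start=1&filter=0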
page_results="$(curl -sS "${url}?${params}")"
(( query_count++ ))
# Use next API key if first key is rate limited
if [[ "$page_results" == *rateLimitExceeded* ]]; then
# Stop all searches if second key is also rate limited
if [[ "$search_id" == "$GOOGLE_SEARCH_ID_2" ]]; then
rate_limited=true
break
fi
printf "\n\e[1mGoogle Search rate limited. Switching API keys.\e[0m\n"
# Switch API keys
readonly search_api_key="$GOOGLE_SEARCH_API_KEY_2"
readonly search_id="$GOOGLE_SEARCH_ID_2"
# Continue to next page (current rate limited page is not repeated)
continue
fi
# Stop search term if page has no results
jq -e '.items' &> /dev/null <<< "$page_results" || break
# Get domains from each page
page_domains="$(jq -r '.items[].link' <<< "$page_results" \
| mawk -F '/' '{print $3}')"
printf "%s\n" "$page_domains" >> "$results_file"
# Stop search term if no more pages are required
(( $(wc -w <<< "$page_domains") < 10 )) && break
done
process_source
}
source_dnstwist() {
source='dnstwist'
results_file="data/pending/domains_${source}.tmp"
[[ "$USE_EXISTING" == true ]] && { process_source; return; }
# Install dnstwist
command -v dnstwist > /dev/null || pip install -q dnstwist
# Get the top 15 TLDs from the NRD feed
# Only 10,000 entries are sampled to save time while providing the same
# ranking as 100,000 entries and above.
tlds="$(shuf -n 10000 nrd.tmp | mawk -F '.' '{print $NF}' | sort | uniq -c \
| sort -nr | head -n 15 | mawk '{print $2}')"
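# Illustratively, this samples 10,000 NRD entries, tallies the frequency of each
# trailing label (e.g. com, top, xyz) and keeps the 15 most common TLDs, one per
# line, in $tlds.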
# Remove duplicate targets from targets file
mawk -F ',' '!seen[$1]++' "$PHISHING_TARGETS" > temp
mv temp "$PHISHING_TARGETS"
# Get targets ignoring disabled ones
targets="$(mawk -F ',' '$5 != "y" {print $1}' "$PHISHING_TARGETS" | tail -n +2)"
# Loop through the targets
while read -r domain; do
# Get row and counts for the target domain
row="$(mawk -F ',' -v domain="$domain" \
'$1 == domain {printf $1","$2","$3","$4}' "$PHISHING_TARGETS")"
count="$(mawk -F ',' '{print $3}' <<< "$row")"
runs="$(mawk -F ',' '{print $4}' <<< "$row")"
# Run dnstwist
results="$(dnstwist "${domain}.com" -f list)"
# Append TLDs to results
while read -r tld; do
printf "%s\n" "$results" | sed "s/\.com/.${tld}/" >> results.tmp
done <<< "$tlds"
sort -u results.tmp -o results.tmp
# Get matching NRDs
comm -12 results.tmp nrd.tmp > temp
mv temp results.tmp
# Collate results
cat results.tmp >> "$results_file"
# Update counts for the target domain
count="$(( count + $(wc -l < results.tmp) ))"
(( runs++ ))
counts_run="$(( count / runs ))"
sed -i "s/${row}/${domain},${counts_run},${count},${runs}/" \
"$PHISHING_TARGETS"
# Reset results file for the next target domain
rm results.tmp
done <<< "$targets"
}
source_regex() {
source='Regex'
ignore_from_light=true
results_file='data/pending/domains_regex.tmp'
[[ "$USE_EXISTING" == true ]] && { process_source; return; }
# Remove duplicate targets from targets file
mawk -F ',' '!seen[$1]++' "$PHISHING_TARGETS" > temp
mv temp "$PHISHING_TARGETS"
# Get targets ignoring disabled ones
targets="$(mawk -F ',' '$10 == "n" {print $1}' "$PHISHING_TARGETS")"
# Loop through the targets
while read -r domain; do
# Get row and counts for the target domain
row="$(mawk -F ',' -v domain="$domain" \
'$1 == domain {printf $6","$7","$8","$9}' "$PHISHING_TARGETS")"
count="$(mawk -F ',' '{print $3}' <<< "$row")"
runs="$(mawk -F ',' '{print $4}' <<< "$row")"
# Get regex of target
pattern="$(mawk -F ',' '{printf $1}' <<< "$row")"
escaped_domain="${domain//[.]/\\.}"
regex="${pattern/&/${escaped_domain}}"
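# Illustrative example (hypothetical pattern and target): with pattern
# '^&-(shop|store)\.' and target 'example.com', the resulting regex is
# '^example\.com-(shop|store)\.'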
# Get matches in NRD feed
results="$(mawk "/${regex}/" nrd.tmp | sort -u)"
# Collate results
printf "%s\n" "$results" >> "$results_file"
# Escape periods and backslashes
row="$(printf "%s" "$row" | sed 's/[.\]/\\&/g')"
# Escape '&', periods and backslashes
pattern="$(printf "%s" "$pattern" | sed 's/[&.\]/\\&/g')"
# Update counts for the target domain
count="$(( count + $(wc -w <<< "$results") ))"
(( runs++ ))
counts_run="$(( count / runs ))"
sed -i "/${domain}/s/${row}/${pattern},${counts_run},${count},${runs}/" \
"$PHISHING_TARGETS"
done <<< "$targets"
}
source_manual() {
source='Manual'
results_file='data/pending/domains_manual.tmp'
# Process only if file is found (source is the file itself)
[[ -f "$results_file" ]] && process_source
}
source_aa419() {
source='aa419.org'
results_file="data/pending/domains_${source}.tmp"
[[ "$USE_EXISTING" == true ]] && { process_source; return; }
# Install jq
command -v jq &> /dev/null || apt-get install -qq jq
local url='https://api.aa419.org/fakesites'
# Trailing slash intentionally omitted
curl -sSH "Auth-API-Id:${AA419_API_ID}" "${url}/0/250?Status=active" \
--retry 2 --retry-all-errors | jq -r '.[].Domain' > "$results_file"
}
source_emerging_threats() {
source='Emerging Threats'
results_file='data/pending/domains_emerging_threats.tmp'
[[ "$USE_EXISTING" == true ]] && { process_source; return; }
local url='https://raw.githubusercontent.com/jarelllama/Emerging-Threats/main/data/phishing.txt'
curl -sSL "$url" -o "$results_file"
}
source_fakewebsitebuster() {
source='fakewebsitebuster.com'
results_file="data/pending/domains_${source}.tmp"
[[ "$USE_EXISTING" == true ]] && { process_source; return; }
local url='https://fakewebsitebuster.com/category/website-reviews'
# Regarding grep pipe errors, see:
# https://github.com/jarelllama/Scam-Blocklist/issues/349
curl -sS --retry 2 --retry-all-errors "${url}/" \
| grep -oE 'rel="bookmark">.*</a></h2>' \
| grep -oE "([0-9]|[A-Z])${DOMAIN_REGEX}" \
| head -n 50 > "$results_file" # Keep only newest 50 results
}
source_guntab() {
source='guntab.com'
ignore_from_light=true
results_file="data/pending/domains_${source}.tmp"
[[ "$USE_EXISTING" == true ]] && { process_source; return; }
local url='https://www.guntab.com/scam-websites'
curl -sS --retry 2 --retry-all-errors "${url}/" \
| grep -zoE '<table class="datatable-list table">.*</table>' \
| grep -aoE "${DOMAIN_REGEX}$" > "$results_file"
# Note results are not sorted by time added
}
source_jeroengui_phishing() {
source='Jeroengui phishing'
ignore_from_light=true
results_file='data/pending/domains_jeroengui_phishing.tmp'
[[ "$USE_EXISTING" == true ]] && { process_source; return; }
local url='https://file.jeroengui.be/phishing/last_week.txt'
# Get URLs with no subdirectories, exclude IP addresses and extract domains
curl -sSL "$url" | grep -Po "^https?:https://\K${STRICT_DOMAIN_REGEX}(?=/?$)" \
> "$results_file"
}
source_jeroengui_scam() {
source='Jeroengui scam'
results_file='data/pending/domains_jeroengui_scam.tmp'
[[ "$USE_EXISTING" == true ]] && { process_source; return; }
local url='https://file.jeroengui.be/scam/last_week.txt'
curl -sSL "$url" | grep -Po "^https?:https://\K${STRICT_DOMAIN_REGEX}(?=/?$)" \
> "$results_file"
}
source_petscams() {
source='petscams.com'
results_file="data/pending/domains_${source}.tmp"
[[ "$USE_EXISTING" == true ]] && { process_source; return; }
local url='https://petscams.com'
# First page must not have '/page'
curl -sS --retry 2 --retry-all-errors "${url}/" >> results.tmp
curl -sSZ --retry 2 --retry-all-errors "${url}/page/[2-15]/" >> results.tmp
# Each page in theory should return 15 domains, but the regex also matches
# domains under 'Latest Articles' at the bottom of the page, so the number
# of domains returned per page may be >15.
# [:alpha:] is used because [a-z] does not seem to work here
# Matching '/">' ensures not matching false positives
grep -Po "<a href=\"https://petscams.com/[[:alpha:]-]+/\K${DOMAIN_DASH_REGEX}(?=/\">)" \
results.tmp | mawk '{sub(/-[0-9]$/, "", $0);
gsub(/-/, ".", $0); print $0}' > "$results_file"
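# For example (hypothetical slug): a captured 'fake-puppy-store-com-2' becomes
# 'fake.puppy.store.com' after stripping the trailing '-2' and converting dashes
# to dots.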
rm results.tmp
}
source_phishstats() {
source='PhishStats'
ignore_from_light=true
results_file='data/pending/domains_phishstats.tmp'
[[ "$USE_EXISTING" == true ]] && { process_source; return; }
local url='https://phishstats.info/phish_score.csv'
# Get URLs with no subdirectories, exclude IP addresses and extract domains
# (?=/?\"?$) is lookahead that matches an optional slash followed by an
# optional end quote at the end of the line.
curl -sSL "$url" | mawk -F ',' '{print $3}' \
| grep -Po "^\"?https?:https://\K${STRICT_DOMAIN_REGEX}(?=/?\"?$)" \
| sort -u -o "$results_file"
# Get matching NRDs for light version (Unicode ignored)
comm -12 "$results_file" nrd.tmp > phishstats_nrds.tmp
}
source_phishstats_nrd() {
# For the light version
# Only includes domains found in the NRD feed
source='PhishStats (NRDs)'
results_file='data/pending/domains_phishstats_nrd.tmp'
[[ "$USE_EXISTING" == true ]] && { process_source; return; }
mv phishstats_nrds.tmp "$results_file"
}
source_scamadviser() {
source='scamadviser.com'
results_file="data/pending/domains_${source}.tmp"
[[ "$USE_EXISTING" == true ]] && { process_source; return; }
local url='https://www.scamadviser.com/articles'
# Regarding grep pipe errors, see:
# https://github.com/jarelllama/Scam-Blocklist/issues/349
# Trailing slash intentionally omitted
curl -sSZ --retry 2 --retry-all-errors "${url}?p=[1-15]" \
| grep -oE '<h2 class="mb-0">.*</h2>' \
| grep -oE "([0-9]|[A-Z])${DOMAIN_REGEX}" > "$results_file"
}
source_scamdirectory() {
source='scam.directory'
results_file="data/pending/domains_${source}.tmp"
[[ "$USE_EXISTING" == true ]] && { process_source; return; }
local url='https://scam.directory/category'
curl -sS --retry 2 --retry-all-errors "${url}/" \
| grep -Po "href=\"/\K${DOMAIN_DASH_REGEX}(?=\" title)" \
| mawk 'NR<=50 {gsub(/-/, ".", $0); print $0}' > "$results_file"
# Keep only newest 50 results
}
source_stopgunscams() {
source='stopgunscams.com'
results_file="data/pending/domains_${source}.tmp"
[[ "$USE_EXISTING" == true ]] && { process_source; return; }
local url='https://stopgunscams.com'
# Regarding using /sitemap:
# https://github.com/jarelllama/Scam-Blocklist/issues/365
# Trailing slash intentionally omitted
curl -sS --retry 2 --retry-all-errors "${url}/sitemap" \
| grep -Po "class=\"rank-math-html-sitemap__link\">\K${DOMAIN_REGEX}" \
| head -n 100 > "$results_file" # Keep only newest 100 results
}
# Entry point
trap cleanup EXIT
$FUNCTION --format-all
source
build