forked from Kikobeats/top-crawler-agents
-
Notifications
You must be signed in to change notification settings - Fork 0
/
generate.js
50 lines (42 loc) · 1.46 KB
/
generate.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
/* global AbortController, fetch */
'use strict'
const crawlers = require('crawler-user-agents')
const { writeFile } = require('fs/promises')
const { load } = require('cheerio')
const pFilter = require('p-filter')
const pEvery = require('p-every')
// Marker printed next to each user agent, looked up by the stringified
// boolean outcome of its verification.
const CHECK = { true: '✅', false: '❌' }

// Upper bound on user agents being verified at the same time.
const MAX_CONCURRENCY = 10

// Milliseconds a single request may take before it is aborted.
const REQ_TIMEOUT = 10000

// Every user agent string listed under any crawler entry, de-duplicated.
const candidates = Array.from(
  new Set(crawlers.flatMap(({ instances }) => instances))
)

// Pages each candidate is tested against.
const URLS = [
  'https://twitter.com/Kikobeats/status/1687837848802578432',
  'https://www.tesla.com/ms/order/5YJSA1E21MF426731'
]
/**
 * Build a checker that requests a URL while presenting `userAgent` and
 * reports whether the returned HTML carries an `og:image` meta tag.
 *
 * @param {string} userAgent - value sent as the `user-agent` request header.
 * @returns {(url: string) => Promise<boolean>} async checker resolving to
 *   `true` when `meta[property="og:image"]` has a non-empty `content`
 *   attribute, and to `false` otherwise. It never rejects: any error raised
 *   while fetching, reading or parsing (including the timeout abort) is
 *   reported as `false`.
 */
const verifyUrl = userAgent => async url => {
  const controller = new AbortController()
  // Abort the request once it has been running for REQ_TIMEOUT milliseconds.
  const timer = setTimeout(() => controller.abort(), REQ_TIMEOUT)
  try {
    const res = await fetch(url, {
      signal: controller.signal,
      // `redirect` is a fetch option. Nested inside `headers` it was sent as
      // a literal HTTP header and redirects were still followed.
      redirect: 'manual',
      headers: { 'user-agent': userAgent }
    })
    const html = await res.text()
    const $ = load(html)
    const imageUrl = $('meta[property="og:image"]').attr('content')
    return Boolean(imageUrl)
  } catch (_) {
    // Best effort on purpose: a failing request means "not verified".
    return false
  } finally {
    // Release the timer so finished requests do not keep the process alive.
    clearTimeout(timer)
  }
}
/**
 * Test one user agent against every entry of URLS and log the outcome.
 *
 * @param {string} userAgent - user agent string under test.
 * @param {number} index - zero-based position of `userAgent` in the list
 *   being filtered, as supplied by the caller.
 * @returns {Promise<boolean>} the boolean produced by `pEvery` over URLS
 *   with the checker built by `verifyUrl(userAgent)`.
 */
const verify = async (userAgent, index) => {
  const result = await pEvery(URLS, verifyUrl(userAgent))
  // `index` starts at 0; print a 1-based position so progress ends at N/N.
  console.log(`${CHECK[result]} ${index + 1}/${candidates.length} ${userAgent}`)
  return result
}
// Entry point: keep the candidates for which `verify` resolves truthy
// (at most MAX_CONCURRENCY checks in flight), write them to index.json
// and print a summary.
Promise.resolve()
  .then(() => pFilter(candidates, verify, { concurrency: MAX_CONCURRENCY }))
  .then(async result => {
    await writeFile('index.json', JSON.stringify(result, null, 2))
    console.log(`\nGenerated ${result.length} crawlers ✨`)
  })
  .catch(error => {
    // Report the failure explicitly instead of leaving the chain floating,
    // and signal it to the caller through a non-zero exit code.
    console.error(error)
    process.exitCode = 1
  })