-
Notifications
You must be signed in to change notification settings - Fork 16
/
browser.py
69 lines (57 loc) · 2.84 KB
/
browser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import time
from selenium import webdriver
from selenium.webdriver.edge.service import Service as EdgeService
from webdriver_manager.microsoft import EdgeChromiumDriverManager
from selenium.webdriver.edge.options import Options as EdgeOptions
from bs4 import BeautifulSoup, NavigableString
class Browser:
    """Headless Edge browser that loads a page and returns its text.

    A driver is started when the object is created. Every call to
    ``webpage_text`` shuts the browser down once the page has been read and
    the next call starts a fresh one on demand, so a single instance can be
    reused for several URLs. Call ``close()`` (or use the object in a
    ``with`` statement) to release a browser that was started but never used.
    """

    def __init__(self, proxy: str = None) -> None:
        """Remember the proxy setting and start the browser.

        Args:
            proxy: Proxy server address handed to Edge via ``--proxy-server``.
                A falsy value (the default) means no proxy is configured.
        """
        self.proxy = proxy
        self.driver = self.create_driver()

    def __enter__(self) -> "Browser":
        """Return the browser itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Shut the browser down when the ``with`` block ends."""
        self.close()

    def close(self) -> None:
        """Shut down the current browser session, if there is one.

        Safe to call repeatedly and on an instance whose driver was never
        created; afterwards ``self.driver`` is ``None``.
        """
        driver = getattr(self, "driver", None)
        self.driver = None
        if driver is not None:
            # quit() ends the whole session; close() would only close the
            # current window and leave the driver object unusable.
            driver.quit()

    def create_driver(self) -> "webdriver.Edge":
        """Build and return a headless Edge driver configured for scraping.

        Returns:
            A new Edge WebDriver using ``self.proxy`` when one is set.
        """
        edge_options = EdgeOptions()
        edge_options.add_argument("--headless")  # run without a visible window
        # INFO = 0, WARNING = 1, LOG_ERROR = 2, LOG_FATAL = 3 (default is 0)
        edge_options.add_argument('log-level=3')
        # Experimental options are stored by name, so both switches must be
        # passed in one call; a second call with the same name would replace
        # the first. 'enable-automation' is excluded to evade automation
        # detection, 'enable-logging' to suppress log output.
        edge_options.add_experimental_option(
            'excludeSwitches', ['enable-automation', 'enable-logging'])
        if self.proxy:
            edge_options.add_argument(f'--proxy-server={self.proxy}')
        # Configure the browser to skip loading images.
        prefs = {"profile.managed_default_content_settings.images": 2}
        edge_options.add_experimental_option("prefs", prefs)
        return webdriver.Edge(
            service=EdgeService(EdgeChromiumDriverManager().install()),
            options=edge_options)

    def webpage_text(self, url: str, wait: float = 1) -> str:
        """Load ``url`` and return the text content of the rendered page.

        Args:
            url: Address of the page to open.
            wait: Seconds to pause after navigation before the page source
                is read.

        Returns:
            The page text as produced by ``full_text``.
        """
        if self.driver is None:
            # A previous call (or close()) released the browser; start anew.
            self.driver = self.create_driver()
        try:
            self.driver.get(url)
            time.sleep(wait)
            html_content = self.driver.page_source
        finally:
            # Release the browser even when navigation fails.
            self.close()
        soup = BeautifulSoup(html_content, 'html.parser')
        return self.full_text(soup)

    def full_text(self, soup: "BeautifulSoup") -> str:
        """Return all text in ``soup``, stripped, one string per line."""
        return soup.get_text(separator='\n', strip=True)

    def distill_text(self, soup: "BeautifulSoup") -> str:
        """Return the text found directly inside a fixed set of tags.

        For every matching tag only its own string children are joined;
        text nested inside child tags is not included for that tag. Tags
        whose direct text is empty after stripping are skipped.

        Returns:
            The collected pieces joined with newlines.
        """
        tags_to_search = ['title', 'article', 'section', 'div', 'p', 'table',
                          'tr', 'td', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
        texts = []
        for tag in soup.find_all(tags_to_search):
            direct_text = ''.join(
                child for child in tag.children
                if isinstance(child, NavigableString)
            ).strip()
            if direct_text:
                texts.append(direct_text)
        return '\n'.join(texts)
if __name__ == '__main__':
    # Manual smoke test: fetch one page and show the extracted text.
    browser = Browser()
    url = 'https://latorc.com:888'
    text = browser.webpage_text(url)
    # A bare expression is only echoed in an interactive session; print the
    # result so running the file as a script actually shows it.
    print(text)