From 1ce5e280afa5d940e892082f8a760cc8c0bd9d31 Mon Sep 17 00:00:00 2001 From: NaiboWang-Alienware Date: Sun, 8 Oct 2023 17:09:27 +0800 Subject: [PATCH] Bug fix about history length for some website --- ExecuteStage/.vscode/launch.json | 2 +- ExecuteStage/easyspider_executestage.py | 64 +++++++++++++++++-------- 2 files changed, 45 insertions(+), 21 deletions(-) diff --git a/ExecuteStage/.vscode/launch.json b/ExecuteStage/.vscode/launch.json index 33c97bc7..d251c634 100644 --- a/ExecuteStage/.vscode/launch.json +++ b/ExecuteStage/.vscode/launch.json @@ -12,7 +12,7 @@ "justMyCode": false, // "args": ["--id", "[7]", "--read_type", "remote", "--headless", "0"] // "args": ["--id", "[9]", "--read_type", "remote", "--headless", "0", "--saved_file_name", "YOUTUBE"] - "args": ["--id", "[36]", "--headless", "0", "--user_data", "0", "--keyboard", "0"] + "args": ["--id", "[14]", "--headless", "0", "--user_data", "0", "--keyboard", "0"] } ] } \ No newline at end of file diff --git a/ExecuteStage/easyspider_executestage.py b/ExecuteStage/easyspider_executestage.py index b78fe99d..6e50fefd 100644 --- a/ExecuteStage/easyspider_executestage.py +++ b/ExecuteStage/easyspider_executestage.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- # import atexit +import undetected_chromedriver as uc from utils import download_image, get_output_code, isnotnull, lowercase_tags_in_xpath, myMySQL, new_line, on_press_creator, on_release_creator, replace_field_values, write_to_csv, write_to_excel, write_to_json from myChrome import MyChrome from threading import Thread, Event @@ -41,7 +42,6 @@ from lxml import etree import onnxruntime onnxruntime.set_default_logger_severity(3) # 隐藏onnxruntime的日志 -import undetected_chromedriver as uc # import pandas as pd # import numpy # import pytesseract @@ -157,7 +157,7 @@ def __init__(self, browser_t, id, service, version, event, saveName, config): self.OUTPUT.append([]) # 添加表头 self.writeMode = 0 elif self.outputFormat == "json": - self.writeMode = 3 # JSON模式无需判断是否存在文件 + self.writeMode = 3 # JSON模式无需判断是否存在文件 elif self.outputFormat == "mysql": self.mysql = myMySQL(config["mysql_config_path"]) self.mysql.create_table(self.saveName, service["outputParameters"]) @@ -409,7 +409,8 @@ def saveData(self, exit=False): elif self.outputFormat == "json": file_name = "Data/Task_" + \ str(self.id) + "/" + self.saveName + '.json' - write_to_json(file_name, self.OUTPUT, self.outputParametersTypes, self.outputParametersRecord, self.outputParameters.keys()) + write_to_json(file_name, self.OUTPUT, self.outputParametersTypes, + self.outputParametersRecord, self.outputParameters.keys()) elif self.outputFormat == "mysql": self.mysql.write_to_mysql( self.OUTPUT, self.outputParametersRecord, self.outputParametersTypes) @@ -647,7 +648,8 @@ def switchSelect(self, para, loopValue): optionValue = loopValue optionMode = 1 try: - xpath = replace_field_values(para["xpath"], self.outputParameters, self) + xpath = replace_field_values( + para["xpath"], self.outputParameters, self) dropdown = Select(self.browser.find_element( By.XPATH, xpath, iframe=para["iframe"])) try: @@ -678,7 +680,8 @@ def switchSelect(self, para, loopValue): def moveToElement(self, para, loopElement=None, loopPath="", index=0): time.sleep(0.1) # 移动之前等待0.1秒 loopPath = replace_field_values(loopPath, self.outputParameters, self) - xpath = replace_field_values(para["xpath"], self.outputParameters, self) + xpath = replace_field_values( + para["xpath"], self.outputParameters, self) if para["useLoop"]: # 使用循环的情况下,传入的clickPath就是实际的xpath if xpath == "": path = loopPath @@ -873,8 +876,11 @@ def judgeExecute(self, node, loopElement, clickPath="", index=0): def loopExecute(self, node, loopValue, clickPath="", index=0): time.sleep(0.1) # 第一次执行循环的时候强制等待1秒 thisHandle = self.browser.current_window_handle # 记录本次循环内的标签页的ID - thisHistoryLength = self.browser.execute_script( - 'return history.length') # 记录本次循环内的history的length + try: + thisHistoryLength = self.browser.execute_script( + 'return history.length') # 记录本次循环内的history的length + except: + thisHistoryLength = 0 self.history["index"] = thisHistoryLength self.history["handle"] = thisHandle if int(node["parameters"]["loopType"]) == 0: # 单个元素循环 @@ -1009,7 +1015,7 @@ def loopExecute(self, node, loopValue, clickPath="", index=0): # else: # time.sleep(2) # 切换历史记录等待: - self.recordLog("Change history back time or: ", + self.recordLog("Change history back time or: ", node["parameters"]["historyWait"]) try: self.browser.execute_script('window.stop()') @@ -1030,7 +1036,8 @@ def loopExecute(self, node, loopValue, clickPath="", index=0): # 千万不要忘了分割!! for path in node["parameters"]["pathList"].split("\n"): try: - path = replace_field_values(path, self.outputParameters, self) + path = replace_field_values( + path, self.outputParameters, self) element = self.browser.find_element( By.XPATH, path, iframe=node["parameters"]["iframe"]) # self.recordLog("循环元素|Loop element:", path) @@ -1224,13 +1231,17 @@ def openPage(self, para, loopValue): "return history.length") except: self.history["index"] = 0 + except Exception as e: + self.print_and_log("History Length Error") + self.history["index"] = 0 self.scrollDown(para) # 控制屏幕向下滚动 # 键盘输入事件 def inputInfo(self, para, loopValue): time.sleep(0.1) # 输入之前等待0.1秒 try: - xpath = replace_field_values(para["xpath"], self.outputParameters, self) + xpath = replace_field_values( + para["xpath"], self.outputParameters, self) textbox = self.browser.find_element( By.XPATH, xpath, iframe=para["iframe"]) # textbox.send_keys(Keys.CONTROL, 'a') @@ -1289,8 +1300,10 @@ def clickElement(self, para, loopElement=None, clickPath="", index=0): try: # element = self.browser.find_element( # By.XPATH, path, iframe=para["iframe"]) - clickPath = replace_field_values(clickPath, self.outputParameters, self) - xpath = replace_field_values(para["xpath"], self.outputParameters, self) + clickPath = replace_field_values( + clickPath, self.outputParameters, self) + xpath = replace_field_values( + para["xpath"], self.outputParameters, self) if para["useLoop"]: # 使用循环的情况下,传入的clickPath就是实际的xpath if xpath == "": path = clickPath @@ -1375,6 +1388,9 @@ def clickElement(self, para, loopElement=None, clickPath="", index=0): pass self.history["index"] = self.browser.execute_script( "return history.length") + except Exception as e: + self.print_and_log("History Length Error") + self.history["index"] = 0 else: try: self.history["index"] = self.browser.execute_script( @@ -1387,6 +1403,9 @@ def clickElement(self, para, loopElement=None, clickPath="", index=0): self.history["index"] = self.browser.execute_script( "return history.length") # 如果打开了新窗口,切换到新窗口 + except Exception as e: + self.print_and_log("History Length Error") + self.history["index"] = 0 self.scrollDown(para) # 根据参数配置向下滚动 # rt.end() @@ -1556,7 +1575,8 @@ def clearOutputParameters(self): # 提取数据事件 def getData(self, para, loopElement, isInLoop=True, parentPath="", index=0): - parentPath = replace_field_values(parentPath, self.outputParameters, self) + parentPath = replace_field_values( + parentPath, self.outputParameters, self) if para["clear"] == 1: self.clearOutputParameters() try: @@ -1762,8 +1782,8 @@ def getData(self, para, loopElement, isInLoop=True, parentPath="", index=0): if __name__ == '__main__': from multiprocessing import freeze_support - freeze_support() # 防止无限死循环多开 - + freeze_support() # 防止无限死循环多开 + # 如果需要调试程序,请在命令行参数中加入--keyboard 0 来禁用键盘监听以提升调试速度 # If you need to debug the program, please add --keyboard 0 in the command line parameters to disable keyboard listening to improve debugging speed config = { @@ -1959,13 +1979,17 @@ def getData(self, para, loopElement, isInLoop=True, parentPath="", index=0): elif cloudflare == 1: if sys.platform == "win32": options.binary_location = "C:\\Program Files\\Google\\Chrome Beta\\Application\\chrome.exe" # 需要用自己的浏览器 - # options.add_argument("--auto-open-devtools-for-tabs") + # options.add_argument("--auto-open-devtools-for-tabs") # options.binary_location = "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe" # 需要用自己的浏览器 - browser_t = MyUCChrome(options=options, driver_executable_path=driver_path) + browser_t = MyUCChrome( + options=options, driver_executable_path=driver_path) links = list(filter(isnotnull, service["links"].split("\n"))) - browser_t.execute_script('window.open("'+ links[0] +'","_blank");') # open page in new tab - time.sleep(5) # wait until page has loaded - browser_t.switch_to.window(browser_t.window_handles[1]) # switch to new tab + # open page in new tab + browser_t.execute_script( + 'window.open("' + links[0] + '","_blank");') + time.sleep(5) # wait until page has loaded + browser_t.switch_to.window( + browser_t.window_handles[1]) # switch to new tab # browser_t = uc.Chrome() else: print("Cloudflare模式只支持Windows x64平台。")