Skip to content

Commit

Permalink
Deal with data:
Browse files Browse the repository at this point in the history
  • Loading branch information
naibo committed Nov 22, 2023
1 parent 4025e25 commit c197ff1
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 10 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"id":135,"name":"【软科排名】-中国最好学科排名|最权威的大学学科|高校学科排名","url":"https://www.shanghairanking.cn/rankings/bcsr/2023","links":"https://www.shanghairanking.cn/rankings/bcsr/2023","create_time":"2023/11/22 21:51:29","update_time":"2023/11/22 21:51:29","version":"0.5.0","saveThreshold":10,"cloudflare":0,"environment":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"current_time","inputExcel":"","startFromExit":0,"containJudge":false,"desc":"https://www.shanghairanking.cn/rankings/bcsr/2023","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.shanghairanking.cn/rankings/bcsr/2023","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.shanghairanking.cn/rankings/bcsr/2023"},{"id":1,"name":"loopTimes_循环_1","nodeId":4,"nodeName":"循环","desc":"循环循环执行的次数(0代表无限循环)","type":"int","exampleValue":0,"value":0}],"outputParameters":[{"id":0,"name":"参数1_链接文本","desc":"","type":"text","recordASField":1,"exampleValue":"湖北大学\n "},{"id":1,"name":"参数3_文本","desc":"","type":"text","recordASField":1,"exampleValue":"(哲学)"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.shanghairanking.cn/rankings/bcsr/2023","links":"https://www.shanghairanking.cn/rankings/bcsr/2023","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环","sequence":[3,4],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[1]/div[1]/div[1]/div[2]/div[2]/div[1]/div/div[2]/div/a[1]/span[2]","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":""}},{"id":3,"index":3,"parentId":2,"type":0,"option":2,"title":"点击元素","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":true,"xpath":"","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"maxWaitTime":10,"paras":[],"allXPaths":"","loopType":1}},{"id":4,"index":4,"parentId":2,"type":1,"option":8,"title":"循环","sequence":[6,5],"isInLoop":true,"position":1,"parameters":{"history":5,"tabIndex":-1,"useLoop":false,"xpath":"//*[contains(@class, \"ant-pagination-next\")]/a[1]","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":0,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":["/html/body/div[1]/div[1]/div[1]/div[2]/div[1]/div[3]/div[2]/div[1]/div[1]/ul[1]/li[5]/a[1]","//a[contains(., '')]","//A[@class='ant-pagination-item-link']","/html/body/div[last()-3]/div/div/div[last()-2]/div/div/div/div[last()-1]/div/ul/li[last()-1]/a"]}},{"id":6,"index":5,"parentId":4,"type":0,"option":2,"title":"点击元素","sequence":[],"isInLoop":true,"position":1,"parameters":{"history":5,"tabIndex":-1,"useLoop":true,"xpath":"","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"maxWaitTime":10,"paras":[],"allXPaths":["/html/body/div[1]/div[1]/div[1]/div[2]/div[1]/div[3]/div[2]/div[1]/div[1]/ul[1]/li[5]/a[1]","//a[contains(., '')]","//A[@class='ant-pagination-item-link']","/html/body/div[last()-3]/div/div/div[last()-2]/div/div/div/div[last()-1]/div/ul/li[last()-1]/a"],"loopType":0}},{"id":5,"index":6,"parentId":4,"type":1,"option":8,"title":"循环","sequence":[8,7],"isInLoop":true,"position":0,"parameters":{"history":5,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[1]/div[1]/div[1]/div[2]/div[1]/div[3]/div[2]/div[1]/div[1]/div[2]/table[1]/tbody[1]/tr/td[4]/div[1]/div[2]/div[1]/div[1]/div[1]/a[1]","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":["/html/body/div[1]/div[1]/div[1]/div[2]/div[1]/div[3]/div[2]/div[1]/div[1]/div[2]/table[1]/tbody[1]/tr[1]/td[4]/div[1]/div[2]/div[1]/div[1]/div[1]/a[1]","//a[contains(., '湖北大学')]","//A[@class='name-cn']","/html/body/div[last()-3]/div/div/div[last()-1]/div/div/div/div[last()-1]/div/div[last()-1]/table/tbody/tr[last()-29]/td[last()-1]/div/div/div[last()-1]/div/div/a"]}},{"id":8,"index":7,"parentId":5,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":1,"parameters":{"history":5,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"paras":[{"nodeType":1,"contentType":0,"relative":true,"name":"参数1_链接文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"湖北大学\n "}],"unique_index":"dgby6tuc79nlp9toe7t","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}],"loopType":1}},{"id":7,"index":8,"parentId":5,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":5,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"paras":[{"nodeType":0,"contentType":0,"relative":false,"name":"参数3_文本","desc":"","extractType":0,"relativeXPath":"/html/body/div[1]/div[1]/div[1]/div[2]/div[1]/div[3]/div[2]/div[1]/div[1]/div[1]/div[1]/div[3]","allXPaths":["/html/body/div[1]/div[1]/div[1]/div[2]/div[1]/div[3]/div[2]/div[1]/div[1]/div[1]/div[1]/div[3]","//div[contains(., '(哲学)')]","/html/body/div[last()-3]/div/div/div[last()-1]/div/div/div/div[last()-1]/div/div[last()-2]/div[last()-1]/div[last()-1]"],"exampleValues":[{"num":0,"value":"(哲学)"}],"unique_index":"xgoa18n26rllp9toqoa","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}]}}]}

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions .temp_to_pub/EasySpider_windows_x64/tasks/236.json

Large diffs are not rendered by default.

41 changes: 31 additions & 10 deletions ExecuteStage/easyspider_executestage.py
Original file line number Diff line number Diff line change
Expand Up @@ -894,6 +894,7 @@ def loopExecute(self, node, loopValue, clickPath="", index=0):
thisHistoryLength = 0
self.history["index"] = thisHistoryLength
self.history["handle"] = thisHandle
thisHitoryURL = self.browser.current_url
if int(node["parameters"]["loopType"]) == 0: # 单个元素循环
# 无跳转标签页操作
count = 0 # 执行次数
Expand Down Expand Up @@ -1033,14 +1034,19 @@ def loopExecute(self, node, loopValue, clickPath="", index=0):
self.browser.execute_script('window.stop()')
except:
pass
ti = 0
if self.browser.current_url.startswith("data:"):
try:
self.browser.execute_script("history.go(1)") # 如果是data:开头的网址,就前进一步
except: # 超时的情况下
pass
while self.browser.current_url != thisHitoryURL: # 如果执行完一次循环之后网址发生了变化
try:
self.browser.execute_script("history.go(1)") # 如果是data:开头的网址,就前进一步
except: # 超时的情况下
pass
ti += 1
if self.browser.current_url == thisHitoryURL or ti > thisHistoryLength: # 如果执行完一次循环之后网址发生了变化
break
time.sleep(2)
elements = self.browser.find_elements(By.XPATH,
xpath, iframe=node["parameters"]["iframe"])
xpath, iframe=node["parameters"]["iframe"])
if index > 0:
index -= 1 # 如果是data:开头的网址,就要重试一次
if int(node["parameters"]["breakMode"]) > 0: # 如果设置了退出循环的脚本条件
Expand Down Expand Up @@ -1110,14 +1116,29 @@ def loopExecute(self, node, loopValue, clickPath="", index=0):
self.browser.execute_script('window.stop()')
except:
pass
# if self.browser.current_url.startswith("data:"):
# try:
# self.browser.execute_script("history.go(1)") # 如果是data:开头的网址,就前进一步
# except: # 超时的情况下
# pass
# time.sleep(2)
# elements = self.browser.find_elements(By.XPATH,
# xpath, iframe=node["parameters"]["iframe"])
# if index > 0:
# index -= 1 # 如果是data:开头的网址,就要重试一次
ti = 0
if self.browser.current_url.startswith("data:"):
try:
self.browser.execute_script("history.go(1)") # 如果是data:开头的网址,就前进一步
except: # 超时的情况下
pass
while self.browser.current_url != thisHitoryURL: # 如果执行完一次循环之后网址发生了变化
try:
self.browser.execute_script("history.go(1)") # 如果是data:开头的网址,就前进一步
except: # 超时的情况下
pass
ti += 1
if self.browser.current_url == thisHitoryURL or ti > thisHistoryLength: # 如果执行完一次循环之后网址发生了变化
break
time.sleep(2)
elements = self.browser.find_elements(By.XPATH,
xpath, iframe=node["parameters"]["iframe"])
xpath, iframe=node["parameters"]["iframe"])
if index > 0:
index -= 1 # 如果是data:开头的网址,就要重试一次
except NoSuchElementException:
Expand Down

0 comments on commit c197ff1

Please sign in to comment.