-
Notifications
You must be signed in to change notification settings - Fork 513
/
taobao2.0.py
71 lines (56 loc) · 2.29 KB
/
taobao2.0.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# -*- coding: utf-8 -*-
"""
Created on Mon Jan 28 11:00:07 2019
@author: Administrator
"""
import requests
import re
def getHTMLText(url,headers):
try:
r=requests.get(url,headers=headers,timeout=30)
r.raise_for_status # 如果是200,表示返回的内容正确;如果不是200,会产生HttpError异常
r.encoding=r.apparent_encoding# 将对文本中分析的编码来替换整体的编码
#print(r.text[:1000])
return r.text
except:
print ("urlError")
def parserPage(ilt,html):
try:
plt=re.findall(r'"view_price":"[\d.]*"',html)# 匹配键值对——"view_price":"[\d.]*"
tlt=re.findall(r'"raw_title":".*?"',html) # *?表示前一个字符0或无限次扩展,最小匹配(只取到最后一个”为止的内容,这样就约束了匹配的内容就是商品本身的名字)
'''
去掉view_price字段,只取价格部分
eval函数用来执行字符串表达式,并返回表达式的值(相当于将外面的双引号去掉)
这里string.split(pattern)不是正则表达式re.split(pattern, string),是将字符串分割,[1]是获得键值对的后面部分
'''
for i in range(len(plt)):
price=eval(plt[i].split(':')[1])
title=eval(tlt[i].split(':')[1])
ilt.append([price,title])
#print(ilt)
except:
print ("parserError")
def printGoodsList(ilt):
tplt="{:4}\t{:8}\t{:16}"# 定义输出格式的模板
print(tplt.format("num","price","name"))
count=0
for g in ilt:
count=count+1
print(tplt.format(count,g[0],g[1]))
def main():
goods='爬虫书'
depth=3 # 爬取的深度,即有多个网页时决定爬几个网页
headers = {"User_Agent":"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
"Cookie" : "登陆淘宝后F12在控制台中输入document.cookie获取Cookie"
}
start_url="https://s.taobao.com/search?q=" + goods
infoList=[]
for i in range(depth):
try:
url=start_url+'&s='+str(44*i)
html=getHTMLText(url,headers)
parserPage(infoList,html)
except:
continue
printGoodsList(infoList)
main()