Crawling the cnblogs front page: scrapy_cnblog
As before, create the project from the command line first, then write each file in turn.
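For reference, the skeleton can be generated like this (the project name cnblog matches the import paths used below; genspider just saves typing the spider boilerplate):

scrapy startproject cnblog
cd cnblog
scrapy genspider cnblog_spider cnblogs.com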
items.py:

import scrapy

class CnblogItem(scrapy.Item):
    title = scrapy.Field()  # the titles to scrape
    link = scrapy.Field()   # the links to scrape
cnblog_spider.py:

import scrapy
from cnblog.items import CnblogItem

class CnblogSpiderSpider(scrapy.Spider):
    name = "cnblog_spider"
    allowed_domains = ["cnblogs.com"]
    url = 'https://www.cnblogs.com/sitehome/p/'
    offset = 1
    start_urls = [url + str(offset)]

    def parse(self, response):
        item = CnblogItem()
        item['title'] = response.xpath('//a[@class="titlelnk"]/text()').extract()  # locate titles with XPath
        item['link'] = response.xpath('//a[@class="titlelnk"]/@href').extract()
        yield item
        print("Finished crawling page {0}".format(self.offset))
        if self.offset < 10:  # how many pages to crawl
            self.offset += 1
            url2 = self.url + str(self.offset)  # build the URL of the next page
            print(url2)
            yield scrapy.Request(url=url2, callback=self.parse)
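If the selectors come back empty, the front-page markup has likely changed since this was written (the titlelnk class is an assumption about the page markup at the time). The XPath is easy to check interactively before running the whole spider:

scrapy shell 'https://www.cnblogs.com/sitehome/p/1'
>>> response.xpath('//a[@class="titlelnk"]/text()').extract()[:3]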
pipelines.py:

class FilePipeline(object):
    def process_item(self, item, spider):
        data = ''
        with open('cnblog.txt', 'a', encoding='utf-8') as f:
            titles = item['title']
            links = item['link']
            for i, j in zip(titles, links):
                data += i + ' ' + j + '\n'
            f.write(data)
            # no explicit f.close() needed; the with block closes the file
        return item
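As a variation, not part of the original project: if you would rather keep structured output, a pipeline writing one JSON object per article could look roughly like this (JsonPipeline and cnblog.json are made-up names; register it in ITEM_PIPELINES instead of FilePipeline if you use it):

import json

class JsonPipeline(object):
    def process_item(self, item, spider):
        # same pairing of titles and links as FilePipeline, serialized as JSON lines
        with open('cnblog.json', 'a', encoding='utf-8') as f:
            for title, link in zip(item['title'], item['link']):
                f.write(json.dumps({'title': title, 'link': link}, ensure_ascii=False) + '\n')
        return item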
settings.py (only the parts that change):

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    # newly added User-Agent
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
}

# newly modified
ITEM_PIPELINES = {
    'cnblog.pipelines.FilePipeline': 300,  # save the results to a txt file
}
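One more settings.py detail worth checking, an assumption on my part rather than something this project changes: newer Scrapy project templates default to ROBOTSTXT_OBEY = True, which can cause requests to be filtered out. If the spider fetches nothing, try:

ROBOTSTXT_OBEY = False  # assumption: the default True may block this crawl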
main.py:

from scrapy import cmdline

cmdline.execute("scrapy crawl cnblog_spider --nolog".split())  # --nolog runs without showing the log; remove it if you want the details
With that, the example is done. Run main.py and a cnblog.txt file will be generated containing everything we scraped.
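Equivalently, you can skip main.py and launch the spider straight from the project directory:

scrapy crawl cnblog_spider --nolog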