具体代码如下
今天开始接触这个框架,反反爬的措施没做多少,一直被网站发现。
# -*- coding: utf-8 -*-
import re

import scrapy

from lagou.items import LagouItem


class LagouZhaopinSpider(scrapy.Spider):
    """Crawl lagou.com job listings: list pages -> per-job detail pages."""

    name = 'lagou_zhaopin'
    # Was [''] in the original: an empty domain string filters nothing useful.
    # NOTE(review): assumes the target host is lagou.com — confirm.
    allowed_domains = ['lagou.com']
    # The original relative "/zhaopin/1/" cannot be scheduled as a start URL.
    start_urls = ["https://www.lagou.com/zhaopin/1/"]

    def parse(self, response):
        """Extract title/addr from each listing, then follow detail and next-page links."""
        li_list = response.xpath("//li[@class='con_list_item default_list']")
        for i in li_list:
            data_dict = LagouItem()
            data_dict["title"] = i.xpath(".//h3/text()").extract()
            data_dict["addr"] = i.xpath(".//span[@class='add']/em/text()").extract()
            # Follow the detail page to collect the full job description.
            detail_url = i.xpath('.//a[@class="position_link"]/@href').extract_first()
            if detail_url:  # extract_first() returns None when the node is missing
                yield scrapy.Request(
                    response.urljoin(detail_url),  # tolerate relative hrefs
                    callback=self.parse_detail,
                    meta={"data_dict": data_dict},
                )

        # Request the next page; extract_first() yields a str (or None on the last page).
        next_page_url = response.xpath("//a[text()='下一页']/@href").extract_first()
        print(next_page_url)
        # On the last page the "next" anchor is a dead JS placeholder.
        if next_page_url and next_page_url != "javascript:;":
            yield scrapy.Request(response.urljoin(next_page_url), callback=self.parse)

    def parse_detail(self, response):
        """Extract the job-description text for the item carried in request meta."""
        data_dict = response.meta["data_dict"]
        # Strip whitespace noise (spaces, newlines, tabs) from the raw text nodes.
        content_list = response.xpath('//*[@id="job_detail"]/dd[2]/div//text()').extract()
        content_list = [re.sub(r"\s", "", i) for i in content_list]
        data_dict["detail"] = [i for i in content_list if len(i) > 0]  # drop now-empty strings
        yield data_dict
网上找了一堆user-agent
在setting.py设置
import random

# Pool of User-Agent strings collected from the web.
USER_AGENT_LIST = [
    'MSIE (MSIE 6.0; X11; Linux; i686) Opera 7.23',
    'Opera/9.20 (Macintosh; Intel Mac OS X; U; en)',
    'Opera/9.0 (Macintosh; PPC Mac OS X; U; en)',
    'iTunes/9.0.3 (Macintosh; U; Intel Mac OS X 10_6_2; en-ca)',
    'Mozilla/4.76 [en_jp] (X11; U; SunOS 5.8 sun4u)',
    'iTunes/4.2 (Macintosh; U; PPC Mac OS X 10.2)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:5.0) Gecko/0101 Firefox/5.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:9.0) Gecko/0101 Firefox/9.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:16.0) Gecko/0813 Firefox/16.0',
    'Mozilla/4.77 [en] (X11; I; IRIX;64 6.5 IP30)',
    'Mozilla/4.8 [en] (X11; U; SunOS; 5.7 sun4u)',
]

# NOTE(review): this runs once when settings are loaded, so the entire crawl
# still uses a single User-Agent; per-request rotation would need a
# downloader middleware.
USER_AGENT = random.choice(USER_AGENT_LIST)
在pipelines.py中打印结果
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: /en/latest/topics/item-pipeline.html
from pymongo import MongoClient

# Module-level connection, shared by every pipeline instance in this process.
client = MongoClient()
db = client["lagou"]


class LagouPipeline:
    """Echo each scraped item; the MongoDB insert is currently disabled."""

    def process_item(self, item, spider):
        # This is where the item would be persisted; for now it is only printed.
        print(item)
        # db.zhaopin.insert_one(dict(item))
        return item
运行结果如下:
使用体验:爬取速度非常快(但没有做反反爬措施,很容易被网站发现)。