"""Scrape Douban book search results for "python" and store them as JSON.

The search page is rendered in a headless browser (Selenium + PhantomJS)
and the resulting HTML is parsed with lxml.  According to the original
author's notes, fetching the same URL with a plain ``requests.get`` returns
an encrypted payload that is reversible in principle but hard to decode,
which is why the browser-based approach is used; the alternative listing
``/tag/python?start=%s&type=T`` could be fetched directly but yields fewer
records.
"""
import json
import os
import ssl  # not used by the live code path; kept because other code may rely on it
import time

import requests  # only needed by the abandoned plain-HTTP approach described above
from lxml import etree
from selenium import common, webdriver

# Directory for optional page screenshots (see spider()).
root_dir = 'douban/img'
# Output file that receives the scraped records.
DATA_FILE = 'douban/douban_data.json'
# The search URL advances its ``start`` parameter by this many records per page.
PAGE_SIZE = 15
# Seconds to wait after navigation so the page's JavaScript can render results.
PAGE_LOAD_WAIT = 3

# makedirs (not mkdir) so a missing parent "douban/" directory is created too.
os.makedirs(root_dir, exist_ok=True)

# Shared browser instance, created lazily by _get_driver() so that merely
# importing this module does not launch a browser process.
driver = None


def _get_driver():
    """Return the shared PhantomJS driver, starting it on first use."""
    global driver
    if driver is None:
        # NOTE(review): PhantomJS support was deprecated in Selenium 3.8 and
        # removed in Selenium 4; switch to headless Chrome/Firefox when upgrading.
        driver = webdriver.PhantomJS()
    return driver


def _first(node, path, as_text=False):
    """Return the first XPath match under *node*, or None when nothing matches.

    With ``as_text=True`` the match is an element and its ``.text`` is
    returned; otherwise the match itself (e.g. an attribute value) is returned.
    """
    hits = node.xpath(path)
    if not hits:
        return None
    return hits[0].text if as_text else hits[0]


def _save_books(data, file_name):
    """Serialize *data* to *file_name* as UTF-8 JSON, replacing any old file."""
    parent = os.path.dirname(file_name)
    if parent:
        os.makedirs(parent, exist_ok=True)
    dj = json.dumps(data, ensure_ascii=False)
    with open(file_name, 'w', encoding='utf-8') as f:
        f.write(dj)
    print(len(data), type(dj))


def spider(page, file_name=DATA_FILE):
    """Render one search-result page in the browser and parse it.

    Args:
        page: Zero-based page index; converted to the ``start`` offset.
        file_name: Forwarded to content_parse(); pass None to skip writing.

    Returns:
        The list of book records found on the page.
    """
    # NOTE(review): this URL has no scheme or host, and WebDriver navigation
    # expects an absolute URL -- confirm the intended site root and prepend it.
    base_url = '/subject_search?search_text=python&cat=1001&start=%s' % (page * PAGE_SIZE)
    browser = _get_driver()
    browser.get(base_url)
    time.sleep(PAGE_LOAD_WAIT)
    # Debugging aid: browser.save_screenshot(root_dir + "/%s.png" % page)
    return content_parse(browser.page_source, file_name=file_name)


def content_parse(content, file_name=DATA_FILE):
    """Extract book records from a rendered search page.

    Each record is a dict with the keys ``book_name``, ``book_src``,
    ``book_href``, ``book_score`` and ``book_details``.  A field whose node
    is absent from the page is stored as None (previously an empty list,
    which mixed types in the JSON output).

    Args:
        content: HTML source of the page.
        file_name: Where to write this page's records as JSON.  The file is
            overwritten, so callers that scrape several pages should pass
            None and write the combined list once.

    Returns:
        The list of record dicts for this page.
    """
    tree = etree.HTML(content)
    books = tree.xpath('//div[@class="item-root"]')
    print(len(books))

    data = []
    for book in books:
        record = {
            'book_name': _first(book, './/div[@class="title"]/a', as_text=True),
            'book_src': _first(book, './a/img/@src'),
            'book_href': _first(book, './/div[@class="title"]/a/@href'),
            'book_score': _first(book, './/span[@class="rating_nums"]', as_text=True),
            'book_details': _first(book, './/div[@class="meta abstract"]', as_text=True),
        }
        # Echo the record in the same field order as it is stored.
        for value in record.values():
            print(value)
        print('~~~~~~~~~~~~~~~~~~~~`')
        data.append(record)

    if file_name is not None:
        _save_books(data, file_name)
    return data


if __name__ == '__main__':
    all_books = []
    try:
        for i in range(2):
            # Collect in memory; writing per page would overwrite earlier pages.
            all_books.extend(spider(i, file_name=None))
    finally:
        # Always shut the headless browser down, even if a page fails.
        if driver is not None:
            driver.quit()
    _save_books(all_books, DATA_FILE)
/Library/Frameworks/Python.framework/Versions/3.6/bin/python3.6 /Users/apple/PycharmProjects/stage4/spider/_3_13/03douban.py
15
Head First Python(中文版) : Head First Python
/view/subject/l/public/s27262723.jpg
/subject/10561367/
7.9
巴里(Barry.P.) / 林琪 等 / 中国电力出版社 / -3-1 / 68.00元
~~~~~~~~~~~~~~~~~~~~`
Python灰帽子 : 黑客与逆向工程师的Python编程之道
/view/subject/l/public/s4676930.jpg
/subject/6025284/
7.5
[美] Justin Seitz / 丁赟卿 译 / 崔孝晨 审校 / 电子工业出版社 / -3 / 39.00元
~~~~~~~~~~~~~~~~~~~~`
Python网络编程基础 : 使用Python构建网络程序的综合指南
/view/subject/l/public/s2604186.jpg
/subject/2152386/
7.2
John Goerzen / 莫迟 等 / 电子工业出版社 / / 68.00元
~~~~~~~~~~~~~~~~~~~~`
Python基础教程
/view/subject/l/public/s4387251.jpg
/subject/4866934/
8.0
Magnus Lie Hetland / 司维 / 曾军崴 / 谭颖华 / 人民邮电出版社 / -7 / 69.00元
~~~~~~~~~~~~~~~~~~~~`
Python Algorithms : Mastering Basic Algorithms in the Python Language
/view/subject/l/public/s6999960.jpg
/subject/4915945/
8.9
Magnus Lie Hetland / Apress / -11-24 / USD 49.99
~~~~~~~~~~~~~~~~~~~~`
父与子的编程之旅 : 与小卡特一起学Python
/view/subject/l/public/s28825823.jpg
/subject/26005639/
8.5
桑德 (Warren Sande) / 桑德 (Carter Sande) / 苏金国 / 易郑超 / 人民邮电出版社 / -10-1 / CNY 69.00
~~~~~~~~~~~~~~~~~~~~`
Python核心编程(第二版)
/view/subject/l/public/s3140466.jpg
/subject/3112503/
7.7
[美]Wesley J. Chun(陳仲才) / CPUG / 人民邮电出版社 / -06 / 89.00元
~~~~~~~~~~~~~~~~~~~~`
利用Python进行数据分析
/view/subject/l/public/s27275372.jpg
/subject/25779298/
8.5
Wes McKinney / 唐学韬 / 机械工业出版社 / -11-18 / 89.00
~~~~~~~~~~~~~~~~~~~~`
贝叶斯思维 : 统计建模的Python学习法
/view/subject/l/public/s28023092.jpg
/subject/26340992/
7.4
[美]Allen B. Downey(艾伦·唐尼) / 许杨毅 / 人民邮电出版社 / -3 / 49.00
~~~~~~~~~~~~~~~~~~~~`
Django Web开发指南 : Python Web Development with Django
/view/subject/l/public/s3789820.jpg
/subject/3740086/
6.5
Jeff Forcier / Paul Bissex / 徐旭铭 / 机械工业出版社 / -5 / 49.00元
~~~~~~~~~~~~~~~~~~~~`
Python源码剖析 : 深度探索动态语言核心技术
/view/subject/l/public/s3435132.jpg
/subject/3117898/
8.7
陈儒 / 电子工业出版社 / -6 / 69.80元
~~~~~~~~~~~~~~~~~~~~`
Python学习手册 : (第3版)
/view/subject/l/public/s3952568.jpg
/subject/3948354/
8.2
Mark Lutz / 侯靖 / 机械工业出版社 / -8 / 89.00元
~~~~~~~~~~~~~~~~~~~~`
Python编程:从入门到实践 : 从入门到实践
/view/subject/l/public/s28891775.jpg
/subject/26829016/
9.1
[美]埃里克·马瑟斯 / 袁国忠 / 人民邮电出版社 / -7-1 / CNY 89.00
~~~~~~~~~~~~~~~~~~~~`
Python入门经典 : 以解决计算问题为导向的Python编程实践
/view/subject/l/public/s11430346.jpg
/subject/11610789/
8.2
(美)William F. Punch/Richard Enbody / 张敏 / 机械工业出版社 / -8-1 / 79.00元
~~~~~~~~~~~~~~~~~~~~`
学习Python : Learning Python, Second Edition
/view/subject/l/public/s1436455.jpg
/subject/1426816/
8.2
Mark Lutz / David Ascher / 东南大学出版社 / 6月 / 68.00元
~~~~~~~~~~~~~~~~~~~~`
15 <class 'str'>
15
python绝技:运用python成为顶级黑客 : 运用Python成为顶级黑客
/view/subject/l/public/s28385338.jpg
/subject/26702570/
7.6
[美] TJ O'Connor / 崔孝晨 / 武晓音 / 电子工业出版社 / -1 / 79.00元
~~~~~~~~~~~~~~~~~~~~`
Expert Python Programming : Best practices for designing, coding, and distributing your Pyth
/view/subject/l/public/s29690103.jpg
/subject/3285148/
8.3
Tarek Ziadé / Packt Publishing / -9-26 / USD 44.99
~~~~~~~~~~~~~~~~~~~~`
"笨办法"学Python
/view/subject/l/public/s27836847.jpg
/subject/26264642/
7.9
肖 (Zed A.Shaw) / 王巍巍 / 人民邮电出版社 / -11-1 / CNY 49.00
~~~~~~~~~~~~~~~~~~~~`
Effective Python : 编写高质量Python代码的59个有效方法
/view/subject/l/public/s28384052.jpg
/subject/26709315/
8.7
布雷特·斯拉特金(Brett Slatkin) / 爱飞翔 / 机械工业出版社 / -1 / 59
~~~~~~~~~~~~~~~~~~~~`
Python语言及其应用 : Python语言及其应用
/view/subject/l/public/s28352586.jpg
/subject/26675127/
8.4
[美] Bill Lubanovic / 丁嘉瑞 / 梁杰 / 禹常隆 / 人民邮电出版社 / -1 / 79.00元
~~~~~~~~~~~~~~~~~~~~`
A Byte of Python
/view/subject/l/public/s4612135.jpg
/subject/5948760/
8.7
Swaroop C H / Lulu Marketplace / -10-1 / USD 27.98
~~~~~~~~~~~~~~~~~~~~`
Python Cookbook : (第2版)中文版
/view/subject/l/public/s4357883.jpg
/subject/4828875/
8.6
Alex Martelli / Anna Ravenscroft / David Ascher / 高铁军 / 人民邮电出版社 / -5-1 / 99.00元
~~~~~~~~~~~~~~~~~~~~`
Dive Into Python
/view/subject/l/public/s29694522.jpg
/subject/1440658/
8.2
Mark Pilgrim / Apress / -11-5 / GBP 31.49
~~~~~~~~~~~~~~~~~~~~`
Python学习手册(第4版)
/view/subject/l/public/s4683230.jpg
/subject/6049132/
7.9
[美] Mark Lutz / 李军 / 刘红伟 / 机械工业出版社 / -4 / 119.00元
~~~~~~~~~~~~~~~~~~~~`
Flask Web Development : Developing Web Applications with Python
/view/subject/l/public/s27205547.jpg
/subject/25814739/
8.3
Miguel Grinberg / O'Reilly Media / -5-25 / USD 24.99
~~~~~~~~~~~~~~~~~~~~`
可爱的Python
/view/subject/l/public/s3901817.jpg
/subject/3884108/
7.4
哲思社区 / 电子工业出版社 / -9 / 55.00元
~~~~~~~~~~~~~~~~~~~~`
流畅的Python
/view/subject/l/public/s29434304.jpg
/subject/27028517/
9.3
[巴西] Luciano Ramalho / 安道 / 吴珂 / 人民邮电出版社 / -5-15 / 139元
~~~~~~~~~~~~~~~~~~~~`
Python网络数据采集
/view/subject/l/public/s29086659.jpg
/subject/26740503/
7.7
米切尔 (Ryan Mitchell) / 陶俊杰 / 陈小莉 / 人民邮电出版社 / -3-1 / CNY 59.00
~~~~~~~~~~~~~~~~~~~~`
Effective Python : 59 Specific Ways to Write Better Python
/view/subject/l/public/s28008426.jpg
/subject/26312313/
8.3
Brett Slatkin / Addison-Wesley Professional / -3-8 / USD 39.99
~~~~~~~~~~~~~~~~~~~~`
Python高级编程
/view/subject/l/public/s4163751.jpg
/subject/4212921/
7.6
Tarek Ziadé / 姚军 / 夏海轮 / 译 / 人民邮电出版社 / -1 / 45.00元
~~~~~~~~~~~~~~~~~~~~`
15 <class 'str'>
Process finished with exit code 0