700字范文,内容丰富有趣,生活中的好帮手!
700字范文 > python爬虫1——获取网站源代码(豆瓣图书top250信息)

python爬虫1——获取网站源代码(豆瓣图书top250信息)

时间:2020-10-21 20:11:37

相关推荐

python爬虫1——获取网站源代码(豆瓣图书top250信息)

# -*- coding: utf-8 -*-
"""Scraper for the Douban Books Top 250 list.

Fetches each paginated results page, extracts per-book fields
(title, author/publisher line, rating, rating count) with regular
expressions, and appends them to ``bookList.txt``.
"""
import re


class Spider(object):
    """Scrapes book entries from the Douban Top 250 pages."""

    # Full URL template of the paginated list. The original code used a
    # relative path ('/top250?start=0'), which requests cannot fetch.
    BASE_URL = 'https://book.douban.com/top250?start={}'

    def __init__(self):
        print('开始爬取豆瓣图书top250的内容。。。。。。')

    def getSourceCode(self, url):
        """Return the HTML source of *url* as text."""
        # Imported lazily: 'requests' is third-party and only needed for
        # network fetches; the regex helpers below work without it.
        import requests
        html = requests.get(url)
        return html.text

    def getEveryBookContent(self, sourceCode):
        """Split the page source into one <table>...</table> chunk per book."""
        return re.findall(r'<table width="100%">(.*?)</table>', sourceCode, re.S)

    def getBookInfo(self, eachBookContent):
        """Extract {title, author, discussNum, score} from one book chunk.

        Raises AttributeError if a field's pattern is absent (``.group``
        on a failed search), matching the original behavior.
        """
        bookInfo = {}
        # Strip whitespace/markup noise around the anchor text.
        bookInfo['title'] = re.sub(
            r'( |\n|<br/>|</?span.*?>)', "",
            re.search(r'<a href=.*?>(.*?)</a>', eachBookContent, re.S).group(1))
        bookInfo['author'] = re.search(
            r'<p class="pl">(.*?)</p>', eachBookContent, re.S).group(1)
        bookInfo['discussNum'] = re.sub(
            r'( |\n|<br/>)', "",
            re.search(r'<span class="pl">\((.*?)\)</span>',
                      eachBookContent, re.S).group(1))
        bookInfo['score'] = re.search(
            r'<span class="rating_nums">(.*?)</span>',
            eachBookContent, re.S).group(1)
        return bookInfo

    def saveBookInfo(self, bookList):
        """Append each book's fields to bookList.txt."""
        # Context manager guarantees the handle is closed even if a write
        # raises (the original called close() without try/finally), and an
        # explicit encoding keeps the Chinese text portable across platforms.
        with open("bookList.txt", "a", encoding="utf-8") as f:
            for each in bookList:
                f.writelines('书 名:\t {}\n'.format(each['title']))
                f.writelines('作 者:\t {}\n'.format(each['author']))
                f.writelines('评论数:\t {}\n'.format(each['discussNum']))
                f.writelines('评 分:\t {}\n\n'.format(each['score']))

    def start(self, url):
        """Scrape one results page and persist its books."""
        sourceCode = self.getSourceCode(url)
        bookList = [self.getBookInfo(each)
                    for each in self.getEveryBookContent(sourceCode)]
        self.saveBookInfo(bookList)


if __name__ == '__main__':
    douban = Spider()
    # 10 pages of 25 books each: start=0, 25, ..., 225.
    for offset in range(0, 250, 25):
        douban.start(Spider.BASE_URL.format(offset))

本内容不代表本网观点和政治立场,如有侵犯你的权益请联系我们处理。
网友评论
网友评论仅供其表达个人看法,并不表明网站立场。