豆瓣 top250 电影 python爬虫 数据分析 词云 饼图 柱状图

时间:2019-09-10 09:10:39


豆瓣 top250 电影 python爬虫 数据分析 词云 饼图 柱状图

最近突然对 Python 爬虫产生了兴趣,就花了两三天时间自己动手试了试。借鉴了一些他人的代码,也在网上查了不少资料,最终实现了自己想要的分析。



# -*- coding: utf-8 -*-
"""
Created on Sun May 5 14:08:23

@author: zhen chen
MIT Licence.
Python version: 3.7
Email: robinchen@

Description: A practice crawler for the movie review website Douban. It
crawls the Top 250 movie list, exports the result to an .xls workbook and
then analyses it: pie charts of countries and genres, a word cloud of the
representative quotes, and bar charts of the best rated / most reviewed
movies.
"""
import collections
from collections import Counter  # counts the occurrences of each element
import random
import re  # regular expressions, used to find years and cut punctuation
import time

import jieba  # Chinese word segmentation
import matplotlib.pyplot as plt  # plotting
import requests  # HTTP requests
import xlwt  # writes .xls workbooks (no .xlsx support)
from bs4 import BeautifulSoup  # HTML parsing
from pylab import mpl  # used to configure the font of the figures
from wordcloud import WordCloud  # word cloud rendering

# SimHei is a font with Chinese glyphs, needed for the Chinese labels.
mpl.rcParams['font.sans-serif'] = ['SimHei']
mpl.rcParams['font.size'] = 6.0

# NOTE(review): in the published copy of this script the host had been
# stripped from both the request URL ('/top250?start=') and the 'Host'
# header (''), which requests cannot fetch. movie.douban.com is assumed to
# be the intended host -- confirm.
TOP250_URL = 'https://movie.douban.com/top250?start='
headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36','Host':'movie.douban.com'}

PAGE_COUNT = 10  # 250 movies spread over 10 pages
MOVIES_PER_PAGE = 25

# First four-digit year starting with 19 or 20.
YEAR_PATTERN = re.compile(r'(?:19|20)\d{2}')


def parse_movie_info(info):
    """Split the stripped text of a movie's info paragraph into its fields.

    The text is expected to have two lines: the credits line (starting with
    the 4-character "导演: " label) and a line of the form
    ``year\\xa0/\\xa0nation\\xa0/\\xa0category``.

    Args:
        info: stripped text of the first ``<p>`` of a movie's ``div.bd``.

    Returns:
        A ``(year, director, nation, category)`` tuple of strings.
        ``year`` and ``nation`` are ``''`` when they cannot be located;
        ``category`` falls back to the whole second line when that line has
        fewer than four non-breaking spaces.
    """
    lines = info.split('\n')
    credits_line = lines[0]
    meta_line = lines[1] if len(lines) > 1 else ''

    # Searching for a full year (instead of the bare substring '20') keeps
    # years such as 1920 intact and picks the first year when several
    # release years are listed.
    year_match = YEAR_PATTERN.search(meta_line)
    year = year_match.group() if year_match else ''

    # Drop the "导演: " label, then cut at the "主演" (starring) label or, when
    # the credits were abbreviated, at the trailing '...'. Stripping removes
    # the non-breaking spaces that separate the director from "主演".
    director = credits_line[4:]
    for marker in ('主', '...'):
        cut = director.find(marker)
        if cut >= 0:
            director = director[:cut]
            break
    director = director.strip()

    # Fields are separated by "\xa0/\xa0": the nation sits between the 2nd
    # and 3rd non-breaking space, the category after the 4th.
    fields = meta_line.split('\xa0', 4)
    nation = fields[2] if len(fields) > 3 else ''
    category = fields[4] if len(fields) > 4 else meta_line
    return year, director, nation, category


def make_autopct(values):
    """Return a pie ``autopct`` callback showing the percentage and count."""
    def my_autopct(pct):
        total = sum(values)
        val = int(round(pct*total/100.0))
        return '{p:.1f}%({v:d})'.format(p = pct, v = val)
    return my_autopct


def draw_top_pie(counter, top_n, other_label, title):
    """Draw a pie chart of the ``top_n`` most common items of ``counter``.

    Everything beyond the first ``top_n`` items is summed into one extra
    slice labelled ``other_label``. Works with fewer than ``top_n`` distinct
    items as well.
    """
    ranked = counter.most_common()  # sorted by count, descending
    top = ranked[:top_n]
    labels = [name for name, _ in top]
    values = [count for _, count in top]
    labels.append(other_label)
    values.append(sum(count for _, count in ranked[top_n:]))

    plt.rcParams['savefig.dpi'] = 200  # resolution of saved figures
    plt.rcParams['figure.dpi'] = 200  # resolution of displayed figures
    _wedges, _label_texts, pct_texts = plt.pie(
        values,
        explode=[0.02] * len(values),
        labels=labels,
        pctdistance=0.8,
        radius=1,
        rotatelabels=True,
        autopct=make_autopct(values),
    )
    # Rotate the percentage texts drawn inside the slices.
    for pct_text in pct_texts:
        pct_text.set_rotation(315)
    plt.title(title, y=-0.1)
    plt.show()


def draw_top10_barh(names, scores, title, text_offset, x_limits):
    """Draw a horizontal bar chart of the ten entries with the highest score.

    Args:
        names: movie names, parallel to ``scores``.
        scores: numeric value of each movie.
        title: chart title.
        text_offset: horizontal gap between a bar's end and its number.
        x_limits: ``(left, right)`` limits of the x axis.
    """
    # Sorting the pairs directly (rather than going through a dict) keeps
    # movies that share the same name as separate entries.
    ranked = sorted(zip(names, scores), key=lambda pair: pair[1], reverse=True)[:10]
    labels = [name for name, _ in ranked]
    values = [score for _, score in ranked]
    # Current Matplotlib rejects a string of single-letter colours, so the
    # colours are passed as a list.
    colors = list('rgbycmrgby')[:len(values)]
    plt.barh(range(len(values)), width=values, tick_label=labels, color=colors)
    for position, value in enumerate(values):  # write the number next to each bar
        plt.text(value + text_offset, position - 0.1, str(value), color='blue', fontweight='bold')
    plt.xlim(*x_limits)
    plt.title(title)
    plt.show()


## data needed
movie_list_english_name = []
movie_list_chinese_name = []
director_list = []
time_list = []
star_list = []
reviewNum_list = []
quote_list = []
nation_list = []
category_list = []

for page in range(PAGE_COUNT):
    link = TOP250_URL + str(page * MOVIES_PER_PAGE)
    res = requests.get(link, headers=headers, timeout=10)
    # Fail loudly on an HTTP error (e.g. when blocked) instead of parsing
    # an error page into empty lists.
    res.raise_for_status()
    # Pause for a random 0-3 seconds after each page to avoid having the IP
    # address blocked by anti-crawler measures.
    time.sleep(random.random()*3)

    # res.text is the HTML of the page; lxml is the parser used by bs4.
    soup = BeautifulSoup(res.text, "lxml")
    div_title_list = soup.find_all('div', class_='hd')
    div_info_list = soup.find_all('div', class_='bd')
    div_star_list = soup.find_all('div', class_='star')

    # The first <span> inside the title link holds the Chinese name.
    for each in div_title_list:
        movie_list_chinese_name.append(each.a.span.text.strip())

    # The second <span>, located with a CSS selector, holds the other name.
    for each in soup.select('div.hd > a > span:nth-of-type(2)'):
        # Strip the leading "\xa0/\xa0" separator characters.
        movie_list_english_name.append(each.text.strip('\xa0/\xa0'))

    for each in div_info_list:
        if each.p is None:
            continue
        info = each.p.text.strip()
        if len(info) < 3:  # skip containers that do not describe a movie
            continue
        year, director, nation, category = parse_movie_info(info)
        time_list.append(year)
        director_list.append(director)
        nation_list.append(nation)
        category_list.append(category)

        # Look the quote up inside this movie's own container so that a
        # movie without a quote gets a blank one instead of shifting every
        # later quote onto the wrong movie.
        # Assumes <p class="quote"> is nested in the movie's div.bd.
        quote_tag = each.find('p', class_='quote')
        quote_list.append(quote_tag.text.strip() if quote_tag is not None else ' ')

    # Rating and number of reviewers of each movie.
    for each in div_star_list:
        info = each.text.strip()
        star_list.append(float(info[0 : 3]))
        end = info.find('人')
        reviewNum_list.append(int(info[3 : end]))

# Export everything to an .xls workbook.
file = xlwt.Workbook()
table = file.add_sheet('sheet1', cell_overwrite_ok = True)
column_titles = ["排名", "电影中文名", "电影其他名", "时间", "导演", "国家或地区", "评分", "评分人数", "电影类型", '代表性评论']
for column, column_title in enumerate(column_titles):
    table.write(0, column, column_title)

movie_rows = zip(
    movie_list_chinese_name,
    movie_list_english_name,
    time_list,
    director_list,
    nation_list,
    star_list,
    reviewNum_list,
    category_list,
    quote_list,
)
for rank, movie_row in enumerate(movie_rows, start=1):
    table.write(rank, 0, rank)
    for column, value in enumerate(movie_row, start=1):
        table.write(rank, column, value)
file.save('豆瓣 top 250 电影爬虫抓取.xls')

# Analyse the countries / regions of origin.
locations = []
for nation_entry in nation_list:
    for nation in nation_entry.split(' '):
        # Count West Germany as Germany.
        locations.append('德国' if nation == '西德' else nation)
draw_top_pie(Counter(locations), 10, '其他地区', '豆瓣 TOP250 电影来源地')

# Analyse the categories.
categories = []
for category_entry in category_list:
    categories.extend(category_entry.split(' '))
draw_top_pie(Counter(categories), 15, '其他类型', '豆瓣 TOP250 电影种类')

# Word cloud of the representative quotes.
jieba.add_word('久石让')
jieba.add_word('谢耳朵')
# Modal particles and other words that carry no meaning on their own.
del_words = {
    '的', ' ', '人', '就是', '一个', '被', '不是', '也', '最', '了', '才', '给',
    '要', '就', '让', '在', '都', '是', '与', '和', '不', '有', '我', '你', '能',
    '每个', '不会', '中', '没有', '这样', '那么', '不要', '如果', '来', '它', '对',
    '当', '比', '不能', '却', '一种', '而', '不过', '只有', '不得不', '再', '一部',
    '啦', '他', '像', '会', '得', '里',
}
# Join with a space so that words of adjacent quotes are never fused.
all_quotes = ' '.join(quote_list)
# Replace digits, whitespace and punctuation by spaces.
# NOTE(review): ':-【' inside the first character class is a range, so ASCII
# letters are removed as well. Left unchanged because narrowing it would let
# other punctuation covered by the range through -- confirm the intent.
all_quotes = re.sub(r"[0-9\s+\.\!\/_,$%^*()?;;:-【】+\"\']+|[+——!,;:。?、~@#¥%……&*()]+", " ", all_quotes)
words_final = [word for word in jieba.lcut(all_quotes) if word not in del_words]
text_result = Counter(words_final)

# The font file must be available; it provides the Chinese glyphs.
cloud = WordCloud(
    font_path='FZSTK.TTF',
    background_color='white',
    width=1000,
    height=860,
    max_words=40,
)
# generate_from_frequencies is used instead of generate(), which the author
# found to work poorly with Chinese text.
wc = cloud.generate_from_frequencies(text_result)
wc.to_file("豆瓣 TOP 250 词云.jpg")
plt.figure()
plt.imshow(wc)
plt.axis('off')
plt.title('豆瓣 TOP 250 电影代表性评论的词云分析')
plt.show()

# The ten best rated movies.
draw_top10_barh(movie_list_chinese_name, star_list, '评分最高的十部电影', 0.05, (8, 10))

# The ten movies with the most reviewers.
draw_top10_barh(movie_list_chinese_name, reviewNum_list, '评分人数最多的十部电影', 10000, (400000, 1450000))
