700字范文 > python 豆瓣电影top250_[python爬虫]豆瓣电影Top250简单数据分析绘图

python 豆瓣电影top250_[python爬虫]豆瓣电影Top250简单数据分析绘图

时间：2020-05-19 23:26:12

一：简介

通过抓取豆瓣电影Top250的数据，进行了三项统计：各上映年份的上榜电影数量（数量为0的年份不统计）；各个国家/地区出品的电影数量；250部电影中各类型标签出现的次数。

二：源代码

#coding=utf-8

import requests

from bs4 import BeautifulSoup

import os, socket, re

import matplotlib as mpl

import matplotlib.pyplot as plt

import numpy as np

class Spider:
    """Fetch Douban Top 250 pages and prepare a local working directory."""

    # Root folder under which mkdir() creates its sub-directories.
    BASE_DIR = r"D:\mdouban"

    # Seconds to wait on the server before a request is abandoned.
    REQUEST_TIMEOUT = 20

    def __init__(self, url='https://movie.douban.com/top250'):
        """Store the list URL and the headers sent with every request.

        url: address of the first page of the list.
            NOTE(review): the published snippet only had '/top250', which
            requests cannot fetch because it has no scheme/host; the Douban
            host was restored here -- confirm it is the intended one.
        """
        self.url = url
        # A desktop-browser User-Agent is sent with each request.
        self.header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"
        }

    def mkdir(self, path, base_dir=None):
        """Create ``<base_dir>/<path>`` if needed, chdir into it, return its absolute path.

        path: sub-directory name; surrounding whitespace is stripped.
        base_dir: root folder to create the directory in; defaults to
            ``Spider.BASE_DIR``.

        Side effect: the process working directory is changed to the
        target directory.
        """
        root = self.BASE_DIR if base_dir is None else base_dir
        target = os.path.join(root, path.strip())
        # exist_ok makes the "already exists" case a no-op.
        os.makedirs(target, exist_ok=True)
        os.chdir(target)
        return os.path.abspath('.')

    # Build a BeautifulSoup tree for a page
    def get_soup(self, link):
        """Download *link* and return it parsed with the 'lxml' parser.

        Raises requests.HTTPError for a 4xx/5xx response instead of
        handing an error page to the parser.
        """
        response = requests.get(link, headers=self.header,
                                timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        # Decode using the encoding detected from the body.
        response.encoding = response.apparent_encoding
        return BeautifulSoup(response.text, 'lxml')

if __name__ == '__main__':
    # Default timeout (seconds) for socket objects created from here on.
    socket.setdefaulttimeout(20)

    spider = Spider()
    # mkdir() also switches the working directory to the folder it returns.
    path = spider.mkdir('top250')
    print('starting get data from douban...')

def autolabel(rects, ax, xpos='center'):  # show the value of every bar
    """
    Attach a text label above each bar in *rects*, displaying its height.

    *xpos* indicates which side to place the text w.r.t. the center of
    the bar. It can be one of the following {'center', 'right', 'left'}
    (case-insensitive); any other value raises KeyError before anything
    is drawn.
    """
    xpos = xpos.lower()  # normalize the case of the parameter
    ha = {'center': 'center', 'right': 'left', 'left': 'right'}
    offset = {'center': 0.5, 'right': 0.57, 'left': 0.43}  # x_txt = x + w*off

    # Both lookups depend only on xpos, so resolve them once up front.
    align = ha[xpos]
    shift = offset[xpos]

    for rect in rects:
        height = rect.get_height()
        # Place the text slightly (1%) above the top of the bar.
        ax.text(rect.get_x() + rect.get_width() * shift, 1.01 * height,
                '{}'.format(height), ha=align, va='bottom', size=6.8)

def _count_in_order(items):
    """Tally *items*; return (labels, counts) with labels in first-seen order."""
    tally = {}
    for item in items:
        tally[item] = tally.get(item, 0) + 1
    return list(tally.keys()), list(tally.values())


def _draw_bar_chart(labels, values, xlabel, title, save_path, **tick_kwargs):
    """Draw one labelled blue bar chart, save it to *save_path*, show it, close it.

    labels / values: X tick labels and the matching bar heights.
    tick_kwargs: extra text properties for the X tick labels (e.g. rotation).
    """
    positions = np.arange(len(labels))
    fig, ax = plt.subplots()
    ax.set_xlabel(xlabel)
    ax.set_ylabel('numbers')
    ax.set_title(title)
    bars = ax.bar(positions, values, color='b', tick_label=labels)
    autolabel(bars, ax)
    plt.xticks(positions, size=7.2, **tick_kwargs)  # X tick appearance
    fig.set_size_inches(15.5, 10.5)  # figure size
    # Save before show(): saving afterwards yields a blank image. dpi is the resolution.
    plt.savefig(save_path, dpi=200)
    plt.show()
    plt.close()


def drawYearPlot(num_list, name_list, save_path='D:/mdouban/douban_year.png'):
    """Bar chart with the release year on the X axis and the film count on the Y axis.

    num_list: film counts, aligned with *name_list*.
    name_list: year labels.
    save_path: where the PNG is written.
    """
    _draw_bar_chart(name_list, num_list, 'year',
                    'Douban top 250 movie numbers by year', save_path,
                    rotation=-90)


def drawCountryPlot(cry_list, save_path='D:/mdouban/douban_country.png'):
    """Bar chart with the country/region on the X axis and the film count on the Y axis.

    cry_list: one country/region entry per film; duplicates are counted.
    save_path: where the PNG is written.
    """
    country_list, num_l = _count_in_order(cry_list)
    _draw_bar_chart(country_list, num_l, 'country',
                    'Douban top 250 movie numbers by country', save_path)


def drawTypePlot(typ_list, save_path='D:/mdouban/douban_type.png'):
    """Bar chart with the genre tag on the X axis and its number of occurrences on the Y axis.

    typ_list: genre tags of all films, flattened; duplicates are counted.
    save_path: where the PNG is written.
    """
    tp_list, num_l = _count_in_order(typ_list)
    _draw_bar_chart(tp_list, num_l, 'type',
                    'Douban top 250 movie number by type', save_path)

if __name__ == '__main__':
    # Scrape, count and plot. Uses the `spider` object created earlier in
    # the script; the list spans ten pages of 25 films each.
    ys = []  # release years
    cs = []  # countries/regions
    ts = []  # genre tags

    # Let matplotlib render Chinese text instead of garbled glyphs.
    mpl.rcParams['font.sans-serif'] = ['SimHei']  # default font: SimHei
    mpl.rcParams['axes.unicode_minus'] = False  # keep the minus sign displayable

    rank = 1  # 1-based position of the film currently being parsed
    for page in range(10):
        if page == 0:
            url = spider.url
        else:
            # Pages 2..10 are addressed through a start offset.
            url = '{}?start={}&filter='.format(spider.url, 25 * page)

        main_soup = spider.get_soup(url)
        ol_grid = main_soup.find('ol', class_='grid_view')
        if ol_grid is None:
            # Fail with a clear message rather than an AttributeError on None.
            raise RuntimeError('no <ol class="grid_view"> found at ' + url)

        for item in ol_grid.find_all('li'):
            p_info = item.find('p', class_='').get_text()
            fields = p_info.split('/')

            # Second-to-last field: countries/regions; keep only the first.
            cs.append(fields[-2].strip().split()[0])
            # Last field: whitespace-separated genre tags.
            ts.extend(fields[-1].strip().split())

            # Remove all whitespace (including \xa0) before slicing the year.
            compact = ''.join(p_info.split())
            if rank == 80:
                # The film at rank 80 (Da Nao Tian Gong) was released several
                # times, so its year is hard-coded.
                # NOTE(review): tied to the ranking at the time of writing.
                year = '1961'
            else:
                year = compact.split('/')[-3][-4:]  # release year
            rank += 1
            ys.append(year)

    # Count films per year. Only years that actually occur are kept, so no
    # fixed year range is needed (the original `range(1931, )` had lost its
    # upper bound and could not contain any real year).
    year_counts = {}
    for year in ys:
        year_counts[year] = year_counts.get(year, 0) + 1

    name_list1 = sorted(year_counts)  # four-digit strings sort chronologically
    num_list = [year_counts[year] for year in name_list1]

    drawYearPlot(num_list, name_list1)
    drawCountryPlot(cs)
    drawTypePlot(ts)
    print('over!')

三：生成的柱状图

douban_country.png

douban_type.png

douban_year.png

本内容不代表本网观点和政治立场，如有侵犯你的权益请联系我们处理。

网友评论

网友评论仅供其表达个人看法，并不表明网站立场。