猫眼电影数据抓取实现分析
1、基准xpath: 匹配所有电影信息的节点对象列表
'//dl[@class="board-wrapper"]/dd'
2、遍历对象列表,依次获取每个电影信息
for dd in dd_list:
电影名称:'.//p[@class="name"]/a/text()'(下方参考代码改用 a 节点的 title 属性:'.//p[@class="name"]/a/@title')
电影主演:'.//p[@class="star"]/text()'
上映时间:'.//p[@class="releasetime"]/text()'
参考代码
import requests
from lxml import etree
class MaoyanSpider(object):
    """Fetch the movie board page and print one record per movie.

    Each record is a dict with the keys ``name``, ``star`` and ``time``,
    extracted from the ``<dd>`` nodes under ``<dl class="board-wrapper">``.
    """

    def __init__(self, timeout=10):
        """Set up the request target and headers.

        Args:
            timeout: Seconds to wait for the HTTP response before
                ``requests`` gives up.
        """
        # NOTE(review): the URL has no scheme/host and the User-Agent is
        # empty -- presumably placeholders to be filled in before running;
        # confirm the intended values.
        self.url = '/board/4'
        self.headers = {'User-Agent': ''}
        self.timeout = timeout

    @staticmethod
    def _first_text(nodes):
        """Return the first XPath result stripped, or '' if nothing matched."""
        return nodes[0].strip() if nodes else ''

    def save_html(self):
        """Download the page, parse it and print every movie record.

        Despite the name, nothing is written to disk: records are printed.
        A field whose node is missing from a ``<dd>`` is reported as ``''``
        rather than aborting the whole run with ``IndexError``.
        """
        html = requests.get(
            url=self.url,
            headers=self.headers,
            timeout=self.timeout,  # without a timeout the call may block forever
        ).text
        # Parse the markup into an element tree.
        parse_html = etree.HTML(html)
        # Base XPath: one <dd> node per movie.
        dd_list = parse_html.xpath('//dl[@class="board-wrapper"]/dd')
        for dd in dd_list:
            # Build a fresh dict per movie so records never share state.
            item = {
                'name': self._first_text(
                    dd.xpath('.//p[@class="name"]/a/@title')),
                'star': self._first_text(
                    dd.xpath('.//p[@class="star"]/text()')),
                'time': self._first_text(
                    dd.xpath('.//p[@class="releasetime"]/text()')),
            }
            print(item)

    def run(self):
        """Entry point: perform a single fetch-parse-print pass."""
        self.save_html()
if __name__ == '__main__':
    # Script entry point: build the spider and run one crawl.
    spider = MaoyanSpider()
    spider.run()
The End