700字范文,内容丰富有趣,生活中的好帮手!
700字范文 > [python爬虫]selenium模拟登录京东招聘网 爬取研发类 数据清洗 数据存储 终数据分析和可视化

[python爬虫]selenium模拟登录京东招聘网 爬取研发类 数据清洗 数据存储 最终数据分析和可视化

时间:2019-03-12 08:39:55

相关推荐

[python爬虫]selenium模拟登录京东招聘网 爬取研发类 数据清洗 数据存储 最终数据分析和可视化

目录

引入包

模拟登陆京东网

利用XPath对网页源代码进行解析

数据清洗

每页数据以追加形式保存至csv文件

保存数据到MongoDB数据库,参数为字典组成的列表

数据分析与可视化

总函数

引入包

# Standard library
import csv
import os
import re
import time
from collections import Counter

# Third-party
import imageio
import jieba
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import wordcloud
from lxml import etree
from matplotlib.pyplot import MultipleLocator
# BUG FIX: narrowed `from pymongo import *` to the one name actually used.
from pymongo import MongoClient
from selenium import webdriver
from selenium.webdriver import ActionChains
# BUG FIX: the original `from mon.by import By` was a mangled copy of the
# correct module path and fails at import time.
from selenium.webdriver.common.by import By
# WebDriverWait polls the page until a condition holds or a timeout expires.
from selenium.webdriver.support.ui import WebDriverWait
# expected_conditions supplies the ready-made wait predicates.
from selenium.webdriver.support import expected_conditions as EC

模拟登陆京东网

def login(url, user, password):
    """Log in to the JD recruitment site and filter listings to the R&D
    ("研发类") category.

    Relies on the module-level ``driver`` (selenium WebDriver) and ``wait``
    (WebDriverWait) objects created in ``__main__``.

    Args:
        url: Landing-page URL of the recruitment site.
        user: Account name typed into the login form.
        password: Account password typed into the login form.

    Returns:
        str: Page source (HTML) of the first page of R&D job results.
    """
    driver.get(url)
    driver.maximize_window()
    # Open the login page.
    # NOTE: find_element(By.LINK_TEXT, ...) replaces the
    # find_element_by_link_text() helpers removed in Selenium 4.3.
    driver.find_element(By.LINK_TEXT, '登录').click()
    # Switch to the account/password login tab (vs. QR-code login).
    driver.find_element(By.LINK_TEXT, '账户登录').click()
    # Fill in the credentials and submit.
    driver.find_element(By.ID, 'loginname').send_keys(user)
    driver.find_element(By.ID, 'nloginpwd').send_keys(password)
    driver.find_element(By.ID, 'loginsubmit').click()
    # Pause so the slider CAPTCHA can be solved by hand.
    time.sleep(8)
    # After login, open the job-category dropdown and pick "研发类" (R&D).
    search_btn = wait.until(
        EC.element_to_be_clickable((By.XPATH, '//i[@class="arrow-down"]')))
    search_btn.click()
    driver.find_element(By.ID, 'YANFA').click()
    # Run the search.
    driver.find_element(By.LINK_TEXT, '搜索').click()
    time.sleep(8)
    # The category filter sometimes does not stick after the page switch;
    # re-select it once more to be safe.
    search_btn = wait.until(
        EC.element_to_be_clickable((By.XPATH, '//div[@class="suggess-sel"]/i')))
    search_btn.click()
    driver.find_element(By.XPATH, '//input[@value="YANFA"]').click()
    search_btn.click()
    time.sleep(5)
    return driver.page_source


def get_next_page():
    """Click the "next page" button and return the new page's HTML source.

    Uses the module-level ``driver`` and ``wait`` objects.
    """
    # Scroll to the bottom so the pager button is rendered and clickable.
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
    next_button = wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, '.next')))
    next_button.click()
    time.sleep(5)
    return driver.page_source

利用XPath对网页源代码进行解析

def parse_page(html):
    """Parse one result page of job postings into a DataFrame.

    First expands every posting's dropdown through the module-level
    ``driver`` so the detail sections are present in the DOM, then extracts
    the fields with XPath. Assumes the site shows 10 postings per page.

    Args:
        html: Page source returned by login() or get_next_page().

    Returns:
        pandas.DataFrame with columns 职位名称, 职位类别, 工作地点,
        发布时间, 岗位描述, 任职要求.
    """
    POSTS_PER_PAGE = 10  # the site's fixed page size
    dom = etree.HTML(html)
    # Expand every posting's dropdown so description/requirements load.
    for i in range(1, POSTS_PER_PAGE + 1):
        driver.find_element(
            By.XPATH,
            '//div[@class="line"][{}]/div/span/b[@class="drop-down"]'.format(i)
        ).click()
    careers_name = dom.xpath('//div[@class="info"]/span[1]/text()')
    careers_class = dom.xpath('//span[@class="sel"][1]/text()')
    careers_place = dom.xpath('//span[@class="sel"][2]/text()')
    published_time = dom.xpath('//div[@class="info"]/span[4]/text()')

    def _section(index, par_no, label):
        # string() flattens all descendant text of the section into one
        # string; the regex then strips the leading "岗位描述:"/"任职要求:"
        # label. re.S lets "." span the multi-line section body.
        text = dom.xpath(
            'string(//div[@class="line"][{}]/div[@class="detail"]'
            '/div[@class="par"][{}])'.format(index, par_no))
        match = re.search(label + ":(.*)", text, re.S)
        # BUG FIX: the original indexed findall(...)[0], which raises
        # IndexError when a posting's section is missing or unlabeled;
        # fall back to the raw text instead.
        return match.group(1) if match else text

    published_info = [
        _section(p, 1, "岗位描述") for p in range(1, POSTS_PER_PAGE + 1)]
    careers_requirement = [
        _section(p, 2, "任职要求") for p in range(1, POSTS_PER_PAGE + 1)]
    data = pd.DataFrame({
        '职位名称': careers_name,
        '职位类别': careers_class,
        '工作地点': careers_place,
        '发布时间': published_time,
        '岗位描述': published_info,
        '任职要求': careers_requirement,
    })
    return data

数据清洗

def data_cleaning(data):
    """Clean the scraped job DataFrame.

    Steps: report the shape, drop duplicate rows, fill missing values with
    "无", show a box plot of any numeric columns with outliers annotated,
    and sort by publish date (newest first).

    Args:
        data: DataFrame produced by parse_page().

    Returns:
        The cleaned, sorted DataFrame.
    """
    # BUG FIX: the original printed the undefined global ``result``.
    print(data.shape)
    # BUG FIX: drop_duplicates() and sort_values() return new frames; the
    # original discarded both return values, so neither had any effect.
    data = data.drop_duplicates()
    data = data.fillna("无")
    # Matplotlib setup so CJK labels and the minus sign render correctly.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    plt.figure()
    # Box plot; p['fliers'] holds the outlier points of the first column.
    p = data.boxplot(return_type='dict')
    x = p['fliers'][0].get_xdata()
    y = p['fliers'][0].get_ydata()
    y.sort()
    print(x)
    print(y)
    # Label each outlier with its value, offset slightly to the right.
    for i in range(len(x)):
        plt.annotate(y[i], xy=(x[i], y[i]), xytext=(x[i] + 0.05, y[i]))
    plt.show()
    # Newest postings first.
    data = data.sort_values("发布时间", ascending=False)
    return data

每页数据以追加形式保存至csv文件

# Append one page of data to the shared CSV file.
def save_file(data):
    """Append a page of postings (DataFrame) to the CSV file.

    The header row is written only when the file does not exist yet, so
    repeated calls accumulate rows without duplicating the header.

    Args:
        data: DataFrame with the six posting columns.
    """
    columns = ['职位名称', '职位类别', '工作地点', '发布时间', '岗位描述', '任职要求']
    filename = './../data/京东研发类招聘信息.csv'
    # utf_8_sig prepends a BOM so Excel opens the CJK text correctly.
    # The exists() check runs before to_csv creates the file, so the header
    # is emitted exactly once — this replaces the original's two duplicated
    # to_csv branches.
    data.to_csv(filename, mode='a', encoding='utf_8_sig', columns=columns,
                index=False, header=not os.path.exists(filename))
    print("csv保存成功!")

保存数据到MongoDB数据库,参数为字典组成的列表

# Persist the postings to the MongoDB collection created in __main__.
def save_to_mongo(data):
    """Insert every row of *data* into the module-level ``collection``.

    The DataFrame is converted to one dict per row before insertion.
    Errors are printed rather than raised, so a database hiccup does not
    abort the crawl.
    """
    try:
        # One dict per row, keyed by column name.
        records = data.to_dict(orient='records')
        collection.insert_many(records)
        print('Saved to Mongo')
    except Exception as e:
        print(e)

数据分析与可视化

# Data analysis and visualization of the scraped CSV.
def analysis_visualization():
    """Produce all analysis charts from the saved CSV.

    Charts: bar chart of the five most common job-title words, pie chart of
    work locations, line chart of the five most common publish dates, and
    two word clouds (job description, job requirements).

    Reads ./../data/京东研发类招聘信息.csv and writes the word-cloud images
    under ./../img/.
    """
    # Matplotlib setup so CJK labels and the minus sign render correctly.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    data_path = "./../data/京东研发类招聘信息.csv"
    train_data = pd.read_csv(data_path)

    # --- Bar chart: five most common job-title words --------------------
    x, y = _top_counts(train_data["职位名称"], 5)
    ax = plt.gca()
    ax.xaxis.set_major_locator(MultipleLocator(1))  # one tick per bar
    plt.bar(x, y)
    plt.xlabel('职位名称')
    plt.ylabel('数目')
    plt.title('职位名称')
    for i in range(len(x)):
        plt.text(x[i], y[i], y[i])  # value label above each bar
    plt.show()

    # --- Pie chart: distribution of work locations ----------------------
    num = train_data['工作地点'].value_counts()
    plt.figure(figsize=(8, 8))
    plt.pie(num, autopct='%.2f%%', labels=num.index)
    plt.axis("equal")  # keep the pie circular
    plt.title('工作地点')
    plt.show()

    # --- Line chart: five most common publish dates ---------------------
    x, y = _top_counts(train_data["发布时间"], 5)
    plt.plot(x, y)
    plt.title('发布时间')
    plt.xlabel('发布时间')
    plt.ylabel('数目')
    plt.show()

    # --- Word clouds (the original duplicated this code verbatim for the
    # two columns; factored into one helper) -----------------------------
    _make_wordcloud(data_path, 4,
                    './../img/京东岗位描述原图.jpg', './../img/岗位描述.png')
    _make_wordcloud(data_path, 5,
                    './../img/京东任职要求原图.jpg', './../img/任职要求.png')


def _top_counts(series, k):
    """Return the *k* most frequent space-separated tokens of *series* as
    parallel (labels, counts) lists."""
    counts = Counter()
    for tokens in series.apply(lambda s: s.split(" ")):
        counts.update(tokens)
    ranked = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)[:k]
    return [kv[0] for kv in ranked], [kv[1] for kv in ranked]


def _make_wordcloud(data_path, column, mask_path, out_path):
    """Build, save, and display a word cloud from one CSV column.

    Args:
        data_path: Path of the scraped CSV file.
        column: Zero-based column index to aggregate (4 = 岗位描述,
            5 = 任职要求).
        mask_path: Image whose silhouette shapes the cloud.
        out_path: Where the rendered PNG is written.
    """
    mask_img = imageio.imread(mask_path)
    # Collect the column's text across all rows (the header row is included,
    # matching the original behavior); join the parts once instead of the
    # original quadratic `+=` loop.
    with open(data_path, 'r', encoding='utf-8') as f:
        text = ''.join(row[column] for row in csv.reader(f))
    words = " ".join(jieba.lcut(text))  # segment Chinese text into tokens
    wc = wordcloud.WordCloud(
        font_path=r'C:\Windows\Fonts\微软雅黑\msyhbd.ttc',
        mask=mask_img,
        width=1000, height=700,
        background_color='white',
    )
    wc.generate(words)
    wc.to_file(out_path)
    wc.to_image().show()

总函数

if __name__ == '__main__':
    # Credentials are typed in interactively.
    user = input("请输入账号:")
    password = input("请输入密码:")
    driver = webdriver.Chrome()
    wait = WebDriverWait(driver, 10)
    # NOTE(review): the URL was lost when this snippet was archived; it
    # should point at the JD recruitment site's landing page — restore it
    # before running.
    url = "/"
    # MongoDB handles used by save_to_mongo().
    client = MongoClient()      # default localhost connection
    db = client.jingdong        # database
    collection = db.zhaopin     # collection
    # Log in, select the R&D category, and land on page 1 of the results.
    html = login(url, user, password)
    pages = []
    for i in range(1, 75):
        # Pages 2..74 are reached through the "next page" button.
        if i != 1:
            html = get_next_page()
        pages.append(parse_page(html))
        print("第{}页爬取完成!".format(i))
    # BUG FIX: the original grew ``result`` with DataFrame.append, which was
    # deprecated and removed in pandas 2.0; concatenate all pages once.
    result = pd.concat(pages, ignore_index=True)
    # Clean, persist, and analyze.
    result = data_cleaning(result)
    save_file(result)
    save_to_mongo(result)
    analysis_visualization()
    driver.close()

本内容不代表本网观点和政治立场,如有侵犯你的权益请联系我们处理。
网友评论
网友评论仅供其表达个人看法,并不表明网站立场。