
Python crawler: scrape Jianshu users, articles, comments, and images, and store them in a database

Posted: 2020-09-19 03:53:20


What a crawler is: a web crawler (also known as a web spider or web robot, and in the FOAF community more often called a web chaser) is a program or script that automatically fetches information from the World Wide Web according to a set of rules. Less common names include ant, automatic indexer, emulator, and worm. (Source: Baidu Baike, "爬虫")

How crawling works: the World Wide Web is made up of countless URLs, like one giant web. Send a request to a URL to obtain its data (the HTML), then parse the response and extract the fields you need.

The hands-on demo: scrape Jianshu's users, articles, comments, and images, and store them in a database.
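To make that fetch-parse-extract loop concrete before diving into the full program, here is a minimal sketch using requests plus a regular expression, the same tools the spider below relies on. The URL and the title pattern are placeholders for illustration, not part of the original project.

import re

import requests

# Fetch: request a page and decode the body (example.com is a stand-in URL).
headers = {'User-Agent': 'Mozilla/5.0'}
html = requests.get('https://example.com', headers=headers).content.decode()

# Parse and extract: pull one field out of the HTML with a regex,
# the same approach the spider below uses on Jianshu pages.
match = re.search(r'<title>(.*?)</title>', html, re.S)
if match:
    print(match.group(1))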

(1) Site analysis

1) Get the article URL list

2) Get article details

3) Get comment data (see the sketch after this list)

4) Get user profiles
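For step 3), the analysis turns up a JSON endpoint for comments, the /shakespeare/notes/<id>/comments path that the spider below builds in get_comment_url. A minimal probe of it might look like this; the note id 9126 is the sample id from the code's own comment and may no longer resolve, and the JSON fields shown are the ones the spider consumes.

import json

import requests

# The comments endpoint returns JSON, e.g.
# https://www.jianshu.com/shakespeare/notes/<note_id>/comments?page=1&count=10&author_only=false&order_by=desc
note_id = 9126  # sample id from the code comment; substitute a real article id
url = ('https://www.jianshu.com/shakespeare/notes/' + str(note_id) +
       '/comments?page=1&count=10&author_only=false&order_by=desc')
headers = {'User-Agent': 'Mozilla/5.0'}
data = json.loads(requests.get(url, headers=headers).content.decode())
for c in data["comments"]:
    # slug identifies the commenting user; compiled_content is the comment body
    print(c["user"]["slug"], c["compiled_content"])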

(2) Implementation

1) bean

# bean.py — data classes (comments translated from the Chinese original)

# User: user ID, username, password, sex, age, address, phone, avatar, email,
# following count, follower count, article count, word count, like count,
# balance, bio, registration time.
class User(object):
    def __init__(self, name, password, sex, age, address, tel, head_image, email,
                 followers_count, fans_count, articles_count, words_count,
                 likes_count, balance, profile):
        self.user_id = None
        self.name = name
        self.password = password
        self.sex = sex
        self.age = age
        self.address = address
        self.tel = tel
        self.head_image = head_image
        self.email = email
        self.followers_count = followers_count
        self.fans_count = fans_count
        self.articles_count = articles_count
        self.words_count = words_count
        self.likes_count = likes_count
        self.balance = balance
        self.profile = profile
        self.create_time = None

    def __str__(self):
        # Return the description instead of printing it, so str(user) works.
        return "\n".join([
            "id:" + str(self.user_id),
            "sex:" + str(self.sex),
            "age:" + str(self.age),
            "address:" + str(self.address),
            "tel:" + str(self.tel),
            "head_image:" + str(self.head_image),
            "email:" + str(self.email),
            "followers_count:" + str(self.followers_count),
            "fans_count:" + str(self.fans_count),
            "articles_count:" + str(self.articles_count),
            "words_count:" + str(self.words_count),
            "likes_count:" + str(self.likes_count),
            "balance:" + str(self.balance),
            "profile:" + str(self.profile),
        ])


# Classify (category): classify ID, name, description, article count, fan count.
class Classify(object):
    def __init__(self, classify_id, name, description, articles_count, fans_count, article_url):
        self.classify_id = classify_id
        self.name = name
        self.description = description
        self.articles_count = articles_count
        self.fans_count = fans_count
        self.article_url = article_url


# Book (article): book ID, classify ID, author ID, title, summary, content,
# word count, view count, like count, comment count, reward count, publish time.
class Book(object):
    def __init__(self, book_id, classify_id, author_name, title, content, words_count,
                 views_count, likes_count, comments_count, rewards_count):
        self.book_id = book_id
        self.classify_id = classify_id
        self.author_id = None
        self.author_name = author_name
        self.title = title
        self.description = None
        self.content = content
        self.words_count = words_count
        self.views_count = views_count
        self.likes_count = likes_count
        self.comments_count = comments_count
        self.rewards_count = rewards_count
        self.create_time = None

    def __str__(self):
        return "\n".join([
            "author:" + self.author_name,
            "title:" + self.title,
            "content:" + self.content,
            "words:" + str(self.words_count),
            "views:" + str(self.views_count),
            "likes:" + str(self.likes_count),
            "comments:" + str(self.comments_count),
            "rewards:" + str(self.rewards_count),
        ])


# Comment: comment ID, user ID, book ID, content, like count, reply count,
# status, comment date.
class Comment(object):
    def __init__(self, comment_id, user_id, book_id, content, likes_count, reply_count, status):
        self.comment_id = comment_id
        self.user_id = user_id
        self.book_id = book_id
        self.content = content
        self.likes_count = likes_count
        self.reply_count = reply_count
        self.status = status
        self.create_time = None

    def __str__(self):
        return "\n".join([
            "id:" + str(self.comment_id),
            "book_id:" + str(self.book_id),
            "content:" + str(self.content),
            "likes_count:" + str(self.likes_count),
            "reply_count:" + str(self.reply_count),
            "status:" + str(self.status),
        ])


# Reply: reply ID, ID of the comment replied to, ID of the replying comment, reply time.
class Reply(object):
    def __init__(self, reply_id, be_reply_comment_id, reply_comment_id, create_time):
        self.reply_id = reply_id
        self.be_reply_comment_id = be_reply_comment_id
        self.reply_comment_id = reply_comment_id
        self.create_time = create_time
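These beans are plain data holders with no behavior beyond __str__. A quick smoke test, with invented values, assuming the same bean/bean.py package layout the spider imports:

from bean import bean

# Made-up values, just to exercise the Comment bean:
# (comment_id, user_id, book_id, content, likes_count, reply_count, status)
c = bean.Comment(1, 42, 7, "nice article", 3, 0, 1)
print(c)          # works because __str__ returns a string
print(c.book_id)  # 7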

2) spider

import json
import os
import random
import re
import time

import requests
from xpinyin import Pinyin

from bean import bean

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/92.0.4515.131 Safari/537.36 '
}
root = "E:/study/JAVA/SmallWhiteBook/Python/SQL/"
images_root = 'E:/study/JAVA/SmallWhiteBook/Python/upload/images/'
home_url = 'https://www.jianshu.com'  # site root; this value did not survive on the republished page
classify = {
    '程序员': 'NEt52a',
    '摄影': '7b2be866f564',
    '故事': 'fcd7a62be697',
    'IT': 'V2CqjW',
    '读书': 'yD9GAd',
    '诗': 'vHz3Uc',
    '手绘': '8c92f845cd4d',
    '自然科普': 'cc7808b775b4',
    '旅行': '5AUzod',
    '电影': '1hjajt'
}
users = []
books = []
comments = []
images = []
classify_list = []
articles_id_list = []
user_urls = []
book_urls = []
comment_urls = []
articles_list = []
user_name_and_id = []
suffix = '.sql'
address_list = ['广西南宁', '广西河池', '广西桂林', '四川成都', '贵州六盘水', '广东深圳', '北京天安门',
                '巴黎埃菲尔铁塔', '旧金山', '洛杉矶', '硅谷', '黑龙江', '内蒙古', '兰州']


# GET a URL and decode the response body
def get_content(url):
    return requests.get(url, headers=headers).content.decode()


# GET raw bytes (for images); this helper was missing from the original listing
def get_content_no_decode(url):
    return requests.get(url, headers=headers).content


# Replace ASCII single quotes with full-width ones so they cannot break the generated SQL
def clear_single_quotation_mark(string):
    string = re.sub("'", "’", str(string))
    return str(string)


# Delete a file if it exists
def delete_file(path):
    if os.path.exists(path):
        os.remove(path)


# Download one image
def download_images(url, name):
    path = images_root + name
    with open(path, "wb") as f_obj:
        f_obj.write(get_content_no_decode(url))


# Download user avatars
def download_head_images():
    for user in users:
        url = user.head_image
        name = str(url).split('?')[0].split('/')[-1]
        download_images(url, name)


# Convert Chinese characters to CamelCase pinyin
def format_pinyin(name):
    p = Pinyin()
    res = p.get_pinyin(name)
    ans = ""
    for s in res.split('-'):
        ans += s.capitalize()
    return ans


# Write content to a .sql file
def write(path, content):
    path = root + path + suffix
    delete_file(path)
    with open(path, "a", encoding='utf-8') as f_obj:
        f_obj.write(content)


# Write user rows
def write_user():
    table_name = "user"
    users_set = []
    # de-duplicate by user name
    for u in users:
        flag = False
        for us in users_set:
            if u.name == us.name:
                flag = True
                break
        if not flag:
            users_set.append(u)
    user_id = 1
    res = ""
    for u in users_set:
        # user ID, name, password, sex, age, address, phone, avatar, email,
        # balance, following, fans, articles, words, likes, bio
        sql = "insert " + table_name + "(userId, name, password, sex, age, address, tel, headImage, email, balance, followers, fans, articles, words, likes, profile) values "
        sql += "('" + str(user_id) + "', '" + str(u.name) + "', '" + str(u.password) + "', '" + u.sex + "', '" + str(u.age) + "', '" + str(u.address) + "', '" + str(u.tel) + "', '" + u.head_image + "', '" + u.email + "', '" + str(u.balance) + "', '" + str(u.followers_count) + "', '" + str(u.fans_count) + "', '" + str(u.articles_count) + "', '" + str(u.words_count) + "', '" + str(u.likes_count) + "', '" + str(u.profile) + "') "
        sql += ";\n"
        res += sql
        user_name_and_id.append([u.name, user_id])  # record the id before incrementing it
        user_id += 1
    write(table_name, res)


# Write classify rows
def write_classify():
    table_name = "classify"
    res = ""
    for c in classify_list:
        sql = "insert " + table_name + "(classifyId, name, description, articles, fans, image) values "
        sql += "('" + str(c.classify_id) + "', '" + str(c.name) + "', '" + clear_single_quotation_mark(c.description) + "', '" + str(c.articles_count) + "', '" + str(c.fans_count) + "', '" + str(c.article_url) + "') "
        sql += ";\n"
        res += sql
    write(table_name, res)


# Write book rows
def write_book():
    table_name = "book"
    books_set = []
    # de-duplicate by book id
    for b in books:
        flag = False
        for bs in books_set:
            if b.book_id == bs.book_id:
                flag = True
                break
        if not flag:
            books_set.append(b)
    res = ""
    for b in books_set:
        author_id = random.randint(1, len(user_name_and_id))
        sql = "insert " + table_name + "(bookId, userId, classifyId, title, content, words, views, comments, rewards) values "
        sql += "('" + str(b.book_id) + "', '" + str(author_id) + "', '" + str(b.classify_id) + "', '" + clear_single_quotation_mark(b.title) + "', '" + clear_single_quotation_mark(b.content) + "', '" + str(b.words_count) + "', '" + str(b.views_count) + "', '" + str(b.comments_count) + "', '" + str(b.rewards_count) + "') "
        sql += ";\n"
        res += sql
    write(table_name, res)


# Write comment rows
def write_comment():
    table_name = "comment"
    comments_set = []
    res = ""
    # de-duplicate by comment id
    for c in comments:
        flag = False
        for cs in comments_set:
            if c.comment_id == cs.comment_id:
                flag = True
                break
        if not flag:
            comments_set.append(c)
    for c in comments_set:
        # comment ID, user ID, book ID, content, likes, replies, status, date
        user_id = random.randint(1, len(user_name_and_id))
        sql = "insert " + table_name + "(userId, bookId, content, status, likes, replys) values "
        sql += "('" + str(user_id) + "', '" + str(c.book_id) + "', '" + str(c.content) + "', '" + str(c.status) + "', '" + str(c.likes_count) + "', '" + str(c.reply_count) + "') "
        sql += ";\n"
        res += sql
    write(table_name, res)


# The Jianshu spider
class Spider(object):
    # home page, classify map, articles per classify, comments per article
    def __init__(self, home_url=home_url, classify=classify, per=52, comment_count=20):
        self.home_url = home_url
        self.classify = classify
        self.per = per
        self.comment_count = comment_count

    # Fetch the details of every classify page
    def get_classify_list(self):
        id = 1
        for k in classify:
            # build the classify URL
            url = self.get_classify_url(classify[k])
            content = get_content(url)
            pattern = re.compile(r'<div class="main-top">(.*?)<ul class="trigger-menu"', re.S)
            main_top = pattern.findall(content)
            pattern = re.compile(r'<img src="(.*?)".*?<div class="info">.*?(\d+).*?(\d+).*?</div>', re.S)
            info = pattern.findall(main_top[0])[0]
            # cover image URL
            article_url = info[0]
            # download the cover image
            split = str(article_url).split('?')[0].split('/')
            path_name = split[-2] + split[-1]
            download_images(article_url, path_name)
            # total articles and total fans
            articles_count = info[1]
            fans_count = info[2]
            pattern = re.compile(r'<div class="description.*?">(.*?)<a', re.S)
            # description text
            description = pattern.findall(content)[0]
            # record the classify
            classify_list.append(bean.Classify(id, k, description, articles_count, fans_count, path_name))
            # collect the article URLs listed under this classify
            self.get_book_urls(content, id)
            id += 1

    # Collect article URLs from a listing page
    def get_book_urls(self, content, classify_id):
        pattern = re.compile(r'<ul class="note-list".*?>(.*?)</ul>', re.S)
        note_list = pattern.findall(content)[0]
        pattern = re.compile(r'<li.*?data-note-id="(.*?)".*?>.*?<a class="wrap-img" href="(.*?)" target="_blank"', re.S)
        note = pattern.findall(note_list)
        for n in note:
            # article id and address
            article_id = n[0]
            article_url = self.home_url + n[1]
            articles_list.append([article_id, classify_id, article_url])

    # Fetch one article
    def get_book(self, article_id, classify_id, url):
        content = get_content(url)
        pattern = re.compile(r'<h1 class=".*?>(.*?)</h1>.*?href="(/u/.*?)".*?<span.*?>(.*?)</span></a>', re.S)
        ans = pattern.findall(content)
        if len(ans) == 0:
            return
        ans = ans[0]
        # article title
        title = ans[0]
        # author page URL
        user_url = self.home_url + ans[1]
        user_urls.append([classify_id, user_url])
        # author name
        name = ans[2]
        pattern = re.compile(r'<article.*?>(.*?)</article>', re.S)
        article_content = pattern.findall(content)[0]
        # comments API URL for this article
        comment_url = self.get_comment_url(article_id)
        comment_urls.append([article_id, classify_id, comment_url])
        pattern = re.compile(r'<i aria-label="ic-reply".*?<span.*?(\d+)</span>.*?<i aria-label="ic-like".*?<span.*?>(.*?)</span>', re.S)
        comment = pattern.findall(content)[0]
        views_count = random.randint(1000, 90000)
        comment_count = comment[0]
        pattern = re.compile(r'.*?(\d+).*?', re.S)
        # guard against a non-numeric like count
        likes_count = pattern.findall(comment[1])
        if len(likes_count) == 0:
            likes_count = 0
        else:
            likes_count = likes_count[0]
        rewards_count = random.randint(10, 900)
        # download the images embedded in the article, e.g.
        # <img data-original-src="//upload-images.jianshu.io/upload_images/25117622-c86992720337c5d4.image"
        #      data-original-width="768" data-original-height="427"
        #      data-original-format="image/png" data-original-filesize="122873">
        image_pattern = re.compile(r'<img data-original-src="//upload-images.jianshu.io/upload_images/(.*?)".*?>', re.S)
        image_list = image_pattern.findall(article_content)
        for image in image_list:
            url = "https://upload-images.jianshu.io/upload_images/" + image
            file_name = str(image).split('/')[-1]
            download_images(url, file_name)
        # rewrite the img tags to point at the local copies
        article_content = re.sub(image_pattern, r'<img src="\1">', article_content)
        book = bean.Book(article_id, classify_id, name, title, article_content, len(article_content),
                         views_count, likes_count, comment_count, rewards_count)
        books.append(book)

    # Fetch a user profile, and collect the articles that the user published
    def get_user(self, classify_id, user_url):
        content = get_content(user_url)
        pattern = re.compile(r'<div class="main-top">(.*?)<ul class="trigger-menu"', re.S)
        main_top = pattern.findall(content)[0]
        pattern = re.compile(r'<a class="avatar".*?<img src="(.*?)".*?<a class="name" href="/u/.*?">(.*?)</a>.*?<div class="info">.*?<ul>(.*?)</ul>', re.S)
        info = pattern.findall(main_top)[0]
        head_image = info[0]
        name = info[1]
        password = 123
        pattern = re.compile(r'.*?<i class="iconfont ic-(.*?)man"')
        sex_icon = pattern.findall(main_top)
        sex = "男"
        # no gender icon set: default to female
        if len(sex_icon) == 0:
            sex = "女"
        else:
            sex_icon = sex_icon[0]
            if sex_icon == 'wo':
                sex = "女"
        age = random.randint(18, 54)
        address = address_list[random.randint(0, len(address_list) - 1)]
        tel = 13299884565
        email = format_pinyin(name) + "@"  # the mail domain was lost from the published listing
        pattern = re.compile(r'<li>.*?<p>(.*?)</p>.*?</li', re.S)
        # following, fans, articles, words, likes, assets
        info = pattern.findall(info[2])
        pattern = re.compile(r'<div class="description">.*?<div class="js-intro">(.*?)</div>', re.S)
        profile = pattern.findall(content)[0]
        # name, password, sex, age, address, tel, head_image, email, followers_count, fans_count,
        # articles_count, words_count, likes_count, balance, profile
        balance = str(info[5])
        if 'w' in balance:
            balance = float(balance.split('w')[0]) * 10000
        # download the avatar
        path_name = str(head_image).split('?')[0].split('/')[-1]
        download_images(head_image, path_name)
        user = bean.User(name, password, sex, age, address, tel + random.randint(0, 1000), path_name, email,
                         info[0], info[1], info[2], info[3], info[4], balance, profile)
        users.append(user)
        # collect the user's own articles
        self.get_book_urls(content, classify_id)

    # Fetch the comments of one article via the JSON API
    def get_comment(self, book_id, classify_id, url):
        comment_list = json.loads(get_content(url))["comments"]
        for c in comment_list:
            # the commenting user
            user = c["user"]
            # the user's profile URL
            user_url = self.home_url + "/u/" + user["slug"]
            # [classify_id, user_url]
            user_urls.append([classify_id, user_url])
            # the comment itself
            comment = bean.Comment(c["id"], user["id"], book_id, c["compiled_content"], c["likes_count"],
                                   c["children_count"], random.randint(0, 1))
            comments.append(comment)
            children_list = c["children"]
            for child in children_list:
                user = child["user"]
                comment = bean.Comment(child["id"], user["id"], book_id, child["compiled_content"], 0, 0,
                                       random.randint(0, 1))
                comments.append(comment)
                user_url = self.home_url + "/u/" + user["slug"]
                user_urls.append([classify_id, user_url])

    def get_classify_url(self, v):
        return self.home_url + '/c/' + v

    def entrance(self):
        self.get_classify_list()
        print("fetching article list")
        for article in articles_list:
            print(article)
            time.sleep(1)
            print("sleeping 1s ...")
            self.get_book(article[0], article[1], article[2])
        print("*" * 100)
        print("fetching user list")
        for user in user_urls:
            print(user)
            self.get_user(user[0], user[1])
        print("*" * 100)
        print("fetching comment list")
        for comment in comment_urls:
            print(comment)
            self.get_comment(comment[0], comment[1], comment[2])
        print("*" * 100)
        print("users: " + str(len(users)))
        print("books: " + str(len(books)))
        print("comments: " + str(len(comments)))
        print("*" * 100)
        print("writing files ...")
        write_user()
        write_classify()
        write_book()
        write_comment()

    def get_comment_url(self, article_id):
        # /shakespeare/notes/9126/comments?page=1&count=10&author_only=false&order_by=desc
        return self.home_url + '/shakespeare/notes/' + str(article_id) + '/comments?page=1&count=' + \
               str(self.comment_count) + '&author_only=false&order_by=desc'


if __name__ == '__main__':
    spider = Spider()
    spider.entrance()
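One design note: the script emits .sql files by string concatenation and guards against broken statements only by swapping straight quotes for full-width ones. If the rows were written straight to a database instead, parameterized queries would remove the quoting problem entirely. A minimal sketch under assumptions: sqlite3 here stands in for whatever database the generated .sql files target, and the table mirrors the script's comment table.

import sqlite3

# Parameterized inserts: the driver handles quoting, so no character swapping is needed.
conn = sqlite3.connect("jianshu.db")
conn.execute("CREATE TABLE IF NOT EXISTS comment "
             "(userId INTEGER, bookId INTEGER, content TEXT, status INTEGER, "
             "likes INTEGER, replys INTEGER)")
rows = [(1, 7, "it's got a quote and still inserts fine", 1, 3, 0)]
conn.executemany("INSERT INTO comment (userId, bookId, content, status, likes, replys) "
                 "VALUES (?, ?, ?, ?, ?, ?)", rows)
conn.commit()
conn.close()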

(3) Results
