时间:-04-10
概述:搜索爬虫 人工智能
一个 Python 人工智能爬虫演示。本代码演示如何抓取新浪微博数据;采集过程中如需取消,请按 CTRL+C 退出程序。
#!/usr/bin/python
#download by
import requests
import MySQL
import json
import urllib
import weibobase
import sys
import time
# Python 2 only: sys.setdefaultencoding is removed from the sys namespace at
# interpreter start-up, so the module is reloaded to get it back, and the
# process-wide implicit str<->unicode encoding is then switched to UTF-8.
# Neither the reload() builtin nor sys.setdefaultencoding exists in Python 3.
reload(sys)
sys.setdefaultencoding('utf-8')
# Cookie jar passed as `cookies=` to requests.get() in GetWieBo.request.
# NOTE(review): these are hard-coded session/login values committed in source;
# presumably they belong to one logged-in account and will expire -- consider
# loading them from configuration instead.
cookie = {"Apache": "4085444405648.557.1517558859962",
"H5_INDEX": "2",
"H5_INDEX_TITLE": "%E7%A7%8B%E5%86%AC%E6%9A%96%E8%89%B2%E7%B3%BB",
"M_WEIBOCN_PARAMS": "lfid%3D1005052109066367%252Fhome%26luicode%3D20000174%26fid%3D102803%26uicode%3D10000011",
"SCF": "AlPdz7Wu9iu_xwiWfMtd1hBGr6mZqaKtCcidCgPrDl6ocdl8HcIvA5NZpk0cm36a0xrCpnFl0ZgfV-Bc5BUAktQ.",
"SSOLoginState": "1520562809",
"SUB": "_2A253pYIoDeRhGeRP61sR9ijPzTuIHXVVaS5grDV6PUJbktAKLRLQkW1NUFPZQRFUxRYf5itrGk6VqEtGIU3izGDT",
"SUBP": "0033WrSXqPxfM725Ws9jqgMF55529P9D9W5MyLbIiX5quKaqF190KSgT5JpX5K-hUgL.Fozpeh.7Soq0SoM2dJLoIEXLxKMLBKML12zLxK-L1hqLB-eLxKqL1-2L1KqLxKnL1h.LBozLxKMLBoeLB.zt",
"SUHB": "0Elrkzb0Smx-GW",
"WEIBOCN_FROM": "1110006030",
"_T_WM": "46f8072dc2db4752c9f5f1bb610d6934",
"browser": "d2VpYm9mYXhpYW4%3D",
"h5_deviceID": "da4db009e6ae3831cc4fbc8d1998",
}
# Second cookie jar. It is not referenced anywhere in the visible part of this
# file (GetWieBo.request sends `cookie`, not `cookie2`).
# NOTE(review): presumably an alternate or older session -- confirm whether it
# is still needed; like `cookie`, it embeds hard-coded session values.
cookie2 = {"ALF": "1522043003",
"M_WEIBOCN_PARAMS": "luicode%3D10000011%26lfid%3D102803%26fid%3D102803%26uicode%3D10000011",
"SCF": "AlPdz7Wu9iu_xwiWfMtd1hBGr6mZqaKtCcidCgPrDl6oNht3rRthMvGzFst-DncCt1l6_LYi6h6jCGNO6OtXVDU.",
"SUB": "_2A253lIvWDeRhGeRP61sR9ijPzTuIHXVVdhWerDV6PUJbktANLVTakW1NUFPZQVmJdEJdcebLE3J8mIqAPe4rxEz4",
"SUBP": "0033WrSXqPxfM725Ws9jqgMF55529P9D9W5MyLbIiX5quKaqF190KSgT5JpX5K-hUgL.Fozpeh.7Soq0SoM2dJLoIEXLxKMLBKML12zLxK-L1hqLB-eLxKqL1-2L1KqLxKnL1h.LBozLxKMLBoeLB.zt",
"SUHB": "0pHAjcQEUb1cye",
"WEIBOCN_FROM": "1110006030",
"_T_WM": "46f8072dc2db4752c9f5f1bb610d6934",
"browser": "d2VpYm9mYXhpYW4%3D",
"h5_deviceID": "da4db009e6ae3831cc4fbc8d1998",
}
# Browser-like HTTP request headers with a mobile Safari (iPhone) User-Agent.
# Not referenced in the visible part of this file: GetWieBo.request passes only
# `cookies=` to requests.get().
# NOTE(review): 'Host' is an empty string, and the 'Cookie' entry embeds
# hard-coded session values (its SUB and SCF match the `cookie` dict).
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Accept-Encoding':"gzip, deflate, br",
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Host': '',
'Cookie':'browser=d2VpYm9mYXhpYW4%3D; h5_deviceID=da4db009e6ae3831cc4fbc8d1998; _T_WM=46f8072dc2db4752c9f5f1bb610d6934; ALF=1523154787; SCF=AlPdz7Wu9iu_xwiWfMtd1hBGr6mZqaKtCcidCgPrDl6ocdl8HcIvA5NZpk0cm36a0xrCpnFl0ZgfV-Bc5BUAktQ.; SUB=_2A253pYIoDeRhGeRP61sR9ijPzTuIHXVVaS5grDV6PUJbktAKLRLQkW1NUFPZQRFUxRYf5itrGk6VqEtGIU3izGDT; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W5MyLbIiX5quKaqF190KSgT5JpX5K-hUgL.Fozpeh.7Soq0SoM2dJLoIEXLxKMLBKML12zLxK-L1hqLB-eLxKqL1-2L1KqLxKnL1h.LBozLxKMLBoeLB.zt; SUHB=0Elrkzb0Smx-GW; SSOLoginState=1520562809; H5_INDEX=2; H5_INDEX_TITLE=%E7%A7%8B%E5%86%AC%E6%9A%96%E8%89%B2%E7%B3%BB; WEIBOCN_FROM=1110006030; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D102803%26fid%3D102803%26uicode%3D10000011',
'RA-Sid': 'B781E81A-0402-024118-ce25e1-ba5345',
'RA-Ver': '3.0.8',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'
}
class GetWieBo(object):
    """Fetch one page of a Weibo feed and hand each post to the MySQL module.

    ``request`` downloads the JSON for container 102803, copies the author and
    post fields of every card onto ``weibobase.wb_uset`` / ``weibobase.wb_mblog``
    and passes them to ``MySQL.selectUset`` / ``MySQL.selectMblog``.

    All ``print`` calls take a single argument so the code behaves the same
    under Python 2 (which the rest of this file targets) and parses on Python 3.
    """

    # Not referenced anywhere in the visible code; kept for compatibility.
    alls_num = 0

    # NOTE(review): the original request URL had no scheme or host
    # ('/api/container/getIndex?...'), which requests rejects outright. The
    # host below is inferred from the endpoint path and the WEIBOCN_* cookie
    # names -- confirm it is the intended one.
    BASE_URL = "https://m.weibo.cn"

    # Seconds to pause after a failed request or an empty page.
    RETRY_DELAY = 1000

    def __init__(self):
        super(GetWieBo, self).__init__()
        print(u'数据获取中,CTRL+C 退出程序...')

    def request(self, num=0):
        """Download one feed page and store every post found on it.

        Args:
            num: value substituted into the ``since_id`` query parameter.

        Returns:
            None. On a request or JSON-decoding failure the error is printed,
            the method sleeps ``RETRY_DELAY`` seconds and returns without
            storing anything.
        """
        url = '%s/api/container/getIndex?containerid=102803&since_id=%s' % (
            self.BASE_URL, num)
        try:
            json_str = requests.get(url, cookies=cookie).content
            json_base = json.loads(json_str)
        except Exception as err:
            # `Exception` rather than a bare except, so that CTRL+C
            # (KeyboardInterrupt) still terminates the program as the start-up
            # message promises.
            print("HTTPError--错误%s" % err)
            time.sleep(self.RETRY_DELAY)
            return

        # Tolerate a payload without data/cards instead of raising.
        cards = []
        if isinstance(json_base, dict):
            cards = (json_base.get("data") or {}).get("cards") or []
        print(len(cards))
        if not cards:
            time.sleep(self.RETRY_DELAY)

        for card in cards:
            post = card.get("mblog")
            author = post.get("user") if post else None
            if not post or not author:
                # A card without a post or author cannot be stored; previously
                # this raised KeyError and aborted the whole page.
                continue

            # NOTE(review): wb_uset / wb_mblog are used as-is, not called. If
            # they are classes, these assignments set shared class attributes
            # that are overwritten on every iteration -- confirm against
            # weibobase and the MySQL helpers.
            user = weibobase.wb_uset
            user.avatar_hd = author.get("avatar_hd")              # avatar image
            user.description = author.get("description")          # profile bio
            user.screen_name = author.get("screen_name")          # display name
            user.profile_url = author.get("profile_url")          # profile page URL
            user.followers_count = author.get("followers_count")  # follower count
            user.follow_count = author.get("follow_count")        # following count
            user.id = author.get("id")                            # user id

            mblog = weibobase.wb_mblog
            mblog.id = post.get("id")                            # post id
            mblog.scheme = card.get("scheme")                    # URL of the single post
            mblog.text = post.get("text")                        # post body
            mblog.created_at = post.get("created_at")            # creation time
            mblog.attitudes_count = post.get("attitudes_count")  # likes
            # Was a dangling local `ments_count`, so the value was never stored.
            mblog.comments_count = post.get("comments_count")    # comments
            mblog.reposts_count = post.get("reposts_count")      # reposts
            mblog.source = post.get("source")                    # posting client
            mblog.obj_ext = post.get("obj_ext")                  # play count
            mblog.image_urls = self._image_urls(post)
            mblog.stream_url, mblog.page_url = self._video_urls(post)

            MySQL.selectUset(user)
            MySQL.selectMblog(mblog)

        # NOTE(review): this only references the attribute; nothing is called.
        # If db_cloce is a function that closes the connection, it probably
        # needs `()` -- left unchanged because its type is not visible here.
        MySQL.db_cloce

    @staticmethod
    def _image_urls(post):
        """Return the picture URLs of *post*, preferring each picture's large rendition."""
        pics = post.get("pics")
        if not pics:
            print("不是图片/没有图片")
            return []
        urls = []
        for pic in pics:
            # .get(): a picture with no "large" entry falls back to its
            # thumbnail URL instead of raising KeyError.
            large = pic.get("large")
            urls.append(large["url"] if large else pic["url"])
        return urls

    @staticmethod
    def _video_urls(post):
        """Return ``(stream_url, page_url)`` for *post*, or ``("", "")`` when it has no video."""
        page_info = post.get("page_info")
        if not page_info:
            print("不是视频")
            return "", ""
        media_info = page_info.get("media_info")
        if not media_info:
            print("未获取到视频地址")
            return "", ""
        return media_info.get("stream_url"), page_info.get("page_url")