
Memo 3: Scraping a blogger's hot-post info and the user info of everyone who liked each hot Weibo post

Date: 2024-01-26 06:18:35



The whole thing is plain Python 3 plus requests. One caveat up front: the endpoints were written host-relative ('/api/...') in the original, so the https://m.weibo.cn prefix below is an assumption (that is the Weibo mobile API host these container/getIndex and attitudes/show paths belong to); adjust HOST if Weibo has moved things.

import csv
import os
import re
import time

import requests

# Spoof a browser User-Agent; without it the requests get rate-limited.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}
Cookies = {"Cookie": "_T_WM=ac858e3c2c4bfcf46782f5928d99; WEIBOCN_FROM=1110006030; ALF=1525487789; SCF=AktDkPHfGtZ_G6P28yFN5QufvOsFbI5pFfURfdnppHMyiRVumWsnFuuqlxsaRkfm-IyfBlTdHqvtLmDZj1Bu2SI.; SUB=_2A253wfyTDeRhGeVO41YZ8ijOwjyIHXVVTYTbrDV6PUJbktANLUTmkW1NTSfFYR33sk1GxQdr6aOyC5D9YpwqQYUy; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFEZ5vWGfwYo6aYTg5NIEeO5JpX5K-hUgL.Foe71hBReoqE1K52dJLoIpeLxKqL1-BLBKnLxKqL1KnL128j; SUHB=04WeilAeo6tedn; SSOLoginState=1522896067; M_WEIBOCN_PARAMS=from%3Dfeed%26oid%3D4225101094628701%26luicode%3D10000011%26lfid%3D1076033084826290%26fid%3D1005053084826290%26uicode%3D10000011"}
# When something inexplicably stops working, refresh the Cookies first.

# Assumed host: the original gave host-relative paths only.
HOST = 'https://m.weibo.cn'

# Fetch the user's profile; this also yields the key parameters uid, fid, oid, etc.
def get_user_info(usr_id):
    url = HOST + '/api/container/getIndex?type=uid&value={usr_id}'.format(usr_id=usr_id)
    resp = requests.get(url, headers=headers, cookies=Cookies)
    jsondata = resp.json()
    user_info = jsondata.get('data').get('userInfo')
    nickname = user_info.get('screen_name')
    mblog_num = user_info.get('statuses_count')
    verified = user_info.get('verified')
    verified_reason = user_info.get('verified_reason')
    gender = user_info.get('gender')
    urank = user_info.get('urank')    # user level
    mbrank = user_info.get('mbrank')  # membership level
    followers_count = user_info.get('followers_count')
    follow_count = user_info.get('follow_count')
    uid = user_info.get('toolbar_menus')[0].get('params').get('uid')
    try:
        fid = user_info.get('toolbar_menus')[1].get('actionlog').get('fid')
        # If the oid picked up here is not the hot-posts ('Hotblog') container,
        # try swapping the 0 and 1 indices.
        oid = user_info.get('toolbar_menus')[2].get('params').get('menu_list')[0].get('actionlog').get('oid')
        cardid = user_info.get('toolbar_menus')[1].get('actionlog').get('cardid')
    except Exception:
        uid = ''
        fid = ''
        oid = ''
        cardid = ''
    containerid = jsondata.get('data').get('tabsInfo').get('tabs')[0].get('containerid')
    info = {'nickname': nickname, 'mblog_num': mblog_num, 'verified': verified,
            'verified_reason': verified_reason, 'gender': gender, 'urank': urank,
            'mbrank': mbrank, 'followers_count': followers_count,
            'follow_count': follow_count, 'uid': uid, 'fid': fid,
            'cardid': cardid, 'containerid': containerid, 'oid': oid}
    print(info)
    return info
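An aside on all those chained .get() calls: they raise AttributeError the moment one level is missing, for instance when the cookie has expired and 'userInfo' is gone from the response. A small helper along these lines (not part of the original script, just a sketch) would make the lookups fail soft:

def dig(obj, *keys, default=None):
    # Walk a nested dict/list structure; return `default` as soon as a step fails.
    for key in keys:
        try:
            obj = obj[key]
        except (KeyError, IndexError, TypeError):
            return default
    return obj

# e.g., inside get_user_info:
#   nickname = dig(jsondata, 'data', 'userInfo', 'screen_name')
#   fid = dig(jsondata, 'data', 'userInfo', 'toolbar_menus', 1, 'actionlog', 'fid')

With that, the try/except around fid, oid and cardid collapses into plain assignments with default=''. The rest of the script: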
# Fetch all hot posts: text, creation time, post id, repost / comment / like
# counts, and so on. Returns the list of post ids.
def mblog_list(uid, oid):
    ids = []
    base_url = HOST + '/api/container/getIndex?containerid={oid}'
    page_url = HOST + '/api/container/getIndex?containerid={oid}&type=uid&value={uid}&page={page}'
    url = base_url.format(oid=oid)
    print(url)
    resp = requests.get(url, headers=headers, cookies=Cookies)
    response = resp.json()
    # Total number of hot posts; they come back ten per page.
    total = response['data']['cardlistInfo']['total']
    print(total)
    dirname = '博主微博热门信息汇总'
    path = os.getcwd() + '/{dirname}/'.format(dirname=dirname)
    os.makedirs(path, exist_ok=True)   # os.mkdir would crash on a second run
    path2 = os.getcwd() + '/%s/%s.csv' % (dirname, uid)
    csvfile = open(path2, 'a+', encoding='gb18030', newline='')
    writer = csv.writer(csvfile)
    writer.writerow(('id', 'reposts_count', 'comments_count', 'attitudes_count', 'date', 'text'))
    page_num = int(int(total) / 10) + 1   # number of result pages
    for i in range(1, page_num + 1):
        p_url = page_url.format(oid=oid, uid=uid, page=i)
        page_resp = requests.get(p_url, headers=headers, cookies=Cookies)
        page_data = page_resp.json()
        try:
            cards = page_data['data']['cards']
            for card in cards:
                try:
                    mblog = card['mblog']
                    date = mblog['created_at']
                    mblog_id = mblog['id']
                    ids.append(mblog_id)
                    # mblog['text'] is an HTML fragment full of link debris;
                    # strip the <span> and <a> tags.
                    dirty_text = mblog['text']
                    cleaned1 = re.sub(r'<span .*?</span>', '', dirty_text)
                    text = re.sub(r'<a .*?</a>', '', cleaned1)
                    reposts_count = mblog['reposts_count']
                    comments_count = mblog['comments_count']
                    attitudes_count = mblog['attitudes_count']
                    writer.writerow((mblog_id, reposts_count, comments_count, attitudes_count, date, text))
                    print('%d pages in total, crawled %d, %s' % (page_num, i, mblog_id))
                except Exception:
                    continue
        except Exception:
            continue
        time.sleep(1)
    csvfile.close()
    return ids

# Fetch the users who liked one post; saved to <usr_id>/<wb_id>.csv
# (main() creates the <usr_id> directory first).
def get_userlike_counts(usr_id, wb_id):
    url = HOST + '/api/attitudes/show?id={id}'.format(id=wb_id)
    page_url = HOST + '/api/attitudes/show?id={id}&page={page}'
    first_resp = requests.get(url, headers=headers, cookies=Cookies)
    first_data = first_resp.json()
    try:
        page_max_num = first_data['data']['max']
        print(page_max_num)
        path2 = os.getcwd() + '/%s/%s.csv' % (usr_id, wb_id)
        csvfile = open(path2, 'a+', encoding='gb18030', newline='')
        writer = csv.writer(csvfile)
        writer.writerow(('username', 'user_id', 'verified', 'verified_type',
                         'profile_url', 'review_id', 'image', 'source',
                         'following', 'follow_me', 'date'))
        # Original note: crawling reposts, by contrast, is not capped by page
        # count and can run past 100 pages.
        for i in range(1, page_max_num + 1):
            p_url = page_url.format(id=wb_id, page=i)
            resp = requests.get(p_url, cookies=Cookies, headers=headers)
            print(resp.status_code)   # should be 200
            resp_data = resp.json()
            try:
                data = resp_data.get('data').get('data')   # a list of like records
                for d in data:
                    review_id = d['id']
                    user_id = d['user']['id']
                    source = d['source']
                    username = d['user']['screen_name']
                    image = d['user']['profile_image_url']
                    verified = d['user']['verified']
                    verified_type = d['user']['verified_type']
                    profile_url = d['user']['profile_url']
                    following = d['user']['following']
                    follow_me = d['user']['follow_me']
                    date = d['created_at']
                    writer.writerow((username, user_id, verified, verified_type,
                                     profile_url, review_id, image, source,
                                     following, follow_me, date))
                    print('%d pages in total, crawled %d, %s' % (page_max_num, i, username))
            except Exception:
                print(resp_data['msg'])
                continue
            time.sleep(1)
        csvfile.close()
    except Exception:
        print(first_data['msg'])

def main():
    user_id = '2378564111'
    user_info = get_user_info(user_id)
    uid = user_info.get('uid')
    oid = user_info.get('oid')
    print(uid, oid)
    ids = mblog_list(uid, oid)
    print('............')
    # Create the <user_id> directory before get_userlike_counts writes into it.
    path = os.getcwd() + '/{dirname}/'.format(dirname=user_id)
    os.makedirs(path, exist_ok=True)
    # Dump the liker info for every hot post.
    for i, wb_id in enumerate(ids):
        print('hot post #%d' % i)
        get_userlike_counts(user_id, wb_id)

main()
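About the two re.sub passes in mblog_list: mblog['text'] comes back as an HTML fragment in which inline icons sit in <span> tags and @-mentions and links in <a> tags. A quick standalone check with a made-up fragment shows what survives:

import re

dirty_text = '今天天气不错 <span class="url-icon"><img src="emoji.png"></span><a href="/n/somebody">@somebody</a>'
cleaned1 = re.sub(r'<span .*?</span>', '', dirty_text)
text = re.sub(r'<a .*?</a>', '', cleaned1)
print(text)   # -> '今天天气不错 '

Note that both patterns are non-greedy, so back-to-back tags are removed one at a time instead of swallowing everything between the first opening tag and the last closing one.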

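One more defensive sketch, my addition rather than anything in the original: every request in the script assumes a 200 response with a valid JSON body, but Weibo throttles scrapers, which is presumably why the script leans on time.sleep(1) and freshly copied Cookies. A thin wrapper that retries on a non-200 status or a non-JSON body would slot in wherever requests.get(...).json() is called:

import time
import requests

def get_json(url, retries=3, wait=5, **kwargs):
    # GET `url` and return the parsed JSON; retry on a non-200 status or a
    # body that is not JSON (e.g. an anti-bot interstitial page).
    last = None
    for _ in range(retries):
        last = requests.get(url, **kwargs)
        if last.status_code == 200:
            try:
                return last.json()
            except ValueError:
                pass
        time.sleep(wait)
    last.raise_for_status()
    raise RuntimeError('got 200 but no JSON from ' + url)

# e.g.  jsondata = get_json(url, headers=headers, cookies=Cookies)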