
Python targeted crawler: scraping campus forum post information

Date: 2018-12-01 21:32:51


Introduction

I wrote this small crawler mainly to scrape internship postings from the campus forum; it relies primarily on the Requests library.

Source code

URLs.py

Its main job is to take an initial URL (containing a page parameter) and build the list of URLs from the current page number up to pageNum.

import re

def getURLs(url, attr, pageNum=1):
    """Build the list of page URLs from the current page number up to pageNum."""
    all_links = []
    try:
        # Read the current page number out of the URL, e.g. "page=3" -> 3
        now_page_number = int(re.search(attr + '=(\d+)', url, re.S).group(1))
        for i in range(now_page_number, pageNum + 1):
            # Substitute the page number to produce each page's URL
            new_url = re.sub(attr + '=\d+', attr + '=%s' % i, url)
            all_links.append(new_url)
        return all_links
    except TypeError:
        print "arguments TypeError: attr should be string."

uni_2_native.py

Chinese text in the pages scraped from the forum comes back Unicode-escaped as HTML numeric character references, i.e. text of the form &#XXXX;, so the downloaded content has to be converted back into readable characters.

import sys
import re

# Python 2 workaround so UTF-8 strings are handled by default
reload(sys)
sys.setdefaultencoding('utf-8')

def get_native(raw):
    """Replace numeric character references such as &#20320; with the actual characters."""
    tostring = raw
    while True:
        # Find the next numeric character reference, e.g. "&#20320;"
        obj = re.search('&#(.*?);', tostring, flags=re.S)
        if obj is None:
            break
        else:
            raw, code = obj.group(0), obj.group(1)
            # Substitute the reference with the character it encodes
            tostring = re.sub(raw, unichr(int(code)), tostring)
    return tostring
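A quick check of get_native on a short string; the two sample references below are the decimal code points for "你好" (this example is not from the original post):

print get_native(u'hello &#20320;&#22909;')
# hello 你好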

Saving the results to the database: saveInfo.py (note that although the class is named saveSqlite, the code actually writes to MySQL through MySQLdb)

# -*- coding: utf-8 -*-
import MySQLdb

class saveSqlite():
    def __init__(self):
        self.infoList = []

    def saveSingle(self, author=None, title=None, date=None, url=None, reply=0, view=0):
        # Cache one post's fields in memory; skip incomplete records
        if author is None or title is None or date is None or url is None:
            print "No info saved!"
        else:
            singleDict = {}
            singleDict['author'] = author
            singleDict['title'] = title
            singleDict['date'] = date
            singleDict['url'] = url
            singleDict['reply'] = reply
            singleDict['view'] = view
            self.infoList.append(singleDict)

    def toMySQL(self):
        conn = MySQLdb.connect(host='localhost', user='root', passwd='', port=3306, db='db_name', charset='utf8')
        cursor = conn.cursor()
        # sql = "select * from info"
        # n = cursor.execute(sql)
        # for row in cursor.fetchall():
        #     for r in row:
        #         print r
        #     print '\n'

        # Clear the old records before inserting the freshly crawled ones
        sql = "delete from info"
        cursor.execute(sql)
        conn.commit()

        sql = "insert into info(title,author,url,date,reply,view) values (%s,%s,%s,%s,%s,%s)"
        params = []
        for each in self.infoList:
            params.append((each['title'], each['author'], each['url'], each['date'], each['reply'], each['view']))
        cursor.executemany(sql, params)
        conn.commit()

        cursor.close()
        conn.close()

    def show(self):
        for each in self.infoList:
            print "author: " + each['author']
            print "title: " + each['title']
            print "date: " + each['date']
            print "url: " + each['url']
            print "reply: " + str(each['reply'])
            print "view: " + str(each['view'])
            print '\n'

if __name__ == '__main__':
    save = saveSqlite()
    save.saveSingle('网', 'aaa', '-10-10 10:10:10', '', 1, 1)
    # save.show()
    save.toMySQL()
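toMySQL assumes the database db_name already contains an info table with the six columns named in the insert statement. A minimal setup sketch using the same MySQLdb connection parameters follows; the column types are assumptions, not taken from the original post:

# Hypothetical setup script: creates the info table that toMySQL expects.
# Column types are guesses; adjust them to your own forum's data.
import MySQLdb

conn = MySQLdb.connect(host='localhost', user='root', passwd='', port=3306, db='db_name', charset='utf8')
cursor = conn.cursor()
cursor.execute("""
    create table if not exists info (
        title  varchar(255),
        author varchar(64),
        url    varchar(255),
        `date` varchar(32),
        reply  int,
        `view` int
    )
""")
conn.commit()
cursor.close()
conn.close()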

Main crawler code

import requests
from lxml import etree
from cc98 import uni_2_native, URLs, saveInfo

# Forge a header matching the site you want to crawl
headers = {
    'Accept': '',
    'Accept-Encoding': '',
    'Accept-Language': '',
    'Connection': '',
    'Cookie': '',
    'Host': '',
    'Referer': '',
    'Upgrade-Insecure-Requests': '',
    'User-Agent': ''
}

url = '/list.asp?boardid=459&page=1&action='
cc98 = '/'

print "get information from cc98..."

urls = URLs.getURLs(url, "page", 50)
savetools = saveInfo.saveSqlite()

for url in urls:
    r = requests.get(url, headers=headers)
    html = uni_2_native.get_native(r.text)
    selector = etree.HTML(html)
    content_tr_list = selector.xpath('//form/table[@class="tableborder1 list-topic-table"]/tbody/tr')
    for each in content_tr_list:
        href = each.xpath('./td[2]/a/@href')
        if len(href) == 0:
            continue
        else:
            # print len(href)
            # not ideal to use for loops here, since each list holds just one element,
            # but indexing the result directly did not work for me
            for each_href in href:
                link = cc98 + each_href
                title_author_time = each.xpath('./td[2]/a/@title')
                # print len(title_author_time)
                for info in title_author_time:
                    info_split = info.split('\n')
                    title = info_split[0][1:len(info_split[0]) - 1]
                    author = info_split[1][3:]
                    date = info_split[2][3:]
                    hot = each.xpath('./td[4]/text()')
                    # print len(hot)
                    for hot_num in hot:
                        reply_view = hot_num.strip().split('/')
                        reply, view = reply_view[0], reply_view[1]
                        savetools.saveSingle(author=author, title=title, date=date, url=link, reply=reply, view=view)

print "All got! Now saving to Database..."
# savetools.show()
savetools.toMySQL()
print "ALL CLEAR! Have Fun!"

That is all for this article. I hope it is helpful for your study, and please continue to support 脚本之家.
