本文来自网易云社区
作者:王涛
此处我们给出几个常用的代码例子,包括get,post(json,表单),带证书访问:
Get 请求@gen.coroutine
deffetch_url():
try:
c=CurlAsyncHTTPClient()#定义一个httpclient
myheaders={
"Host":"",
"Connection":"keep-alive",
"Cache-Control":"max-age=0",
"Upgrade-Insecure-Requests":"1",
"User-Agent":"Mozilla/5.0(Windows;U;WindowsNT6.1;en-US)AppleWebKit/532.5(KHTML,likeGecko)Chrome/4.0.249.0Safari/532.5",
"Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
"Accept-Encoding":"gzip,deflate",
"Accept-Language":"zh-CN,zh;q=0.9,en;q=0.8"
}
url="/weixin?type=1&s_from=input&query=%E4%BA%BA%E6%B0%91%E6%97%A5%E6%8A%A5&ie=utf8&_sug_=n&_sug_type_="
req=HTTPRequest(url=url,method="GET",headers=myheaders,follow_redirects=True,request_timeout=20,connect_timeout=10,
proxy_host="127.0.0.1",
proxy_port=8888)
response=yieldc.fetch(req)#发起请求
printresponse.code
printresponse.body
IOLoop.current().stop()#停止ioloop线程
except:
printtraceback.format_exc()
Fiddler 抓到的报文请求头:
POST JSON数据请求@gen.coroutine
deffetch_url():
"""抓取url"""
try:
c=CurlAsyncHTTPClient()#定义一个httpclient
myheaders={
"Host":"",
"Connection":"keep-alive",
"Cache-Control":"max-age=0",
"Upgrade-Insecure-Requests":"1",
"User-Agent":"Mozilla/5.0(Windows;U;WindowsNT6.1;en-US)AppleWebKit/532.5(KHTML,likeGecko)Chrome/4.0.249.0Safari/532.5",
"Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
"Accept-Encoding":"gzip,deflate",
"Content-Type":"Application/json",
"Accept-Language":"zh-CN,zh;q=0.9,en;q=0.8"
}
url="http://127.0.0.1?type=1&s_from=input&query=%E4%BA%BA%E6%B0%91%E6%97%A5%E6%8A%A5&ie=utf8&_sug_=n&_sug_type_="
body=json.dumps({"key1":"value1","key2":"value2"})#Json格式数据
req=HTTPRequest(url=url,method="POST",headers=myheaders,follow_redirects=True,request_timeout=20,connect_timeout=10,
proxy_host="127.0.0.1",proxy_port=8888,body=body)
response=yieldc.fetch(req)#发起请求
printresponse.code
printresponse.body
IOLoop.current().stop()#停止ioloop线程
except:
printtraceback.format_exc()
Fiddler 抓到的报文请求头:
POST Form表单数据请求@gen.coroutine
deffetch_url():
"""抓取url"""
try:
c=CurlAsyncHTTPClient()#定义一个httpclient
myheaders={
"Host":"",
"Connection":"keep-alive",
"Cache-Control":"max-age=0",
"Upgrade-Insecure-Requests":"1",
"User-Agent":"Mozilla/5.0(Windows;U;WindowsNT6.1;en-US)AppleWebKit/532.5(KHTML,likeGecko)Chrome/4.0.249.0Safari/532.5",
"Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
"Accept-Encoding":"gzip,deflate",
#"Content-Type":"Application/json",
"Accept-Language":"zh-CN,zh;q=0.9,en;q=0.8"
}
importurllib
url="http://127.0.0.1?type=1&s_from=input&query=%E4%BA%BA%E6%B0%91%E6%97%A5%E6%8A%A5&ie=utf8&_sug_=n&_sug_type_="
body=urllib.urlencode({"key1":"value1","key2":"value2"})#封装form表单
req=HTTPRequest(url=url,method="POST",headers=myheaders,follow_redirects=True,request_timeout=20,connect_timeout=10,
proxy_host="127.0.0.1",proxy_port=8888,body=body)
response=yieldc.fetch(req)#发起请求
printresponse.code
printresponse.body
IOLoop.current().stop()#停止ioloop线程
except:
printtraceback.format_exc()
Fiddler 抓到的报文请求头:
添加证书访问deffetch_url():
"""抓取url"""
try:
c=CurlAsyncHTTPClient()#定义一个httpclient
myheaders={
"Host":"",
"Connection":"keep-alive",
"Cache-Control":"max-age=0",
"Upgrade-Insecure-Requests":"1",
"User-Agent":("Mozilla/5.0(WindowsNT10.0;Win64;x64)"
"AppleWebKit/537.36(KHTML,likeGecko)"
"Chrome/68.0.3440.106Safari/537.36"),
"Accept":("text/html,application/xhtml+xml,"
"application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"),
"Accept-Encoding":"gzip,deflate,br",
"Accept-Language":"zh-CN,zh;q=0.9,en;q=0.8"
}
importurllib
url="/"
req=HTTPRequest(url=url,method="GET",headers=myheaders,follow_redirects=True,request_timeout=20,connect_timeout=10,proxy_host="127.0.0.1",
proxy_port=8888,ca_certs="FiddlerRoot.pem")#绑定证书
response=yieldc.fetch(req)#发起请求
printresponse.code
printresponse.body
IOLoop.current().stop()#停止ioloop线程
except:
printtraceback.format_exc()
Fiddler抓到的报文(说明可以正常访问)
四、总结
抓取量少的时候,建议使用requests,简单易用。
并发量大的时候,建议使用tornado,单线程高并发,高效易编程。
以上给出了requests和tornado中常用的接口和参数说明,能解决爬虫面对的大部分问题,包括并发抓取、日常的反爬应对,https网站的抓取。
附上一段我自己的常用抓取代码逻辑:
import random
import time  # was missing; url_fetcher uses time.time()
import traceback

import tornado.util  # was missing; url_fetcher catches tornado.util.TimeoutError
from tornado import gen
from tornado.curl_httpclient import CurlAsyncHTTPClient  # was missing
from tornado.httpclient import HTTPRequest  # was missing
from tornado.ioloop import IOLoop, PeriodicCallback
from tornado.queues import Queue

# Shared task queue: producers push URLs/params, fetcher coroutines consume them.
TASK_QUE = Queue(maxsize=1000)
def response_handler(res):
    """Handle one response body.

    Typically this parses the target data out of *res* and pushes any newly
    discovered URLs back onto the task queue.  Stub implementation here.
    """
    pass
@gen.coroutine
def url_fetcher_without_param():
    """Fetcher coroutine variant that takes no arguments.  Stub implementation."""
    pass
@gen.coroutine
defurl_fetcher(*args,**kwargs):
globalTASK_QUE
c=CurlAsyncHTTPClient()
while1:
#console_show_log("Let'sspider")
try:
param=TASK_QUE.get(time.time()+300)#5分钟超时
excepttornado.util.TimeoutError::
yieldgen.sleep(random.randint(10,100))
continue
try:
req=HTTPRequest(url,method=,headers=,....)#按需配置参数
response=yieldc.fetch(req)
ifresponse.coe==200:
response_handler(response.body)
exceptException:
yieldgen.sleep(10)
continue
finally:
print"Iamaslowspider"
yieldgen.sleep(random.randint(10,100))
@gen.coroutine
def period_callback():
    """Periodic hook driven by PeriodicCallback; put housekeeping work here.  Stub."""
    pass
def main():
    """Spawn the fetcher coroutines on the IOLoop, install the periodic
    callback, and start the loop (blocks until IOLoop.stop())."""
    from tornado.ioloop import PeriodicCallback  # local import keeps this fix self-contained

    io_loop = IOLoop.current()
    # Concurrency: each spawn_callback schedules an independent fetcher coroutine.
    io_loop.spawn_callback(url_fetcher, 1)
    io_loop.spawn_callback(url_fetcher, 2)
    io_loop.spawn_callback(url_fetcher_without_param)  # arguments are optional

    # PeriodicCallback is a class in tornado.ioloop, not a method of IOLoop:
    # the original io_loop.PeriodicCallback(...) would raise AttributeError,
    # and the interval constant was defined but never passed.
    PERIOD_CALLBACK_MILSEC = 10  # callback interval, in milliseconds
    PeriodicCallback(period_callback, PERIOD_CALLBACK_MILSEC).start()

    io_loop.start()


if __name__ == "__main__":
    main()
以上,欢迎讨论交流
五、参考:
网易云免费体验馆,0成本体验20+款云产品!
更多网易研发、产品、运营经验分享请访问网易云社区。