Crawler Learning 2 - The Requests Library

Request methods:


import requests

r = requests.get('http://httpbin.org/get')         # GET
r = requests.post('http://httpbin.org/post')       # POST
r = requests.put('http://httpbin.org/put')         # PUT
r = requests.delete('http://httpbin.org/delete')   # DELETE
r = requests.head('http://httpbin.org/get')        # HEAD
r = requests.options('http://httpbin.org/get')     # OPTIONS

GET

Example:


import requests

r = requests.get(url='http://www.euraxluo.cn')  # the most basic GET request
print(r.status_code)  # the built-in status code of the response
# any status code other than 200 is treated as an error

Response status codes

Example: a 404


r = requests.get('http://httpbin.org/status/404')
print(r.status_code)  # 404
r.raise_for_status()  # raises an exception here, since the status is an error
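Note that raise_for_status() returns None on success, so assigning its result is not useful; the idiomatic pattern is to catch the exception it raises. A minimal sketch:

import requests

try:
    r = requests.get('http://httpbin.org/status/404')
    r.raise_for_status()  # raises requests.HTTPError for 4xx/5xx responses
except requests.HTTPError as e:
    print(e)  # e.g. "404 Client Error: NOT FOUND for url: ..."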

URL requests with parameters:


# pass parameters in the URL
r = requests.get(url='http://dict.baidu.com/s', params={'wd': 'python'})  # GET request with parameters

# when you don't know the encoding of the response
r.encoding = r.apparent_encoding  # use the encoding guessed from the content
print(r.text)  # the decoded body
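The params dict is percent-encoded into the query string; r.url shows the final URL that was actually requested:

print(r.url)  # http://dict.baidu.com/s?wd=python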

Tips

If the response is an image (or other binary data), r.content returns the raw bytes.

Example: r.content


r = requests.get(url='http://music.baidu.com')  # tested: in practice barely different from r.text here
html = r.content  # raw bytes
# html_doc = str(html, 'utf-8')
html_doc = html.decode("utf-8", "ignore")  # decode manually, ignoring invalid bytes
print(html_doc)

Response content

Different ways of handling the content:

JSON: r.json() (see the sketch at the end of this section)
Binary: typically used for images

from PIL import Image
from io import BytesIO
import requests

r = requests.get('http://httpbin.org/image/png')
m = r.content  # the raw, undecoded bytes
i = Image.open(BytesIO(m))  # build an image from binary data; Image.open needs a file-like object
Text: r.text is decoded automatically; this is what you will use most often.

import requests

r = requests.get('https://euraxluo.cn')
r.text  # already decoded automatically
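For JSON responses, it is the r.json() method on the Response object that parses the body. A minimal sketch against httpbin:

import requests

r = requests.get('http://httpbin.org/get')
data = r.json()  # parses the JSON body into a dict; raises ValueError if the body is not valid JSON
print(data['url'])  # http://httpbin.org/get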

Encoding issues:

1. get_encodings_from_content

import requests

req = requests.get('http://www.euraxluo.cn')
if req.encoding == 'ISO-8859-1':  # requests' fallback when the headers declare no charset
    encodings = requests.utils.get_encodings_from_content(req.text)
    if encodings:
        encoding = encodings[0]  # a charset declared inside the HTML itself
    else:
        encoding = req.apparent_encoding  # guessed from the raw bytes
    # encode_content = req.content.decode(encoding, 'replace').encode('utf-8', 'replace')
    encode_content = req.content.decode(encoding, 'replace')  # 'replace' substitutes a replacement character for illegal bytes
2. Defensive programming

import requests

def get_html(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # raise for 4xx/5xx status codes
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        print("error")

Custom headers

Header values must be strings, bytestrings, or Unicode.


url = 'https://euraxluo.cn'
headers = {'user-agent': 'my-app/0.0.1'}  # custom User-Agent
r = requests.get(url, headers=headers)

POST

POST forms

When several values share the same key, the dict can be replaced with a list of tuples.
You can also replace data= with json= to send a JSON body. Both variants are sketched after the example below.


payload = {'key1': 'value1', 'key2': 'value2'}
r = requests.post('http://httpbin.org/post', data=payload)
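Both variations mentioned above, sketched against httpbin:

# a list of tuples allows repeated keys, which a dict cannot express
payload_tuples = [('key1', 'value1'), ('key1', 'value2')]
r = requests.post('http://httpbin.org/post', data=payload_tuples)

# json= serializes the object and sets Content-Type: application/json
r = requests.post('http://httpbin.org/post', json={'key': 'value'})
print(r.json()['json'])  # httpbin echoes the parsed JSON body back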

POSTing files


url = 'http://httpbin.org/post'
files = {'file': open('report.xls', 'rb')}  # open the file in binary mode
r = requests.post(url, files=files)
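The value can also be a tuple that sets the filename and content type explicitly:

files = {'file': ('report.xls', open('report.xls', 'rb'), 'application/vnd.ms-excel')}
r = requests.post(url, files=files)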

Accessing cookies

r.cookies['cookie_name']

Sending cookies to the server

cookies = dict(cookies_are='working')
r = requests.get(url, cookies=cookies)

Example:


# Cookies are returned as a RequestsCookieJar, which behaves like a dict and works across domains and paths.
jar = requests.cookies.RequestsCookieJar()
jar.set('tasty_cookie', 'yum', domain='httpbin.org', path='/cookies')
jar.set('gross_cookie', 'blech', domain='httpbin.org', path='/elsewhere')
url = 'http://httpbin.org/cookies'
r = requests.get(url, cookies=jar)
r.text  # the body echoes only tasty_cookie, since its path matches /cookies

Sessions

When get() is called, a Request object is built first and the request is sent; what comes back is a Response object containing everything the server returned.
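This two-step model can be made explicit with prepared requests; a minimal sketch:

import requests

s = requests.Session()
req = requests.Request('GET', 'http://httpbin.org/get')  # build the Request object
prepared = s.prepare_request(req)  # merge in the session's defaults
r = s.send(prepared)  # send it and get the Response back
print(r.status_code)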

Example:


# a session can also provide default data for the request methods
s = requests.Session()
s.auth = ('user', 'pass')
s.headers.update({'x-test': 'true'})
# both x-test and x-test2 are sent
r = s.get('http://httpbin.org/headers', headers={'x-test2': 'true'})
print(r.text)
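A Session also persists cookies across requests, which combines with the cookie handling above (a sketch against httpbin):

s = requests.Session()
s.get('http://httpbin.org/cookies/set/sessioncookie/123456789')  # the server sets a cookie
r = s.get('http://httpbin.org/cookies')  # the session sends it back automatically
print(r.text)  # {"cookies": {"sessioncookie": "123456789"}}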

SSL certificate verification

If verify is set to False, SSL certificate verification is skipped entirely.

requests.get('https://github.com', verify=True)

SSL verification is enabled by default; if verification fails, an SSLError is raised.
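A sketch of turning verification off (only for testing; urllib3 will emit an InsecureRequestWarning):

requests.get('https://github.com', verify=False)  # skips certificate verification
# verify can instead point at a CA bundle file:
# requests.get('https://github.com', verify='/path/to/ca_bundle.crt')  # hypothetical path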

Proxies

Crawling through a proxy


import requests

proxies = {"http": "http://171.38.24.164:8132"}  # include the scheme in the proxy URL
r = requests.get("http://ip.chinaz.com/getip.aspx", proxies=proxies)
print(r.text)
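Proxies are keyed by URL scheme, and credentials can be embedded in the proxy URL; a sketch with hypothetical proxy hosts:

proxies = {
    'http': 'http://10.10.1.10:3128',   # hypothetical hosts
    'https': 'http://10.10.1.10:1080',
    # with HTTP Basic auth: 'http://user:pass@10.10.1.10:3128'
}
r = requests.get('http://httpbin.org/ip', proxies=proxies)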

Fetching proxy IPs

import requests
from bs4 import BeautifulSoup

User_Agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0'
header = {}
header['User-Agent'] = User_Agent

url = 'http://www.xicidaili.com/nn/1'
r = requests.get(url, headers=header)
res = r.text

soup = BeautifulSoup(res, "html.parser")
ips = soup.findAll('tr')
f = open("./ip_proy/ip", "w")

for x in range(1, len(ips)):
    ip = ips[x]  # one table row
    tds = ip.findAll("td")
    line = "%s:%s;%s" % (tds[1].contents[0], tds[2].contents[0], tds[5].contents[0]) + "\n"
    f.write(line)
    print(line)

f.close()  # close so the file is flushed before the next script reads it

Verifying that the proxies can connect


# Unicode gbk
import socket
import re
import sys

import requests

# socket.setdefaulttimeout(3)  # global socket timeout

# f2 = open("./ip_proy/run_ip", "w")
f = open("./ip_proy/ip")
lines = f.readlines()
proxys = []

for i in range(0, len(lines)):
    ip = lines[i].strip("\n").split("\t")
    ip_line = re.split(r"[.:;]", ip[0])  # split the "a.b.c.d:port;type" line on '.', ':' and ';'
    proxy_host = "http://" + ip_line[0] + "." + ip_line[1] + "." + ip_line[2] + "." + ip_line[3] + ":" + ip_line[4]
    proxy_temp = {"http": proxy_host}
    proxys.append(proxy_temp)

url = "http://ip.chinaz.com/getip.aspx"
for proxy in proxys:
    try:
        res = requests.get(url, proxies=proxy, timeout=30)
        res.encoding = res.apparent_encoding
        res = res.text
        print(proxy)
        print(res + "\n")
        '''# had a bug
        for values in proxy.values():
            print(values)
            f2.write(str(values) + "\n")
        '''
    except requests.RequestException:
        print(proxy)
        print("times out\n")

f.close()
# f2.close()
sys.exit()
