Request methods:
r = requests.get('http://httpbin.org/get')        # GET
r = requests.post('http://httpbin.org/post')      # POST
r = requests.put('http://httpbin.org/put')        # PUT
r = requests.delete('http://httpbin.org/delete')  # DELETE
r = requests.head('http://httpbin.org/get')       # HEAD
r = requests.options('http://httpbin.org/get')    # OPTIONS
GET
Example:
import requests
r = requests.get(url='http://www.euraxluo.cn')  # the most basic GET request
print(r.status_code)  # built-in status-code attribute
# any status code other than 200 is treated as an error
Response status codes
Example: 404
r = requests.get('http://httpbin.org/status/404')
print(r.status_code)  # 404
r.raise_for_status()  # raises an HTTPError for 4xx/5xx responses (returns None on success, so there is nothing useful to assign)
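If you want to log the failure instead of letting it propagate, wrap the call in try/except. A minimal sketch, using httpbin's /status/404 endpoint as the test target:

import requests

try:
    r = requests.get('http://httpbin.org/status/404')
    r.raise_for_status()  # raises requests.HTTPError here
except requests.HTTPError as e:
    print('request failed:', e)  # e.g. "404 Client Error: NOT FOUND for url: ..."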
Requests with URL parameters:
# pass parameters to the URL
r = requests.get(url='http://dict.baidu.com/s', params={'wd': 'python'})  # GET request with parameters
# when you don't know the response's encoding
r.encoding = r.apparent_encoding  # guess the encoding from the content itself
print(r.text)  # the decoded body
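To confirm how the parameters were encoded, inspect r.url, the final URL that was actually requested:

r = requests.get('http://dict.baidu.com/s', params={'wd': 'python'})
print(r.url)  # http://dict.baidu.com/s?wd=python — the params dict is URL-encoded automatically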
Tips
For images and other binary payloads, r.content returns the raw bytes.
Example: r.content
r = requests.get(url='http://music.baidu.com')  # in testing, much the same as r.text here
html = r.content
# html_doc = str(html, 'utf-8')
html_doc = html.decode("utf-8", "ignore")  # decode manually, dropping undecodable bytes
print(html_doc)
Response content
Different ways to handle the body:
JSON: r.json() (see the sketch after this list)
Binary: typically used for images
from PIL import Image
from io import BytesIO
m = r.content               # the undecoded bytes of an image response
i = Image.open(BytesIO(m))  # Image.open needs a file-like object, hence BytesIO
text: decoded automatically; the most commonly used.
import requests
r = requests.get('https://euraxluo.cn')
r.text  # already decoded automatically
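The JSON sketch referenced above. httpbin echoes the request back as JSON; the 'args' field name is httpbin's, not a general requests convention:

import requests

r = requests.get('http://httpbin.org/get', params={'wd': 'python'})
data = r.json()      # parse the JSON body into a dict; raises ValueError if the body is not valid JSON
print(data['args'])  # {'wd': 'python'}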
Encoding issues:
1. get_encodings_from_content
if req.encoding == 'ISO-8859-1':
    encodings = requests.utils.get_encodings_from_content(req.text)
    if encodings:
        encoding = encodings[0]
    else:
        encoding = req.apparent_encoding
    # encode_content = req.content.decode(encoding, 'replace').encode('utf-8', 'replace')
    encode_content = req.content.decode(encoding, 'replace')  # with 'replace', illegal characters are replaced by ?
2. Defensive programming
def get_html(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # raise on 4xx/5xx
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        print("error")
Custom headers
Header values must be strings, bytestrings, or Unicode.
url = 'https://euraxluo.cn'
headers = {'user-agent': 'my-app/0.0.1'}  # custom User-Agent
r = requests.get(url, headers=headers)
POST
POST forms
When multiple values share the same key, replace the dict with a list of tuples;
you can also replace data= with json= to send a JSON body (see the sketch below).
payload = {'key1': 'value1', 'key2': 'value2'}
r = requests.post('http://httpbin.org/post', data=payload)
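A sketch of both variants against httpbin's /post echo endpoint:

import requests

# repeated key: a list of tuples keeps both values for 'key1'
payload_tuples = [('key1', 'value1'), ('key1', 'value2')]
r = requests.post('http://httpbin.org/post', data=payload_tuples)
print(r.json()['form'])  # {'key1': ['value1', 'value2']}

# json= serializes the dict and sets Content-Type: application/json
r = requests.post('http://httpbin.org/post', json={'key': 'value'})
print(r.json()['json'])  # {'key': 'value'}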
POST files
url = 'http://httpbin.org/post'
files = {'file': open('report.xls', 'rb')}  # open the file in binary mode
r=requests.post(url,files=files)
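You can also set the filename and content type explicitly by passing a tuple instead of a bare file object; the MIME type here is the standard one for .xls files and is only illustrative:

files = {'file': ('report.xls', open('report.xls', 'rb'), 'application/vnd.ms-excel')}
r = requests.post('http://httpbin.org/post', files=files)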
Cookie
Accessing cookies
r.cookies['cookie_name']  # read a cookie from the response
Sending cookies to the server
cookies = dict(cookies_are='working')
r = requests.get(url, cookies=cookies)
Example:
# cookies come back as a RequestsCookieJar, which behaves like a dict and is suited to use across domains and paths
jar=requests.cookies.RequestsCookieJar()
jar.set('tasty_cookie', 'yum', domain='httpbin.org', path='/cookies')
jar.set('gross_cookie', 'blech', domain='httpbin.org', path='/elsewhere')
url = 'http://httpbin.org/cookies'
r = requests.get(url, cookies=jar)
r.text  # echoes the cookies back; only tasty_cookie matches the /cookies request path
Sessions
When get() is called, requests first builds a Request object and sends it, then returns a Response object containing everything the server sent back.
Example:
# a session can also supply default data for its request methods
s = requests.Session()
s.auth = ('user', 'pass')
s.headers.update({'x-test': 'true'})
# both x-test and x-test2 are sent
r = s.get('http://httpbin.org/headers', headers={'x-test2': 'true'})
print(r.text)
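Sessions also persist cookies across requests, which is what makes them useful for logins. A minimal sketch using httpbin's cookie endpoints:

import requests

s = requests.Session()
s.get('http://httpbin.org/cookies/set/sessioncookie/123456')  # the server sets a cookie
r = s.get('http://httpbin.org/cookies')                       # the session sends it back automatically
print(r.text)  # {"cookies": {"sessioncookie": "123456"}}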
SSL certificate verification
SSL verification is on by default; if verification fails, an SSLError is raised.
Setting verify=False skips certificate verification entirely.
requests.get('https://github.com', verify=True)
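A sketch of both directions, assuming badssl.com's deliberately broken test hosts are reachable; note that verify=False triggers an InsecureRequestWarning:

import requests

try:
    requests.get('https://expired.badssl.com')  # expired certificate: verification fails
except requests.exceptions.SSLError as e:
    print('certificate verification failed:', e)

r = requests.get('https://expired.badssl.com', verify=False)  # skips verification (testing only)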
Proxies
Crawling through a proxy
import requests
proxies = {"http": "http://171.38.24.164:8132"}
r = requests.get("http://ip.chinaz.com/getip.aspx", proxies=proxies)
print(r.text)
Fetching proxy IPs
import requests
from bs4 import BeautifulSoup

User_Agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0'
header = {'User-Agent': User_Agent}
url = 'http://www.xicidaili.com/nn/1'
r = requests.get(url, headers=header)
soup = BeautifulSoup(r.text, "html.parser")
ips = soup.findAll('tr')
f = open("./ip_proxy/ip", "w")
for x in range(1, len(ips)):  # skip the table's header row
    tds = ips[x].findAll("td")  # one row of the proxy table
    # write one proxy per line as ip:port;protocol
    line = "%s:%s;%s" % (tds[1].contents[0], tds[2].contents[0], tds[5].contents[0]) + "\n"
    f.write(line)
    print(line)
f.close()
Verifying that the proxies work
# Unicode / GBK
import socket
import re
import sys
import requests
# socket.setdefaulttimeout(3)  # global timeout
# f2 = open("./ip_proxy/run_ip", "w")
f = open("./ip_proxy/ip")
lines = f.readlines()
proxys = []
for i in range(0, len(lines)):
    ip = lines[i].strip("\n").split("\t")
    ip_line = re.split(r"[.:;]", ip[0])  # split ip:port;protocol on . : ;
    proxy_host = "http://" + ip_line[0] + "." + ip_line[1] + "." + ip_line[2] + "." + ip_line[3] + ":" + ip_line[4]
    proxy_temp = {"http": proxy_host}
    proxys.append(proxy_temp)
url = "http://ip.chinaz.com/getip.aspx"
for proxy in proxys:
    try:
        res = requests.get(url, proxies=proxy, timeout=30)
        res.encoding = res.apparent_encoding
        print(proxy)
        print(res.text + "\n")
        '''  # has a bug
        for values in proxy.values():
            print(values)
            f2.write(str(values) + "\n")
        '''
    except Exception:
        print(proxy)
        print("timed out\n")
f.close()
# f2.close()
sys.exit()