PyCurl使用方法

主要介绍怎么使用pycurl的基本使用方法，参考官网的实例代码例举！！！

基本使用方法

c = pycurl.Curl()    #创建一个curl对象
c.setopt(pycurl.CONNECTTIMEOUT, 5)    #连接的等待时间，设置为0则不等待
c.setopt(pycurl.TIMEOUT, 5)           #请求超时时间
c.setopt(pycurl.NOPROGRESS, 0)        #是否屏蔽下载进度条，非0则屏蔽
c.setopt(pycurl.MAXREDIRS, 5)         #指定HTTP重定向的最大数
c.setopt(pycurl.FORBID_REUSE, 1)      #完成交互后强制断开连接，不重用
c.setopt(pycurl.FRESH_CONNECT,1)      #强制获取新的连接，即替代缓存中的连接
c.setopt(pycurl.DNS_CACHE_TIMEOUT,60) #设置保存DNS信息的时间，默认为120秒
c.setopt(pycurl.URL,"http://www.baidu.com")      #指定请求的URL
c.setopt(pycurl.USERAGENT,"Mozilla/5.2 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50324)")    #配置请求HTTP头的User-Agent
c.setopt(pycurl.HEADERFUNCTION, getheader)   #将返回的HTTP HEADER定向到回调函数getheader
c.setopt(pycurl.WRITEFUNCTION, getbody)      #将返回的内容定向到回调函数getbody
c.setopt(pycurl.WRITEHEADER, fileobj)        #将返回的HTTP HEADER定向到fileobj文件对象
c.setopt(pycurl.WRITEDATA, fileobj)          #将返回的HTML内容定向到fileobj文件对象
c.getinfo(pycurl.HTTP_CODE)         #返回的HTTP状态码
c.getinfo(pycurl.TOTAL_TIME)        #传输结束所消耗的总时间
c.getinfo(pycurl.NAMELOOKUP_TIME)   #DNS解析所消耗的时间
c.getinfo(pycurl.CONNECT_TIME)      #建立连接所消耗的时间
c.getinfo(pycurl.PRETRANSFER_TIME)  #从建立连接到准备传输所消耗的时间
c.getinfo(pycurl.STARTTRANSFER_TIME)    #从建立连接到传输开始消耗的时间
c.getinfo(pycurl.REDIRECT_TIME)     #重定向所消耗的时间
c.getinfo(pycurl.SIZE_UPLOAD)       #上传数据包大小
c.getinfo(pycurl.SIZE_DOWNLOAD)     #下载数据包大小
c.getinfo(pycurl.SPEED_DOWNLOAD)    #平均下载速度
c.getinfo(pycurl.SPEED_UPLOAD)      #平均上传速度
c.getinfo(pycurl.HEADER_SIZE)       #HTTP头部大小

python2

import pycurl
from StringIO import StringIO
buffer = StringIO()
c = pycurl.Curl()
c.setopt(c.URL, 'http://www.pythontab.com/')
c.setopt(c.WRITEDATA, buffer)
c.perform()
c.close()
body = buffer.getvalue()
print(body)

python3

import pycurl
from io import BytesIO
buffer = BytesIO()
c = pycurl.Curl()
c.setopt(c.URL, 'http://www.pythontab.com/')
c.setopt(c.WRITEDATA, buffer)
c.perform()
c.close()
body = buffer.getvalue()
print(body.decode('iso-8859-1'))

PycURL 自动处理cookie

import pycurl
import StringIO

url = "http://www.google.com/"
crl = pycurl.Curl()
crl.setopt(pycurl.VERBOSE,1)
crl.setopt(pycurl.FOLLOWLOCATION, 1)
crl.setopt(pycurl.MAXREDIRS, 5)
crl.fp = StringIO.StringIO()
crl.setopt(pycurl.URL, url)
crl.setopt(crl.WRITEFUNCTION, crl.fp.write)

# Option -b/--cookie <name=string/file> Cookie string or file to read cookies from
# Note: must be a string, not a file object.
crl.setopt(pycurl.COOKIEFILE, "cookie_file_name")

# Option -c/--cookie-jar <file> Write cookies to this file after operation
# Note: must be a string, not a file object.
crl.setopt(pycurl.COOKIEJAR, "cookie_file_name")

crl.perform()
print crl.fp.getvalue()

PycURL 实现POST方法

import pycurl
import StringIO
import urllib

url = "http://www.google.com/"
post_data_dic = {"name":"value"}
crl = pycurl.Curl()
crl.setopt(pycurl.VERBOSE,1)
crl.setopt(pycurl.FOLLOWLOCATION, 1)
crl.setopt(pycurl.MAXREDIRS, 5)
#crl.setopt(pycurl.AUTOREFERER,1)

crl.setopt(pycurl.CONNECTTIMEOUT, 60)
crl.setopt(pycurl.TIMEOUT, 300)
#crl.setopt(pycurl.PROXY,proxy)
crl.setopt(pycurl.HTTPPROXYTUNNEL,1)
#crl.setopt(pycurl.NOSIGNAL, 1)
crl.fp = StringIO.StringIO()
crl.setopt(pycurl.USERAGENT, "dhgu hoho")

# Option -d/--data <data>   HTTP POST data
crl.setopt(crl.POSTFIELDS,  urllib.urlencode(post_data_dic))

crl.setopt(pycurl.URL, url)
crl.setopt(crl.WRITEFUNCTION, crl.fp.write)
crl.perform()

print crl.fp.getvalue()

urllib 超时设置

import socket
socket.setdefaulttimeout(5.0)

根据服务器编码进行解码

import pycurl
import re
try:
    from io import BytesIO
except ImportError:
    from StringIO import StringIO as BytesIO

headers = {}
def header_function(header_line):
    # HTTP standard specifies that headers are encoded in iso-8859-1.
    # On Python 2, decoding step can be skipped.
    # On Python 3, decoding step is required.
    header_line = header_line.decode('iso-8859-1')

    # Header lines include the first status line (HTTP/1.x ...).
    # We are going to ignore all lines that don't have a colon in them.
    # This will botch headers that are split on multiple lines...
    if ':' not in header_line:
        return

    # Break the header line into header name and value.
    name, value = header_line.split(':', 1)

    # Remove whitespace that may be present.
    # Header lines include the trailing newline, and there may be whitespace
    # around the colon.
    name = name.strip()
    value = value.strip()

    # Header names are case insensitive.
    # Lowercase name here.
    name = name.lower()

    # Now we can actually record the header name and value.
    # Note: this only works when headers are not duplicated, see below.
    headers[name] = value

buffer = BytesIO()
c = pycurl.Curl()
c.setopt(c.URL, 'http://pycurl.io')
c.setopt(c.WRITEFUNCTION, buffer.write)
# Set our header function.
c.setopt(c.HEADERFUNCTION, header_function)
c.perform()
c.close()

# Figure out what encoding was sent with the response, if any.
# Check against lowercased header name.
encoding = None
if 'content-type' in headers:
    content_type = headers['content-type'].lower()
    match = re.search('charset=(\S+)', content_type)
    if match:
        encoding = match.group(1)
        print('Decoding using %s' % encoding)
if encoding is None:
    # Default encoding for HTML is iso-8859-1.
    # Other content types may have different default encoding,
    # or in case of binary data, may have no encoding at all.
    encoding = 'iso-8859-1'
    print('Assuming encoding is %s' % encoding)

body = buffer.getvalue()
# Decode using the encoding we figured out.
print(body.decode(encoding))

写入一个文件

import pycurl

# 只要文件以二进制模式打开, python 2 和 python 3
# 可以在不解码的情况下编写响应正文。
with open('out.html', 'wb') as f:
    c = pycurl.Curl()
    c.setopt(c.URL, 'http://pycurl.io/')
    c.setopt(c.WRITEDATA, f)
    c.perform()
    c.close()

设置重定向

import pycurl

c = pycurl.Curl()
# Redirects to https://www.python.org/.
c.setopt(c.URL, 'http://www.python.org/')
# Follow redirect.
c.setopt(c.FOLLOWLOCATION, True)
c.perform()
c.close()

检查响应

import pycurl
try:
    from io import BytesIO
except ImportError:
    from StringIO import StringIO as BytesIO

buffer = BytesIO()
c = pycurl.Curl()
c.setopt(c.URL, 'http://pycurl.io/')
c.setopt(c.WRITEDATA, buffer)
c.perform()

# HTTP response code, e.g. 200.
print('Status: %d' % c.getinfo(c.RESPONSE_CODE))
# Elapsed time for the transfer.
print('Status: %f' % c.getinfo(c.TOTAL_TIME))

# getinfo must be called before close.
c.close()

post

# 若要发送表单数据, 请使用POSTFIELDS选项。表单数据必须预先进行 URL 编码
import pycurl
try:
    # python 3
    from urllib.parse import urlencode
except ImportError:
    # python 2
    from urllib import urlencode

c = pycurl.Curl()
c.setopt(c.URL, 'https://httpbin.org/post')

post_data = {'field': 'value'}
# Form data must be provided already urlencoded.
postfields = urlencode(post_data)
# Sets request method to POST,
# Content-Type header to application/x-www-form-urlencoded
# and data to send in request body.
c.setopt(c.POSTFIELDS, postfields)

c.perform()
c.close()

文件上传

# 若要上载文件, 请使用HTTPPOST选项。若要上载物理文件, 请使用FORM_FILE
import pycurl

c = pycurl.Curl()
c.setopt(c.URL, 'https://httpbin.org/post')

c.setopt(c.HTTPPOST, [
    ('fileupload', (
        # upload the contents of this file
        c.FORM_FILE, __file__,
    )),
])

c.perform()
c.close()

# 如果文件数据在内存中, 请使用BUFFER/BUFFERPTR
import pycurl

c = pycurl.Curl()
c.setopt(c.URL, 'https://httpbin.org/post')

c.setopt(c.HTTPPOST, [
    ('fileupload', (
        c.FORM_BUFFER, 'readme.txt',
        c.FORM_BUFFERPTR, 'This is a fancy readme file',
    )),
])

c.perform()
c.close()

爬取 DDos 保护的站点

准备抓取一个网站，但是发现启用DDOS保护，感觉有点麻烦了。打开浏览器F12，发现要求你等待几秒钟检测浏览器，然后通过302重定向到正确的网页。源码查看打开 html 源码，可以看到主要是下面的这个代码：还有在 JS 中使用到的Element: 源码分享上面代码的作用，主要是原来判断访问站点的客户端是否是一个真实的浏览器。访问网站主页的时候，首先得到的是一个 503 错误，显示上面的页面内容。然后等待 5 秒，让用户看清楚提示信息，然后再根据得到的随机值东拼西凑计算出另一个值，写入 jschl-answer 中，最后通过 challenge-form 这个表单提交。提交成功之后， /cdn-cgi/l/chk_jschl 会使用 302 重定向返回真实的网页内容。cgksoUW 是一个随机的名称，pass 那个 input 中包含的值也是一个变化的值。每次进入这个页面，这两个值都会变化。那么，pass 的值在提交的时候到底是如何计算出来的？从 f = document.getElementById('challenge-form'); 下面的那段代码可以看出，这个计算无非是对 cgksoUW.bRM 进行一些加乘运算。然后再转成一个 Int 进行提交。关键在于那些看起来莫名其妙的加号、括号和方括号的集合的含义为何？这是一个简单的障眼法。让我们看看 cgksoUW.bRM 的初始值，这个表达式的结果等于数字 12 我们把上面的代码拆成几个子表达式，就好理解了。首先是第一个嵌套括号中比如这样的： (+!![]+[])，打开浏览器的 console ,在其中输入 +!![] 回车，可以得到数字 1 。先不管哪个加号，单独看 !![] 的值，它是布尔值 true 。这是因为 [] 返回的是一个有效的 Array，代表 true ，两次取反，依然是 true 。而左边的那个加号由于没有提供左操作数，因此它的含义是正数，这里做了一次数字转换，自动把 true 转换成了 1 。接着往后算，后面是个加号，然后是个 [] 。空的数组，默认是作为字符串处理的。[].toString() 的值为空字符串。前面的 1 加上后面的空字符串，得到一个字符串 "1" 。知道了原理...

阅读全文

爬虫知识分享

搜索此博客

PyCurl使用方法

评论

发表评论

此博客中的热门博文

QtWebkit开发爬虫

爬取 DDos 保护的站点