python -- crawler -- fetching, parsing, and storing web page content -- the 薄荷网 example (3)
How to use urllib2
1. The simplest case: urlopen
#coding:utf-8
import urllib, urllib2
# First half of the URL (note: http, not https)
url_pre = 'http://www.baidu.com/s'
# GET parameters (encode unicode values to byte strings before urlencode)
params = {}
params['wd'] = u'测试'.encode('utf-8')
url_params = urllib.urlencode(params)
# Full URL for the GET request
url = '%s?%s' % (url_pre, url_params)
# Open the URL and get the response
response = urllib2.urlopen(url)
# Read the HTML from the response
html = response.read()
# Save the HTML to a file
with open('test.txt', 'w') as f:
    f.write(html)
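In real crawling, urlopen can fail on network errors or non-2xx statuses, so it is worth wrapping it in error handling. A minimal sketch, not from the original post; the URL and timeout are example values:
#coding:utf-8
import urllib2
try:
    # timeout is in seconds and is supported from Python 2.6 onward
    response = urllib2.urlopen('http://www.baidu.com', timeout=10)
    print(response.getcode())  # HTTP status code, e.g. 200
    print(response.geturl())   # final URL after any redirects
except urllib2.HTTPError as e:
    print('server returned an error status: %d' % e.code)
except urllib2.URLError as e:
    print('failed to reach the server: %s' % e.reason)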
2. Using Request
#coding:utf-8
import urllib, urllib2
# First half of the URL (note: http, not https)
url_pre = 'http://www.baidu.com/s'
# GET parameters
params = {}
params['wd'] = u'测试'.encode('utf-8')
url_params = urllib.urlencode(params)
# Full URL for the GET request
url = '%s?%s' % (url_pre, url_params)
# Build a Request object and open it to get the response
request = urllib2.Request(url)
response = urllib2.urlopen(request)
# Read the HTML from the response
html = response.read()
with open('test.txt', 'w') as f:
    f.write(html)
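A Request object also lets you attach headers after construction, which is handy when the request is built in one place and decorated elsewhere. A small sketch (the header value is just an example):
#coding:utf-8
import urllib2
request = urllib2.Request('http://www.baidu.com')
# Add a header to an already-built Request
request.add_header('User-Agent', 'Mozilla/5.0')
response = urllib2.urlopen(request)
print(response.info())  # the response headers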
3. POST requests
#coding:utf-8
import urllib, urllib2
# Build the form data; it is encoded exactly like GET parameters
values = {}
values['username'] = "aaaaaa"
values['password'] = "bbbbbb"
data = urllib.urlencode(values)
# Build the request; passing data as the second argument makes it a POST
url = "http://xxxxxxxxxxx"
request = urllib2.Request(url, data)
# Get the response
response = urllib2.urlopen(request)
html = response.read()
print(html)
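Note that urllib2 chooses the HTTP verb purely from whether data is passed; Request.get_method() lets you confirm this. A quick sketch (the URL is a placeholder):
#coding:utf-8
import urllib, urllib2
data = urllib.urlencode({'k': 'v'})
# get_method() reports which verb the Request will use
print(urllib2.Request('http://example.com').get_method())        # GET
print(urllib2.Request('http://example.com', data).get_method())  # POST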
4. Handling cookies
#coding:utf-8
import urllib2
import cookielib
# Create a cookie jar and a handler that stores cookies in it
cookie = cookielib.CookieJar()
handler = urllib2.HTTPCookieProcessor(cookie)
# Build a custom opener from the handler
opener = urllib2.build_opener(handler)
# opener.open works just like urllib2.urlopen
request = urllib2.Request('http://www.baidu.com')
response = opener.open(request)
for item in cookie:
    print('%s = %s' % (item.name, item.value))
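If the cookies should survive between runs, cookielib can also persist them to disk. A minimal sketch, assuming 'cookies.txt' as the file name:
#coding:utf-8
import urllib2
import cookielib
# Save cookies to a Mozilla-format file after the request
cookie = cookielib.MozillaCookieJar('cookies.txt')
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
opener.open('http://www.baidu.com')
cookie.save(ignore_discard=True, ignore_expires=True)
# On a later run: cookie.load('cookies.txt', ignore_discard=True, ignore_expires=True)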
5. Setting headers to get past anti-crawler checks
#coding:utf-8
import urllib, urllib2
# Set a User-Agent header so the request looks like a normal browser
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}
# Build the Request; the second argument is the POST data (None here)
url = 'http://www.server.com/login'
request = urllib2.Request(url, None, headers)
# Open the request and read the response
response = urllib2.urlopen(request)
html = response.read()
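To avoid repeating the header on every request, you can install a global opener that carries a default User-Agent. A sketch (the header value is an example):
#coding:utf-8
import urllib2
opener = urllib2.build_opener()
# addheaders is a list of (name, value) pairs sent with every request
opener.addheaders = [('User-Agent', 'Mozilla/5.0')]
urllib2.install_opener(opener)
html = urllib2.urlopen('http://www.baidu.com').read()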
6. Reading a local file
import urllib2
# The file: scheme opens a local file, here a.txt relative to the working directory
f = urllib2.urlopen('file:./a.txt')
buf = f.read()
7. URLs containing Chinese characters
#coding:utf-8
import urllib2
# Encode the unicode URL to UTF-8 bytes before opening it
h4 = u'http://www.baidu.com?w=测试'
h4 = h4.encode('utf-8')
response = urllib2.urlopen(h4)
html = response.read()
It is best to convert the URL to the correct encoding first: if the unicode URL in this example is passed to urlopen without the encode step, the call fails with an encoding error.
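Strictly speaking, the non-ASCII bytes should also be percent-encoded so that the URL is legal. A sketch using urllib.quote, keeping the query name w from the example above:
#coding:utf-8
import urllib, urllib2
# Percent-encode the UTF-8 bytes of the query value
word = urllib.quote(u'测试'.encode('utf-8'))  # -> '%E6%B5%8B%E8%AF%95'
url = 'http://www.baidu.com?w=' + word
response = urllib2.urlopen(url)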
8. Per-protocol handlers
FTP
import urllib2
handler = urllib2.FTPHandler()
request = urllib2.Request(url='ftp://ftp.ubuntu.com/')
opener = urllib2.build_opener(handler)
f = opener.open(request)
print(f.read())
If a username and password are required:
urllib2.Request(url='ftp://username:password@ftp-address/')
HTTP
import urllib2
handler = urllib2.HTTPHandler()
request = urllib2.Request(url='http://ftp.ubuntu.com/')
opener = urllib2.build_opener(handler)
f = opener.open(request)
print(f.read())
9. Using a proxy
import urllib2
proxy_support = urllib2.ProxyHandler({"http": "http://proxy.****.com/"})
opener = urllib2.build_opener(proxy_support)
urllib2.install_opener(opener)
res = urllib2.urlopen('http://www.taobao.com/')
print(res.read())  # reads the entire HTML page
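install_opener makes the proxy apply to every subsequent urlopen call; to use it only for specific requests, call the opener directly instead. A sketch (the proxy address is a placeholder):
#coding:utf-8
import urllib2
# Use the proxy for this opener only; plain urlopen is unaffected
proxy = urllib2.ProxyHandler({'http': 'http://127.0.0.1:8080'})  # placeholder address
opener = urllib2.build_opener(proxy)
res = opener.open('http://www.taobao.com/')
print(res.read())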