import urllib.request
import urllib.parse  # parser used to URL-encode key/value form data

# --- 1. GET request ---
# urlopen returns a response object wrapping the page.
response = urllib.request.urlopen("http://www.baidu.com")
# decode('utf-8') decodes the raw bytes so Chinese text is not garbled;
# prints the page source.
print(response.read().decode('utf-8'))

# --- POST request (simulating a login form: username/password);
# httpbin.org echoes the request back for testing ---
# urlencode builds the form body "hello=world"; bytes(..., encoding="utf-8")
# packs the key/value pairs into the binary body urlopen requires for POST.
data = bytes(urllib.parse.urlencode({"hello": "world"}), encoding="utf-8")
response = urllib.request.urlopen("http://httpbin.org/post", data=data)
print(response.read().decode('utf-8'))
# --- 2. Timeouts ---
import urllib.error  # explicit import; was relied on implicitly before

try:
    # Give up if the server has not responded within 0.01 seconds.
    response = urllib.request.urlopen("http://httpbin.org/post", timeout=0.01)
    print(response.read().decode('utf-8'))
except urllib.error.URLError as e:
    # NOTE(review): some timeout paths raise socket.timeout/TimeoutError
    # directly rather than URLError — confirm against the target Python version.
    print("time out!")
# --- 3. Request headers (masquerade as a real browser) ---
url = "https://httpbin.org/post"
# Fill in a real browser's User-Agent string (see the note below on where
# to find it in the browser's network panel).
headers = {"User-Agent": "……"}
data = bytes(urllib.parse.urlencode({"hello": "world"}), encoding="utf-8")
# Wrap everything in a Request object so we look like a genuine browser.
# Bug fix: HTTP method names are case-sensitive and Request sends the
# string verbatim, so this must be 'POST', not 'post'.
req = urllib.request.Request(url=url, data=data, headers=headers, method='POST')
response = urllib.request.urlopen(req)
print(response.read().decode("utf-8"))
找User-Agent的方法(即找headers的键值对):在浏览器开发者工具的“网络(Network)”面板中,点开任意一个请求,在其请求头(Request Headers)里即可找到 User-Agent。
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-yaNApJ6z-1644636635823)(C:\Users\荔枝\AppData\Roaming\Typora\typora-user-images\image-20220204161745986.png)]
# --- 4. Fetching the data ---

def getData(baseurl):
    """Crawl 10 listing pages starting at *baseurl* and return their HTML.

    Each page offset is i*25 (25 items per page), appended to *baseurl*.
    Returns a list of page-source strings, one per page.
    """
    datalist = []                      # fix: was misspelled 'dataist' (NameError)
    for i in range(0, 10):             # call the page-fetch helper 10 times
        url = baseurl + str(i * 25)
        html = askURL(url)             # fetch one page's source
        datalist.append(html)          # fix: fetched source was never saved
    return datalist


def askURL(url):
    """Fetch *url* and return its decoded page source ('' on failure).

    Sends a browser-like User-Agent so the server does not reject the
    request as coming from a bot. On URLError the HTTP status code and/or
    failure reason are printed and an empty string is returned.
    """
    # Headers used to masquerade as a real browser.
    head = {"User-Agent": "……"}
    request = urllib.request.Request(url, headers=head)
    html = ""                          # fix: avoid UnboundLocalError when urlopen fails
    try:
        response = urllib.request.urlopen(request)   # fetch the whole page
        html = response.read().decode("utf-8")       # decode the page source
    except urllib.error.URLError as e:               # capture the failure
        if hasattr(e, "code"):
            print(e.code)              # HTTP status code of the failure
        if hasattr(e, "reason"):
            print(e.reason)            # why the request did not succeed
    return html