A08.简单爬虫.txt

UP 返回
import urllib.request
# Fetch the CSDN home page; `data` holds the raw page source as bytes.
# The `with` block closes the HTTP response as soon as the body has been read.
with urllib.request.urlopen("https://www.csdn.net/") as resp:
    data = resp.read()

# Scrape a QQ number from a CSDN course page
import urllib.request
import re
# Close the HTTP response deterministically once the body has been read.
with urllib.request.urlopen("https://edu.csdn.net/huiyiCourse/detail/253") as resp:
    data = resp.read().decode("utf-8")
# Raw string so that "\d" reaches the regex engine instead of being treated as
# an (invalid) string escape; "+" rather than "*?" so that an empty <p></p>
# is never reported as a QQ number.
pat = r"<p>(\d+)</p>"
rst = re.compile(pat).findall(data)
# The page layout may change, so guard against an empty result instead of
# letting rst[0] raise IndexError.
if rst:
    print(rst[0])
else:
    print("no match found")


# Scrape publisher names from Douban Read and write them to a file
import urllib.request
import re
# Close the HTTP response deterministically once the body has been read.
with urllib.request.urlopen("https://read.douban.com/provider/all") as resp:
    data = resp.read().decode("utf-8")
# [tip] The pattern itself contains double quotes, so it is delimited with
#       single quotes (the alternative is to escape the inner quotes).
# [tip] findall() returns only what the (.*?) group captured; without the
#       parentheses it would return the whole <div class="name">XXX</div> match.
pat = '<div class="name">(.*?)</div>'
rst = re.compile(pat).findall(data)
# `with` guarantees the file is closed even if a write fails; the explicit
# UTF-8 encoding avoids depending on the platform's locale codec.
with open("D:\\ProjectCodes\\python\\douban_publishers_test.txt", "w", encoding="utf-8") as fh:
    # One publisher name per line.
    fh.writelines(i + "\n" for i in rst)

#urllib讲解
【#1】urlretrieve	(网址,本地文件存储地址)	直接下载网页到本地	
import urllib.request
# Download the page at `src` straight into the local file `dst`.
src = "https://read.douban.com/provider/all"
dst = "D:\\ProjectCodes\\python\\oo1.html"
urllib.request.urlretrieve(src, dst)
【#2】urlcleanup	清除urlretrieve()下载时遗留的临时文件(缓存)
# Remove temporary files that earlier urlretrieve() calls may have left behind.
urllib.request.urlcleanup()
【#3】info	获取页面简介信息
# Open the page and print the headers returned by the response's info().
# The response is stored in `file`; the getcode()/geturl() snippets that
# follow reuse it.
url = "https://edu.csdn.net/huiyiCourse/detail/253"
file = urllib.request.urlopen(url)
headers = file.info()
print(headers)
【#4】getcode	获取响应的HTTP状态码(用于判断请求成功或失败，可用来检测网站是否可访问)
# Print the HTTP status code of the response held in `file`.
print(file.getcode())		# 200 means the request succeeded
【#5】geturl		获取当前访问的页面的url
# Print the URL of the resource that was actually retrieved for `file`.
print(file.geturl())

【#】超时设置
import urllib.request
# Request the same page 100 times with a 1-second timeout and report the
# body length, or the error, for each attempt.
for i in range(100):
    # The request itself must sit inside the try block: a timeout is raised
    # by urlopen() (or by read()), so opening the URL outside the try would
    # abort the whole loop on the first timeout instead of reporting it.
    try:
        # Give up on the page after 1 second; `with` closes the response.
        with urllib.request.urlopen("https://edu.csdn.net/huiyiCourse/detail/253", timeout=1) as file:
            print(len(file.read()))
    except Exception as err:
        print("出现异常"+str(err))

【#】get请求实现百度自动搜索
import urllib.request
import urllib.parse
import re
keywd = "好的"
# Non-ASCII keywords must be percent-encoded before being placed in the URL.
# quote() is documented in urllib.parse; urllib.request only happens to
# re-export it.
url = "https://www.baidu.com/s?wd=" + urllib.parse.quote(keywd)
# Close the HTTP response deterministically once the body has been read.
with urllib.request.urlopen(url) as resp:
    data = resp.read().decode("utf-8")
print(data)
# Baidu appears to apply anti-scraping measures, so nothing useful comes back
# here; this pattern is only a placeholder match.
pat = "<noscript>(.*?)</noscript>"
rst = re.compile(pat).findall(data)
print(rst)

【#】post请求
import urllib.request
import urllib.parse
# ---- Build the POST form ----
# Request() needs an absolute URL including the scheme; a bare
# "www.baidu.com" raises "ValueError: unknown url type".
posturl = "https://www.baidu.com"
# urlencode() produces "name=...&pass=..."; the request body must be bytes.
postdata = urllib.parse.urlencode({
    "name": "asassa",
    "pass": "safdsdfs",
}).encode("utf-8")
# ---- Send the POST: Request(url, data) with a non-None data issues a POST ----
req = urllib.request.Request(posturl, postdata)
# Close the HTTP response deterministically once the body has been read.
with urllib.request.urlopen(req) as resp:
    rst = resp.read().decode("utf-8")

【#】常见网页状态码
301	Moved Permanently		重定向到新的url，永久性
302	Found		重定向到临时的url，非永久性
304	Not Modified		请求的资源未更新
400	Bad Request		非法请求
401	Unauthorized		请求未经授权
403	Forbidden		禁止访问
404	Not Found	没有找到对应的页面
500	Internal Server Error		服务器内部出现错误
501	Not Implemented		服务器不支持实现请求所需要的功能

【#】URLError与HTTPError
两者都是异常类，HTTPError是URLError的子类。HTTPError同时有异常状态码(code)与异常原因(reason)，URLError只有异常原因(reason)，没有状态码。
所以处理的时候不能把URLError直接当作HTTPError来用。如果只捕获URLError，读取状态码之前必须先判断异常对象是否有code属性。
(1) URLError出现的原因
1.连不上服务器
2.远程url不存在
3.无网络
4.触发HTTPError

import urllib.request
import urllib.error		# exception classes live here
# Catch URLError only (HTTPError is its subclass) and print whichever of
# `code` / `reason` the caught exception actually carries.
try:
    file = urllib.request.urlopen("https://www.mzitu.com/172876/2")
except urllib.error.URLError as e:
    # Check for each attribute before reading it: only HTTPError has `code`.
    for attr in ("code", "reason"):
        if hasattr(e, attr):
            print(getattr(e, attr))
else:
    print("success")
DOWN 返回