A09.浏览器伪装.txt

UP 返回
# Fetch a page while masquerading as a regular desktop browser via a
# spoofed User-Agent header, then save the raw response bytes to disk.
import urllib.request
url="http://blog.csdn.net"
# Header format: ("User-Agent", user-agent value).
# In a browser: F12 -> Network -> click any request -> Headers to see it.
headers=("User-Agent","Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0")
opener=urllib.request.build_opener()
# BUG FIX: the headers tuple was defined but never attached to the opener,
# so every request went out with the default "Python-urllib/3.x" agent and
# the disguise had no effect. Attach it before opening the URL.
opener.addheaders=[headers]
data=opener.open(url).read()
# "wb": write the raw bytes; the context manager guarantees the file is
# closed even if the write raises (the original leaked the handle on error).
with open("D:\\ProjectCodes\\python\\test2.html","wb") as fh:
    fh.write(data)

# Crawl QQ news: scrape article links off the front page, then save each
# article's HTML locally (following one level of <frame> indirection).
import urllib.request,re
url="http://news.qq.com/"
data=urllib.request.urlopen(url).read().decode("UTF-8","ignore")
# Hand-written regex matching the article anchors on the listing page.
pat1='<a class="picture" href="(.*?)" target="_blank">'
alllink=re.compile(pat1).findall(data)
for i,thislink in enumerate(alllink):
    # BUG FIX: the original called ".read.decode(...)" — read() was never
    # invoked (missing parentheses), so this line raised AttributeError on
    # the bound-method object instead of decoding the page.
    thispage=urllib.request.urlopen(thislink).read().decode("gb2312","ignore")
    pat2="<frame src=(.*?)>"
    isframe=re.compile(pat2).findall(thispage)
    if(len(isframe)==0):  # no frameset: fetch the article URL directly
        urllib.request.urlretrieve(thislink,"D:\\ProjectCodes\\python\\data\\"+str(i)+".html")
    else:    # frameset page: fetch the real URL referenced by the frame
        flink=isframe[0]
        urllib.request.urlretrieve(flink,"D:\\ProjectCodes\\python\\data\\"+str(i)+".html")

# Crawl the CSDN blog front page: collect every article link matched by the
# pattern below and download each one to a numbered local HTML file,
# sending a browser-like User-Agent so the site serves normal pages.
import urllib.request,re
url="https://blog.csdn.net/"
headers=("User-Agent","Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0")
opener=urllib.request.build_opener()
opener.addheaders=[headers]
# Install the opener globally so urlopen/urlretrieve below also carry the header.
urllib.request.install_opener(opener)
data=urllib.request.urlopen(url).read().decode("utf-8","ignore")
pat='<h3 class="tracking-ad" data-mod="popu_254"><a href="(.*?)"'
alllink=re.compile(pat).findall(data)
for i,thislink in enumerate(alllink):
    localpath="D:\\ProjectCodes\\python\\data\\"+str(i)+".html"
    urllib.request.urlretrieve(thislink,filename=localpath)
    print("当前文章"+str(i)+"爬取成功")

# Crawl Qiushibaike: walk 35 listing pages and print the text of every
# joke (<span> inside each content div), using a disguised User-Agent.
import urllib.request,re
# NOTE(review): this "url" is a leftover from the CSDN section above and is
# never used below (the loop builds "thisurl"); kept to avoid changing any
# name a later part of the file might read.
url="https://blog.csdn.net/"
headers=("User-Agent","Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0")
opener=urllib.request.build_opener()
opener.addheaders=[headers]
# Install globally so plain urlopen calls below carry the spoofed header.
urllib.request.install_opener(opener)
for i in range(0,35):
    thisurl="http://www.qiushibaike.com/8hr/page/"+str(i+1)+"/?s=4948859"
    # BUG FIX: the original fetched the page but discarded the result, so
    # the regex below ran against the stale "data" bound by the previous
    # section. Bind the decoded response to data.
    data=urllib.request.urlopen(thisurl).read().decode("utf-8","ignore")
    # re.S lets ".*?" cross newlines inside the content div.
    pat='<div class="content">.*?<span>(.*?)</span>.*?</div>'
    rst=re.compile(pat,re.S).findall(data)
    for item in rst:
        print(item)
        print("------")


DOWN 返回