python3爬虫实践(正则+xpath 站长素材免费简历模板爬取)

时间:2020-9-7 作者:admin


import requests
from lxml import etree
import re
import os


def url_values():
    """Parse the listing page (module-global ``response``) and append each
    resume template's detail-page URL to the module-global ``jump_url`` list.
    """
    tree = etree.HTML(response)
    # Each <div> under #container is one template card; its first <a> points
    # to the template's detail page. (Removed the redundant double parens
    # the original wrapped around the xpath expression.)
    cards = tree.xpath("//div[@class='bggray clearfix pt20']/div[3]//div[@id='container']/div")
    for card in cards:
        hrefs = card.xpath('./a/@href')
        # Guard the [0] index: a card without a link no longer raises
        # IndexError, it is simply skipped.
        if hrefs:
            jump_url.append(hrefs[0])


def Download_page():
    """Visit every detail page in ``jump_url``; append the first ``.rar``
    download link to ``downlaod_url`` and the matching output file name
    (template title + ``.rar``) to ``file_name``.

    The two lists are kept index-aligned: a page is skipped entirely if
    either the link or the title cannot be found.
    """
    # Raw string with an escaped dot: the original pattern "http://.*?.rar"
    # let the '.' before 'rar' match ANY character (e.g. "...xrar").
    rar_pattern = re.compile(r"http://.*?\.rar")
    for detail_url in jump_url:
        page = requests.get(url=detail_url, headers=headers).text
        matches = rar_pattern.findall(page)
        tree = etree.HTML(page)
        titles = tree.xpath('//div[@class="ppt_tit clearfix"]/h1/text()')
        if not matches or not titles:
            # Skip incomplete pages instead of raising IndexError, so
            # downlaod_url and file_name never fall out of step.
            continue
        downlaod_url.append(matches[0])
        # The server labels the page iso-8859-1 but the bytes are UTF-8;
        # re-encode/decode to recover the real title text.
        file_name.append(titles[0].encode("iso-8859-1").decode("utf-8") + '.rar')


def get_file():
    """Download every archive in ``downlaod_url`` into the ``简历模板/``
    directory, naming each file from the parallel ``file_name`` list and
    printing a running count of successful downloads.
    """
    saved = 0
    for name, url in zip(file_name, downlaod_url):
        # One request per file: the original fetched every URL twice —
        # once for .content and again for .status_code.
        resp = requests.get(url=url, headers=headers)
        if resp.status_code != 200:
            # Bug fix: the original printed the literal "(unknown)" instead
            # of the file name (broken f-string placeholder).
            print(f"文件{name}下载失败")
            continue
        saved += 1
        filename = "简历模板/" + name
        with open(filename, 'wb') as fp:
            fp.write(resp.content)
            print("下载完成", filename, saved)


if __name__ == "__main__":
    jump_url = []
    downlaod_url = []
    file_name = []
    if not os.path.exists('./简历模板'):
        os.mkdir('./简历模板')
    url = "http://sc.chinaz.com/jianli/free.html"
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36"}
    response = requests.get(url=url,headers=headers).text
    url_values()
    Download_page()
    print(len(downlaod_url))

笔记思路入门参考

python3爬虫实践(正则+xpath 站长素材免费简历模板爬取)

声明:本文内容由互联网用户自发贡献自行上传,本网站不拥有所有权,未作人工编辑处理,也不承担相关法律责任。如果您发现有涉嫌版权的内容,欢迎进行举报,并提供相关证据,工作人员会在5个工作日内联系你,一经查实,本站将立刻删除涉嫌侵权内容。