从Python爬虫到Spark预处理数据的真实需求[二]

时间:2020-8-29 作者:admin


絮叨两句:
博主是一名软件工程系的在校生,利用博客记录自己所学的知识,也希望能帮助到正在学习的同学们
人的一生中会遇到各种各样的困难和折磨,逃避是解决不了问题的,唯有以乐观的精神去迎接生活的挑战
少年易老学难成,一寸光阴不可轻。
最喜欢的一句话:今日事,今日毕


博主刚刚接触爬虫,有什么不足之处请大家谅解,也希望能指导一下


系列文章目录

从Python爬虫到Spark预处理数据的真实需求[一]
从Python爬虫到Spark预处理数据的真实需求[二]
从Python爬虫到Spark预处理数据的真实需求[三]
从Python爬虫到Spark预处理数据的真实需求[四]
从Python爬虫到Spark预处理数据的真实需求[五]


文章目录


前言

使用Selenium进行自动渲染获取数据


提示:以下是本篇文章正文内容,下面案例可供参考

思路

以机油为例:

                                                          首先进入机油的界面

从Python爬虫到Spark预处理数据的真实需求[二]


                                                          可以发现有很多品牌

从Python爬虫到Spark预处理数据的真实需求[二]
从Python爬虫到Spark预处理数据的真实需求[二]
按F12 去找到所有分类的父标签

从Python爬虫到Spark预处理数据的真实需求[二]

可以发现每一个分类就是一个Li标签,我们可以通过这些标签进入每一个品牌中,跳转到每一品牌的商品列表上

从Python爬虫到Spark预处理数据的真实需求[二]
然后获取每一个商品的详情链接,价格,品牌的名称,商品的图片,商品的标题名称


                                                          重点就是进行翻页的动作

从Python爬虫到Spark预处理数据的真实需求[二]
有两种方式:

  1. 去发现每一页地址链接的规律
  2. 使用自动点击的方式进行获取下一页

在下面的代码中我个人使用的是第一种方式
我之前的文章有第二种方式的使用
Selenium:测试抓取京东数据练习[汽车用品-机油]

                                      最后注意一点就是每进入新的一页它是会显示一半数据
                                                    另一半数据需要手动的下拉滑动

从Python爬虫到Spark预处理数据的真实需求[二]

下拉滑动的动作写得有点丑陋,大家见笑了:

# browser: a Selenium WebDriver instance.

def windows(browser):
    """Scroll the page toward the bottom, back to the top, then toward
    the bottom again so lazily loaded list items get rendered."""
    descent = list(range(0, 10000, 50))
    for offset in descent:
        windowBout(browser, offset)
    for offset in range(10000, 0, -50):
        windowTop(browser, offset)
    for offset in descent:
        windowBout(browser, offset)


def windowBout(browser, i):
    """Scroll the window to vertical position ``i`` (downward pass)."""
    browser.execute_script(f"window.scrollTo(0,{i})")


def windowTop(browser, i):
    """Scroll the window to vertical position ``i`` (upward pass)."""
    browser.execute_script(f"window.scrollTo(0,{i})")



在这里我只获取了每个商品详情的链接,并没有再进行下一步获取。在之前的文章《Selenium:测试抓取京东数据练习[汽车用品-机油]》中是获取所有详情页面的数据再进行返回,不过效率太慢,所以我只获取了商品详情的链接。

接下来上代码

机油

from  bs4 import BeautifulSoup
import time as ti
import random
from fake_useragent import UserAgent
from selenium import webdriver

def getUrl(url):
    """Crawl the JD engine-oil category: for every brand facet found on
    the category page, walk that brand's listing pages and hand each
    rendered page to getProduct().

    :param url: category listing URL (list.jd.com) to start from.
    """
    bowser = webdriver.Chrome()
    bowser.get(url)
    brand_home=bowser.page_source
    JY_soup = BeautifulSoup(brand_home, 'html.parser')
    # One <li> per brand inside the brand-logo filter box.
    J_valueList_li_All = JY_soup.find('div', attrs={'class': 'sl-v-logos'}).find('ul', attrs={
        'class': 'J_valueList v-fixed'}).findAll('li')
    # Second browser reused for every brand page. NOTE(review): neither
    # browser is ever quit() — resource leak; consider try/finally + .quit().
    bowser2 = webdriver.Chrome()
    for li in J_valueList_li_All:
            brand_href=f"https://list.jd.com{li.find('a')['href']}"
            brand_name = f"{li.find('a')['title']}"
            print("品牌分类:----->", brand_name, brand_href)  # log: brand facet
            bowser2.get(brand_href)
            windows(bowser2)  # scroll down/up/down so lazy-loaded items render
            brand_html = bowser2.page_source
            ca_Html = BeautifulSoup(brand_html, 'html.parser')
            # 'p-skip' is the "jump to page N" pager widget; absent when the
            # result set fits on a single page.
            b_title = ca_Html.find('span', attrs={'class': 'p-skip'})
            if b_title == None:  # NOTE(review): prefer `is None`
                '''
                直接解析获取商品
                '''
                # Single page of results: parse it directly.
                print('没有下一页直接获取数据')
                getProduct(ca_Html, brand_name=brand_name)
            else:
                # Total page count from the <b> inside the pager widget.
                b_fy_Number = int(b_title.find('b').text)
                print('共:',b_fy_Number)
                print("--------------------------------------第1页---------------------------------")
                '''获取当前页的数据'''
                getProduct(ca_Html, brand_name=brand_name)
                # list.jd.com paging: `page` advances by 2 and `s` (item
                # offset) by 60 per visible page — apparently mirroring the
                # site's own pagination URLs; TODO confirm against the site.
                page = 3
                s = 61
                xh_count = 0
                for i in range(2, b_fy_Number+1):
                    print(f"--------------------------------------第{i}页---------------------------------")
                    fy_page_Href = f"{brand_href}&page={page}&s={s}&click=0"
                    bowser2.get(fy_page_Href)
                    windows(bowser2)
                    ti.sleep(1)  # give the page a moment to finish rendering
                    fy_page_href_html = bowser2.page_source
                    fy_Html_soup = BeautifulSoup(fy_page_href_html, 'html.parser')
                    # A 'result' span marks an empty/no-results page: stop paging.
                    if fy_Html_soup.find('span', attrs={'class': 'result'}) != None:
                        print(fy_Html_soup.find('span', attrs={'class': 'result'}))
                        break
                    getProduct(fy_Html_soup, brand_name=brand_name)
                    page += 2
                    s += 60
                    xh_count += 1
                    if xh_count == 100:  # hard safety cap on pages per brand
                        break

def getProduct(barn_soup, brand_name):
    """Extract the detail link and price of every product on one rendered
    listing page and append the records to the output file via savUrl().

    :param barn_soup: BeautifulSoup document of a JD listing page.
    :param brand_name: brand label stored with each record.
    """
    URL_NAME = []
    li_All = barn_soup.find('div', attrs={'id': 'J_goodsList'}).findAll('li')
    for li in li_All:
        anchor = li.find('a')
        if anchor is None:
            # Non-product <li> entries carry no link — skip them.
            continue
        https_li_href = f"https:{anchor['href']}"
        # Product price text, e.g. "199.00".
        p_price = li.find('div', attrs={'class': 'p-price'}).find('i').text
        # NOTE: 'bran_name' key kept as-is — downstream readers expect it.
        URL_NAME.append({'href_url': https_li_href, 'bran_name': brand_name, 'price': p_price})
    savUrl(URL_NAME)


def savUrl(URL_NAME_Array, path='D:\\url\\jy\\JD_JY_URLS_price.txt'):
    """Append one str(dict) record per line to *path*.

    :param URL_NAME_Array: list of record dicts produced by getProduct.
    :param path: output file; new optional parameter whose default keeps
        the original hard-coded location (backward compatible).
    """
    # Open the file once instead of re-opening it for every record.
    with open(path, 'a', encoding='utf-8') as urls:
        for url_name in URL_NAME_Array:
            # '\r' line terminator preserved from the original format.
            urls.write(str(url_name) + '\r')

# browser: a Selenium WebDriver instance.

def windows(browser):
    """Scroll the page toward the bottom, back to the top, then toward
    the bottom again so lazily loaded products get rendered."""
    descent = list(range(0, 10000, 50))
    for offset in descent:
        windowBout(browser, offset)
    for offset in range(10000, 0, -50):
        windowTop(browser, offset)
    for offset in descent:
        windowBout(browser, offset)


def windowBout(browser, i):
    """Scroll the window to vertical position ``i`` (downward pass)."""
    browser.execute_script(f"window.scrollTo(0,{i})")


def windowTop(browser, i):
    """Scroll the window to vertical position ``i`` (upward pass)."""
    browser.execute_script(f"window.scrollTo(0,{i})")


if __name__ == '__main__':
    # Engine-oil category entry point.
    url = "https://list.jd.com/list.html?cat=6728,6742,11849"
    getUrl(url)

轮胎

from  bs4 import BeautifulSoup
import time as ti
import random
from fake_useragent import UserAgent
from selenium import webdriver

def getUrl(url):
    """Crawl JD tyre search results: for every brand facet on the search
    page, walk that brand's listing pages and hand each rendered page to
    getProduct().

    :param url: search.jd.com URL to start from.
    """
    bowser = webdriver.Chrome()
    bowser.get(url)
    brand_home=bowser.page_source
    JY_soup = BeautifulSoup(brand_home, 'html.parser')
    # One <li> per brand inside the brand-logo filter box.
    J_valueList_li_All = JY_soup.find('div', attrs={'class': 'sl-v-logos'}).find('ul', attrs={
        'class': 'J_valueList v-fixed'}).findAll('li')
    # NOTE(review): neither browser is ever quit() — resource leak.
    bowser2 = webdriver.Chrome()
    for li in J_valueList_li_All:
            brand_href=f"https://search.jd.com/{li.find('a')['href']}"
            brand_name = f"{li.find('a')['title']}"
            print("品牌分类:----->", brand_name, brand_href)  # log: brand facet
            bowser2.get(brand_href)
            windows(bowser2)  # scroll so lazy-loaded items render
            brand_html = bowser2.page_source
            ca_Html = BeautifulSoup(brand_html, 'html.parser')
            # 'p-skip' pager widget is absent on single-page result sets.
            b_title = ca_Html.find('span', attrs={'class': 'p-skip'})
            if b_title == None:  # NOTE(review): prefer `is None`
                '''
                直接解析获取商品
                '''
                # Single page of results: parse it directly.
                print('没有下一页直接获取数据')
                getProduct(ca_Html, brand_name=brand_name)
            else:
                # Total page count from the <b> inside the pager widget.
                b_fy_Number = int(b_title.find('b').text)
                print('共:',b_fy_Number)
                print("--------------------------------------第1页---------------------------------")
                '''获取当前页的数据'''
                getProduct(ca_Html, brand_name=brand_name)
                # search.jd.com paging: `page` advances by 2, `s` by 50 per
                # visible page — apparently mirroring the site's own URLs;
                # TODO confirm.
                page = 3
                s = 51
                xh_count = 0
                for i in range(2, b_fy_Number+1):
                    print(f"--------------------------------------第{i}页---------------------------------")
                    fy_page_Href = f"{brand_href}&cid2=6742&&page={page}&s={s}&click=0"
                    bowser2.get(fy_page_Href)
                    windows(bowser2)
                    ti.sleep(1)  # give the page a moment to finish rendering
                    fy_page_href_html = bowser2.page_source
                    fy_Html_soup = BeautifulSoup(fy_page_href_html, 'html.parser')
                    # A 'result' span marks an empty/no-results page: stop paging.
                    if fy_Html_soup.find('span', attrs={'class': 'result'}) != None:
                        print(fy_Html_soup.find('span', attrs={'class': 'result'}))
                        break
                    getProduct(fy_Html_soup, brand_name=brand_name)
                    page += 2
                    s += 50
                    xh_count += 1
                    if xh_count == 100:  # hard safety cap on pages per brand
                        break

def getProduct(barn_soup, brand_name):
    """Extract the detail link and price of every product on one rendered
    listing page and append the records to the output file via savUrl().

    :param barn_soup: BeautifulSoup document of a JD listing page.
    :param brand_name: brand label stored with each record.
    """
    URL_NAME = []
    li_All = barn_soup.find('div', attrs={'id': 'J_goodsList'}).findAll('li')
    for li in li_All:
        anchor = li.find('a')
        if anchor is None:
            # Non-product <li> entries carry no link — skip them.
            continue
        https_li_href = f"https:{anchor['href']}"
        # Product price text, e.g. "199.00".
        p_price = li.find('div', attrs={'class': 'p-price'}).find('i').text
        # NOTE: 'bran_name' key kept as-is — downstream readers expect it.
        URL_NAME.append({'href_url': https_li_href, 'bran_name': brand_name, 'price': p_price})
    savUrl(URL_NAME)


def savUrl(URL_NAME_Array, path='D:\\url\\luntai\\JD_LT_URLS.txt'):
    """Append one str(dict) record per line to *path*.

    :param URL_NAME_Array: list of record dicts produced by getProduct.
    :param path: output file; new optional parameter whose default keeps
        the original hard-coded location (backward compatible).
    """
    # Open the file once instead of re-opening it for every record.
    with open(path, 'a', encoding='utf-8') as urls:
        for url_name in URL_NAME_Array:
            # '\r' line terminator preserved from the original format.
            urls.write(str(url_name) + '\r')

def windows(browser):
    """Scroll the page toward the bottom, back to the top, then toward
    the bottom again so lazily loaded products get rendered.

    :param browser: a Selenium WebDriver instance.
    """
    descent = list(range(0, 10000, 50))
    for offset in descent:
        windowBout(browser, offset)
    for offset in range(10000, 0, -50):
        windowTop(browser, offset)
    for offset in descent:
        windowBout(browser, offset)


def windowBout(browser, i):
    """Scroll the window to vertical position ``i`` (downward pass)."""
    browser.execute_script(f"window.scrollTo(0,{i})")


def windowTop(browser, i):
    """Scroll the window to vertical position ``i`` (upward pass)."""
    browser.execute_script(f"window.scrollTo(0,{i})")


if __name__ == '__main__':
    # Tyre search entry point.
    url = "https://search.jd.com/Search?keyword=%E8%BD%AE%E8%83%8E&enc=utf-8&wq=&pvid=b2160a1bc78b4897827700e1dba8e242"
    getUrl(url)

刹车片

from  bs4 import BeautifulSoup
import time as ti
import random
from fake_useragent import UserAgent
from selenium import webdriver

def getUrl(url):
    """Crawl the JD brake-pad collection (coll.jd.com): for every brand
    facet, walk that brand's listing pages and hand each rendered page to
    getProduct().

    :param url: coll.jd.com collection URL to start from.
    """
    bowser = webdriver.Chrome()
    bowser.get(url)
    brand_home=bowser.page_source
    JY_soup = BeautifulSoup(brand_home, 'html.parser')
    # One <li> per brand inside the brand-logo filter box.
    J_valueList_li_All = JY_soup.find('div', attrs={'class': 'sl-v-logos'}).find('ul', attrs={
        'class': 'J_valueList v-fixed'}).findAll('li')
    # NOTE(review): neither browser is ever quit() — resource leak.
    bowser2 = webdriver.Chrome()
    for li in J_valueList_li_All:
            brand_href=f"https://coll.jd.com{li.find('a')['href']}"
            # Trim the URL right after 'JL=3' (index of 'JL=3_' plus 4 chars)
            # so the paging suffix can be appended below.
            brand_href=brand_href[:brand_href.index('JL=3_')+4]
            brand_name = f"{li.find('a')['title']}"
            print("品牌分类:----->", brand_name, brand_href)  # log: brand facet
            bowser2.get(brand_href)
            windows(bowser2)  # scroll so lazy-loaded items render
            brand_html = bowser2.page_source
            ca_Html = BeautifulSoup(brand_html, 'html.parser')
            # 'p-skip' pager widget is absent on single-page result sets.
            b_title = ca_Html.find('span', attrs={'class': 'p-skip'})
            if b_title == None:  # NOTE(review): prefer `is None`
                '''
                直接解析获取商品
                '''
                # Single page of results: parse it directly.
                print('没有下一页直接获取数据')
                getProduct(ca_Html, brand_name=brand_name)
            else:
                # Total page count from the <b> inside the pager widget.
                b_fy_Number = int(b_title.find('b').text)
                print('共:',b_fy_Number)
                print("--------------------------------------第1页---------------------------------")
                '''获取当前页的数据'''
                getProduct(ca_Html, brand_name=brand_name)
                # coll.jd.com paging advances `page` by 1 per visible page.
                page = 2
                xh_count = 0
                for i in range(2, b_fy_Number+1):
                    print(f"--------------------------------------第{i}页---------------------------------")
                    # https: // coll.jd.com / list.html?sub = 23867 & ev = exbrand_6927 & JL = 3
                    fy_page_Href = f"{str(brand_href).replace('JL=3','')}&page={page}&JL=6_0_0"
                    print(fy_page_Href)
                    bowser2.get(fy_page_Href)
                    windows(bowser2)
                    ti.sleep(1)  # give the page a moment to finish rendering
                    fy_page_href_html = bowser2.page_source
                    fy_Html_soup = BeautifulSoup(fy_page_href_html, 'html.parser')
                    # A 'result' span marks an empty/no-results page: stop paging.
                    if fy_Html_soup.find('span', attrs={'class': 'result'}) != None:
                        print(fy_Html_soup.find('span', attrs={'class': 'result'}))
                        break
                    getProduct(fy_Html_soup, brand_name=brand_name)
                    page += 1
                    xh_count += 1
                    if xh_count == 100:  # hard safety cap on pages per brand
                        break

def getProduct(barn_soup, brand_name):
    """Extract link, price and SKU id of every product on one rendered
    coll.jd.com listing page and append the records via savUrl().

    :param barn_soup: BeautifulSoup document of a listing page.
    :param brand_name: brand label stored with each record.
    """
    URL_NAME = []
    li_All = barn_soup.find('ul', attrs={'class': 'gl-warp clearfix'}).findAll('li')
    for li in li_All:
        anchor = li.find('a')
        if anchor is None:
            # Non-product <li> entries carry no link — skip them.
            continue
        https_li_href = f"https:{anchor['href']}"
        # Product price text, e.g. "199.00".
        p_price = li.find('div', attrs={'class': 'p-price'}).find('i').text
        sku = li.find('div', attrs={'class': 'gl-i-wrap j-sku-item'})['data-sku']
        print(sku)
        # NOTE: 'bran_name' key kept as-is — downstream readers expect it.
        URL_NAME.append({'href_url': https_li_href, 'bran_name': brand_name, 'price': p_price, 'skuId': sku})
    savUrl(URL_NAME)


def savUrl(URL_NAME_Array, path='D:\\url\\SCP\\JD_SCP_URLS.txt'):
    """Append one str(dict) record per line to *path*.

    :param URL_NAME_Array: list of record dicts produced by getProduct.
    :param path: output file; new optional parameter whose default keeps
        the original hard-coded location (backward compatible).
    """
    # Open the file once instead of re-opening it for every record.
    with open(path, 'a', encoding='utf-8') as urls:
        for url_name in URL_NAME_Array:
            # '\r' line terminator preserved from the original format.
            urls.write(str(url_name) + '\r')

def windows(browser):
    """Scroll the page toward the bottom, back to the top, then toward
    the bottom again so lazily loaded products get rendered.

    :param browser: a Selenium WebDriver instance.
    """
    descent = list(range(0, 10000, 50))
    for offset in descent:
        windowBout(browser, offset)
    for offset in range(10000, 0, -50):
        windowTop(browser, offset)
    for offset in descent:
        windowBout(browser, offset)


def windowBout(browser, i):
    """Scroll the window to vertical position ``i`` (downward pass)."""
    browser.execute_script(f"window.scrollTo(0,{i})")


def windowTop(browser, i):
    """Scroll the window to vertical position ``i`` (upward pass)."""
    browser.execute_script(f"window.scrollTo(0,{i})")


if __name__ == '__main__':
    # Brake-pad collection entry point.
    url = "https://coll.jd.com/list.html?sub=23867"
    getUrl(url)

添加剂

from  bs4 import BeautifulSoup
import time as ti
import random
from fake_useragent import UserAgent
from selenium import webdriver

def getUrl(url):
    """Crawl JD additive search results: for every brand facet on the
    search page, walk that brand's listing pages and hand each rendered
    page to getProduct().

    :param url: search.jd.com URL to start from.
    """
    bowser = webdriver.Chrome()
    bowser.get(url)
    brand_home=bowser.page_source
    JY_soup = BeautifulSoup(brand_home, 'html.parser')
    # One <li> per brand inside the brand-logo filter box.
    J_valueList_li_All = JY_soup.find('div', attrs={'class': 'sl-v-logos'}).find('ul', attrs={
        'class': 'J_valueList v-fixed'}).findAll('li')
    # NOTE(review): neither browser is ever quit() — resource leak.
    bowser2 = webdriver.Chrome()
    for li in J_valueList_li_All:
            brand_href=f"https://search.jd.com/{li.find('a')['href']}"
            brand_name = f"{li.find('a')['title']}"
            print("品牌分类:----->", brand_name, brand_href)  # log: brand facet
            bowser2.get(brand_href)
            windows(bowser2)  # scroll so lazy-loaded items render
            brand_html = bowser2.page_source
            ca_Html = BeautifulSoup(brand_html, 'html.parser')
            # 'p-skip' pager widget is absent on single-page result sets.
            b_title = ca_Html.find('span', attrs={'class': 'p-skip'})
            if b_title == None:  # NOTE(review): prefer `is None`
                '''
                直接解析获取商品
                '''
                # Single page of results: parse it directly.
                print('没有下一页直接获取数据')
                getProduct(ca_Html, brand_name=brand_name)
            else:
                # Total page count from the <b> inside the pager widget.
                b_fy_Number = int(b_title.find('b').text)
                print('共:',b_fy_Number)
                print("--------------------------------------第1页---------------------------------")
                '''获取当前页的数据'''
                getProduct(ca_Html, brand_name=brand_name)
                # search.jd.com paging: `page` advances by 2, `s` by 50 per
                # visible page — apparently mirroring the site's own URLs;
                # TODO confirm.
                page = 3
                s = 51
                xh_count = 0
                for i in range(2, b_fy_Number+1):
                    print(f"--------------------------------------第{i}页---------------------------------")
                    fy_page_Href = f"{brand_href}&cid2=6742&&page={page}&s={s}&click=0"
                    bowser2.get(fy_page_Href)
                    windows(bowser2)
                    ti.sleep(1)  # give the page a moment to finish rendering
                    fy_page_href_html = bowser2.page_source
                    fy_Html_soup = BeautifulSoup(fy_page_href_html, 'html.parser')
                    # A 'result' span marks an empty/no-results page: stop paging.
                    if fy_Html_soup.find('span', attrs={'class': 'result'}) != None:
                        print(fy_Html_soup.find('span', attrs={'class': 'result'}))
                        break
                    getProduct(fy_Html_soup, brand_name=brand_name)
                    page += 2
                    s += 50
                    xh_count += 1
                    if xh_count == 100:  # hard safety cap on pages per brand
                        break

def getProduct(barn_soup, brand_name):
    """Extract the detail link and price of every product on one rendered
    listing page and append the records to the output file via savUrl().

    :param barn_soup: BeautifulSoup document of a JD listing page.
    :param brand_name: brand label stored with each record.
    """
    URL_NAME = []
    li_All = barn_soup.find('div', attrs={'id': 'J_goodsList'}).findAll('li')
    for li in li_All:
        anchor = li.find('a')
        if anchor is None:
            # Non-product <li> entries carry no link — skip them.
            continue
        https_li_href = f"https:{anchor['href']}"
        # Product price text, e.g. "199.00".
        p_price = li.find('div', attrs={'class': 'p-price'}).find('i').text
        # NOTE: 'bran_name' key kept as-is — downstream readers expect it.
        URL_NAME.append({'href_url': https_li_href, 'bran_name': brand_name, 'price': p_price})
    savUrl(URL_NAME)


def savUrl(URL_NAME_Array, path='D:\\url\\tjj\\JD_TJJ_URLS.txt'):
    """Append one str(dict) record per line to *path*.

    :param URL_NAME_Array: list of record dicts produced by getProduct.
    :param path: output file; new optional parameter whose default keeps
        the original hard-coded location (backward compatible).
    """
    # Open the file once instead of re-opening it for every record.
    with open(path, 'a', encoding='utf-8') as urls:
        for url_name in URL_NAME_Array:
            # '\r' line terminator preserved from the original format.
            urls.write(str(url_name) + '\r')

def windows(browser):
    """Scroll the page toward the bottom, back to the top, then toward
    the bottom again so lazily loaded products get rendered.

    :param browser: a Selenium WebDriver instance.
    """
    descent = list(range(0, 10000, 50))
    for offset in descent:
        windowBout(browser, offset)
    for offset in range(10000, 0, -50):
        windowTop(browser, offset)
    for offset in descent:
        windowBout(browser, offset)


def windowBout(browser, i):
    """Scroll the window to vertical position ``i`` (downward pass)."""
    browser.execute_script(f"window.scrollTo(0,{i})")


def windowTop(browser, i):
    """Scroll the window to vertical position ``i`` (upward pass)."""
    browser.execute_script(f"window.scrollTo(0,{i})")


if __name__ == '__main__':
    # Additive search entry point.
    url = "https://search.jd.com/search?keyword=%E6%B7%BB%E5%8A%A0%E5%89%82&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%B7%BB%E5%8A%A0%E5%89%82&stock=1&cid3=11850#J_searchWrap"
    getUrl(url)

原厂件

from  bs4 import BeautifulSoup
import time as ti
import random
from fake_useragent import UserAgent
from selenium import webdriver

def getUrl(url):
    """Walk the JD OEM-parts collection (coll.jd.com) brand by brand and
    page by page. NOTE(review): every getProduct() call in this variant is
    commented out, so this currently only visits pages (a dry run) without
    extracting or saving anything.

    :param url: coll.jd.com collection URL to start from.
    """
    bowser = webdriver.Chrome()
    bowser.get(url)
    brand_home=bowser.page_source
    print(brand_home)  # debug dump of the raw page source
    JY_soup = BeautifulSoup(brand_home, 'html.parser')
    # One <li> per brand inside the brand-logo filter box.
    J_valueList_li_All = JY_soup.find('div', attrs={'class': 'sl-v-logos'}).find('ul', attrs={
        'class': 'J_valueList v-fixed'}).findAll('li')
    # NOTE(review): neither browser is ever quit() — resource leak.
    bowser2 = webdriver.Chrome()
    for li in J_valueList_li_All:
            brand_href=f"https://coll.jd.com{li.find('a')['href']}"
            # Trim the URL right after 'JL=3' (index of 'JL=3_' plus 4 chars)
            # so the paging suffix can be appended below.
            brand_href=brand_href[:brand_href.index('JL=3_')+4]
            brand_name = f"{li.find('a')['title']}"
            print("品牌分类:----->", brand_name, brand_href)  # log: brand facet
            bowser2.get(brand_href)
            windows(bowser2)  # scroll so lazy-loaded items render
            brand_html = bowser2.page_source
            ca_Html = BeautifulSoup(brand_html, 'html.parser')
            # 'p-skip' pager widget is absent on single-page result sets.
            b_title = ca_Html.find('span', attrs={'class': 'p-skip'})
            if b_title == None:  # NOTE(review): prefer `is None`
                '''
                直接解析获取商品
                '''
                # Single page of results; extraction is disabled here.
                print('没有下一页直接获取数据')
                # getProduct(ca_Html, brand_name=brand_name)
            else:
                # Total page count from the <b> inside the pager widget.
                b_fy_Number = int(b_title.find('b').text)
                print('共:',b_fy_Number)
                print("--------------------------------------第1页---------------------------------")
                '''获取当前页的数据'''
                # getProduct(ca_Html, brand_name=brand_name)
                # coll.jd.com paging advances `page` by 1 per visible page.
                page = 2
                xh_count = 0
                for i in range(2, b_fy_Number+1):
                    print(f"--------------------------------------第{i}页---------------------------------")
                    # https: // coll.jd.com / list.html?sub = 23867 & ev = exbrand_6927 & JL = 3
                    fy_page_Href = f"{str(brand_href).replace('JL=3','')}&page={page}&JL=6_0_0"
                    print(fy_page_Href)
                    bowser2.get(fy_page_Href)
                    windows(bowser2)
                    ti.sleep(1)  # give the page a moment to finish rendering
                    fy_page_href_html = bowser2.page_source
                    fy_Html_soup = BeautifulSoup(fy_page_href_html, 'html.parser')
                    # A 'result' span marks an empty/no-results page: stop paging.
                    if fy_Html_soup.find('span', attrs={'class': 'result'}) != None:
                        print(fy_Html_soup.find('span', attrs={'class': 'result'}))
                        break
                    # getProduct(fy_Html_soup, brand_name=brand_name)
                    page += 1
                    xh_count += 1
                    if xh_count == 100:  # hard safety cap on pages per brand
                        break

def getProduct(barn_soup, brand_name):
    """Extract link, price and SKU id of every product on one rendered
    coll.jd.com listing page.

    NOTE: persisting is intentionally disabled in this variant (the
    savUrl call is commented out, matching the original) — records are
    collected and then discarded.

    :param barn_soup: BeautifulSoup document of a listing page.
    :param brand_name: brand label stored with each record.
    """
    URL_NAME = []
    li_All = barn_soup.find('ul', attrs={'class': 'gl-warp clearfix'}).findAll('li')
    for li in li_All:
        anchor = li.find('a')
        if anchor is None:
            # Non-product <li> entries carry no link — skip them.
            continue
        https_li_href = f"https:{anchor['href']}"
        # Product price text, e.g. "199.00".
        p_price = li.find('div', attrs={'class': 'p-price'}).find('i').text
        sku = li.find('div', attrs={'class': 'gl-i-wrap j-sku-item'})['data-sku']
        print(sku)
        # NOTE: 'bran_name' key kept as-is — downstream readers expect it.
        URL_NAME.append({'href_url': https_li_href, 'bran_name': brand_name, 'price': p_price, 'skuId': sku})
    # savUrl(URL_NAME)


def savUrl(URL_NAME_Array, path='D:\\url\\YCJ\\JD_YCJ_URLS.txt'):
    """Append one str(dict) record per line to *path*.

    :param URL_NAME_Array: list of record dicts produced by getProduct.
    :param path: output file; new optional parameter whose default keeps
        the original hard-coded location (backward compatible).
    """
    # Open the file once instead of re-opening it for every record.
    with open(path, 'a', encoding='utf-8') as urls:
        for url_name in URL_NAME_Array:
            # '\r' line terminator preserved from the original format.
            urls.write(str(url_name) + '\r')

def windows(browser):
    """Scroll the page toward the bottom, back to the top, then toward
    the bottom again so lazily loaded products get rendered.

    :param browser: a Selenium WebDriver instance.
    """
    descent = list(range(0, 10000, 50))
    for offset in descent:
        windowBout(browser, offset)
    for offset in range(10000, 0, -50):
        windowTop(browser, offset)
    for offset in descent:
        windowBout(browser, offset)


def windowBout(browser, i):
    """Scroll the window to vertical position ``i`` (downward pass)."""
    browser.execute_script(f"window.scrollTo(0,{i})")


def windowTop(browser, i):
    """Scroll the window to vertical position ``i`` (upward pass)."""
    browser.execute_script(f"window.scrollTo(0,{i})")


if __name__ == '__main__':
    # OEM-parts collection entry point.
    url = "https://coll.jd.com/list.html?sub=42052"
    getUrl(url)

火花塞

from  bs4 import BeautifulSoup
import time as ti
import random
from fake_useragent import UserAgent
from selenium import webdriver

def getUrl(url):
    """Crawl the JD spark-plug category: for every brand facet on the
    category page, walk that brand's listing pages. Only the first page of
    each brand is parsed here — the per-page getProduct() call inside the
    pagination loop is commented out in the original.

    :param url: list.jd.com category URL to start from.
    """
    bowser = webdriver.Chrome()
    bowser.get(url)
    brand_home=bowser.page_source
    JY_soup = BeautifulSoup(brand_home, 'html.parser')
    # Brand facets live under 'sl-v-list' on this page (not 'sl-v-logos'
    # as in the other category scripts).
    J_valueList_li_All = JY_soup.find('div', attrs={'class': 'sl-v-list'}).find('ul', attrs={
        'class': 'J_valueList v-fixed'}).findAll('li')
    # NOTE(review): neither browser is ever quit() — resource leak.
    bowser2 = webdriver.Chrome()
    for li in J_valueList_li_All:
            brand_href=f"https://list.jd.com{li.find('a')['href']}"
            brand_name = f"{li.find('a')['title']}"
            print("品牌分类:----->", brand_name, brand_href)  # log: brand facet
            bowser2.get(brand_href)
            windows(bowser2)  # scroll so lazy-loaded items render
            brand_html = bowser2.page_source
            ca_Html = BeautifulSoup(brand_html, 'html.parser')
            # 'p-skip' pager widget is absent on single-page result sets.
            b_title = ca_Html.find('span', attrs={'class': 'p-skip'})
            if b_title == None:  # NOTE(review): prefer `is None`
                '''
                直接解析获取商品
                '''
                # Single page of results: parse it directly.
                print('没有下一页直接获取数据')
                getProduct(ca_Html, brand_name=brand_name)
            else:
                # Total page count from the <b> inside the pager widget.
                b_fy_Number = int(b_title.find('b').text)
                print('共:',b_fy_Number)
                print("--------------------------------------第1页---------------------------------")
                '''获取当前页的数据'''
                getProduct(ca_Html, brand_name=brand_name)
                # Paging: `page` advances by 2, `s` by 52 per visible page —
                # apparently mirroring the site's own URLs; TODO confirm.
                page = 3
                s = 53
                xh_count = 0
                for i in range(2, b_fy_Number+1):
                    print(f"--------------------------------------第{i}页---------------------------------")
                    fy_page_Href = f"{brand_href}&cid2=6742&&page={page}&s={s}&click=0"
                    bowser2.get(fy_page_Href)
                    windows(bowser2)
                    ti.sleep(1)  # give the page a moment to finish rendering
                    fy_page_href_html = bowser2.page_source
                    fy_Html_soup = BeautifulSoup(fy_page_href_html, 'html.parser')
                    # A 'result' span marks an empty/no-results page: stop paging.
                    if fy_Html_soup.find('span', attrs={'class': 'result'}) != None:
                        print(fy_Html_soup.find('span', attrs={'class': 'result'}))
                        break
                    # getProduct(fy_Html_soup, brand_name=brand_name)
                    page += 2
                    s += 52
                    xh_count += 1
                    if xh_count == 100:  # hard safety cap on pages per brand
                        break

def getProduct(barn_soup, brand_name):
    """Extract link, price and SKU id of every product on one rendered
    listing page and append the records to the output file via savUrl().

    :param barn_soup: BeautifulSoup document of a JD listing page.
    :param brand_name: brand label stored with each record.
    """
    URL_NAME = []
    li_All = barn_soup.find('div', attrs={'id': 'J_goodsList'}).findAll('li')
    for li in li_All:
        anchor = li.find('a')
        if anchor is None:
            # Non-product <li> entries carry no link — skip them.
            continue
        https_li_href = f"https:{anchor['href']}"
        # Product price text, e.g. "199.00".
        p_price = li.find('div', attrs={'class': 'p-price'}).find('i').text
        # SKU id is carried on the <li> itself on this page layout.
        sku = li['data-sku']
        # NOTE: 'bran_name' key kept as-is — downstream readers expect it.
        URL_NAME.append({'href_url': https_li_href, 'bran_name': brand_name, 'price': p_price, 'skuId': sku})
    savUrl(URL_NAME)


def savUrl(URL_NAME_Array, path='D:\\url\\HHS\\JD_HHS_URLS.txt'):
    """Append one str(dict) record per line to *path*.

    :param URL_NAME_Array: list of record dicts produced by getProduct.
    :param path: output file; new optional parameter whose default keeps
        the original hard-coded location (backward compatible).
    """
    # Open the file once instead of re-opening it for every record.
    with open(path, 'a', encoding='utf-8') as urls:
        for url_name in URL_NAME_Array:
            # '\r' line terminator preserved from the original format.
            urls.write(str(url_name) + '\r')

def windows(browser):
    """Scroll the page toward the bottom, back to the top, then toward
    the bottom again so lazily loaded products get rendered.

    :param browser: a Selenium WebDriver instance.
    """
    descent = list(range(0, 10000, 50))
    for offset in descent:
        windowBout(browser, offset)
    for offset in range(10000, 0, -50):
        windowTop(browser, offset)
    for offset in descent:
        windowBout(browser, offset)


def windowBout(browser, i):
    """Scroll the window to vertical position ``i`` (downward pass)."""
    browser.execute_script(f"window.scrollTo(0,{i})")


def windowTop(browser, i):
    """Scroll the window to vertical position ``i`` (upward pass)."""
    browser.execute_script(f"window.scrollTo(0,{i})")


if __name__ == '__main__':
    # Spark-plug category entry point.
    url = "https://list.jd.com/list.html?cat=6728,6742,6767"
    getUrl(url)

总结

以上内容就是使用Selenium进行获取数据,有什么不足希望大家能进行指导,记得点赞

声明:本文内容由互联网用户自发贡献自行上传,本网站不拥有所有权,未作人工编辑处理,也不承担相关法律责任。如果您发现有涉嫌版权的内容,欢迎进行举报,并提供相关证据,工作人员会在5个工作日内联系你,一经查实,本站将立刻删除涉嫌侵权内容。