Python

时间:2020-9-4 作者:admin


selenium+Xpath+csv爬取京东商品信息

selenium爬取京东的手机商品信息

利用pycharm爬取京东商城的手机商品的信息(价格,型号,样式,或者内存详细信息)

python的入门库

师傅领进门修行在个人!!!

import requests#请求库
from bs4 import BeautifulSoup #解析网页用
import lxml #解析网页

# Request headers that imitate a desktop browser visit, so the search page is
# less likely to be rejected by anti-crawler checks.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0',
    'Referer': 'https://www.jd.com/',
}

# JD search-result page for the (URL-encoded) phone keyword.
url = 'https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&wq=%E6%89%8B%E6%9C%BA&pvid=0184af803e7e4c309dde99e28a40547d'

# Send the GET request and keep the response in ``r``; the timeout prevents
# the script from hanging forever on a stalled connection.
r = requests.get(url, headers=headers, timeout=10)

# print(r.status_code)  # uncomment to inspect the HTTP status code
# print(r.text)         # uncomment to inspect the raw page source

# Parse the returned HTML with the lxml backend.
soup = BeautifulSoup(r.text, 'lxml')

# Every product card on the result page carries the CSS class "gl-i-wrap".
info_first = soup.find_all(attrs={'class': 'gl-i-wrap'})

# Print the text of each product card with line breaks removed.
for title in info_first:
    print(title.text.replace('\n', ''))
	#遍历info_获取的信息进行格式化输出

查看网页源码

打开京东商城搜索“手机”,注意要再点击一下分类处的“手机”(见下图),否则会有其他手机相关的产品干扰信息;第二个图为手机商品图。
PythonPython

F12大法,定位到需要爬取位置的源码,得你所得。

下一步打开PyCharm(Python开发工具,不会还有人不会配环境吧,不会吧不会吧)

代码

	**话不多说直接上代码**
	**都带注释的,不懂的再私信我,在线答疑**
import csv
import json
import random
import re
import time
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
# Search-result URL the crawler starts from (page=1 of the phone listing).
url = 'https://search.jd.com/search?keyword=%E6%89%8B%E6%9C%BA&cid3=655&cid2=653&page=1&s=1&click=0'

# Accumulates one dict per scraped product.
data_list = list()

# Location of the chromedriver executable on the local machine.
driver_path = r'F:\Pycharm\chromedriver.exe'

# Chrome options: the image-content preference is set to 2 so pictures are
# not loaded, which saves time on every page.
options = webdriver.ChromeOptions()
options.add_experimental_option(
    'prefs',
    {'profile.managed_default_content_settings.images': 2},
)

# Launch the browser with the options above.
# NOTE(review): the ``executable_path`` keyword belongs to the older Selenium
# API; confirm the installed Selenium version still accepts it.
driver = webdriver.Chrome(options=options, executable_path=driver_path)
def _parse_item(li):
    """Extract one product record from a search-result ``gl-item`` element.

    Args:
        li: Selenium element for a single product entry.

    Returns:
        dict with the keys ``phone_id``, ``name``, ``phone_url``, ``charge``,
        ``number``, ``shop_name`` and ``phone_proprietary``.

    Raises:
        Exception: whatever Selenium raises when a required node (title,
            link, price or comment count) is missing from the entry.
    """
    name_box = './/div[@class="p-name p-name-type-2"]'

    # Product title, with the "京品手机" badge text and line breaks removed.
    raw_name = li.find_element(By.XPATH, name_box + '//em').text
    name = re.sub(r"京品手机|\n", "", raw_name)

    # Product link, price and number of comments.
    phone_url = li.find_element(By.XPATH, name_box + '/a').get_attribute("href")
    charge = li.find_element(By.XPATH, './/div[@class="p-price"]//i').text
    number = li.find_element(By.XPATH, './/div[@class="p-commit"]/strong/a').text

    # Shop name is optional; the string "null" marks entries without one.
    shops = li.find_elements(By.XPATH, './/div[@class="p-shop"]//a')
    shop_name = shops[0].text if shops else "null"

    # An entry is self-operated when its first icon reads "自营".
    icons = li.find_elements(By.XPATH, ".//div[@class='p-icons']/i[1]")
    phone_proprietary = bool(icons) and icons[0].text == "自营"

    return {
        'phone_id': li.get_attribute("data-sku"),
        'name': name,
        'phone_url': phone_url,
        'charge': charge,
        'number': number,
        'shop_name': shop_name,
        'phone_proprietary': phone_proprietary,
    }


def start_spider():
    """Crawl every result page of ``url`` and append products to ``data_list``.

    Opens ``url`` in the module-level ``driver``, reads the total page count
    from the pager, then walks the pages by clicking the "next" control.
    Each successfully parsed product is appended to ``data_list`` and printed.
    Errors while loading a page or parsing a single product are printed and
    skipped so that one bad entry does not discard the rest of the page.

    Raises:
        ValueError: if the page count shown in the pager is not an integer.
    """
    driver.get(url)

    # Explicit wait until the "next page" control is present.
    WebDriverWait(driver, 1000).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, 'pn-next'))
    )

    # Total number of pages. Parsed with int() instead of eval() so that text
    # coming from the web page is never executed as code.
    pager_text = driver.find_element(By.CSS_SELECTOR, 'span.p-skip>em>b').text
    all_page = int(pager_text.strip())
    print(all_page)

    # Number of pages processed so far.
    count = 0

    while True:
        count += 1
        try:
            # Explicit wait until the product entries are present.
            WebDriverWait(driver, 1000).until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, 'gl-item'))
            )
            # Scroll to the bottom so the lazily loaded products are fetched,
            # give them time to render, then scroll back to the top.
            driver.execute_script('document.documentElement.scrollTop=10000')
            time.sleep(3)
            driver.execute_script('document.documentElement.scrollTop=0')
            lis = driver.find_elements(By.CLASS_NAME, 'gl-item')
        except Exception as e:
            # Best-effort crawl: report the failure and move on.
            print(e)
            lis = []

        for li in lis:
            try:
                data_dict = _parse_item(li)
            except Exception as e:
                # Skip only the malformed entry, keep the rest of the page.
                print(e)
                continue
            data_list.append(data_dict)
            print(data_dict)

        # Stop after the last page; ">=" also terminates when the pager
        # reports fewer than one page.
        if count >= all_page:
            break

        driver.find_element(By.CLASS_NAME, 'pn-next').click()
        time.sleep(2)

def main():
    """Run the crawler and save the collected products as JSON and CSV.

    Calls ``start_spider()`` to fill ``data_list``, then writes the list to
    ``data_json.json`` and, when at least one product was collected, to
    ``data_csv.csv`` with the keys of the first record as the header row.
    """
    start_spider()

    # Open with 'w' rather than 'a+': appending a second JSON document on a
    # later run would leave a file that is no longer valid JSON.
    with open('data_json.json', 'w', encoding='utf-8') as f:
        json.dump(data_list, f, ensure_ascii=False, indent=4)
    print('json文件写入完成')

    # Without any record there is no header to derive, so skip the CSV
    # instead of failing on data_list[0].
    if not data_list:
        print('没有抓取到数据,跳过csv文件写入')
        return

    with open('data_csv.csv', 'w', encoding='utf-8', newline='') as f:
        # Header row comes from the keys of the first record.
        writer = csv.DictWriter(f, fieldnames=list(data_list[0]))
        writer.writeheader()
        # Write all records in one call.
        writer.writerows(data_list)
    print('csv文件写入完成')


if __name__ == '__main__':
    try:
        main()
    finally:
        # Always close the browser, even when the crawl raises, so no
        # Chrome/chromedriver process is left running.
        driver.quit()



总结

多学多练多操作

声明:本文内容由互联网用户自发贡献自行上传,本网站不拥有所有权,未作人工编辑处理,也不承担相关法律责任。如果您发现有涉嫌版权的内容,欢迎进行举报,并提供相关证据,工作人员会在5个工作日内联系你,一经查实,本站将立刻删除涉嫌侵权内容。