使用scrapy爬取suning

发布时间:2019-06-04 21:08:18编辑:auto阅读(2048)

    # -*- coding: utf-8 -*-
    import scrapy
    from copy import deepcopy
    
    
    class SuSpider(scrapy.Spider):
        """Crawl Suning's category directory down to individual products.

        Flow: ``parse`` walks the big-category sidebar and yields one request
        per tag link; ``good_list`` scrapes each product listing page (and
        follows pagination); ``good_detail`` collects the product's spec
        options and yields the finished item dict.
        """

        name = 'su'
        allowed_domains = ['suning.com']
        start_urls = ['http://list.suning.com/?safp=d488778a.error1.0.4786e76351']

        def parse(self, response):
            """Extract big categories, their sub-categories and tag links.

            Yields one ``scrapy.Request`` per tag link, targeting
            :meth:`good_list`, with a deep-copied partial item in ``meta``
            (deepcopy prevents later iterations from mutating earlier
            requests' items).
            """
            # Big categories in the left sidebar.
            bcate_list = response.xpath("//div[@class='allsortLeft']/ul/li")
            for bcate in bcate_list:
                item = {}
                # The <li>'s class value links the sidebar entry to its
                # sub-category panel elsewhere in the page.
                class_name = bcate.xpath("./@class").extract_first()
                item["BCate"] = bcate.xpath("./a/span/text()").extract_first()
                # Locate this big category's sub-category panel by class name.
                scate_list = response.xpath("//div[@class='{}']/div".format(class_name))
                for scate in scate_list:
                    # Sub-category name.
                    item["SCate"] = scate.xpath("./div[1]/a/@title").extract_first()
                    # All tag links under this sub-category.
                    tag_list = scate.xpath("./div[2]/a")
                    for tag in tag_list:
                        item["tag"] = tag.xpath("./text()").extract_first()
                        href = tag.xpath("./@href").extract_first()
                        # Fix: the original did "http:" + extract_first(),
                        # which raises TypeError when no href is present.
                        if not href:
                            continue
                        # urljoin correctly resolves protocol-relative
                        # ("//...") and relative hrefs alike.
                        item["tag_link"] = response.urljoin(href)
                        # Enter the listing page for this tag.
                        yield scrapy.Request(
                            item["tag_link"],
                            callback=self.good_list,
                            meta={"item": deepcopy(item)}
                        )

        def good_list(self, response):
            """Scrape every product on a listing page and follow pagination.

            Yields a request to :meth:`good_detail` per product, plus one
            request for the next listing page when present.
            """
            item = deepcopy(response.meta["item"])
            # All product cards on the current listing page.
            li_list = response.xpath("//div[@id='product-wrap']/div/ul/li")
            for li in li_list:
                # Product image, name, price and detail-page link.
                img = li.xpath(".//div[@class='res-img']/div/a/img/@src").extract_first()
                # Fix: guard against a missing src (original did "http:" + None).
                item["good_img"] = response.urljoin(img) if img else None
                item["good_name"] = li.xpath(".//div[@class='res-info']/div/a/text()").extract_first()
                item["good_price"] = li.xpath(".//div[@class='res-info']/div/span/text()").extract_first()
                good_href = li.xpath(".//div[@class='res-info']/div/a/@href").extract_first()
                item["good_href"] = good_href
                # Enter the detail page, skipping placeholder links and
                # (fix) missing hrefs, which previously crashed on "http:" + None.
                if good_href and good_href != "javascript:void(0);":
                    yield scrapy.Request(
                        response.urljoin(good_href),
                        callback=self.good_detail,
                        meta={"item": deepcopy(item)}
                    )
            # Pagination: follow the "next page" link if one exists.
            next_url = response.xpath("//a[@id='nextPage']/@href").extract_first()
            if next_url:
                # Fix: urljoin in case the pagination href is relative.
                yield scrapy.Request(
                    response.urljoin(next_url),
                    callback=self.good_list,
                    meta={"item": response.meta["item"]}
                )

        def good_detail(self, response):
            """Collect the product's spec options (e.g. color, version) and
            yield the completed item."""
            item = response.meta["item"]
            # Each <dl> under #J-TZM is one spec dimension: name in <dt>,
            # selectable values in the <li> titles under <dd>.
            size_list = response.xpath("//div[@id='J-TZM']/dl")
            for size in size_list:
                size_name = size.xpath("./dt/span/text()").extract_first()
                size_value = size.xpath("./dd/ul/li/@title").extract()
                # Fix: skip unnamed dimensions instead of writing item[None].
                if size_name:
                    item[size_name] = size_value
            # Fix: yield the item so Scrapy's item pipelines receive it;
            # the original only print()ed it, so nothing was ever exported.
            yield item
    View Code

     

关键字