使用scrapy爬取suning

发布时间:2019-06-04 21:08:18编辑:auto阅读(2048)

    # -*- coding: utf-8 -*-
    import scrapy
    from copy import deepcopy
    
    
    class SuSpider(scrapy.Spider):
        """Crawl Suning's category directory down to individual products.

        Flow: ``parse`` walks the big-category sidebar and yields one request
        per tag link; ``good_list`` scrapes each product listing page (and
        follows pagination); ``good_detail`` collects the product's spec
        options and yields the finished item dict.
        """

        name = 'su'
        allowed_domains = ['suning.com']
        start_urls = ['http://list.suning.com/?safp=d488778a.error1.0.4786e76351']

        def parse(self, response):
            """Extract big categories, their sub-categories and tag links.

            Yields one ``scrapy.Request`` per tag link, targeting
            :meth:`good_list`, with a deep-copied partial item in ``meta``
            (deepcopy prevents later iterations from mutating earlier
            requests' items).
            """
            # Big categories in the left sidebar.
            bcate_list = response.xpath("//div[@class='allsortLeft']/ul/li")
            for bcate in bcate_list:
                item = {}
                # The <li>'s class value links the sidebar entry to its
                # sub-category panel elsewhere in the page.
                class_name = bcate.xpath("./@class").extract_first()
                item["BCate"] = bcate.xpath("./a/span/text()").extract_first()
                # Locate this big category's sub-category panel by class name.
                scate_list = response.xpath("//div[@class='{}']/div".format(class_name))
                for scate in scate_list:
                    # Sub-category name.
                    item["SCate"] = scate.xpath("./div[1]/a/@title").extract_first()
                    # All tag links under this sub-category.
                    tag_list = scate.xpath("./div[2]/a")
                    for tag in tag_list:
                        item["tag"] = tag.xpath("./text()").extract_first()
                        href = tag.xpath("./@href").extract_first()
                        # Fix: the original did "http:" + extract_first(),
                        # which raises TypeError when no href is present.
                        if not href:
                            continue
                        # urljoin correctly resolves protocol-relative
                        # ("//...") and relative hrefs alike.
                        item["tag_link"] = response.urljoin(href)
                        # Enter the listing page for this tag.
                        yield scrapy.Request(
                            item["tag_link"],
                            callback=self.good_list,
                            meta={"item": deepcopy(item)}
                        )

        def good_list(self, response):
            """Scrape every product on a listing page and follow pagination.

            Yields a request to :meth:`good_detail` per product, plus one
            request for the next listing page when present.
            """
            item = deepcopy(response.meta["item"])
            # All product cards on the current listing page.
            li_list = response.xpath("//div[@id='product-wrap']/div/ul/li")
            for li in li_list:
                # Product image, name, price and detail-page link.
                img = li.xpath(".//div[@class='res-img']/div/a/img/@src").extract_first()
                # Fix: guard against a missing src (original did "http:" + None).
                item["good_img"] = response.urljoin(img) if img else None
                item["good_name"] = li.xpath(".//div[@class='res-info']/div/a/text()").extract_first()
                item["good_price"] = li.xpath(".//div[@class='res-info']/div/span/text()").extract_first()
                good_href = li.xpath(".//div[@class='res-info']/div/a/@href").extract_first()
                item["good_href"] = good_href
                # Enter the detail page, skipping placeholder links and
                # (fix) missing hrefs, which previously crashed on "http:" + None.
                if good_href and good_href != "javascript:void(0);":
                    yield scrapy.Request(
                        response.urljoin(good_href),
                        callback=self.good_detail,
                        meta={"item": deepcopy(item)}
                    )
            # Pagination: follow the "next page" link if one exists.
            next_url = response.xpath("//a[@id='nextPage']/@href").extract_first()
            if next_url:
                # Fix: urljoin in case the pagination href is relative.
                yield scrapy.Request(
                    response.urljoin(next_url),
                    callback=self.good_list,
                    meta={"item": response.meta["item"]}
                )

        def good_detail(self, response):
            """Collect the product's spec options (e.g. color, version) and
            yield the completed item."""
            item = response.meta["item"]
            # Each <dl> under #J-TZM is one spec dimension: name in <dt>,
            # selectable values in the <li> titles under <dd>.
            size_list = response.xpath("//div[@id='J-TZM']/dl")
            for size in size_list:
                size_name = size.xpath("./dt/span/text()").extract_first()
                size_value = size.xpath("./dd/ul/li/@title").extract()
                # Fix: skip unnamed dimensions instead of writing item[None].
                if size_name:
                    item[size_name] = size_value
            # Fix: yield the item so Scrapy's item pipelines receive it;
            # the original only print()ed it, so nothing was ever exported.
            yield item
    View Code

     

关键字