Scraping JD.com (京东商城) product data with Scrapy

The main caveat is that the crawl flow is still a bit long-winded and needs improvement; pointers from more experienced readers are welcome.

This project uses an automatic crawler: a regular expression extracts every product ID it can find from crawled URLs, and each URL is matched against the product detail-page pattern to discover more detail-page links.
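
For example, matching a crawled URL against the detail-page pattern used in parse_item below yields the product ID (a minimal sketch; the example URLs are hypothetical):

import re

# Detail pages look like https://item.jd.com/<id>.html; anything else yields no match.
pat = r'https://item\.jd\.com/(.*?)\.html'
print(re.findall(pat, 'https://item.jd.com/5089253.html'))  # hypothetical URL -> ['5089253']
print(re.findall(pat, 'https://www.jd.com/'))               # homepage -> []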

spider.py:

# -*- coding: utf-8 -*-
import re

import requests
from scrapy.http import Request
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from JDstore.items import JdstoreItem

class JdgoodsSpider(CrawlSpider):
    name = 'JDgoods'
    allowed_domains = ['jd.com']
    # start_urls = ['https://www.jd.com/']
    # An empty allow pattern follows every link on the site;
    # parse_item then filters out everything except product detail pages.
    rules = (
        Rule(LinkExtractor(allow=''), callback='parse_item', follow=True),
    )
    header = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36'
    }

    def start_requests(self):
        return [Request('https://www.jd.com/', headers=self.header)]

    def parse_item(self, response):
        try:
            thisurl = response.url
            # Product detail pages look like https://item.jd.com/<id>.html
            pat = r'https://item\.jd\.com/(.*?)\.html'
            x = re.findall(pat, thisurl)
            if x:
                thisid = x[0]
                i = JdstoreItem()
                i['title'] = response.xpath('//html/head/title/text()').extract()[0]
                
                i['content'] = response.xpath('//meta[@name="description"]/@content').extract()[0]
                
                i['shopname'] = response.xpath('//a[@clstag="shangpin|keycount|product|dianpuname1"]/@title').extract()[0]
                
                # The comment count and price come from JD's JSON endpoints, not the page HTML.
                comment_url = 'https://club.jd.com/comment/productCommentSummaries.action?referenceIds={}'.format(thisid)
                price_url = 'https://p.3.cn/prices/mgets?callback=jQuery5488277&type=1&area=1_72_4137_0&pdtk=&pduid=1510968015061901022163&pdpin=&pin=null&pdbp=0&skuIds=J_{}&ext=11000000&source=item-pc'.format(thisid)
                data = requests.get(comment_url, headers=self.header).text
                data1 = requests.get(price_url, headers=self.header).text
                i['comment'] = re.search('"CommentCountStr":"(.*?)"', data, re.S).group(1)
                i['goodrate'] = re.search('"GoodRateShow":(.*?),', data, re.S).group(1)
                i['price'] = re.search('"p":"(.*?)"', data1, re.S).group(1)
                # Yield only items whose fields were all successfully extracted.
                if len(i['title']) and len(i['content']) and len(i['shopname']) and len(i['comment']) and len(i['goodrate']) and len(i['price']):
                    yield i
        except Exception as e:
            print("报错为:s%"%e)

items.py:

import scrapy


class JdstoreItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    content = scrapy.Field()
    shopname = scrapy.Field()
    goodrate = scrapy.Field()
    price = scrapy.Field()
    comment = scrapy.Field()

pipelines.py:

# -*- coding: utf-8 -*-
import pymongo
from scrapy.utils.project import get_project_settings  # scrapy.conf was removed in newer Scrapy

settings = get_project_settings()

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

class JdstorePipeline(object):
    def __init__(self):
        # Connection details come from the MONGODB_* values in settings.py below.
        host = settings["MONGODB_HOST"]
        port = settings["MONGODB_PORT"]
        dbname = settings["MONGODB_DBNAME"]
        tablename = settings["MONGODB_DOCNAME"]
        client = pymongo.MongoClient(host=host, port=port)
        jdstore = client[dbname]
        self.goods_list = jdstore[tablename]

    def process_item(self, item, spider):
        goodsinfos = dict(item)
        self.goods_list.insert_one(goodsinfos)  # insert() is deprecated in pymongo 3.x
        return item
        return item
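
After the crawl has run for a while, the stored documents can be spot-checked with a short pymongo script (a minimal sketch reusing the MongoDB names from settings.py below; count_documents requires pymongo 3.7+):

import pymongo

client = pymongo.MongoClient(host='127.0.0.1', port=27017)
goods_list = client['JDSTORE']['goodslist']  # MONGODB_DBNAME / MONGODB_DOCNAME from settings.py
print(goods_list.count_documents({}))        # total items stored so far
for doc in goods_list.find().limit(3):       # peek at a few scraped goods
    print(doc['title'], doc['price'])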

settings.py:

# -*- coding: utf-8 -*-
BOT_NAME = 'JDstore'

SPIDER_MODULES = ['JDstore.spiders']
NEWSPIDER_MODULE = 'JDstore.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
#
# MYSQL_HOSTS = '127.0.0.1'
# MYSQL_USER = 'root'
# MYSQL_PASSWORD = 'xxxxx'
# #MYSQL_PORT = settings.MYSQL_PORT
# MYSQL_DB='jd'
# CHARSET='utf8'

#FEED_URI='file:///C:/Users/Administrator/Desktop/code/jd.csv'
#FEED_FORMAT="CSV"

ITEM_PIPELINES = {
   'JDstore.pipelines.JdstorePipeline': 300,
}

MONGODB_HOST='127.0.0.1'
MONGODB_PORT=27017
MONGODB_DBNAME='JDSTORE'
MONGODB_DOCNAME='goodslist'
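
With the pipeline registered and MongoDB running, the spider is launched from the project root as usual; as the commented-out FEED settings above suggest, the same data can also be dumped straight to CSV via the -o flag:

scrapy crawl JDgoods
scrapy crawl JDgoods -o jd.csv   # CSV export, no MongoDB pipeline needed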

That's it!

