The main pain point is that the crawl flow is still a bit long-winded and could use improvement; if any experts have pointers, please share.

This version uses an automatic crawler: a regex extracts every product ID it can reach, and each ID is then joined back into the product-detail URL pattern and analyzed to collect more product detail page links.
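To make the ID extraction concrete, here is a minimal standalone sketch of the regex the spider uses below (the URL is just a sample product link):

import re

# A JD product detail URL; the segment before .html is the product ID (SKU)
url = 'https://item.jd.com/5089253.html'
pat = 'https://item.jd.com/(.*?).html'
ids = re.compile(pat).findall(url)
print(ids)  # ['5089253']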
spider.py:
# -*- coding: utf-8 -*-
import re

import requests
from scrapy.http import Request
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from JDstore.items import JdstoreItem


class JdgoodsSpider(CrawlSpider):
    name = 'JDgoods'
    allowed_domains = ['jd.com']
    # start_urls = ['https://www.jd.com/']

    # Follow every link on the site; parse_item decides whether a page
    # is actually a product detail page.
    rules = (
        Rule(LinkExtractor(allow=''), callback='parse_item', follow=True),
    )
    header = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36'
    }

    def start_requests(self):
        return [Request('https://www.jd.com/', headers=self.header)]

    def parse_item(self, response):
        try:
            thisurl = response.url
            # Detail pages look like https://item.jd.com/<id>.html;
            # the captured group is the product ID (SKU)
            pat = 'https://item.jd.com/(.*?).html'
            x = re.compile(pat).findall(thisurl)
            if x:
                thisid = x[0]
                i = JdstoreItem()
                # /text() so we store the title string, not the raw <title> element
                i['title'] = response.xpath('//html/head/title/text()').extract()[0]
                i['content'] = response.xpath('//meta[@name="description"]/@content').extract()[0]
                i['shopname'] = response.xpath('//a[@clstag="shangpin|keycount|product|dianpuname1"]/@title').extract()[0]
                # The comment summary and the price live in two separate JSON
                # endpoints, both keyed by the SKU
                comment_url = ('https://club.jd.com/comment/productCommentSummaries.action'
                               '?referenceIds={}').format(thisid)
                price_url = ('https://p.3.cn/prices/mgets?callback=jQuery5488277&type=1'
                             '&area=1_72_4137_0&pdtk=&pduid=1510968015061901022163&pdpin='
                             '&pin=null&pdbp=0&skuIds=J_{}&ext=11000000'
                             '&source=item-pc').format(thisid)
                data = requests.get(comment_url, headers=self.header).text
                data1 = requests.get(price_url, headers=self.header).text
                i['comment'] = re.search('"CommentCountStr":"(.*?)"', data, re.S).group(1)
                i['goodrate'] = re.search('"GoodRateShow":(.*?),', data, re.S).group(1)
                i['price'] = re.search('"p":"(.*?)"', data1, re.S).group(1)
                # Only yield items whose fields were all populated
                if len(i['title']) and len(i['content']) and len(i['shopname']) \
                        and len(i['comment']) and len(i['goodrate']) and len(i['price']):
                    yield i
        except Exception as e:
            print("Error: %s" % e)
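Since the comment count and price come from plain JSON endpoints rather than the page itself, they can be tested outside Scrapy first, which makes the regex extraction easier to debug. A minimal sketch, assuming the endpoint still returns the same JSON fields (the SKU below is just a sample value):

import re

import requests

header = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36'
}
thisid = '5089253'  # sample SKU; substitute any real product ID
comment_url = ('https://club.jd.com/comment/productCommentSummaries.action'
               '?referenceIds={}').format(thisid)
data = requests.get(comment_url, headers=header).text
# Same extractions as parse_item above
print(re.search('"CommentCountStr":"(.*?)"', data, re.S).group(1))
print(re.search('"GoodRateShow":(.*?),', data, re.S).group(1))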
items.py:

import scrapy


class JdstoreItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    content = scrapy.Field()
    shopname = scrapy.Field()
    goodrate = scrapy.Field()
    price = scrapy.Field()
    comment = scrapy.Field()
pipelines.py:
# -*- coding: utf-8 -*-
import pymongo
# note: newer Scrapy versions drop scrapy.conf; there you would read
# crawler.settings via from_crawler instead
from scrapy.conf import settings

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class JdstorePipeline(object):
    def __init__(self):
        # Connection details come from settings.py
        host = settings["MONGODB_HOST"]
        port = settings["MONGODB_PORT"]
        dbname = settings["MONGODB_DBNAME"]
        tablename = settings["MONGODB_DOCNAME"]
        client = pymongo.MongoClient(host=host, port=port)
        jdstore = client[dbname]
        self.goods_list = jdstore[tablename]

    def process_item(self, item, spider):
        goodsinfos = dict(item)
        # insert_one is the non-deprecated pymongo insert call
        self.goods_list.insert_one(goodsinfos)
        return item
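To check that items are actually landing in MongoDB, a quick query sketch (assuming MongoDB is running locally with the connection values from the settings below):

import pymongo

client = pymongo.MongoClient(host='127.0.0.1', port=27017)
goods_list = client['JDSHORE']['goodslist']
print(goods_list.count_documents({}))    # how many items have been stored so far
for doc in goods_list.find().limit(3):   # peek at a few records
    print(doc.get('title'), doc.get('price'))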
settings.py:

# -*- coding: utf-8 -*-
BOT_NAME = 'JDstore'

SPIDER_MODULES = ['JDstore.spiders']
NEWSPIDER_MODULE = 'JDstore.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Leftover MySQL config from an earlier attempt, kept commented out:
# MYSQL_HOSTS = '127.0.0.1'
# MYSQL_USER = 'root'
# MYSQL_PASSWORD = 'xxxxx'
# MYSQL_PORT = settings.MYSQL_PORT
# MYSQL_DB = 'jd'
# CHARSET = 'utf8'
# FEED_URI = 'file:///C:/Users/Administrator/Desktop/code/jd.csv'
# FEED_FORMAT = 'CSV'

ITEM_PIPELINES = {
    'JDstore.pipelines.JdstorePipeline': 300,
}

MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DBNAME = 'JDSHORE'
MONGODB_DOCNAME = 'goodslist'

And that's it!
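One last extra: the project runs with scrapy crawl JDgoods, but if you prefer launching it from a plain Python script, here is a minimal runner sketch (the import path JDstore.spiders.JDgoods is an assumption about the project layout; adjust it to wherever the spider file lives):

# run.py -- start the crawl with the project settings loaded
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# NOTE: this import path is an assumption about the project layout
from JDstore.spiders.JDgoods import JdgoodsSpider

process = CrawlerProcess(get_project_settings())
process.crawl(JdgoodsSpider)
process.start()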