# items.py — each field is declared as:
#     field = scrapy.Field()
# To declare many fields at once, generate them in a loop:
import scrapy

ls = ['field1', 'field2', ...]  # list the real field names here

class ProjectNameItem(scrapy.Item):
    for fd in ls:
        exec(fd + ' = scrapy.Field()')
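A minimal spider sketch showing how such an item gets populated; the spider name, start URL, and XPath expressions are hypothetical, and the import path depends on your project package name:

# Assumes ProjectNameItem from items.py is importable, e.g.:
# from myproject.items import ProjectNameItem
import scrapy

class ExampleSpider(scrapy.Spider):
    name = 'example'
    start_urls = ['https://example.com']

    def parse(self, response):
        item = ProjectNameItem()
        item['field1'] = response.xpath('//title/text()').get()
        item['field2'] = response.url
        yield item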
Browser disguise (settings.py line 19):
USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)'
Robots protocol (line 22):
ROBOTSTXT_OBEY = False
Log level (at the bottom):
LOG_LEVEL = 'INFO'
# LOG_FILE = 'log.txt'
scrapy shell <URL>
response.text
response.xpath('').extract()
response.xpath('').re('')
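A concrete shell session might look like this (the URL and XPath expressions are illustrative):

scrapy shell https://example.com
response.xpath('//title/text()').extract()   # ['Example Domain']
response.xpath('//a/@href').re(r'https?://\w+')   # regex applied to the matched hrefs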
Create z.py in the project root ("z" sorts last, so it is easy to find) and write:
from scrapy import cmdline
# Export-to-JSON mode
# cmdline.execute('scrapy crawl example -t json -o project.json -s FEED_EXPORT_ENCODING="utf-8"'.split())
# Normal mode
cmdline.execute(['scrapy', 'crawl', 'example'])
# Suppress log output
# cmdline.execute(['scrapy', 'crawl', 'example', '--nolog'])
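Run it with python z.py from the project root (or straight from the IDE); cmdline.execute() behaves exactly as if the corresponding scrapy command had been typed in a terminal.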
# pipelines.py — write items into MySQL via pymysql
from pymysql.connections import Connection
from time import strftime

database = 'z_test'   # database name
table = 'industry'    # table name

# Strip whitespace and normalize quotes in string values
clear = lambda x: x.strip().replace("'", '"') if isinstance(x, str) else x

class IndustryPipeline(object):
    def open_spider(self, spider):
        self.db = Connection(host='localhost', user='root', password='yellow',
                             database=database, charset='utf8')
        self.cursor = self.db.cursor()

    def close_spider(self, spider):
        self.cursor.close()
        self.db.close()

    def process_item(self, item, spider):
        dt = dict({'collect_date': strftime('%Y-%m-%d')}, **item)  # stamp the collection date
        ls = [(k, clear(v)) for k, v in dt.items() if v is not None]
        # Parameterized query: pymysql escapes the values, avoiding SQL injection
        sql = 'INSERT INTO %s (%s) VALUES (%s);' % (
            table,
            ', '.join(k for k, _ in ls),
            ', '.join(['%s'] * len(ls)))
        self.cursor.execute(sql, [v for _, v in ls])
        self.db.commit()
        return item
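The pipeline only runs once it is registered in settings.py; a minimal sketch, assuming the project package is named industry:

ITEM_PIPELINES = {
    'industry.pipelines.IndustryPipeline': 300,
}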
CREATE TABLE tb_name(
    serial_number INT(9) AUTO_INCREMENT COMMENT 'serial number',
    url CHAR(255) UNIQUE NOT NULL COMMENT 'uniform resource locator',
    title VARCHAR(255) COMMENT 'title',
    detail TEXT COMMENT 'detail',
    price FLOAT(3,2) UNSIGNED DEFAULT 1.23 COMMENT 'price',
    public_time DATETIME COMMENT 'publish time',
    collect_date DATE COMMENT 'collection date',
    PRIMARY KEY (`serial_number`, `url`)
);
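To sanity-check the schema after creating it:

SHOW CREATE TABLE tb_name;
DESC tb_name;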
DROP TABLE tb_name;