python爬虫篇4——爬取专利著作权信息

MySQL 建表语句:

-- Software-copyright records; `登记号` (registration number) is the unique key
-- that the crawler's INSERT ... ON DUPLICATE KEY UPDATE relies on.
CREATE TABLE `copyright` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `软件名称` varchar(500) DEFAULT NULL,
  `登记号` varchar(500) DEFAULT NULL,
  `分类号` varchar(500) DEFAULT NULL,
  `软件简称` varchar(500) DEFAULT NULL,
  `版本号` varchar(500) DEFAULT NULL,
  `首次发表日期` varchar(500) DEFAULT NULL,
  `登记批准日期` varchar(500) DEFAULT NULL,
  `软件著作权人` varchar(500) DEFAULT NULL,
  `软件著作权人详情` varchar(500) DEFAULT NULL,
  PRIMARY KEY (`id`),
  UNIQUE KEY `登记号` (`登记号`)
) ENGINE=InnoDB AUTO_INCREMENT=9871 DEFAULT CHARSET=utf8;

-- Patent records; `详情地址` (detail-page URL) is the unique key that the
-- crawler's INSERT ... ON DUPLICATE KEY UPDATE relies on.
CREATE TABLE `patent` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `专利名称` varchar(500) DEFAULT NULL,
  `发明人` varchar(500) DEFAULT NULL,
  `申请人` varchar(500) DEFAULT NULL,
  `申请日` datetime DEFAULT NULL,
  `公开日` datetime DEFAULT NULL,
  `详情地址` varchar(500) DEFAULT NULL,
  PRIMARY KEY (`id`),
  UNIQUE KEY `详情地址` (`详情地址`)
) ENGINE=InnoDB AUTO_INCREMENT=13610 DEFAULT CHARSET=utf8;

Python 代码:

getpatentdata.py 主程序:
import re
from urllib.parse import unquote, quote
from lxml import etree
from requests_html import HTMLSession

from 抓取专利著作权信息.MysqlHelper import MysqlHelper


# 获取专利信息
class Patent:
    """Crawl CNKI patent search results for one applicant keyword into MySQL.

    Constructing an instance runs the whole job: it makes sure the ``patent``
    table exists, submits the search form once per application-date window
    from ``year`` up to ``end_year``, stores every result row, and finally
    shows the main menu again through ``showFunction()``.
    """

    # How many times one date window is requested before it is skipped.
    MAX_ATTEMPTS = 5
    # From this year on a window covers one month; earlier years use two months.
    MONTHLY_FROM_YEAR = 2016

    def __init__(self, sqr, year, end_year=2019):
        """Run the crawl.

        Args:
            sqr: applicant keyword sent to the search form.
            year: first application year to crawl.
            end_year: last application year to crawl (inclusive).
        """
        # NOTE(review): MySQL normally listens on 3306; port 8080 is kept from
        # the original configuration - confirm it matches the local server.
        self.helper = MysqlHelper(host='localhost',
                                  port=8080,
                                  user='root',
                                  passwd='123',
                                  db='students',
                                  charset='utf8')
        self.creatTable()
        # Running total of rows stored successfully.
        self.sum = 0
        while year <= end_year:
            print("*" * 66)
            print("\033[36m开始抓取%s年的专利数据,已累计抓取%s条数据\033[0m" % (str(year), self.sum))
            print("*" * 66)
            for window_start, window_end in self._date_windows(year):
                self.getPatent(sqr, window_start, window_end)
            year += 1
        print("\033[34m专利数据抓取完毕!共抓取%s条数据\033[0m" % str(self.sum))
        showFunction()

    @staticmethod
    def _date_windows(year):
        """Return the (start, end) application-date windows used for ``year``.

        Consecutive windows share their boundary date; rows fetched twice are
        absorbed by the unique key on the detail URL.
        """
        step = 1 if year >= Patent.MONTHLY_FROM_YEAR else 2
        bounds = ["%s-%02d-01" % (year, month) for month in range(1, 13, step)]
        bounds.append("%s-12-31" % year)
        return list(zip(bounds, bounds[1:]))

    # Create the table
    def creatTable(self):
        """Create the ``patent`` table unless it already exists."""
        sql1 = ("CREATE TABLE IF NOT EXISTS `patent` ("
                "`id` int primary key not null auto_increment,"
                "`专利名称` varchar(500) DEFAULT NULL,"
                "`发明人` varchar(500) DEFAULT NULL,"
                "`申请人` varchar(500) DEFAULT NULL,"
                "`申请日` datetime DEFAULT NULL,"
                "`公开日` datetime DEFAULT NULL,"
                "`详情地址` varchar(500) DEFAULT NULL UNIQUE);")
        self.helper.execute(sql1)

    @staticmethod
    def _build_form_data(sqr, sqday_start, sqday_end):
        """Return the url-encoded POST body of the CNKI advanced-search form.

        The body is a captured browser request in which only the applicant
        keyword and the two application-date bounds vary. The adjacent string
        literals below form one continuous value; they are split per parameter
        only to keep the lines readable.
        """
        return (
            "ID=SCPD&hdnSearchType=&hdnIsAll=false&NaviField=%E4%B8%93%E9%A2%98%E5%AD%90%E6%A0%8F%E7%9B%AE%E4%BB%A3%E7%A0%81&NaviDatabaseName=SCPD_ZJCLS&systemno=&hdnFathorCode=sysAll&selectbox=I&strNavigatorValue=%2CA%2CB%2CC%2CD%2CE%2CF%2CH%2CI"
            "&strNavigatorName=%2C%E5%9F%BA%E7%A1%80%E7%A7%91%E5%AD%A6%2C%E5%B7%A5%E7%A8%8B%E7%A7%91%E6%8A%80%E2%85%A0%E8%BE%91%2C%E5%B7%A5%E7%A8%8B%E7%A7%91%E6%8A%80%E2%85%A1%E8%BE%91%2C%E5%86%9C%E4%B8%9A%E7%A7%91%E6%8A%80%2C%E5%8C%BB%E8%8D%AF%E5%8D%AB%E7%94%9F%E7%A7%91%E6%8A%80%2C%E5%93%B2%E5%AD%A6%E4%B8%8E%E4%BA%BA%E6%96%87%E7%A7%91%E5%AD%A6%2C%E7%A4%BE%E4%BC%9A%E7%A7%91%E5%AD%A6%E2%85%A1%E8%BE%91%2C%E4%BF%A1%E6%81%AF%E7%A7%91%E6%8A%80"
            "&singleleafcode=&searchAttachCondition=&SearchQueryID=5&SearchFieldRelationDirectory=&updateTempDB=&bCurYearTempDB=1"
            # fieldtips: one "<field>/[<hint>]" entry per literal, comma-separated.
            "&fieldtips=%E7%AF%87%E5%90%8D%2F%5B%E5%9C%A8%E6%96%87%E7%8C%AE%E6%A0%87%E9%A2%98%E4%B8%AD%E6%A3%80%E7%B4%A2%E3%80%82%E5%AF%B9%E8%AF%A5%E6%A3%80%E7%B4%A2%E9%A1%B9%E7%9A%84%E6%A3%80%E7%B4%A2%E6%98%AF%E6%8C%89%E8%AF%8D%E8%BF%9B%E8%A1%8C%E7%9A%84%EF%BC%8C%E8%AF%B7%E5%B0%BD%E5%8F%AF%E8%83%BD%E8%BE%93%E5%85%A5%E5%AE%8C%E6%95%B4%E7%9A%84%E8%AF%8D%EF%BC%8C%E4%BB%A5%E9%81%BF%E5%85%8D%E6%BC%8F%E6%A3%80%E3%80%82%5D"
            "%2C%E5%85%B3%E9%94%AE%E8%AF%8D%2F%5B%E6%A3%80%E7%B4%A2%E6%96%87%E7%8C%AE%E7%9A%84%E5%85%B3%E9%94%AE%E8%AF%8D%E4%B8%AD%E6%BB%A1%E8%B6%B3%E6%A3%80%E7%B4%A2%E6%9D%A1%E4%BB%B6%E7%9A%84%E6%96%87%E7%8C%AE%E3%80%82%E5%AF%B9%E8%AF%A5%E6%A3%80%E7%B4%A2%E9%A1%B9%E7%9A%84%E6%A3%80%E7%B4%A2%E6%98%AF%E6%8C%89%E8%AF%8D%E8%BF%9B%E8%A1%8C%E7%9A%84%EF%BC%8C%E8%AF%B7%E5%B0%BD%E5%8F%AF%E8%83%BD%E8%BE%93%E5%85%A5%E5%AE%8C%E6%95%B4%E7%9A%84%E8%AF%8D%EF%BC%8C%E4%BB%A5%E9%81%BF%E5%85%8D%E6%BC%8F%E6%A3%80%E3%80%82%5D"
            "%2C%E7%AC%AC%E4%B8%80%E8%B4%A3%E4%BB%BB%E4%BA%BA%2F%5B%E8%AF%B7%E9%80%89%E6%8B%A9%E6%A3%80%E7%B4%A2%E9%A1%B9%E5%B9%B6%E6%8C%87%E5%AE%9A%E7%9B%B8%E5%BA%94%E7%9A%84%E6%A3%80%E7%B4%A2%E8%AF%8D%EF%BC%8C%E9%80%89%E6%8B%A9%E6%8E%92%E5%BA%8F%E6%96%B9%E5%BC%8F%E3%80%81%E5%8C%B9%E9%85%8D%E6%A8%A1%E5%BC%8F%E3%80%81%E6%96%87%E7%8C%AE%E6%97%B6%E9%97%B4%E7%AD%89%E9%99%90%E5%AE%9A%E6%9D%A1%E4%BB%B6%EF%BC%8C%E7%84%B6%E5%90%8E%E7%82%B9%E5%87%BB%E2%80%9C%E6%A3%80%E7%B4%A2%E2%80%9D%E3%80%82%5D"
            "%2C%E4%BD%9C%E8%80%85%2F%5B%E5%8F%AF%E8%BE%93%E5%85%A5%E4%BD%9C%E8%80%85%E5%AE%8C%E6%95%B4%E5%A7%93%E5%90%8D%EF%BC%8C%E6%88%96%E5%8F%AA%E8%BE%93%E5%85%A5%E8%BF%9E%E7%BB%AD%E7%9A%84%E4%B8%80%E9%83%A8%E5%88%86%E3%80%82%5D"
            "%2C%E6%9C%BA%E6%9E%84%2F%5B%E5%8F%AF%E8%BE%93%E5%85%A5%E5%AE%8C%E6%95%B4%E7%9A%84%E6%9C%BA%E6%9E%84%E5%90%8D%E7%A7%B0%EF%BC%8C%E6%88%96%E5%8F%AA%E8%BE%93%E5%85%A5%E8%BF%9E%E7%BB%AD%E7%9A%84%E4%B8%80%E9%83%A8%E5%88%86%E3%80%82%5D"
            "%2C%E4%B8%AD%E6%96%87%E6%91%98%E8%A6%81%2F%5B%E5%AF%B9%E8%AF%A5%E6%A3%80%E7%B4%A2%E9%A1%B9%E7%9A%84%E6%A3%80%E7%B4%A2%E6%98%AF%E6%8C%89%E8%AF%8D%E8%BF%9B%E8%A1%8C%E7%9A%84%EF%BC%8C%E8%AF%B7%E5%B0%BD%E5%8F%AF%E8%83%BD%E8%BE%93%E5%85%A5%E5%AE%8C%E6%95%B4%E7%9A%84%E8%AF%8D%EF%BC%8C%E4%BB%A5%E9%81%BF%E5%85%8D%E6%BC%8F%E6%A3%80%E3%80%82%5D"
            "%2C%E5%BC%95%E6%96%87%2F%5B%E8%AF%B7%E9%80%89%E6%8B%A9%E6%A3%80%E7%B4%A2%E9%A1%B9%E5%B9%B6%E6%8C%87%E5%AE%9A%E7%9B%B8%E5%BA%94%E7%9A%84%E6%A3%80%E7%B4%A2%E8%AF%8D%EF%BC%8C%E9%80%89%E6%8B%A9%E6%8E%92%E5%BA%8F%E6%96%B9%E5%BC%8F%E3%80%81%E5%8C%B9%E9%85%8D%E6%A8%A1%E5%BC%8F%E3%80%81%E6%96%87%E7%8C%AE%E6%97%B6%E9%97%B4%E7%AD%89%E9%99%90%E5%AE%9A%E6%9D%A1%E4%BB%B6%EF%BC%8C%E7%84%B6%E5%90%8E%E7%82%B9%E5%87%BB%E2%80%9C%E6%A3%80%E7%B4%A2%E2%80%9D%E3%80%82%5D"
            "%2C%E5%85%A8%E6%96%87%2F%E8%AF%B7%E9%80%89%E6%8B%A9%E6%A3%80%E7%B4%A2%E9%A1%B9%E5%B9%B6%E6%8C%87%E5%AE%9A%E7%9B%B8%E5%BA%94%E7%9A%84%E6%A3%80%E7%B4%A2%E8%AF%8D%EF%BC%8C%E9%80%89%E6%8B%A9%E6%8E%92%E5%BA%8F%E6%96%B9%E5%BC%8F%E3%80%81%E5%8C%B9%E9%85%8D%E6%A8%A1%E5%BC%8F%E3%80%81%E6%96%87%E7%8C%AE%E6%97%B6%E9%97%B4%E7%AD%89%E9%99%90%E5%AE%9A%E6%9D%A1%E4%BB%B6%EF%BC%8C%E7%84%B6%E5%90%8E%E7%82%B9%E5%87%BB%E2%80%9C%E6%A3%80%E7%B4%A2%E2%80%9D%E3%80%82%5D"
            "%2C%E5%9F%BA%E9%87%91%2F%5B%E6%A3%80%E7%B4%A2%E5%8F%97%E6%BB%A1%E8%B6%B3%E6%9D%A1%E4%BB%B6%E7%9A%84%E5%9F%BA%E9%87%91%E8%B5%84%E5%8A%A9%E7%9A%84%E6%96%87%E7%8C%AE%E3%80%82%5D"
            "%2C%E4%B8%AD%E6%96%87%E5%88%8A%E5%90%8D%2F%5B%E8%AF%B7%E8%BE%93%E5%85%A5%E9%83%A8%E5%88%86%E6%88%96%E5%85%A8%E9%83%A8%E5%88%8A%E5%90%8D%E3%80%82%5D"
            "%2CISSN%2F%5B%E8%AF%B7%E8%BE%93%E5%85%A5%E5%AE%8C%E6%95%B4%E7%9A%84ISSN%E5%8F%B7%E3%80%82%5D"
            "%2C%E5%B9%B4%2F%5B%E8%BE%93%E5%85%A5%E5%9B%9B%E4%BD%8D%E6%95%B0%E5%AD%97%E7%9A%84%E5%B9%B4%E4%BB%BD%E3%80%82%5D"
            "%2C%E6%9C%9F%2F%5B%E8%BE%93%E5%85%A5%E6%9C%9F%E5%88%8A%E7%9A%84%E6%9C%9F%E5%8F%B7%EF%BC%8C%E5%A6%82%E6%9E%9C%E4%B8%8D%E8%B6%B3%E4%B8%A4%E4%BD%8D%E6%95%B0%E5%AD%97%EF%BC%8C%E8%AF%B7%E5%9C%A8%E5%89%8D%E9%9D%A2%E8%A1%A5%E2%80%9C0%E2%80%9D%EF%BC%8C%E5%A6%82%E2%80%9C08%E2%80%9D%E3%80%82%5D"
            "%2C%E4%B8%BB%E9%A2%98%2F%5B%E4%B8%BB%E9%A2%98%E5%8C%85%E6%8B%AC%E7%AF%87%E5%90%8D%E3%80%81%E5%85%B3%E9%94%AE%E8%AF%8D%E3%80%81%E4%B8%AD%E6%96%87%E6%91%98%E8%A6%81%E3%80%82%E5%8F%AF%E6%A3%80%E7%B4%A2%E5%87%BA%E8%BF%99%E4%B8%89%E9%A1%B9%E4%B8%AD%E4%BB%BB%E4%B8%80%E9%A1%B9%E6%88%96%E5%A4%9A%E9%A1%B9%E6%BB%A1%E8%B6%B3%E6%8C%87%E5%AE%9A%E6%A3%80%E7%B4%A2%E6%9D%A1%E4%BB%B6%E7%9A%84%E6%96%87%E7%8C%AE%E3%80%82%E5%AF%B9%E4%B8%BB%E9%A2%98%E6%98%AF%E6%8C%89%E8%AF%8D%E6%A3%80%E7%B4%A2%E7%9A%84%EF%BC%8C%E8%AF%B7%E5%B0%BD%E5%8F%AF%E8%83%BD%E8%BE%93%E5%85%A5%E5%AE%8C%E6%95%B4%E7%9A%84%E8%AF%8D%EF%BC%8C%E4%BB%A5%E9%81%BF%E5%85%8D%E6%BC%8F%E6%A3%80%E3%80%82%5D"
            # Search field "申请人" (applicant) and its url-quoted value.
            "&advancedfield1=%E7%94%B3%E8%AF%B7%E4%BA%BA&advancedvalue1="
            + quote(sqr)
            + "&imageField.x=50&imageField.y=11&searchmatch=0&order=dec&RecordsPerPage=350&hdnUSPSubDB=%E4%B8%93%E5%88%A9%E7%B1%BB%E5%88%AB%2C%2B1%2B2%2B3%2B%2C3%2C3&TableType=PY&display=chinese&encode=gb&TablePrefix=SCPD&View=SCPD&yearFieldName=%E5%B9%B4&userright=&VarNum=1&MM_fieldValue_1_1="
            + sqday_start
            + "&MM_fieldValue_1_2="
            + sqday_end
            + "&MM_slt_updateTime=&MM_Update_Time=&MM_Update_EndTime=&MM_fieldValue_2_1=&MM_fieldValue_2_2=&MM_hiddenTxtName=MM_fieldValue_1_1%40%40%40MM_fieldValue_1_2%40%40%40MM_fieldValue_2_1%40%40%40MM_fieldValue_2_2%40%40%40MM_Update_Time%40%40%40MM_Update_EndTime&MM_fieldName=%E7%94%B3%E8%AF%B7%E6%97%A5%40%40%40%E7%94%B3%E8%AF%B7%E6%97%A5%40%40%40%E5%85%AC%E5%BC%80%E6%97%A5%40%40%40%E5%85%AC%E5%BC%80%E6%97%A5%40%40%40%E6%9B%B4%E6%96%B0%E6%97%A5%E6%9C%9F%40%40%40%E6%9B%B4%E6%96%B0%E6%97%A5%E6%9C%9F&MM_hiddenRelation=%3E%3D%40%40%40%3C%3D%40%40%40%3E%3D%40%40%40%3C%3D%40%40%40%3E%3D%40%40%40%3C%3D&lastpage=7&RecordsPerPage2=50&systemno=%2C&classtype=&QueryID=5&turnpage=&curpage=1&curpage1=1&curpage2=1"
        )

    # Fetch data: sqr = applicant, sqday_start / sqday_end = application-date bounds
    def getPatent(self, sqr, sqday_start, sqday_end):
        """Fetch one application-date window and store its rows in ``patent``.

        Args:
            sqr: applicant keyword.
            sqday_start: lower bound of the application date, ``YYYY-MM-DD``.
            sqday_end: upper bound of the application date, ``YYYY-MM-DD``.

        The form body sends the bounds with ``>=`` / ``<=`` relations. The
        request asks for 350 records per page and only that one page is read,
        so a window with more hits is presumably truncated - TODO confirm.
        """
        self.patent_url = "http://dbpub.cnki.net/Grid2008/Dbpub/Brief.aspx?curpage=8&RecordsPerPage=350&QueryID=64&ID=SCPD&turnpage=1&systemno=&NaviDatabaseName=SCPD_ZJCLS&NaviField=%e4%b8%93%e9%a2%98%e5%ad%90%e6%a0%8f%e7%9b%ae%e4%bb%a3%e7%a0%81&navigatorValue=&subBase=all"
        self.session = HTMLSession()
        # Detail links in the result table are relative to this prefix.
        self.add_url = "http://dbpub.cnki.net/Grid2008/Dbpub/"
        self.headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                        'Accept-Encoding': 'gzip,deflate',
                        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
                        'Connection': 'keep-alive',
                        'Content-Type': 'application/x-www-form-urlencoded',
                        'Origin': 'http://dbpub.cnki.net',
                        'Host': 'dbpub.cnki.net',
                        'Upgrade-Insecure-Requests': '1',
                        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0'
                        }
        self.data = self._build_form_data(sqr, sqday_start, sqday_end)

        # Retry a bounded number of times: an unbounded loop never terminates
        # when the site keeps answering without a result table.
        patentdata = []
        for _ in range(self.MAX_ATTEMPTS):
            self.h1 = self.session.post(self.patent_url, headers=self.headers, data=self.data)
            patentdata = etree.HTML(self.h1.text).xpath('//table[@class="s_table"]//tr')
            if patentdata:
                break
        else:
            print("\033[31m从%s 到 %s 未获取到专利数据,已跳过\033[0m" % (sqday_start, sqday_end))
            return
        # The first <tr> is not stored (presumably the header row), hence "- 1".
        print("\033[31m从%s 到 %s 共有%s条专利数据\033[0m" % (sqday_start, sqday_end, len(patentdata) - 1))

        # Columns of one upsert must be separated by commas: with "and" MySQL
        # evaluates a single boolean expression and assigns it to the first column.
        sql = ("insert into patent(`专利名称`,`发明人`,`申请人` ,`申请日`,`公开日` ,`详情地址`) "
               "values(%s,%s,%s,%s,%s,%s) "
               "on duplicate key update `专利名称` = %s, `发明人` = %s, `申请人` = %s, "
               "`申请日` = %s, `公开日` = %s, `详情地址` = %s;")
        # Write the rows to the database
        for i, item in enumerate(patentdata[1:], start=1):
            texts = [cell.xpath('string(.)') for cell in item.xpath('./td[@class="s_tabletd_rb"]')]
            # texts[0] is skipped; the stored columns start at index 1.
            patentname, patentpeople, sqpeople, sqday, openday = (
                texts[1], texts[2], texts[3], texts[4], texts[5])
            address = self.add_url + item.xpath('./td[@class="s_tabletd_rb"]//a/@href')[0]
            row = [patentname, patentpeople, sqpeople, sqday, openday, address]
            # The helper returns None on success and the raised exception otherwise.
            result = self.helper.execute(sql, row + row)
            if result is None:
                print(str(i) + '.' + patentname + '   数据入库成功!')
                self.sum += 1
            elif '1292' not in str(result):
                # Error 1292 stays silent as before (presumably MySQL's
                # "incorrect datetime value"); anything else is reported
                # instead of being counted as a success.
                print(str(i) + '.' + patentname + '   数据入库失败:' + str(result))


# 获取软件著作权信息
class Copyright:
    """Crawl qichacha software-copyright listings for one keyword into MySQL.

    Constructing an instance runs the whole job: it makes sure the
    ``copyright`` table exists, walks every result page for ``key``, stores
    each entry, and finally shows the main menu again via ``showFunction()``.
    """

    # How many times one result page is requested before it is skipped.
    MAX_ATTEMPTS = 5
    # Label/value separator; both the ASCII and the full-width colon are accepted.
    _COLON_RE = re.compile(r'[::]')
    _DETAIL_XPATH = './/small[@class="text-muted clear text-ellipsis m-t-xs"]'
    _FOOTER_XPATH = './/footer [@class="panel-footer clear"]'

    def __init__(self, key):
        """Run the crawl for the search keyword ``key``."""
        # NOTE(review): MySQL normally listens on 3306; port 8080 is kept from
        # the original configuration - confirm it matches the local server.
        self.helper = MysqlHelper(host='localhost',
                                  port=8080,
                                  user='root',
                                  passwd='123',
                                  db='students',
                                  charset='utf8')
        self.creatTable()
        self.getCopyrightData(key)

    # Create the table
    def creatTable(self):
        """Create the ``copyright`` table unless it already exists."""
        sql1 = ("CREATE TABLE IF NOT EXISTS `copyright` ("
                "`id` int primary key not null auto_increment,"
                "`软件名称` varchar(500) DEFAULT NULL,"
                "`登记号` varchar(500) DEFAULT NULL UNIQUE,"
                "`分类号` varchar(500) DEFAULT NULL,"
                "`软件简称` varchar(500) DEFAULT NULL,"
                "`版本号` varchar(500) DEFAULT NULL,"
                "`首次发表日期` varchar(500) DEFAULT NULL,"
                "`登记批准日期` varchar(500) DEFAULT NULL,"
                "`软件著作权人` varchar(500) DEFAULT NULL,"
                "`软件著作权人详情` varchar(500) DEFAULT NULL);")
        self.helper.execute(sql1)

    @staticmethod
    def _page_url(key, page):
        """Return the listing URL for ``key`` and 1-based ``page``.

        The keyword is url-quoted so characters such as ``&`` or ``#`` cannot
        break the query string.
        """
        return "https://www.qichacha.com/more_rjzzq.html?key=%s&p=%s" % (quote(key), str(page))

    @classmethod
    def _two_values(cls, text, second_label):
        """Split ``"<label1>:<v1><second_label>:<v2>"`` into ``(v1, v2)``.

        All whitespace is removed first. Raises ``IndexError`` when the text
        does not contain two colons or ``second_label`` is missing.
        """
        parts = cls._COLON_RE.split(re.sub(r'\s+', '', text))
        first = re.findall(r'(.*?)' + second_label, parts[1])[0]
        return first, parts[2]

    def _parse_item(self, item):
        """Extract the nine stored column values from one result element."""
        # Software name
        copyrightname = item.xpath('.//span[@class="name"]')[0].xpath('string(.)')
        details = [node.xpath('string(.)') for node in item.xpath(self._DETAIL_XPATH)]
        # Registration number / classification number
        djh, flh = self._two_values(details[0], '分类号')
        # Short name / version
        rjname, bbh = self._two_values(details[1], '版本号')
        # First-publication date / approval date
        fbtime, pztime = self._two_values(details[2], '登记批准日期')
        # Placeholder handling kept in the original order: an empty value
        # becomes '空', a dash becomes the empty string.
        if not djh:
            djh = '空'
        if not flh:
            flh = '空'
        if djh == '-':
            djh = ''
        if flh == '-':
            flh = ''
        # Copyright owner
        footer_text = item.xpath(self._FOOTER_XPATH)[0].xpath('string(.)')
        rjzzqr = self._COLON_RE.split(re.sub(r'\s+', '', footer_text))[1]
        # Link to the owner's detail page, when the footer has one
        links = item.xpath('.//footer [@class="panel-footer clear"]/a/@href')
        rjurl = (self.add_url + links[0]) if links else '空'
        return [copyrightname, djh, flh, rjname, bbh, fbtime, pztime, rjzzqr, rjurl]

    def getCopyrightData(self, key):
        """Walk all result pages for ``key`` and upsert every entry."""
        self.add_url = 'https://www.qichacha.com'
        self.page = 1
        self.sum = 0
        self.copyright_url = self._page_url(key, self.page)
        self.session = HTMLSession()
        # NOTE(review): the Cookie below is a captured browser session and has
        # presumably expired; it should come from configuration, not source.
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Cookie': 'acw_tc=9dff1e1d15740724795763997e1d4fc677c413795a13ba5e12a187111d; QCCSESSID=4koqg095imku2ge3616s51au67; _uab_collina=157407248111977014224544; zg_did=%7B%22did%22%3A%20%2216e7e07f5ad448-0acb91ff1d41898-4c302b7a-fa000-16e7e07f5af58a%22%7D; zg_de1d1a35bfa24ce29bbf2c7eb17e6c4f=%7B%22sid%22%3A%201574072481204%2C%22updated%22%3A%201574072508771%2C%22info%22%3A%201574072481208%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22%22%2C%22cuid%22%3A%20%223edfa18efe756b45eb94b06651c93d3a%22%7D; UM_distinctid=16e7e07f5e0258-0c4202a4fac742-4c302b7a-fa000-16e7e07f5e6346; CNZZDATA1254842228=281213894-1574070903-%7C1574070903; Hm_lvt_3456bee468c83cc63fb5147f119f1075=1574072482; Hm_lpvt_3456bee468c83cc63fb5147f119f1075=1574072509',
            'Host': 'www.qichacha.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0'
        }
        try:
            self.h = self.session.get(self.copyright_url, headers=self.headers)
        except Exception:
            # Best effort: retry the first request once before giving up.
            self.h = self.session.get(self.copyright_url, headers=self.headers)
        # Total number of pages, read from the pager's "end" link. When that
        # link is absent only the first page is crawled instead of crashing.
        end_links = etree.HTML(self.h.text).xpath('//a[@class="end"]/text()')
        pagesum = end_links[0] if end_links else '1'

        # Columns of one upsert must be separated by commas: with "and" MySQL
        # evaluates a single boolean expression and assigns it to the first column.
        sql = ("insert into copyright(`软件名称`,`登记号`,`分类号` ,`软件简称`,`版本号` ,"
               "`首次发表日期`,`登记批准日期`,`软件著作权人`,`软件著作权人详情`) "
               "values(%s,%s,%s,%s,%s,%s,%s,%s,%s) "
               "on duplicate key update `软件名称` = %s, `登记号` = %s, `分类号` = %s, "
               "`软件简称` = %s, `版本号` = %s, `首次发表日期` = %s, `登记批准日期` = %s, "
               "`软件著作权人` = %s, `软件著作权人详情` = %s;")

        while self.page <= int(pagesum):
            print("*" * 66)
            print('\033[31m开始抓取第%s页的数据,共%s页\033[0m' % (self.page, pagesum))
            self.copyright_url = self._page_url(key, self.page)
            # Bounded retry: an unbounded loop never ends when the site keeps
            # answering without any result section (e.g. after being blocked).
            self.copyrightdata = []
            for _ in range(self.MAX_ATTEMPTS):
                self.h = self.session.get(self.copyright_url, headers=self.headers)
                self.copyrightdata = etree.HTML(self.h.text).xpath('//section[@id="searchlist"]')
                if self.copyrightdata:
                    break
            else:
                print('\033[31m第%s页未获取到数据,已跳过\033[0m' % self.page)
            # Write the rows to the database
            print("*" * 66)
            for item in self.copyrightdata:
                row = self._parse_item(item)
                # The helper returns None on success and the raised exception otherwise.
                result = self.helper.execute(sql, row + row)
                if result is None:
                    print(','.join(row))
                    print('数据入库成功!')
                    # Running total of stored rows
                    self.sum += 1
                elif '1292' not in str(result):
                    # Error 1292 stays silent as before; any other failure is
                    # reported instead of being counted as a success.
                    print(row[0] + ' 数据入库失败:' + str(result))
            print('\033[34m累计抓取数据%s条!\033[0m' % self.sum)
            self.page += 1
        print("\033[34m著作权数据抓取完毕!共抓取%s条数据\033[0m" % str(self.sum))
        showFunction()


# 展示功能菜单
def showFunction():
    """Print the main menu once and dispatch the user's choice.

    Invalid input is rejected with a message and the prompt repeats. Choices
    1-3 ask for a keyword; an empty keyword returns to the choice prompt,
    otherwise the matching crawler is run and the function returns.
    Choice 4 terminates the program by raising ``SystemExit``.
    """
    print("*" * 66)
    print("\t\t\t\t\t专利著作权信息下载工具V1.0\t\t\t\t\t")
    print("*" * 66)
    print("\033[34m请选择功能\n1.抓取全部专利数据\n2.已抓取全部专利数据,执行更新数据操作\n3.抓取全部著作权数据\n4.退出程序\033[0m")
    print("*" * 66)
    # First application year crawled by each patent option: option 1 is the
    # full crawl, option 2 only refreshes the most recent year.
    patent_start_year = {1: 1985, 2: 2019}
    while True:
        fuc = input('请输入功能序号:')
        try:
            # str.isdigit() also accepts characters such as '²' that int()
            # rejects, so the conversion itself has to be guarded.
            choice = int(fuc) if fuc.isdigit() else None
        except ValueError:
            choice = None
        if choice is None:
            print("\033[31m输入错误,请输入功能序号!\033[0m")
        elif choice in patent_start_year:
            strs = input("请输入申请人关键词(直接回车键返回上一级):")
            if strs:
                Patent(strs, patent_start_year[choice])
                return
        elif choice == 3:
            strs = input("请输入著作权关键词(直接回车键返回上一级):")
            if strs:
                Copyright(strs)
                return
        elif choice == 4:
            print('程序已关闭...')
            # Same outcome as exit(), without relying on the site module.
            raise SystemExit
        else:
            print("\033[31m输入错误,请输入正确的功能序号!\033[0m")


# Script entry point: show the interactive menu when the file is run directly.
if __name__ == '__main__':
    showFunction()
MysqlHelper.py数据库辅助连接类:
from click._compat import raw_input
from pymysql import *

"""封装mysql连接类"""


class MysqlHelper:
    """Thin convenience wrapper around a pymysql connection.

    Every query method opens a fresh connection, runs one parameterised
    statement and closes the connection again, also when the statement fails.
    """

    def __init__(self, host, port, user, passwd, db, charset):
        """Store the connection parameters; no connection is opened yet."""
        # Server address
        self.host = host
        # Server port
        self.port = port
        # Database user name
        self.user = user
        # Database password
        self.passwd = passwd
        # Database (schema) name
        self.db = db
        # Connection character set
        self.charset = charset
        # Live connection / cursor; None while nothing is open.
        self.conn = None
        self.cursor = None

    def open(self):
        """Connect to the database and create a cursor on the connection."""
        self.conn = connect(host=self.host, port=self.port, user=self.user, passwd=self.passwd, db=self.db,
                            charset=self.charset)
        self.cursor = self.conn.cursor()

    def execute(self, sql, params=()):
        """Run one parameterised statement and commit it.

        Returns:
            ``None`` on success, otherwise the exception that was raised
            (callers inspect its text, so it is returned rather than raised).
        """
        try:
            self.open()
            self.cursor.execute(sql, params)
            self.conn.commit()
        except Exception as e:
            return e
        finally:
            # Always release the connection; uncommitted work is discarded
            # when it closes.
            self.close()
        return None

    def createDataBase(self, sql, params=()):
        """Run one statement on a connection that selects no database.

        Errors are printed, not raised.
        """
        conn = None
        try:
            conn = connect(host=self.host, port=self.port, user=self.user, passwd=self.passwd,
                           charset=self.charset)
            cursor = conn.cursor()
            try:
                cursor.execute(sql, params)
                conn.commit()
            finally:
                cursor.close()
        except Exception as e:
            print(e)
        finally:
            if conn is not None:
                conn.close()

    def all(self, sql, params=()):
        """Return every row of a query, or the raised exception on failure."""
        try:
            self.open()
            self.cursor.execute(sql, params)
            return self.cursor.fetchall()
        except Exception as e:
            return e
        finally:
            self.close()

    def single(self, sql, params=()):
        """Return the first row of a query.

        On failure the error is printed and ``None`` is returned.
        """
        try:
            self.open()
            self.cursor.execute(sql, params)
            return self.cursor.fetchone()
        except Exception as e:
            print(e)
            return None
        finally:
            self.close()

    def rollback(self):
        """Roll back the current transaction on the open connection."""
        self.conn.rollback()

    def close(self):
        """Close the cursor and the connection; a no-op when nothing is open."""
        cursor, conn = self.cursor, self.conn
        # Forget the handles first so a failing close cannot leave stale
        # references that a later call would try to close again.
        self.cursor = None
        self.conn = None
        try:
            if cursor is not None:
                cursor.close()
        finally:
            if conn is not None:
                conn.close()


# Manual smoke test: inserts one student and one subject, then lists students.
if __name__ == '__main__':
    msh = MysqlHelper('localhost', 8080, 'root', '123', 'test', 'utf8')
    # The built-in input() is the Python 3 replacement for raw_input(), so this
    # block does not depend on click's private _compat module.
    name = input('请输入学生姓名:')
    sbname = input('请输入科目名称:')
    sql = 'insert into students(name) values(%s)'
    sql1 = 'insert into subjects(sbname) values(%s)'
    sql2 = 'select id,name from students where id<5'
    msh.execute(sql, [name])
    msh.execute(sql1, [sbname])
    print(msh.all(sql2))

程序可能存在部分bug,欢迎交流指正。

你可能感兴趣的:(python)