Using a Selenium crawler to scrape 1688 supplier company information

#! /usr/bin/env python
# -*- coding: utf-8 -*-


import time
import urllib.request
import sys
import numpy as np
import pymysql
import re
from lxml import etree
from selenium import webdriver

# On Python 3, source files and strings are UTF-8/Unicode by default, so the old
# Python 2 reload(sys)/setdefaultencoding hack for Chinese-text errors is not needed.

try:
    driver = webdriver.Firefox(executable_path=r'F:\geckodriver-v0.23.0-win64\geckodriver.exe')
except Exception as e:
    # Retry once if the first geckodriver launch fails
    driver = webdriver.Firefox(executable_path=r'F:\geckodriver-v0.23.0-win64\geckodriver.exe')
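# Note: executable_path and the find_elements_by_* helpers used throughout this
# script are the Selenium 3 API; newer Selenium 4 releases replace them with a
# Service(...) object and driver.find_elements(By.CSS_SELECTOR, ...) /
# driver.find_elements(By.XPATH, ...).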

# Build a pool of user agents to reduce the chance of being blocked as a crawler

user_agents = [
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
    'Opera/9.25 (Windows NT 5.1; U; en)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1;.NET CLR 1.1.4322; .NET CLR2.0.50727)',
    'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5(like Gecko) (Kubuntu)',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
    'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
    'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7',
    'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0',
]
time.sleep(5)

# Read the listing-page URLs to crawl and open each one in the browser
url_config = open(r'F:\DomeTest\csv\url.txt', encoding='utf-8-sig')
for urlItem in url_config.readlines():
    print(urlItem)
    driver.get(urlItem.strip())
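# Assumption (not stated in the original post): url.txt holds one 1688
# listing-page URL per line; each line is opened in turn, and the pagination
# loop below continues from whichever page was loaded last.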

# TODO: there are 100 result pages in total; crawl them with a for loop.
# pageRange can be changed for other category pages.

pageRange = 101
for page in range(1, pageRange):
    # A per-page try/except was commented out here; the extra indentation of
    # the block below is left over from it.
        # Company-name links on the current listing page
        title = driver.find_elements_by_css_selector("a[class=list-item-title-text]")
        print(len(title))
        # Main-product links
        product = driver.find_elements_by_xpath("//div[@class=\"list-item-detail\"]/div[1]/div[1]/a[1]")
        # Debug output: how many items were found on this page
        print("Page %s of %s: %s items on this page" % (page, pageRange - 1, len(title)))
        # ###################### Loop over the items on this page
        for i in range(len(title)):
            try:
                title_value = title[i].get_attribute('title')
            except Exception as e:
                continue
            # Company name
            print('------------------------------------------------------------')
            print('Company name:', title_value)
            # Main products
            product_value = product[i].text.replace('\n', '')
            print('Main products:', product_value)
            # Build the URL of the contact-info page
            time.sleep(1)
            try:
                href_value = title[i].get_attribute('href') + 'page/contactinfo.htm'
            except Exception as e:
                continue
            # ### Contact-info page
            # Pick a random user agent for this request
            agent = np.random.choice(user_agents)
            # Build the request headers
            headers = {'User-Agent': agent, 'Accept': '*/*', 'Referer': 'http://www.google.com'}
            # Build the request with urllib
            request = urllib.request.Request(href_value, headers=headers)
            # Fetch the page
            try:
                response = urllib.request.urlopen(request)
            except Exception as e:
                continue
            # Read and parse the page source
            html = response.read()
            locXpath = etree.HTML(html)
            # Contact person
            try:
                contacts = locXpath.xpath("//div[@class='contact-info']/dl/dd/a[1]/text()")[0].strip()
                print('Contact:', contacts)
            except Exception as e:
                contacts = ''
            # Landline number
            try:
                phone_num = locXpath.xpath("//div[@class='contcat-desc']/dl[1]/dd/text()")[0].strip()
                print('Telephone:', phone_num)
            except Exception as e:
                phone_num = ''
            # Mobile number
            try:
                phone = locXpath.xpath("//*[@class='m-mobilephone']/dd/text()")[0].strip()
                print('Mobile:', phone)
            except Exception as e:
                phone = ''
            # Mobile and landline combined
            try:
                phones = phone + ';' + phone_num
                print('Mobile and telephone:', phones)
            except Exception as e:
                phones = ''
            # Company address; derive province and city from it
            try:
                address = locXpath.xpath("//*[@class='address']/text()")[0].strip()
                pro = address.split(' ')
                province = pro[1]
                if '市' in province:
                    province = province.split('市')[0]
                Province = province
                # print(address)

                if len(pro) == 2:
                    city = pro[1]
                elif len(pro) in (3, 4):
                    city = pro[2]
                else:
                    city = '市'
                if '市' in city:
                    City1 = city.split('市')[0] + '市'
                else:
                    City1 = city
                print('Province:', Province)
                print('City:', City1)
                print('Company address:', address)
            except Exception as e:
                # Also reset the derived fields so the INSERT below never
                # references an undefined name
                address = ''
                Province = ''
                City1 = ''
            # Company homepage
            try:
                web_diver = locXpath.xpath("//*[@class='outsite']/text()|//*[@class='subdomain']/text()")[0].strip()
                print('Company homepage:', web_diver)
            except Exception as e:
                web_diver = ''

            # ### Company profile page
            # Build the URL of the company profile (credit detail) page
            doc_href_value = title[i].get_attribute('href') + 'page/creditdetail.htm'
            # Pick a random user agent for this request
            agent1 = np.random.choice(user_agents)
            # Build the request headers
            headers1 = {'User-Agent': agent1, 'Accept': '*/*', 'Referer': 'http://www.google.com'}
            # Build the request with urllib
            request1 = urllib.request.Request(doc_href_value, headers=headers1)
            # Fetch the page
            try:
                response = urllib.request.urlopen(request1)
                # Read the page source
                doc_html = response.read()
                # Parse the profile page for XPath queries
                docXpath = etree.HTML(doc_html)
            except Exception as e:
                continue
            # Date of establishment
            try:
                chenglishijian = docXpath.xpath("//td[@class='tb-info tb-value']/p/span[1]/text()")[0].replace('年', '-').replace('月', '-').replace('日', '')
                print('Date of establishment:', chenglishijian)
            except Exception as e:
                chenglishijian = ''
            # Registered capital
            try:
                zhucheziben = docXpath.xpath("//td[@class='tb-info tb-value']/p/span[1]/text()")[1]
                zhuchezibennum = re.findall(r"\d+\.?\d*", zhucheziben)
                if len(zhuchezibennum) == 1:
                    # Keep the single numeric match rather than the list itself
                    zhuchezibennum = zhuchezibennum[0]
                else:
                    zhuchezibennum = 0
                print('Registered capital:', zhuchezibennum)
            except Exception as e:
                zhuchezibennum = ''
            # Business scope
            try:
                jingyingfanwei = docXpath.xpath("//td[@class='tb-info tb-value']/p/span[1]/text()")[2]
                print('Business scope:', jingyingfanwei)
            except Exception as e:
                jingyingfanwei = ''
            # Registered address
            try:
                zhuchedizhi = docXpath.xpath("//td[@class='tb-info tb-value']/p/span[1]/text()")[3]
                print('Registered address:', zhuchedizhi)
            except Exception as e:
                zhuchedizhi = ''
            # Cumulative number of deals
            try:
                acDealNumXpath = docXpath.xpath('//*[@id="J_CompanyTradeCreditRecord"]/ul/li[1]/p[2]/text()')[0]
                print('Cumulative deals:', acDealNumXpath)
            except Exception as e:
                acDealNumXpath = ''
            # Cumulative number of buyers
            try:
                acBuyNumXpath = docXpath.xpath('//*[@id="J_CompanyTradeCreditRecord"]/ul/li[2]/p[2]/text()')[0]
                print('Cumulative buyers:', acBuyNumXpath)
            except Exception as e:
                acBuyNumXpath = ''
            # Repeat-purchase rate
            try:
                repBuyPerXpath = docXpath.xpath('//*[@id="J_CompanyTradeCreditRecord"]/ul/li[3]/p[2]/text()')[0]
                print('Repeat-purchase rate:', repBuyPerXpath)
            except Exception as e:
                repBuyPerXpath = ''
            # 90-day refund rate
            try:
                ref90PerXpath = docXpath.xpath('//*[@id="J_CompanyTradeCreditRecord"]/ul/li[4]/p[2]/text()')[0]
                print('90-day refund rate:', ref90PerXpath)
            except Exception as e:
                ref90PerXpath = ''
            # 90-day customer-service intervention rate
            try:
                kf90PerXpath = docXpath.xpath('//*[@id="J_CompanyTradeCreditRecord"]/ul/li[5]/p[2]/text()')[0]
                print('90-day customer-service intervention rate:', kf90PerXpath)
            except Exception as e:
                kf90PerXpath = ''
            # 90-day dispute rate
            try:
                jf90PerXpath = docXpath.xpath('//*[@id="J_CompanyTradeCreditRecord"]/ul/li[6]/p[2]/text()')[0]
                print('90-day dispute rate:', jf90PerXpath)
            except Exception as e:
                jf90PerXpath = ''

            # Category (left empty here)
            leimu = ''
            print('Category:', leimu)
            print('-------------------------------------------------------------------------')
            # Database connection settings (placeholders; fill in your own server)
            db_Local = pymysql.connect(
                host='your-db-host',   # MySQL server address
                port=3306,             # MySQL server port
                user='admin',
                passwd='123456',
                db='precisionmarketing',
                charset='utf8')
            try:
                cur = db_Local.cursor()
                sql = 'INSERT INTO Spider_Summary_1688_itms_dw(company_name,' \
                      'contacts,' \
                      'phone,' \
                      'Province,' \
                      'city,' \
                      'address,' \
                      'web_site,' \
                      'create_time,' \
                      'money,' \
                      'operation,' \
                      'products,' \
                      'registeredaddress,' \
                      'chengjiaonumber,' \
                      'maijianumber,' \
                      'purchaserate,' \
                      'refundrate,' \
                      'interventionrate,' \
                      'disputerate,' \
                      'leimu)values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
                cur.executemany(sql, [(
                    title_value,
                    contacts,
                    phones,
                    Province,
                    City1,
                    address,
                    web_diver,
                    chenglishijian,
                    zhuchezibennum,
                    jingyingfanwei,
                    product_value,
                    zhuchedizhi,
                    acDealNumXpath,
                    acBuyNumXpath,
                    repBuyPerXpath,
                    ref90PerXpath,
                    kf90PerXpath,
                    jf90PerXpath,
                    leimu)])
                print('Row inserted successfully')
                db_Local.commit()
                db_Local.close()
            except Exception as e:
                print('Insert failed')
                continue
        # ###################### Go to the next page
        js = 'var q=document.documentElement.scrollTop=30000'
        driver.execute_script(js)
        time.sleep(1)
        # Use a separate name so the loop counter "page" is not shadowed
        next_page = driver.find_elements_by_css_selector("a[class=page-next]")
        try:
            next_page[0].click()
        except Exception as e:
            continue
        time.sleep(1)

print("close driver")

# Close the automated browser

driver.close()
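For reference, a minimal sketch of the target table assumed by the INSERT statement above. The table and column names come from the SQL in the script; the column types and sizes are guesses and should be adapted to the actual data:

CREATE TABLE Spider_Summary_1688_itms_dw (
    company_name      VARCHAR(255),
    contacts          VARCHAR(64),
    phone             VARCHAR(64),
    Province          VARCHAR(32),
    city              VARCHAR(64),
    address           VARCHAR(255),
    web_site          VARCHAR(255),
    create_time       VARCHAR(32),
    money             VARCHAR(32),
    operation         TEXT,
    products          VARCHAR(255),
    registeredaddress VARCHAR(255),
    chengjiaonumber   VARCHAR(32),
    maijianumber      VARCHAR(32),
    purchaserate      VARCHAR(32),
    refundrate        VARCHAR(32),
    interventionrate  VARCHAR(32),
    disputerate       VARCHAR(32),
    leimu             VARCHAR(64)
) DEFAULT CHARSET=utf8;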
