#!/usr/bin/env python3
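# Scrapes company listings rendered by Selenium/Firefox, then fetches each
# listing's contact-info and credit-detail pages with urllib, parses them with
# lxml XPath, and writes one row per company into a MySQL table via pymysql.
# Requires Firefox with a matching geckodriver and a url.txt file of seed URLs.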
import time
import urllib.request
import sys
import numpy as np
import pymysql
import re
from lxml import etree
from selenium import webdriver
# Start Firefox via geckodriver; retry once if the first launch fails
try:
driver = webdriver.Firefox(executable_path=r'F:\geckodriver-v0.23.0-win64\geckodriver.exe')
except Exception as e:
driver = webdriver.Firefox(executable_path=r'F:\geckodriver-v0.23.0-win64\geckodriver.exe')
user_agents = [
'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
'Opera/9.25 (Windows NT 5.1; U; en)',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1;.NET CLR 1.1.4322; .NET CLR2.0.50727)',
'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5(like Gecko) (Kubuntu)',
'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7',
'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 ',
]
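# A random entry from this list is attached to every urllib request below,
# so successive requests do not all carry the same User-Agent string.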
time.sleep(5)
urlconfing = open(r'F:\DomeTest\csv\url.txt', encoding='utf-8-sig')
for urlItem in urlconfing.readlines():
print(urlItem)
driver.get(urlItem.strip())
pageRange = 101
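# Walk at most pageRange - 1 result pages per seed URL; the loop advances by
# clicking the "next page" link at the end of each iteration.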
for page in range(1, pageRange):
# Get the list of company-name link elements
title = driver.find_elements_by_css_selector("a[class=list-item-title-text]")
print(len(title))
# Get the main-products element of each listing
product = driver.find_elements_by_xpath("//div[@class=\"list-item-detail\"]/div[1]/div[1]/a[1]")
# Print the counts for debugging
print("Page %s / %s: %s listings on this page" % (page, pageRange, len(title)))
###################### Loop over the listings on this page
for i in range(len(title)):
try:
title_value = title[i].get_attribute('title')
except Exception as e:
continue
# The title attribute holds the company name
print('------------------------------------------------------------')
print('Company name:', title_value)
# Main products
product_value = product[i].text.replace('\n', '')
print('Main products:', product_value)
# Build the URL of the contact-info page
time.sleep(1)
try:
href_value = title[i].get_attribute('href') + 'page/contactinfo.htm'
except Exception as e:
continue
#### Contact-info page
# Pick a random user agent for this request
agent = np.random.choice(user_agents)
# Build the request headers
headers = {'User-Agent': agent, 'Accept': '*/*', 'Referer': 'http://www.google.com'}
# Build the urllib request
request = urllib.request.Request(href_value, headers=headers)
# Fetch the page
try:
response = urllib.request.urlopen(request)
except Exception as e:
continue
# Read the page source
html = response.read()
locXpath = etree.HTML(html)
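# etree.HTML accepts the raw response bytes and builds the tree queried below.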
# Company contact person
try:
contacts = locXpath.xpath("//div[@class='contact-info']/dl/dd/a[1]/text()")[0].strip()
print('Contact person:', contacts)
except Exception as e:
contacts =''
# Landline number
try:
phone_num = locXpath.xpath("//div[@class='contcat-desc']/dl[1]/dd/text()")[0].strip()
print('Landline:', phone_num)
except Exception as e:
phone_num =''
# Mobile number
try:
phone = locXpath.xpath("//*[@class='m-mobilephone']/dd/text()")[0].strip()
print('Mobile:', phone)
except Exception as e:
phone =''
# Mobile plus landline, joined with a semicolon
try:
phones = phone + ';' + phone_num
print('Mobile and landline:', phones)
except Exception as e:
phones = ''
# Company address: split into province and city on spaces
try:
address = locXpath.xpath("//*[@class='address']/text()")[0].strip()
pro = address.split(' ')
# Second token is treated as the province; strip a trailing "市" if present
province = pro[1]
if '市' in province:
province = province.split('市')[0]
Province = province
# print(address)
# Pick the token that holds the city, depending on how many parts the address has
if len(pro) == 2:
city = pro[1]
elif len(pro) in (3, 4):
city = pro[2]
else:
city = '市'
if '市' in city:
City = city.split('市')[0] + '市'
else:
City = city
City1 = City
print('Province:', Province)
print('City:', City1)
print('Company address:', address)
except Exception as e:
# Also reset Province/City1 so the INSERT below never sees undefined or stale values
address = ''
Province = ''
City1 = ''
# Company homepage
try:
web_diver = locXpath.xpath("//*[@class='outsite']/text()|//*[@class='subdomain']/text()")[0].strip()
print('Company homepage:', web_diver)
except Exception as e:
web_diver = ''
#### Company profile ("credit detail") page
# Build the URL of the company profile page
doc_href_value = title[i].get_attribute('href') + 'page/creditdetail.htm'
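# The profile page sits under page/creditdetail.htm of the listing URL,
# mirroring the page/contactinfo.htm path used above.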
# Pick a random user agent for this request
agent1 = np.random.choice(user_agents)
# Build the request headers
headers1 = {'User-Agent': agent1, 'Accept': '*/*', 'Referer': 'http://www.google.com'}
# Build the urllib request
request1 = urllib.request.Request(doc_href_value, headers=headers1)
# Fetch the page
try:
response = urllib.request.urlopen(request1)
# Read the page source
doc_html = response.read()
# XPath tree for the company profile page
docXpath = etree.HTML(doc_html)
except Exception as e:
continue
# Date of establishment
try:
chenglishijian = docXpath.xpath("//td[@class='tb-info tb-value']/p/span[1]/text()")[0].replace('年','-').replace('月','-').replace('日','')
print('Date established:', chenglishijian)
except Exception as e:
chenglishijian = ''
# Registered capital
try:
zhucheziben = docXpath.xpath("//td[@class='tb-info tb-value']/p/span[1]/text()")[1]
# Keep only the numeric part; store 0 if a single number cannot be isolated
zhuchezibennum = re.findall(r"\d+\.?\d*", zhucheziben)
if len(zhuchezibennum) == 1:
zhuchezibennum = zhuchezibennum[0]
else:
zhuchezibennum = 0
print('Registered capital:', zhuchezibennum)
except Exception as e:
zhuchezibennum = ''
# Business scope
try:
jingyingfanwei = docXpath.xpath("//td[@class='tb-info tb-value']/p/span[1]/text()")[2]
print('Business scope:', jingyingfanwei)
except Exception as e:
jingyingfanwei = ''
# Registered address
try:
zhuchedizhi = docXpath.xpath("//td[@class='tb-info tb-value']/p/span[1]/text()")[3]
print('Registered address:', zhuchedizhi)
except Exception as e:
zhuchedizhi = ''
# Total number of completed deals
try:
acDealNumXpath = docXpath.xpath('//*[@id="J_CompanyTradeCreditRecord"]/ul/li[1]/p[2]/text()')[0]
print('Total deals:', acDealNumXpath)
except Exception as e:
acDealNumXpath = ''
# Total number of buyers
try:
acBuyNumXpath = docXpath.xpath('//*[@id="J_CompanyTradeCreditRecord"]/ul/li[2]/p[2]/text()')[0]
print('Total buyers:', acBuyNumXpath)
except Exception as e:
acBuyNumXpath = ''
# Repeat purchase rate
try:
repBuyPerXpath = docXpath.xpath('//*[@id="J_CompanyTradeCreditRecord"]/ul/li[3]/p[2]/text()')[0]
print('Repeat purchase rate:', repBuyPerXpath)
except Exception as e:
repBuyPerXpath = ''
# 90-day refund rate
try:
ref90PerXpath = docXpath.xpath('//*[@id="J_CompanyTradeCreditRecord"]/ul/li[4]/p[2]/text()')[0]
print('90-day refund rate:', ref90PerXpath)
except Exception as e:
ref90PerXpath = ''
# 90-day customer-service intervention rate
try:
kf90PerXpath = docXpath.xpath('//*[@id="J_CompanyTradeCreditRecord"]/ul/li[5]/p[2]/text()')[0]
print('90-day customer-service intervention rate:', kf90PerXpath)
except Exception as e:
kf90PerXpath = ''
# 90-day dispute rate
try:
jf90PerXpath = docXpath.xpath('//*[@id="J_CompanyTradeCreditRecord"]/ul/li[6]/p[2]/text()')[0]
print('90-day dispute rate:', jf90PerXpath)
except Exception as e:
jf90PerXpath = ''
# Category is not scraped here; store an empty string
leimu = ''
print('Category:', leimu)
print('-------------------------------------------------------------------------')
# Database connection settings (host and port are placeholders; fill in your own)
db_Local = pymysql.connect(
host='your-db-host',  # placeholder for the MySQL server address
port=3306,  # placeholder for the MySQL server port (3306 is the MySQL default)
user='admin',
passwd='123456',
db='precisionmarketing',
charset='utf8')
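# Note: a fresh connection is opened for each company record and closed right
# after the insert below.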
try:
cur = db_Local.cursor()
sql = 'INSERT INTO Spider_Summary_1688_itms_dw(company_name,' \
'contacts,' \
'phone,' \
'Province,' \
'city,' \
'address,' \
'web_site,' \
'create_time,' \
'money,' \
'operation,' \
'products,' \
'registeredaddress,' \
'chengjiaonumber,' \
'maijianumber,' \
'purchaserate,' \
'refundrate,' \
'interventionrate,' \
'disputerate,' \
'leimu)values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
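# The 19 %s placeholders are bound to the 19-element tuple below; pymysql
# escapes the parameter values, so no manual quoting is needed.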
cur.executemany(sql, [(
title_value,
contacts,
phones,
Province,
City1,
address,
web_diver,
chenglishijian,
zhuchezibennum,
jingyingfanwei,
product_value,
zhuchedizhi,
acDealNumXpath,
acBuyNumXpath,
repBuyPerXpath,
ref90PerXpath,
kf90PerXpath,
jf90PerXpath,
leimu)])
print('Insert succeeded')
db_Local.commit()
db_Local.close()
except Exception as e:
print('Insert failed:', e)
db_Local.close()
continue
###################### Next page
# Scroll to the bottom of the page so the "next page" link is in view
js = 'var q=document.documentElement.scrollTop=30000'
driver.execute_script(js)
time.sleep(1)
# Locate and click the "next page" link (named next_page to avoid shadowing the loop counter)
next_page = driver.find_elements_by_css_selector("a[class=page-next]")
try:
next_page[0].click()
except Exception as e:
continue
###################### Next page
time.sleep(1)
print("close driver")
driver.close()