Scrape Qichacha (企查查) for each company's phone number and email, using the company list pulled from the ICP records on Chinaz (站长之家), and save everything to a MySQL database.
The rough flow: Selenium walks the Chinaz ICP listing page by page; for every company found, requests + lxml look the name up on qichacha.com and pull the contact details from the first search hit.
Note that some companies have no phone number or email on record.
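For reference, here is a minimal sketch of the target table. The script inserts into a table named yu inside a database also named yu; the column names come from the INSERT statement below, while the types are assumptions (everything is kept as plain text):

import pymysql

# One-off setup sketch: create the `yu` table the scraper writes to.
# Column types are assumptions; all fields are stored as text.
connection = pymysql.connect(host='127.0.0.1', user='root', password='',
                             database='yu', charset='utf8')
with connection.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS yu (
            domain_list      VARCHAR(255),
            organizer_list   VARCHAR(255),
            nature_list      VARCHAR(64),
            license_list     VARCHAR(128),
            web_name_list    VARCHAR(255),
            web_domain_list  VARCHAR(255),
            verify_time_list VARCHAR(64),
            mailbox          VARCHAR(255),
            phone            VARCHAR(64)
        ) DEFAULT CHARSET = utf8
    """)
connection.close()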
The full script follows:
import time
import pymysql
import requests
from lxml import etree
from selenium import webdriver
import io
import sys
from selenium.webdriver.chrome.options import Options
# Re-wrap stdout so Chinese text prints cleanly on a GBK Windows console
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')
# Request headers for qichacha.com
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Cookie': '<paste your own cookie here>',
'Host': 'www.qichacha.com',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': '<paste your own User-Agent here>'
}
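# The Cookie and User-Agent above must come from your own browser: log in to
# qichacha.com, open DevTools -> Network, and copy both values from any request.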
# Scrape Qichacha: search for the company name and hand off the result page
def qi(organizer_list):
    res = requests.get(f"https://www.qichacha.com/search?key={organizer_list}", headers=headers)
    return pares(res.text)
# Take the link of the first search hit and request the company detail page
def pares(html):
    root = etree.HTML(html)
    href = root.xpath('//*[@id="search-result"]/tr[1]/td[3]/a/@href')
    href = " ".join(href)
    url = "https://www.qichacha.com" + href
    ane = requests.get(url, headers=headers)
    return New(ane.text)
# Extract the company's email and phone from the detail page; each XPath
# returns a (possibly empty) list, so join it into a plain string
def New(html):
    root = etree.HTML(html)
    mailbox = " ".join(root.xpath('//*[@id="company-top"]/div[2]/div[2]/div[3]/div[2]/span[1]/span[2]/a/text()'))
    phone = " ".join(root.xpath('//*[@id="company-top"]/div[2]/div[2]/div[3]/div[1]/span[1]/span[2]/span/text()'))
    return mailbox, phone
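# Hypothetical usage: qi("某某科技有限公司") fetches the first search hit and
# returns a (mailbox, phone) tuple; either element is "" when Qichacha has
# nothing on record.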
# Scrape the ICP listing on Chinaz
def search():
    options = Options()
    options.add_argument("--headless")  # headless: no browser window
    # Use a raw string so the backslashes survive; on Windows the driver file
    # is chromedriver.exe and its version must match the installed Chrome
    path = r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver"
    chrome = webdriver.Chrome(executable_path=path, options=options)
    # Pages to scrape
    for i in range(1, 11):
        chrome.get(f"http://icp.chinaz.com/provinces?&companytype=&city=%u5168%u56FD&custype=0&companyName=&page={i}")
        # Rows per page
        for d in range(1, 21):
            row = f"//tbody[@id='result_table']/tr[{d}]"
            domain_list = chrome.find_element_by_xpath(f"{row}/td[1]/a").text
            organizer_list = chrome.find_element_by_xpath(f"{row}/td[2]").text
            nature_list = chrome.find_element_by_xpath(f"{row}/td[3]").text
            license_list = chrome.find_element_by_xpath(f"{row}/td[5]").text
            web_name_list = chrome.find_element_by_xpath(f"{row}/td[6]").text
            web_domain_list = chrome.find_element_by_xpath(f"{row}/td[7]/span/a").text
            verify_time_list = chrome.find_element_by_xpath(f"{row}/td[8]").text
            # Look the organizer up on Qichacha for its email and phone
            mailbox, phone = qi(organizer_list)
            # Connect to the database and save the row; companies with no
            # email or phone on record are stored with "0" in those fields
            connection = pymysql.connect(host='127.0.0.1', user='root', password='',
                                         database='yu', charset="utf8")
            sql = ("INSERT INTO yu(domain_list,organizer_list,nature_list,license_list,"
                   "web_name_list,web_domain_list,verify_time_list,mailbox,phone) "
                   "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)")
            try:
                with connection.cursor() as cursor:
                    cursor.execute(sql, (domain_list, organizer_list, nature_list,
                                         license_list, web_name_list, web_domain_list,
                                         verify_time_list, mailbox or "0", phone or "0"))
                connection.commit()
            finally:
                connection.close()
            print(domain_list, organizer_list, nature_list, license_list, web_name_list,
                  web_domain_list, verify_time_list, mailbox, phone)
            time.sleep(5)
    chrome.quit()  # shut the browser down once every page is done
if __name__ == "__main__":
    # To scrape on a schedule, uncomment the loop:
    # while True:
    search()
    # and pause between runs:
    # time.sleep()