Scrape Qichacha for the phone numbers and email addresses of companies collected from Chinaz, and save them to a MySQL database

The result looks roughly like this:

[screenshot: a table of the scraped company records]

Note that some companies have no phone or email on record.
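Because those fields can be missing, the script falls back to storing 0 for them. A minimal standalone sketch of that fallback, assuming lxml's xpath() list results (the helper name normalize_field is mine, not part of the original script):

def normalize_field(values):
    # xpath() returns a list, which is empty when the node is absent:
    # join non-empty results into one string, else fall back to "0"
    return " ".join(values) if values else "0"

print(normalize_field(["qcc@example.com"]))  # -> qcc@example.com
print(normalize_field([]))                   # -> 0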
The full code is as follows:

import time
import pymysql
import requests
from lxml import etree
from selenium import webdriver
import io
import sys

from selenium.webdriver.chrome.options import Options

# Re-wrap stdout so Chinese text prints correctly on a Windows console
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')

# Qichacha request headers
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Cookie': '<your own Cookie>',
    'Host': 'www.qichacha.com',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': '<your own User-Agent>'
}
# Crawl Qichacha
# Search for the company name
def qi(organizer_list):
    res = requests.get(f"https://www.qichacha.com/search?key={organizer_list}", headers=headers)
    # print(res.text)
    parse(res.text)

# Take the first search result's link and request its detail page
def parse(html):
    root = etree.HTML(html)
    href = root.xpath('//*[@id="search-result"]/tr[1]/td[3]/a/@href')
    href = " ".join(href)
    url = "https://www.qichacha.com" + href
    ane = requests.get(url, headers=headers)
    # print(ane.text)
    New(ane.text)

# Extract the company's email and phone from the detail page
def New(html):
    root = etree.HTML(html)
    global mailbox, phone
    mailbox = root.xpath('//*[@id="company-top"]/div[2]/div[2]/div[3]/div[2]/span[1]/span[2]/a/text()')
    phone = root.xpath('//*[@id="company-top"]/div[2]/div[2]/div[3]/div[1]/span[1]/span[2]/span/text()')
    return (mailbox, phone)
# Crawl Chinaz (the ICP lookup pages)
def search():
    options = Options()
    # Headless: do not open a browser window
    options.add_argument("--headless")
    path = r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver"
    chrome = webdriver.Chrome(executable_path=path, options=options)
    # Number of pages to crawl
    for i in range(1, 11):
        chrome.get(f"http://icp.chinaz.com/provinces?&companytype=&city=%u5168%u56FD&custype=0&companyName=&page={i}")
        # print(chrome.page_source)
        # Number of rows per page
        for d in range(1, 21):
            domain_list = chrome.find_element_by_xpath(f"//tbody[@id='result_table']/tr[{d}]/td[1]/a").text
            organizer_list = chrome.find_element_by_xpath(f"//tbody[@id='result_table']/tr[{d}]/td[2]").text
            nature_list = chrome.find_element_by_xpath(f"//tbody[@id='result_table']/tr[{d}]/td[3]").text
            license_list = chrome.find_element_by_xpath(f"//tbody[@id='result_table']/tr[{d}]/td[5]").text
            web_name_list = chrome.find_element_by_xpath(f"//tbody[@id='result_table']/tr[{d}]/td[6]").text
            web_domain_list = chrome.find_element_by_xpath(f"//tbody[@id='result_table']/tr[{d}]/td[7]/span/a").text
            verify_time_list = chrome.find_element_by_xpath(f"//tbody[@id='result_table']/tr[{d}]/td[8]").text
            # Hand the company name to qi(), which fills the globals mailbox and phone
            qi(organizer_list)
            # Companies with no email/phone on record yield empty xpath lists;
            # fall back to 0 before saving
            mailbox_value = " ".join(mailbox) if mailbox else 0
            phone_value = " ".join(phone) if phone else 0
            # Connect to the database and save the row
            connection = pymysql.connect(host='127.0.0.1', user='root', password='', database='yu', charset="utf8")
            try:
                with connection.cursor() as cursor:
                    sql = "INSERT INTO yu(domain_list,organizer_list,nature_list,license_list,web_name_list,web_domain_list,verify_time_list,mailbox,phone) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)"
                    cursor.execute(sql, (domain_list, organizer_list, nature_list, license_list, web_name_list, web_domain_list, verify_time_list, mailbox_value, phone_value))
                connection.commit()
            finally:
                connection.close()
            print(domain_list, organizer_list, nature_list, license_list, web_name_list, web_domain_list, verify_time_list, mailbox_value, phone_value)
            time.sleep(5)
    chrome.close()
if __name__ == "__main__":
    # Run on a loop by uncommenting the while (and the sleep below)
    # while True:
        search()
        # How long to wait between crawl runs
        # time.sleep()
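
For reference, the INSERT above assumes a table named yu (in a database also named yu) already exists with nine columns. A sketch that creates it; the table and column names come from the script, while the VARCHAR types and sizes are assumptions, since the original post never shows the schema:

import pymysql

# One-off setup: create the table the scraper writes to.
# Column types and sizes are guesses; adjust them to your data.
connection = pymysql.connect(host='127.0.0.1', user='root', password='', database='yu', charset='utf8')
try:
    with connection.cursor() as cursor:
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS yu (
                domain_list      VARCHAR(255),
                organizer_list   VARCHAR(255),
                nature_list      VARCHAR(64),
                license_list     VARCHAR(64),
                web_name_list    VARCHAR(255),
                web_domain_list  VARCHAR(255),
                verify_time_list VARCHAR(64),
                mailbox          VARCHAR(255),
                phone            VARCHAR(64)
            ) DEFAULT CHARSET=utf8
        """)
    connection.commit()
finally:
    connection.close()

Run the setup once, then run the scraper; the fallback value 0 is stored as the string '0' in these VARCHAR columns.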
