# 优信二手车爬虫 — Youxin used-car scraper (blog title; kept as a comment so the file parses)



import re,requests,json
from lxml import etree
import pymysql

class MySql():
    """Thin MySQL helper: owns one connection, one cursor and an insert counter."""

    def __init__(self):
        # Running count of successful inserts; incremented by subclasses.
        self.count = 1
        self.conn_mysql()

    def conn_mysql(self):
        """Open the database connection and create a cursor on it."""
        # NOTE(review): host/password/database are placeholders — fill in real values.
        self.conn = pymysql.connect(
            host='xxxxx',
            user='root',
            password='xxxx',
            charset='utf8',
            database='xxxx',
        )
        # Cursor used for all subsequent statements.
        self.cur = self.conn.cursor()




class YouXin(MySql):
    """Scraper for used-car listings on xin.com (Youxin).

    Inherits the MySQL connection (``self.conn`` / ``self.cur`` / ``self.count``)
    from :class:`MySql`.  Calling the instance crawls every city x brand x
    series listing page and stores each car's details via :meth:`insert_into`.
    """

    def __call__(self, *args, **kwargs):
        """Entry point: iterate every city and every brand, scraping each one."""
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/74.0.3729.108 Safari/537.36",
        }
        # BUGFIX: the original fetched https://www.xin.com/beijing/? here and
        # discarded the result — the request served no purpose and was removed.

        # (cityname, city_url) pairs, e.g. ('北京', 'https://www.xin.com/beijing/')
        self.city_constitute_list = self.get_city_list()
        # (brandname, pinyin) pairs; pinyin is the URL slug for the brand.
        brand_total_list = self.get_brand_list()

        # Visit every city/brand listing page and scrape its series.
        for cityname, city_url in self.city_constitute_list:
            for brandname, brandspell in brand_total_list:
                brand_city_url = '{}{}/'.format(city_url, brandspell)
                self.get_car_series(brand_city_url, cityname, brandname)

    def get_html(self, url, headers=None):
        """Fetch *url* and return ``(raw_html, parsed_lxml_tree)``.

        BUGFIX: the original ignored the *headers* argument and always used
        ``self.headers``; the parameter is now honoured (falling back to
        ``self.headers`` when not given, which keeps all callers working).
        """
        html = requests.get(url, headers=headers or self.headers).text
        return html, etree.HTML(html)

    def get_city_list(self):
        """Return ``[(cityname, city_url), ...]`` from the city AJAX endpoint."""
        base_url = 'https://www.xin.com/apis/Ajax_common/get_home_city/'
        city_html, _ = self.get_html(base_url, self.headers)
        city_json = json.loads(city_html)
        # data.city_all maps cityid -> {'cityname': ..., 'ename': ..., ...}
        city_all = city_json.get('data').get('city_all')

        self.city_constitute_list = []
        for city in city_all.values():
            cityname = city.get('cityname')
            ename = city.get('ename')  # pinyin slug used in listing URLs
            city_url = 'https://www.xin.com/{}/'.format(ename)
            self.city_constitute_list.append((cityname, city_url))
        return self.city_constitute_list

    def get_brand_list(self):
        """Return ``[(brandname, pinyin), ...]`` from the brand AJAX endpoint."""
        base_url = 'https://www.xin.com/apis/Ajax_home/get_home_brand/'
        html, _ = self.get_html(base_url, self.headers)
        json_html = json.loads(html)

        brand_total_list = []
        # 'data' is a list of three {initial_letter: [brand, ...]} dicts.
        for letter_group in json_html.get('data')[:3]:
            for brandname_list in letter_group.values():
                for brand in brandname_list:
                    brand_total_list.append(
                        (brand.get('brandname'), brand.get('pinyin')))
        return brand_total_list

    def get_car_series(self, url, cityname, brandname):
        """Scrape the series filter bar on a city+brand listing page.

        :param url: listing page URL for one city + brand
        :param cityname: city display name (stored with each record)
        :param brandname: brand display name (stored with each record)
        :return: flat list alternating ``[series_url, series_name, ...]``
        """
        html, html_xml = self.get_html(url, headers=self.headers)
        # The union xpath yields hrefs and series names interleaved in
        # document order (href, text, href, text, ...).
        car_series_list = html_xml.xpath(
            '//dl[@id="select2"]/dd[position()>1]/a/text()'
            '|//dl[@id="select2"]/dd[position()>1]/a/@href')

        # hrefs are protocol-relative ("//www.xin.com/...") — complete them.
        for index, item in enumerate(car_series_list):
            if 'com' in item:
                car_series_list[index] = 'http:' + item
        self.get_data(car_series_list, cityname, brandname)
        return car_series_list

    def get_data(self, car_series_list, cityname, brandname):
        """Walk every page of every series and store each car's record."""
        # car_series_list alternates [series_url, series_name, ...] — step by 2.
        for index in range(0, len(car_series_list), 2):
            series_url = car_series_list[index]
            series_name = car_series_list[index + 1]
            page = 1
            while True:
                print('============车系{}-第{}页============'.format(series_name, page))
                # Pagination path segment is "i<page>/".
                page_url = '{}i{}/'.format(series_url, page)
                html, html_xml = self.get_html(page_url, self.headers)

                # One <li> per car on the listing page.
                li_list = html_xml.xpath(
                    '//div[@class="_list-con list-con clearfix ab_carlist"]/ul/li')
                for li in li_list:
                    car_detail_dict = self._parse_listing(li)
                    car_detail_dict.update({
                        'cityname': cityname,
                        'brandname': brandname,
                        'car_series': series_name,
                    })
                    self.insert_into(car_detail_dict)

                page += 1
                # The paginator shows "下一页" (next page) until the last page.
                if '下一页' not in html:
                    break

    @staticmethod
    def _first(node, xpath, default=''):
        """Return the first (stripped) xpath match on *node*, or *default*."""
        result = node.xpath(xpath)
        return result[0].strip() if result else default

    def _parse_listing(self, li):
        """Extract one car's fields from a listing-page ``<li>`` node."""
        pic = self._first(li, './/div[@class="across"]/a/img/@src')
        if pic:
            pic = 'http:' + pic  # img src is protocol-relative
        title = self._first(li, './/div[@class="pad"]/h2/span/text()')
        buy_date = self._first(li, './/div[@class="pad"]/span/text()[1]')
        mileage = self._first(li, './/div[@class="pad"]/span/text()[2]')
        stock_address = self._first(li, './/div[@class="pad"]/span/span/text()')
        # Price text looks like "\n  12.3万 " — keep just the number.
        price = self._first(li, './/div[@class="pad"]/p/em/text()').strip('万').strip()
        detail_url = self._first(li, './/div[@class="across"]/a/@href')
        if detail_url:
            detail_url = 'https:' + detail_url  # href is protocol-relative

        gearbox, lwh = self._parse_detail(detail_url)
        return {
            'pic': pic,
            'title': title,
            'buy_date': buy_date,
            'mileage': mileage,
            'stock_address': stock_address,
            'price': price,
            'detail_url': detail_url,
            'gearbox': gearbox,
            'lwh': lwh,
        }

    def _parse_detail(self, detail_url):
        """Fetch a car's detail page; return ``(gearbox, [length, width, height])``."""
        if not detail_url:
            # BUGFIX: the original called requests.get('') when a listing had
            # no link, which raises; now we just return empty fields.
            return '', ['', '', '']
        _, detail_xml = self.get_html(detail_url, self.headers)
        # BUGFIX: the original guarded on span[1] but read span[2]/a/text(),
        # which could raise IndexError; _first guards the exact value read.
        gearbox = self._first(
            detail_xml,
            '//div[@class="cd_m_i_pz"]/dl[3]/dd[2]/span[2]/a/text()')
        # Length / width / height live in dd[1..3] of the same spec table.
        base = ('//div[@class="cd_m_pop_pzcs_slide"]'
                '/ul/li[3]/dl[1]/dd[{}]/span[2]/text()')
        lwh = [self._first(detail_xml, base.format(i)) for i in (1, 2, 3)]
        return gearbox, lwh

    def insert_into(self, data):
        """Insert one car record into the ``youxin`` table.

        BUGFIX: the original interpolated scraped text straight into the SQL
        string with str.format — any quote in a title broke the statement and
        it was injectable.  Now uses a parameterized query.
        """
        sql = (
            'insert into youxin(cityname,brandname,car_series,pic,title,'
            'buy_date,mileage,stock_address,price,detail_url,gearbox,lwh) '
            'VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        )
        params = (
            data['cityname'], data['brandname'], data['car_series'],
            data['pic'], data['title'], data['buy_date'], data['mileage'],
            data['stock_address'], data['price'], data['detail_url'],
            data['gearbox'], str(data['lwh']),  # list stored as its repr, as before
        )
        try:
            self.cur.execute(sql, params)
            self.conn.commit()
            print(self.count, sql)
            self.count += 1
        except Exception as e:
            # Best-effort: log and roll back, keep crawling.
            print(e)
            self.conn.rollback()

if __name__ == '__main__':
    # BUGFIX: the original also built a stray MySql() whose DB connection was
    # never used or closed; YouXin opens its own connection via inheritance.
    youxin = YouXin()
    youxin()











# 你可能感兴趣的: (爬虫) — blog footer residue, kept as a comment so the file parses