# Python crawler: scrape Youxin (xin.com) used-car listings

import requests
import re
import redis
from lxml import etree
import pymysql
#获取城市数据
# Stage 1: collect city data and seed Redis with per-city listing URLs.
class City():
    """Fetch the city list from xin.com and queue one search URL per city.

    URLs are pushed onto the Redis list ``city_url`` so later stages can
    consume them from the queue instead of re-hitting the site.
    """

    def __init__(self):
        # Redis connection shared by this class and its subclasses.
        self.r = self.get_redis()

    def __call__(self, *args, **kwargs):
        self.get_city()

    def get_redis(self):
        """Return a Redis client (db 1) used as the crawl queue/cache."""
        return redis.Redis(host='127.0.0.1', port=6379, db=1)

    def get_city(self):
        """Download the city endpoint and push one search URL per city ename
        onto the Redis list ``city_url``."""
        interface_url = "https://www.xin.com/apis/Ajax_common/get_home_city/"
        # The payload is deeply nested; regexing the dict repr for every
        # 'ename' value captures city enames at any nesting depth without
        # knowing the exact schema.  (The original also did an unused
        # ``import json`` and printed the whole payload — both removed.)
        json_data = str(requests.get(interface_url, timeout=10).json())
        city_names = re.findall(r"'ename': '(.*?)'", json_data)
        for city in city_names:
            city_url = "https://www.xin.com/{}/s/?channel=a49b117c44837d110753e751863f53".format(city)
            # Queue the city URL for the brand-collection stage.
            self.r.rpush("city_url", city_url)

    def get_html(self, url):
        """GET *url* and return ``(raw_html_text, parsed_lxml_tree)``."""
        # Timeout added so a stalled request cannot hang the whole crawl.
        response = requests.get(url, timeout=10)
        html = response.text
        return html, etree.HTML(html)
# 获取所有车的品牌:
# Stage 2: collect every car-brand link from each queued city page.
class AutomobileBrand(City):
    """Consume city URLs queued by :class:`City` and store each brand's
    (url, name) pair in the Redis list ``brand_url_name``."""

    def __call__(self, *args, **kwargs):
        self.get_brand()

    def get_brand(self):
        """Scrape the brand filter list of every city page and push
        alternating url/name entries onto Redis list ``brand_url_name``.

        The original left the ``rpush`` commented out, so nothing ever fed
        ``Car.get_system``; restoring it repairs the pipeline.
        """
        city_urls = self.r.lrange('city_url', 0, -1)
        for city_url in city_urls:
            # Redis returns bytes; decode before requesting.
            html, html_xml = self.get_html(city_url.decode('utf-8'))
            brand_names = html_xml.xpath('//ul//li[position()>1]/dl/dd/a/text()')
            brand_urls = html_xml.xpath('//ul//li[position()>1]/dl/dd/a/@href')
            # hrefs are protocol-relative; zip keeps url/name pairs aligned
            # even if the two node lists differ in length.
            for href, name in zip(brand_urls, brand_names):
                brand_url = 'https:' + href
                brand_name = name.strip()
                # Flat layout — url at even indexes, name at odd; Car.get_system
                # relies on this when it steps through the list by 2.
                self.r.rpush('brand_url_name', brand_url, brand_name)
# 准备获取数据:
# Stage 3: crawl car series, pages and listings; persist rows to MySQL.
class Car(AutomobileBrand):
    """Walk brand pages -> car series -> paginated listings, extract one
    record per car and insert it into the MySQL table ``youxin``."""

    def __call__(self, *args, **kwargs):
        self.count = 1          # running counter of successfully inserted rows
        self.conn_mysql()
        self.get_system()

    def get_system(self):
        """Read brand (url, name) pairs from Redis and crawl each brand's
        car-series links."""
        brand_url_name = self.r.lrange('brand_url_name', 0, -1)
        # Entries alternate url/name, so step by 2 over the urls only.
        for index in range(0, len(brand_url_name), 2):
            brand_url = brand_url_name[index].decode('utf-8')
            html, html_xml = self.get_html(brand_url)
            series_names = html_xml.xpath('//div[@id="search_serial"]//ul//div/li/a/text()')
            series_urls = html_xml.xpath('//div[@id="search_serial"]//ul//div/li/a/@href')
            for name, href in zip(series_names, series_urls):
                print('=============正在下载车系{}==============='.format(name))
                # hrefs are protocol-relative.
                car_sys = 'https:' + href
                self.get_page(car_sys)

    def get_page(self, car_sys):
        """Determine the series' max page count, then crawl every page."""
        html, html_xml = self.get_html(car_sys)
        max_page = html_xml.xpath('//div[@class="con-page search_page_link"]//a/@data-page')
        if not max_page:
            if "小优还没有为您准备好车源" in html:
                # Series has no listings — nothing to crawl.
                return
            # Pager absent means a single page of results.  Mimic the pager
            # layout, where the second-to-last entry is the max page number.
            max_page = ["1", "2"]
        try:
            last_page = int(max_page[-2])
        except (IndexError, ValueError) as e:
            # Unexpected pager markup — log and skip this series instead of
            # silently swallowing every error in the loop as before.
            print(e)
            return
        for page in range(1, last_page + 1):
            print('===================第{}页开始下载=================='.format(page))
            page_url = car_sys + "i{}/".format(page)
            self.get_data(page_url)

    def get_data(self, page_url):
        """Extract every listing on one results page and insert each row."""
        html, html_xml = self.get_html(page_url)
        li_list = html_xml.xpath('//div[@class="_list-con list-con clearfix ab_carlist"]/ul/li')
        for li in li_list:
            self.insert_mysql(self._parse_listing(li))

    def _parse_listing(self, li):
        """Build the record dict for a single ``<li>`` listing element."""
        # Picture: lazy-loaded images keep a placeholder in @src and the real
        # url in @data-original.  (Substring test kept from the original: any
        # fragment of the placeholder url counts as the placeholder.)
        car_pic = li.xpath('.//a/img/@src')
        if car_pic[0] in '//s6.xinstatic.com/www/img/default.png':
            car_pic = li.xpath('.//a/img/@data-original')
        car_pic = car_pic[0] if 'https:' in car_pic[0] else 'https:' + car_pic[0]
        # Drop the thumbnail size suffix to get the full-size image.
        car_pic = car_pic.replace('_18', '')
        car_name = li.xpath('.//h2/span/text()')[0]
        pad_spans = li.xpath('.//div[@class="pad"]/span/text()')
        car_year = pad_spans[0].strip().replace('年', '')
        car_km = pad_spans[1].strip()
        car_house = li.xpath('.//div[@class="pad"]/span/span/text()')[0]
        # Down payment / monthly payment: split on any whitespace run instead
        # of the original brittle fixed-width (60-space) split, which also
        # crashed with IndexError when the monthly figure was absent.
        pay_tokens = ' '.join(
            li.xpath('.//div[@class="pad"]/span[@class="pay-price"]/text()')
        ).split()
        car_first_money = pay_tokens[0] if pay_tokens else ''
        car_mouth_money = pay_tokens[1] if len(pay_tokens) > 1 else ''
        # NOTE(review): the original read the price from the page-wide
        # ``.//em`` list at fixed index [1], so every row on a page got the
        # same value; scoped to the listing here — confirm against live markup.
        car_price = ''
        price_texts = li.xpath('.//em/text()')
        if price_texts:
            match = re.findall('(.*?)万', price_texts[0].strip().replace('\n', ''))
            if match:
                car_price = match[0].strip()
        # Detail-page link is protocol-relative.
        car_details = 'https:' + li.xpath('.//a/@href')[0]
        return {
            'car_pic': car_pic,
            'car_name': car_name,
            'car_year': car_year,
            'car_km': car_km,
            'car_house': car_house,
            'car_first_money': car_first_money,
            'car_mouth_money': car_mouth_money,
            'car_price': car_price,
            'car_details': car_details,
        }

    def insert_mysql(self, car_dict):
        """Insert one car record into the MySQL table ``youxin``.

        Uses a parameterized query so scraped text (quotes, backslashes)
        cannot break or inject into the SQL.  The original formatted values
        straight into the statement and also wrote ``car_year`` with a stray
        ``:`` prefix in the VALUES clause.
        """
        sql = (
            "insert into youxin (car_pic,car_name,car_year,car_km,car_house,"
            "car_first_money,car_mouth_money,car_price,car_details) "
            "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        )
        values = (
            car_dict['car_pic'], car_dict['car_name'], car_dict['car_year'],
            car_dict['car_km'], car_dict['car_house'],
            car_dict['car_first_money'], car_dict['car_mouth_money'],
            car_dict['car_price'], car_dict['car_details'],
        )
        try:
            self.cur.execute(sql, values)
            # Commit each row so a later failure does not lose earlier rows.
            self.conn.commit()
            print(self.count, values)
            self.count += 1
        except pymysql.MySQLError as e:
            print(e)
            self.conn.rollback()

    def conn_mysql(self):
        """Open the MySQL connection and cursor used by insert_mysql."""
        self.conn = pymysql.Connect(host='127.0.0.1', user='root', password='admin',
                                    database='02180530', charset='utf8')
        self.cur = self.conn.cursor()
        # Printing the connection object signals a successful connect.
        print(self.conn)
if __name__ == '__main__':
    # Stages 1 (City) and 2 (AutomobileBrand) are assumed to have populated
    # Redis already; uncomment them for a fresh crawl from scratch.
    # city = City(); city()
    # brand = AutomobileBrand(); brand()
    crawler = Car()
    crawler()

# (CSDN footer) Related: python爬虫---爬取优信二手车