import requests
import re
import redis
from lxml import etree
import pymysql
#获取城市数据
class City():
    """Collect the per-city listing URLs of xin.com and queue them in Redis.

    Calling an instance runs :meth:`get_city`.  Subclasses reuse ``self.r``
    (the Redis client) and :meth:`get_html`.
    """

    def __init__(self):
        # Redis client shared by every stage of the crawl.
        self.r = self.get_redis()

    def __call__(self, *args, **kwargs):
        self.get_city()

    def get_redis(self):
        """Return a Redis client for db 1 of the local Redis server.

        Intermediate URLs are stored in Redis so that later stages read them
        from there instead of requesting the same pages from the site again.
        """
        return redis.Redis(host='127.0.0.1', port=6379, db=1)

    @staticmethod
    def _collect_enames(node):
        """Return every string stored under an ``'ename'`` key, in document order.

        ``node`` is decoded JSON (nested dicts/lists/scalars).  The structure
        is walked directly instead of regex-matching its ``repr``, so values
        containing quote characters are no longer lost or truncated.
        """
        names = []
        if isinstance(node, dict):
            for key, value in node.items():
                if key == 'ename' and isinstance(value, str):
                    names.append(value)
                else:
                    # Containers (and non-string 'ename' values) may still
                    # hold nested city records.
                    names.extend(City._collect_enames(value))
        elif isinstance(node, (list, tuple)):
            for item in node:
                names.extend(City._collect_enames(item))
        return names

    def get_city(self):
        """Fetch the city list and append one listing URL per city to ``city_url``.

        The URLs are appended to the Redis list ``city_url``; existing entries
        are left untouched, so running this twice stores every URL twice.
        """
        interface_url = "https://www.xin.com/apis/Ajax_common/get_home_city/"
        payload = requests.get(interface_url).json()
        city_urls = [
            "https://www.xin.com/{}/s/?channel=a49b117c44837d110753e751863f53".format(city)
            for city in self._collect_enames(payload)
        ]
        # RPUSH needs at least one value, so skip the call when nothing was found.
        if city_urls:
            self.r.rpush("city_url", *city_urls)

    def get_html(self, url):
        """Download ``url`` and return ``(raw_html_text, parsed_lxml_tree)``."""
        response = requests.get(url)
        html = response.text
        return html, etree.HTML(html)
# 获取所有车的品牌:
class AutomobileBrand(City):
    """Visit every stored city page and extract the car brands linked on it.

    Calling an instance runs :meth:`get_brand`.
    """

    def __call__(self, *args, **kwargs):
        self.get_brand()

    def get_brand(self):
        """Read the ``city_url`` list from Redis and parse each page's brand links.

        For every city page the brand names and brand hrefs are queried
        separately and paired by position; a page with fewer names than hrefs
        raises ``IndexError``.
        """
        for raw_city_url in self.r.lrange('city_url', 0, -1):
            # Redis returns bytes; the page fetcher wants text.
            _, page = self.get_html(raw_city_url.decode('utf-8'))
            names = page.xpath('//ul//li[position()>1]/dl/dd/a/text()')
            hrefs = page.xpath('//ul//li[position()>1]/dl/dd/a/@href')
            for position, href in enumerate(hrefs):
                # hrefs lack a scheme, so "https:" is prepended.
                brand_url = 'https:' + href
                brand_name = names[position].strip()
                # NOTE(review): nothing is persisted here -- the push below is
                # disabled, although Car.get_system reads 'brand_url_name' as
                # alternating url/name entries.  Confirm whether it should be
                # re-enabled.
                # self.r.rpush('brand_url_name', brand_url, brand_name)
# 准备获取数据:
class Car(AutomobileBrand):
    """Download the listings of every stored brand and save them to MySQL.

    Calling an instance opens the MySQL connection, walks
    brand -> series -> result page -> listing, inserts one ``youxin`` row per
    listing, and finally closes the connection.
    """

    # Image URL that marks a not-yet-loaded picture; the real URL is then
    # expected in the ``data-original`` attribute.
    DEFAULT_PIC = '//s6.xinstatic.com/www/img/default.png'
    # Text found on a series page that has no listings at all.
    NO_STOCK_TEXT = "小优还没有为您准备好车源"
    # Values are bound by the driver instead of being formatted into the SQL.
    INSERT_SQL = (
        "insert into youxin "
        "(car_pic,car_name,car_year,car_km,car_house,"
        "car_first_money,car_mouth_money,car_price,car_details) "
        "VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    )

    def __call__(self, *args, **kwargs):
        # Running counter of successfully inserted rows (used in progress output).
        self.count = 1
        self.conn_mysql()
        try:
            self.get_system()
        finally:
            # Release the database resources even if the crawl fails midway.
            self.cur.close()
            self.conn.close()

    def get_system(self):
        """Visit every stored brand page and crawl each car series linked on it.

        ``brand_url_name`` is read from Redis as alternating url/name entries,
        so only every second element (the URL) is used.
        """
        brand_url_name = self.r.lrange('brand_url_name', 0, -1)
        for raw_brand_url in brand_url_name[::2]:
            brand_url = raw_brand_url.decode('utf-8')
            html, html_xml = self.get_html(brand_url)
            # Name and href are taken from the same <a> element so they can
            # never get out of step with each other.
            for link in html_xml.xpath('//div[@id="search_serial"]//ul//div/li/a'):
                href = link.get('href')
                if not href:
                    continue
                series_name = ''.join(link.xpath('./text()'))
                print('=============正在下载车系{}==============='.format(series_name))
                car_sys = 'https:' + href
                print(car_sys)
                self.get_page(car_sys)

    def get_page(self, car_sys):
        """Download every result page of the series at ``car_sys``.

        The page count comes from the pager's ``data-page`` attributes.  A
        series without a pager is treated as a single page, unless the page
        carries the "no stock" text, in which case nothing is downloaded.
        """
        html, html_xml = self.get_html(car_sys)
        pager = html_xml.xpath('//div[@class="con-page search_page_link"]//a/@data-page')
        if pager:
            # The original read the second-to-last link (the last one is
            # presumably "next"); the largest page number is the same value on
            # such a pager and also works when only one link is present.
            page_numbers = [int(p) for p in pager if p.strip().isdigit()]
            last_page = max(page_numbers) if page_numbers else 1
        elif self.NO_STOCK_TEXT in html:
            return
        else:
            last_page = 1
        for page in range(1, last_page + 1):
            print('===================第{}页开始下载=================='.format(page))
            self.get_data(car_sys + "i{}/".format(page))

    def get_data(self, page_url):
        """Parse every listing on ``page_url`` and insert it into MySQL.

        A listing whose markup lacks a required field is reported and skipped
        on its own, so it no longer aborts the remaining listings and pages.
        """
        html, html_xml = self.get_html(page_url)
        li_list = html_xml.xpath('//div[@class="_list-con list-con clearfix ab_carlist"]/ul/li')
        for li in li_list:
            try:
                car_dict = self._parse_listing(li)
            except IndexError as e:
                print(e)
                continue
            self.insert_mysql(car_dict)

    def _parse_listing(self, li):
        """Extract one listing ``<li>`` into the dict expected by ``insert_mysql``.

        Raises ``IndexError`` when the name, year, mileage, warehouse or detail
        link is missing.  Picture, payment terms and price fall back to ``''``.
        """
        # Picture: use the lazy-load attribute when src is empty or a placeholder.
        pics = li.xpath('.//a/img/@src')
        car_pic = pics[0] if pics else ''
        if not car_pic or self.DEFAULT_PIC in car_pic:
            lazy_pics = li.xpath('.//a/img/@data-original')
            if lazy_pics:
                car_pic = lazy_pics[0]
        if car_pic.startswith('//'):
            car_pic = "https:" + car_pic
        # Drop the "_18" marker from the image URL, as the original code did.
        car_pic = car_pic.replace('_18', '')

        car_name = li.xpath('.//h2/span/text()')[0]

        # First text node holds the year, the second one the mileage.
        pad_texts = li.xpath('.//div[@class="pad"]/span/text()')
        car_year = pad_texts[0].strip().replace('年', '')
        car_km = pad_texts[1].strip()
        car_house = li.xpath('.//div[@class="pad"]/span/span/text()')[0]

        # Down payment and monthly payment share one whitespace-separated
        # text; either of them may be absent.
        pay_text = ','.join(
            li.xpath('.//div[@class="pad"]/span[@class="pay-price"]/text()')
        ).replace('\n', '')
        pay_parts = pay_text.split()
        car_first_money = pay_parts[0] if pay_parts else ''
        car_mouth_money = pay_parts[1] if len(pay_parts) > 1 else ''

        # Total price: looked up inside this listing.  The original queried
        # the whole page, which gave every row of a page the same price.
        # NOTE(review): assumes the first <em> of a listing that contains
        # "万" is its total price -- confirm against the live markup.
        car_price = ''
        for em_text in li.xpath('.//em/text()'):
            price_match = re.search(r'(.*?)万', em_text.strip().replace('\n', ''))
            if price_match:
                car_price = price_match.group(1).strip()
                break

        car_details = "https:" + li.xpath('.//a/@href')[0]

        return {
            'car_pic': car_pic,
            'car_name': car_name,
            'car_year': car_year,
            'car_km': car_km,
            'car_house': car_house,
            'car_first_money': car_first_money,
            'car_mouth_money': car_mouth_money,
            'car_price': car_price,
            'car_details': car_details,
        }

    def insert_mysql(self, car_dict):
        """Insert one listing into the ``youxin`` table.

        Values are passed as query parameters, so quotes in scraped text can
        neither break nor alter the statement.  A failed insert is printed and
        rolled back; it does not stop the crawl.
        """
        params = (
            str(car_dict['car_pic']),
            str(car_dict['car_name']),
            # NOTE(review): the leading ":" reproduces what the original
            # statement stored (':{}'); it looks like a typo -- confirm before
            # removing it, existing rows carry it too.
            ':{}'.format(car_dict['car_year']),
            str(car_dict['car_km']),
            str(car_dict['car_house']),
            str(car_dict['car_first_money']),
            str(car_dict['car_mouth_money']),
            str(car_dict['car_price']),
            str(car_dict['car_details']),
        )
        try:
            self.cur.execute(self.INSERT_SQL, params)
            self.conn.commit()
            # mogrify renders the statement with the bound values for the log.
            print(self.count, self.cur.mogrify(self.INSERT_SQL, params))
            self.count += 1
        except Exception as e:
            # Best effort: report the failure, undo the statement, keep crawling.
            print(e)
            self.conn.rollback()

    def conn_mysql(self):
        """Open the MySQL connection and cursor used by ``insert_mysql``."""
        # NOTE(review): credentials are hard-coded; consider loading them from
        # configuration.
        self.conn = pymysql.Connect(host='127.0.0.1', user='root', password='admin', database='02180530', charset='utf8')
        self.cur = self.conn.cursor()
        # Printing the connection object confirms the connection succeeded.
        print(self.conn)
if __name__ == '__main__':
    # Only the final stage (listings -> MySQL) is run; the two earlier stages
    # that fill Redis are left disabled.
    # City()()
    # AutomobileBrand()()
    Car()()