# Python: scraping Meituan hotel information with Selenium (python 使用 selenium 爬取美团获取酒店信息)

from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ec
from pyquery import PyQuery as pq
import pymongo

# One Chrome session is shared by every function in this script.
browser = webdriver.Chrome()

# Explicit-wait helper bound to that session; each wait gives up after
# 10 seconds and raises TimeoutException.
wait = WebDriverWait(browser, timeout=10)


def _extract_phone_num(text):
    """Return the phone number found in a detail block's text, or ''.

    The number is taken from the third space-separated token with its first
    three characters dropped (presumably a label prefix -- TODO confirm
    against the live page). Text with fewer than three tokens yields ''
    instead of raising IndexError.
    """
    tokens = text.split(' ')
    if len(tokens) < 3:
        return ''
    return tokens[2][3:]


def get_phone_num(url):
    """Load a hotel detail page and print its title and phone number.

    The page is opened in the shared module-level ``browser``; one record is
    printed for every element matching the detail-block selector.

    Args:
        url: Address of the hotel detail page. A missing or empty value is
            skipped, because the listing page may not provide a link for
            every hotel.

    Returns:
        None.
    """
    if not url:
        # Nothing to open; passing None or '' to browser.get() would raise.
        print('酒店详情链接为空, 跳过')
        return

    browser.get(url)
    doc = pq(browser.page_source)
    title = doc('.fs26.fc3.pull-left.bold').text()
    for item in doc('.pull-right.other-detail.H100.bgw.R4Noborder').items():
        detail_text = item.find('.mb10.m20.fc6.fs14 div.mb10').text()
        hotel = {
            'title': title,
            'phoneNum': _extract_phone_num(detail_text),
        }
        print(hotel)
        # NOTE: persistence is disabled here; call save_to_mongo(hotel) to
        # store the record instead of only printing it.


# MongoDB connection settings. The values below are placeholders and must be
# replaced with a real connection string, database name and collection name.
MONGO_URL = 'xxxx'
MONGO_DB = 'xxxx'
MONGO_COLLECTION = 'xxxx'

# Client and database handle shared by save_to_mongo().
client = pymongo.MongoClient(MONGO_URL)
db = client.get_database(MONGO_DB)


def save_to_mongo(product):
    """Insert one record into the configured MongoDB collection.

    This is best-effort: any error raised while inserting is printed and
    swallowed so that a storage failure does not abort the crawl.

    Args:
        product: Document (dict) to insert into ``db[MONGO_COLLECTION]``.

    Returns:
        None.
    """
    try:
        # insert_one() replaces Collection.insert(), which was deprecated in
        # PyMongo 3 and removed in PyMongo 4.
        result = db[MONGO_COLLECTION].insert_one(product)
    except Exception as e:
        print(e.args)
        print('保存到MongoDB失败')
    else:
        if result.acknowledged:
            print('成功保存到MongoDB')


def _split_grade(grade_text):
    """Split a grade label on '分' into ``(score, likes)``.

    Spaces are removed first. ``score`` is the text before the first '分'
    and ``likes`` is the segment after it (up to the next '分', if any).
    When the label contains no '分', ``likes`` is '' instead of raising
    IndexError.
    """
    parts = grade_text.replace(' ', '').split('分')
    score = parts[0]
    likes = parts[1] if len(parts) > 1 else ''
    return score, likes


def get_hotel():
    """Parse the listing page loaded in the browser and visit each hotel.

    Builds one record per ``article.poi-item`` element and then calls
    ``get_phone_num`` with the hotel's detail URL. Hotels without a detail
    link are skipped for the phone lookup.

    Returns:
        None.
    """
    # Snapshot the listing HTML up front: get_phone_num() navigates the
    # shared browser to other pages, but iteration runs over this parsed copy.
    doc = pq(browser.page_source)
    items = doc('.content-view #main-view #list-view .poi-results article.poi-item').items()
    for item in items:
        score, likes = _split_grade(item.find('.poi-grade').text())
        hotel = {
            'title': item.find('.picture-wrapper a img').attr('alt'),
            # Drops the last four characters of the address text (presumably
            # a trailing link label -- TODO confirm against the live page).
            'address': item.find('.poi-address').text()[: -4],
            'cans': item.find('.service-icons').text().replace('\n', ' '),
            'score': score,
            'likes': likes,
            'users': item.find('.poi-buy-num').text(),
            'price': item.find('.poi-price').text(),
            'url': item.find('.poi-title').attr('href'),
        }
        # NOTE: persistence is disabled here; call save_to_mongo(hotel) to
        # store the listing record.
        detail_url = hotel.get('url')
        if detail_url:
            get_phone_num(detail_url)


def get_page(page, max_retries=3):
    """Open the hotel listing, advance to ``page`` and scrape it.

    The listing always starts on its first page, so reaching ``page`` takes
    ``page - 1`` clicks on the paginator's "next" link. A TimeoutException
    restarts the whole attempt, at most ``max_retries`` extra times, instead
    of recursing without limit.

    Args:
        page: 1-based page number to scrape.
        max_retries: How many additional attempts to make after a timeout.

    Returns:
        None. A notice is printed if every attempt times out.
    """
    print('正在爬取第:' + str(page) + ' 页')
    url = 'https://hotel.meituan.com/zhuhai/'
    next_link = (By.CSS_SELECTOR, '.list-page-view ul.paginator li.next a')
    current_marker = (By.CSS_SELECTOR, 'div.paginator-wrapper li.current span')
    first_result = (By.CSS_SELECTOR, '#main-view #list-view article.poi-item')

    def wait_for_results():
        # Same readiness conditions the listing was always checked against.
        wait.until(ec.presence_of_element_located(current_marker))
        wait.until(ec.presence_of_element_located(first_result))

    for attempt in range(max_retries + 1):
        try:
            browser.get(url)
            wait_for_results()
            for _ in range(page - 1):
                wait.until(ec.element_to_be_clickable(next_link)).click()
                # NOTE(review): these presence checks can pass before the
                # list has refreshed; waiting on the current-page label's
                # text would be stricter -- confirm against the live page.
                wait_for_results()
            # Kept inside the try so a timeout raised while scraping also
            # triggers a retry of the page.
            get_hotel()
            return
        except TimeoutException:
            if attempt < max_retries:
                print('第 ' + str(page) + ' 页加载超时, 正在重试')
    print('第 ' + str(page) + ' 页多次超时, 已放弃')


# Number of listing pages to crawl, starting from page 1.
MAX_PAGE = 1
if __name__ == '__main__':
    try:
        for i in range(1, MAX_PAGE + 1):
            get_page(i)
    finally:
        # Always release the Chrome session and the MongoDB connection, even
        # if the crawl fails part-way through.
        browser.quit()
        client.close()

# 你可能感兴趣的 (You may also be interested in): python 使用 selenium 爬取美团获取酒店信息