Python爬虫:爬取城市景点信息

爬取详情页面的所有信息

景点名称、地址、简介、类型、时间、门票等

直接上代码:

import requests
from lxml import etree
from multiprocessing.pool import Pool
# Request headers sent with every HTTP call in this script: a Referer
# pointing at the site and a desktop-browser User-Agent string.
headers = {
    'Referer': 'https://yancheng.cncn.com/jingdian/dazonghu/',
    # The User-Agent is split across lines for readability only; adjacent
    # string literals are concatenated into the same single value.
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/70.0.3538.25 Safari/537.36 '
        'Core/1.70.3722.400 QQBrowser/10.5.3738.400'
    ),
}
def save(content):
    """Append one record to the output file '盐城景区.doc'.

    Args:
        content: Text to append; a newline is written after it.
    """
    # Pin the encoding explicitly: the scraped text may contain non-ASCII
    # characters, and relying on the platform default encoding can raise
    # UnicodeEncodeError or produce a file other tools misread.
    with open('盐城景区.doc', 'a', encoding='utf-8') as f:
        f.write(content + '\n')

def get_detail(href, timeout=10):
    """Download a detail page and return its body as text.

    Args:
        href: URL of the detail page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as decoded by requests (``response.text``).

    Raises:
        requests.exceptions.Timeout: If the server does not respond within
            ``timeout`` seconds.
    """
    # Without a timeout, requests waits indefinitely on a stalled server,
    # which would block this worker forever.
    response = requests.get(href, headers=headers, timeout=timeout)
    return response.text

def paser_pages(resp):
    """Parse a detail page and append its record to the output file.

    The record consists of the first ``<h1>`` text followed by the flattened
    text of every ``<dl>`` found under ``<div class="type">``, one entry per
    line. The record is written through ``save``.

    Args:
        resp: HTML source of a detail page.
    """
    page = etree.HTML(resp)
    # NOTE(review): etree.HTML may yield no tree for empty input; treat
    # that the same as a page without a title.
    if page is None:
        return

    titles = page.xpath('//h1/text()')
    if not titles:
        # A page without an <h1> text node previously raised IndexError
        # and aborted the whole crawl; skip it instead.
        return

    infos = [titles[0]]
    for dl in page.xpath('//div[@class="type"]//dl'):
        # Join all text nodes of the <dl>, drop non-breaking spaces and
        # trim surrounding whitespace.
        detail = ''.join(dl.xpath('.//text()')).replace('\xa0', '').strip()
        infos.append(detail)
    save('\n'.join(infos))

def get_pages(url, timeout=10):
    """Crawl one listing page and process every detail page it links to.

    Each ``<li>`` under ``<div class="city_spots_list">/<ul>`` is inspected
    for a direct ``<a href>``; the linked page is fetched with ``get_detail``
    and handed to ``paser_pages``.

    Args:
        url: URL of the listing page.
        timeout: Seconds to wait for the listing page before giving up.

    Raises:
        requests.exceptions.Timeout: If the listing page does not respond
            within ``timeout`` seconds.
    """
    # A timeout keeps a stalled server from blocking this worker forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    selector = etree.HTML(response.text)
    for item in selector.xpath('//div[@class="city_spots_list"]/ul//li'):
        links = item.xpath('./a/@href')
        if not links:
            # An <li> without a direct <a href> previously raised
            # IndexError; skip it and continue with the remaining items.
            continue
        paser_pages(get_detail(links[0]))

if __name__ == '__main__':
    # Crawl listing pages 1-5 in parallel. Note that Pool comes from
    # multiprocessing.pool, so the work is spread over worker processes,
    # not threads.
    page_href = ['https://yancheng.cncn.com/jingdian/1-{}-0-0.html'.format(i) for i in range(1, 6)]
    pool = Pool()
    try:
        pool.map(get_pages, page_href)
    finally:
        # Always release the workers, even if a crawl task raised.
        pool.close()
        pool.join()

结果:

Python爬虫:爬取城市景点信息_第1张图片

 

你可能感兴趣的:(Python爬虫开发,爬虫开发,爬虫,景点,Python)