Web scraping: book information under Douban book tags

A reminder up front: crawl in moderation, and treat your crawler code with care.


Straight to the result:
(Screenshot 1)
The content:
(Screenshot 2)
To keep the load on Douban's servers down, I only fetch the first page of books under each tag. Please keep this in mind!
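If you want to be gentler still, one option is to pause for a random moment before every request. This is only a sketch of mine, not part of the script below; the helper name polite_get and the delay bounds are made up for illustration, and only requests plus the standard-library time and random modules are assumed:

import random
import time

import requests


def polite_get(url, headers=None, min_delay=1.0, max_delay=3.0):
    # Sleep a random interval first so consecutive requests never hit the server back to back
    time.sleep(random.uniform(min_delay, max_delay))
    return requests.get(url, headers=headers, timeout=10)

Dropping a delay like this into getHtml below would spread the requests out without changing anything else.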
I won't spell out the steps one by one; everything is in the code. In short: fetch the tag-cloud page, parse out each tag and its URL with XPath, then grab the first page of books for every tag in a process pool and write the results to text files.

import os
import random
import requests

from lxml import etree
from multiprocessing import Pool


def getHtml(url):
    """Fetch a page and return its HTML text (or the status code / exception on failure)."""
    try:
        # Rotate a few user-agent fragments so the requests don't all look identical
        head = ['Mozilla/5.0', 'Chrome/78.0.3904.97', 'Safari/537.36']
        headers = {
            'user-agent': random.choice(head)
        }
        response = requests.get(url=url,
                                headers=headers)  # proxies={'http': '113.204.164.194:8080'})
        if response.status_code == 200:
            return response.text
        else:
            return response.status_code
    except Exception as e:
        return e


def htmlToTree(html):
    # Parse an HTML string into an lxml element tree
    tree = etree.HTML(html)
    return tree


def parseHtml(html):
    """Parse the tag-cloud page and return a list of {tag_name, url, count} dicts."""
    tag_list = []
    tree = htmlToTree(html)
    path = '//div[@id="content"]/div[@class="grid-16-8 clearfix"]/div[@class="article"]//' \
           'div[@class="indent tag_cloud"]/table[@class="tagCol"]/tbody/tr/td'
    data = tree.xpath(path)
    for item in data:
        tag_dict = {}
        # Re-serialize each <td> so it can be queried with its own relative XPath
        td = etree.tostring(item, pretty_print=True, encoding='utf-8').decode('utf-8')
        td_tree = htmlToTree(td)
        tag_name = td_tree.xpath('//a/text()')[0]
        href = td_tree.xpath('//a/@href')[0]
        num = str(td_tree.xpath('//b/text()')[0])[1:-1]  # strip the surrounding parentheses
        tag_dict["tag_name"] = tag_name
        tag_dict["url"] = "https://book.douban.com" + href
        tag_dict["count"] = num
        tag_list.append(tag_dict)
    return tag_list


def getPerTagBookInfo(tagInfo):
    """Fetch the first page of books under one tag and collect their basic info."""
    tag_allbooks = {}
    books_info = []
    tag_name = tagInfo['tag_name']
    html = getHtml(tagInfo['url'])
    tree = htmlToTree(html)
    path = '//div[@id="content"]//div[@class="article"]/div[@id="subject_list"]/ul' \
           '[@class="subject-list"]/li/div[@class="info"]'
    data = tree.xpath(path)
    for item in data:
        book_info = {}
        book = etree.tostring(item, pretty_print=True, encoding='utf-8').decode('utf-8')
        book_tree = htmlToTree(book)
        book_name = book_tree.xpath('//h2/a/text()')[0].strip()
        book_url = book_tree.xpath('//h2/a/@href')[0]
        book_pub = book_tree.xpath('//div[@class="pub"]/text()')[0].strip()
        book_score = book_tree.xpath(
            '//div[@class="star clearfix"]/span[@class="rating_nums"]/text()')
        # Some books have no rating yet; fall back to 0
        if book_score:
            book_score = book_score[0]
        else:
            book_score = 0
        book_info['book_name'] = book_name
        book_info['book_url'] = book_url
        book_info['book_pub'] = book_pub
        book_info['book_score'] = book_score
        books_info.append(book_info)
    tag_allbooks['tag_name'] = tag_name
    tag_allbooks['books_info'] = books_info
    return tag_allbooks


if __name__ == '__main__':
    url = 'https://book.douban.com/tag/?view=cloud'
    html = getHtml(url)
    # print(html)
    data = parseHtml(html)
    alltag_allbooks = []
    pool = Pool()
    # Only the first 5 tags, one page each, to keep the load on Douban low
    for tagInfo in data[:5]:
        tag_allbooks = pool.apply_async(getPerTagBookInfo, args=(tagInfo,))
        # tag_allbooks = getPerTagBookInfo(tagInfo)
        alltag_allbooks.append(tag_allbooks)
    pool.close()
    pool.join()
    # Unwrap the AsyncResult objects into the actual per-tag dicts
    alltag_allbooks = [result.get() for result in alltag_allbooks]
    os.makedirs('source', exist_ok=True)  # make sure the output directory exists
    for alltag_allbook in alltag_allbooks:
        tag_name = alltag_allbook.get('tag_name')
        path = 'source/{}_book.txt'.format(tag_name)
        with open(path, 'a', encoding='utf-8') as f:
            f.write("标签名称 书名 链接 出版信息 评分" + '\n')
            for book in alltag_allbook.get('books_info'):
                info = tag_name + ' ' + book['book_name'] + ' ' + book['book_url'] + ' ' \
                       + book['book_pub'] + ' ' + str(book['book_score'])
                f.write(info + '\n')
    print(alltag_allbooks)

(Screenshot 3)
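One variation worth mentioning, purely as a sketch of mine and not what the screenshots above were produced with: since book titles and publisher strings contain spaces, a space-separated text file is awkward to parse back later. Dumping each tag's dict to JSON keeps the structure intact; the function name save_tag_as_json and the .json path are my own choices for illustration:

import json
import os


def save_tag_as_json(tag_allbooks):
    # tag_allbooks is the dict returned by getPerTagBookInfo above
    os.makedirs('source', exist_ok=True)
    path = 'source/{}_book.json'.format(tag_allbooks['tag_name'])
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(tag_allbooks, f, ensure_ascii=False, indent=2)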
