BeautifulSoup+Request 爬取豆瓣图书Top250

先上代码吧

 

import requests
from bs4 import BeautifulSoup
import pymysql
books=[]  # Global accumulator: one (title, author, score+people, quote, buy_url) tuple per scraped book.
def cow(i):
    """Scrape page *i* (0-based) of the Douban Book Top250 list.

    Appends one (title, author, score+people, quote, buy_url) tuple per book
    to the module-level ``books`` list and returns that list.

    :param i: page index; each page holds 25 books (start=i*25).
    :return: the shared ``books`` list, with this page's entries appended.
    """
    url = "https://book.douban.com/top250?start=%d" % (i * 25)
    # NOTE(review): Douban tends to reject requests with the default
    # python-requests User-Agent — send a browser-like one, and set a
    # timeout so a stalled connection cannot hang the crawl forever.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
    web_data = requests.get(url, headers=headers, timeout=10)
    web_data.raise_for_status()  # fail loudly on 4xx/5xx instead of parsing an error page
    book_data = BeautifulSoup(web_data.text, 'lxml')
    # Each book entry on the page lives in its own <table width="100%">.
    for item in book_data.find_all("table", dict(width="100%")):
        title = item.div.a.text.strip()
        r_title = title.replace("\n", "").replace(" ", "")
        # The 'pl' paragraph mixes author / publisher / price in one string;
        # splitting it out is left for a later pass (see article note below).
        author = item.find('p', {'class': 'pl'}).text.strip()
        score = item.find('span', {'class': 'rating_nums'}).text.strip()
        people = item.find('span', {'class': 'pl'}).text.strip().replace('\n', '').replace(' ', '')
        score_people = score + people
        buy = item.div.a['href']
        # Some books have no one-line quote ("inq" span); use a placeholder.
        inq = item.find('span', {'class': 'inq'})
        quote = inq.text.strip() if inq else 'no description'
        books.append((r_title, author, score_people, quote, buy))
    return books
# Save results to a tab-separated file
def write(books):
    """Append *books* to 'douban_book2.xls' as tab-separated lines.

    Each element of *books* is a 5-tuple
    (title, author, score+people, quote, buy_url); one line per book.

    BUG FIX: the original wrote every record twice — once via ``f.write``
    (without a newline) and again via ``print(..., file=f)`` — producing
    corrupted, duplicated rows. Each record is now written exactly once,
    newline-terminated.
    """
    with open('douban_book2.xls', 'a', encoding='utf-8') as f:
        for book in books:
            f.write(book[0] + '\t' + book[1] + '\t' + book[2] + '\t' + book[3] + '\t' + book[4] + '\n')
def cun(books):
    """Insert the scraped *books* tuples into the MySQL table ``douban_book``.

    Each element of *books* is a 5-tuple
    (title, author, score+people, quote, buy_url) matching the five
    Chinese-named columns in the INSERT statement.

    FIXES vs. original:
    - cleanup order was reversed (connection closed before its cursor);
    - commit ran once per row (slow); now a single commit after all rows;
    - connection is now closed even if an insert raises (try/finally).
    """
    # Connect to the database
    conn = pymysql.connect(
        host='localhost',
        port=3306,
        user='root',
        passwd='123456',
        db='wang',
        charset='utf8mb4',  # NOTE(review): needed for CJK text — confirm table charset matches
    )
    sqli = "insert into douban_book(书名,作者,评分,经典语句,购买链接) values(%s,%s,%s,%s,%s)"
    try:
        with conn.cursor() as cur:  # cursor closed automatically, before the connection
            cur.executemany(sqli, [(b[0], b[1], b[2], b[3], b[4]) for b in books])
        conn.commit()  # one commit for the whole batch
    finally:
        conn.close()
# Crawl all ten pages of the Top250 list (25 books per page).
for page in range(10):
    cow(page)
# To dump the data straight to a tab-separated .xls file instead, use:
# write(books)
# Store the collected data in the MySQL database.
cun(books)

MySQL数据库中的数据表为:

BeautifulSoup+Request 爬取豆瓣图书Top250_第1张图片

从上面的数据表中,可以看出作者一栏中,还含有出版社,价格等的信息,本来还应该利用正则表达式再提取的。这个我过段时间再操作。

 

你可能感兴趣的:(爬虫入门)