Python 爬取豆瓣电影排行榜 TOP250 并保存到数据库或文件

 本篇文章爬取豆瓣电影排行榜 TOP250,用到的工具有:第三方模块 requests(抓取页面)、标准库正则模块 re(提取信息),以及通过 pymysql 访问的 MySQL 数据库(保存结果)。

import requests
import re
import pymysql

class DouBan:
    """Scrape the Douban movie Top 250 list and store one record per movie.

    Every record is a list of four stripped strings, in the order
    ``[movie name, date, country, score]``.  Records are appended to the
    text file ``豆瓣.txt`` by :meth:`writeComment`; :meth:`writeCommentMysql`
    is an alternative sink that inserts them into MySQL instead.

    Note that constructing an instance opens a MySQL connection, so a
    reachable server with the configured credentials is required even when
    only the text-file sink is used.
    """

    # Compiled once instead of on every page.
    # NOTE(review): the HTML tags of this pattern were lost in the published
    # source; the tag names below are reconstructed from the four capture
    # groups (name, date, country, score) and must be confirmed against the
    # live page markup.
    _ITEM_RE = re.compile(
        r'<div class="hd">[\s\S]*?<span class="title">([\s\S]*?)</span>'
        r'[\s\S]*?<br\s*/?>\s*([\s\S]*?)&nbsp;/&nbsp;([\s\S]*?)&nbsp;'
        r'[\s\S]*?<span class="rating_num"[^>]*>([\s\S]*?)</span>'
    )

    # NOTE(review): a browser-like User-Agent is sent because some sites
    # reject the default python-requests one; confirm whether Douban does.
    _HEADERS = {'User-Agent': 'Mozilla/5.0'}

    def __init__(self):
        """Set the page URL template and open the MySQL connection."""
        # '{}' is filled with the offset of the first movie on the page.
        self.baseurl = 'https://movie.douban.com/top250?start={}&filter='
        # Database connection parameters.
        self.host = 'localhost'
        self.user = 'root'
        self.pwd = '123456'
        # Keyword arguments: PyMySQL 1.0+ no longer accepts positional ones.
        self.conn = pymysql.connect(
            host=self.host, user=self.user, password=self.pwd
        )
        self.cur = self.conn.cursor()

    def getHTml(self, url):
        """Download ``url`` and hand the decoded HTML to :meth:`parseHtml`.

        Raises:
            requests.exceptions.RequestException: on connection problems or
                when the server does not answer within the timeout.
        """
        # A timeout keeps an unresponsive server from blocking forever.
        res = requests.get(url, headers=self._HEADERS, timeout=10)
        res.encoding = 'utf-8'
        self.parseHtml(res.text)

    @classmethod
    def _extractInfo(cls, html):
        """Return the stripped capture groups of every movie found in ``html``."""
        return [
            [field.strip() for field in match]
            for match in cls._ITEM_RE.findall(html)
        ]

    def parseHtml(self, html):
        """Extract every movie record from ``html`` and save each one."""
        for each_info in self._extractInfo(html):
            self.writeComment(each_info)

    # -------------------- save to a local file --------------------
    def writeComment(self, info):
        """Append one record to ``豆瓣.txt`` as a single line.

        Every field is followed by one space and the line ends with a
        newline.  File-system errors are reported on stdout and swallowed,
        so one failed write does not abort the crawl.
        """
        line = ''.join(each + ' ' for each in info) + '\n'
        try:
            # Explicit UTF-8 so that writing does not depend on the
            # platform's default encoding.
            with open('豆瓣.txt', 'a', encoding='utf-8') as f:
                f.write(line)
        except OSError:
            print('打开文件错误')

    # -------------------- save to MySQL --------------------
    def writeCommentMysql(self, info):
        """Insert one record into the ``maoyan`` table of database ``maoyan``.

        Alternative to :meth:`writeComment`; the database and table are
        created on first use.  Only the first four fields of ``info`` are
        stored.
        """
        self.cur.execute('create database if not exists maoyan;')
        self.cur.execute('use maoyan')
        self.cur.execute(
            'create table if not exists maoyan('
            'id int primary key auto_increment,'
            'moviename varchar(60),date char(100),'
            'country varchar(20),score varchar(10))default charset="utf8";'
        )
        # Parameterized insert: the driver escapes the scraped values, so
        # quotes inside a title cannot break or alter the statement.
        self.cur.execute(
            'insert into maoyan(moviename,date,country,score) '
            'values(%s,%s,%s,%s);',
            (info[0], info[1], info[2], info[3]),
        )
        self.conn.commit()

    def main(self):
        """Crawl the ten result pages (25 movies each) in order."""
        # When storing into MySQL and a fresh start is wanted, drop the old
        # data first: self.cur.execute('drop database maoyan;')
        for i in range(0, 10):
            url = self.baseurl.format(i * 25)
            print(url)
            self.getHTml(url)
        print('写入完成')


if __name__ == '__main__':
    douban = DouBan()
    douban.main()

 

你可能感兴趣的:(爬虫)