python-猫眼电影爬虫

爬取猫眼电影TOP100(http://maoyan.com/board/4?offset=90)
1). 爬取内容: 电影名称、主演、上映时间、图片url地址, 保存到mariadb数据库中;
2). 所有的图片保存到本地/mnt/maoyan/电影名.png (注意: 下面的示例代码实际保存到相对目录 movies/电影名.jpg)


import re
from urllib.request import urlopen
from urllib import request

import pymysql


def getmovies():
    """Scrape the Maoyan TOP100 board into the database and download posters.

    Fetches the ten board pages (offset 0, 10, ..., 90), extracts each film's
    poster URL, title, cast line and release-date line with regular
    expressions, inserts one row per film into table ``movies2`` of database
    ``bank``, and writes every poster to ``movies/<title>.jpg``.

    NOTE(review): in the copy of this script under review the HTML tags
    inside the three regex literals had been stripped and most statements
    were fused onto one line. The patterns below are a reconstruction based
    on the surviving fragments and on how the results are indexed
    (``[0]`` = image URL, ``[1]`` = title) -- confirm them against the live
    page markup before relying on the output.

    Errors raised by ``urlopen`` (network) and by ``pymysql`` (database)
    propagate to the caller; the cursor and connection are closed either way.
    """
    import os

    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0'

    # Two groups -> findall yields (image_url, title) tuples.
    movie_re = re.compile(r'<img data-src="(.*?)" alt="(?P<name>[\u4e00-\u9fa5]+)"')
    star_re = re.compile(r'<p class="star">\s*(.+)\s*</p>')
    release_re = re.compile(r'<p class="releasetime">(.+)</p>')

    # Placeholders instead of string formatting: scraped text may contain
    # quotes, which would break (or inject into) a hand-built statement.
    insert_sql = 'insert into movies2 (电影名字,主演,上映时间,图片url) VALUES (%s,%s,%s,%s);'

    # Expected table, as given by the original author:
    #   create table movies2 (电影名字 varchar(60) not null,
    #                         主演 varchar(200) not null,
    #                         上映时间 varchar(50) not null,
    #                         图片url varchar(200) not null);

    # The poster files are written below this directory.
    os.makedirs('movies', exist_ok=True)

    # NOTE(review): credentials are hard-coded; consider moving them to
    # configuration.
    conn = pymysql.connect(user='root', password='971203', charset='utf8', autocommit=True)
    try:
        conn.select_db('bank')
        cur = conn.cursor()
        try:
            for page in range(10):
                url = 'http://maoyan.com/board/4?offset=%d' % (page * 10)
                req = request.Request(url, headers={'User-Agent': user_agent})
                with urlopen(req) as resp:
                    content = resp.read().decode('utf-8')
                print("正在爬取地址")

                movies = movie_re.findall(content)
                print(movies)
                star = star_re.findall(content)
                print(star)
                release = release_re.findall(content)
                print(release)

                # The three lists are joined by position; if one pattern
                # missed an entry the rows would be silently misaligned.
                if not (len(movies) == len(star) == len(release)):
                    print('skipping %s: matched %d titles, %d cast lines, %d release dates'
                          % (url, len(movies), len(star), len(release)))
                    continue

                for (img_url, name), cast, released in zip(movies, star, release):
                    cur.execute(insert_sql, (name, cast, released, img_url))

                for img_url, name in movies:
                    with urlopen(img_url) as resp:
                        image = resp.read()
                    with open('movies/%s.jpg' % name, 'wb') as f:
                        f.write(image)
        finally:
            cur.close()
    finally:
        conn.close()


if __name__ == '__main__':
    getmovies()

你可能感兴趣的:(python-猫眼电影爬虫)