python数据分析-豆瓣电影Top250

数据爬取源码

详细过程大家可以在其他帖子中看到.
这里采用python2.7以及原生库urllib2和re库进行爬取.

# coding=utf-8
import urllib2
import re
import time


def get_Request(page, timeout=10):
    """Download one page of the Douban Top250 list and return its raw body.

    :param page: value sent as the ``start`` query parameter (the caller
        passes 0, 25, 50, ...).
    :param timeout: seconds to wait on the socket before giving up, so a
        stalled connection cannot hang the crawl forever.
    :return: the response body exactly as read from the server.
    :raises urllib2.URLError: if the request fails or times out.
    """
    url = 'https://movie.douban.com/top250'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36',
        'Host': 'movie.douban.com',
    }
    req = urllib2.Request(url + '?start=' + str(page) + '&filter=', headers=headers)
    response = urllib2.urlopen(req, timeout=timeout)
    try:
        return response.read()
    finally:
        # Python 2 responses are not context managers; close explicitly so
        # the socket is released even when read() raises.
        response.close()


# NOTE(review): this script was recovered from a web page that stripped the
# HTML tags and "&nbsp;" entities out of the regular expressions. The patterns
# below are a reconstruction of what the Top250 list markup is believed to look
# like -- confirm them against a freshly downloaded page before trusting the
# output.
_ITEM_RE = re.compile(r'<div class="info">([\s\S]+?)</div>\s+</div>', re.S)
_TITLE_RE = re.compile(r'<span class="title">(.*?)</span>')
_DIRECTOR_RE = re.compile(r'<p class="">\s+导演:\s+(.*?)&nbsp;&nbsp;&nbsp;')
_DETAILS_RE = re.compile(r'<br>\s+(\d{4,})&nbsp;/&nbsp;(.*?)&nbsp;/&nbsp;(.*?)\s+</p>')
_SCORE_RE = re.compile(r'<span class="rating_num" property="v:average">(\d\.\d+)</span>')

# Column order of every line written to data.txt.
_FIELDS = ('name', 'other_name', 'director', 'year', 'country', 'type', 'score')


def _parse_item(item):
    """Build the record for one movie fragment, or return None if a field is missing."""
    titles = _TITLE_RE.findall(item)
    director = _DIRECTOR_RE.search(item)
    details = _DETAILS_RE.search(item)
    score = _SCORE_RE.search(item)
    if not (titles and director and details and score):
        return None

    if len(titles) > 1:
        # Commas are replaced because save_data() joins the fields with ','.
        other_name = titles[1].replace('&nbsp;/&nbsp;', '').replace(',', ' ')
    else:
        # '0' is the placeholder used when a movie has no second title.
        other_name = '0'

    year, country, genre = details.groups()
    return {
        'name': titles[0],
        'other_name': other_name,
        'director': director.group(1),
        'year': year,
        'country': country.replace(' ', '/'),
        'type': genre.replace(' ', '/'),
        'score': score.group(1),
    }


def get_content(res):
    """Extract one record per movie from a downloaded Top250 page.

    :param res: page source as returned by get_Request().
    :return: list of dicts with the keys name, other_name, director, year,
        country, type and score (all strings).

    A fragment that lacks any required field is skipped. The earlier version
    caught every exception and appended the previous movie's record again
    (or raised NameError when the very first fragment failed).
    """
    records = []
    for item in _ITEM_RE.findall(res):
        record = _parse_item(item)
        if record is not None:
            records.append(record)
    return records


def save_data(data):
    """Append each record to data.txt as one comma-separated line and echo it.

    :param data: list of record dicts as produced by get_content().
    """
    with open('data.txt', 'a') as f:
        for line in data:
            row = ','.join(line[field] for field in _FIELDS)
            f.write(row + '\n')
            # The extra '\n' leaves a blank line between echoed rows.
            print(row + '\n')


def run(page):
    """Download, parse and store the list page that starts at offset ``page``."""
    res = get_Request(page)
    data = get_content(res)
    save_data(data)


if __name__ == "__main__":
    # 250 movies, 25 per page; pause briefly between requests.
    page = 0
    while page < 250:
        run(page=page)
        page += 25
        time.sleep(0.5)
    print('finished data crawl')

数据分析

影片类型分析

这里针对爬取下来的数据集,对类型字段中的"/"进行切割.
分割前:


1.jpg

你可能感兴趣的:(python数据分析-豆瓣电影Top250)