python3 爬取猫眼榜单top100(requests+beautifulsoup)

初学python,记录学习过程。

爬取的url:http://maoyan.com/board/4

共十页,每页通过 offset 参数翻页:第二页的url为 http://maoyan.com/board/4?offset=10 ,第 n 页的 offset 为 (n-1)*10,以此类推。

源码如下:

#猫眼电影TOP100

import requests
from requests.exceptions import RequestException
from bs4 import BeautifulSoup
#找到str在string中最后出现的位置,若没有出现过,则返回-1
def find_last(string,str):
    last_position=-1
    while True:
        position=string.find(str,last_position+1)
        if position==-1:
            return last_position
        last_position=position

def get_html(url, timeout=10, headers=None):
    """Fetch ``url`` and return the response body as text.

    Args:
        url: The address to request with HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely on a stalled connection.
        headers: Optional dict of extra HTTP headers to send.
            NOTE(review): some sites reject the default python-requests
            User-Agent; pass one here if the board answers with a non-200
            status -- confirm against the live site.

    Returns:
        The decoded response text when the status code is 200, otherwise
        None. None is also returned when requests raises a RequestException
        (which includes timeouts).
    """
    # Keep the try body limited to the call that can actually raise.
    try:
        response = requests.get(url, timeout=timeout, headers=headers)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None

def _after_label(text, label):
    """Return the index just past ``label`` and its one-character separator.

    Returns 0 when ``label`` does not occur, so the caller keeps the whole
    text instead of slicing from a bogus offset derived from -1.
    """
    position = text.find(label)
    if position == -1:
        return 0
    return position + len(label) + 1


def parse_html(html):
    """Extract names, cast, release times and scores from a board page.

    Args:
        html: The page markup as a string. A falsy value (for example the
            None produced by a failed download) yields four empty lists.

    Returns:
        A tuple ``(names, stars, times, scores)`` of four lists:
        names come from the link inside each ``.movie-item-info > .name``;
        stars are the ``.star`` texts with the leading '主演' label and the
        trailing newline-plus-indentation removed; times are the
        ``.releasetime`` texts with the leading '上映时间' label removed;
        scores are the ``.integer`` and ``.fraction`` texts concatenated.

    NOTE(review): each list is selected independently, so an entry missing
    one of the fields would shift the lists relative to each other --
    presumably every entry on the page carries all fields; verify.
    """
    if not html:
        return [], [], [], []

    soup = BeautifulSoup(html, "html.parser")

    def text_of(tag):
        # Tag.string is None when a tag has no single string child; fall
        # back to '' so the slicing and concatenation below cannot raise.
        if tag is None or tag.string is None:
            return ''
        return tag.string

    names = [text_of(i.a) for i in soup.select(".movie-item-info > .name")]
    raw_stars = [text_of(i) for i in soup.select(".movie-item-info > .star")]
    raw_times = [text_of(i) for i in soup.select(".movie-item-info > .releasetime")]
    integers = [text_of(i) for i in soup.select(".score > .integer")]
    fractions = [text_of(i) for i in soup.select(".score > .fraction")]

    stars = []
    for s in raw_stars:
        start = _after_label(s, '主演')
        # Cut at the last newline to drop the trailing indentation; if there
        # is none after the cast text, keep everything up to the end.
        end = s.rfind('\n')
        if end < start:
            end = len(s)
        stars.append(s[start:end])

    times = [s[_after_label(s, '上映时间'):] for s in raw_times]
    scores = [x + y for x, y in zip(integers, fractions)]

    return names, stars, times, scores

def write_to_file(names, stars, times, scores):
    """Append one formatted line per movie to ``result.txt``.

    The four sequences are walked in lockstep with ``zip``, so output stops
    at the shortest one. Each line holds the name, cast, release time and
    score, left-aligned in columns of width 15, 20, 17 and 3 and separated
    by single spaces. No header line is written.

    Args:
        names: Movie names.
        stars: Cast strings.
        times: Release-time strings.
        scores: Score strings.
    """
    rows = zip(names, stars, times, scores)
    # The with-block closes the file even if formatting or writing raises.
    with open('result.txt', 'a', encoding='utf-8') as file:
        file.writelines('%-15s %-20s %-17s %-3s\n' % row for row in rows)


def main(offset):
    """Download one board page, parse it and append the rows to the file.

    Args:
        offset: Value for the ``offset`` query parameter of the board URL.
    """
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_html(url)
    # get_html returns None on a request error or a non-200 status; skip the
    # page instead of handing None to the parser.
    if html is None:
        print('Failed to fetch %s; skipping this page.' % url)
        return
    names, stars, times, scores = parse_html(html)
    write_to_file(names, stars, times, scores)

if __name__ == '__main__':
    # Ten pages, requested with offsets 0, 10, ..., 90.
    for offset in range(0, 100, 10):
        main(offset)


结果如下:

(图片:python3 爬取猫眼榜单top100(requests+beautifulsoup)_第1张图片)



你可能感兴趣的:(python)