Scraping the US News Global University Rankings

Today I needed data from a foreign university ranking, so I scraped the relevant pages of the US News Best Global Universities site.

The scraped data is stored in a MySQL database.

Site: https://www.usnews.com/education/best-global-universities/search
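The post never shows the table definitions behind the INSERT statements further down, so here is a minimal sketch of a schema consistent with them. Table and column names come straight from the script; the types are my guesses:

# Minimal schema sketch inferred from the INSERT statements in the scraper;
# column types are assumptions, adjust as needed.
import MySQLdb

db = MySQLdb.connect("xxxx", "xxxx", "xxxx", "xxxx")  # host, user, password, db name
cursor = db.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS us_rank (
        id      INT AUTO_INCREMENT PRIMARY KEY,
        `rank`  VARCHAR(16),
        name    VARCHAR(255),
        country VARCHAR(64),
        area    VARCHAR(128),
        subject VARCHAR(64)
    ) DEFAULT CHARSET=utf8
""")
cursor.execute("""
    CREATE TABLE IF NOT EXISTS target_storage (
        id     INT AUTO_INCREMENT PRIMARY KEY,
        name   VARCHAR(255),   -- metric name
        value  VARCHAR(255),   -- metric value or rank
        label  TINYINT,        -- 1 = institution data, 2 = indicator rankings
        school INT             -- us_rank.id of the owning school
    ) DEFAULT CHARSET=utf8
""")
db.commit()
db.close()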

# -*- coding: utf-8 -*-
from lxml import etree
import requests
import MySQLdb

# Open the database connection
db = MySQLdb.connect("xxxx","xxxx","xxxx","xxxx" )
# Make sure the connection talks utf8 to the server
db.set_character_set('utf8')
# Get a cursor for executing statements
cursor = db.cursor()

headers = {
    'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
}
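# NB: the browser-style User-Agent above is presumably what keeps the site from
# rejecting the default python-requests UA; the original post does not say.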
r = requests.get(url='https://www.usnews.com/education/best-global-universities/search?region=&subject=&name=',headers=headers)
html = r.text.encode('utf-8')
result = etree.HTML(html)

# a[last()-1] picks the second-to-last link in the pagination bar;
# its text is taken as the total number of result pages.
allPage = result.xpath('//div[@class="pagination"]//a[last()-1]')
sumPage = allPage[0].text

subjects_list = []
select = result.xpath('//select[@name="subject"]')
allOptions = select[0].xpath('option')
# Drop the first, empty placeholder option
del allOptions[0]
for option in allOptions:
    # Lower-case the subject name and turn it into a URL slug,
    # e.g. 'Computer Science' -> 'computer-science'
    subjects_list.append(option.text.strip().lower().replace(' and ','-').replace('/','-').replace(' ','-'))
pageSize = 0

# Patch by hand: the slug rule above collapses ' and ' to '-', turning
# 'Arts and Humanities' into 'arts-humanities', while the site expects 'arts-and-humanities'
subjects_list[1] = 'arts-and-humanities'
# The [20:] slice skips subjects handled earlier, apparently resuming a previous run
for subject in subjects_list[20:]:
    print subject
    seq1 = []
    link_list = []
    r = requests.get(url='https://www.usnews.com/education/best-global-universities/search?region=&subject=' + subject,headers=headers)
    html = r.text.encode('utf-8')
    result = etree.HTML(html)
    # Page count for this subject (same pagination trick as above)
    allPage = result.xpath('//div[@class="pagination"]//a[last()-1]')
    sumPage = allPage[0].text
    for a in range(int(sumPage)):

        r = requests.get(url='https://www.usnews.com/education/best-global-universities/search?subject=' + subject + '&page=' + str(a+1),headers=headers)
        html = r.text.encode('utf-8')
        result = etree.HTML(html)

        # School names on this page
        schoolNames = result.xpath('//div[@class="block unwrap"]//h2//a')
        for s in schoolNames:
            seq1.append({'name':s.text})

        # Links to each school's detail page
        link_res = result.xpath('//div[@class="block unwrap"]//h2//a/@href')
        for link_r in link_res:
            link_list.append(link_r)

        # Offset of this page's first school within seq1 (robust even when
        # the final page holds fewer results than a full page)
        pageSize = len(seq1) - len(schoolNames)

        # Rankings; strip the leading '#'
        rank_list = result.xpath('//div/span[@class="rankscore-bronze"]')
        for i, rank in enumerate(rank_list):
            seq1[i + pageSize]['ranking'] = rank.text.strip().replace('#','')
        # Country and region: the spans alternate (country, region) per school
        span = result.xpath('//div[@class="block unwrap"]//div[1]/span')
        for i in range(0, len(span), 2):
            seq1[i/2 + pageSize]['country'] = span[i].text
            seq1[i/2 + pageSize]['area'] = span[i+1].text

    print len(seq1)
    print len(link_list)

    # Two per-university detail blocks
    content_list1 = {}
    content_list2 = {}
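    # content_list1 collects the page's "institution data" figures (written to
    # MySQL with label 1 below); content_list2 the "indicator rankings" (label 2).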

    for i, link_l in enumerate(link_list):
        u_link = requests.get(url=link_l, headers=headers)
        html = u_link.text.encode('utf-8')
        result = etree.HTML(html)

        # "Institution data" block: each sub-div holds a (value, metric) pair
        content1 = result.xpath('//div[@id="directoryPageSection-institution-data"]')
        if content1:
            content1_subDivs = content1[0].xpath('div')
            print len(content1_subDivs)
            content_list1[seq1[i]['name']] = []
            for subDiv in content1_subDivs:
                content_list1[seq1[i]['name']].append(dict(
                    value = subDiv.xpath('div')[0].text.strip(),    # the figure
                    key   = subDiv.xpath('div')[1].text.strip()     # the metric name
                ))

        # "Indicator rankings" block: metric name plus a '#N' rank per row
        content2 = result.xpath('//div[@id="directoryPageSection-indicator-rankings"]')
        if content2:
            content2_subDivs = content2[0].xpath('div')
            # The first sub-div is a header, not a data row
            del content2_subDivs[0]
            print len(content2_subDivs)
            content_list2[seq1[i]['name']] = []
            for subDiv in content2_subDivs:
                content_list2[seq1[i]['name']].append(dict(
                    key   = subDiv.xpath('div')[1].text.strip(),    # the metric name
                    value = subDiv.xpath('div/span/span')[0].text.strip().replace('#','')   # the rank
                ))
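        # An entry then looks like (metric name and value purely illustrative):
        #   content_list2[u'Example University'] == [{'key': 'Global score', 'value': '81.3'}, ...]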

    # Map school name -> its auto-increment id in us_rank (used as a foreign key)
    schoolid = {}


    # One ranking row per school for the current subject
    for s in seq1:
        cursor.execute(
            "INSERT INTO us_rank(`rank`,`name`,`country`,`area`,`subject`) VALUES(%s,%s,%s,%s,%s)",
            (s.get('ranking'), s['name'], s['country'], s['area'], subject))
        # Remember the generated primary key for the detail inserts below
        schoolid[s['name']] = db.insert_id()
        db.commit()

    # Institution-data metrics (label 1)
    for k, v in content_list1.items():
        for v_v in v:
            cursor.execute(
                "INSERT INTO target_storage(`name`,`value`,`label`,`school`) VALUES(%s,%s,%s,%s)",
                (v_v['key'], v_v['value'], 1, schoolid[k]))
            db.commit()

    # Indicator rankings (label 2)
    for k, v in content_list2.items():
        for v_v in v:
            cursor.execute(
                "INSERT INTO target_storage(`name`,`value`,`label`,`school`) VALUES(%s,%s,%s,%s)",
                (v_v['key'], v_v['value'], 2, schoolid[k]))
            db.commit()
# except MySQLdb.Error:
#     # Roll back in case of any error
#     db.rollback()


# Close the database connection
db.close()
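The commented-out rollback near the end hints that the inserts were meant to run as a transaction. A minimal sketch of that shape, reusing db, cursor, seq1, and subject from the script above:

# Sketch only: wrap the batch in a transaction so a failure leaves no partial data.
try:
    for s in seq1:
        cursor.execute(
            "INSERT INTO us_rank(`rank`,`name`,`country`,`area`,`subject`) VALUES(%s,%s,%s,%s,%s)",
            (s.get('ranking'), s['name'], s['country'], s['area'], subject))
    db.commit()  # one commit for the whole batch
except MySQLdb.Error:
    db.rollback()  # undo the partial batch on any error
    raise

Committing once per batch instead of once per row also saves a database round-trip per school.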


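One more thing worth flagging: the script fires hundreds of sequential requests with no timeout or retry, so a single network hiccup kills the whole run. A hedged sketch of a more forgiving setup (the function name and retry numbers are mine, not from the post):

# Illustrative only: a requests session with timeouts and automatic retries.
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session():
    session = requests.Session()
    retry = Retry(total=3, backoff_factor=1,
                  status_forcelist=[429, 500, 502, 503, 504])
    session.mount('https://', HTTPAdapter(max_retries=retry))
    session.headers['User-Agent'] = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                                     'AppleWebKit/537.36 (KHTML, like Gecko) '
                                     'Chrome/62.0.3202.94 Safari/537.36')
    return session

session = make_session()
r = session.get('https://www.usnews.com/education/best-global-universities/search',
                timeout=30)
print r.status_code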