今天,因为某种需要,要对国外大学排行榜进行数据的爬取,所以对该网站的一些数据进行了爬取。
并将爬取到的数据存储到 MySQL 数据库中。
网站地址:https://www.usnews.com/education/best-global-universities/search
# -*- coding: utf-8 -*-
import lxml
from lxml import etree
import requests
import MySQLdb
# Open the MySQL connection (host, user, password and database name are
# placeholders - fill in real credentials before running).
db = MySQLdb.connect("xxxx", "xxxx", "xxxx", "xxxx")
# Force UTF-8 on the connection so Chinese/Unicode school names round-trip.
db.set_character_set('utf8')
# Single cursor reused for every INSERT below.
cursor = db.cursor()
# Pretend to be desktop Chrome so the site serves the normal HTML pages.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
# Fetch the search landing page once to discover the list of subjects.
# Parse r.content (raw bytes) so lxml detects the page encoding itself;
# the old r.text.encode('utf-8') re-encoded requests' *guessed* decoding,
# which can garble non-ASCII text when the guess is wrong.
r = requests.get(url='https://www.usnews.com/education/best-global-universities/search?region=&subject=&name=', headers=headers)
result = etree.HTML(r.content)

subjects_list = []
select = result.xpath('//select[@name="subject"]')
allOptions = select[0].xpath('option')
# The first <option> is the empty "all subjects" placeholder - drop it.
del allOptions[0]
for option in allOptions:
    # Normalise each visible label into the slug used in the query string,
    # e.g. "Computer Science" -> "computer-science".
    subjects_list.append(option.text.strip().lower().replace(' and ', '-').replace('/', '-').replace(' ', '-'))
pageSize = 0
# The slug rule above collapses " and " to a single "-", which mangles this
# one label ("arts-humanities"), so patch it by hand.
# NOTE(review): assumes index 1 is "Arts and Humanities" - confirm.
subjects_list[1] = 'arts-and-humanities'
for subject in subjects_list[20:]:
print subject
seq1 = []
link_list = []
r = requests.get(url='https://www.usnews.com/education/best-global-universities/search?region=&subject=' + subject,headers=headers)
html = r.text.encode('utf-8')
# result = etree.tostring(html, pretty_print=True)
# print result
result = etree.HTML(html)
# print etree.tostring(result, pretty_print=True)
allPage = result.xpath('//div[@class="pagination"]//a[last()-1]')
sumPage = allPage[0].text
for a in range(int(sumPage)):
r = requests.get(url='https://www.usnews.com/education/best-global-universities/search?subject=' + subject + '&page=' + str(a+1),headers=headers)
html = r.text.encode('utf-8')
# result = etree.tostring(html, pretty_print=True)
# print result
result = etree.HTML(html)
#拿到学校名字
schoolNames = result.xpath('//div[@class="block unwrap"]//h2//a')
for s in schoolNames:
seq1.append({'name':s.text})
link_res = result.xpath('//div[@class="block unwrap"]//h2//a/@href')
#拿到学校链接
for link_r in link_res:
link_list.append(link_r)
#print link_r
#设置索引
pageSize = len(schoolNames)
pageSize *= a
# print schoolNames[0].text
#拿到排名
rank_list = result.xpath('//div/span[@class="rankscore-bronze"]')
for i,r in enumerate(rank_list):
seq1[i + pageSize]['ranking'] = r.text.strip().replace('#','')
# print rank_list[0].text.strip().replace('#','')
#拿到国家,地区
span = result.xpath('//div[@class="block unwrap"]//div[1]/span')
for i in range(0,len(span),2):
seq1[i/2 + pageSize]['country'] = span[i].text
seq1[i/2 + pageSize]['area'] = span[i+1].text
# print span[0].text,span[1].text
# print seq1
print len(seq1)
print len(link_list)
#大学内部两个大列表
content_list1 = {}
content_list2 = {}
for i,link_l in enumerate(link_list):#[:1]):
i = int(i)
u_link = requests.get(url=link_l,headers=headers)
html = u_link.text.encode('utf-8')
result = etree.HTML(html)
content1 = result.xpath('//div[@id="directoryPageSection-institution-data"]')
#print 'content1: ' + seq1[i]['name']
if content1:
content1_subDivs = content1[0].xpath('div')
print len(content1_subDivs)
index = 0
for subDIv in content1_subDivs:
if index == 0:
content_list1[seq1[i]['name']] = [dict(
value = subDIv.xpath('div')[0].text.strip(), #数量
key = subDIv.xpath('div')[1].text.strip() #指标
)]
index += 1
else:
content_list1[seq1[i]['name']] += [dict(
value = subDIv.xpath('div')[0].text.strip(), #数量
key = subDIv.xpath('div')[1].text.strip() #指标
)]
#print 'content2: ' + seq1[i]['name']
content2 = result.xpath('//div[@id="directoryPageSection-indicator-rankings"]')
if content2:
content2_subDivs = content2[0].xpath('div')
#删除一个没有用的div
del content2_subDivs[0]
print len(content2_subDivs)
index = 0
for subDIv in content2_subDivs:
if index == 0:
content_list2[seq1[i]['name']] = [dict(
value = subDIv.xpath('div')[0].text.strip(), #数量
key = subDIv.xpath('div')[1].text.strip() #指标
)]
index += 1
else:
content_list2[seq1[i]['name']] += [dict(
key = subDIv.xpath('div')[1].text.strip(), #指标
value = subDIv.xpath('div/span/span')[0].text.strip().replace('#','') #排名
)]
# print div
#记录schoolid
schoolid = {}
# 执行sql语句
for s in seq1:
#print s['ranking']
cursor.execute("INSERT INTO us_rank(`rank`,`name`,`country`,`area`,`subject`) VALUES(%s,%s,%s,%s,%s)",(s['ranking'] if s.has_key('ranking') else None,s['name'],s['country'],s['area'],subject))
schoolid[s['name']] = db.insert_id()
print db.insert_id()
# 提交到数据库执行
db.commit()
for k,v in content_list1.items():
for v_v in v:
cursor.execute("INSERT INTO target_storage(`name`,`value`,`label`,`school`) VALUES(%s,%s,%s,%s)",(v_v['key'],v_v['value'],1,schoolid[k]))
# 提交到数据库执行
db.commit()
for k,v in content_list2.items():
for v_v in v:
cursor.execute("INSERT INTO target_storage(`name`,`value`,`label`,`school`) VALUES(%s,%s,%s,%s)",(v_v['key'],v_v['value'],2,schoolid[k]))
# 提交到数据库执行
db.commit()
# except:
#     # Rollback in case there is any error
#     db.rollback()
# Close the database connection; all work was committed per subject above.
db.close()