python 爬一下

1.安装Requests
window:pip install requests
linux:sudo pip install requests
国内安装缓慢,建议到:
http://www.lfd.uci.edu/~gohlke/pythonlibs/


搜索到 requests 并下载
修改后缀名whl为zip并解压,复制requests文件夹到python的lib目录下

 

2.获取网站内容

# Fetch the "python" bar page from Baidu Tieba, sending a desktop-browser
# User-Agent so the server returns the normal page instead of a bot view.
import requests

request_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36',
}
response = requests.get(
    "http://tieba.baidu.com/f?ie=utf-8&kw=python",
    headers=request_headers,
)
print(response.text)

 

 

3.向网页提交数据
get从服务器获取数据
post向服务器发送数据
get通过构造url中的参数来实现功能
post是将数据放在请求体(body)中提交数据

在使用ajax加载数据的时候是不会在源码中显示的,这时候就要发送post请求来获取数据

# POST form fields to the ajax endpoint.  Data loaded via ajax never shows
# up in the static page source, so a POST request is needed to obtain it.
payload = {
    'type': '1',
    'sort': '1',
    'currentPage': '3',
}
response = requests.post(
    "http://xxxxxx/student/courses/searchCourses",
    data=payload,
)
print(response.text)

---------------------------------------------------------------------------------------------

举个小例子,这是从极客学院(jikexueyuan)的视频上记录下来的笔记

import requests
import re

# -*- coding: utf-8 -*-  # NOTE: 编码声明只有放在文件前两行才会生效,应移到文件开头

class spider(object):
    """Simple crawler for jikexueyuan.com course listing pages.

    Builds the paginated listing URLs, downloads each page, extracts the
    per-course ``<li>`` fragments with regular expressions, parses each
    fragment into a dict, and appends the course titles to ``info.txt``.
    """

    def changepage(self, url, total_page):
        """Return the listing URLs from *url*'s current page through
        *total_page* (inclusive).

        *url* must contain a ``pageNum=<digits>`` query parameter; its
        value marks the starting page.
        """
        now_page = int(re.search(r'pageNum=(\d+)', url, re.S).group(1))
        page_group = []
        for i in range(now_page, total_page + 1):
            # Bug fix: the original passed re.S as re.sub's 4th positional
            # argument, which is *count* (max replacements), not *flags*.
            # It only worked by accident because re.S == 16 >= 1.  No flag
            # is needed here at all (the pattern contains no '.').
            link = re.sub(r'pageNum=\d+', 'pageNum=%s' % i, url)
            page_group.append(link)
        return page_group

    def getsource(self, url):
        """Download *url* and return the response body as text."""
        html = requests.get(url)
        return html.text

    def geteveryclass(self, source):
        """Return every ``<li id=...>...</li>`` course fragment in *source*."""
        everyclass = re.findall(r'(<li id=.*?</li>)', source, re.S)
        return everyclass

    def getinfo(self, eachclass):
        """Parse one course fragment into a dict with keys ``title``,
        ``content``, ``classtime``, ``classlevel`` and ``learnnum``.

        Raises AttributeError (failed ``re.search``) or IndexError if the
        fragment does not match the expected markup.
        """
        info = {}
        info['title'] = re.search(r'alt="(.*?)"', eachclass, re.S).group(1)
        info['content'] = re.search(r'display: none;">(.*?)</p>', eachclass, re.S).group(1)
        # The first two plain <em> tags hold duration and difficulty level.
        timeandlevel = re.findall(r'<em>(.*?)</em>', eachclass, re.S)
        info['classtime'] = timeandlevel[0]
        info['classlevel'] = timeandlevel[1]
        info['learnnum'] = re.search(r'"learn-number">(.*?)</em>', eachclass, re.S).group(1)
        return info

    def saveinfo(self, classinfo):
        """Append each course's title to ``info.txt``, one per line.

        Only the title is persisted (matching the original behaviour).
        The original's commented-out lines for the other fields also had
        an index bug (``each['content'+'\\n']``) and were removed.
        """
        # 'a' = append mode; the context manager guarantees the file is
        # closed even if a write fails, and an explicit encoding avoids
        # depending on the platform default for the Chinese titles.
        with open('info.txt', 'a', encoding='utf-8') as f:
            for each in classinfo:
                f.write('title:' + each['title'] + '\n')




if __name__ == '__main__':

    classinfo = []#list that will hold one info dict per course
    url = 'http://www.jikexueyuan.com/course/?pageNum=1'
    jikespider = spider()#instantiate the crawler
    all_links = jikespider.changepage(url,2)#build URLs for pages 1-2 (NOT 20 pages as the original note claimed)
    for link in all_links:
        print('读取文件:'+link)
        html = jikespider.getsource(link)#download the page HTML
        everyclass = jikespider.geteveryclass(html)#extract every <li> course fragment on this page
        for each in everyclass:
            info = jikespider.getinfo(each)#parse one fragment into a field dict
            classinfo.append(info)#collect into the result list
    jikespider.saveinfo(classinfo)#append titles to info.txt

 

你可能感兴趣的:(python 爬一下)