【爬虫实践】接口访问中保持cookie持续更新有效

登录cookie的获取:

很多网站访问时需要登录方可请求数据,所以需要模拟登录并保存cookie,核心代码如下:

def auto_login(num, url):
    """Drive a Chrome browser through the site's SMS login flow and save cookies.

    Args:
        num: phone number typed into the SMS-login field.
        url: start page containing the link that opens the login window.

    Returns:
        ('', '') if the user aborts or the login fails/times out,
        otherwise None after the cookie list is written to 'cookie.json'.
    """
    chrome_options = webdriver.ChromeOptions()
    # Suppress chromedriver console logging noise.
    chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])
    driver = webdriver.Chrome(executable_path=r"C:\python_work\chromedriver.exe",
                              chrome_options=chrome_options)
    # Hide the navigator.webdriver flag so the site's bot detection does not
    # immediately reject the automated browser.
    driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
        "source": """
        Object.defineProperty(navigator,'webdriver',{
        get:() => undefined
        })
        """
    })
    driver.set_window_size(800, 600)
    driver.set_window_position(500, 0)

    driver.get(url)
    time.sleep(3)
    driver.find_element_by_xpath('/html/body/div[3]/div[4]/div/div/div/div/div/div[1]/div[12]/a/img').click()
    time.sleep(3)
    # The login page opens in a new window: switch to the newest handle.
    driver.switch_to.window(driver.window_handles[-1])
    referer = driver.current_url
    try:
        # Click the login button if this page variant shows one.
        driver.find_element_by_xpath('//*[@id="app"]/div/div[2]/div/div[2]/button').click()
    except Exception:
        pass  # best-effort: some page variants skip this step
    time.sleep(2)
    try:
        # Switch into the login iframe if the form is embedded in one.
        driver.switch_to.frame(driver.find_element_by_xpath('//*[@id="loginchinatype"]/iframe'))
    except Exception:
        pass  # best-effort: the form may already be in the main document
    try:
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.XPATH, '//*[@id="login"]/div[2]/div/div[1]/a[2]')))
        # Switch to the SMS-login tab (clicked twice, as on the original page flow).
        driver.find_element_by_xpath('//*[@id="login"]/div[2]/div/div[1]/a[2]').click()
        driver.find_element_by_xpath('//*[@id="login"]/div[2]/div/div[1]/a[2]').click()
        driver.find_element_by_xpath('//*[@id="fm-sms-login-id"]').send_keys(num)
        # Request the SMS verification code; this pops up the slider captcha.
        driver.find_element_by_xpath('//*[@id="login-form"]/div[2]/div[3]/a').click()
        time.sleep(2)
        huakuai(driver, '2')  # solve the slider captcha (helper defined elsewhere)
        yzm = input('请输入手机上的验证码(输入1放弃登录):')
        if yzm == '1':
            # BUG FIX: quit the browser on abort (original leaked the driver).
            driver.quit()
            return '', ''
        driver.find_element_by_xpath('//*[@id="fm-smscode"]').send_keys(yzm)
        driver.find_element_by_xpath('//*[@id="login-form"]/div[5]/button').click()
        time.sleep(5)
        driver.refresh()
        input('是否可以获取cookie?未登录成功则手动操作一下,成功的话回车继续!')
        cookie_list = driver.get_cookies()
        cookies = ''.join('{}={}; '.format(ck['name'], ck['value']) for ck in cookie_list)
        driver.quit()
        if '__cn_logon__=false;' in cookies:
            print(num, '登录失败,重新登录!')
            # BUG FIX: do not fall through and persist cookies from a failed login.
            return '', ''
    except Exception:
        print('等待20秒仍未获取登陆窗口,将关闭浏览器重新登录!')
        driver.quit()
        # BUG FIX: original fell through to json.dump with no cookies collected.
        return '', ''

    # BUG FIX: the original dumped an undefined name `json_list` (NameError);
    # persist the cookie list actually retrieved from the browser. The `with`
    # statement closes the file, so the redundant fp.close() is dropped.
    with open('cookie.json', 'w') as fp:
        json.dump(cookie_list, fp)
    print('cookie存储成功!')

cookie保持有效,获取set-cookie:

加载cookies文件,使用requests库爬取数据并动态更新cookies,可以使cookies不失效。这里借鉴大佬的代码,源自:https://blog.csdn.net/data_scientist/article/details/75218792

import pickle
import time
import requests
import random

class Spider:
    """Minimal 51job session wrapper that keeps login cookies fresh.

    Cookies are loaded from a file produced by a separate login program, then
    every Set-Cookie header seen on subsequent requests is merged back into
    the session so the login does not expire.
    """

    def __init__(self, domain='51job.com'):
        """Create a requests session preconfigured with 51job headers."""
        self.headers_51job = {
            'Host': 'ehire.51job.com',
            'Origin': 'http://ehire.51job.com',
            'User-Agent': """Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36""",
            'Accept': """text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8""",
            'Accept-Language': """zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3""",
            'Referer': """http://ehire.51job.com/Jobs/JobSearchPost.aspx?IsHis=N""",
            'Upgrade-Insecure-Requests': '1',
            'Connection': 'keep-alive'
        }
        self.s = requests.Session()
        self.s.headers = self.headers_51job
        self.__domain = domain          # login domain (private)
        self.timeOut = 30               # request timeout in seconds
        self.cookies = {}               # name -> value cookies loaded from disk

    def SetLoginDomain(self, domain='51job.com'):
        """Set the login domain; returns the value now in effect."""
        self.__domain = domain
        return self.__domain

    def SetTimeOut(self, timeOut=30):
        """Set the request timeout in seconds; returns the value now in effect.

        BUG FIX: the original assigned self.__timeOut, which open_url never
        reads (it uses self.timeOut), so the setter had no effect.
        """
        self.timeOut = timeOut
        return self.timeOut

    def set_cookies(self):
        """Load cookies (produced by the separate login program) into the session."""
        # BUG FIX: pickle data is binary; open in 'rb' and use pickle.load,
        # otherwise reading a text-mode file hands str to pickle.loads and fails.
        with open('./cookies.txt', 'rb') as f:
            cookies = pickle.load(f)
        for cookie in cookies:
            self.cookies[cookie['name']] = cookie['value']
        self.s.cookies.update(self.cookies)

    def open_url(self, url, data=None):
        """POST to url with up to 20 retries; merge Set-Cookie into the session.

        Returns:
            The response body text, or '' if every attempt failed.
        """
        max_try_times = 20
        wait_time = random.uniform(0, 1)  # small random delay between attempts
        content = ''
        for _ in range(max_try_times):
            time.sleep(wait_time)
            try:
                req = self.s.post(url, data=data, headers=self.headers_51job,
                                  timeout=self.timeOut)
                content = req.text
                # Fold freshly issued cookies back into the session so the
                # login stays valid across requests.
                if req.cookies.get_dict():
                    self.s.cookies.update(req.cookies)
                break
            except Exception as e:  # BUG FIX: Py3 except syntax (was Py2 'except Exception,e')
                print(e)
                content = ''
        return content

if __name__ == '__main__':
    # Demo: load previously saved cookies, then fetch one search page.
    spider = Spider()
    spider.set_cookies()
    content = spider.open_url(url='http://ehire.51job.com/Jobs/JobSearchPost.aspx?IsHis=N')
    print(content)  # BUG FIX: Py3 print() function (original used the Py2 print statement)

你可能感兴趣的:(python爬虫)