Node.js 简易爬虫(支持代理 IP)

nodejs 爬虫支持代理IP 原创雕虫小技

欢迎一起交流学习。废话不多说,直接上代码:

const request = require("request")
const iconv = require('iconv-lite')
const cheerio = require("cheerio")
const _ = require("lodash")
const crypto = require('crypto')
class spider {

  /**
   * Small crawler built on `request`, with optional HTTP proxy support.
   *
   * Instance properties callers are expected to tweak after construction:
   *   - debug       {boolean}  when true, say() prints progress messages
   *   - proxy       {boolean}  true once set_proxy() has been called
   *   - proxy_ip    {?string}  the proxy URL currently in use ('http://ip:port')
   *   - temp_proxy  {string[]} candidate proxies as 'ip:port'
   *   - headers     {object}   request headers sent with every so() call
   *   - res_data    {?(string|string[])} key (or key path) of the row list inside a JSON response
   *   - result_obj  {object}   output field name -> source key (or key path array)
   *   - result_ext  {object}   constant fields merged into every JSON row
   *   - result_list {Array}    rows produced by the last handle() call
   *
   * @param {object} [config]
   * @param {string} [config.Host] kept on this.config only. The original code
   *   discarded the explicit Host header by reassigning `headers`, and `request`
   *   derives Host from the URL, so it is deliberately not sent.
   * @param {string} [config.doc_type='html'] 'html' or 'json'
   */
  constructor(config = {}) {
    this.config = config
    this.doc_type = config.doc_type || 'html'
    this.result_obj = {
      id: Number,
      title: String,
      link: String,
      desc: String,
      from: String
    }
    this.result_ext = {}
    this.res_data = null
    this.debug = false
    this.proxy = false
    this.proxy_ip = null
    this.temp_proxy = []
    // A plain object (not an array) so named headers such as Cookie can be added.
    this.headers = {}
    this.result_list = []
  }

  /**
   * Crawl one URL and hand the response to handle().
   *
   * When proxying is enabled and candidates are available, the first working
   * proxy is looked up (and remembered) before the request is sent. A proxy
   * that fails is dropped so the next call moves on to another candidate.
   *
   * @param {string} url absolute URL to fetch
   * @param {string} method HTTP method
   * @return {Promise<*>} whatever handle() returns, or false when the crawl failed
   */
  async so(url = 'http://baidu.com', method = 'get') {
    const options = {
      method,
      url,
      timeout: 8000,
      headers: this.headers
    }
    console.log('开始爬取', url)

    const use_proxy = Boolean(this.proxy && this.temp_proxy.length)
    if (use_proxy) {
      if (!this.proxy_ip) {
        this.say('找出可用的代理ip')
        this.proxy_ip = (await this.check_ip()) || null
        console.log('获取到了才到下一步')
      }
      if (!this.proxy_ip) {
        console.log('还没有的话。估计是不行了')
        return false
      }
      options.proxy = this.proxy_ip
    }

    try {
      const text = await this.fetch_body(options)
      if (use_proxy) {
        this.say('这个IP果然牛逼!!!!', this.proxy_ip)
      }
      this.say('爬取完成了,丢出去html给下一个兄弟处理\n')

      if (this.doc_type === 'json') {
        const parsed = JSON.parse(text)
        // Without a configured res_data the whole document is treated as the row list.
        const no_path = this.res_data === null || this.res_data === undefined
        return this.handle(no_path ? parsed : this.pick_field(parsed, this.res_data))
      }
      return this.handle(text)
    } catch (e) {
      const reason = e && e.message ? e.message : e
      if (use_proxy) {
        this.say(options.proxy + '爬取失败了,因为这原因代理ip换一下吧', reason)
        // Forget only the proxy that failed; the remaining candidates stay usable.
        const failed = options.proxy
        this.temp_proxy = this.temp_proxy.filter((ip) => 'http://' + ip !== failed)
        this.proxy_ip = null
      } else {
        this.say('爬取失败了,因为这原因', reason)
        console.log(e)
      }
      return false
    }
  }

  /**
   * Send one request and resolve with the decoded response text.
   * The body is requested as raw bytes so that GBK pages can be decoded correctly.
   *
   * @param {object} options options for `request`
   * @returns {Promise<string>} decoded body; rejects on transport errors
   */
  fetch_body(options) {
    return new Promise((resolve, reject) => {
      const raw_options = Object.assign({}, options, { encoding: null })
      request(raw_options, (error, response, body) => {
        if (error) {
          return reject(error)
        }
        try {
          resolve(this.decode_body(body))
        } catch (e) {
          reject(e)
        }
      })
    })
  }

  /**
   * Turn a raw response body into text, honouring a gb2312/gbk <meta> charset.
   *
   * @param {Buffer|string|null|undefined} raw
   * @returns {string}
   */
  decode_body(raw) {
    if (!Buffer.isBuffer(raw)) {
      return raw === null || raw === undefined ? '' : String(raw)
    }
    // The meta tag itself is ASCII, so a byte-preserving view is enough to sniff it.
    const sniff = raw.toString('latin1')
    if (/<meta[^>]*charset=["']?\s*(?:gb2312|gbk)/i.test(sniff)) {
      return iconv.decode(raw, 'gbk')
    }
    return raw.toString('utf8')
  }

  /**
   * Download a list of candidate proxies from 66ip.cn.
   * @returns {Promise<string[]>} 'ip:port' strings; empty when none were found
   */
  get_proxy_list() {
    const api_url = 'http://www.66ip.cn/mo.php?sxb=&tqsl=100&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea=http%3A%2F%2Fwww.66ip.cn%2F%3Fsxb%3D%26tqsl%3D100%26ports%255B%255D2%3D%26ktip%3D%26sxa%3D%26radio%3Dradio%26submit%3D%25CC%25E1%2B%2B%25C8%25A1'
    const options = {
      method: 'GET',
      url: api_url,
      gzip: true,
      encoding: null,
      headers: {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
        'referer': 'http://www.66ip.cn/'
      }
    }
    return new Promise((resolve, reject) => {
      request(options, (error, response, body) => {
        if (error) {
          return reject(error)
        }
        try {
          // body is a Buffer here (encoding: null), so decode before matching.
          const text = this.decode_body(body)
          // Ports go up to 65535, hence up to five digits.
          resolve(text.match(/\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}/g) || [])
        } catch (e) {
          reject(e)
        }
      })
    })
  }

  /**
   * Enable proxying and make sure a candidate list is loaded.
   * An existing list is kept; a new one is fetched only when the list is empty.
   * @returns {Promise<void>} rejects when the proxy list cannot be downloaded
   */
  async set_proxy() {
    this.proxy = true
    this.say('好吧请稍等,代理IP稍后奉上....')
    if (!this.temp_proxy || !this.temp_proxy.length) {
      this.temp_proxy = await this.get_proxy_list()
      this.say('这次一用力获取到了' + this.temp_proxy.length + '个IP,请测试!')
    } else {
      this.say('原来的代理IP还有慢慢慢用')
    }
  }

  /**
   * Read one value out of a row.
   *
   * @param {*} row source object
   * @param {string|Array} path a single key, or a list of keys to walk through
   * @returns {*} the value found, or undefined when the path cannot be followed
   */
  pick_field(row, path) {
    const keys = Array.isArray(path) ? path : [path]
    let value = row
    for (const key of keys) {
      if (value === null || value === undefined) {
        return undefined
      }
      value = value[key]
    }
    return value
  }

  /**
   * Post-process a response.
   *
   * - 'json': maps every entry of `data` through result_obj, merges result_ext,
   *   adds a `key`, and returns the rebuilt result_list.
   * - 'html': returns the cheerio root loaded from `data`.
   * - anything else: returns result_list untouched.
   *
   * @param {*} data row list (json) or markup (html)
   * @returns {*}
   */
  handle(data) {
    if (this.doc_type === 'html') {
      const $ = cheerio.load(data)
      this.say('这是一个html文档,具体是否可用要问一下下面的兄弟才知道')
      return $
    }

    if (this.doc_type === 'json') {
      // Start from scratch so rows of a previous, longer response do not linger.
      this.result_list = []
      for (const i of Object.keys(data || {})) {
        const item = {}
        for (const field of Object.keys(this.result_obj)) {
          item[field] = this.pick_field(data[i], this.result_obj[field])
        }
        const new_item = Object.assign(item, this.result_ext)
        new_item['key'] = this.set_key(new_item.title)
        this.result_list[i] = new_item
      }
      if (this.debug && data) {
        this.say(data[0])
        this.say(this.result_list[0])
      }
    }
    return this.result_list
  }

  /**
   * Check whether one proxy works by fetching a small page through it.
   *
   * @param {string} proxy_ip candidate as 'ip:port'
   * @returns {Promise<string|false>} 'http://ip:port' when usable, otherwise false.
   *   Without a candidate the proxy list is refreshed and false is returned.
   */
  async test_proxy(proxy_ip) {
    if (proxy_ip === undefined || proxy_ip === null || proxy_ip === '') {
      this.say('ip 已经没了,重新获取')
      this.temp_proxy = []
      try {
        await this.set_proxy()
      } catch (e) {
        this.say('重新获取代理IP失败', e && e.message)
      }
      return false
    }

    // Change this URL to the site you actually want to reach through the proxy.
    const target_options = {
      method: 'GET',
      url: 'http://ip.chinaz.com/getip.aspx',
      timeout: 8000,
      proxy: 'http://' + proxy_ip
    }
    this.say(`开始测试这个IP ${proxy_ip}`)

    return new Promise((resolve) => {
      request(target_options, (error, response, body) => {
        if (error || body === null || body === undefined) {
          return resolve(false)
        }
        // Heuristic kept from the original: a short body counts as a real answer
        // (presumably error/interstitial pages from bad proxies are longer).
        if (body.toString().length < 100) {
          this.say(`兄弟这个可以拿去用==>> ${proxy_ip}`)
          resolve('http://' + proxy_ip)
        } else {
          this.say(`这个IP无效==>> ${proxy_ip}`)
          resolve(false)
        }
      })
    })
  }

  /**
   * Print a message when debug is on.
   * @param {*} lang first value to print
   * @param {*} [lang1=''] optional second value
   */
  say(lang, lang1 = '') {
    if (this.debug) {
      console.log(lang, lang1)
    }
  }

  /**
   * Walk the candidate list in order and return the first proxy that works.
   * @returns {Promise<string|false>} 'http://ip:port', or false when none works
   *   or proxying is disabled
   */
  async check_ip() {
    if (!this.proxy) {
      return false
    }
    // Iterate over a snapshot: test_proxy() may replace this.temp_proxy.
    for (const candidate of Array.from(this.temp_proxy || [])) {
      this.say(candidate)
      const true_ip = await this.test_proxy(candidate)
      if (true_ip) {
        return true_ip
      }
    }
    return false
  }

  /**
   * Build a salted md5 key for a title.
   * The salt is random, so the same input yields different keys across calls.
   *
   * @param {*} str value to hash
   * @returns {string} 32-char hex digest, or 'no str' for an empty value
   */
  set_key(str) {
    if (!str) {
      return 'no str'
    }
    const rand = Math.ceil(Math.random() * 100)
    // String() keeps non-string titles (e.g. numbers) from breaking hash.update().
    return crypto.createHash('md5').update(String(str) + rand).digest('hex')
  }
}

/**
 * JSON API example: query the Juejin search endpoint and print the first row.
 * res_data names the key of the row list inside the response; result_obj maps
 * output field names to the keys of each row; result_ext adds constant fields.
 */

const api_spider = new spider({
  Host: 'juejin.com',
  doc_type: 'json'
})
api_spider.res_data = ['d']
api_spider.result_obj = {
  id: 'objectId',
  title: 'title',
  link: 'originalUrl',
  desc: 'content',
  like: 'collectionCount',
  comments: 'commentsCount',
  createdAt: 'createdAt'
}
api_spider.result_ext = {
  from: '掘金'
}

api_spider.so('https://search-merger-ms.juejin.im/v1/search?query=node&page=0&raw_result=false&src=web')
  .then((data) => {
    // so() yields a falsy value when the crawl fails, so guard before indexing.
    if (!data) {
      console.error('掘金搜索爬取失败')
      return
    }
    console.log(data[0])
  })
  .catch((e) => {
    // Keep a failing example from surfacing as an unhandled rejection.
    console.error(e)
  })



/**
 * HTML example: search cnblogs and turn every result item into a row.
 *
 * @returns {Promise<Array>} the rows collected on the spider's result_list;
 *   empty when the page could not be fetched
 */

const so = async function () {

  let start = new spider({
    Host: 'zzk.cnblogs.com',
    doc_type: 'html'
  })
  start.headers['Cookie'] = 'GA1.2.182722864.1520590357; UM_distinctid=162105121121049-0b14491452f4e4-32667b04-1aeaa0-162105121136f2; _gid=GA1.2.1004931403.1527083606; __utma=59123430.182722864.1520590357.1527094058.1527094058.1; __utmc=59123430; __utmz=59123430.1527094058.1.1.utmcsr=cnblogs.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmt=1; __utmb=59123430.3.10.1527094058'
  let $ = await start.so('http://zzk.cnblogs.com/s?t=b&w=node')
  // start.so() does not return the cheerio root when the crawl fails; there is nothing to parse then.
  if (typeof $ !== 'function') {
    return start.result_list
  }
  $('div.searchItem').each(function (i, elem) {
    const entry = $(this)
    const item = entry.children('.searchItemTitle')
    const info = entry.children('.searchItemInfo')
    start.result_list[i] = {
      id: i + 1,
      title: item.text(),
      link: item.children('a').attr('href'),
      // Only the first 80 characters of the summary are kept.
      desc: entry.children('.searchCon').text().slice(0, 80),
      from: '博客园',
      createdAt: info.children('.searchItemInfo-publishDate').text(),
      comments: info.children('.searchItemInfo-comments').text(),
      views: info.children('.searchItemInfo-views').text(),
      key: start.set_key(item.text())
    }
  })
  return start.result_list
}
// Run the HTML example and print the first row it collected.
so().then((data) => {
  console.log(data[0])
}).catch((e) => {
  // Report a failed crawl instead of leaving an unhandled rejection.
  console.error(e)
})
module.exports = spider
``

你可能感兴趣的:(nodejs简易爬虫支持代理ip)