驴妈妈旅游爬虫

概览页抓取链接

  1 import requests
  2 import re
  3 import pymysql
  4 import hashlib
  5 import datetime
  6 
  7 
class Demo(object):
    """Overview-page crawler for lvmama.com (Lvmama travel site).

    Holds DB credentials, request headers and a fixed list of channel
    search-result URLs; get_data() (later in this script) walks
    channel_link and regex-extracts detail-page links from each listing.
    """

    def __init__(self):
        # MySQL connection settings (used by the DB-saving helpers).
        self.host = '127.0.0.1'
        self.db = 'app_mark'
        self.user = 'root'
        self.passwd = '123456'
        self.charset = 'utf8mb4'
        # Desktop Chrome User-Agent so the site serves normal desktop markup.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
        }
        # Site homepage (the homepage-scrape branch in get_data is commented out).
        self.url = 'http://www.lvmama.com/'
        # Channel/search listing pages to crawl.  MUST stay parallel (same
        # order, same length) with channel_name below.
        self.channel_link = [
            'http://s.lvmama.com/group/H13K110000?keyword=%E6%99%AE%E5%90%89%E5%B2%9B&k=0#list',  # islands (keyword: Phuket)
            'http://s.lvmama.com/route/H13K310000?keyword=%E6%96%B0%E5%8A%A0%E5%9D%A1&k=0#list',  # Southeast Asia (keyword: Singapore)
            'http://s.lvmama.com/route/H13K310000?keyword=%E9%A6%99%E6%B8%AF&k=0#list',  # China HK/Macao/Taiwan (keyword: Hong Kong)
            'http://s.lvmama.com/group/H13K110000?keyword=%E8%BF%AA%E6%8B%9C&k=0#list',  # Dubai
            'http://s.lvmama.com/group/C262H13K110000?keyword=%E4%BF%84%E7%BD%97%E6%96%AF&tabType=group#list',  # Russia
            'http://s.lvmama.com/group/H13K110000Y4?keyword=%E8%B6%8A%E5%8D%97#list#list',  # Vietnam
            'http://s.lvmama.com/group/C265H13K110000?keyword=%E6%B3%95%E5%9B%BD&tabType=group#list%22',  # France/Switzerland/Italy/Germany
            'http://s.lvmama.com/group/H13K110000?keyword=%E5%B7%B4%E5%8E%98%E5%B2%9B&k=0#list',  # Bali
            'http://s.lvmama.com/route/H13K310000?keyword=%E6%97%A5%E6%9C%AC&k=0#list',  # Japan
            'http://s.lvmama.com/route/H13K310000?keyword=%E6%AC%A7%E6%B4%B2&k=0#list',  # Europe (original comment said "Europe & America")
            'http://s.lvmama.com/route/H13K440100?keyword=%E6%96%B0%E5%8A%A0%E5%9D%A1&k=0#list',  # Singapore
            'http://s.lvmama.com/route/H13K310000?keyword=%E9%A6%99%E6%B8%AF&k=0#list',  # Hong Kong
            'http://s.lvmama.com/route/H13K310000?keyword=%E6%BE%B3%E6%B4%B2&k=0#list',  # Australia
            'http://s.lvmama.com/route/H13K310000?keyword=%E6%B3%B0%E5%9B%BD&k=0#list',  # Thailand
            'http://s.lvmama.com/route/H13K440300?keyword=%E4%B8%89%E4%BA%9A&k=0#list',  # Sanya
            'http://s.lvmama.com/route/H13K440300P2?keyword=%E4%B8%89%E4%BA%9A&tabType=route350',  # Sanya page 2
            'http://s.lvmama.com/route/H13K440300P3?keyword=%E4%B8%89%E4%BA%9A&tabType=route350',  # Sanya page 3
            'http://s.lvmama.com/route/H13K440300P4?keyword=%E4%B8%89%E4%BA%9A&tabType=route350',  # Sanya page 4
            'http://s.lvmama.com/route/H13K440300?keyword=%E5%8E%A6%E9%97%A8&k=0#list',  # Xiamen
            'http://s.lvmama.com/route/H13K440300?keyword=%E5%B9%BF%E4%B8%9C&k=0#list',  # Guangdong
            'http://s.lvmama.com/route/H13K440300?keyword=%E4%BA%91%E5%8D%97&k=0#list',  # Yunnan
            'http://s.lvmama.com/route/H13K440300?keyword=%E4%B8%8A%E6%B5%B7&k=0#list',  # Shanghai
            'http://s.lvmama.com/route/H13K440300?keyword=%E8%A5%BF%E5%AE%89&k=0#list',  # Xi'an
            'http://s.lvmama.com/route/H13K440300?keyword=%E6%88%90%E9%83%BD&k=0#list',  # Chengdu
            'http://s.lvmama.com/route/H13K440300?keyword=%E5%90%89%E6%9E%97&k=0#list',  # Jilin
            'http://s.lvmama.com/route/H13K440300?keyword=%E8%A5%BF%E5%8C%97&k=0#list',  # Northwest China
            'http://s.lvmama.com/scenictour/K110000?keyword=%E5%8C%97%E4%BA%AC&k=0#list',  # Beijing
            'http://s.lvmama.com/scenictour/K110000?keyword=%E5%B1%B1%E4%B8%9C&k=0#list',  # Shandong
            'http://s.lvmama.com/scenictour/K110000?keyword=%E5%B1%B1%E8%A5%BF&k=0#list',  # Shanxi
            'http://s.lvmama.com/scenictour/K110000?keyword=%E6%B2%B3%E5%8C%97&k=0#list',  # Hebei
            'http://s.lvmama.com/scenictour/K110000?keyword=%E8%BE%BD%E5%AE%81&k=0#list',  # Liaoning
            ]
        # Human-readable channel labels, index-aligned with channel_link.
        # NOTE(review): entry index 9 is '欧洲' (Europe) while the matching
        # link above was annotated 欧美 (Europe & America) in the original —
        # confirm the intended label.  These are runtime strings (stored with
        # the scraped links), so they stay in Chinese.
        self.channel_name = [
            '海岛',
            '东南亚',
            '中国港澳台',
            '迪拜',
            '俄罗斯',
            '越南',
            '法瑞意德',
            '巴厘岛',
            '日本',
            '欧洲',
            '新加坡',
            '香港',
            '澳洲',
            '泰国',
            '三亚',
            '三亚p2',
            '三亚p3',
            '三亚p4',
            '厦门',
            '广东',
            '云南',
            '上海',
            '西安',
            '成都',
            '吉林',
            '西北',
            '北京',
            '山东',
            '山西',
            '河北',
            '辽宁',
        ]
 85 
 86     def get_html(self, url):
 87         response = requests.get(url, headers=self.headers)
 88         response.encoding = response.apparent_encoding
 89         html = response.text
 90         return html
 91 
  # NOTE(review): this method was damaged by the HTML extraction that produced
  # this file — the regex patterns between quotes were eaten and everything
  # after the `divs = re.findall('` call (internal line 108), including the
  # rest of the first script, is missing.  Readable intent: a commented-out
  # homepage scrape, then a loop over self.channel_link that fetches each
  # channel listing page via get_html and regex-extracts detail-page links.
  # Recover the original source before running or editing this section.
  92     def get_data(self):
  93         # Homepage scrape (left commented out in the original)
  94         # html = self.get_html(self.url)
  95         # datas = re.findall('
  • )', datas, re.S) 97 # for li in lis: 98 # # detail_url = re.findall('
  • 99 # detail_url = re.findall('href="(.*?)"', li, re.S)[0] # 详情页网页链接 100 # self.save_data(detail_url) 101 # print(datas) 102 103 # 频道抓取 104 urls = [] 105 # 正则匹配链接 106 for index, channel in enumerate(self.channel_link): 107 html = self.get_html(channel) 108 divs = re.findall('
  •  

    细览页解析字段

      1 import pymysql
      2 import re
      3 import requests
      4 from multiprocessing.dummy import Pool as ThreadPool
      5 import datetime
      6 
      7 
class XLY(object):
    """Detail-page crawler: pulls saved overview links from MySQL (table
    `gly`), regex-parses each Lvmama detail page, and inserts the parsed
    fields into table `lvmama` (see parse_data / save_data below).
    """

    def __init__(self):
        # MySQL connection settings.
        self.host = '127.0.0.1'
        self.db = 'app_mark'
        self.user = 'root'
        self.passwd = '123456'
        self.charset = 'utf8mb4'
        # Desktop Chrome User-Agent for the detail-page requests.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
        }
        # Start timestamp; __main__ uses it to report total elapsed time.
        self.start = datetime.datetime.now()
     19 
     20     def get_data(self):
     21         # 从gly表中拿链接
     22         con = pymysql.connect(host=self.host, db=self.db, user=self.user, passwd=self.passwd, charset=self.charset)
     23         cur = con.cursor()
     24         sql = 'select link from gly where tag = "1" and sitename="驴妈妈旅游"'
     25         after_sql = 'update gly set tag="1" where tag="0" and sitename = "驴妈妈旅游"'
     26         try:
     27             cur.execute(sql)
     28             results = cur.fetchall()
     29             cur.execute(after_sql)
     30         except Exception as e:
     31             con.rollback()
     32             results = None
     33             print('error~', e)
     34         else:
     35             con.commit()
     36         cur.close()
     37         con.close()
     38         return results
     39 
     # NOTE(review): from here to the end of the file the code was mangled by
     # the HTML extraction that produced this file: the markup portion of every
     # regex pattern was eaten, and parse_data's tail, save_data, and the
     # __main__ section are fused into a few run-on lines.  Readable intent:
     # parse_data takes a `(link,)` row from XLY.get_data, derives an id from
     # the URL, fetches the page, regex-extracts title / price / praise rate /
     # starting city / target city / days spent / type (skipping 'scenic' and
     # 'hotel' URLs), and passes the fields to save_data, which INSERTs them
     # into table `lvmama`; __main__ feeds the rows to parse_data through a
     # 20-worker multiprocessing.dummy thread Pool and prints elapsed time.
     # Recover the original source before running or editing this section.
     40     def parse_data(self, url):
     41         # Regex-match each field of the detail page
     42         print(url)
     43         url = url[0]
     44         # Derive the id from the last URL path segment, query string stripped
     45         id = url.split('/')[-1]
     46         id = re.sub('\?.*', '', id)
     47         # print(id)
     48         response = requests.get(url, headers=self.headers)
     49         html = response.text
     50         if 'scenic' not in url and 'hotel' not in url:
     51             # Skip hotel and scenic-spot pages
     52             # Extract the title (regex pattern lost in extraction)
     53             title = re.findall('(.*?)', html, re.S)
     54             if title:
     55                 title = title[0]
     56                 title = re.sub('\n|\r| |自营|<[\s\S]*?>', '', title)
     57                 title = title.strip()
     58             else:
     59                 title = re.findall('

    (.*?)

    ', html, re.S) 60 if title: 61 title = title[0] 62 title = re.sub('\n|\r| |自营|<[\s\S]*?>', '', title) 63 title = title.strip() 64 # 匹配价格 65 price = re.findall('(\d+)', html, re.S) 66 if price: 67 price = price[0] 68 else: 69 price = re.findall('.*?(\d+).*?', html, re.S) 70 if price: 71 price = price[0] 72 else: 73 price = re.findall('(\d+)', html, re.S) 74 if price: 75 price = price[0] 76 else: 77 price = re.findall('.*?(\d+).*?', html, re.S) 78 if price: 79 price = price[0] 80 else: 81 price = None 82 # 匹配好评率 83 praise = re.findall('

    [\s\S]*?([\s\S]*?)[\s\S]*?

    ', html, re.S) 84 if praise: 85 praise = praise[0] 86 praise = re.sub('<.*?>', '', praise) 87 praise = praise.strip() 88 else: 89 praise = re.findall('
    ([\s\S]*?)', html, re.S) 90 if praise: 91 praise = praise[0] 92 else: 93 praise = re.findall('([\s\S]*?)', html, re.S) 94 if praise: 95 praise = praise[0] 96 praise = praise.strip() 97 else: 98 praise = re.findall('

    [\s\S]*?([\s\S]*?)%[\s\S]*?', html, re.S) 99 if praise: 100 praise = praise[0] 101 praise = praise.strip() 102 else: 103 praise = re.findall('([\s\S]*?)', html, re.S) 104 if praise: 105 praise = praise[0] 106 if praise: 107 if '%' in praise: 108 praise = re.sub('%', '', praise) 109 praise = float(praise) 110 if praise > 100: 111 praise = None 112 print('好评率抓取错误') 113 else: 114 pass 115 else: 116 praise = None 117 # 匹配出发地 118 starting_city = re.findall('

    [\s\S]*?出发城市[\s\S]*?([\s\S]*?)', html, re.S) 119 target_city = re.findall('
    目的地[\s\S]*?
    ([\s\S]*?)
    ', html, re.S) 120 if starting_city: 121 starting_city = starting_city[0] 122 starting_city = re.sub('<.*?>', '', starting_city) 123 # 匹配目的地 124 target_city = target_city[0] 125 target_city = re.sub('<.*?>', '', target_city) 126 # 匹配天数 127 days_spent = re.findall('
    出游天数[\s\S]*?
    ([\s\S]*?)
    ', html, re.S)[0] 128 days_spent = re.sub('<.*?>', '', days_spent) 129 # print(days_spent) 130 else: 131 starting_city = target_city = days_spent = None 132 # 匹配类型 133 type_ = re.findall('([\s\S]*?)', html, re.S) 134 if type_: 135 type_ = type_[0] 136 else: 137 type_ = re.findall('([\s\S]*?)', html, re.S) 138 if type_: 139 type_ = type_[0] 140 else: 141 type_ = re.findall('([\s\S]*?)', html, re.S) 142 if type_: 143 type_ = type_[0] 144 else: 145 type_ = None 146 # print(type_) 147 list_data = [id, title, price, praise, starting_city, target_city, days_spent, type_, url] 148 self.save_data(list_data) 149 150 def save_data(self, list_data): 151 # 写入数据库 152 con = pymysql.connect(host=self.host, db=self.db, user=self.user, passwd=self.passwd, charset=self.charset) 153 cur = con.cursor() 154 sql = 'insert into lvmama(id_num, title, price, praise, starting_city, target_city, days_spent, type_, link) values (%s, %s, %s, %s, %s, %s, %s, %s, %s)' 155 # cur.execute(sql, list_data) 156 # con.commit() 157 try: 158 cur.execute(sql, list_data) 159 print('insert success') 160 except Exception as e: 161 con.rollback() 162 print('error~', e) 163 else: 164 con.commit() 165 cur.close() 166 con.close() 167 168 169 if __name__ == '__main__': 170 xly = XLY() 171 urls = xly.get_data() 172 if urls: 173 # 开启多线程 174 pool = ThreadPool(20) 175 pool.map(xly.parse_data, urls) 176 pool.close() 177 pool.join() 178 end = datetime.datetime.now() 179 print('耗时:', (end-xly.start)) 180 # for url in urls: 181 # url = url[0] 182 # xly.parse_data(url) 183 # break

     

    你可能感兴趣的:(驴妈妈旅游爬虫)