Code Backup

A simple crawler. Because it also contains a small compression and XOR-encryption routine, a finished run leaves behind two extension-less files plus one JSON file. The encryption exists so the output could be parked on an Alibaba Cloud server without being flagged by Yundun (Cloud Shield). Of course, you can delete the relevant code (the block is marked in the source) and save the images directly instead.
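Since XOR with a fixed key is its own inverse, the enc() and dec() functions below perform literally the same byte-wise transform, only with the file roles swapped. A minimal standalone round-trip sketch of the idea (not part of the crawler itself):

key = 0x9e
data = b'any bytes at all'
cipher = bytes(b ^ key for b in data)            # the per-byte operation enc() performs
assert bytes(b ^ key for b in cipher) == data    # applying it again restores the input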

If you want to reuse this, keep all the functions and delete the rest. Bye!

# -*- coding: UTF-8 -*-
import threading
import requests
import re
import time
from bs4 import BeautifulSoup
import os
import urllib.request   # imported explicitly; a bare `import urllib` does not reliably expose urllib.request
import zipfile
import shutil
import json


def downIMG(url, path):
    # install a global opener with a mobile User-Agent so urlretrieve
    # is not served a block page as the default Python client
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-agent', 'Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10')]
    urllib.request.install_opener(opener)
    urllib.request.urlretrieve(url, path)
    return
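A quick usage example (the URL is a placeholder, not one from the target site):

downIMG('http://example.com/sample.jpg', './sample.jpg')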

def getall(page, num, site, filepath):
    mainsite = site + '/thread0806.php?fid=16&search=&page=' + str(page)
    mhtml = requests.get(mainsite)
    mhtml.encoding = 'gbk'
    # NOTE: the HTML tags inside this pattern were eaten when the post was scraped;
    # "<tr[^>]*>.+?</tr>" is a plausible reconstruction of the lost row pattern.
    part_siteURL = re.findall(r"<tr[^>]*>.+?</tr>", mhtml.text, re.S)
    urls = []
    links = []
    titles = []
    effect = []
    if page == 1:
        start = 9    # the first page carries pinned rows that are skipped
    else:
        start = 0
    for n in range(start, len(part_siteURL) - 1):
        link = re.findall(r"htm_data.+?\.html", part_siteURL[n], re.S)
        if not len(link) == 0:
            link = site + '/' + link[0]    # e.g. https://www.baidu.com/
            effect.append(n)
            links.append(link)
            # NOTE: the tags in this pattern were lost too; it is reconstructed so
            # that group 2 captures the thread title, as title[0][1] below expects.
            title = re.findall(r"<h3><a[^>]*>(<[^>]+>)*(.+?)<.*?/a>", part_siteURL[n], re.S)
            #print(title[0][1])
            title = title[0][1]
            titles.append(title)
    website = links[num - 1]
    html = requests.get(website)
    html.encoding = 'gbk'
    res = requests.get(website)
    res.encoding = 'gbk'
    soup = BeautifulSoup(res.text, 'lxml')
    folder = soup.title.text[:-32]        # (unused below)
    folderutf = folder.encode("utf-8")
    folderutf = folderutf.decode('utf-8')
    print('Downloading ' + str(num) + '. ' + titles[num - 1])
    soup = BeautifulSoup(html.text, 'html.parser')
    #part_picURL = re.findall("src='http://img(.+?\.jpg)' type='image'>", html.text, re.S)
    part_picURL = re.findall(r"src='([a-zA-Z0-9|/|www.|.com|:|_|\?|\.\=]+?)(\.|&)(jpg|gif|png|JPG|PNG|GIF)' type='image'>", html.text, re.S)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}    # (unused; downloads go through the urllib opener)
    sufs = []
    for each in part_picURL:
        picURL = each[0] + each[1] + each[2]
        suf = each[2]
        urls.append(picURL)
        sufs.append(suf)
    length = len(urls)
    #print(urls)
    threads = []
    # replace characters that are illegal in Windows file names
    title = re.sub(r'[\?|\\|\/|\>|\<|\"|\:|\*|\|]', '~', str(titles[num - 1]))
    pathURL = filepath + '/' + title    # change this to the path you want to save under, e.g. 'c:\\users\\...'
    global G
    G = 0
    if length != 0:
        print("This thread has " + str(length) + " images.")
        for k in range(0, length):
            if not os.path.exists(pathURL):
                os.makedirs(pathURL)
            url = urls[k]
            paths = pathURL + '/' + str(k) + '.' + sufs[k]
            #print(url + ' ' + paths)
            threads.append(threading.Thread(target=downimgs, args=(url, paths, k)))
        for thread in threads:
            thread.start()
            time.sleep(0.5)    # stagger the requests slightly
        for t in threads:
            t.join(10)
        if len(os.listdir(pathURL)) == 0:
            os.rmdir(pathURL)
        # start deleting here to skip compression/encryption ----------------------------
        else:
            tm = time.strftime("%m%d_%H%M%S", time.localtime())
            compress(pathURL, filepath + '/%s.zip' % (tm))
            print('Compression done!')
            key = 0x9e
            enc(filepath + '/%s.zip' % (tm), filepath + '/%s' % (tm), key)
            shutil.rmtree(pathURL)
            os.remove(filepath + '/%s.zip' % (tm))
            # maintain "list", an encrypted JSON index mapping timestamps to titles
            if os.path.exists(filepath + '/list'):
                dec(filepath + '/list', filepath + '/list.json', key)
                jsdec = open(filepath + '/list.json')
                listdic = json.loads(jsdec.read())
                jsdec.close()
                os.remove(filepath + '/list')
            else:
                listdic = {}
            listdic[tm] = title
            jsdec = open(filepath + '/list.json', 'w')
            jsdec.write(json.dumps(listdic, ensure_ascii=False))
            jsdec.close()
            enc(filepath + '/list.json', filepath + '/list', key)
        # stop deleting here -------------------------------------------------------------
    else:
        print("No downloadable images here.")
    return

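For reference, a call like the following (the URL and path are placeholders, not values from the post) would fetch every image in the third thread listed on page 1:

getall(1, 3, 'http://example.com', './down')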
def gettitles(page, site):
    mainsite = site + '/thread0806.php?fid=16&search=&page=' + str(page)
    mhtml = requests.get(mainsite)
    mhtml.encoding = 'gbk'
    # same reconstructed patterns as in getall(); the originals were lost to HTML stripping
    part_siteURL = re.findall(r"<tr[^>]*>.+?</tr>", mhtml.text, re.S)
    titles = []
    effect = []
    links = []
    if page == 1:
        start = 9
    else:
        start = 0
    for n in range(start, len(part_siteURL) - 1):
        link = re.findall(r"htm_data.+?\.html", part_siteURL[n], re.S)
        if not len(link) == 0:
            link = site + '/' + link[0]    # e.g. https://www.baidu.com/
            effect.append(n)
            links.append(link)
            title = re.findall(r"<h3><a[^>]*>(<[^>]+>)*(.+?)<.*?/a>", part_siteURL[n], re.S)
            title = title[0][1]
            titles.append(title)
    for s in range(0, len(effect) - 1):
        print(str(s + 1) + '. ' + titles[s])
    return titles

def getpic(page, dic, site, filepath):
    # download every selected thread number in turn
    for num in dic:
        getall(page, num, site, filepath)
    return

def downimgs(url, path, k):
    # thread worker: wraps downIMG and keeps a global running counter
    global G
    try:
        downIMG(url, path)
        print("Image %d downloaded." % (G + 1))
        #print(path)
        G = G + 1
    except:
        print("Image %d failed to download." % (G + 1))
        G = G + 1

def enc(src, enc, key):
    # XOR every byte of file `src` with `key` and write the result to file `enc`
    src = open(src, 'rb')
    enc = open(enc, 'wb')
    b = src.read(1)
    while b != b'':
        bi = ord(b) ^ key
        enc.write(bytes([bi]))
        b = src.read(1)
    src.close()    # added: the original leaked this handle
    enc.close()
    print('Encryption done!')

def dec(enc, dec, key):
    # the inverse of enc(); XOR with the same key restores the original bytes
    enc = open(enc, 'rb')
    dec = open(dec, 'wb')
    b = enc.read(1)
    while b != b'':
        bi = ord(b) ^ key
        dec.write(bytes([bi]))
        b = enc.read(1)
    enc.close()    # added: the original leaked this handle
    dec.close()
    print('Decryption done!')

def compress(get_files_path, set_files_path):
    # zip the folder, storing entries relative to get_files_path
    f = zipfile.ZipFile(set_files_path, 'w', zipfile.ZIP_DEFLATED)
    for dirpath, dirnames, filenames in os.walk(get_files_path):
        fpath = dirpath.replace(get_files_path, '')
        fpath = fpath and fpath + os.sep or ''
        for filename in filenames:
            f.write(os.path.join(dirpath, filename), fpath + filename)
    f.close()

#************************************************************************************#
# Interactive driver. The scrape that produced this post ate the lines that defined
# `site`, `filepath`, `page`, `titles`, and the selection string `a`, as well as the
# comparison operators below; the lines marked ASSUMPTION are a plausible
# reconstruction, not the original code.
site = 'http://example.com'          # ASSUMPTION: the original forum URL was not preserved
filepath = './down'                  # ASSUMPTION: save directory
page = 1                             # ASSUMPTION
titles = gettitles(page, site)
a = input('Numbers to download, e.g. 1,3,5-7: ')   # ASSUMPTION: prompt reconstructed
b = a.split(',')
lis = []
n = 0
for e in b:
    finde = re.findall(r'(^[0-9]+?$|^[0-9]+?-[0-9]+$)', e)
    #print(finde)
    if len(finde) == 1:
        if e.find('-') == -1:
            # a single index: check it against the number of listed threads
            if int(e) < 1 or int(e) > (len(titles) - 1):
                if n == 0:
                    print("Value out of range!", end='')
                    n = 1
            else:
                lis.append(int(e))
        else:
            # a range "a-b": check both endpoints
            ch = re.findall(r'([0-9]+)-([0-9]+)', e)
            if (int(ch[0][0]) < 1 or int(ch[0][0]) > (len(titles) - 1)
                    or int(ch[0][1]) < 1 or int(ch[0][1]) > (len(titles) - 1)):
                if n == 0:
                    print("Value out of range!", end='')
                    n = 1
            else:
                # extend (not append) keeps lis a flat list of ints, and +1 makes the
                # upper bound inclusive; the original appended a range object instead
                lis.extend(range(int(ch[0][0]), int(ch[0][1]) + 1))
    else:
        print("Malformed input!")
getpic(page, lis, site, filepath)    # ASSUMPTION: the call that consumed `lis` was lost
#************************************************************************************#

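The post never shows how to get the images back out, so here is a hedged sketch of the reverse path under the scheme above: decrypt the `list` index to find the archive you want, decrypt that archive back into a zip, and extract it. The `recover` helper, its paths, and the sample timestamp are my own illustration, not part of the original script.

def recover(filepath, tm, outdir, key=0x9e):
    # illustration only: reverse the compress()+enc() pipeline for one archive
    dec(filepath + '/list', filepath + '/list_view.json', key)
    with open(filepath + '/list_view.json') as f:
        print(json.load(f))                          # timestamp -> title mapping
    dec(filepath + '/' + tm, filepath + '/' + tm + '.zip', key)
    with zipfile.ZipFile(filepath + '/' + tm + '.zip') as z:
        z.extractall(outdir)                         # images reappear under outdir

#recover('./down', '0101_120000', './restored')      # '0101_120000' is a hypothetical timestamp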