爬虫模板开发

做一个图形界面来方便爬取数据,页面内容用正则表达式(Python 的 re 模块)解析。

项目已上传到 GitHub,地址:https://github.com/sqhl/Spider

1.pyqt5界面:
 

class Ui_Form(object):
    """Crawler control panel: builds the widgets and hosts the button slots.

    The widget attributes created in ``setupUi`` (``lineEdit``, ``lineEdit_2``,
    ``checkBox`` .. ``checkBox_5``, ``textBrowser`` ...) are read directly by
    the crawler code, so their names are part of this class's interface.
    """

    def setupUi(self, Form):
        """Create and place every widget on ``Form`` and connect the buttons."""
        Form.setObjectName("Form")
        Form.setEnabled(True)
        Form.resize(762, 487)
        # Output-folder row: caption, read-only path field, "choose" button, hint.
        self.label = QtWidgets.QLabel(Form)
        self.label.setGeometry(QtCore.QRect(70, 20, 91, 31))
        self.label.setObjectName("label")
        self.lineEdit = QtWidgets.QLineEdit(Form)
        # Disabled so the path can only be set through the folder dialog.
        self.lineEdit.setEnabled(False)
        self.lineEdit.setGeometry(QtCore.QRect(170, 20, 271, 31))
        self.lineEdit.setObjectName("lineEdit")
        self.pushButton = QtWidgets.QPushButton(Form)
        self.pushButton.setGeometry(QtCore.QRect(450, 20, 71, 31))
        self.pushButton.setObjectName("pushButton")
        self.label_2 = QtWidgets.QLabel(Form)
        self.label_2.setGeometry(QtCore.QRect(530, 30, 231, 16))
        self.label_2.setObjectName("label_2")
        # URL row: caption, URL field and the "forge headers" checkbox.
        self.label_3 = QtWidgets.QLabel(Form)
        self.label_3.setGeometry(QtCore.QRect(80, 60, 81, 31))
        self.label_3.setObjectName("label_3")
        self.lineEdit_2 = QtWidgets.QLineEdit(Form)
        self.lineEdit_2.setGeometry(QtCore.QRect(170, 60, 271, 31))
        self.lineEdit_2.setObjectName("lineEdit_2")
        self.checkBox = QtWidgets.QCheckBox(Form)
        self.checkBox.setGeometry(QtCore.QRect(450, 70, 91, 19))
        self.checkBox.setObjectName("checkBox")
        # Content-type checkboxes (article / picture / link / custom).
        self.checkBox_2 = QtWidgets.QCheckBox(Form)
        self.checkBox_2.setGeometry(QtCore.QRect(170, 110, 91, 19))
        self.checkBox_2.setObjectName("checkBox_2")
        self.checkBox_3 = QtWidgets.QCheckBox(Form)
        self.checkBox_3.setGeometry(QtCore.QRect(400, 110, 91, 19))
        self.checkBox_3.setObjectName("checkBox_3")
        self.checkBox_4 = QtWidgets.QCheckBox(Form)
        self.checkBox_4.setGeometry(QtCore.QRect(170, 180, 91, 19))
        self.checkBox_4.setObjectName("checkBox_4")
        self.checkBox_5 = QtWidgets.QCheckBox(Form)
        self.checkBox_5.setGeometry(QtCore.QRect(400, 180, 91, 19))
        self.checkBox_5.setObjectName("checkBox_5")
        # One text field is placed directly below each content-type checkbox.
        # NOTE(review): nothing in this class reads them; presumably they are
        # meant to hold per-type patterns - confirm against the crawler.
        self.lineEdit_3 = QtWidgets.QLineEdit(Form)
        self.lineEdit_3.setGeometry(QtCore.QRect(170, 130, 211, 31))
        self.lineEdit_3.setObjectName("lineEdit_3")
        self.lineEdit_4 = QtWidgets.QLineEdit(Form)
        self.lineEdit_4.setGeometry(QtCore.QRect(400, 130, 211, 31))
        self.lineEdit_4.setObjectName("lineEdit_4")
        self.lineEdit_5 = QtWidgets.QLineEdit(Form)
        self.lineEdit_5.setGeometry(QtCore.QRect(170, 200, 211, 31))
        self.lineEdit_5.setObjectName("lineEdit_5")
        self.lineEdit_6 = QtWidgets.QLineEdit(Form)
        self.lineEdit_6.setGeometry(QtCore.QRect(400, 200, 211, 31))
        self.lineEdit_6.setObjectName("lineEdit_6")
        # Log area the crawler appends its progress messages to.
        self.textBrowser = QtWidgets.QTextBrowser(Form)
        self.textBrowser.setGeometry(QtCore.QRect(170, 240, 441, 192))
        self.textBrowser.setObjectName("textBrowser")
        # Start / stop buttons; stop stays disabled until start() enables it.
        self.pushButton_2 = QtWidgets.QPushButton(Form)
        self.pushButton_2.setGeometry(QtCore.QRect(240, 440, 93, 28))
        self.pushButton_2.setObjectName("pushButton_2")
        self.pushButton_3 = QtWidgets.QPushButton(Form)
        self.pushButton_3.setGeometry(QtCore.QRect(410, 440, 93, 28))
        self.pushButton_3.setObjectName("pushButton_3")
        self.pushButton_3.setEnabled(False)

        self.retranslateUi(Form)
        self._connect_signals(Form)
        QtCore.QMetaObject.connectSlotsByName(Form)

    def retranslateUi(self, Form):
        """Set the visible text of every widget.

        Only texts are set here, so calling this again (for example after a
        language change) does not add duplicate signal connections.
        """
        _translate = QtCore.QCoreApplication.translate
        Form.setWindowTitle(_translate("Form", "Form"))
        self.label.setText(_translate("Form", "选择文件夹 :"))
        self.pushButton.setText(_translate("Form", "选择"))
        self.label_2.setText(_translate("Form", "(默认为当前目录下的Data文件夹)"))
        self.label_3.setText(_translate("Form", "输入网址 :"))
        self.checkBox.setText(_translate("Form", "伪造"))
        self.checkBox_2.setText(_translate("Form", "文章"))
        self.checkBox_3.setText(_translate("Form", "图片"))
        self.checkBox_4.setText(_translate("Form", "链接"))
        self.checkBox_5.setText(_translate("Form", "自定义"))
        self.pushButton_2.setText(_translate("Form", "start"))
        self.pushButton_3.setText(_translate("Form", "stop"))

    def _connect_signals(self, Form):
        """Connect each button to its slot; called exactly once from setupUi."""
        self.pushButton.clicked.connect(lambda: self.msg(Form))
        self.pushButton_2.clicked.connect(lambda: self.start(Form))
        self.pushButton_3.clicked.connect(lambda: self.stop(Form))

    def msg(self, Form):
        """Ask for the output folder and show the chosen path in ``lineEdit``."""
        directory1 = QFileDialog.getExistingDirectory(Form, "选取文件夹", "./Data")
        # The dialog returns an empty string when cancelled; keep the previous
        # selection in that case instead of wiping it.
        if directory1:
            self.lineEdit.setText(directory1)

    def start(self, Form):
        """Create a ``Spider`` worker thread for the current settings and start it."""
        self.textBrowser.append("爬虫开始工作....")
        self.Spider = Spider(self)
        self.Spider.start()
        # The stop button is only offered when link-following is ticked.
        if self.checkBox_4.checkState():
            self.pushButton_3.setEnabled(True)

    def stop(self, Form):
        """Stop the running worker, log the fact and disable the stop button."""
        self.Spider.stop()
        self.textBrowser.append("停止继续爬取!!!")
        self.pushButton_3.setEnabled(False)
class Spider(QThread):
    """Worker thread that feeds queued links to a ``spider`` instance."""

    def __init__(self, index):
        """Create the crawler and initialise it from the form ``index``.

        Args:
            index: the UI object whose widgets the crawler reads and logs to.
        """
        super().__init__()
        self.index = index
        self.spider = spider()
        self.spider.spider_init(index)
        # Cleared by stop() to make run() return after the current page.
        self.flag = True

    def run(self):
        """Process queued links until stopped or until the queue is empty.

        ``spider_next_href`` does nothing when the queue ``c`` is empty, so
        looping on the flag alone would spin at full CPU. The queue is only
        refilled by the page processed inside this loop, so an empty queue
        means the crawl is finished. ``getattr`` covers a crawler whose
        ``spider_init`` returned before creating the queue (empty URL).
        """
        while self.flag and getattr(self.spider, "c", None):
            self.spider.spider_next_href(self.index)

    def stop(self):
        """Ask the thread to stop, give it up to 2 s to finish, then save links."""
        self.flag = False
        # Bounded wait: returns as soon as run() has finished, and never blocks
        # the caller longer than 2 seconds.
        self.wait(2000)
        self.spider.save_href(self.index)

效果:
爬虫模板开发_第1张图片

2.spider_next.py:
 

from urllib import  request
import re
import copy
import gevent
from gevent import monkey
import os
monkey.patch_all()
from datetime import datetime
from bs4 import BeautifulSoup
import random
class spider(object):
    def spider_init(self, index):
        """Reset crawl state and prepare today's output folder.

        Reads the start URL from ``index.lineEdit_2`` and the base folder from
        ``index.lineEdit`` (falling back to ``./Data``). On an empty URL an
        error is logged to ``index.textBrowser`` and no folder is created.

        Attributes set:
            b: every link seen so far.
            c: links still waiting to be crawled.
            image: collected image entries (only its length is reported by
                ``save_href``).
            dic: output folder ``<base>/<YYYY-MM-DD>`` (valid URL only).
        """
        url = index.lineEdit_2.text()
        self.b = []
        # Always create the queue so later code can test it even when the URL
        # turns out to be invalid.
        self.c = []
        self.image = []
        if url == "":
            index.textBrowser.append("url输入错误!")
            return
        self.b.append(url)
        # Strings are immutable, so a shallow copy is all that is needed.
        self.c = list(self.b)
        self.dic = index.lineEdit.text()
        if not self.dic:
            self.dic = "./Data"
        self.dic += "/" + datetime.now().strftime('%Y-%m-%d')
        # makedirs also creates a missing base folder (e.g. ./Data on first
        # run) and does not fail when the dated folder already exists.
        os.makedirs(self.dic, exist_ok=True)
    def spider_next_href(self, index):
        """Crawl the oldest queued link, if there is one.

        Does nothing when the queue is empty or was never created, which is
        the case when ``spider_init`` rejected an empty URL.
        """
        pending = getattr(self, "c", None)
        if pending:
            self.spider_html(pending.pop(0), index)
    def spider_html(self, url, index):
        """Download ``url`` and run every extractor whose checkbox is ticked.

        The page text is stored in ``self.html`` and the per-page output folder
        in ``self.new_dic`` for the extractor methods to use. Progress and
        failures are logged to ``index.textBrowser``; errors are not re-raised.

        Args:
            url: address of the page to fetch.
            index: the UI object providing the checkboxes and the log widget.
        """
        try:
            index.textBrowser.append("开始爬取"+url)
            if index.checkBox.checkState():
                # "Forge" option: send a desktop-browser User-Agent and a Referer.
                headers = {'User-Agent':"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"}
                # Keyword arguments are required: Request's second positional
                # parameter is `data`, not `headers`.
                req = request.Request(url=url, headers=headers)
                req.add_header('Referer', 'http://www.mzitu.com/')
            else:
                req = request.Request(url)
            # One sub-folder per page, named after the last URL path segment.
            self.new_dic = self.dic+"/"+url.split("/")[-1]
            os.makedirs(self.new_dic, exist_ok=True)
            # Close the HTTP response as soon as the body has been read.
            with request.urlopen(req) as response:
                self.html = response.read().decode('utf-8')
            # Read each checkbox exactly once. Re-reading them before joining
            # could hit an unbound greenlet (or skip a join) if the user
            # toggles a box while the page is being processed.
            selected = []
            if index.checkBox_2.checkState():  # article
                selected.append(self.spider_article)
            if index.checkBox_3.checkState():  # picture
                selected.append(self.spider_pic)
            if index.checkBox_4.checkState():  # link
                selected.append(self.spider_href)
            if index.checkBox_5.checkState():  # custom
                selected.append(self.spider_custom)
            jobs = [gevent.spawn(task, index) for task in selected]
            gevent.joinall(jobs)
            index.textBrowser.append(url+"爬取结束......")
        except Exception as exc:
            # Catch Exception rather than everything so interpreter-exit and
            # greenlet-kill signals still propagate; also log the real cause.
            index.textBrowser.append("url输入错误!")
            index.textBrowser.append(type(exc).__name__ + ": " + str(exc))
    def spider_href(self, index): # links
        """Queue every not-yet-seen article link found in ``self.html``.

        New links are appended to both ``self.b`` (all links seen) and
        ``self.c`` (links still to crawl), in the order they appear on the
        page; the counts are logged to ``index.textBrowser``.
        """
        # NOTE(review): the pattern is hard-coded to one CSDN blog; a disabled
        # line in the original read it from index.lineEdit_3 instead.
        # The dots in the host name are escaped so they only match a literal ".".
        req_href = r"(https://blog\.csdn\.net/by_side_with_sun/article/details/.+?)\""
        all_list = re.findall(req_href, self.html)
        # Set lookup avoids rescanning the whole history for every candidate;
        # dict.fromkeys removes duplicates while keeping page order, so the
        # crawl order is reproducible.
        known = set(self.b)
        right_list = [href for href in dict.fromkeys(all_list) if href not in known]
        index.textBrowser.append("新增链接数:"+str(len(right_list)))
        self.b.extend(right_list)
        self.c.extend(right_list)
        index.textBrowser.append("剩余链接数:"+str(len(self.c)))
    def spider_pic(self,index): #图片
        index.textBrowser.append("爬取图片中....")
        req_pic = r'(.*?)"
        title = re.findall(req_title, self.html)
        req_text = r"
(.*?)
" text = re.findall(req_text, self.html, re.S) text = "".join(text) dr = re.compile(r'<[^>]+>', re.S) text = dr.sub('', text) with open(self.new_dic+"/article.txt","a", encoding="utf-8") as f: f.write((title[0]+"\n"+text).replace(u'\xa0', u'')) index.textBrowser.append("爬取--"+title[0]+"--文章结束....") def spider_custom(self, index): #自定义 print("custom") def save_href(self,index): with open(self.dic+"/href.txt","a", encoding="utf-8") as f: f.write("\n".join(self.b)) index.textBrowser.append("总计爬取"+str(len(self.b))+"条链接..........") index.textBrowser.append("总计爬取" + str(len(self.image)) + "张图片..........") if __name__ =="__main__": spider()

希望大家能在我的基础上做些修改和完善,互相学习。

你可能感兴趣的:(python)