Python 提取 PDF 文件

一、示例代码

import os
import pdfplumber
from common.log import Log
from common.data_process import FileOperate
from common.config_reader import ConfigReader
import datetime

class BaseMethod:

    def __init__(self):
        """Create the logger, read the configured file path and build the file helper.

        Attributes set:
            log: a ``Log`` instance used for messages.
            file_path: the value returned by
                ``ConfigReader().get_value("file", "Case_file_path")``; the
                other methods treat it as the path of the PDF to process.
            fileoperate: a ``FileOperate`` instance used to write output.
        """
        self.log = Log()
        config = ConfigReader()
        self.file_path = config.get_value("file", "Case_file_path")
        self.fileoperate = FileOperate()

    def open_path(self):
        # 获取pdf文件路径
        (self.dirname, self.filename) = os.path.split(self.file_path)
        (self.file, extension) = os.path.splitext(self.filename)
        if self.dirname == "":
            return -1
        elif self.filename == "":
            return -1
        else:
            return 0

    def as_name(self):
        # pdf 提取信息后另存为的路径
        if not os.path.exists(self.dirname):
            os.mkdir(self.dirname)
        timestr = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
        self.savefile = os.path.join(self.dirname, self.file+'-'+timestr)
        try:
            if len(self.savefile) <= 100:
                return self.savefile
        except:
            self.log.logMsg(2, 'Failed to get file')
            return None

    def pages(self):
        """Open the PDF at ``self.file_path`` and return its first page.

        Returns:
            The first entry of the opened document's ``pages`` collection,
            or ``None`` when the document has no pages.
        """
        # NOTE(review): the handle returned by pdfplumber.open() is never
        # closed; presumably the returned page still needs it - confirm
        # before adding a close here.
        document_pages = pdfplumber.open(self.file_path).pages
        # Only the first page is ever handed back to the caller.
        return document_pages[0] if len(document_pages) else None

    def as_txt_file(self):
        # 读取pdf文件,写入txt文件
        txt_file = self.as_name()
        try:
            table = self.pages().extract_text()
            self.fileoperate.writefile(txt_file + ".txt", "txt", table

你可能感兴趣的:(python,python,开发语言)