在日常工作中,我们经常需要从PowerPoint文档中提取文本内容进行处理。本文将详细介绍如何使用Python开发一个带图形界面的PPT文本提取工具,该工具可以轻松地从PPTX文件中提取所有文本内容,并按页码显示。
C:\pythoncode\new\GetContentOFPPT.py
import wx
import os
from pptx import Presentation
class PPTExtractFrame(wx.Frame):
    """Main window of the PPT text extraction tool.

    The window holds a button that opens a file picker and a read-only,
    multi-line text box that shows the text extracted from the chosen
    PPTX file, grouped by slide number.
    """

    def __init__(self):
        """Create the 800x600 top-level frame and build its widgets."""
        super().__init__(parent=None, title='PPT文字提取工具', size=(800, 600))
        self.InitUI()

    def InitUI(self):
        """Lay out the file-picker button above the expanding result text box."""
        # Panel that hosts every control
        panel = wx.Panel(self)
        # Vertical layout: button on top, text box below
        vbox = wx.BoxSizer(wx.VERTICAL)
        # File-picker button
        self.select_btn = wx.Button(panel, label='选择PPT文件')
        self.select_btn.Bind(wx.EVT_BUTTON, self.OnSelect)
        vbox.Add(self.select_btn, 0, wx.ALL | wx.CENTER, 5)
        # Read-only result area; proportion 1 lets it take the remaining space
        self.memo = wx.TextCtrl(panel, style=wx.TE_MULTILINE | wx.TE_READONLY)
        vbox.Add(self.memo, 1, wx.ALL | wx.EXPAND, 5)
        panel.SetSizer(vbox)

    def OnSelect(self, event):
        """Handle a button click: ask for a PPTX file and extract its text.

        Any exception raised during extraction is reported to the user in an
        error message box instead of propagating out of the event handler.
        """
        # File-open dialog restricted to existing *.pptx files
        with wx.FileDialog(self, "选择PPTX文件",
                           wildcard="PowerPoint files (*.pptx)|*.pptx",
                           style=wx.FD_OPEN | wx.FD_FILE_MUST_EXIST) as fileDialog:
            if fileDialog.ShowModal() == wx.ID_CANCEL:
                return
            # Path of the file the user picked
            pathname = fileDialog.GetPath()
            try:
                self.ExtractText(pathname)
            except Exception as e:
                wx.MessageBox(f'错误:{str(e)}', '错误',
                              wx.OK | wx.ICON_ERROR)

    def ExtractText(self, filepath):
        """Show the text of every slide in *filepath* in the result box.

        Slides are numbered from 1. Slides without any text are skipped.
        Text inside grouped shapes and table cells is included as well.

        Raises:
            Whatever ``Presentation(filepath)`` or the shape accessors raise
            for an unreadable or malformed file; the caller reports it.
        """
        # Clear the previous result
        self.memo.Clear()
        # Open the presentation
        prs = Presentation(filepath)
        pages = []
        # Walk the slides, numbering them from 1
        for idx, slide in enumerate(prs.slides, 1):
            text_content = list(self._iter_shape_texts(slide.shapes))
            # Only slides that actually contain text get a section
            if text_content:
                pages.append(f"第{idx}页内容:\n" + "\n".join(text_content) + "\n\n")
        # Write once at the end so a failure mid-way never leaves a
        # half-filled result on screen
        if pages:
            self.memo.AppendText("".join(pages))

    def _iter_shape_texts(self, shapes):
        """Yield the stripped, non-empty text found in *shapes*.

        Descends into group shapes (which expose a nested ``shapes``
        collection) and into table cells, neither of which has a ``text``
        attribute of its own.
        """
        for shape in shapes:
            nested = getattr(shape, "shapes", None)
            if nested is not None:
                # Group shape: its children carry the text
                yield from self._iter_shape_texts(nested)
                continue
            if getattr(shape, "has_table", False):
                # Table: emit the cells row by row
                for row in shape.table.rows:
                    for cell in row.cells:
                        cell_text = cell.text.strip()
                        if cell_text:
                            yield cell_text
                continue
            if hasattr(shape, "text"):
                shape_text = shape.text.strip()
                if shape_text:
                    yield shape_text
def main():
    """Start the wx application, show the main window and run the event loop."""
    application = wx.App()
    window = PPTExtractFrame()
    window.Show()
    application.MainLoop()


# Launch the GUI only when this file is executed as a script.
if __name__ == '__main__':
    main()
本项目使用了以下技术:Python 3、wxPython(wx 模块,用于构建图形界面)以及 python-pptx(pptx 模块,用于读取 PPTX 文件)。
程序采用面向对象的设计方法,主要包含以下组件:主窗口类 PPTExtractFrame(包含 InitUI、OnSelect、ExtractText 三个方法)以及程序入口函数 main。下面逐段说明。
import wx
import os
from pptx import Presentation
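这三个import语句导入了程序所需的核心模块:wx 即 wxPython,用于创建窗口、按钮、文本框等界面元素;os 是标准库的操作系统接口模块(上面的代码中并未实际用到);Presentation 来自 python-pptx,用于打开并读取 PPTX 文件。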
class PPTExtractFrame(wx.Frame):
    """Main window of the PPT text extraction tool."""

    def __init__(self):
        """Create the top-level frame, then build its widgets."""
        super().__init__(
            parent=None,
            title='PPT文字提取工具',
            size=(800, 600),
        )
        self.InitUI()
这是程序的主窗口类,继承自wx.Frame。构造函数中:先调用父类构造函数,将窗口标题设为“PPT文字提取工具”、初始大小设为 800×600,然后调用 InitUI 方法创建界面。
def InitUI(self):
    """Build the layout: a centred file-picker button above an expanding read-only text box."""
    container = wx.Panel(self)

    # Create the controls in the order they appear on screen.
    self.select_btn = wx.Button(container, label='选择PPT文件')
    self.memo = wx.TextCtrl(container, style=wx.TE_MULTILINE | wx.TE_READONLY)
    self.select_btn.Bind(wx.EVT_BUTTON, self.OnSelect)

    # Stack them vertically; only the text box grows with the window.
    layout = wx.BoxSizer(wx.VERTICAL)
    layout.Add(self.select_btn, proportion=0, flag=wx.ALL | wx.CENTER, border=5)
    layout.Add(self.memo, proportion=1, flag=wx.ALL | wx.EXPAND, border=5)
    container.SetSizer(layout)
InitUI方法创建了程序的界面布局:在面板上使用垂直方向的 BoxSizer,上方是居中显示的“选择PPT文件”按钮(点击事件绑定到 OnSelect),下方是占据剩余空间的多行只读文本框,用于显示提取结果。
def OnSelect(self, event):
    """Handle a button click: ask for a PPTX file and extract its text.

    Returns without doing anything when the dialog is cancelled. Any
    exception raised by the extraction is shown in an error message box.
    """
    dialog_style = wx.FD_OPEN | wx.FD_FILE_MUST_EXIST
    with wx.FileDialog(
        self,
        "选择PPTX文件",
        wildcard="PowerPoint files (*.pptx)|*.pptx",
        style=dialog_style,
    ) as dialog:
        answer = dialog.ShowModal()
        if answer == wx.ID_CANCEL:
            return

        chosen_path = dialog.GetPath()
        try:
            self.ExtractText(chosen_path)
        except Exception as exc:
            message = f'错误:{str(exc)}'
            wx.MessageBox(message, '错误', wx.OK | wx.ICON_ERROR)
OnSelect方法处理文件选择操作:弹出只允许选择已存在的 .pptx 文件的对话框;用户取消时直接返回,否则取得文件路径并调用 ExtractText;提取过程中出现的任何异常都会以错误消息框的形式提示用户。
def ExtractText(self, filepath):
    """Show the text of every slide in *filepath* in the result box.

    Slides are numbered from 1. Slides without any text are skipped.
    Text inside grouped shapes and table cells is included as well.

    Raises:
        Whatever ``Presentation(filepath)`` or the shape accessors raise
        for an unreadable or malformed file; the caller reports it.
    """
    self.memo.Clear()
    prs = Presentation(filepath)
    pages = []
    for idx, slide in enumerate(prs.slides, 1):
        text_content = list(self._iter_shape_texts(slide.shapes))
        # Only slides that actually contain text get a section
        if text_content:
            pages.append(f"第{idx}页内容:\n" + "\n".join(text_content) + "\n\n")
    # Write once at the end so a failure mid-way never leaves a
    # half-filled result on screen
    if pages:
        self.memo.AppendText("".join(pages))

def _iter_shape_texts(self, shapes):
    """Yield the stripped, non-empty text found in *shapes*.

    Descends into group shapes (which expose a nested ``shapes``
    collection) and into table cells, neither of which has a ``text``
    attribute of its own.
    """
    for shape in shapes:
        nested = getattr(shape, "shapes", None)
        if nested is not None:
            # Group shape: its children carry the text
            yield from self._iter_shape_texts(nested)
            continue
        if getattr(shape, "has_table", False):
            # Table: emit the cells row by row
            for row in shape.table.rows:
                for cell in row.cells:
                    cell_text = cell.text.strip()
                    if cell_text:
                        yield cell_text
            continue
        if hasattr(shape, "text"):
            shape_text = shape.text.strip()
            if shape_text:
                yield shape_text
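ExtractText方法是程序的核心功能:清空文本框,用 Presentation 打开文件,从第 1 页开始依次遍历每张幻灯片的形状并收集去除首尾空白后的非空文本;有文本的页面会以“第N页内容:”为标题显示在文本框中,没有文本的页面则被跳过。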
def main():
    """Start the wx application, show the main window and run the event loop."""
    application = wx.App()
    window = PPTExtractFrame()
    window.Show()
    application.MainLoop()


# Launch the GUI only when this file is executed as a script.
if __name__ == '__main__':
    main()
程序入口部分:main 函数创建 wx.App 应用对象和主窗口,显示窗口并进入事件循环;`if __name__ == '__main__'` 保证只有在直接运行该脚本时才会调用 main。