PyPDF2是一个历史悠久的Python PDF处理库,而PyPDF4是其改进和维护的分支版本:
# 安装PyPDF2
pip install PyPDF2
# 或安装PyPDF4(推荐)
pip install PyPDF4
PyPDF4相比PyPDF2有以下优势:
本文后续代码同时兼容这两个库,只需调整导入语句即可:
# 使用PyPDF2
from PyPDF2 import PdfFileReader, PdfFileWriter
# 或使用PyPDF4
from PyPDF4 import PdfFileReader, PdfFileWriter
理解PDF文件的基本结构有助于更好地操作它们:
PyPDF2/4主要关注页面级操作,对内部元素的处理相对有限。
from PyPDF4 import PdfFileReader
def get_pdf_info(pdf_path):
    """Return basic information about a PDF file.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        A dict with the keys ``page_count``, ``title``, ``author``,
        ``subject``, ``creator``, ``producer`` and ``is_encrypted``.
        Text fields that are missing or empty are replaced by a
        placeholder string.
    """
    # Placeholder reported for each text field that is missing or empty.
    # The insertion order here fixes the key order of the returned dict.
    placeholders = {
        'title': "无标题",
        'author': "未知作者",
        'subject': "无主题",
        'creator': "未知创建者",
        'producer': "未知生成器",
    }

    with open(pdf_path, 'rb') as file:
        reader = PdfFileReader(file)
        page_count = reader.getNumPages()
        # getDocumentInfo() returns None when the file carries no document
        # information dictionary, so it must not be dereferenced directly.
        info = reader.getDocumentInfo()
        is_encrypted = reader.isEncrypted

        result = {'page_count': page_count}
        for field, placeholder in placeholders.items():
            # getattr with a default also covers ``info is None``.
            value = getattr(info, field, None)
            result[field] = value if value else placeholder
        result['is_encrypted'] = is_encrypted
        return result
# Usage example: print a short summary of one document.
pdf_info = get_pdf_info('example.pdf')
encryption_status = '已加密' if pdf_info['is_encrypted'] else '未加密'
print(f"PDF信息:\n页数: {pdf_info['page_count']}")
print(f"标题: {pdf_info['title']}")
print(f"作者: {pdf_info['author']}")
print(f"加密状态: {encryption_status}")
def extract_text_from_pdf(pdf_path, password=None):
    """Extract the text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.
        password: Optional password; used only when the file is encrypted.

    Returns:
        The text of all pages joined with newlines, or the message
        "提供的密码不正确" when decrypting with ``password`` fails.
    """
    with open(pdf_path, 'rb') as pdf_file:
        pdf = PdfFileReader(pdf_file)

        # Decryption is attempted only when a password was supplied.
        should_decrypt = pdf.isEncrypted and password
        if should_decrypt and not pdf.decrypt(password):
            return "提供的密码不正确"

        # Collect the text page by page, then join it into one string.
        page_texts = [
            pdf.getPage(index).extractText()
            for index in range(pdf.getNumPages())
        ]
        return "\n".join(page_texts)
# Usage example
text = extract_text_from_pdf('document.pdf')
preview = text[:300]  # show only the first 300 characters
print(f"提取的文本预览:\n{preview}...")
def _first_attr(obj, *names):
    """Return the first non-None attribute of ``obj`` among ``names``, else None."""
    for name in names:
        value = getattr(obj, name, None)
        if value is not None:
            return value
    return None


def extract_pdf_metadata(pdf_path):
    """Extract the metadata of a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A dict with the document-information fields (``title``, ``author``,
        ``subject``, ``creator``, ``producer``, ``creation_date``,
        ``modification_date``), the ``pdf_version`` and an ``xmp_metadata``
        dict. Any field the reader does not provide is None;
        ``xmp_metadata`` is empty when the file has no XMP packet.
    """
    with open(pdf_path, 'rb') as file:
        reader = PdfFileReader(file)
        info = reader.getDocumentInfo()

        # Fetch the XMP metadata once instead of calling the getter twice.
        xmp_info = {}
        get_xmp = getattr(reader, 'getXmpMetadata', None)
        xmp = get_xmp() if get_xmp is not None else None
        if xmp:
            # The XMP object exposes these fields under namespace-prefixed
            # names whose spelling differs between library releases; the
            # unprefixed names are still tried first for compatibility.
            xmp_info = {
                'creator_tool': _first_attr(
                    xmp, 'creator_tool', 'xmp_creator_tool', 'xmp_creatorTool'),
                'description': _first_attr(
                    xmp, 'description', 'dc_description'),
                'create_date': _first_attr(
                    xmp, 'create_date', 'xmp_create_date', 'xmp_createDate'),
                'modify_date': _first_attr(
                    xmp, 'modify_date', 'xmp_modify_date', 'xmp_modifyDate'),
            }

        # Looked up defensively, like every other optional attribute here,
        # so a reader without ``pdf_header`` yields None instead of raising.
        pdf_version = getattr(reader, 'pdf_header', None)

        # getattr with a default also covers ``info is None``.
        metadata = {
            'title': getattr(info, 'title', None),
            'author': getattr(info, 'author', None),
            'subject': getattr(info, 'subject', None),
            'creator': getattr(info, 'creator', None),
            'producer': getattr(info, 'producer', None),
            'creation_date': getattr(info, 'creation_date', None),
            'modification_date': getattr(info, 'modification_date', None),
            'pdf_version': pdf_version,
            'xmp_metadata': xmp_info,
        }
        return metadata
# Usage example: list the plain fields first.
metadata = extract_pdf_metadata('report.pdf')
for key, value in metadata.items():
    if key == 'xmp_metadata':
        continue
    print(f"{key}: {value}")

# Print the XMP metadata separately, when there is any.
if metadata['xmp_metadata']:
    print("\nXMP元数据:")
    for key, value in metadata['xmp_metadata'].items():
        print(f"{key}: {value}")
def merge_pdfs(pdf_paths, output_path):
    """Merge several PDF files into one new file.

    Encrypted inputs are skipped with a printed notice.

    Args:
        pdf_paths: Iterable of input PDF paths, in the order to merge them.
        output_path: Path of the merged PDF to create.

    Returns:
        ``output_path``.
    """
    from contextlib import ExitStack

    pdf_writer = PdfFileWriter()

    # The writer pulls page content from the source streams when write()
    # runs, so every input file has to stay open until the output has been
    # written. ExitStack closes them all afterwards, also on error.
    with ExitStack() as open_inputs:
        for path in pdf_paths:
            file = open_inputs.enter_context(open(path, 'rb'))
            pdf_reader = PdfFileReader(file)

            # Encrypted files are not handled here; report and move on.
            if pdf_reader.isEncrypted:
                print(f"跳过加密文件: {path}")
                continue

            for page_num in range(pdf_reader.getNumPages()):
                pdf_writer.addPage(pdf_reader.getPage(page_num))

        with open(output_path, 'wb') as output_file:
            pdf_writer.write(output_file)

    return output_path
# Usage example
input_pdfs = [
    'document1.pdf',
    'document2.pdf',
    'document3.pdf',
]
merged_pdf = merge_pdfs(input_pdfs, 'merged_document.pdf')
print(f"合并完成,已保存到: {merged_pdf}")
def split_pdf(input_path, output_name_pattern='page_%i.pdf'):
    """Split a PDF file into one file per page.

    Args:
        input_path: Path of the PDF file to split.
        output_name_pattern: ``%``-style pattern with one integer
            placeholder; it receives the 1-based page number. It may
            include a directory part, which is created when missing.

    Returns:
        The list of written file paths, or the message "无法拆分加密文件"
        when the input is encrypted.
    """
    import os

    with open(input_path, 'rb') as file:
        pdf_reader = PdfFileReader(file)

        if pdf_reader.isEncrypted:
            return "无法拆分加密文件"

        output_files = []
        for page_num in range(pdf_reader.getNumPages()):
            # A fresh writer per page, so each output holds exactly one page.
            pdf_writer = PdfFileWriter()
            pdf_writer.addPage(pdf_reader.getPage(page_num))

            output_path = output_name_pattern % (page_num + 1)

            # The pattern may point into a directory that does not exist
            # yet; create it rather than fail with FileNotFoundError.
            target_dir = os.path.dirname(output_path)
            if target_dir:
                os.makedirs(target_dir, exist_ok=True)

            with open(output_path, 'wb') as output_file:
                pdf_writer.write(output_file)
            output_files.append(output_path)

        return output_files
# Usage example
output_pages = split_pdf('document.pdf', 'output/page_%03d.pdf')
total_pages = len(output_pages)
print(f"文件已拆分为 {total_pages} 页:")
for page in output_pages[:5]:  # list only the first five
    print(f" - {page}")
if total_pages > 5:
    print(f"... 以及其他 {total_pages - 5} 页")
def extract_pages(input_path, output_path, pages):
    """Copy selected pages of a PDF into a new file.

    Args:
        input_path: Path of the source PDF.
        output_path: Path of the PDF to create.
        pages: Zero-based page indices to copy, in output order.

    Returns:
        ``output_path``.
    """
    with open(input_path, 'rb') as source:
        reader = PdfFileReader(source)
        writer = PdfFileWriter()
        total = reader.getNumPages()

        for index in pages:
            if not 0 <= index < total:
                # Out-of-range indices are reported (1-based) and skipped.
                print(f"警告: 页码 {index+1} 超出范围(1-{total})")
                continue
            writer.addPage(reader.getPage(index))

        # Write while the source file is still open.
        with open(output_path, 'wb') as target:
            writer.write(target)

    return output_path
# Usage example: extract pages 1, 3 and 5 (indices are zero-based).
wanted_pages = [0, 2, 4]
extracted_pdf = extract_pages('report.pdf', 'extracted_pages.pdf', wanted_pages)
print(f"已提取选定页面到: {extracted_pdf}")
def add_watermark(input_path, watermark_path, output_path):
"""向PDF文件添加水印
参数:
input_path: 原始PDF文件路径
watermark_path: 水印PDF文件路径(通常为单页)
output_path: 输出带水印的PDF文件路径
"""
# 打开水印PDF
with open(watermark_path, 'rb') as watermark_file:
watermark_reader = PdfFileReader(watermark_file)
# 获取水印页面
watermark_page = watermark_reader.getPage(0)
# 打开原始PDF
with open(input_path, 'rb') as input_file:
pdf_reader = PdfFileReader(input_file)
pdf_writer = PdfFileWriter()
# 为每一页添加水印
for page_num in range(pdf_reader.getNumPages()):
page = pdf_reader.getPage(page_num)
# 将原始页面内容覆盖在水印上
# 注意: 顺序很重要!这样水印就会显示在内容下面
watermark_page.mergePage(page)
pdf_writer.addPage(watermark_page)
# 重新获取水印页面以避免重复使用
watermark_page = waterm