pdfplumber是一个Python库,专门用于从PDF文件中提取文本、表格和其他信息。相比其他PDF处理库,pdfplumber提供了更直观的API和更精确的文本定位能力。
主要特点:
pip install pdfplumber
import pdfplumber

# Open the PDF inside a with-block and print the text of its first page.
with pdfplumber.open("example.pdf") as pdf:
    first_page = pdf.pages[0]  # pdf.pages is a list; index 0 is page one
    print(first_page.extract_text())
代码解释:
pdfplumber.open()
打开PDF文件
pdf.pages
获取所有页面的列表
extract_text()
提取页面文本内容
with pdfplumber.open("report.pdf") as pdf:
    # Walk the pages in document order and print each page's text.
    for page in pdf.pages:
        print(page.extract_text())
应用场景:合同文本分析、报告内容提取等
with pdfplumber.open("formatted.pdf") as pdf:
    page = pdf.pages[0]
    # Word dicts only include character attributes such as "fontname" when
    # they are requested via extra_attrs; without it word['fontname'] below
    # raises KeyError.
    words = page.extract_words(extra_attrs=["fontname"])
    for word in words:
        print(f"文本: {word['text']}, 位置: {word['x0'], word['top']}, 字体: {word['fontname']}")
输出示例:
文本: 标题, 位置: (72.0, 84.0), 字体: Helvetica-Bold
文本: 内容, 位置: (72.0, 96.0), 字体: Helvetica
with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]
    # Region of interest given as (x0, top, x1, bottom).
    area = (50, 100, 400, 300)
    # Extract text only from the cropped region.
    cropped = page.crop(area)
    print(cropped.extract_text())
应用场景:提取发票中的特定信息、扫描件中的关键数据等
with pdfplumber.open("data.pdf") as pdf:
    page = pdf.pages[0]
    # extract_table() gives None when no table is detected on the page, so
    # fall back to an empty list rather than iterating over None.
    table = page.extract_table() or []
    for row in table:
        print(row)
输出示例:
['姓名', '年龄', '职业']
['张三', '28', '工程师']
['李四', '32', '设计师']
with pdfplumber.open("complex_table.pdf") as pdf:
    page = pdf.pages[0]
    # Custom table-detection settings: both strategies are set to "text",
    # with an intersection tolerance of 10 along y.
    table_settings = {
        "vertical_strategy": "text",
        "horizontal_strategy": "text",
        "intersection_y_tolerance": 10,
    }
    table = page.extract_table(table_settings)
参数说明:
vertical_strategy
:垂直分割策略
horizontal_strategy
:水平分割策略
intersection_y_tolerance
:行合并容差with pdfplumber.open("multi_page_table.pdf") as pdf:
full_table = []
for page in pdf.pages:
table = page.extract_table()
if table:
# 跳过表头(假设第一页已经有表头)
if page.page_number > 1:
table = table[1:]
full_table.extend(table)
for row in full_table:
print(row)
应用场景:财务报表分析、数据报表汇总等
with pdfplumber.open("debug.pdf") as pdf:
    page = pdf.pages[0]
    # Render the page to an image, overlay the table finder's detections,
    # and display the result.
    im = page.to_image()
    im.debug_tablefinder().show()
功能说明:
to_image()
将页面转为图像
debug_tablefinder()
高亮显示检测到的表格
show()
显示图像(需要安装Pillow)
with pdfplumber.open("drawing.pdf") as pdf:
    page = pdf.pages[0]
    # Collect the page's line, curve and rectangle objects and report how
    # many of each were found.
    lines = page.lines
    curves = page.curves
    rects = page.rects
    print(f"找到 {len(lines)} 条直线")
    print(f"找到 {len(curves)} 条曲线")
    print(f"找到 {len(rects)} 个矩形")
应用场景:工程图纸分析、设计文档处理等
def custom_extract_method(page):
    """Rebuild a page's text line by line from its individual characters.

    Characters are grouped by their ``top`` coordinate rounded to the nearest
    integer. Groups are emitted in ascending ``top`` order, and the characters
    inside each group are ordered by ascending ``x0``.

    Args:
        page: Object exposing ``chars``, an iterable of dicts that each carry
            ``"top"``, ``"x0"`` and ``"text"`` keys.

    Returns:
        The reconstructed lines joined by newline characters, or an empty
        string when the page has no characters.
    """
    # Bucket characters into rows keyed by their rounded vertical position.
    rows = {}
    for char in page.chars:
        rows.setdefault(round(char["top"]), []).append(char)

    # Emit rows top to bottom, each row sorted left to right.
    return "\n".join(
        "".join(c["text"] for c in sorted(rows[top], key=lambda c: c["x0"]))
        for top in sorted(rows)
    )
# Run the character-based extractor on the first page of a PDF.
with pdfplumber.open("custom.pdf") as pdf:
    page = pdf.pages[0]
    print(custom_extract_method(page))
应用场景:处理特殊格式的PDF文档
with pdfplumber.open("large.pdf") as pdf:
    # Only handle the first 5 pages.
    # NOTE: process() is a placeholder for caller-supplied handling; it is
    # not defined in this snippet.
    for page in pdf.pages[:5]:
        process(page.extract_text())
from concurrent.futures import ThreadPoolExecutor


def process_page(page):
    """Return the text extracted from a single page."""
    return page.extract_text()


with pdfplumber.open("big_file.pdf") as pdf:
    with ThreadPoolExecutor(max_workers=4) as executor:
        # executor.map() yields results in the same order as pdf.pages.
        # NOTE(review): every worker shares the one open PDF object; confirm
        # concurrent extraction is safe with the pdfplumber version in use.
        results = list(executor.map(process_page, pdf.pages))
import pickle
def extract_and_cache(pdf_path, cache_path):
    """Return the per-page text of a PDF, using a pickle file as a cache.

    If ``cache_path`` exists, its unpickled contents are returned and the PDF
    is not opened. Otherwise every page of ``pdf_path`` is extracted, the
    resulting list is pickled to ``cache_path``, and the list is returned.

    Args:
        pdf_path: Path of the PDF to extract when no cache exists.
        cache_path: Path of the pickle cache file to read or create.

    Returns:
        The cached object, or a list with one ``extract_text()`` result per
        page.
    """
    try:
        with open(cache_path, "rb") as f:
            # NOTE: unpickling can execute arbitrary code; only point
            # cache_path at files this program wrote itself.
            return pickle.load(f)
    except FileNotFoundError:
        # No cache yet: fall through and build it. Extraction runs outside
        # this handler so its own errors are not chained to the cache miss.
        pass

    with pdfplumber.open(pdf_path) as pdf:
        data = [page.extract_text() for page in pdf.pages]
    with open(cache_path, "wb") as f:
        pickle.dump(data, f)
    return data
# Example call: extracts report.pdf and writes report_cache.pkl when the cache
# file is missing; otherwise loads the cached result.
text_data = extract_and_cache("report.pdf", "report_cache.pkl")
def _value_after_colon(line):
    """Return the stripped field following the first colon, or None if absent.

    A half-width ":" is tried first; a full-width ":" is accepted as a
    fallback.
    """
    for separator in (":", ":"):
        fields = line.split(separator)
        if len(fields) > 1:
            return fields[1].strip()
    return None


def extract_invoice_info(pdf_path):
    """Scan a PDF for invoice number, date and total.

    Every text line of every page is checked in order. A line containing
    "发票号码" sets ``invoice_no``, otherwise one containing "日期" sets
    ``date`` (both take the field after the first colon), otherwise one
    containing "合计" sets ``total`` to the line's last whitespace-separated
    token. Later matches overwrite earlier ones.

    Args:
        pdf_path: Path of the invoice PDF.

    Returns:
        Dict with keys ``"invoice_no"``, ``"date"`` and ``"total"``; a value
        stays ``None`` when no usable line was found.
    """
    invoice_data = {
        "invoice_no": None,
        "date": None,
        "total": None,
    }
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Guard against a page yielding no text at all.
            text = page.extract_text() or ""
            for line in text.split("\n"):
                if "发票号码" in line:
                    value = _value_after_colon(line)
                    # A keyword line without a colon is skipped, not a crash.
                    if value is not None:
                        invoice_data["invoice_no"] = value
                elif "日期" in line:
                    value = _value_after_colon(line)
                    if value is not None:
                        invoice_data["date"] = value
                elif "合计" in line:
                    invoice_data["total"] = line.split()[-1]
    return invoice_data
def analyze_paper(pdf_path):
    """Split a paper's text into abstract, introduction and conclusion.

    Lines are stripped and scanned in page order. A line equal to "abstract",
    or starting with "1. introduction" or "conclusion" (all case-insensitive),
    switches the current section and is not itself stored. Any other line is
    appended, followed by a newline, to the current section. Lines seen before
    the first heading are dropped.

    Args:
        pdf_path: Path of the paper PDF.

    Returns:
        Dict with keys ``"abstract"``, ``"introduction"`` and
        ``"conclusion"`` mapping to the collected text (empty string if the
        section was never found).
    """
    collected = {
        "abstract": [],
        "introduction": [],
        "conclusion": [],
    }
    current_section = None
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Guard against a page yielding no text at all.
            text = page.extract_text() or ""
            for raw_line in text.split("\n"):
                line = raw_line.strip()
                lowered = line.lower()
                if lowered == "abstract":
                    current_section = "abstract"
                elif lowered.startswith("1. introduction"):
                    current_section = "introduction"
                elif lowered.startswith("conclusion"):
                    current_section = "conclusion"
                elif current_section:
                    collected[current_section].append(line)
    # Each stored line is terminated by a newline, as in a running "+=" build.
    return {
        name: "".join(f"{line}\n" for line in lines)
        for name, lines in collected.items()
    }
import csv
def convert_pdf_to_csv(pdf_path, csv_path):
    """Write the table extracted from each page of a PDF to one CSV file.

    Pages are visited in order; for every page where ``extract_table()``
    returns a non-empty table, its rows are appended to the CSV. Pages
    without a table are skipped.

    Args:
        pdf_path: Path of the source PDF.
        csv_path: Path of the CSV file to create or overwrite.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # newline="" lets the csv module control line endings itself.
        # NOTE(review): no encoding is given, so the platform default is
        # used; confirm that suits the consumers of this file.
        with open(csv_path, "w", newline="") as f:
            writer = csv.writer(f)
            for page in pdf.pages:
                table = page.extract_table()
                if table:
                    writer.writerows(table)
with pdfplumber.open("chinese.pdf") as pdf:
    page = pdf.pages[0]
    # Author's note: make sure Chinese fonts are installed on the system.
    text = page.extract_text()
    # The text is already a str; encoding it to UTF-8 and decoding it straight
    # back yields the same string, so it is printed directly.
    print(text)
解决方案:
# Line-based table detection: the page's own line objects are passed as the
# explicit vertical and horizontal separators, and the intersection
# tolerances are set to 15 on both axes. Relies on `page` from the enclosing
# context.
table_settings = dict(
    vertical_strategy="lines",
    horizontal_strategy="lines",
    explicit_vertical_lines=page.lines,
    explicit_horizontal_lines=page.lines,
    intersection_x_tolerance=15,
    intersection_y_tolerance=15,
)
table = page.extract_table(table_settings)
调整策略:
# Process one page at a time and release its memory immediately.
# NOTE: process() is a placeholder for caller-supplied handling; it is not
# defined in this snippet.
with pdfplumber.open("huge.pdf") as pdf:
    for page_count, page in enumerate(pdf.pages, start=1):
        process(page.extract_text())
        # Free the objects cached for this page. pdfplumber exposes this on
        # the page (Page.flush_cache()); the PDF object has no
        # release_resources() method.
        page.flush_cache()
        # Report progress after every 10th page.
        if page_count % 10 == 0:
            print(f"已处理 {page_count} 页")
优先选择pdfplumber:
考虑其他方案:
预处理PDF文件:
# Rewrite the PDF through Ghostscript's pdfwrite device before parsing it.
import subprocess

# check=True raises CalledProcessError when gs exits with a non-zero status,
# so a failed conversion is not silently ignored.
subprocess.run(
    ["gs", "-sDEVICE=pdfwrite", "-dNOPAUSE", "-dBATCH",
     "-dSAFER", "-sOutputFile=optimized.pdf", "original.pdf"],
    check=True,
)
组合使用多种工具:
# Combine with PyMuPDF to obtain more precise text positions.
import fitz
doc = fitz.open("combined.pdf")
建立错误处理机制:
def safe_extract(pdf_path):
    """Return the text of a PDF's first page, or None if anything fails.

    Any exception raised while opening the file, indexing the first page or
    extracting its text is reported with ``print`` and swallowed.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        The result of ``extract_text()`` on the first page, or ``None`` after
        an error.
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            return pdf.pages[0].extract_text()
    except Exception as e:
        # Deliberate catch-all: this helper is a best-effort boundary that
        # reports the failure and lets the caller carry on.
        print(f"处理{pdf_path}时出错: {str(e)}")
        return None
性能监控:
import time

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() can jump when the system clock is adjusted.
start = time.perf_counter()
# ... PDF processing goes here ...
print(f"处理耗时: {time.perf_counter()-start:.2f}秒")
pdfplumber是Python生态中最强大的PDF解析库之一,特别适合需要精确提取文本和表格数据的应用场景。通过合理使用其丰富的功能和灵活的API,可以解决大多数PDF处理需求。对于特殊需求,结合其他PDF处理工具和自定义逻辑,能够构建出高效可靠的PDF处理流程。