在计算机世界中,文件就像一本本等待翻阅的典籍。Python的open()
函数如同手持放大镜,让我们能精确控制阅读和书写:
# Classic open-mode combination: "r+" opens for read AND write,
# with the file pointer starting at the beginning.
with open("data.txt", "r+", encoding="utf-8") as f:
    content = f.read(10)    # read the first 10 CHARACTERS (text mode counts characters, not bytes)
    f.seek(0)               # move the file pointer back to the start
    f.write("Hello World")  # overwrite existing content from position 0
关键参数解析:

| 模式字符 | 读写权限 | 文件指针位置 |
|---|---|---|
| r | 只读 | 起始位置 |
| w | 只写(打开时清空文件) | 起始位置 |
| a | 追加 | 文件末尾 |
| b | 二进制(与其他模式组合使用) | 视所组合模式而定 |
| + | 读写 | 视所组合模式而定 |
pathlib
模块提供了面向对象的文件路径操作,比传统os.path更直观:
from pathlib import Path

# Build the target path with the "/" operator -- portable across operating systems.
current_dir = Path.cwd() / "data" / "raw"
file_path = current_dir / "sales_2023.csv"

# Cross-platform path inspection, printed one result per line.
for detail in (
    file_path.exists(),              # does the file exist on disk?
    file_path.parent,                # containing directory
    file_path.stem,                  # file name without its extension
    file_path.with_suffix(".xlsx"),  # same path with the extension swapped
):
    print(detail)
处理文件时,try-except
块是必备的防护装备:
# Guard file access with targeted exception handlers -- one branch per failure mode.
try:
    with open("secret.txt", "r") as f:
        print(f.read())
except FileNotFoundError:
    # path is wrong or the file was removed
    print("⚠️ 文件失踪!请检查路径")
except PermissionError:
    # insufficient OS-level access rights
    print(" 权限不足!请联系管理员")
except UnicodeDecodeError:
    # the file's bytes don't match the default text encoding
    print(" 文件编码异常,尝试utf-8或gbk编码")
处理Excel表格前,CSV往往是更轻量级的选择:
import csv

# Write CSV (the csv module handles quoting/escaping automatically).
# newline="" is required so the csv module controls line endings itself.
with open("employees.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["姓名", "年龄", "邮箱"])
    writer.writerows([
        ["张三", 28, "[email protected]"],
        ["李四", 34, "[email protected]"],
    ])

# Read CSV (DictReader maps each row onto the header fields).
# Fix: read back with the same encoding/newline settings used when writing;
# relying on the platform default encoding can fail on the UTF-8 data.
with open("employees.csv", "r", newline="", encoding="utf-8") as f:
    reader = csv.DictReader(f)
    for row in reader:
        print(f"{row['姓名']}({row['年龄']}岁) → {row['邮箱']}")
高级技巧:
- 合并单元格:CSV 格式本身没有该概念,需在读取后手动处理
- 自定义分隔符:csv.writer(f, delimiter=";")
- 非 UTF-8 编码文件:用 open(..., encoding="gbk") 打开后,再交给 csv 模块解析
JSON已成为API通信的标准格式,Python内置json
模块提供高效处理:
import json

# Object -> JSON (pretty-printed; ensure_ascii=False keeps CJK characters readable).
data = {
    "name": "Alice",
    "age": 30,
    "hobbies": ["reading", "hiking"],
    "address": {"city": "Beijing", "street": "Xidan"}
}
print(json.dumps(data, indent=4, ensure_ascii=False))

# JSON -> object. Fix: JSON booleans are lowercase ("true"); the original
# literal "True" is invalid JSON and made json.loads raise JSONDecodeError.
json_str = '''
{
    "numbers": [1, 2.5, true],
    "null_value": null,
    "datetime": "2023-10-05T14:30:00Z"
}
'''
data = json.loads(json_str)
print(type(data["datetime"]))  # <class 'str'> -- JSON has no native datetime type
进阶要点:
- 继承 json.JSONEncoder 可为自定义类型实现编码
- 超大 JSON 文件可用 ijson 库流式解析
- 解析 JSON 请用 json 模块而非 eval:更安全,速度也通常快得多

虽然JSON更流行,但XML在配置文件中仍有不可替代的地位:
import xml.etree.ElementTree as ET

# Build an XML document in memory.
# Fix: tag names must not carry surrounding spaces (" bookstore " is not a valid tag).
root = ET.Element("bookstore")
book = ET.SubElement(root, "book", id="bk101")
ET.SubElement(book, "author").text = "J.K. Rowling"
ET.SubElement(book, "title").text = "Harry Potter"

# XML -> dict (recursive).
# Fix: leaf elements must yield their text; the original returned {} for every
# leaf, so the printed result never matched the comment below.
def xml_to_dict(element):
    """Convert an Element tree to nested dicts; leaves become their text content."""
    if len(element) == 0:  # no child elements: this is a leaf node
        return element.text
    return {child.tag: xml_to_dict(child) for child in element}

book_dict = xml_to_dict(book)
print(book_dict)  # {'author': 'J.K. Rowling', 'title': 'Harry Potter'}
进阶提示:
- 处理大型 XML 文件时,可改用 lxml 库获得更好的性能
- XPath 按属性查询:root.findall(".//book[@id='bk101']")(属性名需加 @ 前缀)
处理GB级文件时,流式处理能节省宝贵内存:
import sys

# Stream large input line by line from stdin -- only one line is in memory at a time.
for line in sys.stdin:
    if "ERROR" in line:
        print(f"发现错误行:{line.strip()}")

# Read a binary stream in fixed-size chunks.
with open("image.jpg", "rb") as f:
    chunk = f.read(1024)
    while chunk:                 # f.read returns b"" at EOF, which is falsy
        process_chunk(chunk)     # NOTE(review): process_chunk must be defined by the caller
        chunk = f.read(1024)
pickle
模块让对象能跨越内存与磁盘:
import pickle

# Persist an object's state to disk.
obj = {"name": "Buddy", "age": 5, "skills": ["fetch", "roll"]}
with open("dog.pkl", "wb") as f:
    # HIGHEST_PROTOCOL: the most compact/fastest format this interpreter supports
    pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)

# Load it back (pickle auto-detects the protocol version from the stream).
with open("dog.pkl", "rb") as f:
    loaded_obj = pickle.load(f)
print(loaded_obj["name"])  # prints "Buddy"
安全警告:切勿对不可信来源的数据调用 pickle.load——反序列化可以执行任意代码;网络传输请用 json 代替 pickle。

configparser 模块让配置管理变得优雅:
import configparser

# Read an INI-format configuration file.
config = configparser.ConfigParser()
config.read("app.ini")  # silently ignores missing files (returns the list of files read)

# Look up settings dynamically (values are always strings -- convert explicitly).
db_host = config["database"]["host"]
max_connections = int(config["server"]["max_connections"])

# Write the configuration back.
# NOTE: configparser does NOT preserve comments or original formatting on rewrite.
with open("app.ini", "w") as f:
    config.write(f)
处理二进制文件时,直接操作字节流效率提升显著:
# Buffered chunked reads for binary files.
with open("video.mp4", "rb") as f:
    buffer = f.read(4096)
    while buffer:
        process_buffer(buffer)  # NOTE(review): process_buffer must be defined by the caller
        buffer = f.read(4096)

# Memory-mapped file (good for random access into huge files).
# Fix: mmap was used without being imported.
import mmap

with open("genome.fasta", "r+b") as f:
    mmapped = mmap.mmap(f.fileno(), 0)  # length 0 maps the entire file
    # Random access into the genome sequence.
    start = mmapped.find(b"ATG", 0)
    end = mmapped.find(b"TAG", start)
缓存常用数据避免重复计算:
# Fix: `time` was used without being imported.
import time
from functools import lru_cache

@lru_cache(maxsize=128)
def expensive_calculation(n):
    """Return n squared; the 1-second sleep simulates an expensive computation."""
    time.sleep(1)
    return n * n

# The first call pays the full one-second cost...
print(expensive_calculation(100))
# ...the second is served instantly from the cache.
print(expensive_calculation(100))
# Bad: the file handle is never closed, so data may never be flushed.
file = open("log.txt", "w")
file.write("error message")

# Good: the context manager guarantees the file is closed.
with open("log.txt", "w") as f:
    f.write("error message")

# Bad: hand-built separators break across operating systems.
path = "data" + "/" + "file.txt"

# Good: pathlib handles separators portably.
from pathlib import Path
path = Path("data") / "file.txt"

# Bad for Chinese text: ASCII escaping makes the output unreadable
# (note: ensure_ascii=True escapes the values too, not just the keys).
json.dumps({"中文": "测试"}, ensure_ascii=True)   # {"\u4e2d\u6587": "\u6d4b\u8bd5"}
# Good: keep the original characters.
json.dumps({"中文": "测试"}, ensure_ascii=False)  # {"中文": "测试"}
import pandas as pd
from datetime import datetime

# 1. Load the raw file (keep order_date as text so parsing is explicit below).
raw_df = pd.read_csv("sales_raw.csv", dtype={"order_date": str})

# 2. Clean the data.
# Fix: DataFrame.pipe takes ONE callable (further positional args are passed
# into it), so the original multi-lambda call raised a TypeError. Also,
# datetime.strptime cannot be applied to a whole Series, and int(round(...))
# does not work element-wise. Chain vectorized pandas operations instead.
clean_df = (
    raw_df
    .dropna(subset=["product_id"])    # drop rows missing the product key
    .loc[lambda df: df["price"] > 0]  # keep only valid (positive) prices
    .assign(
        # parse the text dates, keeping just the calendar date
        order_date=lambda df: pd.to_datetime(df["order_date"], format="%Y-%m-%d").dt.date,
        # integer cents avoid float rounding issues downstream
        price_cents=lambda df: (df["price"] * 100).round().astype(int),
    )
)

# 3. Export in a compact columnar format.
clean_df.to_parquet("sales_clean.parquet", compression="snappy")

# 4. Append a log record.
with open("data_pipeline.log", "a", encoding="utf-8") as f:
    f.write(f"处理完成:{datetime.now()} | 记录数:{len(clean_df)}\n")
回顾本文,我们演练了 open()、read()、write() 的十八般武艺。正如一句箴言所说:

"优秀的文件处理代码应该像精心设计的自动售货机——用户只需投入数据,就能得到预期的格式化商品,而无需理解内部机械结构。"
——《Effective Python》作者 Brett Slatkin
最终,当你能像操作自然语言一样处理文件时,就会发现编程世界的数据汪洋,不过是一艘艘等待登船的智慧方舟。记住:每次正确关闭文件句柄,都是对计算机资源的尊重;每行优雅的格式化代码,都在为未来的自己节省时间。