Python字符串如同不可变的乐高积木:
模块 | 功能 | 时间复杂度 |
---|---|---|
切片 | 获取子串 | O(k) |
find | 查找子串 | O(n) |
join | 拼接操作 | O(n) |
正则匹配 | 模式搜索 | 通常 O(n),回溯严重时可能退化为指数级 |
需求 | 字符串 | 字节串 | 字符串数组 |
---|---|---|---|
可读性 | ✔️ | ❌ | ✔️ |
网络传输 | ❌ | ✔️ | ❌ |
修改频率 | 低 | 中 | 高 |
# Python 3.6+ 原生支持
import re
from string import Template
# Basic operations on a str that mixes ASCII and CJK characters.
text = "Python字符串"
print(text[2:5]) # prints: tho (slice of indices 2..4)
print(len(text)) # prints: 9 (6 ASCII letters + 3 CJK characters; len counts code points)
print("Py" in text) # prints: True (substring membership test)
name = "Alice"
age = 30
# f-string formatting (Python 3.6+)
print(f"{name}今年{age}岁") # prints: Alice今年30岁
import re

# Simple e-mail matcher: local part, "@", domain, then a TLD of 2+ letters.
# The TLD class is [A-Za-z]; inside a character class "|" is a literal
# character, so [A-Z|a-z] would also accept "|".
pattern = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
text = "联系邮箱:[email protected]"
match = re.search(pattern, text)
# re.search returns None when nothing matches, so guard before .group().
# NOTE(review): the address in `text` looks like an e-mail-protection
# placeholder rather than a real address, so no match is expected for it --
# confirm the intended sample text.
if match:
    print(match.group())
text = "中文"
# Encode to bytes
bytes_data = text.encode('utf-8')  # b'\xe4\xb8\xad\xe6\x96\x87'
# Decode back to str with the SAME codec that produced the bytes; decoding
# UTF-8 data as GBK with errors='ignore' silently yields wrong characters.
# errors='replace' keeps any corruption visible instead of dropping bytes.
decoded = bytes_data.decode('utf-8', errors='replace')
from string import Template

# $-style placeholders are filled from a mapping of placeholder name -> value.
values = {"name": "Bob", "score": 95}
tpl = Template("$name的分数是$score")
print(tpl.substitute(values))  # Bob的分数是95
# Inefficient: concatenating with += inside a loop
result = ""
for s in ["a", "b", "c"]:
    result += s  # may build a new str object on every iteration
# Efficient: collect the pieces and join them once
parts = ["a", "b", "c"]
result = "".join(parts)
dirty = " Hello, World! \n"
# split() with no argument splits on runs of any whitespace and discards
# leading/trailing whitespace; joining with one space normalises the text.
# (replace(" ", " ") swapped a space for an identical space, i.e. did nothing.)
clean = " ".join(dirty.split())
print(clean)  # Hello, World!
text = "Apple Inc. was founded in 1976."
# Tokenize on whitespace
words = text.split() # ['Apple', 'Inc.', ...]
# Capitalize the first letter of every word
title = text.title() # 'Apple Inc. Was Founded In 1976.'
# Write and read back with the same explicit encoding. Without
# encoding="utf-8" on the read, open() uses the platform default encoding,
# which may not be UTF-8 and would then mis-decode the non-ASCII text.
with open("data.txt", "w", encoding="utf-8") as f:
    f.write("Python文件操作")
with open("data.txt", "r", encoding="utf-8") as f:
    content = f.read()  # decoded as UTF-8
# Column-aligned output: name left-aligned in 10 columns, score right-aligned in 3
data = [("Alice", 95), ("Bob", 88)]
for name, score in data:
    print(f"{name:<10} | {score:>3}")
# Alice      |  95
# Bob        |  88
# 案例3输出:
[email protected]
# 案例7输出:
Hello, World!
# 案例10输出:
Alice      |  95
Bob        |  88
操作 | 方法 | 耗时(ms) | 内存开销 |
---|---|---|---|
拼接 | +操作符 | 1200 | 高 |
拼接 | join | 2.5 | 低 |
格式化 | %操作 | 45 | 中 |
格式化 | f-string | 28 | 低 |
优先使用f-string
# Formats `value` with two decimal places. `value` is not defined in the
# visible code and must be supplied by the surrounding program.
print(f"结果:{value:.2f}")
路径拼接
import os
# os.path.join inserts the platform's path separator between the components.
path = os.path.join("dir", "file.txt")
常量格式化
# Reusable template with named {table} and {id} placeholders (str.format style).
# NOTE(review): if this is ever filled with external input it carries the same
# injection risk as any string-built SQL -- prefer driver-bound parameters for values.
SQL_TEMPLATE = "SELECT * FROM {table} WHERE id={id}"
多行字符串
# Adjacent string literals inside the parentheses are concatenated into a
# single str, so no "+" or backslash continuation is needed.
long_text = (
"这是一段非常长的文本"
"可以自动连接相邻字符串"
)
模式预编译
# Compile the pattern once so the compiled object can be reused; matches one or more digits.
pattern = re.compile(r"\d+")
字符串驻留利用
import sys

# Whether two equal string literals are the same object is an implementation
# detail, so an identity assert on plain literals is not portable.
# sys.intern returns the canonical interned copy, which makes `is` reliable.
a = sys.intern("hello")
b = sys.intern("hello")
assert a is b  # interned strings with equal value are the same object
安全包含处理
import html  # needed for html.escape below

user_input = "alert('hack')"
# html.escape replaces &, <, > and (by default) both quote characters with
# HTML entities so the text can be embedded in markup safely.
safe = html.escape(user_input)
高效换行处理
# splitlines() splits on line boundaries and omits the line terminators.
lines = text.splitlines()
枚举字符串
from enum import Enum


class Color(Enum):
    """Closed set of colors; each member's value is its lowercase name."""

    RED = "red"
    BLUE = "blue"
类型提示
def process(text: str) -> str:
    """Return *text* converted to uppercase."""
    return text.upper()
编码忽略错误
# Pitfall: strict ASCII decoding. Assumes `data` is a bytes object holding
# non-ASCII bytes (not shown in this snippet) -- TODO confirm.
data.decode('ascii') # UnicodeDecodeError
误用is比较
# Pitfall: `is` tests object identity, not equality.
a = "hello!"
b = "hello!"
a is b # may be False: whether equal strings share one object is an implementation detail; compare with ==
循环拼接
# Pitfall demonstration (kept on purpose): building a string with += in a loop.
# Prefer collecting the pieces and calling "".join(...) once.
s = ""
for _ in range(10000):
    s += "a"  # very inefficient
编码混淆
# Pitfall demonstration (kept on purpose): no encoding argument, so the
# platform default encoding is used. Pass encoding="utf-8" in real code.
with open("data.txt", "w") as f:
    f.write("中文")  # may produce garbled text (missing encoding argument)
正则注入
# Pitfall: user_input is interpreted as a regular expression pattern.
# NOTE(review): if it comes from an untrusted source, consider
# re.escape(user_input) when a literal match is intended.
re.search(user_input, text) # dangerous!
切片越界
"abc"[10:20] # returns an empty string; out-of-range slices do not raise
不可变修改
# Pitfall: str objects are immutable.
s = "hello"
s[0] = "H" # TypeError: str does not support item assignment
格式化漏洞
user_input = "admin"
# Pitfall: interpolating input directly into SQL text lets crafted input change
# the query; pass values as bound parameters through the database driver instead.
query = f"SELECT * FROM users WHERE name='{user_input}'" # SQL injection risk
忽略大小写
# Pitfall: == on str is case-sensitive, so this condition is always False.
# For a case-insensitive test compare "Apple".casefold() == "apple".casefold().
if "Apple" == "apple":
    pass  # never reached
字节串混淆
# Indexing bytes yields an int (the byte value), not a one-character string.
# (Using "=" here would attempt item assignment, which bytes does not support.)
b"hello"[0] == 104  # True: 104 is the ASCII code of 'h'
编码诊断
# chardet is a third-party package (not part of the standard library).
import chardet
print(chardet.detect(b'\xe4\xb8\xad\xe6\x96\x87')) # guess the encoding of the raw bytes
特殊字符查看
print(repr("hello\nworld")) # shows escape sequences: prints 'hello\nworld'
内存优化检查
a = "hello"
print(id(a)) # print the object's identity to check interning (in CPython, id() is the memory address)