Feature | Process | Thread |
---|---|---|
Resource allocation | Independent memory space | Shares the process's memory |
Creation overhead | Large | Small |
Communication | IPC (pipes, queues, etc.) | Shared variables |
Context switching | Expensive | Cheap |
Safety | High (mutual isolation) | Low (requires synchronization) |
Affected by the GIL | No | Yes |
Typical workload | CPU-bound tasks | I/O-bound tasks |
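The GIL row is the decisive one. As a quick illustration, here is a minimal, hedged sketch (timings are machine-dependent and purely illustrative): a CPU-bound task gains little from threads because the GIL serializes Python bytecode, while processes run truly in parallel.

```python
# Compare a CPU-bound task under a thread pool vs. a process pool.
import time
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

def burn(n):
    # pure-Python CPU work; the GIL serializes this across threads
    return sum(i * i for i in range(n))

if __name__ == "__main__":
    work = [2_000_000] * 4
    for pool_cls in (ThreadPoolExecutor, ProcessPoolExecutor):
        start = time.perf_counter()
        with pool_cls(max_workers=4) as pool:
            list(pool.map(burn, work))
        print(f"{pool_cls.__name__}: {time.perf_counter() - start:.2f}s")
```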
import multiprocessing
import os
import time
from concurrent.futures import ProcessPoolExecutor, as_completed
`multiprocessing` and `concurrent.futures` are both standard-library modules for concurrency in Python. Their relationship can be summarized as follows:

- Foundation vs. abstraction: `multiprocessing` provides the low-level multi-process programming interface, while `concurrent.futures` is a high-level abstraction layer built on top of it.
- Design philosophy: `multiprocessing` offers comprehensive process-control capabilities; `concurrent.futures` offers a unified asynchronous execution interface.
- Key link: `concurrent.futures.ProcessPoolExecutor` is implemented internally on top of the `multiprocessing` module.
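To make the relationship concrete, the following sketch runs the same function through both APIs (the function `double` is purely illustrative):

```python
# The same task via the low-level and the high-level API.
import multiprocessing
from concurrent.futures import ProcessPoolExecutor

def double(x):
    return x * 2

if __name__ == "__main__":
    # multiprocessing: direct pool control
    with multiprocessing.Pool(processes=2) as pool:
        print(pool.map(double, [1, 2, 3]))            # [2, 4, 6]
    # concurrent.futures: a Future-based facade over the same machinery
    with ProcessPoolExecutor(max_workers=2) as executor:
        print(list(executor.map(double, [1, 2, 3])))  # [2, 4, 6]
```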
Feature | multiprocessing | concurrent.futures |
---|---|---|
Core components | Process, Pool, Queue, etc. | Executor and Future objects |
Programming paradigm | Procedural / object-oriented | Future-based asynchronous programming |
Complexity | Moderate (manual management) | Simple (resources managed automatically) |
Execution model | Direct process control | Managed through an Executor |
Error handling | Exceptions must be caught manually | Exceptions are captured in Future objects |
Result retrieval | Return values or shared objects | Future.result() |
Progress tracking | Custom implementation required | Built-in as_completed() helper |
Unified thread/process interface | No | Yes (ThreadPoolExecutor / ProcessPoolExecutor) |
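The last row deserves a demonstration. Because both executors implement the same interface, switching between threads and processes is a one-line change; a minimal sketch:

```python
# ThreadPoolExecutor and ProcessPoolExecutor are drop-in replacements.
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor

def measure(text):
    return len(text)

def run(executor_cls):
    with executor_cls(max_workers=2) as executor:
        return list(executor.map(measure, ["alpha", "beta"]))

if __name__ == "__main__":
    print(run(ThreadPoolExecutor))   # [5, 4] -- threads
    print(run(ProcessPoolExecutor))  # [5, 4] -- same code, processes
```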
How `ProcessPoolExecutor` is implemented internally
# Pseudocode: how ProcessPoolExecutor can be built on multiprocessing
# (the real CPython implementation is considerably more involved)
import threading
from concurrent.futures import Future

def _worker(call_queue, result_queue):
    # worker process: run tasks from the queue, send results back
    while True:
        item = call_queue.get()
        if item is None:  # shutdown sentinel
            break
        work_id, fn, args, kwargs = item
        result_queue.put((work_id, fn(*args, **kwargs)))

class ProcessPoolExecutor:
    def __init__(self, max_workers=4):
        # multiprocessing.Queue carries tasks and results between processes
        self._call_queue = multiprocessing.Queue()
        self._result_queue = multiprocessing.Queue()
        self._futures, self._next_id = {}, 0
        # multiprocessing.Process supplies the worker processes
        self._processes = set()
        for _ in range(max_workers):
            p = multiprocessing.Process(
                target=_worker,
                args=(self._call_queue, self._result_queue)
            )
            p.start()
            self._processes.add(p)
        # a background thread moves finished results into their Futures
        threading.Thread(target=self._collect, daemon=True).start()

    def _collect(self):
        while True:
            work_id, result = self._result_queue.get()
            self._futures.pop(work_id).set_result(result)

    def submit(self, fn, *args, **kwargs):
        # wrap the task in a Future and enqueue it for a worker
        future = Future()
        self._futures[self._next_id] = future
        self._call_queue.put((self._next_id, fn, args, kwargs))
        self._next_id += 1
        return future

    def shutdown(self):
        for _ in self._processes:
            self._call_queue.put(None)  # one sentinel per worker
        for p in self._processes:
            p.join()
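Assuming the simplified class above, usage mirrors the real executor:

```python
# Exercising the sketch; pow() is picklable, so it can cross processes.
if __name__ == "__main__":
    executor = ProcessPoolExecutor(max_workers=2)
    futures = [executor.submit(pow, i, 2) for i in range(5)]
    print([f.result() for f in futures])  # [0, 1, 4, 9, 16]
    executor.shutdown()
```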
Recommendations:

1. Prefer concurrent.futures when you mainly need to submit tasks and collect results: it manages the pool automatically, wraps errors in Future objects, and lets you switch between threads and processes with a one-line change.
2. Use multiprocessing directly when you need fine-grained process control: custom Process subclasses, IPC primitives such as Queue and Pipe, shared memory (Value/Array), or low-level synchronization.

The examples below start with basic Process usage.
def worker(name, delay):
print(f"Process {name} (PID: {os.getpid()}) started")
time.sleep(delay)
print(f"Process {name} completed")
return f"{name}-result"
if __name__ == "__main__":
    # create the processes
p1 = multiprocessing.Process(target=worker, args=("Alpha", 2))
p2 = multiprocessing.Process(target=worker, args=("Beta", 1))
    # start the processes
p1.start()
p2.start()
    # wait for the processes to finish
p1.join()
p2.join()
print("All processes completed")
class MyProcess(multiprocessing.Process):
def __init__(self, name, delay):
super().__init__()
self.name = name
self.delay = delay
def run(self):
print(f"Process {self.name} (PID: {os.getpid()}) started")
time.sleep(self.delay)
print(f"Process {self.name} completed")
if __name__ == "__main__":
processes = [
MyProcess("Alpha", 2),
MyProcess("Beta", 1),
MyProcess("Gamma", 3)
]
for p in processes:
p.start()
for p in processes:
p.join()
print("All custom processes finished")
Method/attribute | Description | Example |
---|---|---|
start() | Start the process | p.start() |
run() | Main body of the process (can be overridden) | Overridden in custom Process subclasses |
join(timeout) | Wait for the process to terminate | p.join() |
is_alive() | Check whether the process is still running | if p.is_alive(): ... |
name | Get/set the process name | p.name = "Worker-1" |
pid | Process identifier (an integer) | print(p.pid) |
daemon | Daemon flag (terminated automatically when the main process exits) | p.daemon = True |
exitcode | Exit code (None while still running) | print(p.exitcode) |
terminate() | Forcibly terminate the process | p.terminate() |
kill() | Like terminate() but sends SIGKILL | p.kill() |
close() | Close the process object and release its resources | p.close() |
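A short sketch exercising several of these attributes in one run:

```python
# Inspecting a Process object's lifecycle attributes.
import multiprocessing
import time

def nap():
    time.sleep(0.5)

if __name__ == "__main__":
    p = multiprocessing.Process(target=nap, name="Napper")
    p.start()
    print(p.name, p.pid, p.is_alive())  # Napper <pid> True
    p.join()
    print(p.exitcode, p.is_alive())     # 0 False
    p.close()                           # release the process object
```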
def producer(q, items):
for item in items:
print(f"Producing: {item}")
q.put(item)
time.sleep(0.1)
    q.put(None)  # sentinel signalling the end
def consumer(q):
while True:
item = q.get()
if item is None:
break
print(f"Consuming: {item}")
time.sleep(0.2)
if __name__ == "__main__":
queue = multiprocessing.Queue()
producer_process = multiprocessing.Process(
target=producer,
args=(queue, ["A", "B", "C", "D", "E"])
)
consumer_process = multiprocessing.Process(
target=consumer,
args=(queue,)
)
producer_process.start()
consumer_process.start()
producer_process.join()
consumer_process.join()
def sender(conn, messages):
for message in messages:
print(f"Sending: {message}")
conn.send(message)
time.sleep(0.1)
conn.send("END")
conn.close()
def receiver(conn):
while True:
message = conn.recv()
if message == "END":
break
print(f"Received: {message}")
time.sleep(0.15)
if __name__ == "__main__":
parent_conn, child_conn = multiprocessing.Pipe()
p1 = multiprocessing.Process(
target=sender,
args=(parent_conn, ["Msg1", "Msg2", "Msg3"])
)
p2 = multiprocessing.Process(
target=receiver,
args=(child_conn,)
)
p1.start()
p2.start()
p1.join()
p2.join()
def increment(counter):
for _ in range(10000):
with counter.get_lock():
counter.value += 1
if __name__ == "__main__":
counter = multiprocessing.Value('i', 0)
processes = []
for _ in range(5):
p = multiprocessing.Process(target=increment, args=(counter,))
processes.append(p)
p.start()
for p in processes:
p.join()
print(f"Final counter value: {counter.value}")
def square(arr, index):
arr[index] = arr[index] ** 2
if __name__ == "__main__":
    # create a shared array
shared_array = multiprocessing.Array('i', [1, 2, 3, 4, 5])
print("Original array:", list(shared_array))
processes = []
for i in range(len(shared_array)):
p = multiprocessing.Process(target=square, args=(shared_array, i))
processes.append(p)
p.start()
for p in processes:
p.join()
print("Squared array:", list(shared_array))
def worker(d, l, index):
d[index] = f"Value-{index}"
l.append(index ** 2)
if __name__ == "__main__":
with multiprocessing.Manager() as manager:
        # create a shared dict and a shared list
shared_dict = manager.dict()
shared_list = manager.list()
processes = []
for i in range(5):
p = multiprocessing.Process(
target=worker,
args=(shared_dict, shared_list, i)
)
processes.append(p)
p.start()
for p in processes:
p.join()
print("Shared Dictionary:", dict(shared_dict))
print("Shared List:", list(shared_list))
def printer(lock, msg):
with lock:
print(f"[{os.getpid()}] {msg}")
time.sleep(0.1)
if __name__ == "__main__":
lock = multiprocessing.Lock()
messages = ["Hello", "from", "multiple", "processes"]
processes = []
for msg in messages:
p = multiprocessing.Process(target=printer, args=(lock, msg))
processes.append(p)
p.start()
for p in processes:
p.join()
def access_resource(sem, worker_id):
with sem:
print(f"Worker {worker_id} acquired resource")
time.sleep(1)
print(f"Worker {worker_id} releasing resource")
if __name__ == "__main__":
    sem = multiprocessing.Semaphore(2)  # allow 2 processes to hold the resource at once
workers = 5
processes = []
for i in range(workers):
p = multiprocessing.Process(
target=access_resource,
args=(sem, i)
)
processes.append(p)
p.start()
for p in processes:
p.join()
def waiter(event, worker_id):
print(f"Worker {worker_id} waiting for event")
event.wait()
print(f"Worker {worker_id} processing after event")
def setter(event):
print("Setter sleeping before setting event")
time.sleep(2)
print("Setter setting event")
event.set()
if __name__ == "__main__":
event = multiprocessing.Event()
    # create the waiting processes
waiters = [
multiprocessing.Process(target=waiter, args=(event, i))
for i in range(3)
]
setter_proc = multiprocessing.Process(target=setter, args=(event,))
for p in waiters:
p.start()
setter_proc.start()
for p in waiters:
p.join()
setter_proc.join()
def cpu_intensive_task(n):
print(f"Processing {n} in PID {os.getpid()}")
return sum(i * i for i in range(n))
if __name__ == "__main__":
with multiprocessing.Pool(processes=4) as pool:
        # synchronous execution
result = pool.apply(cpu_intensive_task, (1000000,))
print(f"Apply result: {result}")
        # asynchronous execution
async_result = pool.apply_async(cpu_intensive_task, (2000000,))
print(f"Async result: {async_result.get()}")
        # map operation
results = pool.map(cpu_intensive_task, [100000, 200000, 300000])
print(f"Map results: {results}")
        # asynchronous map operation
async_results = pool.map_async(cpu_intensive_task, [400000, 500000])
print(f"Async map results: {async_results.get()}")
def task(n):
print(f"Processing {n} in PID {os.getpid()}")
return n * n
if __name__ == "__main__":
with ProcessPoolExecutor(max_workers=4) as executor:
        # submit a single task
future1 = executor.submit(task, 5)
print(f"Task result: {future1.result()}")
        # submit a batch of tasks
futures = [executor.submit(task, i) for i in range(10)]
        # handle results in completion order
for future in as_completed(futures):
print(f"Completed: {future.result()}")
        # use map for ordered results
results = executor.map(task, range(5, 15))
print("Ordered results:", list(results))
def success_callback(result):
print(f"Task succeeded with result: {result}")
def error_callback(exc):
print(f"Task failed with exception: {exc}")
def risky_task(n):
if n % 3 == 0:
raise ValueError(f"Bad number {n}")
return n ** 0.5
if __name__ == "__main__":
with ProcessPoolExecutor(max_workers=3) as executor:
futures = []
for i in range(10):
future = executor.submit(risky_task, i)
future.add_done_callback(
lambda f: success_callback(f.result()) if not f.exception()
else error_callback(f.exception())
)
futures.append(future)
        # wait for all tasks to complete
for future in futures:
try:
future.result()
except Exception as e:
                pass  # errors already handled in the callback
def producer(queue, count):
for i in range(count):
item = f"Item-{i}"
queue.put(item)
print(f"Produced {item}")
time.sleep(0.1)
    queue.put(None)  # sentinel signalling the end
def consumer(queue, worker_id):
while True:
item = queue.get()
if item is None:
            queue.put(None)  # pass the sentinel on to the other consumers
break
print(f"Worker {worker_id} consumed {item}")
time.sleep(0.2)
if __name__ == "__main__":
queue = multiprocessing.Queue(maxsize=5)
    # create the producer
producer_proc = multiprocessing.Process(
target=producer, args=(queue, 10)
)
    # create the consumers
consumers = [
multiprocessing.Process(
target=consumer, args=(queue, i)
)
for i in range(3)
]
producer_proc.start()
for c in consumers:
c.start()
producer_proc.join()
for c in consumers:
c.join()
def mapper(item):
"""映射函数:返回(单词, 1)元组"""
time.sleep(0.01) # 模拟处理时间
return [(word.lower(), 1) for word in item.split()]
def reducer(key, values):
"""归约函数:计算单词出现次数"""
return (key, sum(values))
if __name__ == "__main__":
    # sample text data
texts = [
"Python is an interpreted high-level programming language",
"Python is created by Guido van Rossum",
"Python is widely used in data science",
"Python supports multiple programming paradigms"
]
with ProcessPoolExecutor() as executor:
        # Map phase: process all texts in parallel
map_results = executor.map(mapper, texts)
        # collect all mapped results
all_items = []
for result in map_results:
all_items.extend(result)
        # group values by word
grouped = {}
for key, value in all_items:
if key not in grouped:
grouped[key] = []
grouped[key].append(value)
        # Reduce phase: process each word in parallel
reduce_futures = []
for key, values in grouped.items():
future = executor.submit(reducer, key, values)
reduce_futures.append(future)
        # gather the final results
word_counts = {}
for future in reduce_futures:
key, count = future.result()
word_counts[key] = count
        # print the word-frequency results
print("Word Count Results:")
for word, count in sorted(word_counts.items(), key=lambda x: x[1], reverse=True):
print(f"{word}: {count}")
# manager_server.py
import multiprocessing.managers
class SharedManager(multiprocessing.managers.BaseManager):
pass
def run_server():
shared_dict = {}
shared_list = []
SharedManager.register('get_dict', callable=lambda: shared_dict)
SharedManager.register('get_list', callable=lambda: shared_list)
manager = SharedManager(address=('', 50000), authkey=b'secret')
server = manager.get_server()
print("Server started...")
server.serve_forever()
if __name__ == "__main__":
run_server()
# client_worker.py
import multiprocessing
import multiprocessing.managers
import os
class SharedManager(multiprocessing.managers.BaseManager):
pass
def worker():
SharedManager.register('get_dict')
SharedManager.register('get_list')
manager = SharedManager(address=('localhost', 50000), authkey=b'secret')
manager.connect()
shared_dict = manager.get_dict()
shared_list = manager.get_list()
shared_dict['worker'] = os.getpid()
shared_list.append(os.getpid())
print(f"Worker {os.getpid()} updated shared data")
if __name__ == "__main__":
processes = [multiprocessing.Process(target=worker) for _ in range(3)]
for p in processes:
p.start()
for p in processes:
p.join()
print("All workers completed")
Performance tips:

- Avoid excessive process creation: reuse processes via a pool.
- Reduce IPC overhead: batch small messages and prefer shared memory for large data (a batching sketch follows the affinity example below).
- Load balancing: assign processes according to task type.
- CPU affinity: pin processes to specific cores, for example with psutil:
import psutil  # third-party: pip install psutil

def set_cpu_affinity(pid, cores):
    # pin the process to the given CPU cores (supported on Linux/Windows)
    process = psutil.Process(pid)
    process.cpu_affinity(cores)
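On the IPC point, a hedged sketch of the batching idea: one queue message carrying many items costs a single pickle round-trip instead of thousands.

```python
# Batching IPC: one large message instead of many tiny ones.
import multiprocessing

def consume(q):
    batch = q.get()  # a single get() receives the whole batch
    print(f"received {len(batch)} items")

if __name__ == "__main__":
    q = multiprocessing.Queue()
    p = multiprocessing.Process(target=consume, args=(q,))
    p.start()
    q.put(list(range(10_000)))  # one message instead of 10,000
    p.join()
```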
Resource-management practices (a minimal sketch follows this list):

- Acquire locks with a timeout to avoid deadlocks: lock.acquire(timeout=5)
- join() child processes to wait for them to finish
- Set daemon=True so workers are cleaned up automatically when the main process exits
- Prefer context managers (with statements) for pools, managers, and locks
- Release resources in finally blocks
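A minimal sketch combining the first and last points, assuming the lock is shared with other workers:

```python
# Acquire with a timeout; release in finally.
import multiprocessing

def careful_worker(lock):
    if not lock.acquire(timeout=5):  # give up instead of deadlocking
        print("could not acquire the lock in time")
        return
    try:
        print("working under the lock")
    finally:
        lock.release()  # always released, even if the work raises

if __name__ == "__main__":
    lock = multiprocessing.Lock()
    p = multiprocessing.Process(target=careful_worker, args=(lock,))
    p.start()
    p.join()
```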
For observability, multiprocessing.active_children() lets a daemon process keep an eye on the workers:

def monitor_processes():
while True:
print("\n=== Active Processes ===")
for proc in multiprocessing.active_children():
print(f"{proc.name} (PID: {proc.pid}, Alive: {proc.is_alive()}")
time.sleep(2)
if __name__ == "__main__":
    # start the worker processes
processes = [
multiprocessing.Process(target=time.sleep, args=(i,), name=f"Sleep-{i}")
for i in range(3, 0, -1)
]
for p in processes:
p.start()
    # start the monitoring process (daemon, so it exits with the main process)
monitor = multiprocessing.Process(target=monitor_processes, daemon=True)
monitor.start()
    # wait for the workers to finish
for p in processes:
p.join()
print("All processes completed")
Multiprocessing is the right tool for:

- CPU-bound tasks: bypass the GIL and use every core.
- Independent data processing: each process works on its own slice of data with no shared state.
- High-reliability requirements: process isolation means one crash does not take down the others.
Scenario | Recommended approach | Rationale |
---|---|---|
CPU-bound + independent data | Multiprocessing | Fully exploits multiple cores |
I/O-bound + shared resources | Multithreading | Cheap switching, easy memory sharing |
CPU-bound + shared data | Multiprocessing + IPC | Bypasses the GIL, keeps memory isolated |
Mixed workloads | Hybrid | Processes for computation, threads for I/O |
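For the hybrid row, a hedged sketch of the pattern (the work split and the file name are illustrative): a process pool handles the CPU-heavy step while a thread pool handles the I/O-heavy step.

```python
# Hybrid pattern: processes compute, threads do the I/O.
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor

def compute(n):
    # CPU-bound step, runs in worker processes
    return sum(i * i for i in range(n))

def save(result):
    # I/O-bound step, runs in threads of the main process
    with open("results.txt", "a") as f:
        f.write(f"{result}\n")

if __name__ == "__main__":
    with ProcessPoolExecutor() as procs, ThreadPoolExecutor() as threads:
        results = procs.map(compute, [100_000, 200_000, 300_000])
        list(threads.map(save, results))
```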
Further resources:

- Official documentation: https://docs.python.org/3/library/multiprocessing.html and https://docs.python.org/3/library/concurrent.futures.html
- Advanced books
- Practice projects
- Debugging tools: multiprocessing.log_to_stderr() for built-in diagnostics, the psutil library for monitoring processes, and py-spy for performance profiling
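As a closing sketch, enabling multiprocessing's built-in logging takes two lines:

```python
# Turn on multiprocessing's internal diagnostics on stderr.
import logging
import multiprocessing

def task():
    pass

if __name__ == "__main__":
    logger = multiprocessing.log_to_stderr()
    logger.setLevel(logging.DEBUG)  # shows process start/stop activity
    p = multiprocessing.Process(target=task)
    p.start()
    p.join()
```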