torch.distributed and torch.multiprocessing
init_process ensures that every process coordinates with the same master. (What is a coordination tool? Examples include pdsh, clustershell, and slurm.)

import os
import torch
import torch.distributed as dist
from torch.multiprocessing import Process
def run(rank, size):
    """ Distributed function to be implemented later. """
    pass

def init_process(rank, size, fn, backend='gloo'):
    """ Initialize the distributed environment. """
    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = '29500'
    dist.init_process_group(backend, rank=rank, world_size=size)
    fn(rank, size)

if __name__ == "__main__":
    size = 2
    processes = []
    for rank in range(size):
        p = Process(target=init_process, args=(rank, size, run))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
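As a hedged alternative launcher (not part of the original example), torch.multiprocessing.spawn can start the processes and pass each one its rank automatically:

import torch.multiprocessing as mp

if __name__ == "__main__":
    size = 2
    # mp.spawn calls init_process(rank, size, run) once per process,
    # supplying the process index as the first argument.
    mp.spawn(init_process, args=(size, run), nprocs=size, join=True)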
isend/irecv return immediately, but they return a Work object whose wait() method can be used to block until the communication completes. Until wait() has finished, you should not touch the corresponding tensor: writing to the tensor after dist.isend() results in undefined behaviour, and reading from the tensor after dist.irecv() likewise results in undefined behaviour.

"""Blocking point-to-point communication."""
def run(rank, size):
    tensor = torch.zeros(1)
    if rank == 0:
        tensor += 1
        # Send the tensor to process 1
        dist.send(tensor=tensor, dst=1)
    else:
        # Receive tensor from process 0
        dist.recv(tensor=tensor, src=0)
    print('Rank ', rank, ' has data ', tensor[0])
"""Non-blocking point-to-point communication."""
def run(rank, size):
tensor = torch.zeros(1)
req = None
if rank == 0:
tensor += 1
# Send the tensor to process 1
req = dist.isend(tensor=tensor, dst=1)
print('Rank 0 started sending')
else:
# Receive tensor from process 0
req = dist.irecv(tensor=tensor, src=0)
print('Rank 1 started receiving')
req.wait()
print('Rank ', rank, ' has data ', tensor[0])
Communication among a subset of processes is implemented with dist.new_group(group). By default, collectives run over all processes, which is known as the world. For example, dist.all_reduce(tensor, op, group) reduces tensor across every process in group:
""" All-Reduce example."""
def run(rank, size):
""" Simple point-to-point communication. """
group = dist.new_group([0, 1])
tensor = torch.ones(1)
dist.all_reduce(tensor, op=dist.reduce_op.SUM, group=group)
print('Rank ', rank, ' has data ', tensor[0])
dist.broadcast(tensor, src, group): Copies tensor from src to all other processes.
dist.reduce(tensor, dst, op, group): Applies op to every tensor and stores the result in dst.
dist.all_reduce(tensor, op, group): Same as reduce, but the result is stored in all processes.
dist.scatter(tensor, src, scatter_list, group): Copies the i-th tensor scatter_list[i] to the i-th process.
dist.gather(tensor, dst, gather_list, group): Copies tensor from all processes into gather_list on dst.
dist.all_gather(tensor_list, tensor, group): Copies tensor from all processes to tensor_list, on all processes.
dist.barrier(group): Blocks all processes in group until each one has entered this function.

Supported reduction operations include:
dist.ReduceOp.SUM
dist.ReduceOp.PRODUCT
dist.ReduceOp.MAX
dist.ReduceOp.MIN
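A minimal sketch (assumed, not from these notes) of broadcast and gather between the processes launched earlier:

""" Broadcast / gather sketch (assumed example). """
def run(rank, size):
    # Rank 0 creates the payload and broadcasts it to every process.
    tensor = torch.arange(4, dtype=torch.float32) if rank == 0 else torch.zeros(4)
    dist.broadcast(tensor, src=0)

    # Every process contributes one value; rank 0 gathers all of them.
    value = torch.tensor([float(rank)])
    gather_list = [torch.zeros(1) for _ in range(size)] if rank == 0 else None
    dist.gather(value, gather_list=gather_list, dst=0)
    if rank == 0:
        print('Rank 0 gathered', gather_list)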
The following example replicates the functionality of DistributedDataParallel: a distributed synchronous SGD loop in which every process trains on its own shard of the data and the gradients are averaged across processes after each backward pass.
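For reference, a hedged sketch of the built-in wrapper that the rest of this section re-implements by hand (the helper name is ours, not from the source):

import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel as DDP

def wrap_model(model: nn.Module) -> nn.Module:
    """ Wrap a model so gradients are averaged automatically during backward(). """
    # Assumes dist.init_process_group() has already been called in this process.
    return DDP(model)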
""" Dataset partitioning helper """
class Partition(object):
def __init__(self, data, index):
self.data = data
self.index = index
def __len__(self):
return len(self.index)
def __getitem__(self, index):
data_idx = self.index[index]
return self.data[data_idx]
class DataPartitioner(object):
def __init__(self, data, sizes=[0.7, 0.2, 0.1], seed=1234):
self.data = data
self.partitions = []
rng = Random()
rng.seed(seed)
data_len = len(data)
indexes = [x for x in range(0, data_len)]
rng.shuffle(indexes)
for frac in sizes:
part_len = int(frac * data_len)
self.partitions.append(indexes[0:part_len])
indexes = indexes[part_len:]
def use(self, partition):
return Partition(self.data, self.partitions[partition])
""" Partitioning MNIST """
def partition_dataset():
dataset = datasets.MNIST('./data', train=True, download=True,
transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
]))
size = dist.get_world_size()
bsz = 128 / float(size)
partition_sizes = [1.0 / size for _ in range(size)]
partition = DataPartitioner(dataset, partition_sizes)
partition = partition.use(dist.get_rank())
train_set = torch.utils.data.DataLoader(partition,
batch_size=bsz,
shuffle=True)
return train_set, bsz
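The training loop below uses a model Net() that is not defined in these notes. A minimal MNIST network (an assumed sketch for completeness, not necessarily the one from the original tutorial) could look like:

import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    """ Minimal convolutional network for MNIST (assumed sketch). """
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2(x), 2))
        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)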
""" Distributed Synchronous SGD Example """
def run(rank, size):
torch.manual_seed(1234)
train_set, bsz = partition_dataset()
model = Net()
optimizer = optim.SGD(model.parameters(),
lr=0.01, momentum=0.5)
num_batches = ceil(len(train_set.dataset) / float(bsz))
for epoch in range(10):
epoch_loss = 0.0
for data, target in train_set:
optimizer.zero_grad()
output = model(data)
loss = F.nll_loss(output, target)
epoch_loss += loss.item()
loss.backward()
average_gradients(model)
optimizer.step()
print('Rank ', dist.get_rank(), ', epoch ',
epoch, ': ', epoch_loss / num_batches)
""" Gradient averaging. """
def average_gradients(model):
size = float(dist.get_world_size())
for param in model.parameters():
dist.all_reduce(param.grad.data, op=dist.reduce_op.SUM)
param.grad.data /= size
""" Implementation of a ring-reduce with addition. """
def allreduce(send, recv):
rank = dist.get_rank()
size = dist.get_world_size()
send_buff = send.clone()
recv_buff = send.clone()
accum = send.clone()
left = ((rank - 1) + size) % size
right = (rank + 1) % size
for i in range(size - 1):
if i % 2 == 0:
# Send send_buff
send_req = dist.isend(send_buff, right)
dist.recv(recv_buff, left)
accum[:] += recv_buff[:]
else:
# Send recv_buff
send_req = dist.isend(recv_buff, right)
dist.recv(send_buff, left)
accum[:] += send_buff[:]
send_req.wait()
recv[:] = accum[:]
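As a hedged usage sketch (assumed, not from the source), this hand-written ring allreduce could stand in for dist.all_reduce in the gradient-averaging helper above:

""" Gradient averaging with the custom ring allreduce (assumed sketch). """
def average_gradients_ring(model):
    size = float(dist.get_world_size())
    for param in model.parameters():
        summed = torch.zeros_like(param.grad.data)
        allreduce(param.grad.data, summed)  # summed now holds the sum over all ranks
        param.grad.data[:] = summed / size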
A particularly elegant aspect of torch.distributed is that it is built on an abstraction that supports different backends; the package is largely modelled after MPI. Next, learn how to set the arguments of dist.init_process_group().
Environment-variable initialization (once the following four environment variables are set, every process can reach the master, obtain information about the other processes, and thus communicate with them):
MASTER_PORT: the port on which the rank 0 process listens.
MASTER_ADDR: the IP address of the rank 0 process (the master).
WORLD_SIZE: the total number of processes (roughly, the number of GPUs?), so the master knows how many workers to wait for.
RANK: the rank (index?) of each process, so it can tell whether it is the master of a worker.

Shared file system: all processes coordinate through a shared file; the file system must support locking via fcntl.
TCP: initialization by providing the IP address of the rank 0 process and a reachable port.
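As a hedged sketch of how these three initialization methods map onto dist.init_process_group() (the address and path below are illustrative placeholders, not values from these notes):

""" Initialization-method sketch (address and path are placeholders). """
import torch.distributed as dist

def init(method, rank, size):
    # Exactly one of these branches would be used per process in a real script.
    if method == 'env':
        # MASTER_ADDR, MASTER_PORT, WORLD_SIZE and RANK are read from the
        # environment; 'env://' is the default init_method.
        dist.init_process_group('gloo', init_method='env://')
    elif method == 'file':
        # Shared file system: the path must be visible to all processes and
        # the file system must support fcntl locking.
        dist.init_process_group('gloo', init_method='file:///mnt/nfs/sharedfile',
                                rank=rank, world_size=size)
    elif method == 'tcp':
        # TCP: IP address and a free port of the rank 0 process.
        dist.init_process_group('gloo', init_method='tcp://10.1.1.20:23456',
                                rank=rank, world_size=size)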