MPI initialization and thread support levels: use MPI_Init_thread instead of MPI_Init and check that the provided level meets what the program requires (see the sketch after this list).
Thread safety: unless MPI_THREAD_MULTIPLE is provided, restrict MPI calls to the master thread or serialize them across threads.
Load balancing: distribute work evenly both across MPI processes and across the OpenMP threads within each process.
Avoid over-decomposition: too many processes or threads per node adds communication and synchronization overhead faster than it adds useful parallelism.
Hierarchical parallelism: use MPI between nodes and OpenMP within a node, so the software structure matches the hardware hierarchy.
Communication optimization: reduce the number and size of messages, and overlap communication with computation where possible.
Memory usage: OpenMP threads share their process's address space, so data that must be replicated (such as a broadcast matrix) is stored once per process rather than once per core.
Hybrid parallel pattern: a common configuration is one MPI process per node or per socket, with one OpenMP thread per core.
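The MPI standard defines four thread support levels which, in increasing order, are MPI_THREAD_SINGLE, MPI_THREAD_FUNNELED, MPI_THREAD_SERIALIZED, and MPI_THREAD_MULTIPLE. Below is a minimal sketch of requesting a level and checking what the library actually provides; it is a standalone illustration, not part of the matrix-multiplication example (which requests MPI_THREAD_FUNNELED because only the master thread makes MPI calls there).

#include <mpi.h>
#include <stdio.h>

int main(int argc, char *argv[]) {
    int provided;
    // Request the highest level; the library reports what it can actually provide
    MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);

    // Levels in increasing order:
    //   MPI_THREAD_SINGLE     - only one thread exists in the process
    //   MPI_THREAD_FUNNELED   - only the thread that initialized MPI makes MPI calls
    //   MPI_THREAD_SERIALIZED - any thread may call MPI, but only one at a time
    //   MPI_THREAD_MULTIPLE   - any thread may call MPI concurrently
    if (provided < MPI_THREAD_FUNNELED) {
        printf("Insufficient thread support level: %d\n", provided);
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    MPI_Finalize();
    return 0;
}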
Below is an example of hybrid MPI+OpenMP parallel matrix multiplication:
#include <mpi.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#define N 1024  // Matrix size

void initialize_matrix(double *matrix, int rows, int cols, double init_value) {
    #pragma omp parallel for
    for (int i = 0; i < rows; i++) {
        for (int j = 0; j < cols; j++) {
            matrix[i*cols + j] = init_value;
        }
    }
}
int main(int argc, char *argv[]) {
    int rank, size;
    int provided, required = MPI_THREAD_FUNNELED;

    // Initialize MPI and request the desired thread support level
    MPI_Init_thread(&argc, &argv, required, &provided);
    if (provided < required) {
        printf("MPI_THREAD_FUNNELED not available!\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    // Compute the number of rows each process is responsible for
    int rows_per_proc = N / size;
    int remainder = N % size;
    int local_rows = rows_per_proc + (rank < remainder ? 1 : 0);

    // Allocate memory
    double *A_local = (double*)malloc(local_rows * N * sizeof(double));
    double *B = (double*)malloc(N * N * sizeof(double));
    double *C_local = (double*)malloc(local_rows * N * sizeof(double));

    // Initialize matrices: each process fills its own block of A, rank 0 fills B
    initialize_matrix(A_local, local_rows, N, 1.0);
    if (rank == 0) {
        initialize_matrix(B, N, N, 2.0);
    }

    // Broadcast matrix B to all processes
    double start_bcast = MPI_Wtime();
    MPI_Bcast(B, N*N, MPI_DOUBLE, 0, MPI_COMM_WORLD);
    double end_bcast = MPI_Wtime();
    if (rank == 0) {
        printf("Bcast time: %f seconds\n", end_bcast - start_bcast);
    }
    // Matrix multiplication: each process computes its block of rows of C
    double start_comp = MPI_Wtime();
    #pragma omp parallel for
    for (int i = 0; i < local_rows; i++) {
        for (int j = 0; j < N; j++) {
            double sum = 0.0;
            for (int k = 0; k < N; k++) {
                sum += A_local[i*N + k] * B[k*N + j];
            }
            C_local[i*N + j] = sum;
        }
    }
    double end_comp = MPI_Wtime();
    if (rank == 0) {
        printf("Computation time with %d threads: %f seconds\n",
               omp_get_max_threads(), end_comp - start_comp);
    }
    // Gather the results on rank 0
    double *C = NULL;
    if (rank == 0) {
        C = (double*)malloc(N * N * sizeof(double));
    }

    // Prepare receive counts and displacements
    int *recvcounts = (int*)malloc(size * sizeof(int));
    int *displs = (int*)malloc(size * sizeof(int));
    int offset = 0;
    for (int i = 0; i < size; i++) {
        recvcounts[i] = (rows_per_proc + (i < remainder ? 1 : 0)) * N;
        displs[i] = offset;
        offset += recvcounts[i];
    }

    // Gather the result rows
    MPI_Gatherv(C_local, local_rows * N, MPI_DOUBLE,
                C, recvcounts, displs, MPI_DOUBLE,
                0, MPI_COMM_WORLD);

    // Verify the result (optional): every entry should equal 1.0 * 2.0 * N
    if (rank == 0) {
        int errors = 0;
        #pragma omp parallel for reduction(+:errors)
        for (int i = 0; i < N; i++) {
            for (int j = 0; j < N; j++) {
                if (fabs(C[i*N + j] - 2.0*N) > 1e-6) {
                    errors++;
                }
            }
        }
        printf("Found %d errors in result matrix\n", errors);
        free(C);
    }

    // Release resources
    free(A_local);
    free(B);
    free(C_local);
    free(recvcounts);
    free(displs);

    MPI_Finalize();
    return 0;
}
Compile command (using GCC):
mpicc -fopenmp mpi_omp_matmul.c -o matmul -O3 -lm
Run command (for example, 4 MPI processes with 4 OpenMP threads per process):
export OMP_NUM_THREADS=4
mpirun -np 4 ./matmul
Tuning the MPI process/thread ratio: vary the number of processes per node against the number of threads per process; one process per socket with one thread per core is a common starting point.
NUMA optimization: bind processes and threads to sockets and cores, and rely on first-touch allocation (as in the parallel initialization above) so data ends up close to the threads that use it.
Communication optimization: overlap communication with computation, for example by replacing the blocking broadcast with a nonblocking collective (see the sketch after this list).
OpenMP optimization: choose an appropriate schedule clause and thread count, and avoid false sharing when threads write to adjacent elements of shared arrays.
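As one possible illustration of the communication-optimization point, the blocking MPI_Bcast in the example could be replaced by the nonblocking MPI_Ibcast (available since MPI-3), so that the broadcast of B overlaps with the purely local initialization of A_local. This is a minimal sketch of that pattern, reusing the variable names from the example above; it is not part of the original program.

    MPI_Request req;

    // Rank 0 must fill B before starting the broadcast
    if (rank == 0) {
        initialize_matrix(B, N, N, 2.0);
    }

    // Start the broadcast of B without blocking
    MPI_Ibcast(B, N*N, MPI_DOUBLE, 0, MPI_COMM_WORLD, &req);

    // Purely local work that does not touch B can proceed in the meantime
    initialize_matrix(A_local, local_rows, N, 1.0);

    // B must be complete before the multiplication uses it
    MPI_Wait(&req, MPI_STATUS_IGNORE);

For placement, the standard OpenMP environment variables OMP_PROC_BIND and OMP_PLACES control thread affinity, and the MPI launcher's binding options (for example, Open MPI's --bind-to) control process placement; the best process/thread ratio depends on the node's socket and core layout and is usually found by experiment.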
By combining MPI's process-level parallelism with OpenMP's thread-level parallelism in a sensible way, you can make full use of the compute resources of a modern cluster and achieve higher parallel efficiency and better performance scalability.