用Pthread对共享内存系统进行多线程并行编程

在并行程序设计中,有针对分布式内存系统的和针对共享内存系统的。对于分布式内存系统,每个核访问其独立直接相连的内存速度很快,而访问远端内存(其他核直连的内存)速度很慢,可能比执行一次浮点运算慢数千倍。针对分布式内存系统,可以采用MPI进行编程,这里主要是针对共享内存系统亦即本地多核主机进行并行编程。

    这里采用莱布尼茨公式(Leibniz formula, π/4 = 1 − 1/3 + 1/5 − …)粗略地计算PI, 虽然这个公式收敛很慢,但这里仅仅是作为学习并行编程的一个开始。设置一个全局变量pi,对于不同段的求和调用不同的核计算。值得注意的是,在共享内存系统中,进行并行编程时,需要注意多个线程对同一个变量进行读写的问题,这是很经典的问题,不注意的话往往会造成灾难性的后果,在本例中其表现结果是计算得根本不准确,这也很容易理解,当两个甚至更多线程同时对同一个变量进行写操作时就会出现问题。本例还给出了基于忙等方式的方案、基于临界区和基于互斥量的解决方案,并对这几种方案在四核机器上进行了测试,实践表明,基于互斥量和临界区的方案是比较优的,可以很显著地提升计算速度。

    可以用单线程测试(相当于串行也就是普通的for循环方式),计算所需时间大概是240ms,使用临界区或者互斥量调用4线程后所需时间大概是70ms左右,提升了约3.5倍,理想情况是4倍,也就是线性加速比,但是由于并不是程序执行的所有部分都可并行,所以实际上很难达到线性加速比。

具体代码如下:


 

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <pthread.h>

// Number of worker threads, parsed from argv[1].
long thread_count;

// Shared accumulator for pi/4; concurrently updated by all worker threads.
double pi = 0.0;

// Number of series terms. The Leibniz series converges slowly; this is a
// teaching example, not an efficient way to compute pi.
const long n = (long)1e8;

// Turn variable for the busy-wait and flag-ordered critical-section schemes.
// `volatile` keeps the compiler from hoisting the load out of the spin loop
// (without it, an optimized build can spin forever on a cached value).
// NOTE(review): volatile is not a real synchronization primitive; C11
// <stdatomic.h> would be the correct modern choice.
volatile int flag = 0;
pthread_mutex_t mutex;

// Four alternative summation strategies. Updating the shared `pi` without
// synchronization (the "plain" variant) is a data race and gives wrong
// results; the other three serialize the update in different ways.
void* thread_sum_plain(void* rank);
void* thread_sum_busy_wait(void* rank);
void* thread_sum_critical_section(void* rank);
void* thread_sum_mutex_lock(void* rank);

int main(int argc, char* argv[]) {
  
  clock_t start, finish;

  start = clock();

  pthread_mutex_init(&mutex, NULL);

  thread_count = strtol(argv[1], NULL, 10);
  pthread_t *thread_handles;

  thread_handles = malloc(thread_count * sizeof(pthread_t));

  // lunch threads
  // plain thread, which is not thread safe
  /*
  for(int thread = 0; thread < thread_count; ++thread) {
    pthread_create(&thread_handles[thread], NULL, thread_sum_plain, (void*)thread);
  }*/
  
  /*
  for(int thread = 0; thread < thread_count; ++thread) {
    pthread_create(&thread_handles[thread], NULL, thread_sum_busy_wait, (void*)thread);
  }*/
  
  /*
  for(int thread = 0; thread < thread_count; ++thread) {
    pthread_create(&thread_handles[thread], NULL, thread_sum_critical_section, (void*)thread);
  }*/
  
  for(int thread = 0; thread < thread_count; ++thread) {
    pthread_create(&thread_handles[thread], NULL, thread_sum_mutex_lock, (void*)thread);
  }

  // stop threads
  for(int thread = 0; thread < thread_count; ++thread) {
    pthread_join(thread_handles[thread], NULL);
  }


  finish = clock();

  double duration = 1000 * (finish - start) / CLOCKS_PER_SEC;
  
  printf("Computed PI = %.8f\n", pi * 4.0);

  // plain thread
  // printf("Time comsumption of plain thread is %lf ms.", duration);
  
  // busy wait
  // printf("Time comsumption of busy wait is %lf ms.", duration);
  
  // critical section
  // printf("Time comsumption of critical section  is %lf ms.", duration);
  
  // mutex lock
  printf("Time comsumption of mutex lock is %lf ms.", duration);
  
  free(thread_handles);
  pthread_mutex_destroy(&mutex);
  return 0;
}

// Deliberately UNSYNCHRONIZED demo: every thread does an unprotected
// read-modify-write on the shared `pi`, so updates are lost and the result
// is wrong. Kept as the negative example; see the other variants for
// correct versions. (About 450 ms with 4 threads per the original notes.)
void* thread_sum_plain(void* rank) {
  long my_rank = (long)rank;

  long my_n = n / thread_count;
  long my_first = my_n * my_rank;
  // The last thread also picks up the n % thread_count remainder terms,
  // which the original dropped whenever thread_count did not divide n.
  long my_last = (my_rank == thread_count - 1) ? n - 1
                                               : my_n * (my_rank + 1) - 1;

  // Term i of the Leibniz series carries sign (-1)^i, so the starting
  // sign follows the parity of the first index.
  double factor = (my_first & 1) ? -1.0 : 1.0;

  for (long i = my_first; i <= my_last; ++i, factor = -factor) {
    pi += factor / (i * 2.0 + 1.0);  // data race: intentional, for teaching
  }

  return NULL;
}

// Correct but very slow: the turn flag is checked inside the loop, so the
// threads proceed in lock-step one term at a time — fully serialized plus
// cache-line ping-pong (about 5700 ms with 4 threads per original notes).
// LIMITATION: the rotation requires every thread to run the same number of
// iterations, so this variant assumes n is divisible by thread_count; any
// remainder terms are dropped.
void* thread_sum_busy_wait(void* rank) {
  long my_rank = (long)rank;

  long my_n = n / thread_count;
  long my_first = my_n * my_rank;
  long my_last = my_n * (my_rank + 1) - 1;

  // Sign of term i is (-1)^i: the starting sign follows the parity of
  // my_first, not of my_rank. (Fixes a subtle sign bug: when my_n is even,
  // my_first is always even and every thread must start with +1.)
  double factor = (my_first & 1) ? -1.0 : 1.0;

  for (long i = my_first; i <= my_last; ++i, factor = -factor) {
    while (flag != my_rank)
      ;  // spin until it is this thread's turn
    pi += factor / (i * 2.0 + 1.0);
    flag = (flag + 1) % thread_count;  // hand the turn to the next rank
  }

  return NULL;
}

// Each thread accumulates a private partial sum, then folds it into the
// shared `pi` inside a flag-ordered critical section that runs exactly once
// per thread, in rank order (about 75 ms with 4 threads per original notes).
void* thread_sum_critical_section(void* rank) {
  long my_rank = (long)rank;

  double my_sum = 0.0;
  long my_n = n / thread_count;
  long my_first = my_n * my_rank;
  // The last thread also covers the n % thread_count remainder terms.
  long my_last = (my_rank == thread_count - 1) ? n - 1
                                               : my_n * (my_rank + 1) - 1;

  // Sign of term i is (-1)^i: start sign follows the parity of my_first,
  // not my_rank (fixes a sign bug whenever n / thread_count is even).
  double factor = (my_first & 1) ? -1.0 : 1.0;

  for (long i = my_first; i <= my_last; ++i, factor = -factor) {
    my_sum += factor / (i * 2.0 + 1.0);
  }

  // Threads enter in rank order; each executes this section exactly once.
  while (flag != my_rank)
    ;
  pi += my_sum;
  flag = (flag + 1) % thread_count;

  return NULL;
}

// Each thread accumulates a private partial sum, then folds it into the
// shared `pi` under the mutex — one lock/unlock per thread, so contention
// is negligible (about 70 ms with 4 threads per original notes).
void* thread_sum_mutex_lock(void* rank) {
  long my_rank = (long)rank;

  double my_sum = 0.0;
  long my_n = n / thread_count;
  long my_first = my_n * my_rank;
  // The last thread also covers the n % thread_count remainder terms.
  long my_last = (my_rank == thread_count - 1) ? n - 1
                                               : my_n * (my_rank + 1) - 1;

  // Sign of term i is (-1)^i: start sign follows the parity of my_first,
  // not my_rank (fixes a sign bug whenever n / thread_count is even).
  double factor = (my_first & 1) ? -1.0 : 1.0;

  for (long i = my_first; i <= my_last; ++i, factor = -factor) {
    my_sum += factor / (i * 2.0 + 1.0);
  }

  pthread_mutex_lock(&mutex);
  pi += my_sum;
  pthread_mutex_unlock(&mutex);

  return NULL;
}

你可能感兴趣的:(C/C++)