Linux Mutex机制与死锁分析

在Linux系统上,Mutex机制相比于信号量,实现更加简单和高效,但使用也更加严格

1. 任何时刻只有一个任务可以持有Mutex

2. 谁上锁谁解锁

3. 不允许递归地上锁和解锁

4. 当进程持有一个Mutex时,不允许退出

5. Mutex只能通过相关API来管理,不可被拷贝,手动初始化或重复初始化

在应用层来说,一般Mutex多用于多线程间的同步,本文针对第四点"当进程持有一个Mutex时,不允许退出"来做一些探讨和测试

关于多进程使用Mutex,有一个很经典的场景,即共享内存通讯

两个进程使用共享内存进行通讯时,一般都需要用到mutex来进行数据保护

而使用到锁,必然会有死锁的情况发生,下面将以多进程Mutex机制来分析死锁的情况

首先来看一段代码, 主要为mutex多进程的使用和模拟死锁场景

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <sys/wait.h>
#include <unistd.h>

// Test focus:
// a process that is currently holding the mutex must not exit!

// Marks a System V shared memory segment for removal. On Linux the segment
// is only destroyed once the last attached process has detached or exited,
// so the current attachment (and the one inherited by a forked child) stays
// usable, while the segment no longer leaks if the program is killed or
// deadlocks. A failure is reported but is not fatal.
static void remove_shm_when_unused(int id)
{
  if(shmctl(id, IPC_RMID, NULL) < 0)
  {
    perror("shmctl IPC_RMID error");
  }
}

// Demonstrates a process-shared pthread mutex placed in System V shared
// memory. The child locks the mutex and writes 0..29 into a shared int
// buffer; the parent sleeps one second, then locks the same mutex and
// appends 40..69. Finally the parent reaps the child and prints the first
// 70 ints of the buffer.
//
// Enabling the commented-out exit(1) in the child makes it exit while still
// holding the mutex, so the parent blocks forever in pthread_mutex_lock().
//
// Exits with status 0 on success and 1 on any setup or locking failure.
int main(void)
{
  pid_t pid;
  int shmid;
  int* shmptr;
  int* tmp;

  int err;
  pthread_mutexattr_t mattr;
  // The attribute object may only be initialised through the mutex API.
  // pthread_* functions return 0 on success and a positive error number on
  // failure (they never return -1), so the result is compared against 0.
  if((err = pthread_mutexattr_init(&mattr)) != 0)
  {
    printf("mutex addr init error:%s\n", strerror(err));
    exit(1);
  }

  // For inter-process synchronisation the attribute must be
  // PTHREAD_PROCESS_SHARED; the default only synchronises threads of one process.
  if((err = pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED)) != 0)
  {
    printf("mutex addr get shared error:%s\n", strerror(err));
    exit(1);
  }

  // Pitfall: the mutex itself must live in shared memory so that parent and
  // child operate on the very same object. Otherwise each process would have
  // its own private copy after fork() and no synchronisation would happen.
  pthread_mutex_t* m;
  int mid;
  if((mid = shmget(IPC_PRIVATE, sizeof(pthread_mutex_t), 0600)) < 0)
  {
    perror("shmget mutex error");
    exit(1);
  }
  m = shmat(mid, NULL, 0);
  // Request removal before checking the attach result so the segment is
  // released on the error path as well.
  remove_shm_when_unused(mid);
  if(m == (void*)-1)
  {
    perror("shmat mutex error");
    exit(1);
  }

  // The mutex may only be initialised through the mutex API.
  if((err = pthread_mutex_init(m, &mattr)) != 0)
  {
    printf("mutex mutex init error:%s\n", strerror(err));
    exit(1);
  }

  // Create the shared data area that parent and child both write into.
  if((shmid = shmget(IPC_PRIVATE, 1000, IPC_CREAT | 0600)) < 0)
  {
    perror("shmget error");
    exit(1);
  }

  // Attach the data area.
  shmptr = shmat(shmid, 0, 0);
  remove_shm_when_unused(shmid);
  if(shmptr == (void*)-1)
  {
    perror("shmat error");
    exit(1);
  }

  // Remember the start of the data area for printing at the end.
  tmp = shmptr;

  // Create a second shared segment that holds the shared write cursor,
  // i.e. a pointer into the data area above.
  int shmid2;
  int** shmptr2;
  if((shmid2 = shmget(IPC_PRIVATE, 20, IPC_CREAT | 0600)) < 0)
  {
    perror("shmget2 error");
    exit(1);
  }

  // Attach the cursor segment.
  shmptr2 = shmat(shmid2, 0, 0);
  remove_shm_when_unused(shmid2);
  if(shmptr2 == (void*)-1)
  {
    perror("shmat2 error");
    exit(1);
  }
  // Point the shared cursor at the first int of the data area. Both
  // processes see the same addresses because the attachments are inherited
  // across fork().
  *shmptr2 = shmptr;

  if((pid = fork()) < 0)
  {
    perror("fork error");
    exit(1);
  }
  else if(pid == 0)
  {
    // Child process.
    // Lock the mutex; while it is held the parent cannot acquire it.
    if((err = pthread_mutex_lock(m)) != 0)
    {
      printf("lock error:%s\n", strerror(err));
      exit(1);
    }
    for(int i = 0; i < 30; ++i)
    {
      **shmptr2 = i;
      (*shmptr2)++;
    }

    // Simulate the deadlock scenario:
    //exit(1); // exit while still holding the lock

    if((err = pthread_mutex_unlock(m)) != 0)
    {
      printf("unlock error:%s\n", strerror(err));
      exit(1);
    }
    exit(0);

  }
  else
  {
    sleep(1); // wait a moment so that the child runs first

    // Lock the mutex; while it is held the child cannot acquire it.
    if((err = pthread_mutex_lock(m)) != 0)
    {
      printf("lock error:%s\n", strerror(err));
      exit(1);
    }
    for(int i = 40; i < 70; ++i)
    {
      **shmptr2 = i;
      (*shmptr2)++;
    }
    if((err = pthread_mutex_unlock(m)) != 0)
    {
      printf("unlock error:%s\n", strerror(err));
      exit(1);
    }
  }

  // Reap the child so that it does not remain a zombie.
  wait(NULL);

  // Dump the contents of the shared data area.
  for(int i = 0; i < 70; ++i)
  {
    printf("%d ", tmp[i]);
  }

  printf("\n");

  // Destroy the mutex attribute object.
  pthread_mutexattr_destroy(&mattr);
  // Destroy the mutex.
  pthread_mutex_destroy(m);

  exit(0);
}

程序正常运行结果为:

#$ ./a.out
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 0 0 0 0 0 0 0 0 0 0

有了mutex的保护,数据不会发生错乱; 子进程先运行,写入连续的0-29; 父进程后运行,写入连续的40-69;

下面模拟死锁场景,让子进程在加锁后、解锁前退出;即取消子进程中如下代码的注释(原文件第103-104行)

    //模拟死锁场景
    exit(1); //持有锁的期间退出;

运行之后,程序卡住,没任何输出;即进入了死锁状态;

针对死锁情况,下面介绍常用的几种分析工具

首先想到的就是gdb,在有源码且可编译的情况下,使用gdb比较直接

#$ gdb ./a.out
GNU gdb (Ubuntu 7.11.1-0ubuntu1~16.5) 7.11.1
Copyright (C) 2016 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.  Type "show copying"
and "show warranty" for details.
This GDB was configured as "x86_64-linux-gnu".
Type "show configuration" for configuration details.
For bug reporting instructions, please see:
<http://www.gnu.org/software/gdb/bugs/>.
Find the GDB manual and other documentation resources online at:
<http://www.gnu.org/software/gdb/documentation/>.
For help, type "help".
Type "apropos word" to search for commands related to "word"...
Reading symbols from ./a.out...done.
(gdb) r
Starting program: /media/gwind/windcode/self-code-snippet/misc/utils-core/a.out
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
^C     (程序运行到此处卡住, 按CTRL/C退出)
Program received signal SIGINT, Interrupt.
__lll_lock_wait () at ../sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:135
135    ../sysdeps/unix/sysv/linux/x86_64/lowlevellock.S: No such file or directory.
(gdb) bt    (打印调用栈)
#0  __lll_lock_wait () at ../sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:135
#1  0x00007ffff7bc3dbd in __GI___pthread_mutex_lock (mutex=0x7ffff7ff6000) at ../nptl/pthread_mutex_lock.c:80
#2  0x0000000000400d78 in main () at mutex-multi-process.c:119
(gdb)

可以看到死锁发生在mutex-multi-process.c的第119行,即父进程调用pthread_mutex_lock加锁处;进程阻塞在__lll_lock_wait中,一直等待锁

即子进程获取锁退出后,父进程会加锁不成功,一直等待锁

方式二,使用strace来跟踪系统调用状态

#$ strace ./a.out
execve("./a.out", ["./a.out"], [/* 78 vars */]) = 0
brk(NULL)                               = 0x22a5000
access("/etc/ld.so.nohwcap", F_OK)      = -1 ENOENT (No such file or directory)
access("/etc/ld.so.preload", R_OK)      = -1 ENOENT (No such file or directory)
open("/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=188922, ...}) = 0
mmap(NULL, 188922, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f9489b9e000
close(3)                                = 0
access("/etc/ld.so.nohwcap", F_OK)      = -1 ENOENT (No such file or directory)
open("/lib/x86_64-linux-gnu/libpthread.so.0", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\260`\0\0\0\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=138696, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f9489b9d000
mmap(NULL, 2212904, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f948978b000
mprotect(0x7f94897a3000, 2093056, PROT_NONE) = 0
mmap(0x7f94899a2000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x17000) = 0x7f94899a2000
mmap(0x7f94899a4000, 13352, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f94899a4000
close(3)                                = 0
access("/etc/ld.so.nohwcap", F_OK)      = -1 ENOENT (No such file or directory)
open("/lib/x86_64-linux-gnu/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\3\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0`\t\2\0\0\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=1868984, ...}) = 0
mmap(NULL, 3971488, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f94893c1000
mprotect(0x7f9489581000, 2097152, PROT_NONE) = 0
mmap(0x7f9489781000, 24576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x1c0000) = 0x7f9489781000
mmap(0x7f9489787000, 14752, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f9489787000
close(3)                                = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f9489b9c000
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f9489b9b000
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f9489b9a000
arch_prctl(ARCH_SET_FS, 0x7f9489b9b700) = 0
mprotect(0x7f9489781000, 16384, PROT_READ) = 0
mprotect(0x7f94899a2000, 4096, PROT_READ) = 0
mprotect(0x601000, 4096, PROT_READ)     = 0
mprotect(0x7f9489bcd000, 4096, PROT_READ) = 0
munmap(0x7f9489b9e000, 188922)          = 0
set_tid_address(0x7f9489b9b9d0)         = 1010
set_robust_list(0x7f9489b9b9e0, 24)     = 0
rt_sigaction(SIGRTMIN, {0x7f9489790b50, [], SA_RESTORER|SA_SIGINFO, 0x7f948979c390}, NULL, 8) = 0
rt_sigaction(SIGRT_1, {0x7f9489790be0, [], SA_RESTORER|SA_RESTART|SA_SIGINFO, 0x7f948979c390}, NULL, 8) = 0
rt_sigprocmask(SIG_UNBLOCK, [RTMIN RT_1], NULL, 8) = 0
getrlimit(RLIMIT_STACK, {rlim_cur=8192*1024, rlim_max=RLIM64_INFINITY}) = 0
shmget(IPC_PRIVATE, 40, 0600)           = 24346824
shmat(24346824, NULL, 0)                = 0x7f9489bcc000
shmget(IPC_PRIVATE, 1000, IPC_CREAT|0600) = 24379593
shmat(24379593, NULL, 0)                = 0x7f9489bcb000
shmget(IPC_PRIVATE, 20, IPC_CREAT|0600) = 24412362
shmat(24412362, NULL, 0)                = 0x7f9489bca000
clone(child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x7f9489b9b9d0) = 1011
nanosleep({1, 0}, {0, 999848639})       = ? ERESTART_RESTARTBLOCK (Interrupted by signal)
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=1011, si_uid=1000, si_status=1, si_utime=0, si_stime=0} ---
restart_syscall(<... resuming interrupted nanosleep ...>) = 0
futex(0x7f9489bcc000, FUTEX_WAIT, 2, NULL

可见最后卡住在futex处

futex (fast userspace mutex) 是Linux的一个基础组件,可以用来构建各种更高级别的同步机制,比如互斥锁或者信号量等等

使用strace只能查看到程序卡住的原因是发生了死锁,具体在哪一行代码无法知晓;

比较适用于只有可执行文件没有源代码的情况

方式三,使用valgrind的drd工具来检测

#$ valgrind valgrind --tool=drd --trace-mutex=yes ./a.out
==7590== Memcheck, a memory error detector
==7590== Copyright (C) 2002-2015, and GNU GPL'd, by Julian Seward et al.
==7590== Using Valgrind-3.11.0 and LibVEX; rerun with -h for copyright info
==7590== Command: /usr/bin/valgrind --tool=drd --trace-mutex=yes ./a.out
==7590==
==7590== drd, a thread error detector
==7590== Copyright (C) 2006-2015, and GNU GPL'd, by Bart Van Assche.
==7590== Using Valgrind-3.11.0 and LibVEX; rerun with -h for copyright info
==7590== Command: ./a.out
==7590==
==7590== [1] mutex_init      mutex 0x4027000
==7608== [1] mutex_trylock   mutex 0x4027000 rc 0 owner 0
==7608== [1] post_mutex_lock mutex 0x4027000 rc 0 owner 0
==7608== [1] mutex_trylock   recursive mutex 0x4226948 rc 0 owner 0
==7608== [1] post_mutex_lock recursive mutex 0x4226948 rc 0 owner 0
==7608== [1] mutex_unlock    recursive mutex 0x4226948 rc 1
==7608==
==7608== For counts of detected and suppressed errors, rerun with: -v
==7608== ERROR SUMMARY: 0 errors from 0 contexts (suppressed: 0 from 0)
==7590== [1] mutex_trylock   mutex 0x4027000 rc 0 owner 0

^C 发生死锁 按CTRL/C退出
==7590== Process terminating with default action of signal 2 (SIGINT)
==7590==    at 0x4E5C26D: __lll_lock_wait (lowlevellock.S:135)
==7590==    by 0x4E55DBC: pthread_mutex_lock (pthread_mutex_lock.c:80)
==7590==    by 0x4C371FE: pthread_mutex_lock (in /usr/lib/valgrind/vgpreload_drd-amd64-linux.so)
==7590==    by 0x400D77: main (mutex-multi-process.c:119)
==7590==
==7590== For counts of detected and suppressed errors, rerun with: -v
==7590== ERROR SUMMARY: 0 errors from 0 contexts (suppressed: 0 from 0)

 DRD输出信息较多,也可用于分析锁占用时间

 方式四, 使用valgrind的helgrind工具

gwind@gwind-P5820T:/media/gwind/windcode/self-code-snippet/misc/utils-core$ valgrind --tool=helgrind  ./a.out
==7003== Helgrind, a thread error detector
==7003== Copyright (C) 2007-2015, and GNU GPL'd, by OpenWorks LLP et al.
==7003== Using Valgrind-3.11.0 and LibVEX; rerun with -h for copyright info
==7003== Command: ./a.out
==7003==
==7004== ---Thread-Announcement------------------------------------------
==7004==
==7004== Thread #1 is the program's root thread
==7004==
==7004== ----------------------------------------------------------------
==7004==
==7004== Thread #1: Exiting thread still holds 1 lock
==7004==    at 0x51297C8: _Exit (_exit.c:31)
==7004==    by 0x5096FBA: __run_exit_handlers (exit.c:97)
==7004==    by 0x5097054: exit (exit.c:104)
==7004==    by 0x400D61: main (mutex-multi-process.c:104)
==7004==
==7004==
==7004== For counts of detected and suppressed errors, rerun with: -v
==7004== Use --history-level=approx or =none to gain increased speed, at
==7004== the cost of reduced accuracy of conflicting-access information
==7004== ERROR SUMMARY: 1 errors from 1 contexts (suppressed: 0 from 0)

^C  CTRL/C退出
==7003== Process terminating with default action of signal 2 (SIGINT)
==7003==    at 0x4E5026D: __lll_lock_wait (lowlevellock.S:135)
==7003==    by 0x4E49DBC: pthread_mutex_lock (pthread_mutex_lock.c:80)
==7003==    by 0x4C32156: ??? (in /usr/lib/valgrind/vgpreload_helgrind-amd64-linux.so)
==7003==    by 0x400D77: main (mutex-multi-process.c:119)
==7003==
==7003== For counts of detected and suppressed errors, rerun with: -v
==7003== Use --history-level=approx or =none to gain increased speed, at
==7003== the cost of reduced accuracy of conflicting-access information
==7003== ERROR SUMMARY: 0 errors from 0 contexts (suppressed: 0 from 0)

helgrind工具输出较为精炼,查看更加方便

总结:

1. Mutex多用于线程间同步,用于多进程同步时,需要设置为PTHREAD_PROCESS_SHARED

2. 分析死锁时,根据不同情况使用不同工具来结合分析

strace可跟踪系统调用状态

有源码可编译时,直接使用gdb跟踪

valgrind的drd工具不仅能分析锁状态,同时能评估锁效率, helgrind分析死锁更加精炼

你可能感兴趣的:(Linux,linux,mutex,死锁)