如何解决caffe和video-caffe不能使用cudnn8编译的问题

因为caffe之类的代码已经很久没有更新,最高只支持到cudnn7.x,所以在安装了cudnn8的环境下编译caffe或video-caffe时,会在src/caffe/layers/cudnn_conv_layer.cpp等文件里报错:

  error: identifier "CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT" is undefined

  error: identifier "cudnnGetConvolutionForwardAlgorithm" is undefined

这是因为cudnn8里已经移除了cudnnGetConvolutionForwardAlgorithm()这个函数,需要改用cudnnGetConvolutionForwardAlgorithm_v7(),同时也移除了CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT这个枚举值。这些不兼容带来的麻烦很恼火,但是NVIDIA已声明cudnn8不再支持这些旧接口,caffe的代码也没人去更新了,所以不能指望NVIDIA或者berkeley,只能自己琢磨去解决。

参考了网上的解决caffe的编译问题的方案后,实验解决了video-caffe的编译问题,对video-caffe的代码做如下修改:

1.修改cmake/Cuda.cmake,将里面的"cudnn.h"全部用"cudnn_version.h"代替(cudnn8把CUDNN_MAJOR等版本号宏从cudnn.h移到了cudnn_version.h,不改的话cmake检测不到cudnn的版本);

2.修改下面三个源码文件里的代码,增加针对性代码:当cudnn版本是8及以上时,改成调用cudnnGetConvolutionForwardAlgorithm_v7(),否则仍保持原来的cudnnGetConvolutionForwardAlgorithm()调用不变:

   1) video-caffe/src/caffe/layers/cudnn_ndconv_layer.cu:

       

...

// Forward pass of the cuDNN N-d convolution layer.
// For every bottom blob and every group it selects a forward algorithm,
// (re)allocates the workspace when a larger one is needed, runs
// cudnnConvolutionForward and, when bias_term_ is set, adds the bias.
//
// NOTE(review): this listing appears to have been damaged by text extraction:
// the template parameter list and template arguments are missing, and part of
// the body is lost at the `for(int n=0;n ...` line below. Confirm every detail
// against the original cudnn_ndconv_layer.cu before relying on it.
template 
void CudnnNdConvolutionLayer::Forward_gpu(
  const vector*>& bottom, const vector*>& top) {
  
  // cuDNN >= 8 only: locals for the cudnnGetConvolutionForwardAlgorithm_v7
  // query issued inside the group loop below.
  #if  CUDNN_VERSION_MIN(8, 0, 0)
  int RetCnt;
  bool found_conv_algorithm;
  size_t free_memory, total_memory;
  cudnnConvolutionFwdAlgoPerf_t     fwd_algo_pref_[4];
  //cudnnConvolutionBwdDataAlgoPerf_t bwd_data_algo_pref_[4];

  // Query the free and total device memory sizes.
  cudaMemGetInfo(&free_memory, &total_memory);
  #endif

  for (int i = 0; i < bottom.size(); ++i) {
    const Dtype* bottom_data = bottom[i]->gpu_data();
    Dtype* top_data = top[i]->mutable_gpu_data();
    const Dtype* weight = this->blobs_[0]->gpu_data();

    // Workspace limit: channels_ * sizeof(int) * product of all kernel
    // dimensions, plus one.
    size_t workspace_limit_bytes = this->channels_*sizeof(int);
    for (int j = 0; j < this->kernel_shape_.size(); ++j) {
      workspace_limit_bytes *= kernel_shape_[j];
    }
    ++workspace_limit_bytes;

    // Forward through cuDNN in parallel over groups.
    for (int g = 0; g < this->group_; g++) {
      cudnnConvolutionFwdAlgo_t algo;
      #if CUDNN_VERSION_MIN(8, 0, 0)
      // choose forward algorithm for filter
      // in forward filter the CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED is not implemented in cuDNN 8
      // Request up to 4 candidates; cuDNN reports how many it returned in
      // RetCnt and fills fwd_algo_pref_ with them.
      CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm_v7(handle_[0],
        bottom_descs_[i],
        filter_desc_,
        conv_descs_[i],
        top_descs_[i],
        4,
        &RetCnt,
        fwd_algo_pref_));

      found_conv_algorithm = false;
      // NOTE(review): the next line is truncated in this listing. The loop
      // over the returned candidates, the matching #else/#endif for the #if
      // above, and the code that defines workspaceSizeInBytes_temp are
      // missing; what remains looks like the tail of an
      // `if (workspaceSizeInBytes_temp > workspaceSizeInBytes)` test.
      // TODO confirm against the original source.
      for(int n=0;n workspaceSizeInBytes) {
        workspaceSizeInBytes = workspaceSizeInBytes_temp;
        // free the existing workspace and allocate a new (larger) one
        cudaFree(this->workspace_data_);
        cudaError_t err = cudaMalloc(&(this->workspace_data_),
                          workspaceSizeInBytes);
        if (err != cudaSuccess) {
          // force zero memory path
          algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
          workspace_data_ = NULL;
          workspaceSizeInBytes = 0;
        }
      }

      // Filters.
      CUDNN_CHECK(cudnnConvolutionForward(handle_[g],
                  cudnn::dataType::one,
                  bottom_descs_[i], bottom_data + bottom_offset_ * g,
                  filter_desc_, weight + weight_offset_ * g,
                  conv_descs_[i],
                  algo, workspace_data_, workspaceSizeInBytes,
                  cudnn::dataType::zero,
                  top_descs_[i], top_data + top_offset_ * g));

      // Bias.
      if (this->bias_term_) {
        const Dtype* bias_data = this->blobs_[1]->gpu_data();
        // cuDNN >= 5 uses cudnnAddTensor; older versions use the _v3 variant
        // with the same argument list.
#if CUDNN_VERSION_MIN(5, 0, 0)
        CUDNN_CHECK(cudnnAddTensor(handle_[g],
                    cudnn::dataType::one,
                    bias_desc_, bias_data + bias_offset_ * g,
                    cudnn::dataType::one,
                    top_descs_[i], top_data + top_offset_ * g));
#else
        CUDNN_CHECK(cudnnAddTensor_v3(handle_[g],
                    cudnn::dataType::one,
                    bias_desc_, bias_data + bias_offset_ * g,
                    cudnn::dataType::one,
                    top_descs_[i], top_data + top_offset_ * g));
#endif
      }
    }

    // Synchronize the work across groups, each of which went into its own
    // stream, by launching an empty kernel into the default (null) stream.
    // NOLINT_NEXT_LINE(whitespace/operators)
    sync_ndconv_groups<<<1, 1>>>();
  }
}

...

   2) src/caffe/layers/cudnn_conv_layer.cpp:

   

// Reshape for the cuDNN convolution layer.
// After the base-class Reshape it recomputes the per-group offsets, rebuilds
// the tensor and convolution descriptors for every bottom/top pair, selects
// algorithms, and grows the shared workspace when more storage is needed.
//
// NOTE(review): this listing appears to have been damaged by text extraction:
// the template parameter list and template arguments are missing, and a large
// part of the body is lost at the `for(int n=0;n...` line below. Confirm every
// detail against the original cudnn_conv_layer.cpp before relying on it.
template 
void CuDNNConvolutionLayer::Reshape(
    const vector*>& bottom, const vector*>& top) {
  ConvolutionLayer::Reshape(bottom, top);
  // NOTE(review): CHECK_LE(2, x) only requires x >= 2, while the message says
  // the input "must have 2 spatial axes" — confirm which is intended.
  CHECK_LE(2, this->num_spatial_axes_)
      << "CuDNNConvolution input must have 2 spatial axes "
      << "(e.g., height and width). "
      << "Use 'engine: CAFFE' for general ND convolution.";
  bottom_offset_ = this->bottom_dim_ / this->group_;
  top_offset_ = this->top_dim_ / this->group_;
  // forced_3d (0 or 1) is added to the axis index, so the height/width axes
  // are read one position later when forced_3d_ is set.
  const bool forced_3d = this->forced_3d_;
  const int height = bottom[0]->shape(this->channel_axis_ + 1 + forced_3d);
  const int width = bottom[0]->shape(this->channel_axis_ + 2 + forced_3d);
  const int height_out = top[0]->shape(this->channel_axis_ + 1 + forced_3d);
  const int width_out = top[0]->shape(this->channel_axis_ + 2 + forced_3d);
  const int* pad_data = this->pad_.cpu_data();
  const int pad_h = pad_data[0];
  const int pad_w = pad_data[1];
  const int* stride_data = this->stride_.cpu_data();
  const int stride_h = stride_data[0];
  const int stride_w = stride_data[1];
  // cuDNN >= 8: locals for the *_v7 algorithm queries; older versions keep
  // the fixed workspace limit instead.
  #if  CUDNN_VERSION_MIN(8, 0, 0)
  int RetCnt;
  bool found_conv_algorithm;
  size_t free_memory, total_memory;
  cudnnConvolutionFwdAlgoPerf_t     fwd_algo_pref_[4];
  cudnnConvolutionBwdDataAlgoPerf_t bwd_data_algo_pref_[4];

  // Query the free and total device memory sizes.
  cudaMemGetInfo(&free_memory, &total_memory);
  #else
  // Specify workspace limit for kernels directly until we have a
  // planning strategy and a rewrite of Caffe's GPU memory mangagement
  size_t workspace_limit_bytes = 8*1024*1024;
  #endif
  for (int i = 0; i < bottom.size(); i++) {
    // Per-group descriptors: the channel counts are divided by group_, while
    // the batch stride still spans all channels of the blob.
    cudnn::setTensor4dDesc(&bottom_descs_[i],
        this->num_,
        this->channels_ / this->group_, height, width,
        this->channels_ * height * width,
        height * width, width, 1);
    cudnn::setTensor4dDesc(&top_descs_[i],
        this->num_,
        this->num_output_ / this->group_, height_out, width_out,
        this->num_output_ * this->out_spatial_dim_,
        this->out_spatial_dim_, width_out, 1);
    cudnn::setConvolutionDesc(&conv_descs_[i], bottom_descs_[i],
        filter_desc_, pad_h, pad_w,
        stride_h, stride_w);
    #if CUDNN_VERSION_MIN(8, 0, 0)
    // choose forward algorithm for filter
    // in forward filter the CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED is not implemented in cuDNN 8
    // Request up to 4 candidates; cuDNN reports how many it returned in
    // RetCnt and fills fwd_algo_pref_ with them.
    CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm_v7(handle_[0],
      bottom_descs_[i],
      filter_desc_,
      conv_descs_[i],
      top_descs_[i],
      4,
      &RetCnt,
      fwd_algo_pref_));

    found_conv_algorithm = false;
    // NOTE(review): the next line is truncated in this listing. The rest of
    // the per-bottom loop (candidate selection, backward algorithm choice,
    // the matching #else/#endif) and the declarations of max_workspace and
    // total_max_workspace are missing; what remains looks like the tail of
    // the total_max_workspace computation. TODO confirm against the original.
    for(int n=0;ngroup_ * CUDNN_STREAMS_PER_GROUP);

  // this is the total amount of storage needed over all groups + streams
  if (total_max_workspace > workspaceSizeInBytes) {
    DLOG(INFO) << "Reallocating workspace storage: " << total_max_workspace;
    workspaceSizeInBytes = total_max_workspace;

    // free the existing workspace and allocate a new (larger) one
    cudaFree(this->workspaceData);

    cudaError_t err = cudaMalloc(&(this->workspaceData), workspaceSizeInBytes);
    if (err != cudaSuccess) {
      // force zero memory path
      for (int i = 0; i < bottom.size(); i++) {
        workspace_fwd_sizes_[i] = 0;
        workspace_bwd_filter_sizes_[i] = 0;
        workspace_bwd_data_sizes_[i] = 0;
        fwd_algo_[i] = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
        bwd_filter_algo_[i] = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
        bwd_data_algo_[i] = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
      }

      // NULL out all workspace pointers
      for (int g = 0; g < (this->group_ * CUDNN_STREAMS_PER_GROUP); g++) {
        workspace[g] = NULL;
      }
      // NULL out underlying data
      workspaceData = NULL;
      workspaceSizeInBytes = 0;
    }

    // if we succeed in the allocation, set pointer aliases for workspaces
    // NOTE(review): this loop is not in an else branch, so it also runs after
    // the failure path above and recomputes workspace[g] from the then-NULL
    // workspaceData, overwriting the NULL pointers just stored — confirm this
    // is intended.
    for (int g = 0; g < (this->group_ * CUDNN_STREAMS_PER_GROUP); g++) {
      workspace[g] = reinterpret_cast(workspaceData) + g*max_workspace;
    }
  }

  // Tensor descriptor for bias.
  if (this->bias_term_) {
    cudnn::setTensor4dDesc(&bias_desc_,
        1, this->num_output_ / this->group_, 1, 1);
  }
}

   3) src/caffe/layers/cudnn_deconv_layer.cpp:

   

// Reshape for the cuDNN deconvolution layer.
// After the base-class Reshape it recomputes the per-group offsets, rebuilds
// the tensor and convolution descriptors for every bottom/top pair, selects
// forward/backward algorithms and their workspace sizes, and grows the shared
// workspace when more storage is needed. Compared with the convolution layer,
// the descriptor calls here pass top_descs_ and bottom_descs_ in swapped
// positions.
//
// NOTE(review): this listing appears to have been damaged by text extraction:
// the template parameter list and template arguments are missing, and part of
// the body is lost at the `for(int n=0;n...` line below. Confirm every detail
// against the original cudnn_deconv_layer.cpp before relying on it.
template 
void CuDNNDeconvolutionLayer::Reshape(
    const vector*>& bottom, const vector*>& top) {
  DeconvolutionLayer::Reshape(bottom, top);
  CHECK_EQ(2, this->num_spatial_axes_)
      << "CuDNNDeconvolutionLayer input must have 2 spatial axes "
      << "(e.g., height and width). "
      << "Use 'engine: CAFFE' for general ND convolution.";
  bottom_offset_ = this->bottom_dim_ / this->group_;
  top_offset_ = this->top_dim_ / this->group_;
  // forced_3d (0 or 1) is added to the axis index, so the height/width axes
  // are read one position later when forced_3d_ is set.
  const bool forced_3d = this->forced_3d_;
  const int height = bottom[0]->shape(this->channel_axis_ + 1 + forced_3d);
  const int width = bottom[0]->shape(this->channel_axis_ + 2 + forced_3d);
  const int height_out = top[0]->shape(this->channel_axis_ + 1 + forced_3d);
  const int width_out = top[0]->shape(this->channel_axis_ + 2 + forced_3d);
  const int* pad_data = this->pad_.cpu_data();
  const int pad_h = pad_data[0];
  const int pad_w = pad_data[1];
  const int* stride_data = this->stride_.cpu_data();
  const int stride_h = stride_data[0];
  const int stride_w = stride_data[1];
  // cuDNN >= 8: locals for the *_v7 algorithm queries; older versions keep
  // the fixed workspace limit instead.
  #if  CUDNN_VERSION_MIN(8, 0, 0)
  int RetCnt;
  bool found_conv_algorithm;
  size_t free_memory, total_memory;
  cudnnConvolutionFwdAlgoPerf_t     fwd_algo_pref_[4];
  cudnnConvolutionBwdDataAlgoPerf_t bwd_data_algo_pref_[4];

  // Query the free and total device memory sizes.
  cudaMemGetInfo(&free_memory, &total_memory);
  #else
  // Specify workspace limit for kernels directly until we have a
  // planning strategy and a rewrite of Caffe's GPU memory mangagement
  size_t workspace_limit_bytes = 8*1024*1024;
  #endif
  for (int i = 0; i < bottom.size(); i++) {
    // Per-group descriptors: the channel counts are divided by group_, while
    // the batch stride still spans all channels of the blob.
    cudnn::setTensor4dDesc(&bottom_descs_[i],
                                  this->num_,
                                  this->channels_ / this->group_,
                                  height,
                                  width,
                                  this->channels_ * height * width,
                                  height * width,
                                  width,
                                  1);
    cudnn::setTensor4dDesc(&top_descs_[i],
                                  this->num_,
                                  this->num_output_ / this->group_,
                                  height_out,
                                  width_out,
                                  this->num_output_ * height_out * width_out,
                                  height_out * width_out,
                                  width_out,
                                  1);
    // The convolution descriptor is built from the top descriptor here.
    cudnn::setConvolutionDesc(&conv_descs_[i],
                                     top_descs_[i],
                                     filter_desc_,
                                     pad_h,
                                     pad_w,
                                     stride_h,
                                     stride_w);
    #if  CUDNN_VERSION_MIN(8, 0, 0)
    // choose forward algorithm for filter
    // in forward filter the CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED is not implemented in cuDNN 8
    // Request up to 4 candidates; cuDNN reports how many it returned in
    // RetCnt and fills fwd_algo_pref_ with them.
    CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm_v7(handle_[0],
      top_descs_[i],
      filter_desc_,
      conv_descs_[i],
      bottom_descs_[i],
      4,
      &RetCnt,
      fwd_algo_pref_));

    found_conv_algorithm = false;
    // NOTE(review): the next line is truncated in this listing. The loop over
    // the returned candidates and the #else that separates the cuDNN >= 8 path
    // from the older path are missing. The code from here to the #endif uses
    // workspace_limit_bytes, which is only declared in the pre-cuDNN-8 branch
    // above, so it presumably belongs to the #else side — TODO confirm against
    // the original source.
    for(int n=0;n= workspace_limit_bytes) {
        fwd_algo_[i] = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
      } else {
        fwd_algo_[i] = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD;
      }
    }

    // Workspace size required by the chosen forward algorithm.
    CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
        handle_[0],
        top_descs_[i],
        filter_desc_,
        conv_descs_[i],
        bottom_descs_[i],
        fwd_algo_[i],
        &(workspace_fwd_sizes_[i])));

    // choose backward algorithm for filter
    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
        handle_[0],
        top_descs_[i],
        bottom_descs_[i],
        conv_descs_[i],
        filter_desc_,
        CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
        workspace_limit_bytes,
        &bwd_filter_algo_[i]));

    // get workspace for backwards filter algorithm
    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
        handle_[0],
        top_descs_[i],
        bottom_descs_[i],
        conv_descs_[i],
        filter_desc_,
        bwd_filter_algo_[i],
        &workspace_bwd_filter_sizes_[i]));

    // choose backward algo for data
    CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
        handle_[0],
        filter_desc_,
        bottom_descs_[i],
        conv_descs_[i],
        top_descs_[i],
        CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
        workspace_limit_bytes,
        &bwd_data_algo_[i]));

    // get workspace size
    CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
        handle_[0],
        filter_desc_,
        bottom_descs_[i],
        conv_descs_[i],
        top_descs_[i],
        bwd_data_algo_[i],
        &workspace_bwd_data_sizes_[i]));
    #endif
  }

  // reduce over all workspace sizes to get a maximum to allocate / reallocate
  size_t total_workspace_fwd = 0;
  size_t total_workspace_bwd_data = 0;
  size_t total_workspace_bwd_filter = 0;

  for (size_t i = 0; i < bottom.size(); i++) {
    total_workspace_fwd        = std::max(total_workspace_fwd,
                                     workspace_fwd_sizes_[i]);
    total_workspace_bwd_data   = std::max(total_workspace_bwd_data,
                                     workspace_bwd_data_sizes_[i]);
    total_workspace_bwd_filter = std::max(total_workspace_bwd_filter,
                                     workspace_bwd_filter_sizes_[i]);
  }
  // get max over all operations
  size_t max_workspace = std::max(total_workspace_fwd,
                             total_workspace_bwd_data);
  max_workspace = std::max(max_workspace, total_workspace_bwd_filter);
  // ensure all groups have enough workspace
  size_t total_max_workspace = max_workspace *
                               (this->group_ * CUDNN_STREAMS_PER_GROUP);

  // this is the total amount of storage needed over all groups + streams
  if (total_max_workspace > workspaceSizeInBytes) {
    DLOG(INFO) << "Reallocating workspace storage: " << total_max_workspace;
    workspaceSizeInBytes = total_max_workspace;

    // free the existing workspace and allocate a new (larger) one
    cudaFree(this->workspaceData);

    cudaError_t err = cudaMalloc(&(this->workspaceData), workspaceSizeInBytes);
    if (err != cudaSuccess) {
      // force zero memory path
      for (int i = 0; i < bottom.size(); i++) {
        workspace_fwd_sizes_[i] = 0;
        workspace_bwd_filter_sizes_[i] = 0;
        workspace_bwd_data_sizes_[i] = 0;
        // NOTE(review): the convolution layer listing in this article uses
        // CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM at this point; confirm that
        // FFT_TILING is really intended for a path whose workspace size has
        // just been set to 0.
        fwd_algo_[i] = CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING;
        bwd_filter_algo_[i] = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
        bwd_data_algo_[i] = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
      }

      // NULL out all workspace pointers
      for (int g = 0; g < (this->group_ * CUDNN_STREAMS_PER_GROUP); g++) {
        workspace[g] = NULL;
      }
      // NULL out underlying data
      workspaceData = NULL;
      workspaceSizeInBytes = 0;
    }

    // if we succeed in the allocation, set pointer aliases for workspaces
    // NOTE(review): this loop is not in an else branch, so it also runs after
    // the failure path above and recomputes workspace[g] from the then-NULL
    // workspaceData, overwriting the NULL pointers just stored — confirm this
    // is intended.
    for (int g = 0; g < (this->group_ * CUDNN_STREAMS_PER_GROUP); g++) {
      workspace[g] = reinterpret_cast(workspaceData) + g*max_workspace;
    }
  }

  // Tensor descriptor for bias.
  if (this->bias_term_) {
    cudnn::setTensor4dDesc(
        &bias_desc_, 1, this->num_output_ / this->group_, 1, 1);
  }
}

上面针对cudnn8的改动全部使用 "#if  CUDNN_VERSION_MIN(8, 0, 0)"包含。

对于caffe使用cudnn8编译出错的问题就更简单了,只需像上面那样修改cmake/Cuda.cmake配置文件和修改src/caffe/layers/cudnn_conv_layer.cpp和src/caffe/layers/cudnn_deconv_layer.cpp的代码即可解决。

相关代码我已提交在我的github项目上: https://github.com/arnoldfychen/video-caffe   和https://github.com/arnoldfychen/caffe 详细说明参考README

如果同一环境下安装过多个版本的cudnn,编译前要检查一下确认cudnn8正确安装了(否则会导致cudnn找不到而被disable,因而涉及到c3d的卷积层NdConvolution都不会被编译!),例如,存在 /usr/include/cudnn_version.h,且/usr/lib/aarch64-linux-gnu/libcudnn.so是指向libcudnn.so.8.0.0:

 /usr/lib/aarch64-linux-gnu/libcudnn.so -> /etc/alternatives/libcudnn_so
 /etc/alternatives/libcudnn_so -> /usr/lib/aarch64-linux-gnu/libcudnn.so.8
 /usr/lib/aarch64-linux-gnu/libcudnn.so.8 -> libcudnn.so.8.0.0 

如果不存在,使用命令安装或者修复:

apt-get install --reinstall libcudnn8-dev

补充一个提示,如果你的caffe源码对cudnn_conv_layer.cpp和cudnn_deconv_layer.cpp做了上面的修改还是编译时报这样类似的错:

include/caffe/util/cudnn.hpp: In function ‘void libdnn::cudnn::setConvolutionDesc(cudnnConvolutionStruct**, cudnnTensorDescriptor_t, cudnnFilterDescriptor_t, int, int, int, int)’:
include/caffe/util/cudnn.hpp:109:70: error: too few arguments to function ‘cudnnStatus_t cudnnSetConvolution2dDescriptor(cudnnConvolutionDescriptor_t, int, int, int, int, int, int, cudnnConvolutionMode_t, cudnnDataType_t)’
       pad_h, pad_w, stride_h, stride_w, 1, 1, CUDNN_CROSS_CORRELATION));

这说明你用的caffe的代码版本太老了,需要到GitHub上的BVLC/caffe仓库(https://github.com/BVLC/caffe)clone最新版本的代码,可能Makefile和Makefile.config也都得相应换成最新的,然后根据项目情况修改一下。

你可能感兴趣的:(video-caffe,深度学习,Caffe,video-caffe,caffe,深度学习,cudnn8,cudnn)