Accelerating MobileNet SSD with TensorRT from a Caffe Model

Accelerating MobileNet SSD with TensorRT breaks down into three problems:

1) image preprocessing;

2) postprocessing of the detection results;

3) implementing the depthwise convolution layer.

For 1), image preprocessing, there is not much to say; see the sketch below.
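A minimal preprocessing sketch, assuming the common chuanqi305 MobileNet SSD transform (300x300 input, mean 127.5, scale 0.007843); these values are an assumption here, so check them against your own deploy.prototxt:

#include <opencv2/opencv.hpp>

// Resize to 300x300, subtract the mean, apply the scale, and write a CHW
// float buffer (3*300*300 floats) in BGR channel order, matching Caffe/OpenCV.
void preprocess(const cv::Mat& bgr, float* input)
{
    cv::Mat resized;
    cv::resize(bgr, resized, cv::Size(300, 300));
    for (int c = 0; c < 3; ++c)
        for (int y = 0; y < 300; ++y)
            for (int x = 0; x < 300; ++x)
                input[(c * 300 + y) * 300 + x] =
                    (resized.at<cv::Vec3b>(y, x)[c] - 127.5f) * 0.007843f;
}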

For 2), postprocessing: since TensorRT provides the SSD detectout plugin, there is little to say about handling its output either; it returns 100 detections, so a simple for loop is enough (see the sketch below). This demo is a useful reference: https://github.com/maomaoyuchengzi/MobileNetSSD-detect
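A sketch of that loop, assuming the standard Caffe SSD DetectionOutput layout of 7 floats per detection, [image_id, label, confidence, xmin, ymin, xmax, ymax], with coordinates normalized to [0,1] and keep_top_k = 100; the 0.5 confidence threshold is only an illustrative choice:

#include <cstdio>

// Walk the host copy of the detection_out blob and report boxes above a
// confidence threshold, scaling the normalized coordinates back to pixels.
void parseDetections(const float* output, int imgW, int imgH)
{
    const int keepTopK = 100;
    for (int i = 0; i < keepTopK; ++i) {
        const float* det = output + i * 7;   // one detection = 7 floats
        const float conf = det[2];
        if (conf < 0.5f) continue;
        const int label = static_cast<int>(det[1]);
        const int xmin = static_cast<int>(det[3] * imgW);
        const int ymin = static_cast<int>(det[4] * imgH);
        const int xmax = static_cast<int>(det[5] * imgW);
        const int ymax = static_cast<int>(det[6] * imgH);
        std::printf("label=%d conf=%.2f box=(%d,%d,%d,%d)\n",
                    label, conf, xmin, ymin, xmax, ymax);
    }
}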

For 3), I referenced the PReLU plugin layer and Caffe's GPU forward code. Part of the code is pasted below:

class DepthWiseConvPlugin : public IPlugin
{
public:
    DepthWiseConvPlugin();
    ~DepthWiseConvPlugin(){
        if (mWeights.values){
            free(const_cast<void*>(mWeights.values));
        }
        if (mbias.values){
            free(const_cast<void*>(mbias.values));
        }
    }
    DepthWiseConvPlugin(DepthWiseConv_Param DWConv_param,const Weights *weights, int nbWeights){
        mdepthWiseConv_param.bias_term = DWConv_param.bias_term;
        mdepthWiseConv_param.kernel_h = DWConv_param.kernel_h;
        mdepthWiseConv_param.kernel_w = DWConv_param.kernel_w;
        mdepthWiseConv_param.pad_h = DWConv_param.pad_h;
        mdepthWiseConv_param.pad_w = DWConv_param.pad_w;
        mdepthWiseConv_param.stride_h = DWConv_param.stride_h;
        mdepthWiseConv_param.stride_w = DWConv_param.stride_w;

        if(mdepthWiseConv_param.bias_term){
            assert(nbWeights==2);
            mWeights = weights[0];
            mbias = weights[1];
            assert(mWeights.type == DataType::kFLOAT || mWeights.type == DataType::kHALF);
            mWeights.values = malloc(mWeights.count*type2size(mWeights.type));
            memcpy(const_cast<void*>(mWeights.values),weights[0].values,mWeights.count*type2size(mWeights.type));

            assert(mbias.type == DataType::kFLOAT || mbias.type == DataType::kHALF);
            mbias.values = malloc(mbias.count*type2size(mbias.type));
            memcpy(const_cast<void*>(mbias.values),weights[1].values,mbias.count*type2size(mbias.type));

        }
        else{
            assert(nbWeights==1);
            mWeights = weights[0];
            // no bias term: leave mbias empty so the destructor stays safe
            mbias = Weights{DataType::kFLOAT, nullptr, 0};
            assert(mWeights.type == DataType::kFLOAT || mWeights.type == DataType::kHALF);
            mWeights.values = malloc(mWeights.count*type2size(mWeights.type));
            memcpy(const_cast<void*>(mWeights.values),weights[0].values,mWeights.count*type2size(mWeights.type));
        }
    }
    DepthWiseConvPlugin(const void* buffer, size_t size){
        const char* d = reinterpret_cast<const char*>(buffer), *a = d;
        read(d, m_top_count);

        read(d, mdepthWiseConv_param.channels);
        read(d, mdepthWiseConv_param.height);
        read(d, mdepthWiseConv_param.width);
        read(d, mdepthWiseConv_param.kernel_h);
        read(d, mdepthWiseConv_param.kernel_w);
        read(d, mdepthWiseConv_param.stride_h);
        read(d, mdepthWiseConv_param.stride_w);
        read(d, mdepthWiseConv_param.pad_h);
        read(d, mdepthWiseConv_param.pad_w);
        read(d, mdepthWiseConv_param.conved_height);
        read(d, mdepthWiseConv_param.conved_width);
        read(d,mdepthWiseConv_param.bias_term);

        read(d,mWeights.count);
        read(d,mWeights.type);

        mWeights.values = malloc(mWeights.count * type2size(mWeights.type));
        memcpy(const_cast<void*>(mWeights.values), d, mWeights.count * type2size(mWeights.type));
        d += mWeights.count * type2size(mWeights.type);

        if(mdepthWiseConv_param.bias_term)
        {
            read(d,mbias.count);
            read(d,mbias.type);

            mbias.values = malloc(mbias.count * type2size(mbias.type));
            memcpy(const_cast<void*>(mbias.values), d, mbias.count * type2size(mbias.type));
            d += mbias.count * type2size(mbias.type);
        }
        assert(d == a + size);
    }

    inline int getNbOutputs() const override {
        return 1;
    }
    Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override{
        assert(index == 0 && nbInputDims == 1 && inputs[0].nbDims == 3);
        mdepthWiseConv_param.channels = inputs[0].d[0];
        mdepthWiseConv_param.height = inputs[0].d[1];
        mdepthWiseConv_param.width = inputs[0].d[2];

        // standard convolution output size: (in + 2*pad - kernel) / stride + 1
        int h_output = (inputs[0].d[1] + 2*mdepthWiseConv_param.pad_h - mdepthWiseConv_param.kernel_h)/mdepthWiseConv_param.stride_h + 1;
        int w_output = (inputs[0].d[2] + 2*mdepthWiseConv_param.pad_w - mdepthWiseConv_param.kernel_w)/mdepthWiseConv_param.stride_w + 1;
        mdimstop =DimsCHW(inputs[0].d[0], h_output, w_output);
        mdepthWiseConv_param.conved_height = mdimstop.h();
        mdepthWiseConv_param.conved_width = mdimstop.w();
        m_top_count = mdimstop.c()*mdimstop.h()*mdimstop.w();
//        cout << "depthwise_Conv: c = " << mdimstop.c() << " h = " << mdimstop.h() << " w = " << mdimstop.w() << endl;
        return mdimstop;
    }
    // ... (configure / enqueue / serialize and the remaining members are omitted here)
};
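The snippet above stops before the forward pass itself. For reference, here is a minimal sketch of the device kernel that enqueue() can launch, adapted from Caffe's depthwise-convolution GPU forward; parameter names follow DepthWiseConv_Param above, and the float-only handling and grid/block sizing are simplifying assumptions, not the exact original:

// One thread per output element; nthreads = N * channels * conved_h * conved_w.
// float-only for brevity (the plugin above also accepts kHALF weights).
__global__ void DepthwiseConvForward(const int nthreads,
        const float* bottom, const float* weight, const float* bias,
        const int channels, const int height, const int width,
        const int conved_height, const int conved_width,
        const int kernel_h, const int kernel_w,
        const int stride_h, const int stride_w,
        const int pad_h, const int pad_w,
        const bool bias_term, float* top)
{
    for (int index = blockIdx.x * blockDim.x + threadIdx.x;
         index < nthreads; index += blockDim.x * gridDim.x) {
        const int pw = index % conved_width;
        const int ph = (index / conved_width) % conved_height;
        const int c  = (index / conved_width / conved_height) % channels;
        const int n  = index / conved_width / conved_height / channels;
        // top-left corner of the receptive field in the input (may be negative)
        const int hstart = ph * stride_h - pad_h;
        const int wstart = pw * stride_w - pad_w;
        // depthwise: each output channel reads exactly one input channel
        const float* bottom_slice = bottom + (n * channels + c) * height * width;
        const float* weight_slice = weight + c * kernel_h * kernel_w;
        float value = 0.f;
        for (int kh = 0; kh < kernel_h; ++kh) {
            for (int kw = 0; kw < kernel_w; ++kw) {
                const int h = hstart + kh;
                const int w = wstart + kw;
                if (h >= 0 && h < height && w >= 0 && w < width)
                    value += weight_slice[kh * kernel_w + kw] * bottom_slice[h * width + w];
            }
        }
        if (bias_term)
            value += bias[c];
        top[index] = value;
    }
}

// enqueue() would launch it roughly as:
//   int count = batchSize * m_top_count;
//   DepthwiseConvForward<<<(count + 511) / 512, 512, 0, stream>>>(count, ...);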

At runtime this then failed with the error "Plugin layer output count is not equal to caffe output count".

Solution:

The original prototxt:

layer {
  name: "detection_out"
  type: "IPlugin"
  bottom: "mbox_loc"
  bottom: "mbox_conf_flatten"
  bottom: "mbox_priorbox"
  top: "detection_out"
...
}

Adding a second top blob fixes it; after the change:

layer {
  name: "detection_out"
  type: "IPlugin"
  bottom: "mbox_loc"
  bottom: "mbox_conf_flatten"
  bottom: "mbox_priorbox"
  top: "detection_out"
  #here
  top:"out2"
...
}

This resolves the problem. The fix came from these two threads: https://devtalk.nvidia.com/default/topic/1025153/?comment=5214393

https://github.com/dusty-nv/jetson-inference/issues/171#issuecomment-360982183

Here is the original text of the second reference:

wa. I've solved this problem, it's because the tensorrt ssd implementation of detection output layer has TWO outputs. Therefore you should add an output blob in the prototxt file. I've built it successfully.

Thanks to the selfless contributors.
