GraphicsMagick 的 OpenCL 开发记录(二十六)

文章目录

  • 我给`ImageMagick`的`ResizeHorizontalFilter`核函数添加了注释

<2022-01-23 周日>

我给 ImageMagick 的 ResizeHorizontalFilter 核函数添加了注释

ResizeHorizontalFilter这个函数是做什么的?为什么要分析它?

  1. ImageMagick中,缩放函数ResizeImage是用于图片的高质量缩放,因为侧重于缩放质量,所以ResizeImage的耗时必定很高,所以在启用了OpenCL的情况下,就有了一个OpenCL版的ResizeImage,即AccelerateResizeImage
  2. ResizeImage 缩放时会调用两个函数,分别处理水平和垂直两个方向,即 HorizontalFilter 和 VerticalFilter
  3. AccelerateResizeImage 中调用的是 resizeHorizontalFilter 和 resizeVerticalFilter 这两个函数,它们并不负责缩放任务,它们的主要任务是完成传参,最终调用真正的核函数(kernel 函数),即 ResizeHorizontalFilter 和 ResizeVerticalFilter
  4. 在理解了原函数HorizontalFilter的流程情况下,再来理解核函数ResizeHorizontalFilter,就有了对照,可以很快的写出其它普通函数的核函数。
  5. ResizeVerticalFilter 与 ResizeHorizontalFilter 大同小异,理解了其中一个,另外一个也就理解了。
  6. ResizeHorizontalFilter函数我认真地看了可能有一天时间,涉及的变量太多,由于本人水平有限,不能说理解的完全正确,这里仅供参考。如下:
STRINGIFY(
/*
  ResizeHorizontalFilter -- horizontal pass of the OpenCL image resize.

  Each work-group (256 x 1 x 1 work-items) produces the output columns
  [startX, stopX) of output row y = get_global_id(1). Every output pixel is a
  weighted sum of input pixels of the same row; the weights come from
  getResizeFilterWeight().

  Parameters as used by the code below:
    inputImage            source pixels in global memory (read only).
    number_channels       channels per pixel; when it is 2 or 4 the last
                          channel is treated as alpha.
    inputColumns          width of the source image; clamps the cached span
                          and the per-pixel tap range.
    inputRows             not referenced in the kernel body.
    filteredImage         destination pixels in global memory.
    filteredColumns       width of the destination image; clamps stopX and is
                          forwarded to WriteAllChannels().
    filteredRows          not referenced in the kernel body.
    xFactor               horizontal scale factor; output column x is centred
                          on input position (x+0.5)/xFactor.
    resizeFilterType, resizeWindowType, resizeFilterCubicCoefficients,
    resizeFilterScale, resizeFilterWindowSupport, resizeFilterBlur
                          forwarded unchanged to getResizeFilterWeight().
    resizeFilterSupport   filter support, scaled below into `support`.
    inputImageCache       local-memory buffer that receives the span of the
                          input row needed by this work-group.
    numCachedPixels       maximum number of input columns held in that buffer.
    pixelPerWorkgroup     number of output columns assigned to one work-group.
    pixelChunkSize        number of output columns processed per iteration of
                          the chunk loop.
    outputPixelCache, densityCache, gammaCache
                          local-memory accumulators, indexed by the pixel's
                          position inside the current chunk.

  Helpers getPixelIndex, getNumWorkItemsPerPixel, pixelToCompute,
  getResizeFilterWeight, PerceptibleReciprocal and WriteAllChannels are defined
  elsewhere; notes about their behaviour below are assumptions to be confirmed.
*/
__kernel __attribute__((reqd_work_group_size(256, 1, 1)))
  void ResizeHorizontalFilter(const __global CLQuantum *inputImage, const unsigned int number_channels,
    const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum *filteredImage,
    const unsigned int filteredColumns, const unsigned int filteredRows, const float xFactor,
    const int resizeFilterType, const int resizeWindowType, const __global float *resizeFilterCubicCoefficients,
    const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport,
    const float resizeFilterBlur, __local CLQuantum *inputImageCache, const int numCachedPixels,
    const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize,
    __local float4 *outputPixelCache, __local float *densityCache, __local float *gammaCache)
{
  // calculate the range of resized image pixels computed by this workgroup
  const unsigned int startX = get_group_id(0)*pixelPerWorkgroup;
  const unsigned int stopX = MagickMin(startX + pixelPerWorkgroup,filteredColumns);
  // Number of output columns handled by THIS work-group (the last group may get
  // fewer because stopX is clamped to filteredColumns). Author's note: taken
  // over all work-groups this plays the role of the outermost loop of the CPU
  // routine HorizontalFilter:
  // for (x=0; x < (ssize_t) resize_image->columns; x++)
  const unsigned int actualNumPixelToCompute = stopX - startX;

  // calculate the range of input image pixels to cache
  float scale = MagickMax(1.0f/xFactor+MagickEpsilon ,1.0f);
  const float support = MagickMax(scale*resizeFilterSupport,0.5f);
  scale = PerceptibleReciprocal(scale);
  // From here on scale and support depend only on kernel arguments, so they
  // are the same for every work-item.

  /* Author's note: the per-pixel range computed further down corresponds to
     this code in the CPU routine HorizontalFilter:
  bisect=(double) (x+0.5)/x_factor+MagickEpsilon;
  start=(ssize_t) MagickMax(bisect-support+0.5,0.0);
  stop=(ssize_t) MagickMin(bisect+support+0.5,(double) image->columns);
  density=0.0;
  contribution=contributions[id];
  for (n=0; n < (stop-start); n++)
  {
    contribution[n].pixel=start+n;
    contribution[n].weight=GetResizeFilterWeight(resize_filter,scale*
      ((double) (start+n)-bisect+0.5));
    density+=contribution[n].weight;
  }*/
  // cacheRangeStartX is the `start` expression evaluated for the first output
  // column of the work-group (startX). cacheRangeEndX is not derived from the
  // `stop` expression; it is cacheRangeStartX + numCachedPixels, clamped to the
  // image width. Author's note: the host-side resizeHorizontalFilter (not shown
  // here) computes the argument as
  //   numCachedPixels=(int) ceil((pixelPerWorkgroup-1)/xFactor+2*support);
  // NOTE(review): the +2*support term presumably makes the cached span reach
  // the `stop` of the last output column of the work-group -- confirm against
  // the host code.
  const int cacheRangeStartX = MagickMax((int)((startX+0.5f)/xFactor+MagickEpsilon-support+0.5f),(int)(0));
  const int cacheRangeEndX = MagickMin((int)(cacheRangeStartX + numCachedPixels), (int)inputColumns);

  // cache the input pixels into local memory
  const unsigned int y = get_global_id(1);
  // pos: element offset into inputImage of column cacheRangeStartX, presumably
  // on row y (getPixelIndex is not shown -- confirm).
  const unsigned int pos = getPixelIndex(number_channels, inputColumns, cacheRangeStartX, y);
  const unsigned int num_elements = (cacheRangeEndX - cacheRangeStartX) * number_channels;
  // Author's question: on the host side inputImageCache corresponds to a size_t
  // (imageCacheLocalMemorySize in resizeHorizontalFilter); how does it become a
  // pointer here?
  // NOTE(review): a __local kernel argument is normally set by passing only a
  // byte size with a NULL value to clSetKernelArg; the runtime then provides
  // the local buffer. That would explain the size_t -- confirm in the host code.
  // Author's note: inputImage is obtained in ComputeResizeImage through
  // GetAuthenticOpenCLBuffer (host code, not shown here).
  // async_work_group_copy exists in two directions (__global -> __local and
  // __local -> __global). Here, per the qualifiers in the signature, it copies
  // num_elements values from global inputImage + pos into local
  // inputImageCache; wait_group_events waits for that copy before the cache is
  // read below.
  event_t e = async_work_group_copy(inputImageCache, inputImage + pos, num_elements, 0);
  wait_group_events(1, &e);

  // With 2 or 4 channels the last channel is alpha; 0 means "no alpha channel".
  unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0;
  // Ceiling division: number of chunks needed to cover this work-group's columns.
  unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
  for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
  {
    const unsigned int chunkStartX = startX + chunk*pixelChunkSize;
    const unsigned int chunkStopX = MagickMin(chunkStartX + pixelChunkSize, stopX);
    const unsigned int actualNumPixelInThisChunk = chunkStopX - chunkStartX;

    // determine which resized pixel computed by this workitem
    const unsigned int itemID = get_local_id(0);
    const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(0));

    const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(0));

    // How the work is divided, as far as this kernel shows:
    //  - the dispatch contains several work-groups; each one owns the output
    //    columns [startX, stopX), at most pixelPerWorkgroup of them;
    //  - this loop splits those columns into totalNumChunks chunks of at most
    //    pixelChunkSize columns, i.e. actualNumPixelInThisChunk columns each;
    //  - itemID is the work-item's index inside the work-group
    //    (get_local_id(0)); the work-group has get_local_size(0) work-items;
    //  - pixelIndex is the index, inside the chunk, of the output pixel this
    //    work-item contributes to, or -1 if it does not take part;
    //  - numItems is used below as the number of work-items that share one
    //    output pixel: the filter taps of the pixel are split into numItems
    //    slices selected by itemID%numItems.
    // NOTE(review): getNumWorkItemsPerPixel and pixelToCompute are not shown.
    // The code presumably relies on get_local_size(0) being at least
    // actualNumPixelInThisChunk (see the itemID < actualNumPixelInThisChunk
    // tests below) -- confirm against the helpers and the host-side choice of
    // pixelChunkSize.

    float4 filteredPixel = (float4)0.0f;
    float density = 0.0f;
    float gamma = 0.0f;
    // -1 means this workitem doesn't participate in the computation
    if (pixelIndex != -1)
    {
      // x coordinated of the resized pixel computed by this workitem
      const int x = chunkStartX + pixelIndex;

      // calculate how many steps required for this pixel
      // NOTE(review): 0.5 here is a double literal while the neighbouring lines
      // use 0.5f -- confirm whether double arithmetic is intended on the device.
      const float bisect = (x+0.5)/xFactor+MagickEpsilon;
      const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f);
      const unsigned int stop  = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputColumns);
      const unsigned int n = stop - start;

      // calculate how many steps this workitem will contribute
      // (n / numItems rounded up)
      unsigned int numStepsPerWorkItem = n / numItems;
      numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);

      // This work-item handles taps [startStep, stopStep) of the pixel's n taps.
      const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
      if (startStep < n)
      {
        const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);

        // Index into inputImageCache, which starts at input column cacheRangeStartX.
        unsigned int cacheIndex = start+startStep-cacheRangeStartX;
        for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++)
        {
          // Same weight argument as the CPU code: scale*((start+n)-bisect+0.5).
          float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,
            (ResizeWeightingFunctionType) resizeFilterType,
            (ResizeWeightingFunctionType) resizeWindowType,
            resizeFilterScale, resizeFilterWindowSupport,
            resizeFilterBlur, scale*(start + i - bisect + 0.5));

          float4 cp = (float4)0.0f;

          // Load the cached input pixel: channel 0 always, channels 1 and 2
          // only when there are more than two channels, alpha when present.
          __local CLQuantum *p = inputImageCache + (cacheIndex*number_channels);
          cp.x = (float) *(p);
          if (number_channels > 2)
          {
            cp.y = (float) *(p + 1);
            cp.z = (float) *(p + 2);
          }

          if (alpha_index != 0)
          {
            cp.w = (float) *(p + alpha_index);

            // Colour channels are weighted by weight * normalised alpha; the
            // alpha channel itself by weight only. gamma sums the alpha weights.
            float alpha = weight * QuantumScale * cp.w;

            filteredPixel.x += alpha * cp.x;
            filteredPixel.y += alpha * cp.y;
            filteredPixel.z += alpha * cp.z;
            filteredPixel.w += weight * cp.w;
            gamma += alpha;
          }
          else
            filteredPixel += ((float4) weight)*cp;

          density += weight;
        }
      }
    }

    // initialize the accumulators to zero
    // (one slot per output pixel of the chunk, cleared by the work-item whose
    // itemID equals the slot index)
    if (itemID < actualNumPixelInThisChunk) {
      outputPixelCache[itemID] = (float4)0.0f;
      densityCache[itemID] = 0.0f;
      if (alpha_index != 0)
        gammaCache[itemID] = 0.0f;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // accumulatte the filtered pixel value and the density
    // In iteration i only work-items with itemID%numItems == i add their
    // partial sums, and a barrier separates the iterations; the barrier sits
    // outside the conditionals so every work-item reaches it. Presumably the
    // work-items sharing one pixelIndex have distinct itemID%numItems values,
    // so no two of them update the same slot in the same iteration -- confirm
    // against pixelToCompute.
    for (unsigned int i = 0; i < numItems; i++) {
      if (pixelIndex != -1) {
        if (itemID%numItems == i) {
          outputPixelCache[pixelIndex]+=filteredPixel;
          densityCache[pixelIndex]+=density;
          if (alpha_index != 0)
            gammaCache[pixelIndex]+=gamma;
        }
      }
      barrier(CLK_LOCAL_MEM_FENCE);
    }

    // Final step: work-item itemID normalises and writes output pixel itemID of
    // the chunk. The three locals below deliberately shadow the per-work-item
    // partial sums declared at the top of the loop body.
    if (itemID < actualNumPixelInThisChunk)
    {
      float4 filteredPixel = outputPixelCache[itemID];

      float gamma = 0.0f;
      if (alpha_index != 0)
        gamma = gammaCache[itemID];

      // Divide by the total weight unless it is exactly 0 or 1.
      float density = densityCache[itemID];
      if ((density != 0.0f) && (density != 1.0f))
      {
        density = PerceptibleReciprocal(density);
        filteredPixel *= (float4) density;
        if (alpha_index != 0)
          gamma *= density;
      }

      // With alpha, the colour channels were accumulated alpha-weighted; scale
      // them by the reciprocal of the accumulated alpha weight.
      if (alpha_index != 0)
      {
        gamma = PerceptibleReciprocal(gamma);
        filteredPixel.x *= gamma;
        filteredPixel.y *= gamma;
        filteredPixel.z *= gamma;
      }

      // Store the result at output column chunkStartX + itemID, presumably on
      // row y (WriteAllChannels is not shown -- confirm).
      WriteAllChannels(filteredImage, number_channels, filteredColumns, chunkStartX + itemID, y, filteredPixel);
    }
  }
}
)

你可能感兴趣的:(GraphicsMagick,的,OpenCL,开发,ImageMagick,OpenCL,c++,GraphicsMagick)