GraphicsMagick 的 OpenCL 开发记录(三十四)

文章目录

  • 如何写`ScaleImage()`的硬件加速函数(八)

<2022-05-05 周四>

如何写ScaleImage()的硬件加速函数(八)

我觉得Y方向的缩放以下面这种ScaleFilter()的方法是实现不了的,我只能添加进X方向的处理,缩小正常,放大的话图片变亮。

STRINGIFY(
  __kernel __attribute__((reqd_work_group_size(256, 1, 1)))
  void ScaleFilter(const __global CLQuantum* inputImage, const unsigned int matte_or_cmyk,
    const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum* filteredImage,
    const unsigned int filteredColumns, const unsigned int filteredRows,
    const float resizeFilterScale,
    __local CLQuantum* inputImageCache, const int numCachedPixels,
    const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize,
    __local float4* outputPixelCache, __local float* densityCache, __local float* gammaCache)
{
  // calculate the range of resized image pixels computed by this workgroup
  const unsigned int startX = get_group_id(0) * pixelPerWorkgroup;
  const unsigned int stopX = MagickMin(startX + pixelPerWorkgroup, filteredColumns);
  const unsigned int actualNumPixelToCompute = stopX - startX;

  float xFactor = (float)filteredColumns / inputColumns;

  // calculate the range of input image pixels to cache
  const int cacheRangeStartX = MagickMax((int)((startX + 0.5f) / xFactor), (int)(0));
  const int cacheRangeEndX = MagickMin((int)(cacheRangeStartX + numCachedPixels), (int)inputColumns);

  // cache the input pixels into local memory
  const unsigned int y = get_global_id(1);
  const unsigned int pos = getPixelIndex(4, inputColumns, cacheRangeStartX, y / xFactor);
  const unsigned int num_elements = (cacheRangeEndX - cacheRangeStartX) * 4;
  event_t e = async_work_group_copy(inputImageCache, inputImage + pos, num_elements, 0);
  wait_group_events(1, &e);

  unsigned int totalNumChunks = (actualNumPixelToCompute + pixelChunkSize - 1) / pixelChunkSize;
  for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
  {
    const unsigned int chunkStartX = startX + chunk * pixelChunkSize;
    const unsigned int chunkStopX = MagickMin(chunkStartX + pixelChunkSize, stopX);
    const unsigned int actualNumPixelInThisChunk = chunkStopX - chunkStartX;

    // determine which resized pixel computed by this workitem
    const unsigned int itemID = get_local_id(0);
    const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(0));

    const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(0));

    float4 filteredPixel = (float4)0.0f;

    // -1 means this workitem doesn't participate in the computation
    if (pixelIndex != -1)
    {
      // x coordinated of the resized pixel computed by this workitem
      const int x = chunkStartX + pixelIndex;

      // calculate how many steps required for this pixel
      const float bisect = (x + 0.5) / xFactor + MagickEpsilon;
      const unsigned int start = (unsigned int)MagickMax(bisect, 0.0f);
      const unsigned int stop = (unsigned int)MagickMin(bisect + 1, (float)inputColumns);
      const unsigned int n = stop - start;

      // calculate how many steps this workitem will contribute
      unsigned int numStepsPerWorkItem = n / numItems;
      numStepsPerWorkItem += ((numItems * numStepsPerWorkItem) == n ? 0 : 1);

      const unsigned int startStep = (itemID % numItems) * numStepsPerWorkItem;
      if (startStep < n)
      {
        float x_scale = (float)filteredColumns / inputColumns;
        float x_span = 1.0;
        float x_volume = 0.0;
        float factor = 0.0;

        const unsigned int stopStep = MagickMin(startStep + numStepsPerWorkItem, n);

        unsigned int cacheIndex = start + startStep - cacheRangeStartX;
        for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++)
        {
          float4 cp = (float4)0.0f;

          __local CLQuantum* p = inputImageCache + (cacheIndex * 4);
          cp.x = (float)*(p);
          cp.y = (float)*(p + 1);
          cp.z = (float)*(p + 2);
          cp.w = (float)*(p + 3);

          while (x_scale >= x_span) {
            if (x_volume > 0.0 && x_volume < 1.0) {
              factor = 1 / x_volume;
              filteredPixel.x *= factor;
              filteredPixel.y *= factor;
              filteredPixel.z *= factor;
            }
            if (cp.w < 255.0) {
              x_volume += x_span;
            }
            filteredPixel += x_span * cp;
            filteredPixel.x = filteredPixel.x > 255.0 ? 255.0 : filteredPixel.x;
            filteredPixel.y = filteredPixel.y > 255.0 ? 255.0 : filteredPixel.y;
            filteredPixel.z = filteredPixel.z > 255.0 ? 255.0 : filteredPixel.z;
            filteredPixel.w = filteredPixel.w > 255.0 ? 255.0 : filteredPixel.w;
            x_scale -= x_span;
            x_span = 1.0;
          }

          if (x_scale > 0.0) {
            if (x_volume > 0.0 && x_volume < 1.0) {
              factor = 1 / x_volume;
              filteredPixel.x *= factor;
              filteredPixel.y *= factor;
              filteredPixel.z *= factor;
            }
            if (cp.w < 255.0)
              x_volume += x_scale;
            filteredPixel += x_scale * cp;
            x_span -= x_scale;
          }

          if (x_span > 0.0) {
            if (cp.w < 255.0)
              x_volume += x_span;
            filteredPixel += x_span * cp;
          }

          filteredPixel.x = filteredPixel.x > 255.0 ? 255.0 : filteredPixel.x;
          filteredPixel.y = filteredPixel.y > 255.0 ? 255.0 : filteredPixel.y;
          filteredPixel.z = filteredPixel.z > 255.0 ? 255.0 : filteredPixel.z;
          filteredPixel.w = filteredPixel.w > 255.0 ? 255.0 : filteredPixel.w;
        }
      }
    }

    if (itemID < actualNumPixelInThisChunk) {
      outputPixelCache[itemID] = (float4)0.0f;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    for (unsigned int i = 0; i < numItems; i++) {
      if (pixelIndex != -1) {
        if (itemID % numItems == i) {
          outputPixelCache[pixelIndex] += filteredPixel;
        }
      }
      barrier(CLK_LOCAL_MEM_FENCE);
    }

    if (itemID < actualNumPixelInThisChunk)
    {
      float4 filteredPixel = outputPixelCache[itemID];
      WriteAllChannels(filteredImage, 4, filteredColumns, chunkStartX + itemID, y, filteredPixel);
    }
  }
}
)

你可能感兴趣的:(GraphicsMagick,的,OpenCL,开发,ImageMagick,OpenCL,c++,GraphicsMagick)