// Row (horizontal) box filter with rescaling, staged through local memory.
//
// Each work-group handles one tile of row get_group_id(1). Every work-item
// first copies one float4 pixel into uc4LocalData (zero if its x coordinate
// lies outside [0, width)); after the barrier, the work-items whose local
// index is in [iRadiusAligned, iRadiusAligned + uiNumOutputPix) and whose
// pixel is inside the row sum the 2*iRadius+1 neighbouring local-memory
// entries and write the sum divided by (2*iRadius+1) to uiDest.
//
// Parameters:
//   uc4Source       input pixels, indexed as y * width + x
//   uiDest          output pixels, same indexing as uc4Source
//   uc4LocalData    local scratch buffer, one float4 per work-item
//   width           row length in pixels (also used as the row stride)
//   height          not referenced by this kernel
//   iRadius         filter radius; the window is 2*iRadius+1 pixels wide
//   iRadiusAligned  number of leading apron work-items in each group
//   uiNumOutputPix  number of output pixels produced per work-group
//
// Notes:
//   - Pixels outside the row contribute zero, but the divisor is always
//     2*iRadius+1.
//   - The window reads local indices localX-iRadius .. localX+iRadius, so the
//     launch presumably uses iRadiusAligned >= iRadius and a local size of at
//     least iRadiusAligned + uiNumOutputPix + iRadius -- TODO confirm on host.
__kernel void BoxRowsLmem(__global const float4* uc4Source,
                          __global float4* uiDest,
                          __local float4* uc4LocalData,
                          int width,
                          int height,
                          int iRadius,
                          int iRadiusAligned,
                          unsigned int uiNumOutputPix)
{
    const size_t localX = get_local_id(0);

    // Image coordinates of the pixel this work-item stages: the tile start,
    // plus the local index, shifted left by the aligned apron.
    const int pixelX = ((int)get_group_id(0) * uiNumOutputPix) + (int)localX - iRadiusAligned;
    const int pixelY = (int)get_group_id(1);
    const int pixelIndex = pixelY * width + pixelX;
    const bool insideRow = (pixelX >= 0) && (pixelX < width);

    // Stage one pixel into local memory; positions outside the row become zero.
    // (Kept as if/else so the global read is never issued for an invalid index.)
    if (insideRow)
    {
        uc4LocalData[localX] = uc4Source[pixelIndex];
    }
    else
    {
        uc4LocalData[localX] = (float4)0.0f;
    }

    // Every work-item must reach this barrier before any window is summed.
    barrier(CLK_LOCAL_MEM_FENCE);

    // Apron work-items and work-items mapped outside the row only load data.
    if (!insideRow || (localX < iRadiusAligned) || (localX >= (iRadiusAligned + (int)uiNumOutputPix)))
    {
        return;
    }

    // Accumulate the 2*iRadius+1 entries centred on this work-item.
    const int firstTap = (int)localX - iRadius;
    const int tapCount = (2 * iRadius) + 1;
    float4 windowSum = (float4)0.0f;
    for (int tap = 0; tap < tapCount; ++tap)
    {
        windowSum += uc4LocalData[firstTap + tap];
    }

    // Normalise by the window size and write the result to global memory.
    const float invArea = 1.0f / (float)(iRadius * 2 + 1);
    uiDest[pixelIndex] = windowSum * invArea;
}

// Column (vertical) box filter using coalesced global memory reads
//*****************************************************************
// Each work-item processes the column selected by get_global_id(0), walking
// it from row 0 to row height-1 with a running sum of 2*iRadius+1 rows.
// Rows above the image are replaced by row 0 and rows below it by row
// height-1. Every output is the running sum multiplied by 1/(2*iRadius+1).
//
// Parameters:
//   inputImage   input pixels, indexed as y * width + x
//   outputImage  output pixels, same indexing as inputImage
//   width        row stride in pixels
//   height       number of rows
//   iRadius      filter radius; the window is 2*iRadius+1 rows tall
//
// Notes:
//   - The edge loops read rows up to 2*iRadius without checking against
//     height, so the indexing assumes height >= 2*iRadius + 1.
//   - get_global_id(0) is not checked against width; the launch presumably
//     uses exactly one work-item per column -- TODO confirm on host.
__kernel void BoxColumns(__global float4* inputImage,
                         __global float4* outputImage,
                         int width,
                         int height,
                         int iRadius)
{
    // Rebase both pointers onto this work-item's column so that the code
    // below only has to index by row (y * width).
    size_t globalPosX = get_global_id(0);
    inputImage = &inputImage[globalPosX];
    outputImage = &outputImage[globalPosX];

    const float4 areaInv = (float4)(1.0f/(float)(iRadius*2+1));

    // Top edge: the iRadius rows above the image are stood in for by row 0,
    // then rows 0..iRadius are added to complete the first window.
    float4 f4Sum = inputImage[0] * (float4)(iRadius);
    for (int y = 0; y < iRadius + 1; y++)
    {
        f4Sum += inputImage[y * width];
    }
    // Row 0 must be normalised like every other row (the scale factor was
    // previously missing here, leaving the raw sum in the first output row).
    outputImage[0] = f4Sum*areaInv;
    for (int y = 1; y < iRadius + 1; y++)
    {
        // The row leaving the window is still a clamped copy of row 0.
        f4Sum += inputImage[(y + iRadius) * width];
        f4Sum -= inputImage[0];
        outputImage[y * width] = f4Sum*areaInv;
    }

    // Main loop: the window lies fully inside the image; add the row
    // entering at y + iRadius and drop the row leaving at y - iRadius - 1.
    for (int y = iRadius + 1; y < height - iRadius; y++)
    {
        f4Sum += inputImage[(y + iRadius) * width];
        f4Sum -= inputImage[((y - iRadius) * width) - width];
        outputImage[y * width] = f4Sum*areaInv;
    }

    // Bottom edge: rows below the image are stood in for by row height-1.
    for (int y = height - iRadius; y < height; y++)
    {
        f4Sum += inputImage[(height - 1) * width];
        f4Sum -= inputImage[((y - iRadius) * width) - width];
        outputImage[y * width] = f4Sum*areaInv;
    }
}
