// Edge length of a square patch, in elements.
#define patchSide 8
// log2(patchSide); used to turn a patch row index into a row-major offset
// via (row << patchSideSh).
#define patchSideSh 3
// log2(side_2); not referenced by the apply_stack kernel visible here.
#define side_2_sh 6
// Number of elements in one patch (patchSide x patchSide = 64).
#define side_2 (patchSide * patchSide)

// Number of patches per stack; apply_stack iterates over exactly N offsets.
#define N 16
// log2(N); used to scale a stack index into the offsets / stacks arrays.
#define NSHIFT 4

// PATCHSHIFT / PATCHSIZE are not referenced by the apply_stack kernel
// visible here. NOTE(review): they duplicate patchSideSh / patchSide in
// value -- confirm whether other code in the program still needs them.
#define PATCHSHIFT 3
#define PATCHSIZE (1<<PATCHSHIFT)

// Per-element weights for one patchSide x patchSide patch, stored row-major
// (8 rows of 8 values). apply_stack indexes it with (q << patchSideSh) + p.
// The table is symmetric both left-right and top-bottom, with the largest
// weights in the centre; presumably a sampled 2-D Kaiser window, as the name
// suggests -- the generating parameters are not visible here.
__constant float kaiser_window[side_2] =
{
    0.192400f, 0.298900f, 0.384600f, 0.432500f, 0.432500f, 0.384600f, 0.298900f, 0.192400f,
    0.298900f, 0.464200f, 0.597400f, 0.671700f, 0.671700f, 0.597400f, 0.464200f, 0.298900f,
    0.384600f, 0.597400f, 0.768800f, 0.864400f, 0.864400f, 0.768800f, 0.597400f, 0.384600f,
    0.432500f, 0.671700f, 0.864400f, 0.971800f, 0.971800f, 0.864400f, 0.671700f, 0.432500f,
    0.432500f, 0.671700f, 0.864400f, 0.971800f, 0.971800f, 0.864400f, 0.671700f, 0.432500f,
    0.384600f, 0.597400f, 0.768800f, 0.864400f, 0.864400f, 0.768800f, 0.597400f, 0.384600f,
    0.298900f, 0.464200f, 0.597400f, 0.671700f, 0.671700f, 0.597400f, 0.464200f, 0.298900f,
    0.192400f, 0.298900f, 0.384600f, 0.432500f, 0.432500f, 0.384600f, 0.298900f, 0.192400f
};

// Work-group width and height that apply_stack requires through its
// reqd_work_group_size attribute (8 x 8 work-items).
#define WGS_W 8
#define WGS_H 8

/**
 * Accumulates the N patches of one stack into the running
 * numerator/denominator image.
 *
 * One 8x8 work-group handles one cell of the index grid. It copies a 24x24
 * tile of numerator_denominator into local memory, adds every patch of the
 * cell's stack into that tile (one patch element per work-item per patch),
 * and writes the tile back.
 *
 * For each patch element the kernel adds w * value to .s0 and w to .s1,
 * where w is the kaiser_window entry for that element.
 *
 * @param step_i, step_j  Added to get_group_id(0/1) * 5 to select the index
 *                        grid cell. Work-groups are therefore 5 cells apart;
 *                        presumably the host sweeps step_i/step_j so that
 *                        concurrently running groups do not update
 *                        overlapping tiles -- confirm against the host code.
 * @param w_ind_size      Row stride (in cells) of the index grid.
 * @param h_ind_size      Not used by this kernel.
 * @param offsets         N unsigned entries per cell; each is decoded as
 *                        y * width + x. Entry 0 of a cell fixes the tile
 *                        position.
 * @param stacks          N * side_2 floats per cell: patch n occupies
 *                        [n * side_2, (n + 1) * side_2), row-major.
 * @param weights         Not used by this kernel (every stack gets weight 1).
 * @param width           Row stride of numerator_denominator, also used to
 *                        decode offsets.
 * @param height          Not used by this kernel.
 * @param numerator_denominator  Per-pixel accumulator, read and written.
 *
 * NOTE(review): global accesses are not range-checked. The tile starts 8
 * pixels above/left of the position in offsets[0] and spans 24x24 pixels;
 * this assumes the buffer is padded or that positions never come closer than
 * that to the image border -- confirm with the host code.
 */
__kernel __attribute__((reqd_work_group_size(WGS_W, WGS_H, 1)))
void apply_stack(
                 int                   step_i,
                 int                   step_j,
                 int                   w_ind_size,
                 int                   h_ind_size,
                 __global unsigned*    offsets,
                 __global float*       stacks,
                 __global float*       weights,
                 int                   width,
                 int                   height,
                 __global float2*      numerator_denominator)
{
    enum {
        TILE_SIDE     = 24, /* local tile edge: 8 work-items x 3 cells each  */
        TILE_PER_ITEM = 3,  /* tile cells copied per work-item per dimension */
        TILE_MARGIN   = 8   /* tile origin sits this far up/left of patch 0  */
    };

    const int ind_i = get_group_id(0) * 5 + step_i;
    const int ind_j = get_group_id(1) * 5 + step_j;

    const unsigned ind_offset = ind_j * w_ind_size + ind_i;
    __global unsigned* _offsets = &offsets[ind_offset<<NSHIFT];

    const int stackSize = side_2<<NSHIFT;
    __global float* stack = &stacks[ind_offset*stackSize];

    __local float2 numDenom[TILE_SIDE * TILE_SIDE];

    const int p = get_local_id(0);
    const int q = get_local_id(1);

    /* Top-left corner of the tile in image coordinates. */
    const int offsetOrg = _offsets[0];
    const int dst_i = (offsetOrg % width) - TILE_MARGIN;
    const int dst_j = (offsetOrg / width) - TILE_MARGIN;

    /* Stage the tile: each work-item copies its own 3x3 block. */
    for (int y = q * TILE_PER_ITEM; y < q * TILE_PER_ITEM + TILE_PER_ITEM; ++y) {
        for (int x = p * TILE_PER_ITEM; x < p * TILE_PER_ITEM + TILE_PER_ITEM; ++x) {

            const int offsetGlobal = (y + dst_j) * width + x + dst_i;
            const int offsetLocal = y * TILE_SIDE + x;

            numDenom[offsetLocal] = numerator_denominator[offsetGlobal];
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    /* Both values depend only on the work-item, not on the patch. */
    const int sideOffs = (q << patchSideSh) + p;
    const float w = kaiser_window[sideOffs];

    for (unsigned n = 0; n < N; n++)
    {
        const unsigned patchOffset = _offsets[n];

        const int px = patchOffset % width;
        const int py = patchOffset / width;

        /* Position of this work-item's patch element inside the tile. */
        const int col = px - dst_i + p;
        const int row = py - dst_j + q;

        /* Skip elements that fall outside the tile instead of writing past
         * (or wrapping inside) the local array. In-range patches behave
         * exactly as before. */
        if (col >= 0 && col < TILE_SIDE && row >= 0 && row < TILE_SIDE) {
            const int ind = row * TILE_SIDE + col;

            float2 s = numDenom[ind];
            s.s0 += w * stack[n*side_2 + sideOffs];
            s.s1 += w;
            numDenom[ind] = s;
        }

        /* Within one patch every work-item owns a distinct cell, but
         * consecutive patches overlap, so a cell is updated by different
         * work-items in different iterations. Local memory is only
         * guaranteed consistent across work-items at a barrier, so
         * synchronise after every patch. The barrier after the last patch
         * also makes all updates visible to the write-back below. It must
         * stay outside the branch above so every work-item reaches it. */
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    /* Write the tile back using the same 3x3-per-work-item layout. */
    for (int y = q * TILE_PER_ITEM; y < q * TILE_PER_ITEM + TILE_PER_ITEM; ++y) {
        for (int x = p * TILE_PER_ITEM; x < p * TILE_PER_ITEM + TILE_PER_ITEM; ++x) {

            const int offsetGlobal = (y + dst_j) * width + x + dst_i;
            const int offsetLocal = y * TILE_SIDE + x;

            numerator_denominator[offsetGlobal] = numDenom[offsetLocal];
        }
    }
}

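// Disabled alternative implementation of apply_stack, kept for reference:
// the tile is loaded/stored with a stride of 8 per work-item and local
// indices are clamped to [0, 23].
//#define patchSide 8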
//#define patchSideSh 3
//#define side_2_sh 6
//#define side_2 (patchSide * patchSide)
//
//#define N 16
//#define NSHIFT 4
//
//#define PATCHSHIFT 3
//#define PATCHSIZE (1<<PATCHSHIFT)
//
//__constant float kaiser_window[side_2] =
//{
//    0.192400f, 0.298900f, 0.384600f, 0.432500f, 0.432500f, 0.384600f, 0.298900f, 0.192400f,
//    0.298900f, 0.464200f, 0.597400f, 0.671700f, 0.671700f, 0.597400f, 0.464200f, 0.298900f,
//    0.384600f, 0.597400f, 0.768800f, 0.864400f, 0.864400f, 0.768800f, 0.597400f, 0.384600f,
//    0.432500f, 0.671700f, 0.864400f, 0.971800f, 0.971800f, 0.864400f, 0.671700f, 0.432500f,
//    0.432500f, 0.671700f, 0.864400f, 0.971800f, 0.971800f, 0.864400f, 0.671700f, 0.432500f,
//    0.384600f, 0.597400f, 0.768800f, 0.864400f, 0.864400f, 0.768800f, 0.597400f, 0.384600f,
//    0.298900f, 0.464200f, 0.597400f, 0.671700f, 0.671700f, 0.597400f, 0.464200f, 0.298900f,
//    0.192400f, 0.298900f, 0.384600f, 0.432500f, 0.432500f, 0.384600f, 0.298900f, 0.192400f
//};
//
//#define WGS_W 8
//#define WGS_H 8
//
//#define W_S 24
//
//__kernel __attribute__((reqd_work_group_size(WGS_W, WGS_H, 1)))
//void apply_stack(
//                 int                   step_i,
//                 int                   step_j,
//                 int                   w_ind_size,
//                 int                   h_ind_size,
//                 __global unsigned*    offsets,
//                 __global float*       stacks,
//                 __global float*       weights,
//                 int                   width,
//                 int                   height,
//                 __global float2*      numerator_denominator)
//{
//    const int ind_i = get_group_id(0) * 5 + step_i;
//    const int ind_j = get_group_id(1) * 5 + step_j;
//    
//    __local float2 numDenom[W_S * 24];
//    
//    const int p = get_local_id(0);
//    const int q = get_local_id(1);
//    
//    offsets += (ind_j * w_ind_size + ind_i) * 16;
//    stacks += (ind_j * w_ind_size + ind_i) * 16 * 64 + p;
//    const int offsetOrg = offsets[0];
//    const int dst_i = (offsetOrg % width) - 8;
//    const int dst_j = (offsetOrg / width) - 8;
//    
//    numerator_denominator += (q + dst_j) * width + p + dst_i;
//    __local float2 *numDenom_ptr = numDenom + q * W_S + p;
//    
//    for (int y = 0; y < 24; y += 8) {
//        for (int x = 0; x < 24; x += 8) {
//            numDenom_ptr[y * W_S + x] = numerator_denominator[y * width + x];
//        }
//    }
//    
//    barrier(CLK_LOCAL_MEM_FENCE);
//    
//    const float w = kaiser_window[q * 8 + p];
//
//    for (unsigned _n = 0; _n < 2; ++_n)
//    {
//        const int n = _n * 8 + q;
//        const unsigned patchOffset = offsets[n];
//        
//        const int px = patchOffset % width;
//        const int py = patchOffset / width;
//        
//        for (int y = 0; y < 8; ++y) {
//            
//            const int xx = max(0, min(23, (px - dst_i + p)));
//            const int yy = max(0, min(23, (py - dst_j + q)));
//            
//            const int ind = yy * W_S + xx;
//            numDenom[ind] += (float2)(w * stacks[n*side_2 + y * 8], w);
//        }
//    }
//    
////    for (unsigned n = 0; n < N; ++n)
////    {
////        const unsigned patchOffset = offsets[n];
////        
////        const int px = patchOffset % width;
////        const int py = patchOffset / width;
////        
////        const int ind = (py - dst_j + q) * W_S + px - dst_i + p;
////        
////        float2 s = numDenom[ind];
////        s.s0 += w * stacks[n*side_2];
////        s.s1 += w;
////        numDenom[ind] = s;
////    }
//    
//    for (int y = 0; y < 24; y += 8) {
//        for (int x = 0; x < 24; x += 8) {
//            numerator_denominator[y * width + x] = numDenom_ptr[y * W_S + x];
//        }
//    }
//}


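// Disabled alternative implementation of apply_stack, kept for reference:
// numerator and denominator live in separate local arrays and are updated
// with a compare-and-swap based float atomic add; work is also split along
// get_local_id(2).
//#define patchSide 8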
//#define patchSideSh 3
//#define side_2_sh 6
//#define side_2 (patchSide * patchSide)
//
//#define N 16
//#define NSHIFT 4
//
//#define PATCHSHIFT 3
//#define PATCHSIZE (1<<PATCHSHIFT)
//
//void atomic_add_local(volatile local float *source, const float operand) {
//    union {
//        unsigned int intVal;
//        float floatVal;
//    } newVal;
//    
//    union {
//        unsigned int intVal;
//        float floatVal;
//    } prevVal;
//    
//    do {
//        prevVal.floatVal = *source;
//        newVal.floatVal = prevVal.floatVal + operand;
//    } while (atomic_cmpxchg((volatile local unsigned int *)source, prevVal.intVal, newVal.intVal) != prevVal.intVal);
//}
//
//__constant float kaiser_window[side_2] =
//{
//    0.192400f, 0.298900f, 0.384600f, 0.432500f, 0.432500f, 0.384600f, 0.298900f, 0.192400f,
//    0.298900f, 0.464200f, 0.597400f, 0.671700f, 0.671700f, 0.597400f, 0.464200f, 0.298900f,
//    0.384600f, 0.597400f, 0.768800f, 0.864400f, 0.864400f, 0.768800f, 0.597400f, 0.384600f,
//    0.432500f, 0.671700f, 0.864400f, 0.971800f, 0.971800f, 0.864400f, 0.671700f, 0.432500f,
//    0.432500f, 0.671700f, 0.864400f, 0.971800f, 0.971800f, 0.864400f, 0.671700f, 0.432500f,
//    0.384600f, 0.597400f, 0.768800f, 0.864400f, 0.864400f, 0.768800f, 0.597400f, 0.384600f,
//    0.298900f, 0.464200f, 0.597400f, 0.671700f, 0.671700f, 0.597400f, 0.464200f, 0.298900f,
//    0.192400f, 0.298900f, 0.384600f, 0.432500f, 0.432500f, 0.384600f, 0.298900f, 0.192400f
//};
//
//#define WGS_W 8
//#define WGS_H 8
//
//#define W_S 24
//
//__kernel //__attribute__((reqd_work_group_size(WGS_W, WGS_H, 1)))
//void apply_stack(
//                 int                   step_i,
//                 int                   step_j,
//                 int                   w_ind_size,
//                 int                   h_ind_size,
//                 __global unsigned*    offsets,
//                 __global float*       stacks,
//                 __global float*       weights,
//                 int                   width,
//                 int                   height,
//                 __global float2*      numerator_denominator)
//{
//    const int ind_i = get_group_id(0) * 5 + step_i;
//    const int ind_j = get_group_id(1) * 5 + step_j;
//    
//    __local float local_numerator[W_S * 24];
//    __local float local_denominator[W_S * 24];
//    
//    const int p = get_local_id(0);
//    const int q = get_local_id(1);
//    
//    offsets += (ind_j * w_ind_size + ind_i) * 16;
//    stacks += (ind_j * w_ind_size + ind_i) * 16 * 64 + q * 8 + p;
//    
//    const int offsetOrg = offsets[0];
//    const int dst_i = (offsetOrg % width) - 8;
//    const int dst_j = (offsetOrg / width) - 8;
//    
//    numerator_denominator += (q + dst_j) * width + p + dst_i;
//    __local float *local_numerator_ptr = local_numerator + q * W_S + p;
//    __local float *local_denominator_ptr = local_denominator + q * W_S + p;
//    
//    if (0 == get_local_id(2)) {
//        for (int y = 0; y < 24; y += 8) {
//            for (int x = 0; x < 24; x += 8) {
//                float2 v = numerator_denominator[y * width + x];
//                local_numerator_ptr[y * W_S + x] = v.s0;
//                local_denominator_ptr[y * W_S + x] = v.s1;
//                //            numDenom_ptr[y * W_S + x] = numerator_denominator[y * width + x];
//            }
//        }
//    }
//    
//    barrier(CLK_LOCAL_MEM_FENCE);
//    
//    const float w = kaiser_window[q * 8 + p];
//    
//    for (unsigned n = 0; n < 8; ++n)
//    {
//        const int _n = n + get_local_id(2) * 8;
//        const unsigned patchOffset = offsets[n];
//        
//        const int px = patchOffset % width;
//        const int py = patchOffset / width;
//        
//        const int ind = (py - dst_j + q) * W_S + px - dst_i + p;
//        
//        //        local_numerator[ind] += w * stacks[n*side_2];
//        //        local_denominator[ind] += w;
//        
//        atomic_add_local(local_numerator + ind, w * stacks[n*side_2]);
//        atomic_add_local(local_denominator + ind, w);
//        
//        //        float2 s = numDenom[ind];
//        //        s.s0 += w * stacks[n*side_2];
//        //        s.s1 += w;
//        //        numDenom[ind] = s;
//    }
//    
//    barrier(CLK_LOCAL_MEM_FENCE);
//    
//    if (0 == get_local_id(2)) {
//        for (int y = 0; y < 24; y += 8) {
//            for (int x = 0; x < 24; x += 8) {
//                float2 v;
//                v.s0 = local_numerator_ptr[y * W_S + x];
//                v.s1 = local_denominator_ptr[y * W_S + x];
//                numerator_denominator[y * width + x] = v;// numDenom_ptr[y * W_S + x];
//            }
//        }
//    }
//}
