#include "my_cutil.h"

//////////////////////////////////////////////////////////////////////////////
//// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
//// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
//// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
//// PARTICULAR PURPOSE.
////
//// Copyright (c) Microsoft Corporation. All rights reserved
//////////////////////////////////////////////////////////////////////////////

//----------------------------------------------------------------------------
// File: Convolution.cpp
// 
// Implement C++ AMP based simple and tiled version of Convolution filter used in 
// image processing.
//----------------------------------------------------------------------------

// Image dimensions, in elements, that the kernel below is hard-wired for.
#define DEFAULT_WIDTH   512
#define DEFAULT_HEIGHT  512
// Threads per tile along the convolved dimension; also the length of the
// kernel's __shared__ staging buffer. TILE_SIZE should evenly divide both
// DEFAULT_WIDTH and DEFAULT_HEIGHT.
#define TILE_SIZE		128

// Lowercase aliases used inside the kernel: `width` is the row stride of the
// flattened image, `height` is the extent of the convolved dimension.
// NOTE: these are object-like macros, so every later `width` / `height` token
// in this translation unit is textually replaced.
#define width DEFAULT_WIDTH
#define height DEFAULT_HEIGHT

// Clamps `a` to the closed range [b, c]. Being a function-like macro, it may
// evaluate its arguments more than once: pass side-effect-free expressions.
#define clamp(a, b, c) ((a) < (b) ? (b) : ((a) > (c) ? (c) : (a)))

// Component name pasted after `threadIdx.` / `blockDim.` in the kernel to
// select the axis the filter runs along.
#define dim_to_convolve y

// Filter half-width: the kernel applies taps k = -radius .. +radius, i.e.
// 2*radius + 1 filter coefficients.
#define radius 7

//----------------------------------------------------------------------------
// Tiled implementation of the convolution filter along dim_to_convolve (y).
//
// Expected launch layout (stated by the assume() calls below):
//   blockDim = (1, TILE_SIZE)
//   gridDim  = (DEFAULT_WIDTH, ceil(DEFAULT_HEIGHT / (TILE_SIZE - 2*radius)))
// Shared memory: TILE_SIZE floats, statically allocated.
//
// Each block stages TILE_SIZE consecutive samples of one image column
// (clamped at the image border) into shared memory. The first and last
// `radius` threads of a block only load halo samples; the remaining
// TILE_SIZE - 2*radius threads each produce one output element.
//
// Parameters:
//   img    - input image, row-major, indexed as img[row*width + col]
//   filter - filter coefficients, read at indices 0 .. 2*radius
//   result - output image, same indexing as img
//----------------------------------------------------------------------------
__global__ void convolution_tiling_kernel(const float* img, const float* filter, float* result)
{
    assume(blockDim.x == 1);
    assume(blockDim.y == TILE_SIZE);
    assume(blockDim.x*gridDim.x == DEFAULT_WIDTH);
    assume(blockDim.y*gridDim.y == (((DEFAULT_HEIGHT - 1) / (TILE_SIZE - 2*radius)) + 1)*TILE_SIZE);

    __shared__ float local_buf[TILE_SIZE];

    // Position of this thread along the convolved dimension. Consecutive
    // tiles advance by TILE_SIZE - 2*radius so that neighbouring tiles overlap
    // by the halo; the tile number is the *block index* (using blockDim here
    // would place every block far outside the image and make the kernel a
    // no-op). Cast to int first: the value is negative for the leading halo.
    int idx_convolve = (int)(blockIdx.dim_to_convolve)*(TILE_SIZE - 2 * radius) + (int)(threadIdx.dim_to_convolve) - radius;
    int max_idx_convolve = height;
    float sum = 0.0f;

    // Global column handled by this thread, and its row clamped to the image
    // so that halo loads past either border replicate the edge sample.
    int a_idxX = (int)(blockIdx.x*blockDim.x + threadIdx.x);
    int a_idxY = clamp(idx_convolve, 0, max_idx_convolve-1);

    // Threads further than `radius` past the end of the image are never read
    // by any producing thread, so they skip the load.
    if (idx_convolve < (max_idx_convolve + radius))
    {
        local_buf[threadIdx.dim_to_convolve] = img[a_idxY*width + a_idxX];
    }

    // All loads into local_buf must be visible before any thread reads its
    // neighbours' samples. The barrier is outside any divergent branch.
#ifndef MUTATION
     /* BUGINJECT: REMOVE_BARRIER, DOWN */
    __syncthreads();
#endif

    // Only interior (non-halo) threads that map to a real image row produce
    // an output element.
    if ((int)(threadIdx.dim_to_convolve) >= radius && (int)(threadIdx.dim_to_convolve) < (TILE_SIZE - radius) && idx_convolve < max_idx_convolve)
    {
        for (int k = -radius; k <= radius; k++)
        {
            int k_idx = k + radius;
            sum += local_buf[threadIdx.dim_to_convolve + k]*filter[k_idx];
        }
        result[a_idxY*width + a_idxX] = sum;
    }
}
