// Both macros are defined before <cuda.h> is included.
// NOTE(review): presumably they tell the verification header that the kernel
// is launched with a 1-D grid and a 1-D thread block -- confirm against the tool.
#define __1D_GRID
#define __1D_THREAD_BLOCK
#include <cuda.h>

// Number of elements in the kernel's shared-memory array (sdata[N]).
#define N 1024
// Assumed launch configuration: every block has exactly N/2 (= 512) threads.
__axiom(blockDim.x == N/2);

/*
    Parallel reduction kernels
*/

/*
    Parallel sum reduction using shared memory
    - takes log(n) steps for n input elements
    - uses n threads
    - only works for power-of-2 arrays
*/

//REVISIT: cannot refer to this extern (dynamically sized) array in invariants;
//         the kernel below declares a fixed-size sdata[N] instead.
//extern __shared__ int sdata[];

/*
    This version uses n/2 threads --
    it performs the first level of reduction when reading from global memory
*/

/*
    reduce3kernel: per-block integer sum reduction.

    Each block covers a window of 2*blockDim.x consecutive input elements
    starting at blockIdx.x*(blockDim.x*2). Every thread adds up to two of
    them (indices i and i + blockDim.x) into its shared-memory slot; the
    block then combines the slots pairwise with a halving stride, and
    thread 0 stores the block's total.

    Parameters:
      g_idata - input array; only indices below n are read.
      g_odata - output array; element blockIdx.x receives this block's sum.
      n       - number of valid input elements; indices >= n contribute 0.

    Preconditions stated in this file:
      - blockDim.x is a power of two (__requires below).
      - blockDim.x == N/2 (file-level __axiom).
*/
__global__ void reduce3kernel(int *g_idata, int *g_odata, unsigned int n)
{
    // Precondition: the block size must be a power of two.
    __requires(__is_pow2(blockDim.x));

    // perform first level of reduction,
    // reading from global memory, writing to shared memory
    __shared__ int sdata[N];
    unsigned int tid = threadIdx.x;
    // First of the two input indices handled by this thread.
    unsigned int i = blockIdx.x*(blockDim.x*2) + threadIdx.x;

    // Out-of-range elements are treated as 0 so every slot is initialised.
    sdata[tid] = (i < n) ? g_idata[i] : 0;
    // Second element lies blockDim.x further on; added only if it is in range.
    if (i + blockDim.x < n)
    {
        sdata[tid] += g_idata[i+blockDim.x];  
    }

    // All slots must be written before any thread reads another thread's slot.
    __syncthreads();

    // do reduction in shared mem
    // The stride s starts at blockDim.x/2 and is halved each iteration
    // until it reaches 0.
    for(unsigned int s=blockDim.x/2;
#ifndef INFERENCE
        // Loop invariants, compiled only when INFERENCE is not defined.
        // NOTE(review): presumably INFERENCE builds expect the tool to
        // discover these itself -- confirm.
        // s is unsigned, so this first invariant holds trivially.
        __invariant(s >= 0),
        // s is 0 or a power of two no larger than 512; the disjunction is
        // written with bitwise | rather than logical ||.
        __invariant(s ==    0 | s ==  1  | s ==   2 | s ==    4 | 
                    s ==    8 | s == 16  | s ==  32 | s ==   64 |
                    s ==  128 | s == 256 | s == 512),
        // NOTE(review): the __no_read/__no_write invariants presumably state
        // that no access to the array is outstanding at the loop head
        // (i.e. since the last barrier) -- confirm against the tool's docs.
        __invariant(__no_read(g_odata)),
        __invariant(__no_write(g_odata)),
        __invariant(__no_read(sdata)),
        __invariant(__no_write(sdata)),
#endif
              s>0; s>>=1) 
    {
          /* BUGINJECT: ADD_BARRIER, DOWN */
        // Only the lower s threads accumulate; each adds the slot s above it.
        if (tid < s) 
        {
#ifdef MUTATION
            // Compiled only when MUTATION is defined. This barrier sits
            // inside a branch that only threads with tid < s take, so not
            // every thread of the block reaches it.
            // NOTE(review): together with the BUGINJECT marker above, this
            // looks like a deliberately injected defect -- do not "fix" it.
            __syncthreads();
#endif
            sdata[tid] += sdata[tid + s];
        }
        // Block-wide barrier outside the branch: this iteration's writes must
        // complete before the next iteration reads them.
        __syncthreads();
    }

    // write result for this block to global mem 
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
