// Configuration macros that must be defined before <cuda.h> is included.
// NOTE(review): presumably they tell the verification front-end header that
// the kernel is launched with a 1-D grid of 1-D thread blocks -- confirm
// against that header.
#define __1D_GRID
#define __1D_THREAD_BLOCK
#include <cuda.h>

// Thread-block width assumed throughout this file. The kernel also uses it to
// size its shared ping-pong buffer (2 * N floats).
#define N 32
// Launch configuration stated as file-level assumptions: exactly one block of
// N threads.
// NOTE(review): __axiom is presumably a verifier-only annotation supplied by
// the included header, not an NVIDIA runtime facility -- confirm.
__axiom(blockDim.x == N);
__axiom(gridDim.x == 1);

///////////////////////////////////////////////////////////////////////////////
//! Naive compute implementation of scan, one thread per element
//! Not work efficient: log(n) steps, but n * (log(n) - 1) adds.
//! Not shared storage efficient either -- this requires ping-ponging
//! arrays in shared memory due to hazards so 2 * n storage space.
//!
//! Pro: Simple
//! Con: Not work efficient
//!
//! The result is an exclusive prefix sum: element 0 of the output is 0 and
//! element i is the sum of input elements 0 .. i-1.
//!
//! Launch layout: a single 1-D block of N threads (see the file-level axioms).
//! Shared memory: a statically sized buffer of 2 * N floats, used as two
//! halves of n floats each.
//!
//! Preconditions: n == blockDim.x by default; with AWKWARD_INPUT_SIZE defined,
//! n <= blockDim.x. In the latter case only the first n threads touch global
//! or shared memory; the remaining threads merely take part in the barriers.
//!
//! @param g_odata  output data in global memory (n elements are written)
//! @param g_idata  input data in global memory (elements 0 .. n-2 are read)
//! @param n        input number of elements to scan from input data
///////////////////////////////////////////////////////////////////////////////
__global__ void kernel(float *g_odata, float *g_idata, int n)
{
#ifndef AWKWARD_INPUT_SIZE
    __requires(n == blockDim.x); //< n is a pow2 and equal to blockDim.x
#else
    __requires(n <= blockDim.x); //< n is probably not a pow2 and less than blockDim.x
#endif

    // REVISIT: removed extern
    // REVISIT: give temp static size
    // Originally dynamically allocated shared memory for scan kernels; here it
    // is a fixed-size ping-pong buffer: half 0 is temp[0 .. n-1], half 1 is
    // temp[n .. 2n-1].
    /*extern*/__shared__  float temp[N*2];

    int thid = threadIdx.x;

    // Threads at or beyond n own no element. Without this guard (possible only
    // when n < blockDim.x) they would read/write global memory outside the n
    // elements and their slot pout*n + thid would alias the other half of the
    // buffer, racing with the thread that owns that slot.
    const bool active = thid < n;

    // pout selects the half written in the current step, pin the half read.
    int pout = 0;
    int pin = 1;

    // Cache the computational window in shared memory, shifted right by one
    // element so that the scan is exclusive.
    if (active) {
        temp[pout*n + thid] = (thid > 0) ? g_idata[thid-1] : 0.0f;
    }

    for (int offset = 1;
#ifndef INFERENCE
        __invariant(offset >= 1),
        __invariant(__is_pow2(offset)),
#endif
#ifdef AWKWARD_INPUT_SIZE //< if n is not equal to blockDim.x
        // pout and pin are inverse of each other
        __invariant((pout == 0 & pin == 1)|
                    (pout == 1 & pin == 0)),
        // only write into one half of the buffer each time
        __invariant(__write_implies(temp,
          (pout == 0 & __write_offset(temp) == thid) |
          (pout == 1 & __write_offset(temp) - n == thid))),
        // only read from other half of the buffer (this invariant isn't strong enough)
        __invariant(__read_implies(temp,
          (pout == 0 & __read_offset(temp) - n          == thid) |
          (pout == 0 & __read_offset(temp) + offset - n == thid) |
          (pout == 1 & __read_offset(temp)              == thid) |
          (pout == 1 & __read_offset(temp) + offset     == thid)
        )),
        // neither is this one
        __invariant(__read_implies(temp,
          (pout == 0 & n <= __read_offset(temp) & __read_offset(temp) < 2*n) |
          (pout == 1 & 0 <= __read_offset(temp) & __read_offset(temp) <   n))),
#endif
         offset < n; offset *= 2)
    {

        // Swap the roles of the two halves for this step.
        pout = 1 - pout;
        pin  = 1 - pout;

        // The barrier stays outside the `active` guard: the loop bounds are
        // uniform across the block, so every thread reaches it. It separates
        // the previous step's writes from this step's reads of the same half.
#ifndef MUTATION
        /* BUGINJECT: REMOVE_BARRIER, DOWN */
        __syncthreads();
#endif
        if (active) {
            // Carry the running value over into the half being written ...
            temp[pout*n+thid] = temp[pin*n+thid];

            // ... and add the partial sum located `offset` elements to the left.
            if (thid >= offset) {
                 temp[pout*n+thid] += temp[pin*n+thid - offset];
            }
        }
    }

    __syncthreads();

    // Each participating thread publishes the value it last wrote.
    if (active) {
        g_odata[thid] = temp[pout*n+thid];
    }
}
