
#ifndef _SCAN_NAIVE_KERNEL_H_
#define _SCAN_NAIVE_KERNEL_H_

#include "my_cutil.h"

////////////////////////////////////////////////////////////////////////////////
//    Notes for running in PUG:
//   Bitvector size: 16 bits
////////////////////////////////////////////////////////////////////////////////

///////////////////////////////////////////////////////////////////////////////
//! Naive exclusive scan (prefix sum): every thread handles one element.
//!
//! While the input is staged into shared memory it is shifted right by one
//! position (slot 0 receives 0). It is then combined over doubling offsets
//! 1, 2, 4, ... < n. Each pass reads one half of the shared array and writes
//! the other half, so the array is addressed as two halves of n floats each.
//!
//! Pro: Simple
//! Con: Not work efficient -- log(n) steps, but n * (log(n) - 1) adds -- and
//!      not storage efficient either, since the two halves cost 2 * n floats
//!      of shared memory.
//!
//! Indexing uses threadIdx.x only; blockIdx is never consulted.
//!
//! @param g_odata  output data in global memory
//! @param g_idata  input data in global memory
//! @param n        input number of elements to scan from input data
///////////////////////////////////////////////////////////////////////////////
__global__ void kernel(float *g_odata, float *g_idata, int n)
{
  // Precondition handed to assume(), which is not defined in this file.
  assume(n >= blockDim.x);       // +C

    // Dynamically sized shared scratch space, used as two halves of n floats
    extern  __shared__  float temp[];

    const int tid = threadIdx.x;

    // Half written / half read in the current pass; they swap on every pass
    int wr = 0;
    int rd = 1;

    // Stage the input shifted right by one element; slot 0 is seeded with 0
    if (tid > 0)
        temp[wr*n + tid] = g_idata[tid - 1];
    else
        temp[wr*n + tid] = 0;

    for (int offset = 1; offset < n; offset *= 2)
    {
        // Exchange the roles of the two halves
        wr = 1 - wr;
        rd = 1 - wr;

        // NOTE(review): wr and rd are complementary 0/1 values, so this
        // condition is only satisfied on passes where wr is 1. What assume()
        // does with it is decided outside this file -- confirm before editing.
        assume(wr == rd + 1);  // +R

        // Values written by the previous pass (or the staging step) must be
        // visible to the whole block before any thread reads them
        __syncthreads();

        // Own value from the read half, plus the neighbour `offset` slots
        // to the left when such a neighbour exists
        float sum = temp[rd*n + tid];

        if (tid >= offset)
            sum += temp[rd*n + tid - offset];

        temp[wr*n + tid] = sum;
    }

#ifndef MUTATION
    __syncthreads();
#endif

    // Each thread copies out the slot it wrote most recently
    g_odata[tid] = temp[wr*n + tid];
}

#endif // #ifndef _SCAN_NAIVE_KERNEL_H_
