// Number of threads in a warp. Used below to derive a thread's lane index
// (threadIdx.x & (WARP_SIZE - 1)) and the per-warp scratch offsets.
#define WARP_SIZE 32
// Not referenced in the visible part of this file.
// NOTE(review): presumably the number of threads per block (CTA) -- confirm
// against the kernel launch code.
#define CTA_SIZE 256
// All-ones 32-bit pattern. Not referenced in the visible part of this file.
// NOTE(review): this name is also defined by <limits.h>/<climits>; confirm that
// no such header is included alongside this file.
#define UINT_MAX 0xffffffff

// In emulation mode we need __syncthreads() inside warp-synchronous code,
// but we don't in code running on the GPU, so we define the __SYNC macro below
// for use in the warp-scan portion of the radix sort (see CUDPP for information
// on the warp-scan algorithm).
//
// Note that the next line switches emulation mode on unconditionally, so in
// this file __SYNC always expands to a block-wide __syncthreads() barrier.
#define  __DEVICE_EMULATION__

// __SYNC carries its own trailing semicolon, which is why it is written
// without one at its use sites.
#ifdef __DEVICE_EMULATION__
#define __SYNC __syncthreads();
#else
#define __SYNC
#endif

// Shorthand for the unsigned integer type used for keys, masks and indices.
typedef unsigned int uint;

// -----------------------------------------------------------------------------------------------
// The floatFlip and floatUnflip functions below are based on code in the web article 
// "Radix Tricks" by Michael Herf (http://www.stereopsis.com/radix.html). They are used to convert
// floating point values into sortable unsigned integers (and back).
//
// Paraphrasing Michael: Binary single-precision floating point numbers have two features that 
// keep them from being directly sortable. First, the sign bit is set when the value is negative, 
// which means that all negative numbers are bigger than positive ones. Second, the values are 
// signed-magnitude, so "more negative" floating point numbers actually look bigger to a normal 
// bitwise comparison.
// 
// "To fix our floating point numbers, we define the following rules:
//
//   1. Always flip the sign bit.
//   2. If the sign bit was set, flip the other bits too.
//
// To get back, we flip the sign bit always, and if the sign bit was not set, we flip the other 
// bits too."
//
// This is a very inexpensive operation and it is only done on the first and last steps of the
// sort.
// -----------------------------------------------------------------------------------------------


// ================================================================================================
// Flip a float for sorting
//  Takes the raw bit pattern f of a single-precision value and inspects its SIGN bit (bit 31).
//  if it's 1 (negative float), it flips all bits
//  if it's 0 (positive float), it flips the sign bit only
//  The resulting unsigned integers compare in the same order as the original floats.
//
//  doFlip : when false the conversion is skipped and f is returned unchanged
//  f      : bit pattern to convert
//  returns: the sortable key (or f itself when doFlip is false)
// ================================================================================================
static __attribute__((always_inline))
__device__ uint floatFlip(bool doFlip, uint f)
{
    if (doFlip)
    {
        // (f >> 31) is 1 for a negative input and 0 otherwise. Negating it in unsigned
        // arithmetic turns that into an all-ones / all-zeros mask, so negative inputs get
        // every bit inverted while non-negative inputs only get the sign bit inverted.
        // Without the negation only bits 31 and 0 would be touched for negative inputs,
        // which neither orders them correctly nor round-trips through floatUnflip.
        uint mask = (0u - (f >> 31)) | 0x80000000;
        return f ^ mask;
    }
    else
        return f;
}


// ================================================================================================
// Convert a sortable key back into a float bit pattern (the "get back" step of the scheme
// described in the header comment above).
//  The decision is made on the top bit of the key f:
//  if it is 1, only the top bit is flipped back
//  if it is 0, every bit is flipped back
//
//  doFlip : when false the conversion is skipped and f is returned unchanged
//  f      : key to convert
//  returns: the restored bit pattern (or f itself when doFlip is false)
// ================================================================================================
static __attribute__((always_inline))
__device__ uint floatUnflip(bool doFlip, uint f)
{
    // Nothing to undo when flipping is disabled.
    if (!doFlip)
        return f;

    // (f >> 31) - 1 is 0 when the top bit is set and wraps around to 0xffffffff when it is
    // clear, which selects between "top bit only" and "all bits" once the top bit is OR-ed in.
    const uint lowerBits = (f >> 31) - 1;
    return f ^ (lowerBits | 0x80000000);
}

//----------------------------------------------------------------------------
// Prefix-sums the values of each warp in parallel ("warp-scan"), one value
// per thread.
//
// Every warp owns a window of 2 * WARP_SIZE (= 64) elements in sData: the
// lower half is zero-filled so that low lanes can read "below lane 0"
// without a branch, the upper half holds the running sums.
//
//   maxlevel : highest scan level (0..4) the non-emulation path executes;
//              the emulation path ignores it and always runs all five levels
//   val      : this thread's input value
//   sData    : scratch buffer with 2 elements per participating thread
//              (scan4 passes its shared-memory array)
//
// Returns the running sum minus val; when all five levels run this is the
// exclusive prefix sum of val over the lower-numbered lanes of the warp.
//----------------------------------------------------------------------------
static __attribute__((always_inline))
__device__ int scanwarp(uint maxlevel, int val, volatile uint* sData)
{
    // 2 * threadIdx.x - lane == 2 * WARP_SIZE * warpId + lane, i.e. this
    // thread's element in the zero-filled half of its warp's window.
    const uint lane = threadIdx.x & (WARP_SIZE - 1);
    const int zeroSlot = 2 * threadIdx.x - lane;
    const int slot = zeroSlot + WARP_SIZE;

    sData[zeroSlot] = 0;
    sData[slot] = val;                          __SYNC

#ifdef __DEVICE_EMULATION__
    // Offsets 1, 2, 4, 8, 16. The neighbour is read into a register and a
    // barrier is taken before the accumulate, so no thread updates its slot
    // while another thread is still reading it.
    for (int offset = 1; offset < WARP_SIZE; offset <<= 1)
    {
        uint below = sData[slot - offset];      __SYNC
        sData[slot] += below;                   __SYNC
    }
#else
    // Warp-synchronous variant: level k adds the element 2^k slots below,
    // and levels above maxlevel are skipped.
    for (uint level = 0; level <= 4; ++level)
    {
        if (level <= maxlevel) { sData[slot] += sData[slot - (1 << level)]; } __SYNC
    }
#endif

    // The slot now holds the inclusive sum; remove our own contribution.
    return sData[slot] - val;
}

//----------------------------------------------------------------------------
// scan4 computes an exclusive prefix sum over the values held by a thread
// block, 4 per thread (idata.x, .y, .z, .w in that order), using the
// warp-scan algorithm:
//   1. each thread sums its own 4 values,
//   2. scanwarp turns the per-thread totals into per-warp exclusive sums,
//   3. the last lane of every warp publishes its warp's total,
//   4. the warp totals are scanned in turn,
//   5. each thread adds its warp's offset and rebuilds its 4 outputs.
//
// Requirements visible from the code:
//   - the dynamic shared-memory array must hold at least 2 uints per thread
//     (scanwarp writes indices up to 2 * blockDim.x - 1);
//   - the last read of the shared array is not followed by a barrier, so a
//     caller must synchronize before the array is written again.
// NOTE(review): the code appears to assume blockDim.x is a multiple of
// WARP_SIZE and that there are few enough warps for one warp to scan their
// totals -- confirm against the launch configuration.
//
// Returns, per component, the sum of all elements that precede it in the
// block-wide ordering.
//----------------------------------------------------------------------------
static __attribute__((always_inline))
__device__ uint4 scan4(uint4 idata)
{    
    extern  __shared__  uint ptr[1];
    
    uint idx = threadIdx.x;

    // Running sums of this thread's own elements; sum[k] is the inclusive
    // sum of the first k + 1 components.
    uint4 val4 = idata;
    uint sum[3];
    sum[0] = val4.x;
    sum[1] = val4.y + sum[0];
    sum[2] = val4.z + sum[1];
    
    // Total of all four components.
    uint val = val4.w + sum[2];
    
    // Exclusive scan of the per-thread totals within each warp.
    val = scanwarp(4, val, ptr);
    __syncthreads();

    // The last lane of each warp stores the warp's inclusive total
    // (its exclusive prefix plus its own four elements).
    if ((idx & (WARP_SIZE - 1)) == WARP_SIZE - 1)
    {
        ptr[idx >> 5] = val + val4.w + sum[2];
    }
    __syncthreads();

#ifdef __DEVICE_EMULATION__
    {
        // Every thread takes part so that the barriers inside scanwarp are
        // reached uniformly. The input is fetched before anyone starts
        // overwriting the scratch area, and the result is held in a register
        // until all threads have finished reading their scanwarp slots:
        // thread t of the first warp reads ptr[WARP_SIZE + t] on return, which
        // is exactly the element thread WARP_SIZE + t would otherwise
        // overwrite with its own (unused) result.
        uint warpTotal = ptr[idx];
        __syncthreads();
        uint warpOffset = scanwarp(2, warpTotal, ptr);
        __syncthreads();

        // Only the first warp's results are consumed below.
        if (idx < WARP_SIZE)
        {
            ptr[idx] = warpOffset;
        }
    }
#else
    if (idx < WARP_SIZE)
    {
        ptr[idx] = scanwarp(2, ptr[idx], ptr);
    }
#endif
    __syncthreads();

    // Add the offset of this thread's warp.
    val += ptr[idx >> 5];

    // Rebuild the four exclusive prefix sums from the thread's base value.
    val4.x = val;
    val4.y = val + sum[0];
    val4.z = val + sum[1];
    val4.w = val + sum[2];

    return val4;
}

//----------------------------------------------------------------------------
//
// rank4 is the core of the radix sort loop.  Each thread supplies four 0/1
// predicates; the function returns, for each of them, the output position
// in an ordering where all elements with a true predicate come first,
// followed by all elements with a false predicate. Relative order within
// each group is preserved.
//
//   ctasize : index of the publishing thread plus one; thread ctasize - 1
//             stores the block-wide count of true predicates.
//             NOTE(review): presumably equal to blockDim.x -- confirm at the
//             call site.
//   preds   : four predicates, each required to be 0 or 1
//
// A true element is placed at the number of true elements before it; a
// false element is placed after all true elements, at
// numtrue + (its index in the block) - (true elements before it).
//
//----------------------------------------------------------------------------
static __attribute__((always_inline))
__device__ uint4 rank4(int ctasize, uint4 preds)
{
    __requires(preds.x == 0 | preds.x == 1);
    __requires(preds.y == 0 | preds.y == 1);
    __requires(preds.z == 0 | preds.z == 1);
    __requires(preds.w == 0 | preds.w == 1);

    // Number of true predicates that precede each of this thread's elements.
    const uint4 trueBefore = scan4(preds);

    // Thread ctasize - 1 publishes the total number of true predicates:
    // those before its last element plus that element's own predicate.
    __shared__ uint numtrue;
    if (threadIdx.x == ctasize-1)
    {
        numtrue = trueBefore.w + preds.w;
    }
    __syncthreads();

    const uint total = numtrue;
    const uint first = threadIdx.x << 2;   // block-wide index of this thread's .x element

    uint4 rank;

    if (preds.x)
        rank.x = trueBefore.x;
    else
        rank.x = total + first     - trueBefore.x;

    if (preds.y)
        rank.y = trueBefore.y;
    else
        rank.y = total + first + 1 - trueBefore.y;

    if (preds.z)
        rank.z = trueBefore.z;
    else
        rank.z = total + first + 2 - trueBefore.z;

    if (preds.w)
        rank.w = trueBefore.w;
    else
        rank.w = total + first + 3 - trueBefore.w;

    return rank;
}
