



// -----------------------------------------------------------------------
// Fast CUDA Radix Sort Implementation
//
// The parallel radix sort algorithm implemented by this code is described
// in the following paper.
//
// Satish, N., Harris, M., and Garland, M. "Designing Efficient Sorting 
// Algorithms for Manycore GPUs". In Proceedings of IEEE International
// Parallel & Distributed Processing Symposium 2009 (IPDPS 2009).
//
// -----------------------------------------------------------------------

// #include "radixsort.h"

#include "my_cutil.h"

// Compile-time configuration constants used by the sort.
// NOTE(review): CUDA hardware warps are 32 threads wide, so WARP_SIZE 16 and
// CTA_SIZE 8 look like a deliberately reduced configuration -- confirm against
// the code that consumes them before changing either value.
#define WARP_SIZE 16
#define CTA_SIZE 8

// Largest value representable in a 32-bit unsigned int.  <climits>/<limits.h>
// define a macro with the same name, so only provide it when it is not already
// available; this avoids an incompatible macro redefinition.
#ifndef UINT_MAX
#define UINT_MAX 0xffffffff
#endif

// Identifier for each kernel of the sort.  The values are consecutive starting
// at zero so they can be used as array indices (numCTAs is sized with
// SORT_KERNEL_COUNT); SORT_KERNEL_COUNT is the number of identifiers and has
// to remain the last enumerator.
enum kernelName
{
    SORT_KERNEL_EMPTY                      = 0,
    SORT_KERNEL_RADIX_SORT_BLOCKS          = 1,
    SORT_KERNEL_RADIX_SORT_BLOCKS_KEYSONLY = 2,
    SORT_KERNEL_FIND_RADIX_OFFSETS         = 3,
    SORT_KERNEL_REORDER_DATA               = 4,
    SORT_KERNEL_REORDER_DATA_KEYSONLY      = 5,
    SORT_KERNEL_COUNT                      = 6
};

// File-scope (host) configuration state for the sort.  Everything starts out
// false/zero here; the code that assigns real values is not in this section.
// NOTE(review): the meanings suggested by the names (SM count, per-kernel CTA
// counts, persistent-CTA thresholds) should be confirmed against that code.
bool bManualCoalesce = false;

// One slot per kernelName value; all slots start at zero.
unsigned int numCTAs[SORT_KERNEL_COUNT] = { 0 };

unsigned int numSMs = 0;

// Two-element tables, both zero-initialised.
unsigned int persistentCTAThreshold[2] = { 0 };
unsigned int persistentCTAThresholdFullBlocks[2] = { 0 };

// Two-argument minimum / maximum.
// Every use of an argument is parenthesised so that operands containing
// lower-precedence operators (|, &, ?:, ...) are compared and returned as a
// whole, e.g. MIN(x | y, z).
// NOTE: an argument is evaluated twice (once in the comparison, once as the
// result), so do not pass expressions with side effects such as i++.
#define MIN(a,b) (((a) < (b)) ? (a) : (b))
#define MAX(a,b) (((a) > (b)) ? (a) : (b))


// In emulation mode, we need __syncthreads() inside warp-synchronous code,
// but we don't in code running on the GPU, so we define this macro to use
// in the warp-scan portion of the radix sort (see CUDPP for information
// on the warp scan algorithm).
//
// NOTE(review): __DEVICE_EMULATION__ is defined unconditionally on the next
// line, so the #ifdef below always takes its first branch and __SYNC always
// expands to a __syncthreads() barrier; the empty #else definition is never
// selected in this file as written.  Confirm that forcing the barrier is
// intentional.
#define  __DEVICE_EMULATION__

#ifdef __DEVICE_EMULATION__
// Expands to a complete statement: the trailing ';' is part of the macro, so
// a use of __SYNC needs no semicolon of its own.
#define __SYNC __syncthreads();
#else
// Expands to nothing.
#define __SYNC
#endif

// Shorthand for unsigned int; the float flip/unflip code below uses it for the
// raw bit patterns of the values being sorted.
typedef unsigned int uint;


// -----------------------------------------------------------------------------------------------
// The floatFlip and floatUnflip functions below are based on code in the web article 
// "Radix Tricks" by Michael Herf (http://www.stereopsis.com/radix.html). They are used to convert
// floating point values into sortable unsigned integers (and back).
//
// Paraphrasing Michael: Binary single-precision floating point numbers have two features that 
// keep them from being directly sortable. First, the sign bit is set when the value is negative, 
// which means that all negative numbers are bigger than positive ones. Second, the values are 
// signed-magnitude, so "more negative" floating point numbers actually look bigger to a normal 
// bitwise comparison.
// 
// "To fix our floating point numbers, we define the following rules:
//
//   1. Always flip the sign bit.
//   2. If the sign bit was set, flip the other bits too.
//
// To get back, we flip the sign bit always, and if the sign bit was not set, we flip the other 
// bits too."
//
// This is a very inexpensive operation and it is only done on the first and last steps of the
// sort.
// -----------------------------------------------------------------------------------------------


// ================================================================================================
// Convert a sortable unsigned integer back into the original float bit pattern
// (the inverse of the float "flip" described in the comment above).
//
//  doFlip : when false, f is returned untouched.
//  f      : flipped bit pattern.
//
//  When doFlip is true the top bit of f decides what is undone:
//    top bit set   -> only the sign bit is toggled
//    top bit clear -> all 32 bits are toggled
// ================================================================================================

__device__ uint floatUnflip(bool doFlip, uint f)
{
    if (!doFlip)
        return f;

    // Equivalent to ((f >> 31) - 1) | 0x80000000:
    //   top bit set   -> (1 - 1) | 0x80000000 == 0x80000000
    //   top bit clear -> (0 - 1) | 0x80000000 == 0xffffffff
    const uint topBitSet = f & 0x80000000u;
    const uint toggle    = topBitSet ? 0x80000000u : 0xffffffffu;

    return f ^ toggle;
}

// ================================================================================================
// Kernel to unflip all floats in an array in place (see floatUnflip, above).
//
//  values    : array of flipped values; each element below numValues is overwritten with
//              floatUnflip(true, element).
//  numValues : number of elements in values.
//
// Expects a 1-D grid of 1-D blocks.  Each thread unflips up to four values, so block b covers
// elements [4*blockDim.x*b, 4*blockDim.x*(b+1)) (e.g. a 256-thread CTA covers 1024 values).
// Within one pass consecutive threads touch consecutive elements; passes are blockDim.x apart.
// Every access is guarded by the bounds check, so numValues does not have to be a multiple of
// 4*blockDim.x and no thread touches an element owned by another thread.
// ================================================================================================
__global__ void unflipFloats_kernel(uint *values, uint numValues)
{
    // Index arithmetic is done in 64 bits: in 32 bits the last block's indices could wrap around
    // for arrays close to 2^32 elements and wrongly pass the bounds check.
    const unsigned long long first =
        (unsigned long long)blockDim.x * 4ull * blockIdx.x + threadIdx.x;

    #pragma unroll
    for (unsigned int pass = 0; pass < 4; ++pass)
    {
        const unsigned long long index = first + (unsigned long long)pass * blockDim.x;

        if (index < numValues)
            values[index] = floatUnflip(true, values[index]);
    }
}
