#define __1D_GRID
#define __1D_THREAD_BLOCK
#include <cuda.h>
#include "../common.h"

__axiom(blockDim.x == 256);

// ================================================================================================
// Kernel to flip all floats in an array (see floatFlip; it is not defined in this
// file and is presumably declared in ../common.h).
// Each thread flips four values (each 256-thread CTA flips 1024 values).
// ================================================================================================
// Applies floatFlip(true, ...) in place to up to four elements of `values` per thread.
//
// Parameters:
//   values    - array of 32-bit words, read and overwritten in place.
//   numValues - element count used as the upper bound for every guarded access.
//
// Launch layout: 1D indexing only (blockIdx.x / threadIdx.x / blockDim.x). Each block
// covers a contiguous tile of 4 * blockDim.x elements; a thread touches the elements at
// offsets threadIdx.x + k * blockDim.x (k = 0..3) inside its block's tile. The file-level
// __axiom pins blockDim.x to 256, i.e. 1024 elements per block.
//
// NOTE(review): the semantics of floatFlip and the meaning of its `true` argument are
// not visible in this file - confirm against its declaration.
__global__ void flipFloats(uint *values, uint numValues)
{
    // First element for this thread: start of the block's tile plus the thread's lane.
    // NOTE(review): computed in uint, so it wraps for very large grids - confirm that
    // callers keep 4 * blockDim.x * gridDim.x within 32 bits.
    uint index = blockDim.x*4 * blockIdx.x + threadIdx.x; 
    // Step 1 of 4: the bounds check handles a partially filled last tile.
    if (index < numValues) values[index] = floatFlip(true, values[index]);
    // Advance by one block width so threads of a block keep touching adjacent elements.
    index += blockDim.x;
#ifdef MUTATION
    // Only compiled when MUTATION is defined. Unlike every other access in this kernel,
    // this read/write of values[index+1] is NOT guarded by `index < numValues`, and
    // index+1 is the element another thread of the same block addresses through its own
    // `index` (the next thread in step 2; for the last thread, thread 0 in step 3).
    // NOTE(review): the BUGINJECT tag suggests this is a deliberately injected fault for
    // exercising analysis tooling - confirm before "fixing" or moving it.
     /* BUGINJECT: ADD_ACCESS, DOWN */
    values[index+1] = values[index+1];
#endif
    // Step 2 of 4.
    if (index < numValues) values[index] = floatFlip(true, values[index]);
    index += blockDim.x;
    // Step 3 of 4.
    if (index < numValues) values[index] = floatFlip(true, values[index]);
    index += blockDim.x;
    // Step 4 of 4.
    if (index < numValues) values[index] = floatFlip(true, values[index]);
}
