// Launch-shape markers, defined before <cuda.h> is included.
// NOTE(review): presumably consumed by the verification tool's headers to
// declare a 1-D grid and a 1-D thread block -- confirm against the tool docs.
#define __1D_GRID
#define __1D_THREAD_BLOCK
#include <cuda.h>

// Number of elements handled by BitonicKernel: it sizes the kernel's shared
// array and bounds the outer sort loop.
#define NUM 32
// Assumption handed to the verifier: each block is launched with exactly NUM
// threads, so threadIdx.x always indexes inside the NUM-element shared array.
// NOTE(review): __axiom is not defined in this file; presumably a verifier
// annotation rather than a runtime check -- confirm.
__axiom(blockDim.x == NUM);

// Sorts the first NUM ints of `values` in place using a bitonic sorting
// network executed in shared memory.
//
// Preconditions visible in this file:
//   * blockDim.x == NUM (stated by the file-level __axiom); each thread owns
//     exactly one element, selected by threadIdx.x.
//   * NUM is a power of two (k and j walk powers of two up to NUM).
//   * Only threadIdx.x is used for indexing, so every block operates on
//     values[0 .. NUM-1].
//   * `shared` is declared `extern __shared__`.
//     NOTE(review): presumably the launcher must therefore provide at least
//     NUM * sizeof(int) bytes of dynamic shared memory -- confirm at the
//     launch site.
//
// values: array of at least NUM ints; read at the start and overwritten with
//         the result at the end. In the final stage (k == NUM) the test
//         (tid & k) == 0 holds for every thread, so the result is ascending.
//
// The __invariant lines are loop invariants for the verification tool and the
// MUTATION block is a deliberately injected bug used by the test harness;
// both are kept exactly as they were.
__global__ void BitonicKernel(int * values)
{
  extern __shared__ int shared[NUM];

  unsigned int tid = threadIdx.x;

  // Copy input to shared mem.
  shared[tid] = values[tid];

  // With MUTATION defined, the barrier below is wrapped in a thread-0-only
  // branch, i.e. it is placed in non-uniform control flow on purpose (see the
  // BUGINJECT marker). Without MUTATION every thread reaches the barrier.
#ifdef MUTATION
  if (threadIdx.x == 0) {
#endif
  __syncthreads();
#ifdef MUTATION
   /* BUGINJECT: NON_UNIFORM_CONTROL_FLOW, UP */
  }
#endif

  // Parallel bitonic sort.
  // k is the size of the bitonic sequences being merged in this stage.
  for (unsigned int k = 2; k <= NUM; k *= 2)
  {
#ifndef INFERENCE
    // k doubles from 2; 64 is the value it holds when the loop exits (NUM == 32).
    __invariant(k == 2 || k == 4 || k == 8 || k == 16 || k == 32 || k == 64);
    // NOTE(review): presumably these state that no access to `shared` or
    // `values` is outstanding at the loop head (the previous iteration ended
    // with a barrier) -- confirm against the verifier's annotation semantics.
    __invariant(__no_read("BitonicKernel::shared"));
    __invariant(__no_write("BitonicKernel::shared"));
    __invariant(__no_read("BitonicKernel::values"));
    __invariant(__no_write("BitonicKernel::values"));
#endif
    // Bitonic merge:
    // j is the distance between the two elements of each compare-exchange pair.
    for (unsigned int j = k / 2; j>0; j /= 2)
    {
#ifndef INFERENCE
      // j halves from k/2 (at most 16); 0 is the value it holds at loop exit.
      __invariant(j == 0 || j == 1 || j == 2 || j == 4 || j == 8 || j == 16);
      __invariant(__no_read("BitonicKernel::shared"));
      __invariant(__no_write("BitonicKernel::shared"));
      __invariant(__no_read("BitonicKernel::values"));
      __invariant(__no_write("BitonicKernel::values"));
#endif
      // Partner index: tid with bit j flipped.
      unsigned int ixj = tid ^ j;

      // Only the lower-indexed thread of each pair does the compare-exchange,
      // so each pair of slots is touched by exactly one thread per step.
      if (ixj > tid)
      {
        if ((tid & k) == 0)
        {
          // Ascending half: keep the smaller value at the lower index.
          if (shared[tid] > shared[ixj])
          {
            // Swap through tmp. Writing shared[tid] back into shared[ixj]
            // after it has been overwritten would duplicate one value and
            // lose the other.
            int tmp = shared[tid];
            shared[tid] = shared[ixj];
            shared[ixj] = tmp;
          }
        }
        else
        {
          // Descending half: keep the larger value at the lower index.
          if (shared[tid] < shared[ixj])
          {
            int tmp = shared[tid];
            shared[tid] = shared[ixj];
            shared[ixj] = tmp;
          }
        }
      }

      // All compare-exchanges of this step must be visible before the next
      // step pairs the elements differently.
      __syncthreads();
    }
  }

  // Write result.
  values[tid] = shared[tid];
}
