//pass
//--gridDim=[64,64] --blockDim=[16,16]

#include "common.h"

// Naive matrix transpose: each thread copies elements straight from idata to
// the transposed position in odata, with no staging buffer in between.
//
// Parameters:
//   odata  - output buffer; written at odata[yIndex + i + height * xIndex].
//   idata  - input buffer; read at idata[xIndex + width * (yIndex + i)].
//   width  - row stride used to index idata (also scales the per-iteration step).
//   height - row stride used to index odata.
//   nreps  - number of times the whole copy is repeated by the outer loop.
//
// Preconditions stated via __requires: width == 1024, height == 1024,
// nreps == 1. There is no bounds check on xIndex / yIndex, so the launch
// configuration has to cover the matrix exactly.
//
// NOTE(review): TILE_DIM, BLOCK_ROWS and the __requires / __invariant /
// __write_implies / __write_offset / __mod_pow2 annotations are not defined in
// this file (presumably they come from common.h and the verification tool's
// headers) -- confirm their definitions there. With the 16x16 block / 64x64
// grid named in the file header and width == 1024, TILE_DIM is presumably 16.
__global__ void transposeNaive(float *odata, float *idata, int width, int height, int nreps)
{
    // Verification preconditions: fix the problem size and repetition count.
    __requires(width == 1024);
    __requires(height == 1024);
    __requires(nreps == 1);

    // Global coordinates of this thread: each block is offset by TILE_DIM
    // per block index in both dimensions.
    int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
    int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;

    // Flattened indices: the input is addressed as (row yIndex, column xIndex)
    // with stride width; the output swaps the roles of the two coordinates and
    // uses stride height.
    int index_in  = xIndex + width * yIndex;
    int index_out = yIndex + height * xIndex;

    // Both loops carry the same three invariants. Each one constrains the
    // element index of any write this thread has made to odata
    // (__write_offset(odata) / sizeof(float)):
    //   1. (index mod (height*TILE_DIM)) / height == threadIdx.x
    //   2. index / (height*TILE_DIM)              == blockIdx.x
    //   3. (index mod height) / TILE_DIM          == blockIdx.y
    // Together they tie every written location back to the writing thread's
    // block and thread indices. The invariant relating the offset to
    // threadIdx.y via BLOCK_ROWS is left commented out in the original.
    // NOTE(review): __mod_pow2 presumably denotes a modulo by a power-of-two
    // operand -- confirm against its definition.
    for (int r=0;
          __invariant(__write_implies(odata, (__mod_pow2(__write_offset(odata)/sizeof(float), (height*TILE_DIM)) / height) == threadIdx.x)),
       // __invariant(__write_implies(odata, __mod_pow2(__write_offset(odata)/sizeof(float), BLOCK_ROWS) == threadIdx.y)),
          __invariant(__write_implies(odata, __write_offset(odata)/sizeof(float)/(height*TILE_DIM) == blockIdx.x)),
          __invariant(__write_implies(odata, (__mod_pow2(__write_offset(odata)/sizeof(float), height) / TILE_DIM) == blockIdx.y)),
          r < nreps; r++)
    {
        // Step through the tile in strides of BLOCK_ROWS; each iteration
        // copies one element per thread.
        for (int i=0;
          __invariant(__write_implies(odata, (__mod_pow2(__write_offset(odata)/sizeof(float), (height*TILE_DIM)) / height) == threadIdx.x)),
       // __invariant(__write_implies(odata, __mod_pow2(__write_offset(odata)/sizeof(float), BLOCK_ROWS) == threadIdx.y)),
          __invariant(__write_implies(odata, __write_offset(odata)/sizeof(float)/(height*TILE_DIM) == blockIdx.x)),
          __invariant(__write_implies(odata, (__mod_pow2(__write_offset(odata)/sizeof(float), height) / TILE_DIM) == blockIdx.y)),
            i<TILE_DIM; i+=BLOCK_ROWS)
        {
            // Input advances i rows (i*width elements); output advances i
            // elements along its row -- this is the transposition.
            odata[index_out+i] = idata[index_in+i*width];
        }
    }
}
