
#ifndef _MATRIXMUL_KERNEL_H_
#define _MATRIXMUL_KERNEL_H_

// #include <stdio.h>
// #include "matrixMul.h"
#include "my_cutil.h"

// #define CHECK_BANK_CONFLICTS 0
// #if CHECK_BANK_CONFLICTS
// #define AS(i, j) cutilBankChecker(((float*)&As[0][0]), (BLOCK_SIZE * i + j))
// #define BS(i, j) cutilBankChecker(((float*)&Bs[0][0]), (BLOCK_SIZE * i + j))
// #else
// Accessors for the shared-memory tiles. They expand to direct indexing of
// arrays named As and Bs, so those arrays must be in scope wherever the
// macros are used.
#define AS(i, j) As[i][j]
#ifdef MUTATION
// MUTATION build: the column argument j is ignored and column 0 of row i is
// always accessed, for both the store and the loads that go through BS.
// NOTE(review): presumably a deliberately seeded defect for exercising the
// verifier -- confirm before relying on this variant.
#define BS(i, j) Bs[i][0]
#else
// Normal build: plain element access Bs[i][j].
#define BS(i, j) Bs[i][j]
#endif
// #endif

// Edge length of the square shared-memory tiles (As and Bs are declared as
// BLOCK_SIZE x BLOCK_SIZE). The kernel also assumes that blockDim.x and
// blockDim.y are both equal to this value.
#define BLOCK_SIZE 16


////////////////////////////////////////////////////////////////////////////////
//    Notes for running in PUG:
//   Bitvector size: preferably 16 bits
//   The ASSUME_NO_OVFLO flag can be turned off to obtain substantial speedups
////////////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////////////
//! Tiled matrix multiplication on the device: C = A * B.
//!
//! @param C   output matrix, written as C[row * wB + column]
//! @param A   left operand, read as A[row * wA + column]
//! @param B   right operand, read as B[row * wB + column]
//! @param wA  width of A
//! @param wB  width of B (also used as the row stride of C)
//!
//! Every thread accumulates and stores exactly one element of C. The kernel
//! assumes blockDim.x == blockDim.y == BLOCK_SIZE and wB >= BLOCK_SIZE (see
//! the assume() calls near the end). No bounds checks are performed on any of
//! the three arrays.
////////////////////////////////////////////////////////////////////////////////
__global__ void
MultKernel( float* C, float* A, float* B, int wA, int wB)
{
  // Shared-memory staging tiles: As holds the current sub-matrix of A and
  // Bs the current sub-matrix of B. They are accessed through AS()/BS().
  __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

  // Position of this block in the grid and of this thread in the block.
  const int blockCol = blockIdx.x;
  const int blockRow = blockIdx.y;
  const int col = threadIdx.x;
  const int row = threadIdx.y;

  // Offsets into A of the first and last element of the band of rows
  // handled by this block.
  const int aFirst = wA * BLOCK_SIZE * blockRow;
  const int aLast  = aFirst + wA - 1;

  // Distance in B between two consecutive tiles (BLOCK_SIZE rows down).
  const int bStride = BLOCK_SIZE * wB;

  // Running dot product for the single C element owned by this thread.
  float acc = 0;

  // Walk the tiles of A left to right and the tiles of B top to bottom,
  // one pair per iteration.
  int aTile = aFirst;
  int bTile = BLOCK_SIZE * blockCol;
  while (aTile <= aLast) {
    // Each thread copies one element of each tile into shared memory.
    AS(row, col) = A[aTile + wA * row + col];
    BS(row, col) = B[bTile + wB * row + col];

    // Barrier: both tiles must be fully written before any thread reads them.
    __syncthreads();

    // Partial dot product of one row of the A tile with one column of the
    // B tile.
    for (int k = 0; k != BLOCK_SIZE; ++k)
      acc += AS(row, k) * BS(k, col);

    // Barrier: every thread must be done reading before the tiles are
    // overwritten in the next iteration.
    __syncthreads();

    aTile += BLOCK_SIZE;
    bTile += bStride;
  }

  // Offset in C of the top-left element of this block's output tile.
  const int cBase = wB * BLOCK_SIZE * blockRow + BLOCK_SIZE * blockCol;

  // Constraints handed to the verifier about the launch configuration.
  assume (wB >= BLOCK_SIZE);
  assume (blockDim.x == BLOCK_SIZE && blockDim.y == BLOCK_SIZE);
  //  && blockDim.x < BLOCK_SIZE && blockDim.y < BLOCK_SIZE);       // +C, by Guodong

  // Store the finished element.
  C[cBase + wB * row + col] = acc;

}

#endif // #ifndef _MATRIXMUL_KERNEL_H_
