#define __1D_GRID
#define __2D_THREAD_BLOCK
#include <cuda.h>

// Thread-block shape assumed throughout this file. NTHREAD_X is the number of
// threads along x (read back in the kernel as p = blockDim.x) and NTHREAD_Y
// the number along y (q = blockDim.y). Their product is also the length of
// the kernel's statically sized sharedPos array.
#define NTHREAD_X 4
#define NTHREAD_Y 1
// Launch-configuration assumptions.
// NOTE(review): __axiom is not a CUDA keyword; presumably it is provided by
// the verification tool's headers -- confirm against the build setup.
__axiom(gridDim.x  == 4);         // ] together this means numBodies is 16, since the
__axiom(blockDim.x == NTHREAD_X); // ] kernel requires numBodies == gridDim.x * blockDim.x
__axiom(blockDim.y == NTHREAD_Y); // == q parameter

// REVISIT: softeningSquared should be a constant
// __constant__ float softeningSquared;
// Softening term added to every squared distance in the kernel, so the value
// passed to sqrtf is strictly positive even when two positions coincide.
#define softeningSquared 0.0000015625f

// Macros to simplify shared memory addressing.
// Every macro argument is parenthesized so that compound argument expressions
// (e.g. `a + b`) expand with the intended precedence.

// SX(i): element i of the calling thread's own row (threadIdx.y) of sharedPos.
#define SX(i) sharedPos[(i) + blockDim.x * threadIdx.y]
// SX_SUM(i,j): element i of row j of sharedPos.
// This macro is only used when multithreadBodies is true (below).
#define SX_SUM(i,j) sharedPos[(i) + blockDim.x * (j)]
// WRAP is used to force each block to start working on a different
// chunk (and wrap around back to the beginning of the array) so that
// not all multiprocessors try to read the same memory locations at
// once.
// Mod without divide; only valid for x in the range 0 up to 2m.
// Note that x and m are each evaluated more than once, so pass
// side-effect-free expressions.
#define WRAP(x,m) (((x) < (m)) ? (x) : ((x) - (m)))

// Advances every body by one time step.
//
// Each thread loads the position of body `index`, accumulates the softened
// gravitational acceleration exerted on it by the bodies in oldPos (staged
// through shared memory one tile of blockDim.x positions at a time), then
// integrates velocity and position and stores the results.
//
// Parameters:
//   newPos, newVel - outputs; element `index` is written by the thread row
//                    with threadIdx.y == 0
//   oldPos, oldVel - inputs; the .w component of a position is used as that
//                    body's mass in the interaction term
//   deltaTime      - integration time step
//   damping        - factor applied to the updated velocity
//   numBodies      - must equal gridDim.x * blockDim.x (see __requires)
//
// Launch layout: the file declares a 1D grid, and blockDim must be
// (NTHREAD_X, NTHREAD_Y) because sharedPos is statically sized
// NTHREAD_X * NTHREAD_Y. There is no bounds check on `index`; the
// __requires precondition is what keeps it inside the arrays.
__global__ void
integrateBodiesKernel(float4* newPos, float4* newVel,
                float4* oldPos, float4* oldVel,
                float deltaTime, float damping,
                int numBodies)
{
    __requires(numBodies == gridDim.x * blockDim.x);

    int index = blockIdx.x * blockDim.x + threadIdx.x;
    float4 pos = oldPos[index];

    // -----------------------------------------------------------------------
    // computeBodyAccel(bodyPos=pos, positions=oldPos, numBodies=numBodies)
    // -----------------------------------------------------------------------
    /*extern*/ __shared__ float4 sharedPos[NTHREAD_X*NTHREAD_Y]; //< length p*q
    float3 accel;
    accel.x = 0.0f;
    accel.y = 0.0f;
    accel.z = 0.0f;

    int p = blockDim.x;
    int q = blockDim.y;
    int n = numBodies;
    int numTiles = n / (p * q);

    for (int tile = blockIdx.y;
#ifndef INFERENCE
        __invariant(__no_read(newPos)),
        __invariant(__no_write(newPos)),
        __invariant(__no_read(newVel)),
        __invariant(__no_write(newVel)),
        // oldPos has read above
        __invariant(__no_write(oldPos)),
        __invariant(__no_read(oldVel)),
        __invariant(__no_write(oldVel)),
        __invariant(__no_read(sharedPos)),
        __invariant(__no_write(sharedPos)),
#endif
        tile < numTiles + blockIdx.y; tile++)
    {
        // Pick the block-sized chunk of oldPos this thread row stages for the
        // current tile. WRAP rotates the starting chunk per block.
#if NTHREAD_Y == 1 // multithreadBodies == false
        // One row per block: the block walks all gridDim.x chunks in turn.
        const unsigned int srcChunk = WRAP(blockIdx.x + tile, gridDim.x);
#else              // multithreadBodies == true
        // q rows per block: each row stages a different chunk, so one tile
        // covers q chunks and numTiles = n / (p * q) tiles cover them all.
        const unsigned int srcChunk =
            WRAP(blockIdx.x + q * tile + threadIdx.y, gridDim.x);
#endif

#if MUTATION
        // Deliberately injected fault (every thread writes element 0); only
        // compiled when MUTATION is set.
        sharedPos[0] = oldPos[srcChunk * p + threadIdx.x];
#else
        SX(threadIdx.x) = oldPos[srcChunk * p + threadIdx.x];
#endif

        // All staged positions must be visible before any thread reads them.
        __syncthreads();

        // -------------------------------------------------------------------
        // -- gravitation(myPos=bodyPos, accel=acc)
        // -------------------------------------------------------------------
        // This is the "tile_calculation" function from the GPUG3 article.
        for (unsigned int counter = 0; counter < blockDim.x; counter++)
        {
          // -----------------------------------------------------------------
          // -- bodyBodyInteraction: accumulate into accel the pull exerted on
          // -- this thread's body (pos) by the staged body bi
          // -----------------------------------------------------------------
          float4 bi = SX(counter);
          float3 r;

          // r_ij  [3 FLOPS] (points from this body towards bi)
          r.x = bi.x - pos.x;
          r.y = bi.y - pos.y;
          r.z = bi.z - pos.z;

          // distSqr = dot(r_ij, r_ij) + EPS^2  [6 FLOPS]
          float distSqr = r.x * r.x + r.y * r.y + r.z * r.z;
          distSqr += softeningSquared;

          // invDistCube =1/distSqr^(3/2)  [4 FLOPS (2 mul, 1 sqrt, 1 inv)]
           /* BUGINJECT: MUTATE_OFFSET, UP, ZERO */
          float invDist = 1.0f / sqrtf(distSqr);
          float invDistCube =  invDist * invDist * invDist;

          // s = m_j * invDistCube [1 FLOP]
          // m_j is the mass of the *other* body (bi.w). This body's own mass
          // is the one that cancels out of force / mass, so it must not
          // appear here.
          float s = bi.w * invDistCube;

          // a_i =  a_i + s * r_ij [6 FLOPS]
          accel.x += r.x * s;
          accel.y += r.y * s;
          accel.z += r.z * s;
          // -----------------------------------------------------------------
        }
        // -------------------------------------------------------------------
        // Nobody may overwrite sharedPos until every thread is done reading.
        __syncthreads();
    }
#if NTHREAD_Y != 1 // multithreadBodies == true
    // When the numBodies / thread block size is < # multiprocessors (16 on G80), the GPU is
    // underutilized.  For example, with a 256 threads per block and 1024 bodies, there will only
    // be 4 thread blocks, so the GPU will only be 25% utilized. To improve this, we use multiple
    // threads per body.  We still can use blocks of 256 threads, but they are arranged in q rows
    // of p threads each.  Each thread processes 1/q of the forces that affect each body, and then
    // 1/q of the threads (those with threadIdx.y==0) add up the partial sums from the other
    // threads for that body.  To enable this, use the "--p=" and "--q=" command line options to
    // this example. e.g.: "nbody.exe --n=1024 --p=64 --q=4" will use 4 threads per body and 256
    // threads per block. There will be n/p = 16 blocks, so a G80 GPU will be 100% utilized.

    // We use a bool template parameter to specify when the number of threads per body is greater
    // than one, so that when it is not we don't have to execute the more complex code required!
    // if (multithreadBodies)
    //{
        // Publish this thread's partial sum in its own sharedPos slot.
        SX_SUM(threadIdx.x, threadIdx.y).x = accel.x;
        SX_SUM(threadIdx.x, threadIdx.y).y = accel.y;
        SX_SUM(threadIdx.x, threadIdx.y).z = accel.z;

        __syncthreads();

        // Row 0 folds in the partial sums of the other rows for its body.
        if (threadIdx.y == 0)
        {
            for (int i = 1; i < blockDim.y; i++)
            {
                accel.x += SX_SUM(threadIdx.x,i).x;
                accel.y += SX_SUM(threadIdx.x,i).y;
                accel.z += SX_SUM(threadIdx.x,i).z;
            }
        }
    //}
#endif
    // -----------------------------------------------------------------------

    // acceleration = force \ mass;
    // new velocity = old velocity + acceleration * deltaTime
    // note we factor out the body's mass from the equation, here and in bodyBodyInteraction
    // (because they cancel out).  Thus here force == acceleration
    float4 vel = oldVel[index];

    vel.x += accel.x * deltaTime;
    vel.y += accel.y * deltaTime;
    vel.z += accel.z * deltaTime;

    vel.x *= damping;
    vel.y *= damping;
    vel.z *= damping;

    // new position = old position + velocity * deltaTime
    pos.x += vel.x * deltaTime;
    pos.y += vel.y * deltaTime;
    pos.z += vel.z * deltaTime;

    // store new position and velocity
    // Only row 0 holds the complete acceleration. With several rows per body
    // the other rows share the same `index` and would race on these stores
    // with partial results. With a single row the condition is always true.
    if (threadIdx.y == 0)
    {
        newPos[index] = pos;
        newVel[index] = vel;
    }
}
