/***************************************************************************
 *cr
 *cr            (C) Copyright 2007 The Board of Trustees of the
 *cr                        University of Illinois
 *cr                         All Rights Reserved
 *cr
 ***************************************************************************/

/* The compute kernel. */
/* The macros THREADS_W and THREADS_H specify the width and height of the
 * area to be processed by one thread block, measured in 4-by-4 pixel blocks.
 * Larger numbers mean more computation per thread block.
 *
 * The macro POS_PER_THREAD specifies the number of search positions for which
 * an SAD is computed.  A larger value indicates more computation per thread,
 * and fewer threads per thread block.  It must be a multiple of 3 and also
 * must be at most 33 because the loop to copy from shared memory uses
 * 32 threads per 4-by-4 pixel block.
 *
 */

#include "CLSmith.h"
 
/* Computes 4x4 sums of absolute differences (SADs) between the current frame
 * and a reference image over a window of search positions.
 *
 * Work decomposition (as visible in this kernel):
 *   - Each run of CEIL_POS consecutive work-items (by get_local_id(0)) shares
 *     one 4x4 pixel block; the position of a work-item inside that run picks
 *     which chunk of POS_PER_THREAD search positions it evaluates.
 *   - A work-group covers a tile that is THREADS_W blocks wide; the tile's
 *     position is given by get_group_id(0) / get_group_id(1).
 *
 * Parameters:
 *   blk_sad   - output buffer.  The 4x4 results are written after an initial
 *               gap of mb_width * mb_height * MAX_POS_PADDED * (9 + 16)
 *               elements, then laid out as 16 slots of MAX_POS_PADDED
 *               elements per macroblock.
 *   frame     - current frame pixels, read row-major with a row stride of
 *               mb_width * 16 elements.
 *   mb_width  - frame width in macroblocks (16 pixels each).
 *   mb_height - frame height in macroblocks.
 *   img_ref   - reference image, sampled with unnormalized integer
 *               coordinates, clamp-to-edge addressing and nearest filtering.
 *   emi_data  - only read by the injected guard below (elements 139 and 21).
 *
 * CEIL_POS, THREADS_W, THREADS_H, POS_PER_THREAD, SEARCH_RANGE,
 * SEARCH_DIMENSION, MAX_POS and MAX_POS_PADDED are not defined in this file;
 * presumably they come from CLSmith.h or the build options - TODO confirm.
 */
__kernel void mb_sad_calc(__global unsigned short *blk_sad,
                            __global unsigned short *frame,
                            int mb_width,
                            int mb_height,
                            __read_only image2d_t img_ref,
                            __global int *emi_data)
{   
	// Integer pixel coordinates; out-of-range reads are clamped to the
	// image edge, so search windows may extend past the frame border.
	const sampler_t texSampler =
	CLK_NORMALIZED_COORDS_FALSE |
	CLK_ADDRESS_CLAMP_TO_EDGE |
	CLK_FILTER_NEAREST;


  // Index of this work-item's 4x4 block inside the work-group's tile
  // (row-major, THREADS_W blocks per row).
  int tx = (get_local_id(0) / CEIL_POS) % THREADS_W;
  int ty = (get_local_id(0) / CEIL_POS) / THREADS_W;
  int bx = get_group_id(0);
  int by = get_group_id(1);
  // Row stride of `frame` in pixels: 16 pixels per macroblock.
  int img_width = mb_width*16;

  // Macroblock and sub-block coordinates
  // (a macroblock is 4x4 blocks, so the low two bits select the block
  // inside the macroblock and the remaining bits select the macroblock).
  int mb_x = (tx + bx * THREADS_W) >> 2;
  int mb_y = (ty + by * THREADS_H) >> 2;
  int block_x = (tx + bx * THREADS_W) & 0x03;
  int block_y = (ty + by * THREADS_H) & 0x03;

  // If this thread is assigned to an invalid 4x4 block, do nothing 
  if ((mb_x < mb_width) && (mb_y < mb_height))
    {
      // Pixel offset of the origin of the current 4x4 block
      int frame_x = ((mb_x << 2) + block_x) << 2;
      int frame_y = ((mb_y << 2) + block_y) << 2;

      // Origin of the search area for this 4x4 block
      // (may be negative near the top/left border; the sampler clamps).
      int ref_x = frame_x - SEARCH_RANGE;
      int ref_y = frame_y - SEARCH_RANGE;

      // Origin in the current frame for this 4x4 block
      int cur_o = frame_y * img_width + frame_x;

      // Half-open range [search_pos_base, search_pos_end) of search
      // positions evaluated by this work-item.
      int search_pos;
      int search_pos_base =
        (get_local_id(0) % CEIL_POS) * POS_PER_THREAD;
      int search_pos_end = search_pos_base + POS_PER_THREAD;

      // All SADs from this thread are stored in a contiguous chunk
      // of memory starting at this offset
      // (initial gap, then 16 slots per macroblock, then the slot of this
      // block at index 4 * block_y + block_x).
      blk_sad += mb_width * mb_height * MAX_POS_PADDED * (9 + 16) +
        (mb_y * mb_width + mb_x) * MAX_POS_PADDED * 16 +
        (4 * block_y + block_x) * MAX_POS_PADDED;

      // Don't go past bounds
      if (search_pos_end > MAX_POS)
        search_pos_end = MAX_POS;

      // For each search position, within the range allocated to this thread
      for (search_pos = search_pos_base;
           search_pos < search_pos_end;
           search_pos++) {
        unsigned short sad4x4 = 0;
        // Search positions are numbered row-major over a window that is
        // SEARCH_DIMENSION positions wide.
        int search_off_x = ref_x + (search_pos % SEARCH_DIMENSION);
        int search_off_y = ref_y + (search_pos / SEARCH_DIMENSION);

        // 4x4 SAD computation
        for(int y=0; y<4; y++) {
          for (int x=0; x<4; x++) {
          
          // ([unsigned] short)read_imageui or
          //                   read_imagei  is required for correct calculation.
          // Though read_imagei() is shorter, its results are undefined by specification since the input
          // is an unsigned type, CL_UNSIGNED_INT16
          
            sad4x4 += abs((unsigned short)((read_imageui(img_ref, texSampler, (int2)(search_off_x + x, search_off_y + y) )).x) -
                  frame[cur_o + y * img_width + x]);
                  
          }
        }

        // Save this value into the local SAD array 
        blk_sad[search_pos] = sad4x4;

        // INJECTION
        // The contents of the two included headers are not visible here.
        // NOTE(review): this looks like a CLSmith EMI block, which is
        // normally dead code by construction - confirm that the host fills
        // emi_data so that emi_data[139] < emi_data[21] is false.
        if (emi_data[139] < emi_data[21]) {
          #include "emi0/subst.h"
          #include "emi0/EMI_BLOCK.h"
        }
        // END INJECTION
      }
    }

}


//typedef unsigned int uint;

/* Combines the 4x4 SADs of one macroblock into SADs for the 4x8, 8x4 and
 * 8x8 block shapes.
 *
 * One work-group handles one macroblock (get_group_id(0), get_group_id(1)).
 * get_local_id(1) picks one 8x8 quadrant of that macroblock and
 * get_local_id(0) strides over pairs of search positions in steps of 32.
 *
 * Layout of blk_sad used here, in units of
 * (macroblock count * MAX_POS_PADDED) elements:
 *   offset  5 : 8x8 results (type 4),  4 slots per macroblock
 *   offset  9 : 8x4 results (type 5),  8 slots per macroblock
 *   offset 17 : 4x8 results (type 6),  8 slots per macroblock
 *   offset 25 : 4x4 inputs,           16 slots per macroblock
 *
 * Parameters:
 *   blk_sad   - SAD buffer; read in the 4x4 region, written in the others.
 *   mb_width  - frame width in macroblocks.
 *   mb_height - frame height in macroblocks.
 */
__kernel void larger_sad_calc_8(__global unsigned short *blk_sad,
                                int mb_width,
                                int mb_height)
{
  // Quadrant selected by this work-item: bit 0 is the column, the
  // remaining bits are the row.
  const int quad_col = get_local_id(1) & 1;
  const int quad_row = get_local_id(1) >> 1;

  // Macroblock handled by this work-group.
  const int group_x = get_group_id(0);
  const int group_y = get_group_id(1);

  // Total macroblock count and the element offset of this macroblock for a
  // region that holds a single slot per macroblock.
  const int mb_count  = mul24(mb_width, mb_height);
  const int mb_offset = (mul24(group_y, mb_width) + group_x) * MAX_POS_PADDED;

  // Input: top-left 4x4 block of the quadrant inside the 4x4 region
  // (slot index 4 * block_y + block_x, with block = 2 * quadrant).
  __global unsigned short *bi = blk_sad
    + (mul24(mb_count, 25) + (quad_row * 8 + quad_col * 2)) * MAX_POS_PADDED
    + mb_offset * 16;

  // Block type 6 (4x8): two side-by-side outputs per quadrant.
  __global unsigned short *bo_6 = blk_sad
    + (17 * mb_count + (quad_row * 4 + quad_col * 2)) * MAX_POS_PADDED
    + mb_offset * 8;

  __global unsigned short *bo_5;
  __global unsigned short *bo_4;

  // Guard inherited from the original author: stated to be always true and
  // present only because it improves register allocation.  bo_5 and bo_4
  // would be left unset if it were ever false.
  if (quad_row < 100)
    {
      // Block type 5 (8x4): two stacked outputs per quadrant, two slots apart.
      bo_5 = blk_sad
        + (9 * mb_count + (quad_row * 4 + quad_col)) * MAX_POS_PADDED
        + mb_offset * 8;

      // Block type 4 (8x8): one output per quadrant.
      bo_4 = blk_sad
        + (5 * mb_count + (quad_row * 2 + quad_col)) * MAX_POS_PADDED
        + mb_offset * 4;
    }

  // Each iteration processes two adjacent search positions.
  for (int pair = get_local_id(0); pair < (MAX_POS+1)/2; pair += 32)
    {
      // Element index of the pair in the base slot and in the slots that
      // lie 1, 2, 4 and 5 slots further on.
      const int at_0 = pair * 2;
      const int at_1 = (pair + MAX_POS_PADDED/2) * 2;
      const int at_2 = (pair + 2*MAX_POS_PADDED/2) * 2;
      const int at_4 = (pair + 4*MAX_POS_PADDED/2) * 2;
      const int at_5 = (pair + 5*MAX_POS_PADDED/2) * 2;

      // The four 4x4 SAD pairs of the quadrant; in the 4x4 region one slot
      // further is the block to the right, four slots further the block below.
      const ushort2 top_left     = (ushort2) (bi[at_0], bi[at_0 + 1]);
      const ushort2 top_right    = (ushort2) (bi[at_1], bi[at_1 + 1]);
      const ushort2 bottom_left  = (ushort2) (bi[at_4], bi[at_4 + 1]);
      const ushort2 bottom_right = (ushort2) (bi[at_5], bi[at_5 + 1]);

      // Component-wise 16-bit sums.
      const ushort2 left_column  = top_left + bottom_left;
      const ushort2 right_column = top_right + bottom_right;
      const ushort2 top_row      = top_left + top_right;
      const ushort2 bottom_row   = bottom_left + bottom_right;
      const ushort2 whole        = top_row + bottom_row;

      bo_6[at_0]     = left_column.x;
      bo_6[at_0 + 1] = left_column.y;
      bo_6[at_1]     = right_column.x;
      bo_6[at_1 + 1] = right_column.y;

      bo_5[at_0]     = top_row.x;
      bo_5[at_0 + 1] = top_row.y;
      bo_5[at_2]     = bottom_row.x;
      bo_5[at_2 + 1] = bottom_row.y;

      bo_4[at_0]     = whole.x;
      bo_4[at_0 + 1] = whole.y;
    }
}



/* Combines the four 8x8 SADs of one macroblock into SADs for the 8x16,
 * 16x8 and 16x16 block shapes.
 *
 * One work-group handles one macroblock (get_group_id(0), get_group_id(1));
 * get_local_id(0) strides over pairs of search positions in steps of 32.
 *
 * Layout of blk_sad used here, in units of
 * (macroblock count * MAX_POS_PADDED) elements:
 *   offset 0 : 16x16 results (type 1), 1 slot  per macroblock
 *   offset 1 : 16x8  results (type 2), 2 slots per macroblock
 *   offset 3 : 8x16  results (type 3), 2 slots per macroblock
 *   offset 5 : 8x8   inputs  (type 4), 4 slots per macroblock
 *
 * Parameters:
 *   blk_sad   - SAD buffer; read in the 8x8 region, written in the others.
 *   mb_width  - frame width in macroblocks.
 *   mb_height - frame height in macroblocks.
 */
__kernel void larger_sad_calc_16(__global unsigned short *blk_sad,
                                 int mb_width,
                                 int mb_height)
{
  // Macroblock handled by this work-group.
  const int group_x = get_group_id(0);
  const int group_y = get_group_id(1);

  // Element count of a region holding one slot per macroblock, and the
  // element offset of this macroblock inside such a region.
  const int region_size = mul24(mb_width, mb_height) * MAX_POS_PADDED;
  const int mb_offset   = (mul24(group_y, mb_width) + group_x) * MAX_POS_PADDED;

  // Input, 8x8 SADs: region_size * 5 + mb_offset * 4.
  __global unsigned short *bi =
    blk_sad + 4 * (region_size + mb_offset) + region_size;

  // Block type 3 (8x16): region_size * 3 + mb_offset * 2.
  __global unsigned short *bo_3 =
    blk_sad + 2 * (region_size + mb_offset) + region_size;

  // Block type 2 (16x8).
  __global unsigned short *bo_2 = blk_sad + region_size + mb_offset * 2;

  // Block type 1 (16x16).
  __global unsigned short *bo_1 = blk_sad + mb_offset;

  // Each iteration processes two adjacent search positions.
  for (int pair = get_local_id(0); pair < (MAX_POS+1)/2; pair += 32)
    {
      // Element index of the pair in the base slot and in the slots that
      // lie 1, 2 and 3 slots further on.
      const int at_0 = pair * 2;
      const int at_1 = (pair + MAX_POS_PADDED/2) * 2;
      const int at_2 = (pair + 2*MAX_POS_PADDED/2) * 2;
      const int at_3 = (pair + 3*MAX_POS_PADDED/2) * 2;

      // The four 8x8 SAD pairs of the macroblock, stored row-major
      // (slot index 2 * row + column).
      const ushort2 top_left     = (ushort2) (bi[at_0], bi[at_0 + 1]);
      const ushort2 top_right    = (ushort2) (bi[at_1], bi[at_1 + 1]);
      const ushort2 bottom_left  = (ushort2) (bi[at_2], bi[at_2 + 1]);
      const ushort2 bottom_right = (ushort2) (bi[at_3], bi[at_3 + 1]);

      // Component-wise 16-bit sums.
      const ushort2 left_column  = top_left + bottom_left;
      const ushort2 right_column = top_right + bottom_right;
      const ushort2 top_row      = top_left + top_right;
      const ushort2 bottom_row   = bottom_left + bottom_right;
      const ushort2 whole        = top_row + bottom_row;

      bo_3[at_0]     = left_column.x;
      bo_3[at_0 + 1] = left_column.y;
      bo_3[at_1]     = right_column.x;
      bo_3[at_1 + 1] = right_column.y;

      bo_2[at_0]     = top_row.x;
      bo_2[at_0 + 1] = top_row.y;
      bo_2[at_1]     = bottom_row.x;
      bo_2[at_1 + 1] = bottom_row.y;

      bo_1[at_0]     = whole.x;
      bo_1[at_0 + 1] = whole.y;
    }
}


