/***************************************************************************
 *cr
 *cr            (C) Copyright 2010 The Board of Trustees of the
 *cr                        University of Illinois
 *cr                         All Rights Reserved
 *cr
 ***************************************************************************/
/*
  Implementing Breadth first search on CUDA using algorithm given in DAC'10
  paper "An Effective GPU Implementation of Breadth-First Search"

  Copyright (c) 2010 University of Illinois at Urbana-Champaign. 
  All rights reserved.

  Permission to use, copy, modify and distribute this software and its documentation for 
  educational purpose is hereby granted without fee, provided that the above copyright 
  notice and this permission notice appear in all copies of this software and that you do 
  not sell the software.

  THE SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTY OF ANY KIND,EXPRESS, IMPLIED OR 
  OTHERWISE.

  Author: Lijiuan Luo (lluo3@uiuc.edu)
  Revised for Parboil 2.5 Benchmark Suite by: Geng Daniel Liu (gengliu2@illinois.edu)
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <CL/cl.h>
#include "parboil.h"
#include "OpenCL_common.h"
#include "config.h"

// IMPERIAL EDIT
#include <cassert>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
#include "emi.h"

#ifdef _MSC_VER
#include "windows_exception.h"
#endif
// END IMPERIAL EDIT

// Host-side scalars whose addresses are handed to clEnqueueWriteBuffer in main().
// h_top: value uploaded to the `tail` buffer on a fresh (non-snapshot) start.
static const int h_top = 1;
// zero: literal 0 uploaded to device buffers (e.g. to reset `tail` before a launch).
static const int zero = 0;
////////////////////////////////////////////////////////////////////////////////
// Main Program
////////////////////////////////////////////////////////////////////////////////
int main( int argc, char** argv) 
{
  #ifdef _MSC_VER
  ENABLE_WINDOWS_EXCEPTION_HANDLING
  #endif
    
  //the number of nodes in the graph
  int num_of_nodes = 0; 
  //the number of edges in the graph
  int num_of_edges = 0;
  struct pb_Parameters *params;
  struct pb_TimerSet timers;

  pb_InitializeTimerSet(&timers);
  params = pb_ReadParameters(&argc, argv);
  // IMPERIAL EDIT
  parseExtraArgs(argc, argv);
  // END IMPERIAL EDIT

  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] != NULL))
  {
    fprintf(stderr, "Expecting one input filename\n");
    exit(-1);
  }

  pb_SwitchToTimer(&timers, pb_TimerID_IO);
  //Read in Graph from a file
  FILE *fp = fopen(params->inpFiles[0],"r");
  if(!fp)
  {
    printf("Error Reading graph file\n");
    return 0;
  }
  int source;

  fscanf(fp,"%d",&num_of_nodes);
  // allocate host memory
  struct Node* h_graph_nodes = (struct Node*) malloc(sizeof(struct Node)*num_of_nodes);
  int *color = (int*) malloc(sizeof(int)*num_of_nodes);
  int start, edgeno;   
  // initalize the memory
  int i;
  for( i = 0; i < num_of_nodes; i++) 
  {
    fscanf(fp,"%d %d",&start,&edgeno);
    h_graph_nodes[i].x = start;
    h_graph_nodes[i].y = edgeno;
    color[i]=WHITE;
  }
  //read the source node from the file
  fscanf(fp,"%d",&source);
  fscanf(fp,"%d",&num_of_edges);
  int id,cost;
  struct Edge* h_graph_edges = (struct Edge*) malloc(sizeof(struct Edge)*num_of_edges);
  for(i=0; i < num_of_edges ; i++)
  {
    fscanf(fp,"%d",&id);
    fscanf(fp,"%d",&cost);
    h_graph_edges[i].x = id;
    h_graph_edges[i].y = cost;
  }
  if(fp)
    fclose(fp);    

  // allocate mem for the result on host side
  int* h_cost = (int*) malloc( sizeof(int)*num_of_nodes);
  for(i = 0; i < num_of_nodes; i++){
    h_cost[i] = INF;
  }
  h_cost[source] = 0;

  // load snapshot
  int init_k, init_num_t;
  int *init_q1;
  int *init_q2;
  if (LOAD_SNAPSHOT) {
    fp = fopen(LOAD_SNAPSHOT, "r");
    if (!fp) {
      printf("Error reading snapshot file\n");
      return 0;
    }
    fscanf(fp,"%d",&init_k);
    fscanf(fp,"%d",&init_num_t);
    int nnodes;
    fscanf(fp,"%d",&nnodes);
    assert(nnodes == num_of_nodes && "Mismatch between snapshot and graph input.");
    for (int i=0; i<num_of_nodes; i++)  {
      int c;
      fscanf(fp,"%d",&c);
      h_cost[i] = c;
    }
    for (int i=0; i<num_of_nodes; i++)  {
      int c;
      fscanf(fp,"%d",&c);
      color[i] = c;
    }
    init_q1 = new int[num_of_nodes];
    for (int i=0; i<num_of_nodes; i++)  {
      int c;
      fscanf(fp,"%d",&c);
      init_q1[i] = c;
    }
    init_q2 = new int[num_of_nodes];
    for (int i=0; i<num_of_nodes; i++)  {
      int c;
      fscanf(fp,"%d",&c);
      init_q2[i] = c;
    }
    std::cout << "# Loading snapshot k = " << init_k << " and num_t = " << init_num_t << std::endl;
  }

  pb_SwitchToTimer(&timers, pb_TimerID_COPY);

  // IMPERIAL EDIT
  cl_int clStatus;
  cl_device_id clDevice;
  cl_platform_id clPlatform;
  cl_context clContext;
  cl_command_queue clCommandQueue;
  initCL(clPlatform, clDevice, clContext, clCommandQueue);
  // END IMPERIAL EDIT
  pb_SetOpenCL(&clContext, &clCommandQueue);

  // IMPERIAL EDIT
  cl_program clProgram;
  cl_kernel BFS_kernel;
  compileProgram( "src/kernel.cl", "BFS_kernel", clDevice, clContext, clProgram, BFS_kernel);
  // END IMPERIAL EDIT

  //Copy the Node list to device memory
  cl_mem d_graph_nodes;
  d_graph_nodes = clCreateBuffer(clContext,CL_MEM_READ_ONLY,num_of_nodes*sizeof(struct Node),NULL,&clStatus);
  OCL_ERRCK_VAR(clStatus);
  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_graph_nodes,CL_TRUE,0,num_of_nodes*sizeof(struct Node),h_graph_nodes,0,NULL,NULL));
  //Copy the Edge List to device Memory
  cl_mem d_graph_edges;
  d_graph_edges = clCreateBuffer(clContext,CL_MEM_READ_ONLY,num_of_edges*sizeof(struct Edge),NULL,&clStatus);
  OCL_ERRCK_VAR(clStatus);
  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_graph_edges,CL_TRUE,0,num_of_edges*sizeof(struct Edge),h_graph_edges,0,NULL,NULL));

  cl_mem d_color, d_cost, d_q1, d_q2, tail;
  d_color = clCreateBuffer(clContext,CL_MEM_READ_WRITE,num_of_nodes*sizeof(int),NULL,&clStatus);
  d_cost = clCreateBuffer(clContext,CL_MEM_READ_WRITE,num_of_nodes*sizeof(int),NULL,&clStatus);
  d_q1 = clCreateBuffer(clContext,CL_MEM_READ_WRITE,num_of_nodes*sizeof(int),NULL,&clStatus);
  d_q2 = clCreateBuffer(clContext,CL_MEM_READ_WRITE,num_of_nodes*sizeof(int),NULL,&clStatus);
  tail = clCreateBuffer(clContext,CL_MEM_READ_WRITE,sizeof(int),NULL,&clStatus);
  OCL_ERRCK_VAR(clStatus);
  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_color,CL_TRUE,0,num_of_nodes*sizeof(int),color,0,NULL,NULL));
  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_cost,CL_TRUE,0,num_of_nodes*sizeof(int),h_cost,0,NULL,NULL));

  // IMPERIAL EDIT: initialise *all* elements of d_q1 and d_q2 (for snapshoting)
  int *zeroes = new int[num_of_nodes];
  for (int i=0; i<num_of_nodes; i++) zeroes[i] = 0;
  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_q1,CL_TRUE,0,num_of_nodes*sizeof(int),zeroes,0,NULL,NULL));
  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_q2,CL_TRUE,0,num_of_nodes*sizeof(int),zeroes,0,NULL,NULL));
  // IMPERIAL EDIT: emi buffer
  cl_mem d_emi_data;
  initEMI(clContext, clCommandQueue, d_emi_data);
  // END IMPERIAL EDIT

  printf("Starting GPU kernel\n");
  pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
  int num_of_blocks; 
  int num_of_threads_per_block;

  // IMPERIAL EDIT: remove initialisation of tail, d_cost and d_q1 if loading from snapshot
  if (!LOAD_SNAPSHOT) {
    OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,tail,CL_TRUE,0,sizeof(int),&h_top,0,NULL,NULL));
    OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_cost,CL_TRUE,0,sizeof(int),&zero,0,NULL,NULL));
    OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_q1,CL_TRUE,0,sizeof(int),&source,0,NULL,NULL));
  }

  int num_t;//number of threads
  int k=0;//BFS level index

  if (LOAD_SNAPSHOT) {
    OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,tail,CL_TRUE,0,sizeof(int),&init_num_t,0,NULL,NULL)); //< initialises num_t below
    OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_q1,CL_TRUE,0,num_of_nodes*sizeof(int),init_q1,0,NULL,NULL));
    OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_q2,CL_TRUE,0,num_of_nodes*sizeof(int),init_q2,0,NULL,NULL));
    k = init_k;
  }

  OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,2,sizeof(cl_mem),(void*)&d_graph_nodes));
  OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,3,sizeof(cl_mem),(void*)&d_graph_edges));
  OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,4,sizeof(cl_mem),(void*)&d_color));
  OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,5,sizeof(cl_mem),(void*)&d_cost));
  OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,6,sizeof(cl_mem),(void*)&tail));
  OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,13,sizeof(cl_mem),(void*)&d_emi_data));

  do
  {
    OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue,tail,CL_TRUE,0,sizeof(int),&num_t,0,NULL,NULL));
    OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,tail,CL_TRUE,0,sizeof(int),&zero,0,NULL,NULL));

    // IMPERIAL EDIT: take snapshot
    if (k == SNAPSHOT) {
      std::stringstream fname;
      fname << params->outFile << "_snapshot_" << k;
      int *h_q1 = new int[num_of_nodes];
      int *h_q2 = new int[num_of_nodes];
      OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue,d_cost,CL_TRUE,0,num_of_nodes*sizeof(int),h_cost,0,NULL,NULL));
      OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue,d_color,CL_TRUE,0,num_of_nodes*sizeof(int),color,0,NULL,NULL));
      OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue,d_q1,CL_TRUE,0,num_of_nodes*sizeof(int),h_q1,0,NULL,NULL));
      OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue,d_q2,CL_TRUE,0,num_of_nodes*sizeof(int),h_q2,0,NULL,NULL));
      fp = fopen(fname.str().c_str(),"w");
      fprintf(fp, "%d\n", k);
      fprintf(fp, "%d\n", num_t);
      fprintf(fp, "%d\n", num_of_nodes);
      for(int j=0;j<num_of_nodes;j++)
        fprintf(fp,"%d\n", h_cost[j]);
      for(int j=0;j<num_of_nodes;j++)
        fprintf(fp,"%d\n", color[j]);
      for(int j=0;j<num_of_nodes;j++)
        fprintf(fp,"%d\n", h_q1[j]);
      for(int j=0;j<num_of_nodes;j++)
        fprintf(fp,"%d\n", h_q2[j]);
      fclose(fp);
    }

    if(num_t == 0){//frontier is empty
      break;
    }

    num_of_blocks = (int)ceil(num_t/(double)MAX_THREADS_PER_BLOCK); 
    num_of_threads_per_block = num_t > MAX_THREADS_PER_BLOCK ? MAX_THREADS_PER_BLOCK : num_t;

    size_t grid[1] = {num_of_blocks*num_of_threads_per_block};
    size_t block[1] = {num_of_threads_per_block};


    OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,7,sizeof(int),(void*)&num_t));
    OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,9,sizeof(int),(void*)&k));
    OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,10,sizeof(int),NULL));
    OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,11,LOCAL_MEM_SIZE*sizeof(int),NULL));
    OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,12,sizeof(int),NULL));
    if(k%2 == 0){
      int gray = GRAY0;
      OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,0,sizeof(cl_mem),(void*)&d_q1));
      OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,1,sizeof(cl_mem),(void*)&d_q2));
      OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,8,sizeof(int),(void*)&gray));
    }
    else{
      int gray = GRAY1;
      OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,0,sizeof(cl_mem),(void*)&d_q2));
      OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,1,sizeof(cl_mem),(void*)&d_q1));
      OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,8,sizeof(int),(void*)&gray));
    }
  //std::cout << "k          = " << k << std::endl;
  //std::cout << "block_size = " << num_of_threads_per_block << std::endl;
  //std::cout << "num_blocks = " << num_of_blocks << std::endl;
    OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue,BFS_kernel,1,0,grid,block,0,0,0));
    OCL_ERRCK_RETVAL(clFinish(clCommandQueue));

    k++;
  } while(!RUN_SINGLE_ITERATION);
  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
  printf("GPU kernel done\n");

  // SANITY CHECK emi_data
#if 0
  int *emi = new int[EMI_DATA_LEN];
  for (int i=0; i<EMI_DATA_LEN; i++) {
    emi[i] = -1;
  }
  OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue,d_emi_data,CL_TRUE,0,EMI_DATA_LEN*sizeof(int),emi,0,NULL,NULL));
  std::cout << "emi check" << std::endl;
  for (int i=0; i<EMI_DATA_LEN; i++) {
    assert(emi[i] == i);
  }
//for (int i=0; i<EMI_DATA_LEN; i++) std::cout << emi[i] << ", ";
  std::cout << "done" << std::endl;
#endif

  // copy result from device to host
  OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue,d_cost,CL_TRUE,0,num_of_nodes*sizeof(int),h_cost,0,NULL,NULL));
  OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue,d_color,CL_TRUE,0,num_of_nodes*sizeof(int),color,0,NULL,NULL));

  OCL_ERRCK_RETVAL(clReleaseMemObject(d_graph_nodes));
  OCL_ERRCK_RETVAL(clReleaseMemObject(d_graph_edges));
  OCL_ERRCK_RETVAL(clReleaseMemObject(d_color));
  OCL_ERRCK_RETVAL(clReleaseMemObject(d_cost));
  OCL_ERRCK_RETVAL(clReleaseMemObject(tail));
  OCL_ERRCK_RETVAL(clReleaseMemObject(d_emi_data));
  //Store the result into a file
  pb_SwitchToTimer(&timers, pb_TimerID_IO);
  fp = fopen(params->outFile,"w");
  fprintf(fp, "%d\n", num_of_nodes);
  for (int j=0;j<num_of_nodes;j++)
    fprintf(fp,"%d %d\n",j,h_cost[j]);
  fclose(fp);
  // cleanup memory
  free(h_graph_nodes);
  free(h_graph_edges);
  free(color);
  free(h_cost);
  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
  pb_PrintTimerSet(&timers);
  pb_FreeParameters(params);
  return 0;
}
