#ifdef __APPLE__
  #include <OpenCL/opencl.h>
#elif __linux__
  #include <CL/cl.h>
#elif _WIN32
  #include <CL/cl.h>
#else
  #error Not sure where to find OpenCL header
#endif

#include <algorithm>
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

#ifndef _WIN32
#include <csignal>
#endif

#ifdef PROFILING
#include <chrono> //< requires c++11
typedef std::chrono::high_resolution_clock Clock;
typedef std::chrono::duration<int,std::milli> millisecs_t;
#endif

// True iff x is a power of two (1, 2, 4, ...). Clearing the lowest set bit
// with x & (x-1) leaves zero only when a single bit was set; zero itself is
// rejected explicitly. The argument is evaluated more than once, so it must be
// free of side effects.
#define ispow2(x) (((x) != 0) && (((x) & ((x) - 1)) == 0))
// Upper bound on elements handled by one work-group; also the largest number
// of groups that the device-level second scan is used for.
#define MAX_GROUP_SIZE 1024
// Element type of the test vectors and of all scan buffers.
#define TYPE char

using namespace std;

// --------------------------------------------------------------------------
// COMMAND-LINE OPTIONS
// --------------------------------------------------------------------------
// Number of elements in each test vector; parsed from argv[1] in main.
unsigned nelements;
// --print-vectors: print every input/expected vector pair before it is run.
bool print_vectors = false;
// --print-results: read back and print the device output after every run.
bool print_results = false;
// --check-random: read back the output and print one randomly chosen element.
bool check_random = false;
// --force-host-scan: do the scan over per-group sums on the host even when
// ngroups is small enough for the device scan.
bool force_host_scan = false;
// --skip-op1 / --skip-op2: skip the corresponding family of test vectors.
bool skip_op1 = false;
bool skip_op2 = false;
// Name of the scan kernel source file; taken from argv[2] in main.
string kernel;

// Count of test vectors run so far (reported by exit_handler on early exit).
uint64_t nvectors = 0;
// Elements per work-group: nelements, capped at MAX_GROUP_SIZE.
unsigned N;
// Number of work-groups: 1 when nelements <= MAX_GROUP_SIZE, else nelements/N.
unsigned ngroups;
// Byte sizes of the device buffers: nelements, ngroups and one unsigned.
size_t ArraySize;
size_t SumSize;
size_t ErrorSize;
// Host copy of the device output (nelements entries).
TYPE *out;
// Host copies of the per-group sums and their exclusive scan (ngroups entries each).
TYPE *sum;
TYPE *sumout;

// --------------------------------------------------------------------------
// EXIT HANDLER
// --------------------------------------------------------------------------
void cleanup();
// Signal handler (main installs it for SIGINT on non-Windows builds): reports
// how many test vectors were run, releases resources via cleanup() and exits
// with the signal number as the process status.
void exit_handler(int signal) {
  std::cout << "Early exit; managed to run ";
  std::cout << nvectors;
  std::cout << " test vectors" << std::endl;
  cleanup();
  std::exit(signal);
}

// --------------------------------------------------------------------------
// OPENCL
// --------------------------------------------------------------------------
// Platform and device lists; the arrays are allocated in main and freed by cleanup().
cl_uint num_platforms;
cl_platform_id *platforms;
cl_uint num_devices;
cl_device_id *devices;

// The platform/device selected with --platform / --device (index 0 by default)
// and the context and command queue created on that device.
cl_platform_id platform;
cl_device_id device;
cl_context context;
cl_command_queue command_queue;

// Device buffers, all created in main.
cl_mem d_in;        // test vector uploaded by runvector (ArraySize bytes)
cl_mem d_out;       // scan result; also passed to the increment and check kernels
cl_mem d_expected;  // expected result uploaded for the check kernel
cl_mem d_sum;       // argument 2 of the block scan; input of the second scan (SumSize bytes)
cl_mem d_sumout;    // output of the second scan; argument 1 of the increment kernel
cl_mem d_error;     // one unsigned, zeroed before the check kernel and read back after it

// Kernels; the 1/2 suffix selects the program built with -DOP1 or -DOP2.
cl_kernel k1, k2;          // "voigtlander_op1" / "voigtlander_op2"
cl_kernel kscan1, kscan2;  // "prefixsum", built with NELEMENTS_PER_GROUP=ngroups
cl_kernel kinc1, kinc2;    // "inc_op1" / "inc_op2"
cl_kernel kcheck;          // "check"

// --------------------------------------------------------------------------
// OPERATORS (only used in host-level scan)
// --------------------------------------------------------------------------
// Host-side version of operator 1 over the values {0, 1, 2}:
//   (x, 0) -> x, (0, 1) -> 1, every other combination -> 2.
class Op1 {
  public:
  TYPE operator() (TYPE x, TYPE y) const {
    if (y == 0) {
      return x;
    }
    const bool first_one = (x == 0) && (y == 1);
    return first_one ? 1 : 2;
  }
};

// Host-side version of operator 2 over the values {0, 1, 2}: a zero right
// operand leaves x unchanged, otherwise the right operand wins.
class Op2 {
  public:
  // y must be 0, 1 or 2 (asserted). Every path now returns a value, so a build
  // with asserts compiled out no longer runs off the end of the function.
  TYPE operator() (TYPE x, TYPE y) const {
    assert((y == 0 || y == 1 || y == 2) && "op2 bad y");
    return (y == 0) ? x : y;
  }
};

// --------------------------------------------------------------------------
// RUN SINGLE VECTOR
// --------------------------------------------------------------------------
// Prints the input vector v and the expected vector e (nelements entries each)
// on one line as "v = {...}<TAB>e = {...}".
void print_vector(TYPE *v, TYPE *e) {
  // input vector
  printf("v = {%d", v[0]);
  for (const TYPE *p = v + 1; p < v + nelements; ++p) {
    printf(", %d", *p);
  }
  printf("}\t");

  // expected vector
  printf("e = {%d", e[0]);
  for (const TYPE *p = e + 1; p < e + nelements; ++p) {
    printf(", %d", *p);
  }
  printf("}\n");
}

template <class Op>
unsigned runvector(TYPE *v, TYPE *e, cl_kernel &k, cl_kernel &kscan, cl_kernel &kinc) {
  if (print_vectors) {
    print_vector(v, e);
  }

  // block level scan
  {
    cl_bool blocking_write = CL_TRUE;
    cl_uint num_events_in_wait_list = 0;
    cl_event *event_wait_list = NULL;
#ifdef EVENT
    cl_event event;
    cl_uint err = clEnqueueWriteBuffer(command_queue, d_in, blocking_write, /*offset=*/0, ArraySize, v, num_events_in_wait_list, event_wait_list, &event);
    assert(err == CL_SUCCESS && "Error writing to d_in");
    err = clReleaseEvent(event);
    assert(err == CL_SUCCESS && "Error releasing event");
#else
    cl_uint err = clEnqueueWriteBuffer(command_queue, d_in, blocking_write, /*offset=*/0, ArraySize, v, num_events_in_wait_list, event_wait_list, NULL);
    assert(err == CL_SUCCESS && "Error writing to d_in");
#endif
  }
  {
    cl_int err = clSetKernelArg(k, 0, sizeof(cl_mem), (void *)&d_in);
    assert(err == CL_SUCCESS && "Error setting k argument 0");
  }
  {
    cl_int err = clSetKernelArg(k, 1, sizeof(cl_mem), (void *)&d_out);
    assert(err == CL_SUCCESS && "Error setting k argument 1");
  }
  {
    cl_int err = clSetKernelArg(k, 2, sizeof(cl_mem), (void *)&d_sum);
    assert(err == CL_SUCCESS && "Error setting k argument 2");
  }
  {
    cl_uint dim = 1;
    size_t global_work_size;
    size_t local_work_size;
    if (kernel == "sklansky.cl") {
      global_work_size = nelements/2;
      local_work_size  = N/2;
    } else if (kernel == "koggestone.cl") {
      global_work_size = nelements;
      local_work_size  = N;
    } else if (kernel == "brentkung.cl") {
      global_work_size = nelements/2;
      local_work_size  = N/2;
    } else if (kernel == "blelloch.cl") {
      global_work_size = nelements/2;
      local_work_size  = N/2;
    } else {
      assert(0);
    }
    cl_uint num_events_in_wait_list = 0;
    cl_event *event_wait_list = NULL;
#ifdef EVENT
    cl_event event;
    cl_uint err = clEnqueueNDRangeKernel(command_queue, k, dim, /*global_work_offset=*/NULL, &global_work_size, &local_work_size, num_events_in_wait_list, event_wait_list, &event);
    assert(err == CL_SUCCESS && "Error running k");
    err = clReleaseEvent(event);
    assert(err == CL_SUCCESS && "Error releasing event");
#else
    cl_uint err = clEnqueueNDRangeKernel(command_queue, k, dim, /*global_work_offset=*/NULL, &global_work_size, &local_work_size, num_events_in_wait_list, event_wait_list, NULL);
    assert(err == CL_SUCCESS && "Error running k");
#endif
  }

  // nb: if ngroups > MAX_GROUP_SIZE then the test will use host scan
  if (1 < ngroups) {
    if (ngroups <= MAX_GROUP_SIZE && !force_host_scan) {
      // device level exclusive scan
      {
        cl_int err = clSetKernelArg(kscan, 0, sizeof(cl_mem), (void *)&d_sum);
        assert(err == CL_SUCCESS && "Error setting kscan argument 0");
      }
      {
        cl_int err = clSetKernelArg(kscan, 1, sizeof(cl_mem), (void *)&d_sumout);
        assert(err == CL_SUCCESS && "Error setting kscan argument 1");
      }
      {
        unsigned is_exclusive = 1;
        cl_int err = clSetKernelArg(kscan, 2, sizeof(unsigned), &is_exclusive);
        assert(err == CL_SUCCESS && "Error setting kscan argument 1");
      }
      {
        cl_uint dim = 1;
        size_t global_work_size;
        size_t local_work_size;
        if (kernel == "sklansky.cl") {
          global_work_size = ngroups/2;
          local_work_size  = ngroups/2;
        } else if (kernel == "koggestone.cl") {
          global_work_size = ngroups;
          local_work_size  = ngroups;
        } else if (kernel == "brentkung.cl") {
          global_work_size = ngroups/2;
          local_work_size  = ngroups/2;
        } else if (kernel == "blelloch.cl") {
          global_work_size = ngroups/2;
          local_work_size  = ngroups/2;
        } else {
          assert(false && "Unreachable");
        }
        cl_uint num_events_in_wait_list = 0;
        cl_event *event_wait_list = NULL;
#ifdef EVENT
        cl_event event;
        cl_uint err = clEnqueueNDRangeKernel(command_queue, kscan, dim, /*global_work_offset=*/NULL, &global_work_size, &local_work_size, num_events_in_wait_list, event_wait_list, &event);
        assert(err == CL_SUCCESS && "Error running kscan");
        err = clReleaseEvent(event);
        assert(err == CL_SUCCESS && "Error releasing event");
#else
        cl_uint err = clEnqueueNDRangeKernel(command_queue, kscan, dim, /*global_work_offset=*/NULL, &global_work_size, &local_work_size, num_events_in_wait_list, event_wait_list, NULL);
        assert(err == CL_SUCCESS && "Error running kscan");
#endif
      }
    } else {
      // host level exclusive scan
      assert(sum && "sum is null");
      assert(sumout && "sumout is null");
      {
        cl_bool blocking_read = CL_TRUE;
        cl_uint num_events_in_wait_list = 0;
        cl_event *event_wait_list = NULL;
        cl_uint err = clEnqueueReadBuffer(command_queue, d_sum, blocking_read, /*offset=*/0, SumSize, sum, num_events_in_wait_list, event_wait_list, NULL);
        assert(err == CL_SUCCESS && "Error reading from d_sum");
      }
      sumout[0] = 0;
      for (unsigned i=1; i<ngroups; ++i) {
        sumout[i] = Op()(sumout[i-1], sum[i-1]);
      }
      {
        cl_bool blocking_write = CL_TRUE;
        cl_uint num_events_in_wait_list = 0;
        cl_event *event_wait_list = NULL;
        cl_uint err = clEnqueueWriteBuffer(command_queue, d_sumout, blocking_write, /*offset=*/0, SumSize, sumout, num_events_in_wait_list, event_wait_list, NULL);
        assert(err == CL_SUCCESS && "Error writing to d_sumout");
      }
    }

    // final increment
    {
      cl_int err = clSetKernelArg(kinc, 0, sizeof(cl_mem), (void *)&d_out);
      assert(err == CL_SUCCESS && "Error setting kinc argument 0");
    }
    {
      cl_int err = clSetKernelArg(kinc, 1, sizeof(cl_mem), (void *)&d_sumout);
      assert(err == CL_SUCCESS && "Error setting kinc argument 1");
    }
    {
      cl_uint dim = 1;
      size_t global_work_size = nelements;
      size_t local_work_size = N;
      cl_uint num_events_in_wait_list = 0;
      cl_event *event_wait_list = NULL;
#ifdef EVENT
      cl_event event;
      cl_uint err = clEnqueueNDRangeKernel(command_queue, kinc, dim, /*global_work_offset=*/NULL, &global_work_size, &local_work_size, num_events_in_wait_list, event_wait_list, &event);
      assert(err == CL_SUCCESS && "Error running kinc");
      err = clReleaseEvent(event);
      assert(err == CL_SUCCESS && "Error releasing event");
#else
      cl_uint err = clEnqueueNDRangeKernel(command_queue, kinc, dim, /*global_work_offset=*/NULL, &global_work_size, &local_work_size, num_events_in_wait_list, event_wait_list, NULL);
      assert(err == CL_SUCCESS && "Error running kinc");
#endif
    }
  }

  // check results
  unsigned error = 0;
  {
    cl_bool blocking_write = CL_TRUE;
    cl_uint num_events_in_wait_list = 0;
    cl_event *event_wait_list = NULL;
#ifdef EVENT
    cl_event event;
    cl_uint err = clEnqueueWriteBuffer(command_queue, d_expected, blocking_write, /*offset=*/0, ArraySize, e, num_events_in_wait_list, event_wait_list, &event);
    assert(err == CL_SUCCESS && "Error writing to d_expected");
    err = clReleaseEvent(event);
    assert(err == CL_SUCCESS && "Error releasing event");
#else
    cl_uint err = clEnqueueWriteBuffer(command_queue, d_expected, blocking_write, /*offset=*/0, ArraySize, e, num_events_in_wait_list, event_wait_list, NULL);
    assert(err == CL_SUCCESS && "Error writing to d_expected");
#endif
  }
  {
    cl_bool blocking_write = CL_TRUE;
    cl_uint num_events_in_wait_list = 0;
    cl_event *event_wait_list = NULL;
#ifdef EVENT
    cl_event event;
    cl_uint err = clEnqueueWriteBuffer(command_queue, d_error, blocking_write, /*offset=*/0, ErrorSize, &error, num_events_in_wait_list, event_wait_list, &event);
    assert(err == CL_SUCCESS && "Error writing to d_error");
    err = clReleaseEvent(event);
    assert(err == CL_SUCCESS && "Error releasing event");
#else
    cl_uint err = clEnqueueWriteBuffer(command_queue, d_error, blocking_write, /*offset=*/0, ErrorSize, &error, num_events_in_wait_list, event_wait_list, NULL);
    assert(err == CL_SUCCESS && "Error writing to d_error");
#endif
  }
  {
    cl_int err = clSetKernelArg(kcheck, 0, sizeof(cl_mem), (void *)&d_out);
    assert(err == CL_SUCCESS && "Error setting kcheck argument 0");
  }
  {
    cl_int err = clSetKernelArg(kcheck, 1, sizeof(cl_mem), (void *)&d_expected);
    assert(err == CL_SUCCESS && "Error setting kcheck argument 1");
  }
  {
    cl_int err = clSetKernelArg(kcheck, 2, sizeof(cl_mem), (void *)&d_error);
    assert(err == CL_SUCCESS && "Error setting kcheck argument 3");
  }
  {
    cl_uint dim = 1;
    size_t global_work_size = nelements;
    size_t local_work_size = N;
    cl_uint num_events_in_wait_list = 0;
    cl_event *event_wait_list = NULL;
#ifdef EVENT
    cl_event event;
    cl_uint err = clEnqueueNDRangeKernel(command_queue, kcheck, dim, /*global_work_offset=*/NULL, &global_work_size, &local_work_size, num_events_in_wait_list, event_wait_list, &event);
    assert(err == CL_SUCCESS && "Error running kcheck");
    err = clReleaseEvent(event);
    assert(err == CL_SUCCESS && "Error releasing event");
#else
    cl_uint err = clEnqueueNDRangeKernel(command_queue, kcheck, dim, /*global_work_offset=*/NULL, &global_work_size, &local_work_size, num_events_in_wait_list, event_wait_list, NULL);
    assert(err == CL_SUCCESS && "Error running kcheck");
#endif
  }
  {
    cl_bool blocking_read = CL_TRUE;
    cl_uint num_events_in_wait_list = 0;
    cl_event *event_wait_list = NULL;
#ifdef EVENT
    cl_event event;
    cl_uint err = clEnqueueReadBuffer(command_queue, d_error, blocking_read, /*offset=*/0, ErrorSize, &error, num_events_in_wait_list, event_wait_list, &event);
    assert(err == CL_SUCCESS && "Error reading from d_error");
    err = clReleaseEvent(event);
    assert(err == CL_SUCCESS && "Error releasing event");
#else
    cl_uint err = clEnqueueReadBuffer(command_queue, d_error, blocking_read, /*offset=*/0, ErrorSize, &error, num_events_in_wait_list, event_wait_list, NULL);
    assert(err == CL_SUCCESS && "Error reading from d_error");
#endif
  }

  if (error || print_results || check_random) {
    assert(out && "out is null");
    cl_bool blocking_read = CL_TRUE;
    cl_uint num_events_in_wait_list = 0;
    cl_event *event_wait_list = NULL;
#ifdef EVENT
    cl_event event;
    cl_uint err = clEnqueueReadBuffer(command_queue, d_out, blocking_read, /*offset=*/0, ArraySize, out, num_events_in_wait_list, event_wait_list, &event);
    assert(err == CL_SUCCESS && "Error reading from d_out");
    err = clReleaseEvent(event);
    assert(err == CL_SUCCESS && "Error releasing event");
#else
    cl_uint err = clEnqueueReadBuffer(command_queue, d_out, blocking_read, /*offset=*/0, ArraySize, out, num_events_in_wait_list, event_wait_list, NULL);
    assert(err == CL_SUCCESS && "Error reading from d_out");
#endif
  }

  if (error) {
    print_vector(v, e);
    printf("out = {%d", out[0]);
    for (unsigned i=1; i<nelements; ++i) {
      printf(", %d", out[i]);
    }
    printf("}\n");
    {
      cl_bool blocking_read = CL_TRUE;
      cl_uint num_events_in_wait_list = 0;
      cl_event *event_wait_list = NULL;
      cl_uint err = clEnqueueReadBuffer(command_queue, d_sum, blocking_read, /*offset=*/0, SumSize, sum, num_events_in_wait_list, event_wait_list, NULL);
      assert(err == CL_SUCCESS && "Error reading from d_sum");
    }
    printf("sum = {%d", sum[0]);
    for (unsigned i=1; i<ngroups; ++i) {
      printf(", %d", sum[i]);
    }
    printf("}\n");
    {
      cl_bool blocking_read = CL_TRUE;
      cl_uint num_events_in_wait_list = 0;
      cl_event *event_wait_list = NULL;
      cl_uint err = clEnqueueReadBuffer(command_queue, d_sumout, blocking_read, /*offset=*/0, SumSize, sumout, num_events_in_wait_list, event_wait_list, NULL);
      assert(err == CL_SUCCESS && "Error reading from d_sum");
    }
    printf("sumout = {%d", sumout[0]);
    for (unsigned i=1; i<ngroups; ++i) {
      printf(", %d", sumout[i]);
    }
    printf("}\n");
    printf("expected[1024] = %d\n", e[1024]);
    printf("out[1024] = %d\n", out[1024]);
  }

  if (print_results) {
    printf("out = {%d", out[0]);
    for (unsigned i=1; i<nelements; ++i) {
      printf(", %d", out[i]);
    }
    printf("}\n");
  }

  if (check_random) {
    srand(time(NULL));
    unsigned i = (rand() % (int)(nelements + 1));
    printf("random-check: out[%d] = %d\n", i, out[i]);
  }

  return error;
}

// --------------------------------------------------------------------------
// HELPERS
// --------------------------------------------------------------------------
void cleanup() {
  if (platforms)     delete[] platforms;
  if (devices)       delete[] devices;
  if (out)           delete[] out;
  if (sum)           delete[] sum;
  if (sumout)        delete[] sumout;
  if (command_queue) clReleaseCommandQueue(command_queue);
  if (context)       clReleaseContext(context);
  if (k1)            clReleaseKernel(k1);
  if (k2)            clReleaseKernel(k2);
  if (kscan1)        clReleaseKernel(kscan1);
  if (kscan2)        clReleaseKernel(kscan2);
  if (kinc1)         clReleaseKernel(kinc1);
  if (kinc2)         clReleaseKernel(kinc2);
  if (kcheck)        clReleaseKernel(kcheck);
}

// Creates `program` from the null-terminated source `program_string` in the
// global context and builds it for the global device with `extra_flags` as
// the build options. On a build failure the error code, build status and
// build log are written to stderr before the final assertion fires.
void compile_from_string(cl_program &program, const char *program_string, const char *extra_flags) {
  {
    cl_int err;
    // a NULL lengths array tells OpenCL the source is null terminated
    program = clCreateProgramWithSource(context, /*count=*/1, (const char **) &program_string, /*lengths=*/NULL, &err);
    assert(err == CL_SUCCESS && "Error creating program");
    assert(program && "Null program");
  }

  ostringstream flags;
  flags << extra_flags;
  const string build_options = flags.str();

  // no notification callback, so the build call blocks until it is done
  const cl_int builderr = clBuildProgram(program, /*ndev=*/1, &device, build_options.c_str(), /*pfn_notify=*/NULL, /*user_data=*/NULL);
  if (builderr == CL_SUCCESS) {
    return;
  }

  // build failed: report everything the runtime can tell us
  cerr << "Build error " << builderr << endl;

  cl_build_status status;
  {
    cl_int err = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_STATUS, sizeof(status), &status, NULL);
    assert(err == CL_SUCCESS && "Error getting program build status");
  }
  cerr << "Build status " << status << endl;

  // first query the log length, then fetch the log itself
  size_t size;
  {
    cl_int err = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &size);
    assert(err == CL_SUCCESS && "Error getting program build info");
  }
  vector<char> build_log(size + 1);
  {
    cl_int err = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, size, &build_log[0], NULL);
    assert(err == CL_SUCCESS && "Error getting program build log");
  }
  build_log[size] = '\0';

  cerr << "Build log" << endl;
  cerr << &build_log[0] << endl;

  assert(false && "Build error");
}

// Reads the whole file `fname` into memory and hands it to
// compile_from_string() together with `extra_flags`. With PROFILING defined
// the time spent compiling is printed to stdout.
void compile(cl_program &p, const char *fname, const char *extra_flags) {
  fstream f(fname, (fstream::in | fstream::binary));
  assert(f.is_open() && "Unable to open file");

  // the file length is the stream position after seeking to the end
  f.seekg(0, fstream::end);
  const size_t size = f.tellg();
  f.seekg(0, fstream::beg);

  // one extra byte for the terminating NUL
  vector<char> source(size + 1);
  f.read(&source[0], size);
  f.close();
  source[size] = '\0';

#ifdef PROFILING
  auto t1 = Clock::now();
#endif
  compile_from_string(p, &source[0], extra_flags);
#ifdef PROFILING
  auto t2 = Clock::now();
  millisecs_t duration(chrono::duration_cast<millisecs_t>(t2-t1));
  cout << "Time to compile " << fname << ": " << duration.count() << " milliseconds." << endl;
#endif
}

// the nth triangular number, n*(n+1)/2 (the number of op1 vectors)
//
// Exactly one of n and n+1 is even; that factor is halved before multiplying,
// so the intermediate value never exceeds the result and the function is
// exact whenever the result itself fits in 64 bits.
uint64_t triangular(uint64_t n) {
  if (n % 2 == 0) {
    return (n / 2) * (n + 1);
  }
  return n * ((n + 1) / 2);
}

// Writes the command-line synopsis to stderr. `progname` is shown as the
// executable name. The option list mirrors what main() parses, including the
// --skip-op1 / --skip-op2 and -h / --help switches.
void print_usage(char *progname) {
  std::cerr << "Usage: " << progname << " <nelements> <kernel.cl> [options]" << std::endl;
  std::cerr << "  where <nelements> must be a power of two" << std::endl;
  std::cerr << "        <kernel.cl> must be one of" << std::endl;
  std::cerr << "          o blelloch.cl" << std::endl;
  std::cerr << "          o brentkung.cl" << std::endl;
  std::cerr << "          o koggestone.cl" << std::endl;
  std::cerr << "          o sklansky.cl" << std::endl;
  std::cerr << "  -h, --help         print this message" << std::endl;
  std::cerr << "  --clinfo           print platform/device info" << std::endl;
  std::cerr << "  --force-host-scan  force host scan if multiblock scan required" << std::endl;
  std::cerr << "  --check-random     force random check at end of each kernel invocation" << std::endl;
  std::cerr << "  --print-vectors    print generated test vectors" << std::endl;
  std::cerr << "  --print-results    print results" << std::endl;
  std::cerr << "  --skip-op1         skip the op1 test vectors" << std::endl;
  std::cerr << "  --skip-op2         skip the op2 test vectors" << std::endl;
  std::cerr << "  --platform=X       use platform X (default 0)" << std::endl;
  std::cerr << "  --device=Y         use device Y (default 0)" << std::endl;
}

void clinfo() {
  stringstream ss;

  cl_uint num_platforms;
  {
    cl_int err = clGetPlatformIDs(/*num_entries=*/0, /*platforms=*/NULL, &num_platforms);
    assert(err == CL_SUCCESS && "Error getting number of platforms");
  }
  ss << "# Found " << num_platforms << " OpenCL platform" << (num_platforms == 1 ?  "":"s") << "\n";

  cl_platform_id *platforms = new cl_platform_id[num_platforms];
  {
    cl_int err = clGetPlatformIDs(num_platforms, platforms, /*num_platforms=*/NULL);
    assert(err == CL_SUCCESS && "Error getting platforms");
  }

  // query platform and devices
  char platform_name[1024];
  char platform_version[1024];
  char device_name[1024];
  char device_vendor[1024];
  cl_uint num_cores;
  cl_uint clk_freq;
  cl_long global_mem_size;
  cl_ulong local_mem_size;
  cl_ulong max_group_size;
  for (unsigned i=0; i<num_platforms; ++i) {
    {
      cl_uint err = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, sizeof(platform_name), platform_name, /*param_value_size_ret=*/NULL);
      assert(err == CL_SUCCESS && "Error getting platform name");
    }
    {
      cl_uint err = clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, sizeof(platform_version), platform_version, /*param_value_size_ret=*/NULL);
      assert(err == CL_SUCCESS && "Error getting platform version");
    }
    cl_uint num_devices;
    {
      cl_uint err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, /*num_entries=*/0, /*devices=*/NULL, &num_devices);
      assert(err == CL_SUCCESS && "Error getting number of devices");
    }
    cl_device_id *devices = new cl_device_id[num_devices];
    {
      cl_uint err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, num_devices, devices, /*num_devices=*/NULL);
      assert(err == CL_SUCCESS && "Error getting devices");
    }
    ss << "# Platform " << i << "\n";
    ss << "# Name: " << platform_name << "\n";
    ss << "# Version: " << platform_version << "\n";
    ss << "# Number of devices: " << num_devices << "\n";

    // get device list
    for (unsigned j=0; j<num_devices; ++j) {
      cl_uint err;
      err  = clGetDeviceInfo(devices[j], CL_DEVICE_NAME, sizeof(device_name), device_name, /*param_value_size_ret=*/NULL);
      err |= clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR, sizeof(device_vendor), device_vendor, /*param_value_size_ret=*/NULL);
      err |= clGetDeviceInfo(devices[j], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(num_cores), &num_cores, /*param_value_size_ret=*/NULL);
      err |= clGetDeviceInfo(devices[j], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clk_freq), &clk_freq, /*param_value_size_ret=*/NULL);
      err |= clGetDeviceInfo(devices[j], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(global_mem_size), &global_mem_size, /*param_value_size_ret=*/NULL);
      err |= clGetDeviceInfo(devices[j], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(local_mem_size), &local_mem_size, /*param_value_size_ret=*/NULL);
      err |= clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(max_group_size), &max_group_size, /*param_value_size_ret=*/NULL);
      assert(err == CL_SUCCESS && "Error getting device information");

      ss << "# Device " << j << "\n";
      ss << "# \tName: " << device_name << "\n";
      ss << "# \tVendor: " << device_vendor << "\n";
      ss << "# \tCompute units: " << num_cores << "\n";
      ss << "# \tClock frequency: " << clk_freq << " MHz\n";
      ss << "# \tGlobal memory: " << (global_mem_size>>30) << "GB\n";
      ss << "# \tLocal memory: " <<  (local_mem_size>>10) << "KB\n";
      ss << "# \tMax group size: " <<  max_group_size << "\n";
    }
    delete[] devices;
  }
  delete[] platforms;

  cout << ss.str();
}

int main(int argc, char **argv) {
  if (argc < 3) {
    print_usage(argv[0]);
    return 1;
  }

  // total number of elements (per testvector)
  nelements = atoi(argv[1]);
  if (!ispow2(nelements)) {
    cerr << "Error: nelements must be a power of two" << endl;
    return 1;
  }

  kernel = string(argv[2]);
  if (kernel != "sklansky.cl" &&
      kernel != "koggestone.cl" &&
      kernel != "brentkung.cl" &&
      kernel != "blelloch.cl") {
    cerr << "Error: kernel not unrecognised " << kernel << endl;
    return 1;
  }

  unsigned p = 0;
  unsigned d = 0;
  for (int i=3; i<argc; ++i) {
    string opt = string(argv[i]);
    string platform_prefix = "--platform=";
    string device_prefix   = "--device=";
    if (opt == "-h" || opt == "--help") {
      print_usage(argv[0]);
      return 0;
    } else if (opt == "--clinfo") {
      clinfo();
      return 0;
    } else if (opt == "--force-host-scan") {
      force_host_scan = true;
    } else if (opt == "--print-vectors") {
      print_vectors = true;
    } else if (opt == "--print-results") {
      print_results = true;
    } else if (opt == "--check-random") {
      check_random = true;
    } else if (opt == "--skip-op1") {
      skip_op1 = true;
    } else if (opt == "--skip-op2") {
      skip_op2 = true;
    } else if ((!opt.compare(0, platform_prefix.size(), platform_prefix))) {
      p = (unsigned) atoi(opt.substr(platform_prefix.size()).c_str());
    } else if ((!opt.compare(0, device_prefix.size(), device_prefix))) {
      d = (unsigned) atoi(opt.substr(device_prefix.size()).c_str());
    } else {
      cout << "Unrecognised option: " << opt << endl;
    }
  }

#ifndef _WIN32
  struct sigaction sa;
  sa.sa_handler = exit_handler;
  sigemptyset(&sa.sa_mask);
  sa.sa_flags = 0;
  sigaction(SIGINT, &sa, NULL);
#endif

  if (nelements <= MAX_GROUP_SIZE) {
    N = nelements;
    ngroups = 1;
  } else {
    N = MAX_GROUP_SIZE;
    ngroups = nelements/N;
  }
  out = new TYPE[nelements];
  sum = new TYPE[ngroups];
  sumout = new TYPE[ngroups];

  {
    cl_int err = clGetPlatformIDs(/*num_entries=*/0, /*platforms=*/NULL, &num_platforms);
    assert(err == CL_SUCCESS && "Error getting number of platforms");
  }
  assert(p < num_platforms && "Requested platform not found");

  platforms = new cl_platform_id[num_platforms];
  {
    cl_int err = clGetPlatformIDs(num_platforms, platforms, /*num_platforms=*/NULL);
    assert(err == CL_SUCCESS && "Error getting platforms");
  }
  platform = platforms[p];

  {
    cl_int err = clGetDeviceIDs(platforms[p], CL_DEVICE_TYPE_ALL, /*num_entries=*/0, /*devices=*/NULL, &num_devices);
    assert(err == CL_SUCCESS && "Error getting number of devices");
  }
  assert(d < num_devices && "Requested device not found");

  devices = new cl_device_id[num_devices];
  {
    clGetDeviceIDs(platforms[p], CL_DEVICE_TYPE_ALL, num_devices, devices, /*num_devices=*/NULL);
  }
  device = devices[d];

  {
    cl_int err;
    context = clCreateContext(/*properties=*/NULL, /*ndev=*/1, &device, /*context_error_callback=*/NULL, NULL, &err);
    assert(err == CL_SUCCESS && "Error getting context");
    assert(context && "Null context");
  }

  {
    cl_int err;
    cl_command_queue_properties properties = 0;
    command_queue = clCreateCommandQueue(context, devices[d], properties, &err);
    assert(err == CL_SUCCESS && "Error attaching command queue");
    assert(command_queue && "Null command queue");
  }

  ArraySize = nelements * sizeof(TYPE);
  SumSize = ngroups * sizeof(TYPE);
  ErrorSize = sizeof(unsigned);
  {
    cl_int err;
    cl_mem_flags flags = CL_MEM_READ_ONLY;
    d_in = clCreateBuffer(context, flags, ArraySize, /*host_ptr=*/NULL, &err);
    assert(err == CL_SUCCESS && "Error malloc-ing d_in");
  }
  {
    cl_int err;
    cl_mem_flags flags = CL_MEM_WRITE_ONLY;
    d_out = clCreateBuffer(context, flags, ArraySize, /*host_ptr=*/NULL, &err);
    assert(err == CL_SUCCESS && "Error malloc-ing d_out");
  }
  {
    cl_int err;
    cl_mem_flags flags = CL_MEM_READ_ONLY;
    d_expected = clCreateBuffer(context, flags, ArraySize, /*host_ptr=*/NULL, &err);
    assert(err == CL_SUCCESS && "Error malloc-ing d_expected");
  }
  {
    cl_int err;
    cl_mem_flags flags = CL_MEM_READ_WRITE;
    d_sum = clCreateBuffer(context, flags, SumSize, /*host_ptr=*/NULL, &err);
    assert(err == CL_SUCCESS && "Error malloc-ing d_sum");
  }
  {
    cl_int err;
    cl_mem_flags flags = CL_MEM_READ_WRITE;
    d_sumout = clCreateBuffer(context, flags, SumSize, /*host_ptr=*/NULL, &err);
    assert(err == CL_SUCCESS && "Error malloc-ing d_sumout");
  }
  {
    cl_int err;
    cl_mem_flags flags = CL_MEM_READ_WRITE;
    d_error = clCreateBuffer(context, flags, ErrorSize, /*host_ptr=*/NULL, &err);
    assert(err == CL_SUCCESS && "Error malloc-ing d_error");
  }

  cl_program program;
  {
    ostringstream oss;
    oss << "-I. -DOP1 -DNO_INVARIANTS -DNELEMENTS_PER_GROUP=" << N << " -DINNER=" << kernel;
    compile(program, "meta.cl", oss.str().c_str());
  }
  {
    cl_int err;
    k1 = clCreateKernel(program, "voigtlander_op1", &err);
    assert(err == CL_SUCCESS && "Error creating voigtlander_op1 kernel");
    kinc1 = clCreateKernel(program, "inc_op1", &err);
    assert(err == CL_SUCCESS && "Error creating inc_op1 kernel");
    kcheck = clCreateKernel(program, "check", &err);
    assert(err == CL_SUCCESS && "Error creating check kernel");
  }
  {
    cl_uint err = clReleaseProgram(program);
    assert(err == CL_SUCCESS && "Error releasing program");
  }

  {
    ostringstream oss;
    oss << "-I. -DOP2 -DNO_INVARIANTS -DNELEMENTS_PER_GROUP=" << N << " -DINNER=" << kernel;
    compile(program, "meta.cl", oss.str().c_str());
  }
  {
    cl_int err;
    k2 = clCreateKernel(program, "voigtlander_op2", &err);
    assert(err == CL_SUCCESS && "Error creating voigtlander_op1 kernel");
    kinc2 = clCreateKernel(program, "inc_op2", &err);
    assert(err == CL_SUCCESS && "Error creating inc1_op1 kernel");
  }
  {
    cl_uint err = clReleaseProgram(program);
    assert(err == CL_SUCCESS && "Error releasing program");
  }
  {
    ostringstream oss;
    oss << "-I. -DOP1 -DNO_INVARIANTS -DNELEMENTS_PER_GROUP=" << ngroups;
    compile(program, kernel.c_str(), oss.str().c_str());
  }
  {
    cl_int err;
    kscan1 = clCreateKernel(program, "prefixsum", &err);
    assert(err == CL_SUCCESS && "Error creating op1 prefixsum kernel");
  }
  {
    cl_uint err = clReleaseProgram(program);
    assert(err == CL_SUCCESS && "Error releasing program");
  }
  {
    ostringstream oss;
    oss << "-I. -DOP2 -DNO_INVARIANTS -DNELEMENTS_PER_GROUP=" << ngroups;
    compile(program, kernel.c_str(), oss.str().c_str());
  }
  {
    cl_int err;
    kscan2 = clCreateKernel(program, "prefixsum", &err);
    assert(err == CL_SUCCESS && "Error creating op2 prefixsum kernel");
  }
  {
    cl_uint err = clReleaseProgram(program);
    assert(err == CL_SUCCESS && "Error releasing program");
  }

  uint64_t num_op1_vectors = triangular((uint64_t)nelements);
  uint64_t num_op2_vectors = nelements - 1;
  cout << "Will run testvectors "
       << "(op1: " << num_op1_vectors << ") "
       << "(op2: " << num_op2_vectors << ")."
       << endl;

  unsigned error = 0;
  nvectors = 0;
  vector<TYPE> v(nelements); vector<TYPE> e(nelements);
  if (skip_op1) {
    printf("TEST SKIPPED (OP1)\n");
  } else {
    for (unsigned place_one=0; error == 0 && place_one<nelements; ++place_one) {
      for (unsigned place_two=place_one+1; error == 0 && place_two<nelements; ++place_two) {
        fill_n(v.begin(), nelements, 0);
        fill_n(e.begin(), nelements, 0);
        v[place_one] = 1;
        fill_n(v.begin() + place_two, nelements-place_two, 2);
        fill_n(e.begin() + place_one, place_two-place_one, 1);
        fill_n(e.begin() + place_two, nelements-place_two, 2);

        error |= runvector<Op1>(&v[0], &e[0], k1, kscan1, kinc1);
        ++nvectors;
      }
      if (error) break;
      {
        fill_n(v.begin(), nelements, 0);
        fill_n(e.begin(), nelements, 0);
        v[place_one] = 1;
        fill_n(e.begin() + place_one, nelements-place_one, 1);

        error |= runvector<Op1>(&v[0], &e[0], k1, kscan1, kinc1);
        ++nvectors;
      }
    }
    if (error == 0) {
      printf("TEST PASSED (OP1)\n");
    } else {
      printf("TEST FAILED (OP1)\n");
    }
    assert(error || nvectors == num_op1_vectors);
  }

  error = 0;
  if (skip_op2) {
    printf("TEST SKIPPED (OP2)\n");
  } else {
    for (unsigned place_one_two=0; error == 0 && place_one_two<nelements-1; ++place_one_two) {
      fill_n(v.begin(), nelements, 0);
      fill_n(e.begin(), nelements, 0);
      v[place_one_two] = 1;
      v[place_one_two+1] = 2;
      e[place_one_two] = 1;
      fill_n(e.begin() + place_one_two+1, nelements-place_one_two-1, 2);
      error |= runvector<Op2>(&v[0], &e[0], k2, kscan2, kinc2);
      ++nvectors;
    }
    assert(error || nvectors == num_op1_vectors + num_op2_vectors);
    if (error == 0) {
      printf("TEST PASSED (OP2)\n");
    } else {
      printf("TEST FAILED (OP2)\n");
    }
  }

  cleanup();
  return 0;
}
