#ifdef __APPLE__
  #include <OpenCL/opencl.h>
#elif __linux__
  #include <CL/cl.h>
#elif _WIN32
  #include <CL/cl.h>
#else
  #error Not sure where to find OpenCL header
#endif

#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>

#ifdef PROFILING
// Timing support; only compiled in when PROFILING is defined.
#include <chrono> //< requires c++11
// Clock sampled before and after compile_from_string() in compile().
typedef std::chrono::high_resolution_clock Clock;
// Integer millisecond duration used when reporting the compile time.
typedef std::chrono::duration<int,std::milli> millisecs_t;
#endif

// True iff x is a power of two, i.e. exactly one bit is set; zero is rejected.
// nb: x is evaluated more than once, so pass a side-effect-free expression.
#define ispow2(x) (((x) != 0) && (((x) & ((x) - 1)) == 0))
// Largest number of elements given to a single work-group (see N in main);
// also the largest ngroups for which runvector uses the device-level scan.
#define MAX_GROUP_SIZE 1024
// Scalar type of the host-side result buffers (out, sum, sumout).
#define TYPE cl_uint

using namespace std;

// --------------------------------------------------------------------------
// COMMAND-LINE OPTIONS
// --------------------------------------------------------------------------
unsigned nelements;            // total number of elements, parsed from argv[1]
bool print_results = false;    // set by --print-results
bool check_random = false;     // set by --check-random
bool force_host_scan = false;  // set by --force-host-scan
unsigned is_exclusive = 0;     // set to 1 by --is-exclusive; passed by address as argument 3 of kernel k
string kernel;                 // kernel file name, taken from argv[2]

unsigned N;        // elements per work-group: nelements, capped at MAX_GROUP_SIZE
unsigned ngroups;  // number of work-groups: nelements/N (1 when nelements <= MAX_GROUP_SIZE)
size_t ArraySize;  // byte size of d_in and d_out: nelements * sizeof(cl_uint2)
size_t SumSize;    // byte size of d_sum and d_sumout: ngroups * sizeof(cl_uint2)
size_t ErrorSize;  // byte size of d_error: sizeof(unsigned)
TYPE *out;         // host copy of d_out; 2*nelements values, allocated in main
TYPE *sum;         // host copy of d_sum; 2*ngroups values, allocated in main
TYPE *sumout;      // host copy of d_sumout; 2*ngroups values, allocated in main

// --------------------------------------------------------------------------
// OPENCL
// --------------------------------------------------------------------------
cl_uint num_platforms;      // platform count reported by clGetPlatformIDs in main
cl_platform_id *platforms;  // num_platforms ids; allocated in main, freed in cleanup
cl_uint num_devices;        // device count of the selected platform
cl_device_id *devices;      // num_devices ids; allocated in main, freed in cleanup

cl_platform_id platform;    // platforms[p], p chosen with --platform=X
cl_device_id device;        // devices[d], d chosen with --device=Y
cl_context context;         // context created on `device` only
cl_command_queue command_queue; // queue on `device`, created with properties == 0

cl_mem d_in;      // ArraySize bytes; argument 0 of kinit and of k
cl_mem d_out;     // ArraySize bytes; argument 1 of k, argument 0 of kinc and kcheck; read back into `out`
cl_mem d_sum;     // SumSize bytes; argument 2 of k, argument 0 of kscan; read back into `sum`
cl_mem d_sumout;  // SumSize bytes; argument 1 of kscan and of kinc; filled from `sumout` on the host-scan path
cl_mem d_error;   // ErrorSize bytes; argument 1 of kcheck; read back as runvector's result

cl_kernel kinit;  // "abstract_init" from meta.cl
cl_kernel k;      // "meta1" from meta.cl
cl_kernel kinc;   // "meta2" from meta.cl
cl_kernel kscan;  // "prefixsum" from the kernel file named on the command line
cl_kernel kcheck; // "abstract_check" from meta.cl

// Prints `len` pairs stored interleaved in `v` to stdout, in the form
//   name = { (v[0],v[1]), (v[2],v[3]), ... }
//
//   v    - buffer holding at least 2*len values
//   name - label printed in front of the list
//   len  - number of pairs (not the number of scalars)
//
// Values are unsigned, so they are printed with %u. An empty vector prints
// as "name = { }" without touching `v`.
void print_vector(TYPE *v, const char *name, unsigned len) {
  printf("%s = {", name);
  for (unsigned i=0; i<len; ++i) {
    printf("%s(%u,%u)", (i == 0) ? " " : ", ", v[2*i], v[2*i+1]);
  }
  printf(" }\n");
}

unsigned runvector() {

  // initialise input vector
  {
    cl_int err = clSetKernelArg(kinit, 0, sizeof(cl_mem), (void *)&d_in);
    assert(err == CL_SUCCESS && "Error setting kinit argument 0");
  }
  {
    cl_uint dim = 1;
    size_t global_work_size = nelements;
    size_t local_work_size = N;
    cl_uint num_events_in_wait_list = 0;
    cl_event *event_wait_list = NULL;
#ifdef EVENT
    cl_event event;
    cl_uint err = clEnqueueNDRangeKernel(command_queue, kinit, dim, /*global_work_offset=*/NULL, &global_work_size, &local_work_size, num_events_in_wait_list, event_wait_list, &event);
    assert(err == CL_SUCCESS && "Error running kinit");
    err = clReleaseEvent(event);
    assert(err == CL_SUCCESS && "Error releasing event");
#else
    cl_uint err = clEnqueueNDRangeKernel(command_queue, kinit, dim, /*global_work_offset=*/NULL, &global_work_size, &local_work_size, num_events_in_wait_list, event_wait_list, NULL);
    assert(err == CL_SUCCESS && "Error running kinit");
#endif
  }

  // block level scan
  {
    cl_int err = clSetKernelArg(k, 0, sizeof(cl_mem), (void *)&d_in);
    assert(err == CL_SUCCESS && "Error setting k argument 0");
  }
  {
    cl_int err = clSetKernelArg(k, 1, sizeof(cl_mem), (void *)&d_out);
    assert(err == CL_SUCCESS && "Error setting k argument 1");
  }
  {
    cl_int err = clSetKernelArg(k, 2, sizeof(cl_mem), (void *)&d_sum);
    assert(err == CL_SUCCESS && "Error setting k argument 2");
  }
  {
    cl_int err = clSetKernelArg(k, 3, sizeof(unsigned), &is_exclusive);
    assert(err == CL_SUCCESS && "Error setting k argument 3");
  }
  {
    cl_uint dim = 1;
    size_t global_work_size;
    size_t local_work_size;
    if (kernel == "sklansky.cl") {
      global_work_size = nelements/2;
      local_work_size  = N/2;
    } else if (kernel == "koggestone.cl") {
      global_work_size = nelements;
      local_work_size  = N;
    } else if (kernel == "brentkung.cl") {
      global_work_size = nelements/2;
      local_work_size  = N/2;
    } else if (kernel == "blelloch.cl") {
      global_work_size = nelements/2;
      local_work_size  = N/2;
    } else {
      assert(0);
    }
    cl_uint num_events_in_wait_list = 0;
    cl_event *event_wait_list = NULL;
#ifdef EVENT
    cl_event event;
    cl_uint err = clEnqueueNDRangeKernel(command_queue, k, dim, /*global_work_offset=*/NULL, &global_work_size, &local_work_size, num_events_in_wait_list, event_wait_list, &event);
    assert(err == CL_SUCCESS && "Error running k");
    err = clReleaseEvent(event);
    assert(err == CL_SUCCESS && "Error releasing event");
#else
    cl_uint err = clEnqueueNDRangeKernel(command_queue, k, dim, /*global_work_offset=*/NULL, &global_work_size, &local_work_size, num_events_in_wait_list, event_wait_list, NULL);
    assert(err == CL_SUCCESS && "Error running k");
#endif
  }

  // nb: if ngroups > MAX_GROUP_SIZE then the test will use host scan
  if (1 < ngroups) {
    if (ngroups <= MAX_GROUP_SIZE && !force_host_scan) {
      // device level exclusive scan
      {
        cl_int err = clSetKernelArg(kscan, 0, sizeof(cl_mem), (void *)&d_sum);
        assert(err == CL_SUCCESS && "Error setting kscan argument 0");
      }
      {
        cl_int err = clSetKernelArg(kscan, 1, sizeof(cl_mem), (void *)&d_sumout);
        assert(err == CL_SUCCESS && "Error setting kscan argument 1");
      }
      {
        unsigned kscan_is_exclusive = 1;
        cl_int err = clSetKernelArg(kscan, 2, sizeof(unsigned), &kscan_is_exclusive);
        assert(err == CL_SUCCESS && "Error setting kscan argument 1");
      }
      {
        cl_uint dim = 1;
        size_t global_work_size;
        size_t local_work_size;
        if (kernel == "sklansky.cl") {
          global_work_size = ngroups/2;
          local_work_size  = ngroups/2;
        } else if (kernel == "koggestone.cl") {
          global_work_size = ngroups;
          local_work_size  = ngroups;
        } else if (kernel == "brentkung.cl") {
          global_work_size = ngroups/2;
          local_work_size  = ngroups/2;
        } else if (kernel == "blelloch.cl") {
          global_work_size = ngroups/2;
          local_work_size  = ngroups/2;
        } else {
          assert(false && "Unreachable");
        }
        cl_uint num_events_in_wait_list = 0;
        cl_event *event_wait_list = NULL;
#ifdef EVENT
        cl_event event;
        cl_uint err = clEnqueueNDRangeKernel(command_queue, kscan, dim, /*global_work_offset=*/NULL, &global_work_size, &local_work_size, num_events_in_wait_list, event_wait_list, &event);
        assert(err == CL_SUCCESS && "Error running kscan");
        err = clReleaseEvent(event);
        assert(err == CL_SUCCESS && "Error releasing event");
#else
        cl_uint err = clEnqueueNDRangeKernel(command_queue, kscan, dim, /*global_work_offset=*/NULL, &global_work_size, &local_work_size, num_events_in_wait_list, event_wait_list, NULL);
        assert(err == CL_SUCCESS && "Error running kscan");
#endif
      }
    } else {
      // host level exclusive scan
      assert(sum && "sum is null");
      assert(sumout && "sumout is null");
      {
        cl_bool blocking_read = CL_TRUE;
        cl_uint num_events_in_wait_list = 0;
        cl_event *event_wait_list = NULL;
        cl_uint err = clEnqueueReadBuffer(command_queue, d_sum, blocking_read, /*offset=*/0, SumSize, sum, num_events_in_wait_list, event_wait_list, NULL);
        assert(err == CL_SUCCESS && "Error reading from d_sum");
      }
      sumout[0] = 1; sumout[1] = 0; // IDENTITY
      for (unsigned i=1; i<ngroups; ++i) {
        cl_uint pre_lo = sumout[2*(i-1)];
        cl_uint pre_hi = sumout[2*(i-1)+1];
        cl_uint sum_lo = sum[2*(i-1)];
        cl_uint sum_hi = sum[2*(i-1)+1];
        // Op(pre, sum)
        sumout[2*i] = (pre_lo == 1 && pre_hi == 0) ? sum_lo  // pre is identity
                    : (sum_lo == 1 && sum_hi == 0) ? pre_lo  // sum is identity
                    : (pre_lo == 2 && pre_hi == 0) ? 2       // pre is top
                    : (sum_lo == 2 && pre_hi == 0) ? 2       // sum is top
                    : (pre_hi != sum_lo)           ? 2       // not-kissing
                    :                                pre_lo; // kissing
        sumout[2*i+1] = (pre_lo == 1 && pre_hi == 0) ? sum_hi
                    : (sum_lo == 1 && sum_hi == 0) ? pre_hi
                    : (pre_lo == 2 && pre_hi == 0) ? 0
                    : (sum_lo == 2 && pre_hi == 0) ? 0
                    : (pre_hi != sum_lo)           ? 0
                    :                                sum_hi;
      }
      {
        cl_bool blocking_write = CL_TRUE;
        cl_uint num_events_in_wait_list = 0;
        cl_event *event_wait_list = NULL;
        cl_uint err = clEnqueueWriteBuffer(command_queue, d_sumout, blocking_write, /*offset=*/0, SumSize, sumout, num_events_in_wait_list, event_wait_list, NULL);
        assert(err == CL_SUCCESS && "Error writing to d_sumout");
      }
    }

    // final increment
    {
      cl_int err = clSetKernelArg(kinc, 0, sizeof(cl_mem), (void *)&d_out);
      assert(err == CL_SUCCESS && "Error setting kinc argument 0");
    }
    {
      cl_int err = clSetKernelArg(kinc, 1, sizeof(cl_mem), (void *)&d_sumout);
      assert(err == CL_SUCCESS && "Error setting kinc argument 1");
    }
    {
      cl_uint dim = 1;
      size_t global_work_size = nelements;
      size_t local_work_size = N;
      cl_uint num_events_in_wait_list = 0;
      cl_event *event_wait_list = NULL;
#ifdef EVENT
      cl_event event;
      cl_uint err = clEnqueueNDRangeKernel(command_queue, kinc, dim, /*global_work_offset=*/NULL, &global_work_size, &local_work_size, num_events_in_wait_list, event_wait_list, &event);
      assert(err == CL_SUCCESS && "Error running kinc");
      err = clReleaseEvent(event);
      assert(err == CL_SUCCESS && "Error releasing event");
#else
      cl_uint err = clEnqueueNDRangeKernel(command_queue, kinc, dim, /*global_work_offset=*/NULL, &global_work_size, &local_work_size, num_events_in_wait_list, event_wait_list, NULL);
      assert(err == CL_SUCCESS && "Error running kinc");
#endif
    }
  }

  // check results
  unsigned error = 0;
  {
    cl_bool blocking_write = CL_TRUE;
    cl_uint num_events_in_wait_list = 0;
    cl_event *event_wait_list = NULL;
#ifdef EVENT
    cl_event event;
    cl_uint err = clEnqueueWriteBuffer(command_queue, d_error, blocking_write, /*offset=*/0, ErrorSize, &error, num_events_in_wait_list, event_wait_list, &event);
    assert(err == CL_SUCCESS && "Error writing to d_error");
    err = clReleaseEvent(event);
    assert(err == CL_SUCCESS && "Error releasing event");
#else
    cl_uint err = clEnqueueWriteBuffer(command_queue, d_error, blocking_write, /*offset=*/0, ErrorSize, &error, num_events_in_wait_list, event_wait_list, NULL);
    assert(err == CL_SUCCESS && "Error writing to d_error");
#endif
  }
  {
    cl_int err = clSetKernelArg(kcheck, 0, sizeof(cl_mem), (void *)&d_out);
    assert(err == CL_SUCCESS && "Error setting kcheck argument 0");
  }
  {
    cl_int err = clSetKernelArg(kcheck, 1, sizeof(cl_mem), (void *)&d_error);
    assert(err == CL_SUCCESS && "Error setting kcheck argument 1");
  }
  {
    cl_uint dim = 1;
    size_t global_work_size = nelements;
    size_t local_work_size = N;
    cl_uint num_events_in_wait_list = 0;
    cl_event *event_wait_list = NULL;
#ifdef EVENT
    cl_event event;
    cl_uint err = clEnqueueNDRangeKernel(command_queue, kcheck, dim, /*global_work_offset=*/NULL, &global_work_size, &local_work_size, num_events_in_wait_list, event_wait_list, &event);
    assert(err == CL_SUCCESS && "Error running kcheck");
    err = clReleaseEvent(event);
    assert(err == CL_SUCCESS && "Error releasing event");
#else
    cl_uint err = clEnqueueNDRangeKernel(command_queue, kcheck, dim, /*global_work_offset=*/NULL, &global_work_size, &local_work_size, num_events_in_wait_list, event_wait_list, NULL);
    assert(err == CL_SUCCESS && "Error running kcheck");
#endif
  }
  {
    cl_bool blocking_read = CL_TRUE;
    cl_uint num_events_in_wait_list = 0;
    cl_event *event_wait_list = NULL;
#ifdef EVENT
    cl_event event;
    cl_uint err = clEnqueueReadBuffer(command_queue, d_error, blocking_read, /*offset=*/0, ErrorSize, &error, num_events_in_wait_list, event_wait_list, &event);
    assert(err == CL_SUCCESS && "Error reading from d_error");
    err = clReleaseEvent(event);
    assert(err == CL_SUCCESS && "Error releasing event");
#else
    cl_uint err = clEnqueueReadBuffer(command_queue, d_error, blocking_read, /*offset=*/0, ErrorSize, &error, num_events_in_wait_list, event_wait_list, NULL);
    assert(err == CL_SUCCESS && "Error reading from d_error");
#endif
  }

  if (error || print_results || check_random) {
    assert(out && "out is null");
    cl_bool blocking_read = CL_TRUE;
    cl_uint num_events_in_wait_list = 0;
    cl_event *event_wait_list = NULL;
#ifdef EVENT
    cl_event event;
    cl_uint err = clEnqueueReadBuffer(command_queue, d_out, blocking_read, /*offset=*/0, ArraySize, out, num_events_in_wait_list, event_wait_list, &event);
    assert(err == CL_SUCCESS && "Error reading from d_out");
    err = clReleaseEvent(event);
    assert(err == CL_SUCCESS && "Error releasing event");
#else
    cl_uint err = clEnqueueReadBuffer(command_queue, d_out, blocking_read, /*offset=*/0, ArraySize, out, num_events_in_wait_list, event_wait_list, NULL);
    assert(err == CL_SUCCESS && "Error reading from d_out");
#endif
  }

  if (error) {
    assert(sum && "sum is null");
    assert(sumout && "sumout is null");
    {
      cl_bool blocking_read = CL_TRUE;
      cl_uint num_events_in_wait_list = 0;
      cl_event *event_wait_list = NULL;
      cl_uint err = clEnqueueReadBuffer(command_queue, d_sum, blocking_read, /*offset=*/0, SumSize, sum, num_events_in_wait_list, event_wait_list, NULL);
      assert(err == CL_SUCCESS && "Error reading from d_sum");
    }
    {
      cl_bool blocking_read = CL_TRUE;
      cl_uint num_events_in_wait_list = 0;
      cl_event *event_wait_list = NULL;
      cl_uint err = clEnqueueReadBuffer(command_queue, d_sumout, blocking_read, /*offset=*/0, SumSize, sumout, num_events_in_wait_list, event_wait_list, NULL);
      assert(err == CL_SUCCESS && "Error reading from d_sum");
    }
    print_vector(out, "out", nelements);
    print_vector(sum, "sum", ngroups);
    print_vector(sumout, "sumout", ngroups);
  }

  if (print_results) {
    print_vector(out, "out", nelements);
  }

  if (check_random) {
    srand(time(NULL));
    unsigned i = (rand() % (int)(nelements + 1));
    printf("random-check: out[%d] = (%d,%d)\n", i, out[2*i], out[2*i+1]);
  }

  return error;
}

// --------------------------------------------------------------------------
// HELPERS
// --------------------------------------------------------------------------
void cleanup() {
  if (platforms)     delete[] platforms;
  if (devices)       delete[] devices;
  if (out)           delete[] out;
  if (sum)           delete[] sum;
  if (sumout)        delete[] sumout;
  if (command_queue) clReleaseCommandQueue(command_queue);
  if (context)       clReleaseContext(context);
  if (kinit)         clReleaseKernel(kinit);
  if (k)             clReleaseKernel(k);
  if (kscan)         clReleaseKernel(kscan);
  if (kinc)          clReleaseKernel(kinc);
  if (kcheck)        clReleaseKernel(kcheck);
}

// Creates `program` from a single null-terminated source string and builds it
// for the global `device`, passing `extra_flags` as the build options.
//
//   program        - receives the created program object
//   program_string - null-terminated OpenCL source
//   extra_flags    - build options handed to clBuildProgram
//
// If the build fails, the build error code, build status and build log are
// written to stderr and an assertion fires.
void compile_from_string(cl_program &program, const char *program_string, const char *extra_flags) {
  cl_int err;

  // lengths=NULL tells OpenCL that program_string is null terminated
  program = clCreateProgramWithSource(context, /*count=*/1, (const char **) &program_string, /*lengths=*/NULL, &err);
  assert(err == CL_SUCCESS && "Error creating program");
  assert(program && "Null program");

  ostringstream options;
  options << extra_flags;

  // pfn_notify=NULL makes the build call blocking
  const cl_int build_result = clBuildProgram(program, /*ndev=*/1, &device, options.str().c_str(), /*pfn_notify=*/NULL, /*user_data=*/NULL);
  if (build_result == CL_SUCCESS) {
    return;
  }

  // the build failed: report the error code, the status and the log
  cerr << "Build error " << build_result << endl;

  cl_build_status status;
  err = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_STATUS, sizeof(status), &status, NULL);
  assert(err == CL_SUCCESS && "Error getting program build status");
  cerr << "Build status " << status << endl;

  // first query the size of the log, then fetch the log itself
  size_t log_bytes;
  err = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_bytes);
  assert(err == CL_SUCCESS && "Error getting program build info");

  char *log_text = new char[log_bytes+1];
  err = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_bytes, log_text, NULL);
  assert(err == CL_SUCCESS && "Error getting program build log");
  log_text[log_bytes] = '\0';

  cerr << "Build log" << endl;
  cerr << log_text << endl;
  delete[] log_text;

  assert(false && "Build error");
}

// Reads the OpenCL source file `fname` and builds it into `p` with
// compile_from_string().
//
//   p           - receives the built program
//   fname       - path of the source file (opened in binary mode)
//   extra_flags - build options forwarded to compile_from_string()
//
// If the file cannot be opened, an error is printed to stderr and the program
// stops (assertion in debug builds, exit(EXIT_FAILURE) otherwise). In
// PROFILING builds the time spent in compile_from_string() is printed to
// stdout.
void compile(cl_program &p, const char *fname, const char *extra_flags) {
  fstream f(fname, (fstream::in | fstream::binary));
  if (!f.is_open()) {
    cerr << "Error: unable to open file " << fname << endl;
    assert(false && "Unable to open file");
    exit(EXIT_FAILURE);
  }

  // slurp the whole file; the string owns the buffer and keeps it
  // null terminated for compile_from_string()
  ostringstream contents;
  contents << f.rdbuf();
  f.close();
  const string source = contents.str();

#ifdef PROFILING
  auto t1 = Clock::now();
#endif
  compile_from_string(p, source.c_str(), extra_flags);
#ifdef PROFILING
  auto t2 = Clock::now();
  millisecs_t duration(chrono::duration_cast<millisecs_t>(t2-t1));
  cout << "Time to compile " << fname << ": " << duration.count() << " milliseconds." << endl;
#endif
}

// Writes the command-line help text to stderr.
//   progname - name shown in the "Usage:" line (argv[0] in main)
void print_usage(char *progname) {
  // everything after the "Usage:" line, one entry per output line
  static const char *const help_lines[] = {
    "  where <nelements> must be a power of two",
    "        <kernel.cl> must be one of",
    "          o blelloch.cl",
    "          o brentkung.cl",
    "          o koggestone.cl",
    "          o sklansky.cl",
    "  --clinfo           print platform/device info",
    "  --force-host-scan  force host scan if multiblock scan required",
    "  --check-random     force random check at end of each kernel invocation",
    "  --print-results    print results",
    "  --is-exclusive     perform exclusive scan",
    "  --platform=X       use platform X (default 0)",
    "  --device=Y         use device Y (default 0)"
  };
  const size_t nlines = sizeof(help_lines) / sizeof(help_lines[0]);

  cerr << "Usage: " << progname << " <nelements> <kernel.cl> [options]" << endl;
  for (size_t line = 0; line < nlines; ++line) {
    cerr << help_lines[line] << endl;
  }
}

void clinfo() {
  stringstream ss;

  cl_uint num_platforms;
  {
    cl_int err = clGetPlatformIDs(/*num_entries=*/0, /*platforms=*/NULL, &num_platforms);
    assert(err == CL_SUCCESS && "Error getting number of platforms");
  }
  ss << "# Found " << num_platforms << " OpenCL platform" << (num_platforms == 1 ?  "":"s") << "\n";

  cl_platform_id *platforms = new cl_platform_id[num_platforms];
  {
    cl_int err = clGetPlatformIDs(num_platforms, platforms, /*num_platforms=*/NULL);
    assert(err == CL_SUCCESS && "Error getting platforms");
  }

  // query platform and devices
  char platform_name[1024];
  char platform_version[1024];
  char device_name[1024];
  char device_vendor[1024];
  cl_uint num_cores;
  cl_uint clk_freq;
  cl_long global_mem_size;
  cl_ulong local_mem_size;
  cl_ulong max_group_size;
  for (unsigned i=0; i<num_platforms; ++i) {
    {
      cl_uint err = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, sizeof(platform_name), platform_name, /*param_value_size_ret=*/NULL);
      assert(err == CL_SUCCESS && "Error getting platform name");
    }
    {
      cl_uint err = clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, sizeof(platform_version), platform_version, /*param_value_size_ret=*/NULL);
      assert(err == CL_SUCCESS && "Error getting platform version");
    }
    cl_uint num_devices;
    {
      cl_uint err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, /*num_entries=*/0, /*devices=*/NULL, &num_devices);
      assert(err == CL_SUCCESS && "Error getting number of devices");
    }
    cl_device_id *devices = new cl_device_id[num_devices];
    {
      cl_uint err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, num_devices, devices, /*num_devices=*/NULL);
      assert(err == CL_SUCCESS && "Error getting devices");
    }
    ss << "# Platform " << i << "\n";
    ss << "# Name: " << platform_name << "\n";
    ss << "# Version: " << platform_version << "\n";
    ss << "# Number of devices: " << num_devices << "\n";

    // get device list
    for (unsigned j=0; j<num_devices; ++j) {
      cl_uint err;
      err  = clGetDeviceInfo(devices[j], CL_DEVICE_NAME, sizeof(device_name), device_name, /*param_value_size_ret=*/NULL);
      err |= clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR, sizeof(device_vendor), device_vendor, /*param_value_size_ret=*/NULL);
      err |= clGetDeviceInfo(devices[j], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(num_cores), &num_cores, /*param_value_size_ret=*/NULL);
      err |= clGetDeviceInfo(devices[j], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clk_freq), &clk_freq, /*param_value_size_ret=*/NULL);
      err |= clGetDeviceInfo(devices[j], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(global_mem_size), &global_mem_size, /*param_value_size_ret=*/NULL);
      err |= clGetDeviceInfo(devices[j], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(local_mem_size), &local_mem_size, /*param_value_size_ret=*/NULL);
      err |= clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(max_group_size), &max_group_size, /*param_value_size_ret=*/NULL);
      assert(err == CL_SUCCESS && "Error getting device information");

      ss << "# Device " << j << "\n";
      ss << "# \tName: " << device_name << "\n";
      ss << "# \tVendor: " << device_vendor << "\n";
      ss << "# \tCompute units: " << num_cores << "\n";
      ss << "# \tClock frequency: " << clk_freq << " MHz\n";
      ss << "# \tGlobal memory: " << (global_mem_size>>30) << "GB\n";
      ss << "# \tLocal memory: " <<  (local_mem_size>>10) << "KB\n";
      ss << "# \tMax group size: " <<  max_group_size << "\n";
    }
    delete[] devices;
  }
  delete[] platforms;

  cout << ss.str();
}

int main(int argc, char **argv) {
  if (argc < 3) {
    print_usage(argv[0]);
    return 1;
  }

  // total number of elements (per testvector)
  nelements = atoi(argv[1]);
  if (!ispow2(nelements)) {
    cerr << "Error: nelements must be a power of two" << endl;
    return 1;
  }

  kernel = string(argv[2]);
  if (kernel != "sklansky.cl" &&
      kernel != "koggestone.cl" &&
      kernel != "brentkung.cl" &&
      kernel != "blelloch.cl") {
    cerr << "Error: kernel not unrecognised " << kernel << endl;
    return 1;
  }

  unsigned p = 0;
  unsigned d = 0;
  for (int i=3; i<argc; ++i) {
    string opt = string(argv[i]);
    string platform_prefix = "--platform=";
    string device_prefix   = "--device=";
    if (opt == "-h" || opt == "--help") {
      print_usage(argv[0]);
      return 0;
    } else if (opt == "--clinfo") {
      clinfo();
      return 0;
    } else if (opt == "--is-exclusive") {
      is_exclusive = 1;
    } else if (opt == "--force-host-scan") {
      force_host_scan = true;
    } else if (opt == "--print-results") {
      print_results = true;
    } else if (opt == "--check-random") {
      check_random = true;
    } else if ((!opt.compare(0, platform_prefix.size(), platform_prefix))) {
      p = (unsigned) atoi(opt.substr(platform_prefix.size()).c_str());
    } else if ((!opt.compare(0, device_prefix.size(), device_prefix))) {
      d = (unsigned) atoi(opt.substr(device_prefix.size()).c_str());
    } else {
      cout << "Unrecognised option: " << opt << endl;
    }
  }

  if (nelements <= MAX_GROUP_SIZE) {
    N = nelements;
    ngroups = 1;
  } else {
    N = MAX_GROUP_SIZE;
    ngroups = nelements/N;
  }
  out = new TYPE[2*nelements];
  sum = new TYPE[2*ngroups];
  sumout = new TYPE[2*ngroups];

  {
    cl_int err = clGetPlatformIDs(/*num_entries=*/0, /*platforms=*/NULL, &num_platforms);
    assert(err == CL_SUCCESS && "Error getting number of platforms");
  }
  assert(p < num_platforms && "Requested platform not found");

  platforms = new cl_platform_id[num_platforms];
  {
    cl_int err = clGetPlatformIDs(num_platforms, platforms, /*num_platforms=*/NULL);
    assert(err == CL_SUCCESS && "Error getting platforms");
  }
  platform = platforms[p];

  {
    cl_int err = clGetDeviceIDs(platforms[p], CL_DEVICE_TYPE_ALL, /*num_entries=*/0, /*devices=*/NULL, &num_devices);
    assert(err == CL_SUCCESS && "Error getting number of devices");
  }
  assert(d < num_devices && "Requested device not found");

  devices = new cl_device_id[num_devices];
  {
    clGetDeviceIDs(platforms[p], CL_DEVICE_TYPE_ALL, num_devices, devices, /*num_devices=*/NULL);
  }
  device = devices[d];

  {
    cl_int err;
    context = clCreateContext(/*properties=*/NULL, /*ndev=*/1, &device, /*context_error_callback=*/NULL, NULL, &err);
    assert(err == CL_SUCCESS && "Error getting context");
    assert(context && "Null context");
  }

  {
    cl_int err;
    cl_command_queue_properties properties = 0;
    command_queue = clCreateCommandQueue(context, devices[d], properties, &err);
    assert(err == CL_SUCCESS && "Error attaching command queue");
    assert(command_queue && "Null command queue");
  }

  ArraySize = nelements * sizeof(cl_uint2);
  SumSize = ngroups * sizeof(cl_uint2);
  ErrorSize = sizeof(unsigned);
  {
    cl_int err;
    cl_mem_flags flags = CL_MEM_READ_WRITE;
    d_in = clCreateBuffer(context, flags, ArraySize, /*host_ptr=*/NULL, &err);
    assert(err == CL_SUCCESS && "Error malloc-ing d_in");
  }
  {
    cl_int err;
    cl_mem_flags flags = CL_MEM_WRITE_ONLY;
    d_out = clCreateBuffer(context, flags, ArraySize, /*host_ptr=*/NULL, &err);
    assert(err == CL_SUCCESS && "Error malloc-ing d_out");
  }
  {
    cl_int err;
    cl_mem_flags flags = CL_MEM_READ_WRITE;
    d_sum = clCreateBuffer(context, flags, SumSize, /*host_ptr=*/NULL, &err);
    assert(err == CL_SUCCESS && "Error malloc-ing d_sum");
  }
  {
    cl_int err;
    cl_mem_flags flags = CL_MEM_READ_WRITE;
    d_sumout = clCreateBuffer(context, flags, SumSize, /*host_ptr=*/NULL, &err);
    assert(err == CL_SUCCESS && "Error malloc-ing d_sumout");
  }
  {
    cl_int err;
    cl_mem_flags flags = CL_MEM_READ_WRITE;
    d_error = clCreateBuffer(context, flags, ErrorSize, /*host_ptr=*/NULL, &err);
    assert(err == CL_SUCCESS && "Error malloc-ing d_error");
  }

  cl_program program;
  {
    ostringstream oss;
    oss << "-I. -DABSTRACT -DNO_INVARIANTS -DNELEMENTS_PER_GROUP=" << N << " -DINNER=" << kernel;
    if (is_exclusive) {
      oss << " -DEXCLUSIVE";
    }
    compile(program, "meta.cl", oss.str().c_str());
  }
  {
    cl_int err;
    kinit = clCreateKernel(program, "abstract_init", &err);
    assert(err == CL_SUCCESS && "Error creating kinit kernel");
    k = clCreateKernel(program, "meta1", &err);
    assert(err == CL_SUCCESS && "Error creating k kernel");
    kinc = clCreateKernel(program, "meta2", &err);
    assert(err == CL_SUCCESS && "Error creating kinc kernel");
    kcheck = clCreateKernel(program, "abstract_check", &err);
    assert(err == CL_SUCCESS && "Error creating check kernel");
  }
  {
    cl_uint err = clReleaseProgram(program);
    assert(err == CL_SUCCESS && "Error releasing program");
  }

  {
    ostringstream oss;
    oss << "-I. -DABSTRACT -DNO_INVARIANTS -DNELEMENTS_PER_GROUP=" << ngroups;
    compile(program, kernel.c_str(), oss.str().c_str());
  }
  {
    cl_int err;
    kscan = clCreateKernel(program, "prefixsum", &err);
    assert(err == CL_SUCCESS && "Error creating kscan kernel");
  }
  {
    cl_uint err = clReleaseProgram(program);
    assert(err == CL_SUCCESS && "Error releasing program");
  }

  unsigned error = runvector();
  if (!error) {
    printf("TEST PASSED (%s)\n", is_exclusive ? "EXCL" : "INCL");
  } else {
    printf("TEST FAILED\n");
  }
  cleanup();
  return error;
}
