//---------------------------------------------------------------------------//
// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//

#include <algorithm>
#include <cstdlib>   // rand(), RAND_MAX
#include <iostream>
#include <limits>    // std::numeric_limits
#include <numeric>
#include <string>
#include <vector>

#include <boost/program_options.hpp>
#include <boost/shared_ptr.hpp>

#include <boost/compute/system.hpp>
#include <boost/compute/algorithm/accumulate.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/detail/parameter_cache.hpp>

#include "perf.hpp"

namespace po = boost::program_options;
namespace compute = boost::compute;

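// Returns a pseudo-random int in the range 0..25; used to fill the host
// vector that is copied to the device before benchmarking.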
int rand_int()
{
    return static_cast<int>((rand() / double(RAND_MAX)) * 25.0);
}

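// Times compute::accumulate() over `data` for `trials` runs and returns the
// fastest run (perf_timer reports nanoseconds; main() converts to ms).
// queue.finish() ensures each measurement covers the full device execution.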
template<class T>
double perf_accumulate(const compute::vector<T>& data,
                       const size_t trials,
                       compute::command_queue& queue)
{
    perf_timer t;
    for(size_t trial = 0; trial < trials; trial++){
        t.start();
        compute::accumulate(data.begin(), data.end(), T(0), queue);
        queue.finish();
        t.stop();
    }
    return t.min_time();
}

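// Exhaustively searches the "tpb" / "vpt" kernel parameters (work-group size
// and values processed per thread in Boost.Compute's GPU reduction path),
// timing each combination with perf_accumulate() and keeping the fastest.
// The winning pair is written back to the device's global parameter cache so
// subsequent reductions can pick it up.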
template<class T>
void tune_accumulate(const compute::vector<T>& data,
                     const size_t trials,
                     compute::command_queue& queue)
{
    boost::shared_ptr<compute::detail::parameter_cache>
        params = compute::detail::parameter_cache::get_global_cache(queue.get_device());

    const std::string cache_key =
        std::string("__boost_reduce_on_gpu_") + compute::type_name<T>();

    const compute::uint_ tpbs[] = { 4, 8, 16, 32, 64, 128, 256, 512, 1024 };
    const compute::uint_ vpts[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 };

    double min_time = (std::numeric_limits<double>::max)();
    compute::uint_ best_tpb = 0;
    compute::uint_ best_vpt = 0;

    for(size_t i = 0; i < sizeof(tpbs) / sizeof(*tpbs); i++){
        params->set(cache_key, "tpb", tpbs[i]);
        for(size_t j = 0; j < sizeof(vpts) / sizeof(*vpts); j++){
            params->set(cache_key, "vpt", vpts[j]);

            try {
                const double t = perf_accumulate(data, trials, queue);
                if(t < min_time){
                    best_tpb = tpbs[i];
                    best_vpt = vpts[j];
                    min_time = t;
                }
            }
            catch(compute::opencl_error&){
                // invalid parameters for this device, skip
            }
        }
    }

    // store optimal parameters
    params->set(cache_key, "tpb", best_tpb);
    params->set(cache_key, "vpt", best_vpt);
}

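// Benchmark driver: --size sets the input size (also accepted positionally),
// --trials the number of timed runs, and --tune runs tune_accumulate()
// before the timed benchmark.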
int main(int argc, char *argv[])
{
    // setup command line arguments
    po::options_description options("options");
    options.add_options()
        ("help", "show usage instructions")
        ("size", po::value<size_t>()->default_value(8192), "input size")
        ("trials", po::value<size_t>()->default_value(3), "number of trials to run")
        ("tune", "run tuning procedure")
    ;
    po::positional_options_description positional_options;
    positional_options.add("size", 1);

    // parse command line
    po::variables_map vm;
    po::store(
        po::command_line_parser(argc, argv)
            .options(options).positional(positional_options).run(),
        vm
    );
    po::notify(vm);

    // print usage instructions if requested
    if(vm.count("help")){
        std::cout << options << std::endl;
        return 0;
    }

    const size_t size = vm["size"].as<size_t>();
    const size_t trials = vm["trials"].as<size_t>();
    std::cout << "size: " << size << std::endl;

    // setup context and queue for the default device
    compute::device device = compute::system::default_device();
    compute::context context(device);
    compute::command_queue queue(context, device);
    std::cout << "device: " << device.name() << std::endl;

    // create vector of random numbers on the host
    std::vector<int> host_data(size);
    std::generate(host_data.begin(), host_data.end(), rand_int);

    // create vector on the device and copy the data
    compute::vector<int> device_data(
        host_data.begin(), host_data.end(), queue
    );

    // run tuning procedure (if requested)
    if(vm.count("tune")){
        tune_accumulate(device_data, trials, queue);
    }

    // run benchmark
    double t = perf_accumulate(device_data, trials, queue);
    std::cout << "time: " << t / 1e6 << " ms" << std::endl;

    return 0;
}