// perf_accumulate.cpp

  1. //---------------------------------------------------------------------------//
  2. // Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
  3. //
  4. // Distributed under the Boost Software License, Version 1.0
  5. // See accompanying file LICENSE_1_0.txt or copy at
  6. // http://www.boost.org/LICENSE_1_0.txt
  7. //
  8. // See http://boostorg.github.com/compute for more information.
  9. //---------------------------------------------------------------------------//
  10. #include <algorithm>
  11. #include <iostream>
  12. #include <numeric>
  13. #include <vector>
  14. #include <boost/program_options.hpp>
  15. #include <boost/compute/system.hpp>
  16. #include <boost/compute/algorithm/accumulate.hpp>
  17. #include <boost/compute/container/vector.hpp>
  18. #include "perf.hpp"
  19. namespace po = boost::program_options;
  20. namespace compute = boost::compute;
  // Produces one pseudo-random integer used to fill the benchmark input.
  //
  // rand() is turned into a fraction of RAND_MAX, scaled by 25.0 and truncated
  // towards zero, so results lie in 0..25 (25 only when rand() == RAND_MAX).
  // The generator is whatever state rand() currently has; no seeding is done
  // here.
  int rand_int()
  {
      const double fraction = rand() / static_cast<double>(RAND_MAX);
      const double scaled = fraction * 25.0;
      return static_cast<int>(scaled);
  }
  25. template<class T>
  26. double perf_accumulate(const compute::vector<T>& data,
  27. const size_t trials,
  28. compute::command_queue& queue)
  29. {
  30. perf_timer t;
  31. for(size_t trial = 0; trial < trials; trial++){
  32. t.start();
  33. compute::accumulate(data.begin(), data.end(), T(0), queue);
  34. queue.finish();
  35. t.stop();
  36. }
  37. return t.min_time();
  38. }
  39. template<class T>
  40. void tune_accumulate(const compute::vector<T>& data,
  41. const size_t trials,
  42. compute::command_queue& queue)
  43. {
  44. boost::shared_ptr<compute::detail::parameter_cache>
  45. params = compute::detail::parameter_cache::get_global_cache(queue.get_device());
  46. const std::string cache_key =
  47. std::string("__boost_reduce_on_gpu_") + compute::type_name<T>();
  48. const compute::uint_ tpbs[] = { 4, 8, 16, 32, 64, 128, 256, 512, 1024 };
  49. const compute::uint_ vpts[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 };
  50. double min_time = (std::numeric_limits<double>::max)();
  51. compute::uint_ best_tpb = 0;
  52. compute::uint_ best_vpt = 0;
  53. for(size_t i = 0; i < sizeof(tpbs) / sizeof(*tpbs); i++){
  54. params->set(cache_key, "tpb", tpbs[i]);
  55. for(size_t j = 0; j < sizeof(vpts) / sizeof(*vpts); j++){
  56. params->set(cache_key, "vpt", vpts[j]);
  57. try {
  58. const double t = perf_accumulate(data, trials, queue);
  59. if(t < min_time){
  60. best_tpb = tpbs[i];
  61. best_vpt = vpts[j];
  62. min_time = t;
  63. }
  64. }
  65. catch(compute::opencl_error&){
  66. // invalid parameters for this device, skip
  67. }
  68. }
  69. }
  70. // store optimal parameters
  71. params->set(cache_key, "tpb", best_tpb);
  72. params->set(cache_key, "vpt", best_vpt);
  73. }
  74. int main(int argc, char *argv[])
  75. {
  76. // setup command line arguments
  77. po::options_description options("options");
  78. options.add_options()
  79. ("help", "show usage instructions")
  80. ("size", po::value<size_t>()->default_value(8192), "input size")
  81. ("trials", po::value<size_t>()->default_value(3), "number of trials to run")
  82. ("tune", "run tuning procedure")
  83. ;
  84. po::positional_options_description positional_options;
  85. positional_options.add("size", 1);
  86. // parse command line
  87. po::variables_map vm;
  88. po::store(
  89. po::command_line_parser(argc, argv)
  90. .options(options).positional(positional_options).run(),
  91. vm
  92. );
  93. po::notify(vm);
  94. const size_t size = vm["size"].as<size_t>();
  95. const size_t trials = vm["trials"].as<size_t>();
  96. std::cout << "size: " << size << std::endl;
  97. // setup context and queue for the default device
  98. compute::device device = compute::system::default_device();
  99. compute::context context(device);
  100. compute::command_queue queue(context, device);
  101. std::cout << "device: " << device.name() << std::endl;
  102. // create vector of random numbers on the host
  103. std::vector<int> host_data(size);
  104. std::generate(host_data.begin(), host_data.end(), rand_int);
  105. // create vector on the device and copy the data
  106. compute::vector<int> device_data(
  107. host_data.begin(), host_data.end(), queue
  108. );
  109. // run tuning proceure (if requested)
  110. if(vm.count("tune")){
  111. tune_accumulate(device_data, trials, queue);
  112. }
  113. // run benchmark
  114. double t = perf_accumulate(device_data, trials, queue);
  115. std::cout << "time: " << t / 1e6 << " ms" << std::endl;
  116. return 0;
  117. }