perf_saxpy.cpp 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162
  1. //---------------------------------------------------------------------------//
  2. // Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
  3. //
  4. // Distributed under the Boost Software License, Version 1.0
  5. // See accompanying file LICENSE_1_0.txt or copy at
  6. // http://www.boost.org/LICENSE_1_0.txt
  7. //
  8. // See http://boostorg.github.com/compute for more information.
  9. //---------------------------------------------------------------------------//
#include <algorithm>
#include <cstddef>
#include <cstdlib>
#include <iostream>
#include <limits>
#include <string>
#include <vector>

#include <boost/lexical_cast.hpp>
#include <boost/program_options.hpp>
#include <boost/shared_ptr.hpp>

#include <boost/compute/lambda.hpp>
#include <boost/compute/system.hpp>
#include <boost/compute/algorithm/copy.hpp>
#include <boost/compute/algorithm/fill.hpp>
#include <boost/compute/algorithm/transform.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/detail/parameter_cache.hpp>

#include "perf.hpp"
  20. namespace po = boost::program_options;
  21. namespace compute = boost::compute;
  22. float rand_float()
  23. {
  24. return (float(rand()) / float(RAND_MAX)) * 1000.f;
  25. }
  26. template<class T>
  27. double perf_saxpy(const compute::vector<T>& x,
  28. const compute::vector<T>& y,
  29. const T alpha,
  30. const size_t trials,
  31. compute::command_queue& queue)
  32. {
  33. // create vector on the device to store the result
  34. compute::vector<T> result(x.size(), queue.get_context());
  35. perf_timer t;
  36. for(size_t trial = 0; trial < trials; trial++){
  37. compute::fill(result.begin(), result.end(), T(0), queue);
  38. queue.finish();
  39. t.start();
  40. using compute::lambda::_1;
  41. using compute::lambda::_2;
  42. compute::transform(
  43. x.begin(), x.end(), y.begin(), result.begin(), alpha * _1 + _2, queue
  44. );
  45. queue.finish();
  46. t.stop();
  47. }
  48. return t.min_time();
  49. }
  50. template<class T>
  51. void tune_saxpy(const compute::vector<T>& x,
  52. const compute::vector<T>& y,
  53. const T alpha,
  54. const size_t trials,
  55. compute::command_queue& queue)
  56. {
  57. boost::shared_ptr<compute::detail::parameter_cache>
  58. params = compute::detail::parameter_cache::get_global_cache(queue.get_device());
  59. const std::string cache_key =
  60. std::string("__boost_copy_kernel_") + boost::lexical_cast<std::string>(sizeof(T));
  61. const compute::uint_ tpbs[] = { 4, 8, 16, 32, 64, 128, 256, 512, 1024 };
  62. const compute::uint_ vpts[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 };
  63. double min_time = (std::numeric_limits<double>::max)();
  64. compute::uint_ best_tpb = 0;
  65. compute::uint_ best_vpt = 0;
  66. for(size_t i = 0; i < sizeof(tpbs) / sizeof(*tpbs); i++){
  67. params->set(cache_key, "tpb", tpbs[i]);
  68. for(size_t j = 0; j < sizeof(vpts) / sizeof(*vpts); j++){
  69. params->set(cache_key, "vpt", vpts[j]);
  70. try {
  71. const double t = perf_saxpy(x, y, alpha, trials, queue);
  72. if(t < min_time){
  73. best_tpb = tpbs[i];
  74. best_vpt = vpts[j];
  75. min_time = t;
  76. }
  77. }
  78. catch(compute::opencl_error&){
  79. // invalid parameters for this device, skip
  80. }
  81. }
  82. }
  83. // store optimal parameters
  84. params->set(cache_key, "tpb", best_tpb);
  85. params->set(cache_key, "vpt", best_vpt);
  86. }
  87. int main(int argc, char *argv[])
  88. {
  89. // setup command line arguments
  90. po::options_description options("options");
  91. options.add_options()
  92. ("help", "show usage instructions")
  93. ("size", po::value<size_t>()->default_value(8192), "input size")
  94. ("trials", po::value<size_t>()->default_value(3), "number of trials to run")
  95. ("tune", "run tuning procedure")
  96. ("alpha", po::value<double>()->default_value(2.5), "saxpy alpha value")
  97. ;
  98. po::positional_options_description positional_options;
  99. positional_options.add("size", 1);
  100. // parse command line
  101. po::variables_map vm;
  102. po::store(
  103. po::command_line_parser(argc, argv)
  104. .options(options).positional(positional_options).run(),
  105. vm
  106. );
  107. po::notify(vm);
  108. const size_t size = vm["size"].as<size_t>();
  109. const size_t trials = vm["trials"].as<size_t>();
  110. const float alpha = vm["alpha"].as<double>();
  111. std::cout << "size: " << size << std::endl;
  112. // setup context and queue for the default device
  113. compute::device device = boost::compute::system::default_device();
  114. compute::context context(device);
  115. compute::command_queue queue(context, device);
  116. std::cout << "device: " << device.name() << std::endl;
  117. // create vector of random numbers on the host
  118. std::vector<float> host_x(size);
  119. std::vector<float> host_y(size);
  120. std::generate(host_x.begin(), host_x.end(), rand_float);
  121. std::generate(host_y.begin(), host_y.end(), rand_float);
  122. // create vector on the device and copy the data
  123. compute::vector<float> x(host_x.begin(), host_x.end(), queue);
  124. compute::vector<float> y(host_y.begin(), host_y.end(), queue);
  125. // run tuning proceure (if requested)
  126. if(vm.count("tune")){
  127. tune_saxpy(x, y, alpha, trials, queue);
  128. }
  129. // run benchmark
  130. double t = perf_saxpy(x, y, alpha, trials, queue);
  131. std::cout << "time: " << t / 1e6 << " ms" << std::endl;
  132. return 0;
  133. }