block_indirect_sort.hpp 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503
  1. //----------------------------------------------------------------------------
  2. /// @file block_indirect_sort.hpp
  3. /// @brief block indirect sort algorithm
  4. ///
  5. /// @author Copyright (c) 2016 Francisco Jose Tapia (fjtapia@gmail.com )\n
  6. /// Distributed under the Boost Software License, Version 1.0.\n
  7. /// ( See accompanying file LICENSE_1_0.txt or copy at
  8. /// http://www.boost.org/LICENSE_1_0.txt )
  9. /// @version 0.1
  10. ///
  11. /// @remarks
  12. //-----------------------------------------------------------------------------
  13. #ifndef __BOOST_SORT_PARALLEL_DETAIL_BLOCK_INDIRECT_SORT_HPP
  14. #define __BOOST_SORT_PARALLEL_DETAIL_BLOCK_INDIRECT_SORT_HPP
  15. #include <atomic>
  16. #include <boost/sort/block_indirect_sort/blk_detail/merge_blocks.hpp>
  17. #include <boost/sort/block_indirect_sort/blk_detail/move_blocks.hpp>
  18. #include <boost/sort/block_indirect_sort/blk_detail/parallel_sort.hpp>
  19. #include <boost/sort/pdqsort/pdqsort.hpp>
  20. #include <boost/sort/common/util/traits.hpp>
  21. #include <boost/sort/common/util/algorithm.hpp>
  22. #include <future>
  23. #include <iterator>
  24. // This value is the minimal number of threads for to use the
  25. // block_indirect_sort algorithm
  26. #define BOOST_NTHREAD_BORDER 6
  27. namespace boost
  28. {
  29. namespace sort
  30. {
  31. namespace blk_detail
  32. {
  33. //---------------------------------------------------------------------------
  34. // USING SENTENCES
  35. //---------------------------------------------------------------------------
  36. namespace bs = boost::sort;
  37. namespace bsc = bs::common;
  38. namespace bscu = bsc::util;
  39. using bscu::compare_iter;
  40. using bscu::value_iter;
  41. using bsc::range;
  42. using bsc::destroy;
  43. using bsc::initialize;
  44. using bscu::nbits64;
  45. using bs::pdqsort;
  46. using bscu::enable_if_string;
  47. using bscu::enable_if_not_string;
  48. using bscu::tmsb;
  49. //
  50. ///---------------------------------------------------------------------------
  51. /// @struct block_indirect_sort
  52. /// @brief This class is the entry point of the block indirect sort. The code
  53. /// of this algorithm is divided in several classes:
  54. /// bis/block.hpp : basic structures used in the algorithm
  55. /// bis/backbone.hpp : data used by all the classes
  56. /// bis/merge_blocks.hpp : merge the internal blocks
  57. /// bis/move_blocks.hpp : move the blocks, and obtain all the elements
  58. /// phisicaly sorted
  59. /// bis/parallel_sort.hpp : make the parallel sort of each part in the
  60. /// initial division of the data
  61. ///
  62. //----------------------------------------------------------------------------
  63. template<uint32_t Block_size, uint32_t Group_size, class Iter_t,
  64. class Compare = compare_iter<Iter_t> >
  65. struct block_indirect_sort
  66. {
  67. //------------------------------------------------------------------------
  68. // D E F I N I T I O N S
  69. //------------------------------------------------------------------------
  70. typedef typename std::iterator_traits<Iter_t>::value_type value_t;
  71. typedef std::atomic<uint32_t> atomic_t;
  72. typedef range<size_t> range_pos;
  73. typedef range<Iter_t> range_it;
  74. typedef range<value_t *> range_buf;
  75. typedef std::function<void(void)> function_t;
  76. // classes used in the internal operations of the algorithm
  77. typedef block_pos block_pos_t;
  78. typedef block<Block_size, Iter_t> block_t;
  79. typedef backbone<Block_size, Iter_t, Compare> backbone_t;
  80. typedef parallel_sort<Block_size, Iter_t, Compare> parallel_sort_t;
  81. typedef merge_blocks<Block_size, Group_size, Iter_t, Compare> merge_blocks_t;
  82. typedef move_blocks<Block_size, Group_size, Iter_t, Compare> move_blocks_t;
  83. typedef compare_block_pos<Block_size, Iter_t, Compare> compare_block_pos_t;
  84. //
  85. //------------------------------------------------------------------------
  86. // V A R I A B L E S A N D C O N S T A N T S
  87. //------------------------------------------------------------------------
  88. // contains the data and the internal data structures of the algorithm for
  89. // to be shared between the classes which are part of the algorithm
  90. backbone_t bk;
  91. // atomic counter for to detect the end of the works created inside
  92. // the object
  93. atomic_t counter;
  94. // pointer to the uninitialized memory used for the thread buffers
  95. value_t *ptr;
  96. // indicate if the memory pointed by ptr is initialized
  97. bool construct;
  98. // range from extract the buffers for the threads
  99. range_buf rglobal_buf;
  100. // number of threads to use
  101. uint32_t nthread;
  102. //
  103. //------------------------------------------------------------------------
  104. // F U N C T I O N S
  105. //------------------------------------------------------------------------
  106. block_indirect_sort(Iter_t first, Iter_t last, Compare cmp, uint32_t nthr);
  107. block_indirect_sort(Iter_t first, Iter_t last) :
  108. block_indirect_sort(first, last, Compare(),
  109. std::thread::hardware_concurrency()) { }
  110. block_indirect_sort(Iter_t first, Iter_t last, Compare cmp) :
  111. block_indirect_sort(first, last, cmp,
  112. std::thread::hardware_concurrency()) { }
  113. block_indirect_sort(Iter_t first, Iter_t last, uint32_t nthread) :
  114. block_indirect_sort(first, last, Compare(), nthread){}
  115. //
  116. //------------------------------------------------------------------------
  117. // function :destroy_all
  118. /// @brief destructor all the data structures of the class (if the memory
  119. /// is constructed, is destroyed) and return the uninitialized
  120. /// memory
  121. //------------------------------------------------------------------------
  122. void destroy_all(void)
  123. {
  124. if (ptr != nullptr)
  125. {
  126. if (construct)
  127. {
  128. destroy(rglobal_buf);
  129. construct = false;
  130. };
  131. std::return_temporary_buffer(ptr);
  132. ptr = nullptr;
  133. };
  134. }
  135. //
  136. //------------------------------------------------------------------------
  137. // function :~block_indirect_sort
  138. /// @brief destructor of the class (if the memory is constructed, is
  139. /// destroyed) and return the uninitialized memory
  140. //------------------------------------------------------------------------
  141. ~block_indirect_sort(void)
  142. {
  143. destroy_all();
  144. }
  145. void split_range(size_t pos_index1, size_t pos_index2,
  146. uint32_t level_thread);
  147. void start_function(void);
  148. //-------------------------------------------------------------------------
  149. }; // End class block_indirect_sort
  150. //----------------------------------------------------------------------------
  151. //
  152. //############################################################################
  153. // ##
  154. // ##
  155. // N O N I N L I N E F U N C T I O N S ##
  156. // ##
  157. // ##
  158. //############################################################################
  159. //
  160. //-------------------------------------------------------------------------
  161. // function : block_indirect_sort
  162. /// @brief begin with the execution of the functions stored in works
  163. /// @param first : iterator to the first element of the range to sort
  164. /// @param last : iterator after the last element to the range to sort
  165. /// @param comp : object for to compare two elements pointed by Iter_t
  166. /// iterators
  167. /// @param nthr : Number of threads to use in the process.When this value
  168. /// is lower than 2, the sorting is done with 1 thread
  169. //-------------------------------------------------------------------------
  170. template<uint32_t Block_size, uint32_t Group_size, class Iter_t, class Compare>
  171. block_indirect_sort<Block_size, Group_size, Iter_t, Compare>
  172. ::block_indirect_sort(Iter_t first, Iter_t last, Compare cmp, uint32_t nthr)
  173. : bk(first, last, cmp), counter(0), ptr(nullptr), construct(false),
  174. nthread(nthr)
  175. {
  176. try
  177. {
  178. assert((last - first) >= 0);
  179. size_t nelem = size_t(last - first);
  180. if (nelem == 0) return;
  181. //------------------- check if sort -----------------------------------
  182. bool sorted = true;
  183. for (Iter_t it1 = first, it2 = first + 1; it2 != last and (sorted =
  184. not bk.cmp(*it2, *it1)); it1 = it2++);
  185. if (sorted) return;
  186. //------------------- check if reverse sort ---------------------------
  187. sorted = true;
  188. for (Iter_t it1 = first, it2 = first + 1; it2 != last and (sorted =
  189. not bk.cmp(*it1, *it2)); it1 = it2++);
  190. if (sorted)
  191. {
  192. size_t nelem2 = nelem >> 1;
  193. Iter_t it1 = first, it2 = last - 1;
  194. for (size_t i = 0; i < nelem2; ++i)
  195. {
  196. std::swap(*(it1++), *(it2--));
  197. };
  198. return;
  199. };
  200. //---------------- check if only single thread -----------------------
  201. size_t nthreadmax = nelem / (Block_size * Group_size) + 1;
  202. if (nthread > nthreadmax) nthread = (uint32_t) nthreadmax;
  203. uint32_t nbits_size = (nbits64(sizeof(value_t)) >> 1);
  204. if (nbits_size > 5) nbits_size = 5;
  205. size_t max_per_thread = 1 << (18 - nbits_size);
  206. if (nelem < (max_per_thread) or nthread < 2)
  207. {
  208. //intro_sort (first, last, bk.cmp);
  209. pdqsort(first, last, bk.cmp);
  210. return;
  211. };
  212. //----------- creation of the temporary buffer --------------------
  213. ptr = std::get_temporary_buffer<value_t>(Block_size * nthread).first;
  214. if (ptr == nullptr)
  215. {
  216. bk.error = true;
  217. throw std::bad_alloc();
  218. };
  219. rglobal_buf = range_buf(ptr, ptr + (Block_size * nthread));
  220. initialize(rglobal_buf, *first);
  221. construct = true;
  222. // creation of the buffers for the threads
  223. std::vector<value_t *> vbuf(nthread);
  224. for (uint32_t i = 0; i < nthread; ++i)
  225. {
  226. vbuf[i] = ptr + (i * Block_size);
  227. };
  228. // Insert the first work in the stack
  229. bscu::atomic_write(counter, 1);
  230. function_t f1 = [&]( )
  231. {
  232. start_function ( );
  233. bscu::atomic_sub (counter, 1);
  234. };
  235. bk.works.emplace_back(f1);
  236. //---------------------------------------------------------------------
  237. // PROCESS
  238. //---------------------------------------------------------------------
  239. std::vector<std::future<void> > vfuture(nthread);
  240. // The function launched with the futures is "execute the functions of
  241. // the stack until this->counter is zero
  242. // vbuf[i] is the memory from the main thread for to configure the
  243. // thread local buffer
  244. for (uint32_t i = 0; i < nthread; ++i)
  245. {
  246. auto f1 = [=, &vbuf]( )
  247. { bk.exec (vbuf[i], this->counter);};
  248. vfuture[i] = std::async(std::launch::async, f1);
  249. };
  250. for (uint32_t i = 0; i < nthread; ++i)
  251. vfuture[i].get();
  252. if (bk.error) throw std::bad_alloc();
  253. }
  254. catch (std::bad_alloc &)
  255. {
  256. destroy_all();
  257. throw;
  258. }
  259. };
  260. //
  261. //-----------------------------------------------------------------------------
  262. // function : split_rage
  263. /// @brief this function splits a range of positions in the index, and
  264. /// depending of the size, sort directly or make to a recursive call
  265. /// to split_range
  266. /// @param pos_index1 : first position in the index
  267. /// @param pos_index2 : position after the last in the index
  268. /// @param level_thread : depth of the call. When 0 sort the blocks
  269. //-----------------------------------------------------------------------------
  270. template<uint32_t Block_size, uint32_t Group_size, class Iter_t, class Compare>
  271. void block_indirect_sort<Block_size, Group_size, Iter_t, Compare>
  272. ::split_range(size_t pos_index1, size_t pos_index2, uint32_t level_thread)
  273. {
  274. size_t nblock = pos_index2 - pos_index1;
  275. //-------------------------------------------------------------------------
  276. // In the blocks not sorted, the physical position is the logical position
  277. //-------------------------------------------------------------------------
  278. Iter_t first = bk.get_block(pos_index1).first;
  279. Iter_t last = bk.get_range(pos_index2 - 1).last;
  280. if (nblock < Group_size)
  281. {
  282. pdqsort(first, last, bk.cmp);
  283. return;
  284. };
  285. size_t pos_index_mid = pos_index1 + (nblock >> 1);
  286. atomic_t son_counter(1);
  287. //-------------------------------------------------------------------------
  288. // Insert in the stack the work for the second part, and the actual thread,
  289. // execute the first part
  290. //-------------------------------------------------------------------------
  291. if (level_thread != 0)
  292. {
  293. auto f1 = [=, &son_counter]( )
  294. {
  295. split_range (pos_index_mid, pos_index2, level_thread - 1);
  296. bscu::atomic_sub (son_counter, 1);
  297. };
  298. bk.works.emplace_back(f1);
  299. if (bk.error) return;
  300. split_range(pos_index1, pos_index_mid, level_thread - 1);
  301. }
  302. else
  303. {
  304. Iter_t mid = first + ((nblock >> 1) * Block_size);
  305. auto f1 = [=, &son_counter]( )
  306. {
  307. parallel_sort_t (bk, mid, last);
  308. bscu::atomic_sub (son_counter, 1);
  309. };
  310. bk.works.emplace_back(f1);
  311. if (bk.error) return;
  312. parallel_sort_t(bk, first, mid);
  313. };
  314. bk.exec(son_counter);
  315. if (bk.error) return;
  316. merge_blocks_t(bk, pos_index1, pos_index_mid, pos_index2);
  317. };
  318. //
  319. //-----------------------------------------------------------------------------
  320. // function : start_function
  321. /// @brief this function init the process. When the number of threads is lower
  322. /// than a predefined value, sort the elements with a parallel pdqsort.
  323. //-----------------------------------------------------------------------------
  324. template<uint32_t Block_size, uint32_t Group_size, class Iter_t, class Compare>
  325. void block_indirect_sort<Block_size, Group_size, Iter_t, Compare>
  326. ::start_function(void)
  327. {
  328. if (nthread < BOOST_NTHREAD_BORDER)
  329. {
  330. parallel_sort_t(bk, bk.global_range.first, bk.global_range.last);
  331. }
  332. else
  333. {
  334. size_t level_thread = nbits64(nthread - 1) - 1;
  335. split_range(0, bk.nblock, level_thread - 1);
  336. if (bk.error) return;
  337. move_blocks_t k(bk);
  338. };
  339. };
  340. ///---------------------------------------------------------------------------
  341. // function block_indirect_sort_call
  342. /// @brief This class is select the block size in the block_indirect_sort
  343. /// algorithm depending of the type and size of the data to sort
  344. ///
  345. //----------------------------------------------------------------------------
  346. template <class Iter_t, class Compare,
  347. enable_if_string<value_iter<Iter_t>> * = nullptr>
  348. inline void block_indirect_sort_call(Iter_t first, Iter_t last, Compare cmp,
  349. uint32_t nthr)
  350. {
  351. block_indirect_sort<128, 128, Iter_t, Compare>(first, last, cmp, nthr);
  352. };
  353. template<size_t Size>
  354. struct block_size
  355. {
  356. static constexpr const uint32_t BitsSize =
  357. (Size == 0) ? 0 : (Size > 256) ? 9 : tmsb[Size - 1];
  358. static constexpr const uint32_t sz[10] =
  359. { 4096, 4096, 4096, 4096, 2048, 1024, 768, 512, 256, 128 };
  360. static constexpr const uint32_t data = sz[BitsSize];
  361. };
  362. //
  363. ///---------------------------------------------------------------------------
  364. /// @struct block_indirect_sort_call
  365. /// @brief This class is select the block size in the block_indirect_sort
  366. /// algorithm depending of the type and size of the data to sort
  367. ///
  368. //----------------------------------------------------------------------------
  369. template <class Iter_t, class Compare,
  370. enable_if_not_string<value_iter<Iter_t>> * = nullptr>
  371. inline void block_indirect_sort_call (Iter_t first, Iter_t last, Compare cmp,
  372. uint32_t nthr)
  373. {
  374. block_indirect_sort<block_size<sizeof (value_iter<Iter_t> )>::data, 64,
  375. Iter_t, Compare> (first, last, cmp, nthr);
  376. };
  377. //
  378. //****************************************************************************
  379. }; // End namespace blk_detail
  380. //****************************************************************************
  381. //
  382. namespace bscu = boost::sort::common::util;
  383. //
  384. //############################################################################
  385. // ##
  386. // ##
  387. // B L O C K _ I N D I R E C T _ S O R T ##
  388. // ##
  389. // ##
  390. //############################################################################
  391. //
  392. //-----------------------------------------------------------------------------
  393. // function : block_indirect_sort
  394. /// @brief invocation of block_indirtect_sort with 2 parameters
  395. ///
  396. /// @param first : iterator to the first element of the range to sort
  397. /// @param last : iterator after the last element to the range to sort
  398. //-----------------------------------------------------------------------------
  399. template<class Iter_t>
  400. void block_indirect_sort(Iter_t first, Iter_t last)
  401. {
  402. typedef bscu::compare_iter<Iter_t> Compare;
  403. blk_detail::block_indirect_sort_call (first, last, Compare(),
  404. std::thread::hardware_concurrency());
  405. }
  406. //
  407. //-----------------------------------------------------------------------------
  408. // function : block_indirect_sort
  409. /// @brief invocation of block_indirtect_sort with 3 parameters. The third is
  410. /// the number of threads
  411. ///
  412. /// @param first : iterator to the first element of the range to sort
  413. /// @param last : iterator after the last element to the range to sort
  414. /// @param nthread : Number of threads to use in the process. When this value
  415. /// is lower than 2, the sorting is done with 1 thread
  416. //-----------------------------------------------------------------------------
  417. template<class Iter_t>
  418. void block_indirect_sort(Iter_t first, Iter_t last, uint32_t nthread)
  419. {
  420. typedef bscu::compare_iter<Iter_t> Compare;
  421. blk_detail::block_indirect_sort_call(first, last, Compare(), nthread);
  422. }
  423. //
  424. //-----------------------------------------------------------------------------
  425. // function : block_indirect_sort
  426. /// @brief invocation of block_indirtect_sort with 3 parameters. The third is
  427. /// the comparison object
  428. ///
  429. /// @param first : iterator to the first element of the range to sort
  430. /// @param last : iterator after the last element to the range to sort
  431. /// @param comp : object for to compare two elements pointed by Iter_t
  432. /// iterators
  433. //-----------------------------------------------------------------------------
  434. template <class Iter_t, class Compare,
  435. bscu::enable_if_not_integral<Compare> * = nullptr>
  436. void block_indirect_sort(Iter_t first, Iter_t last, Compare comp)
  437. {
  438. blk_detail::block_indirect_sort_call (first, last, comp,
  439. std::thread::hardware_concurrency());
  440. }
  441. //
  442. //-----------------------------------------------------------------------------
  443. // function : block_indirect_sort
  444. /// @brief invocation of block_indirtect_sort with 4 parameters.
  445. ///
  446. /// @param first : iterator to the first element of the range to sort
  447. /// @param last : iterator after the last element to the range to sort
  448. /// @param comp : object for to compare two elements pointed by Iter_t
  449. /// iterators
  450. /// @param nthread : Number of threads to use in the process. When this value
  451. /// is lower than 2, the sorting is done with 1 thread
  452. //-----------------------------------------------------------------------------
  453. template<class Iter_t, class Compare>
  454. void block_indirect_sort (Iter_t first, Iter_t last, Compare comp,
  455. uint32_t nthread)
  456. {
  457. blk_detail::block_indirect_sort_call(first, last, comp, nthread);
  458. }
  459. //
  460. //****************************************************************************
  461. }; // End namespace sort
  462. }; // End namespace boost
  463. //****************************************************************************
  464. //
  465. #endif