// Boost.uBLAS // // Copyright (c) 2018 Fady Essam // Copyright (c) 2018 Stefan Seefeld // // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE_1_0.txt or // copy at http://www.boost.org/LICENSE_1_0.txt) #ifndef boost_numeric_ublas_opencl_transpose_hpp_ #define boost_numeric_ublas_opencl_transpose_hpp_ #include #include #include // Kernel for transposition of various data types #define OPENCL_TRANSPOSITION_KERNEL(DATA_TYPE) \ "__kernel void transpose(__global " #DATA_TYPE "* in, __global " #DATA_TYPE "* result, unsigned int width, unsigned int height) \n" \ "{ \n" \ " unsigned int column_index = get_global_id(0); \n" \ " unsigned int row_index = get_global_id(1); \n" \ " if (column_index < width && row_index < height) \n" \ " { \n" \ " unsigned int index_in = column_index + width * row_index; \n" \ " unsigned int index_result = row_index + height * column_index; \n" \ " result[index_result] = in[index_in]; \n" \ " } \n" \ "} \n" namespace boost { namespace numeric { namespace ublas { namespace opencl { template typename std::enable_if::value>::type change_layout(ublas::matrix const &m, ublas::matrix &result, compute::command_queue& queue) { assert(m.size1() == result.size1() && m.size2() == result.size2()); assert(m.device() == result.device() && m.device() == queue.get_device()); assert(!(std::is_same::value)); char const *kernel; if (std::is_same::value) kernel = OPENCL_TRANSPOSITION_KERNEL(float); else if (std::is_same::value) kernel = OPENCL_TRANSPOSITION_KERNEL(double); else if (std::is_same>::value) kernel = OPENCL_TRANSPOSITION_KERNEL(float2); else if (std::is_same>::value) kernel = OPENCL_TRANSPOSITION_KERNEL(double2); size_t len = strlen(kernel); cl_int err; cl_context c_context = queue.get_context().get(); cl_program program = clCreateProgramWithSource(c_context, 1, &kernel, &len, &err); clBuildProgram(program, 1, &queue.get_device().get(), NULL, NULL, NULL); cl_kernel c_kernel = clCreateKernel(program, "transpose", &err); size_t width = std::is_same < L1, ublas::basic_row_major<>>::value ? m.size2() : m.size1(); size_t height = std::is_same < L1, ublas::basic_row_major<>>::value ? m.size1() : m.size2(); size_t global_size[2] = { width , height }; clSetKernelArg(c_kernel, 0, sizeof(T*), &m.begin().get_buffer().get()); clSetKernelArg(c_kernel, 1, sizeof(T*), &result.begin().get_buffer().get()); clSetKernelArg(c_kernel, 2, sizeof(unsigned int), &width); clSetKernelArg(c_kernel, 3, sizeof(unsigned int), &height); cl_command_queue c_queue = queue.get(); cl_event event = NULL; clEnqueueNDRangeKernel(c_queue, c_kernel, 2, NULL, global_size, NULL, 0, NULL, &event); clWaitForEvents(1, &event); } template typename std::enable_if::value>::type change_layout(ublas::matrix const &m, ublas::matrix &result, compute::command_queue& queue) { ublas::matrix mdev(m, queue); ublas::matrix rdev(result.size1(), result.size2(), queue.get_context()); change_layout(mdev, rdev, queue); rdev.to_host(result, queue); } template typename std::enable_if::value>::type trans(ublas::matrix const &m, ublas::matrix &result, compute::command_queue& queue) { assert(m.size1() == result.size2() && m.size2() == result.size1()); assert(m.device() == result.device() && m.device() == queue.get_device()); char const *kernel; if (std::is_same::value) kernel = OPENCL_TRANSPOSITION_KERNEL(float); else if (std::is_same::value) kernel = OPENCL_TRANSPOSITION_KERNEL(double); else if (std::is_same>::value) kernel = OPENCL_TRANSPOSITION_KERNEL(float2); else if (std::is_same>::value) kernel = OPENCL_TRANSPOSITION_KERNEL(double2); size_t len = strlen(kernel); cl_int err; cl_context c_context = queue.get_context().get(); cl_program program = clCreateProgramWithSource(c_context, 1, &kernel, &len, &err); clBuildProgram(program, 1, &queue.get_device().get(), NULL, NULL, NULL); cl_kernel c_kernel = clCreateKernel(program, "transpose", &err); size_t width = std::is_same >::value ? m.size2() : m.size1(); size_t height = std::is_same >::value ? m.size1() : m.size2(); size_t global_size[2] = { width , height }; clSetKernelArg(c_kernel, 0, sizeof(T*), &m.begin().get_buffer().get()); clSetKernelArg(c_kernel, 1, sizeof(T*), &result.begin().get_buffer().get()); clSetKernelArg(c_kernel, 2, sizeof(unsigned int), &width); clSetKernelArg(c_kernel, 3, sizeof(unsigned int), &height); cl_command_queue c_queue = queue.get(); cl_event event = NULL; clEnqueueNDRangeKernel(c_queue, c_kernel, 2, NULL, global_size, NULL, 0, NULL, &event); clWaitForEvents(1, &event); } template typename std::enable_if::value>::type trans(ublas::matrix const &m, ublas::matrix &result, compute::command_queue& queue) { ublas::matrix mdev(m, queue); ublas::matrix rdev(result.size1(), result.size2(), queue.get_context()); trans(mdev, rdev, queue); rdev.to_host(result, queue); } template typename std::enable_if::value, ublas::matrix>::type trans(ublas::matrix& m, compute::command_queue& queue) { ublas::matrix result(m.size2(), m.size1()); trans(m, result, queue); return result; } }}}} #endif