// func_matrix_simd.inl

#if GLM_ARCH & GLM_ARCH_SSE2_BIT

#include "type_mat4x4.hpp"
#include "../geometric.hpp"
#include "../simd/matrix.h"
#include <cstring>

namespace glm{
namespace detail
{
#	if GLM_CONFIG_ALIGNED_GENTYPES == GLM_ENABLE
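	// Component-wise multiply (matrixCompMult) of two aligned 4x4 float matrices using the SSE2 kernel.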
	template<qualifier Q>
	struct compute_matrixCompMult<4, 4, float, Q, true>
	{
		GLM_STATIC_ASSERT(detail::is_aligned<Q>::value, "Specialization requires aligned");

		GLM_FUNC_QUALIFIER static mat<4, 4, float, Q> call(mat<4, 4, float, Q> const& x, mat<4, 4, float, Q> const& y)
		{
			mat<4, 4, float, Q> Result;
			glm_mat4_matrixCompMult(
				*static_cast<glm_vec4 const (*)[4]>(&x[0].data),
				*static_cast<glm_vec4 const (*)[4]>(&y[0].data),
				*static_cast<glm_vec4(*)[4]>(&Result[0].data));
			return Result;
		}
	};
#	endif
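	// Transpose of an aligned 4x4 float matrix using the SSE2 kernel.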
	template<qualifier Q>
	struct compute_transpose<4, 4, float, Q, true>
	{
		GLM_FUNC_QUALIFIER static mat<4, 4, float, Q> call(mat<4, 4, float, Q> const& m)
		{
			mat<4, 4, float, Q> Result;
			glm_mat4_transpose(&m[0].data, &Result[0].data);
			return Result;
		}
	};
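	// Determinant of an aligned 4x4 float matrix; the SSE2 kernel returns it in the low lane of an __m128.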
	template<qualifier Q>
	struct compute_determinant<4, 4, float, Q, true>
	{
		GLM_FUNC_QUALIFIER static float call(mat<4, 4, float, Q> const& m)
		{
			return _mm_cvtss_f32(glm_mat4_determinant(&m[0].data));
		}
	};
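	// Inverse of an aligned 4x4 float matrix using the SSE2 kernel.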
	template<qualifier Q>
	struct compute_inverse<4, 4, float, Q, true>
	{
		GLM_FUNC_QUALIFIER static mat<4, 4, float, Q> call(mat<4, 4, float, Q> const& m)
		{
			mat<4, 4, float, Q> Result;
			glm_mat4_inverse(&m[0].data, &Result[0].data);
			return Result;
		}
	};
}//namespace detail

#	if GLM_CONFIG_ALIGNED_GENTYPES == GLM_ENABLE
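	// Explicit outerProduct specializations for the aligned qualifiers: the SIMD kernel computes
	// the column-by-row product c * transpose(r), and the result is copied back with memcpy.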
	template<>
	GLM_FUNC_QUALIFIER mat<4, 4, float, aligned_lowp> outerProduct<4, 4, float, aligned_lowp>(vec<4, float, aligned_lowp> const& c, vec<4, float, aligned_lowp> const& r)
	{
		__m128 NativeResult[4];
		glm_mat4_outerProduct(c.data, r.data, NativeResult);
		mat<4, 4, float, aligned_lowp> Result;
		std::memcpy(&Result[0], &NativeResult[0], sizeof(Result));
		return Result;
	}

	template<>
	GLM_FUNC_QUALIFIER mat<4, 4, float, aligned_mediump> outerProduct<4, 4, float, aligned_mediump>(vec<4, float, aligned_mediump> const& c, vec<4, float, aligned_mediump> const& r)
	{
		__m128 NativeResult[4];
		glm_mat4_outerProduct(c.data, r.data, NativeResult);
		mat<4, 4, float, aligned_mediump> Result;
		std::memcpy(&Result[0], &NativeResult[0], sizeof(Result));
		return Result;
	}

	template<>
	GLM_FUNC_QUALIFIER mat<4, 4, float, aligned_highp> outerProduct<4, 4, float, aligned_highp>(vec<4, float, aligned_highp> const& c, vec<4, float, aligned_highp> const& r)
	{
		__m128 NativeResult[4];
		glm_mat4_outerProduct(c.data, r.data, NativeResult);
		mat<4, 4, float, aligned_highp> Result;
		std::memcpy(&Result[0], &NativeResult[0], sizeof(Result));
		return Result;
	}
#	endif
}//namespace glm

#elif GLM_ARCH & GLM_ARCH_NEON_BIT

namespace glm {
#if GLM_LANG & GLM_LANG_CXX11_FLAG
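	// NEON 4x4 float matrix product for aligned qualifiers: each result column is a linear
	// combination of m1's columns, weighted by the lanes of the matching column of m2.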
	template <qualifier Q>
	GLM_FUNC_QUALIFIER
	typename std::enable_if<detail::is_aligned<Q>::value, mat<4, 4, float, Q>>::type
	operator*(mat<4, 4, float, Q> const & m1, mat<4, 4, float, Q> const & m2)
	{
		auto MulRow = [&](int l) {
			float32x4_t const SrcA = m2[l].data;

			float32x4_t r = neon::mul_lane(m1[0].data, SrcA, 0);
			r = neon::madd_lane(r, m1[1].data, SrcA, 1);
			r = neon::madd_lane(r, m1[2].data, SrcA, 2);
			r = neon::madd_lane(r, m1[3].data, SrcA, 3);

			return r;
		};

		mat<4, 4, float, Q> Result;
		Result[0].data = MulRow(0);
		Result[1].data = MulRow(1);
		Result[2].data = MulRow(2);
		Result[3].data = MulRow(3);

		return Result;
	}
#endif // CXX11
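
	// NEON specialization of compute_inverse: adjugate (cofactor) method, mirroring the generic GLM implementation.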
	template<qualifier Q>
	struct detail::compute_inverse<4, 4, float, Q, true>
	{
		GLM_FUNC_QUALIFIER static mat<4, 4, float, Q> call(mat<4, 4, float, Q> const& m)
		{
			float32x4_t const& m0 = m[0].data;
			float32x4_t const& m1 = m[1].data;
			float32x4_t const& m2 = m[2].data;
			float32x4_t const& m3 = m[3].data;
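
			// Fac0..Fac5 below are the 2x2 sub-determinants reused by the cofactor expansion;
			// the products forming each lane are spelled out in the comments above each block.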
			// m[2][2] * m[3][3] - m[3][2] * m[2][3];
			// m[2][2] * m[3][3] - m[3][2] * m[2][3];
			// m[1][2] * m[3][3] - m[3][2] * m[1][3];
			// m[1][2] * m[2][3] - m[2][2] * m[1][3];
			float32x4_t Fac0;
			{
				float32x4_t w0 = vcombine_f32(neon::dup_lane(m2, 2), neon::dup_lane(m1, 2));
				float32x4_t w1 = neon::copy_lane(neon::dupq_lane(m3, 3), 3, m2, 3);
				float32x4_t w2 = neon::copy_lane(neon::dupq_lane(m3, 2), 3, m2, 2);
				float32x4_t w3 = vcombine_f32(neon::dup_lane(m2, 3), neon::dup_lane(m1, 3));
				Fac0 = w0 * w1 - w2 * w3;
			}

			// m[2][1] * m[3][3] - m[3][1] * m[2][3];
			// m[2][1] * m[3][3] - m[3][1] * m[2][3];
			// m[1][1] * m[3][3] - m[3][1] * m[1][3];
			// m[1][1] * m[2][3] - m[2][1] * m[1][3];
			float32x4_t Fac1;
			{
				float32x4_t w0 = vcombine_f32(neon::dup_lane(m2, 1), neon::dup_lane(m1, 1));
				float32x4_t w1 = neon::copy_lane(neon::dupq_lane(m3, 3), 3, m2, 3);
				float32x4_t w2 = neon::copy_lane(neon::dupq_lane(m3, 1), 3, m2, 1);
				float32x4_t w3 = vcombine_f32(neon::dup_lane(m2, 3), neon::dup_lane(m1, 3));
				Fac1 = w0 * w1 - w2 * w3;
			}

			// m[2][1] * m[3][2] - m[3][1] * m[2][2];
			// m[2][1] * m[3][2] - m[3][1] * m[2][2];
			// m[1][1] * m[3][2] - m[3][1] * m[1][2];
			// m[1][1] * m[2][2] - m[2][1] * m[1][2];
			float32x4_t Fac2;
			{
				float32x4_t w0 = vcombine_f32(neon::dup_lane(m2, 1), neon::dup_lane(m1, 1));
				float32x4_t w1 = neon::copy_lane(neon::dupq_lane(m3, 2), 3, m2, 2);
				float32x4_t w2 = neon::copy_lane(neon::dupq_lane(m3, 1), 3, m2, 1);
				float32x4_t w3 = vcombine_f32(neon::dup_lane(m2, 2), neon::dup_lane(m1, 2));
				Fac2 = w0 * w1 - w2 * w3;
			}
			// m[2][0] * m[3][3] - m[3][0] * m[2][3];
			// m[2][0] * m[3][3] - m[3][0] * m[2][3];
			// m[1][0] * m[3][3] - m[3][0] * m[1][3];
			// m[1][0] * m[2][3] - m[2][0] * m[1][3];
			float32x4_t Fac3;
			{
				float32x4_t w0 = vcombine_f32(neon::dup_lane(m2, 0), neon::dup_lane(m1, 0));
				float32x4_t w1 = neon::copy_lane(neon::dupq_lane(m3, 3), 3, m2, 3);
				float32x4_t w2 = neon::copy_lane(neon::dupq_lane(m3, 0), 3, m2, 0);
				float32x4_t w3 = vcombine_f32(neon::dup_lane(m2, 3), neon::dup_lane(m1, 3));
				Fac3 = w0 * w1 - w2 * w3;
			}

			// m[2][0] * m[3][2] - m[3][0] * m[2][2];
			// m[2][0] * m[3][2] - m[3][0] * m[2][2];
			// m[1][0] * m[3][2] - m[3][0] * m[1][2];
			// m[1][0] * m[2][2] - m[2][0] * m[1][2];
			float32x4_t Fac4;
			{
				float32x4_t w0 = vcombine_f32(neon::dup_lane(m2, 0), neon::dup_lane(m1, 0));
				float32x4_t w1 = neon::copy_lane(neon::dupq_lane(m3, 2), 3, m2, 2);
				float32x4_t w2 = neon::copy_lane(neon::dupq_lane(m3, 0), 3, m2, 0);
				float32x4_t w3 = vcombine_f32(neon::dup_lane(m2, 2), neon::dup_lane(m1, 2));
				Fac4 = w0 * w1 - w2 * w3;
			}

			// m[2][0] * m[3][1] - m[3][0] * m[2][1];
			// m[2][0] * m[3][1] - m[3][0] * m[2][1];
			// m[1][0] * m[3][1] - m[3][0] * m[1][1];
			// m[1][0] * m[2][1] - m[2][0] * m[1][1];
			float32x4_t Fac5;
			{
				float32x4_t w0 = vcombine_f32(neon::dup_lane(m2, 0), neon::dup_lane(m1, 0));
				float32x4_t w1 = neon::copy_lane(neon::dupq_lane(m3, 1), 3, m2, 1);
				float32x4_t w2 = neon::copy_lane(neon::dupq_lane(m3, 0), 3, m2, 0);
				float32x4_t w3 = vcombine_f32(neon::dup_lane(m2, 1), neon::dup_lane(m1, 1));
				Fac5 = w0 * w1 - w2 * w3;
			}

			float32x4_t Vec0 = neon::copy_lane(neon::dupq_lane(m0, 0), 0, m1, 0); // (m[1][0], m[0][0], m[0][0], m[0][0]);
			float32x4_t Vec1 = neon::copy_lane(neon::dupq_lane(m0, 1), 0, m1, 1); // (m[1][1], m[0][1], m[0][1], m[0][1]);
			float32x4_t Vec2 = neon::copy_lane(neon::dupq_lane(m0, 2), 0, m1, 2); // (m[1][2], m[0][2], m[0][2], m[0][2]);
			float32x4_t Vec3 = neon::copy_lane(neon::dupq_lane(m0, 3), 0, m1, 3); // (m[1][3], m[0][3], m[0][3], m[0][3]);

			float32x4_t Inv0 = Vec1 * Fac0 - Vec2 * Fac1 + Vec3 * Fac2;
			float32x4_t Inv1 = Vec0 * Fac0 - Vec2 * Fac3 + Vec3 * Fac4;
			float32x4_t Inv2 = Vec0 * Fac1 - Vec1 * Fac3 + Vec3 * Fac5;
			float32x4_t Inv3 = Vec0 * Fac2 - Vec1 * Fac4 + Vec2 * Fac5;
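
			// Apply the alternating +/- sign pattern of the cofactor matrix to each column.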
			float32x4_t r0 = float32x4_t{-1, +1, -1, +1} * Inv0;
			float32x4_t r1 = float32x4_t{+1, -1, +1, -1} * Inv1;
			float32x4_t r2 = float32x4_t{-1, +1, -1, +1} * Inv2;
			float32x4_t r3 = float32x4_t{+1, -1, +1, -1} * Inv3;

			float32x4_t det = neon::mul_lane(r0, m0, 0);
			det = neon::madd_lane(det, r1, m0, 1);
			det = neon::madd_lane(det, r2, m0, 2);
			det = neon::madd_lane(det, r3, m0, 3);
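
			// The determinant sits in lane 0 of det; broadcast its reciprocal and scale every column.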
			float32x4_t rdet = vdupq_n_f32(1 / vgetq_lane_f32(det, 0));

			mat<4, 4, float, Q> r;
			r[0].data = vmulq_f32(r0, rdet);
			r[1].data = vmulq_f32(r1, rdet);
			r[2].data = vmulq_f32(r2, rdet);
			r[3].data = vmulq_f32(r3, rdet);
			return r;
		}
	};
}//namespace glm
#endif