/// @ref simd_neon
/// @file glm/simd/neon.h

#pragma once

#if GLM_ARCH & GLM_ARCH_NEON_BIT
#include <arm_neon.h>
#include <cassert> // assert() backs the "unreachable" guards below

namespace glm {
	namespace neon {
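		// Broadcast lane `lane` (0..3) of vsrc across all four lanes of a 128-bit result.
		// ARMv8 has a direct lane-broadcast instruction; on older NEON the lane is read out
		// to a scalar and re-broadcast.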
		static float32x4_t dupq_lane(float32x4_t vsrc, int lane) {
			switch(lane) {
#if GLM_ARCH & GLM_ARCH_ARMV8_BIT
				case 0: return vdupq_laneq_f32(vsrc, 0);
				case 1: return vdupq_laneq_f32(vsrc, 1);
				case 2: return vdupq_laneq_f32(vsrc, 2);
				case 3: return vdupq_laneq_f32(vsrc, 3);
#else
				case 0: return vdupq_n_f32(vgetq_lane_f32(vsrc, 0));
				case 1: return vdupq_n_f32(vgetq_lane_f32(vsrc, 1));
				case 2: return vdupq_n_f32(vgetq_lane_f32(vsrc, 2));
				case 3: return vdupq_n_f32(vgetq_lane_f32(vsrc, 3));
#endif
			}
			assert(!"Unreachable code executed!");
			return vdupq_n_f32(0.0f);
		}
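		// Same broadcast, but producing a 64-bit (two-lane) result.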
		static float32x2_t dup_lane(float32x4_t vsrc, int lane) {
			switch(lane) {
#if GLM_ARCH & GLM_ARCH_ARMV8_BIT
				case 0: return vdup_laneq_f32(vsrc, 0);
				case 1: return vdup_laneq_f32(vsrc, 1);
				case 2: return vdup_laneq_f32(vsrc, 2);
				case 3: return vdup_laneq_f32(vsrc, 3);
#else
				case 0: return vdup_n_f32(vgetq_lane_f32(vsrc, 0));
				case 1: return vdup_n_f32(vgetq_lane_f32(vsrc, 1));
				case 2: return vdup_n_f32(vgetq_lane_f32(vsrc, 2));
				case 3: return vdup_n_f32(vgetq_lane_f32(vsrc, 3));
#endif
			}
			assert(!"Unreachable code executed!");
			return vdup_n_f32(0.0f);
		}
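		// Copy lane `slane` of vsrc into lane `dlane` of vdst, leaving the other lanes of
		// vdst untouched. Both indices must be in 0..3; anything else trips the asserts and
		// yields a zero vector in release builds.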
		static float32x4_t copy_lane(float32x4_t vdst, int dlane, float32x4_t vsrc, int slane) {
#if GLM_ARCH & GLM_ARCH_ARMV8_BIT
			// The intrinsic requires compile-time lane indices, hence the nested switches.
			switch(dlane) {
				case 0:
					switch(slane) {
						case 0: return vcopyq_laneq_f32(vdst, 0, vsrc, 0);
						case 1: return vcopyq_laneq_f32(vdst, 0, vsrc, 1);
						case 2: return vcopyq_laneq_f32(vdst, 0, vsrc, 2);
						case 3: return vcopyq_laneq_f32(vdst, 0, vsrc, 3);
					}
					assert(!"Unreachable code executed!");
					break;
				case 1:
					switch(slane) {
						case 0: return vcopyq_laneq_f32(vdst, 1, vsrc, 0);
						case 1: return vcopyq_laneq_f32(vdst, 1, vsrc, 1);
						case 2: return vcopyq_laneq_f32(vdst, 1, vsrc, 2);
						case 3: return vcopyq_laneq_f32(vdst, 1, vsrc, 3);
					}
					assert(!"Unreachable code executed!");
					break;
				case 2:
					switch(slane) {
						case 0: return vcopyq_laneq_f32(vdst, 2, vsrc, 0);
						case 1: return vcopyq_laneq_f32(vdst, 2, vsrc, 1);
						case 2: return vcopyq_laneq_f32(vdst, 2, vsrc, 2);
						case 3: return vcopyq_laneq_f32(vdst, 2, vsrc, 3);
					}
					assert(!"Unreachable code executed!");
					break;
				case 3:
					switch(slane) {
						case 0: return vcopyq_laneq_f32(vdst, 3, vsrc, 0);
						case 1: return vcopyq_laneq_f32(vdst, 3, vsrc, 1);
						case 2: return vcopyq_laneq_f32(vdst, 3, vsrc, 2);
						case 3: return vcopyq_laneq_f32(vdst, 3, vsrc, 3);
					}
					assert(!"Unreachable code executed!");
					break;
			}
#else
			float l = 0.0f; // initialized so an out-of-range slane stays defined in release builds
			switch(slane) {
				case 0: l = vgetq_lane_f32(vsrc, 0); break;
				case 1: l = vgetq_lane_f32(vsrc, 1); break;
				case 2: l = vgetq_lane_f32(vsrc, 2); break;
				case 3: l = vgetq_lane_f32(vsrc, 3); break;
				default:
					assert(!"Unreachable code executed!");
			}
			switch(dlane) {
				case 0: return vsetq_lane_f32(l, vdst, 0);
				case 1: return vsetq_lane_f32(l, vdst, 1);
				case 2: return vsetq_lane_f32(l, vdst, 2);
				case 3: return vsetq_lane_f32(l, vdst, 3);
			}
#endif
			assert(!"Unreachable code executed!");
			return vdupq_n_f32(0.0f);
		}
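		// Multiply every lane of v by lane `lane` of vlane.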
		static float32x4_t mul_lane(float32x4_t v, float32x4_t vlane, int lane) {
#if GLM_ARCH & GLM_ARCH_ARMV8_BIT
			switch(lane) {
				case 0: return vmulq_laneq_f32(v, vlane, 0);
				case 1: return vmulq_laneq_f32(v, vlane, 1);
				case 2: return vmulq_laneq_f32(v, vlane, 2);
				case 3: return vmulq_laneq_f32(v, vlane, 3);
				default:
					assert(!"Unreachable code executed!");
			}
			return vdupq_n_f32(0.0f);
#else
			return vmulq_f32(v, dupq_lane(vlane, lane));
#endif
		}
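		// Multiply-accumulate: returns acc + v * vlane[lane].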
		static float32x4_t madd_lane(float32x4_t acc, float32x4_t v, float32x4_t vlane, int lane) {
#if GLM_ARCH & GLM_ARCH_ARMV8_BIT
#ifdef GLM_CONFIG_FORCE_FMA
			// The inline-asm fmla multiplies full .4s vectors, so the lane value must be
			// broadcast across all four lanes (dupq_lane rather than the 64-bit dup_lane).
#	define FMADD_LANE(acc, x, y, L) do { asm volatile ("fmla %0.4s, %1.4s, %2.4s" : "+w"(acc) : "w"(x), "w"(dupq_lane(y, L))); } while(0)
#else
#	define FMADD_LANE(acc, x, y, L) do { acc = vmlaq_laneq_f32(acc, x, y, L); } while(0)
#endif
			switch(lane) {
				case 0:
					FMADD_LANE(acc, v, vlane, 0);
					return acc;
				case 1:
					FMADD_LANE(acc, v, vlane, 1);
					return acc;
				case 2:
					FMADD_LANE(acc, v, vlane, 2);
					return acc;
				case 3:
					FMADD_LANE(acc, v, vlane, 3);
					return acc;
				default:
					assert(!"Unreachable code executed!");
			}
			return vdupq_n_f32(0.0f);
#	undef FMADD_LANE
#else
			return vaddq_f32(acc, vmulq_f32(v, dupq_lane(vlane, lane)));
#endif
		}
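		// Illustrative usage sketch (not part of the original header): these helpers can
		// serve as the lane-wise building blocks for vector/matrix products. Assuming a
		// column-major 4x4 matrix whose columns m[0..3] and a vector v are float32x4_t
		// values, a matrix * vector product could be written as:
		//
		//   float32x4_t mul_mat4_vec4(float32x4_t const m[4], float32x4_t v) {
		//       float32x4_t r = mul_lane(m[0], v, 0);  // r  = m[0] * v.x
		//       r = madd_lane(r, m[1], v, 1);          // r += m[1] * v.y
		//       r = madd_lane(r, m[2], v, 2);          // r += m[2] * v.z
		//       r = madd_lane(r, m[3], v, 3);          // r += m[3] * v.w
		//       return r;
		//   }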
	} //namespace neon
} // namespace glm
#endif // GLM_ARCH & GLM_ARCH_NEON_BIT