|
1 | 1 | #include <vector_math/matrix4f.hpp> |
| 2 | +#include <vector_math/common.hpp> |
| 3 | + |
| 4 | +#ifdef __VECTOR_MATH_ARCH_X86_X64 |
| 5 | + #include <immintrin.h> |
| 6 | +#elif defined(__VECTOR_MATH_ARCH_ARM) |
| 7 | + #include <arm_neon.h> |
| 8 | +#endif |
2 | 9 |
|
3 | 10 | using namespace systems::leal::vector_math; |
4 | 11 |
|
@@ -35,8 +42,47 @@ Matrix4f Matrix4f::operator*(const Matrix4f &rhs) const { |
35 | 42 | } |
36 | 43 | return toReturn; |
37 | 44 | #elif defined(__VECTOR_MATH_ARCH_ARM) |
38 | | - auto toReturn = ((Matrix4<float> *)this)->operator*(rhs); |
39 | | - return *(Matrix4f *)&toReturn; |
| 45 | + Matrix4f toReturn; |
| 46 | + float32x4_t A0 = vld1q_f32(this->data); |
| 47 | + float32x4_t A1 = vld1q_f32(this->data+4); |
| 48 | + float32x4_t A2 = vld1q_f32(this->data+8); |
| 49 | + float32x4_t A3 = vld1q_f32(this->data+12); |
| 50 | + |
| 51 | + float32x4_t C0 = vmovq_n_f32(0); |
| 52 | + float32x4_t C1 = vmovq_n_f32(0); |
| 53 | + float32x4_t C2 = vmovq_n_f32(0); |
| 54 | + float32x4_t C3 = vmovq_n_f32(0); |
| 55 | + |
| 56 | + float32x4_t B0 = vld1q_f32(rhs.data); |
| 57 | + C0 = vfmaq_laneq_f32(C0, A0, B0, 0); |
| 58 | + C0 = vfmaq_laneq_f32(C0, A1, B0, 1); |
| 59 | + C0 = vfmaq_laneq_f32(C0, A2, B0, 2); |
| 60 | + C0 = vfmaq_laneq_f32(C0, A3, B0, 3); |
| 61 | + vst1q_f32(toReturn.data, C0); |
| 62 | + |
| 63 | + float32x4_t B1 = vld1q_f32(rhs.data+4); |
| 64 | + C1 = vfmaq_laneq_f32(C1, A0, B1, 0); |
| 65 | + C1 = vfmaq_laneq_f32(C1, A1, B1, 1); |
| 66 | + C1 = vfmaq_laneq_f32(C1, A2, B1, 2); |
| 67 | + C1 = vfmaq_laneq_f32(C1, A3, B1, 3); |
| 68 | + vst1q_f32(toReturn.data+4, C1); |
| 69 | + |
| 70 | + float32x4_t B2 = vld1q_f32(rhs.data+8); |
| 71 | + C2 = vfmaq_laneq_f32(C2, A0, B2, 0); |
| 72 | + C2 = vfmaq_laneq_f32(C2, A1, B2, 1); |
| 73 | + C2 = vfmaq_laneq_f32(C2, A2, B2, 2); |
| 74 | + C2 = vfmaq_laneq_f32(C2, A3, B2, 3); |
| 75 | + vst1q_f32(toReturn.data+8, C2); |
| 76 | + |
| 77 | + float32x4_t B3 = vld1q_f32(rhs.data+12); |
| 78 | + C3 = vfmaq_laneq_f32(C3, A0, B3, 0); |
| 79 | + C3 = vfmaq_laneq_f32(C3, A1, B3, 1); |
| 80 | + C3 = vfmaq_laneq_f32(C3, A2, B3, 2); |
| 81 | + C3 = vfmaq_laneq_f32(C3, A3, B3, 3); |
| 82 | + vst1q_f32(toReturn.data+12, C3); |
| 83 | + |
| 84 | + //auto toReturn = ((Matrix4<float> *)this)->operator*(rhs); |
| 85 | + return toReturn; |
40 | 86 | #endif |
41 | 87 | } |
42 | 88 |
|
|
0 commit comments