Skip to content

Commit a93070d

Browse files
committed
arm neon intrinsics test
1 parent f7bf612 commit a93070d

4 files changed

Lines changed: 55 additions & 16 deletions

File tree

inc/vector_math/matrix4d.hpp

Lines changed: 0 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1,15 +1,8 @@
11
#pragma once
22

3-
#include <vector_math/common.hpp>
43
#include <vector_math/matrix4.hpp>
54
#include <vector_math/vector4d.hpp>
65

7-
#ifdef __VECTOR_MATH_ARCH_X86_X64
8-
#include <immintrin.h>
9-
#elif defined(__VECTOR_MATH_ARCH_ARM)
10-
#endif
11-
12-
136
namespace systems::leal::vector_math
147
{
158
class alignas(32) Matrix4d : public Matrix4<double> {

inc/vector_math/matrix4f.hpp

Lines changed: 0 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1,15 +1,8 @@
11
#pragma once
22

3-
#include <vector_math/common.hpp>
43
#include <vector_math/matrix4.hpp>
54
#include <vector_math/vector4f.hpp>
65

7-
#ifdef __VECTOR_MATH_ARCH_X86_X64
8-
#include <immintrin.h>
9-
#elif defined(__VECTOR_MATH_ARCH_ARM)
10-
#endif
11-
12-
136
namespace systems::leal::vector_math
147
{
158
class alignas(16) Matrix4f : public Matrix4<float> {

src/matrix4d.cpp

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,12 @@
1+
#include <vector_math/common.hpp>
12
#include <vector_math/matrix4d.hpp>
23

4+
#ifdef __VECTOR_MATH_ARCH_X86_X64
5+
#include <immintrin.h>
6+
#elif defined(__VECTOR_MATH_ARCH_ARM)
7+
#include <arm_neon.h>
8+
#endif
9+
310
using namespace systems::leal::vector_math;
411

512
Matrix4d::Matrix4d():Matrix4<double>() {

src/matrix4f.cpp

Lines changed: 48 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,11 @@
11
#include <vector_math/matrix4f.hpp>
2+
#include <vector_math/common.hpp>
3+
4+
#ifdef __VECTOR_MATH_ARCH_X86_X64
5+
#include <immintrin.h>
6+
#elif defined(__VECTOR_MATH_ARCH_ARM)
7+
#include <arm_neon.h>
8+
#endif
29

310
using namespace systems::leal::vector_math;
411

@@ -35,8 +42,47 @@ Matrix4f Matrix4f::operator*(const Matrix4f &rhs) const {
3542
}
3643
return toReturn;
3744
#elif defined(__VECTOR_MATH_ARCH_ARM)
38-
auto toReturn = ((Matrix4<float> *)this)->operator*(rhs);
39-
return *(Matrix4f *)&toReturn;
45+
Matrix4f toReturn;
46+
float32x4_t A0 = vld1q_f32(this->data);
47+
float32x4_t A1 = vld1q_f32(this->data+4);
48+
float32x4_t A2 = vld1q_f32(this->data+8);
49+
float32x4_t A3 = vld1q_f32(this->data+12);
50+
51+
float32x4_t C0 = vmovq_n_f32(0);
52+
float32x4_t C1 = vmovq_n_f32(0);
53+
float32x4_t C2 = vmovq_n_f32(0);
54+
float32x4_t C3 = vmovq_n_f32(0);
55+
56+
float32x4_t B0 = vld1q_f32(rhs.data);
57+
C0 = vfmaq_laneq_f32(C0, A0, B0, 0);
58+
C0 = vfmaq_laneq_f32(C0, A1, B0, 1);
59+
C0 = vfmaq_laneq_f32(C0, A2, B0, 2);
60+
C0 = vfmaq_laneq_f32(C0, A3, B0, 3);
61+
vst1q_f32(toReturn.data, C0);
62+
63+
float32x4_t B1 = vld1q_f32(rhs.data+4);
64+
C1 = vfmaq_laneq_f32(C1, A0, B1, 0);
65+
C1 = vfmaq_laneq_f32(C1, A1, B1, 1);
66+
C1 = vfmaq_laneq_f32(C1, A2, B1, 2);
67+
C1 = vfmaq_laneq_f32(C1, A3, B1, 3);
68+
vst1q_f32(toReturn.data+4, C1);
69+
70+
float32x4_t B2 = vld1q_f32(rhs.data+8);
71+
C2 = vfmaq_laneq_f32(C2, A0, B2, 0);
72+
C2 = vfmaq_laneq_f32(C2, A1, B2, 1);
73+
C2 = vfmaq_laneq_f32(C2, A2, B2, 2);
74+
C2 = vfmaq_laneq_f32(C2, A3, B2, 3);
75+
vst1q_f32(toReturn.data+8, C2);
76+
77+
float32x4_t B3 = vld1q_f32(rhs.data+12);
78+
C3 = vfmaq_laneq_f32(C3, A0, B3, 0);
79+
C3 = vfmaq_laneq_f32(C3, A1, B3, 1);
80+
C3 = vfmaq_laneq_f32(C3, A2, B3, 2);
81+
C3 = vfmaq_laneq_f32(C3, A3, B3, 3);
82+
vst1q_f32(toReturn.data+12, C3);
83+
84+
//auto toReturn = ((Matrix4<float> *)this)->operator*(rhs);
85+
return toReturn;
4086
#endif
4187
}
4288

0 commit comments

Comments
 (0)