-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpart2.c~
More file actions
133 lines (117 loc) · 10.2 KB
/
part2.c~
File metadata and controls
133 lines (117 loc) · 10.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#include <emmintrin.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define KERNX 3 // x-size of the kernel; it will always be odd
#define KERNY 3 // y-size of the kernel; it will always be odd

/*
 * conv2D - 2D convolution of a row-major float image with a
 * KERNX x KERNY kernel.
 *
 * in           data_size_X * data_size_Y input samples, row-major; only read.
 * out          data_size_X * data_size_Y results, row-major; every element
 *              is overwritten, so the caller does not need to clear it.
 * data_size_X  image width in samples.
 * data_size_Y  image height in samples.
 * kernel       KERNX * KERNY coefficients, row-major. The kernel is flipped
 *              (rotated 180 degrees) before it is applied.
 *
 * Samples outside the image are treated as zero.
 *
 * Returns 1 on success (including when either dimension is non-positive and
 * there is nothing to compute), or 0 if the zero-padded scratch copy of the
 * image could not be allocated.
 */
int conv2D(float* in, float* out, int data_size_X, int data_size_Y,
           float* kernel)
{
    enum { TAPS = KERNX * KERNY, LANES = 4, BLOCK = 4 * LANES };

    if (data_size_X <= 0 || data_size_Y <= 0) {
        return 1;
    }

    // Border widths: the distance from the kernel centre to its edges.
    const int pad_x = (KERNX - 1) / 2;
    const int pad_y = (KERNY - 1) / 2;

    const size_t width = (size_t)data_size_X;
    const size_t padded_w = width + 2 * (size_t)pad_x;
    const size_t padded_h = (size_t)data_size_Y + 2 * (size_t)pad_y;

    // The padded copy lives on the heap: its size grows with the image, so a
    // stack array would overflow for large inputs. calloc supplies the zero
    // border.
    float *padded = calloc(padded_h, padded_w * sizeof *padded);
    if (padded == NULL) {
        return 0;
    }

    #pragma omp parallel for
    for (int row = 0; row < data_size_Y; row++) {
        memcpy(padded + ((size_t)row + (size_t)pad_y) * padded_w + (size_t)pad_x,
               in + (size_t)row * width,
               width * sizeof *padded);
    }

    // Flip the kernel once so the inner loops can walk it front to back.
    float flipped[TAPS];
    for (int t = 0; t < TAPS; t++) {
        flipped[TAPS - 1 - t] = kernel[t];
    }

    // Broadcast each coefficient across all four SSE lanes.
    __m128 taps[TAPS];
    for (int t = 0; t < TAPS; t++) {
        taps[t] = _mm_load1_ps(&flipped[t]);
    }

    // Columns [0, simd_cols) are produced BLOCK at a time with SSE; the
    // remaining (< BLOCK) columns of each row use the scalar loop below.
    const int simd_cols = BLOCK * (data_size_X / BLOCK);

    #pragma omp parallel for
    for (int y = 0; y < data_size_Y; y++) { // y coordinate of the output row
        float *out_row = out + (size_t)y * width;

        for (int x = 0; x < simd_cols; x += BLOCK) { // x of the first output in this block
            __m128 acc0 = _mm_setzero_ps();
            __m128 acc1 = _mm_setzero_ps();
            __m128 acc2 = _mm_setzero_ps();
            __m128 acc3 = _mm_setzero_ps();

            int t = 0;
            for (int ky = 0; ky < KERNY; ky++) {
                const float *src_row =
                    padded + (size_t)(y + ky) * padded_w + (size_t)x;
                for (int kx = 0; kx < KERNX; kx++, t++) {
                    // Unaligned loads: the window start shifts by one float
                    // per kernel column. The last load of the last block ends
                    // exactly at the end of the padded row.
                    const float *src = src_row + kx;
                    acc0 = _mm_add_ps(acc0, _mm_mul_ps(taps[t], _mm_loadu_ps(src)));
                    acc1 = _mm_add_ps(acc1, _mm_mul_ps(taps[t], _mm_loadu_ps(src + LANES)));
                    acc2 = _mm_add_ps(acc2, _mm_mul_ps(taps[t], _mm_loadu_ps(src + 2 * LANES)));
                    acc3 = _mm_add_ps(acc3, _mm_mul_ps(taps[t], _mm_loadu_ps(src + 3 * LANES)));
                }
            }

            _mm_storeu_ps(out_row + x, acc0);
            _mm_storeu_ps(out_row + x + LANES, acc1);
            _mm_storeu_ps(out_row + x + 2 * LANES, acc2);
            _mm_storeu_ps(out_row + x + 3 * LANES, acc3);
        }

        for (int x = simd_cols; x < data_size_X; x++) {
            // Accumulate locally and assign, matching the SIMD path, so the
            // result does not depend on what out held on entry.
            float sum = 0.0f;
            int t = 0;
            for (int ky = 0; ky < KERNY; ky++) {
                const float *src_row =
                    padded + (size_t)(y + ky) * padded_w + (size_t)x;
                for (int kx = 0; kx < KERNX; kx++, t++) {
                    sum += flipped[t] * src_row[kx];
                }
            }
            out_row[x] = sum;
        }
    }

    free(padded);
    return 1;
}