@@ -81,6 +81,25 @@ public Matrix addRowToRows(Matrix row) throws DimensionsMismatchException {
8181 return result ;
8282 }
8383
84+ public Matrix addColToCols (Matrix col ) throws DimensionsMismatchException {
85+ if (rows != col .rows ) {
86+ final int [] dimensionsA = {rows , cols };
87+ final int [] dimensionsB = {col .rows , col .cols };
88+ throw new DimensionsMismatchException (dimensionsA , dimensionsB );
89+ }
90+
91+ Matrix result = new Matrix (rows , cols );
92+
93+ for (int row = 0 ; row < rows ; row ++) {
94+ for (int currentCol = 0 ; currentCol < cols ; currentCol ++) {
95+ int index = row * cols + currentCol ;
96+ result .data [index ] = data [index ] + col .data [row ];
97+ }
98+ }
99+
100+ return result ;
101+ }
102+
84103 public Matrix relu () {
85104 Matrix result = new Matrix (rows , cols );
86105
@@ -91,6 +110,18 @@ public Matrix relu() {
91110 return result ;
92111 }
93112
113+ public Matrix vectorizedReluDerivative () {
114+ Matrix result = new Matrix (rows , cols );
115+
116+ for (int i = 0 ; i < data .length ; i ++) {
117+ if (data [i ] > 0 ) {
118+ result .data [i ] = 1 ;
119+ }
120+ }
121+
122+ return result ;
123+ }
124+
94125 public Matrix softmax () {
95126 Matrix result = new Matrix (rows , cols );
96127
@@ -123,6 +154,35 @@ public Matrix softmax() {
123154 return result ;
124155 }
125156
157+ // public Matrix fastBatchSoftmaxDerivative(Matrix output) {
158+ // Matrix partialDerivatives = new Matrix(cols, cols);
159+ //
160+ // // for each set of output features
161+ // for(int outputRow = 0; outputRow < output.rows; outputRow++) {
162+ // int offset = outputRow * cols;
163+ // // for each output feature in the set
164+ // for(int i = 0; i < cols; i++) {
165+ // float valueI = output.data[offset + i];
166+ // // for each input feature in the set
167+ // for(int j = 0; j < cols; j++) {
168+ // float valueJ = output.data[offset + j];
169+ // if(i == j) {
170+ // partialDerivatives.data[i * cols + j] += valueI * (1 - valueI);
171+ // }
172+ // else {
173+ // partialDerivatives.data[i * cols + j] += -valueI * valueJ;
174+ // }
175+ // }
176+ // }
177+ // }
178+ //
179+ // for(int i = 0; i < partialDerivatives.data.length; i++) {
180+ // partialDerivatives.data[i] /= output.rows;
181+ // }
182+ //
183+ // return partialDerivatives;
184+ // }
185+
126186 public static boolean isCompatibleWithGPU (GPU gpu ) {
127187 return gpu .isInitialized () &&
128188 gpu .getKernel ("Matrices::matrixMultiply" ) != null &&
@@ -189,15 +249,17 @@ public Matrix multiply(GPU gpu, Matrix other) {
189249
190250 public Matrix addRowToRows (GPU gpu , Matrix row ) {
191251 if (cols != row .cols ) {
192- return null ;
252+ final int [] dimensionsA = {rows , cols };
253+ final int [] dimensionsB = {row .rows , row .cols };
254+ throw new DimensionsMismatchException (dimensionsA , dimensionsB );
193255 }
194256
195257 cl_context context = gpu .getContext ();
196258 cl_command_queue commandQueue = gpu .getCommandQueue ();
197259 cl_kernel kernel = gpu .getKernel ("Matrices::addRowToRows" );
198260
199261 if (kernel == null ) {
200- return null ;
262+ throw new NullPointerException ( "Matrices::addRowToRows not found to be loaded in GPU" ) ;
201263 }
202264
203265 Matrix result = new Matrix (rows , cols );
@@ -243,6 +305,64 @@ public Matrix addRowToRows(GPU gpu, Matrix row) {
243305 return result ;
244306 }
245307
/**
 * GPU-accelerated variant of {@code addColToCols(Matrix)}: adds the column
 * vector {@code col} across the columns of this matrix by dispatching the
 * "Matrices::addColToCols" OpenCL kernel, one work-item per row.
 *
 * @param gpu wrapper supplying the OpenCL context, command queue, and
 *            pre-built kernels
 * @param col matrix whose row count must match this matrix's; presumably a
 *            rows x 1 column vector — the guard below only compares row
 *            counts, so confirm the kernel's expectations for wider inputs
 * @return a new {@code Matrix} containing the result read back from the device
 */
// NOTE(review): this method throws DimensionsMismatchException without
// declaring it in its signature, unlike the CPU addColToCols — verify the
// exception type is unchecked, otherwise this will not compile.
public Matrix addColToCols (GPU gpu , Matrix col ) {
    if (rows != col .rows ) {
        final int [] dimensionsA = {rows , cols };
        final int [] dimensionsB = {col .rows , col .cols };
        throw new DimensionsMismatchException (dimensionsA , dimensionsB );
    }

    cl_context context = gpu .getContext ();
    cl_command_queue commandQueue = gpu .getCommandQueue ();
    cl_kernel kernel = gpu .getKernel ("Matrices::addColToCols" );

    // Fail loudly if the kernel was never compiled/registered on this GPU.
    if (kernel == null ) {
        throw new NullPointerException ("Matrices::addColToCols not found to be loaded in GPU" );
    }

    Matrix result = new Matrix (rows , cols );

    Pointer pointerA = Pointer .to (data );
    Pointer pointerB = Pointer .to (col .data );
    Pointer pointerOut = Pointer .to (result .data );

    // Allocate the memory objects for the input- and output data.
    // Inputs are copied from host memory at creation; the output buffer is
    // left uninitialized on the device and read back after the kernel runs.
    cl_mem memoryA = clCreateBuffer (context ,
        CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR ,
        Sizeof .cl_float * data .length , pointerA , null );
    cl_mem memoryB = clCreateBuffer (context ,
        CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR ,
        Sizeof .cl_float * col .data .length , pointerB , null );
    cl_mem memoryOut = clCreateBuffer (context ,
        CL_MEM_READ_WRITE ,
        Sizeof .cl_float * result .data .length , null , null );

    // Set the arguments for the kernel: (out, A, col, cols). The argument
    // order here must match the kernel's parameter declaration exactly.
    int argNum = 0 ;
    clSetKernelArg (kernel , argNum ++, Sizeof .cl_mem , Pointer .to (memoryOut ));
    clSetKernelArg (kernel , argNum ++, Sizeof .cl_mem , Pointer .to (memoryA ));
    clSetKernelArg (kernel , argNum ++, Sizeof .cl_mem , Pointer .to (memoryB ));
    clSetKernelArg (kernel , argNum ++, Sizeof .cl_uint , Pointer .to (new int []{cols }));

    // Set the work-item dimensions: one global work-item per matrix row,
    // with work-groups of size 1.
    long local_work_sizes [] = new long []{1 };
    long global_work_sizes [] = new long []{rows };

    // Execute the kernel
    clEnqueueNDRangeKernel (commandQueue , kernel , 1 , null ,
        global_work_sizes , local_work_sizes , 0 , null , null );

    // Read the output data. CL_TRUE makes this a blocking read, so it also
    // serves as the synchronization point for the kernel above.
    clEnqueueReadBuffer (commandQueue , memoryOut , CL_TRUE , 0 ,
        result .data .length * Sizeof .cl_float , pointerOut , 0 , null , null );

    // Release device buffers; host-side arrays remain owned by the matrices.
    clReleaseMemObject (memoryA );
    clReleaseMemObject (memoryB );
    clReleaseMemObject (memoryOut );

    return result ;
}
365+
246366 public Matrix relu (GPU gpu ) {
247367 cl_context context = gpu .getContext ();
248368 cl_command_queue commandQueue = gpu .getCommandQueue ();
0 commit comments