@@ -81,6 +81,25 @@ public Matrix addRowToRows(Matrix row) throws DimensionsMismatchException {
8181 return result ;
8282 }
8383
84+ public Matrix addColToCols (Matrix col ) throws DimensionsMismatchException {
85+ if (rows != col .rows ) {
86+ final int [] dimensionsA = {rows , cols };
87+ final int [] dimensionsB = {col .rows , col .cols };
88+ throw new DimensionsMismatchException (dimensionsA , dimensionsB );
89+ }
90+
91+ Matrix result = new Matrix (rows , cols );
92+
93+ for (int row = 0 ; row < rows ; row ++) {
94+ for (int currentCol = 0 ; currentCol < cols ; currentCol ++) {
95+ int index = row * cols + currentCol ;
96+ result .data [index ] = data [index ] + col .data [row ];
97+ }
98+ }
99+
100+ return result ;
101+ }
102+
84103 public Matrix relu () {
85104 Matrix result = new Matrix (rows , cols );
86105
@@ -91,6 +110,18 @@ public Matrix relu() {
91110 return result ;
92111 }
93112
113+ public Matrix vectorizedReluDerivative () {
114+ Matrix result = new Matrix (rows , cols );
115+
116+ for (int i = 0 ; i < data .length ; i ++) {
117+ if (data [i ] > 0 ) {
118+ result .data [i ] = 1 ;
119+ }
120+ }
121+
122+ return result ;
123+ }
124+
94125 public Matrix softmax () {
95126 Matrix result = new Matrix (rows , cols );
96127
@@ -123,6 +154,35 @@ public Matrix softmax() {
123154 return result ;
124155 }
125156
157+ // public Matrix fastBatchSoftmaxDerivative(Matrix output) {
158+ // Matrix partialDerivatives = new Matrix(cols, cols);
159+ //
160+ // // for each set of output features
161+ // for(int outputRow = 0; outputRow < output.rows; outputRow++) {
162+ // int offset = outputRow * cols;
163+ // // for each output feature in the set
164+ // for(int i = 0; i < cols; i++) {
165+ // float valueI = output.data[offset + i];
166+ // // for each input feature in the set
167+ // for(int j = 0; j < cols; j++) {
168+ // float valueJ = output.data[offset + j];
169+ // if(i == j) {
170+ // partialDerivatives.data[i * cols + j] += valueI * (1 - valueI);
171+ // }
172+ // else {
173+ // partialDerivatives.data[i * cols + j] += -valueI * valueJ;
174+ // }
175+ // }
176+ // }
177+ // }
178+ //
179+ // for(int i = 0; i < partialDerivatives.data.length; i++) {
180+ // partialDerivatives.data[i] /= output.rows;
181+ // }
182+ //
183+ // return partialDerivatives;
184+ // }
185+
126186 public static boolean isCompatibleWithGPU (GPU gpu ) {
127187 return gpu .isInitialized () &&
128188 gpu .getKernel ("Matrices::matrixMultiply" ) != null &&
@@ -189,15 +249,17 @@ public Matrix multiply(GPU gpu, Matrix other) {
189249
190250 public Matrix addRowToRows (GPU gpu , Matrix row ) {
191251 if (cols != row .cols ) {
192- return null ;
252+ final int [] dimensionsA = {rows , cols };
253+ final int [] dimensionsB = {row .rows , row .cols };
254+ throw new DimensionsMismatchException (dimensionsA , dimensionsB );
193255 }
194256
195257 cl_context context = gpu .getContext ();
196258 cl_command_queue commandQueue = gpu .getCommandQueue ();
197259 cl_kernel kernel = gpu .getKernel ("Matrices::addRowToRows" );
198260
199261 if (kernel == null ) {
200- return null ;
262+ throw new NullPointerException ( "Matrices::addRowToRows not found to be loaded in GPU" ) ;
201263 }
202264
203265 Matrix result = new Matrix (rows , cols );
@@ -243,6 +305,64 @@ public Matrix addRowToRows(GPU gpu, Matrix row) {
243305 return result ;
244306 }
245307
/**
 * GPU-accelerated variant of {@code addColToCols(Matrix)}: adds the column
 * vector {@code col} across the columns of this matrix by dispatching the
 * "Matrices::addColToCols" OpenCL kernel, one work-item per row.
 *
 * @param gpu wrapper supplying the OpenCL context, command queue, and
 *            pre-built kernels
 * @param col matrix whose row count must match this matrix's; presumably a
 *            rows x 1 column vector — the guard below only compares row
 *            counts, so confirm the kernel's expectations for wider inputs
 * @return a new {@code Matrix} containing the result read back from the device
 */
// NOTE(review): this method throws DimensionsMismatchException without
// declaring it in its signature, unlike the CPU addColToCols — verify the
// exception type is unchecked, otherwise this will not compile.
public Matrix addColToCols (GPU gpu , Matrix col ) {
    if (rows != col .rows ) {
        final int [] dimensionsA = {rows , cols };
        final int [] dimensionsB = {col .rows , col .cols };
        throw new DimensionsMismatchException (dimensionsA , dimensionsB );
    }

    cl_context context = gpu .getContext ();
    cl_command_queue commandQueue = gpu .getCommandQueue ();
    cl_kernel kernel = gpu .getKernel ("Matrices::addColToCols" );

    // Fail loudly if the kernel was never compiled/registered on this GPU.
    if (kernel == null ) {
        throw new NullPointerException ("Matrices::addColToCols not found to be loaded in GPU" );
    }

    Matrix result = new Matrix (rows , cols );

    Pointer pointerA = Pointer .to (data );
    Pointer pointerB = Pointer .to (col .data );
    Pointer pointerOut = Pointer .to (result .data );

    // Allocate the memory objects for the input- and output data.
    // Inputs are copied from host memory at creation; the output buffer is
    // left uninitialized on the device and read back after the kernel runs.
    cl_mem memoryA = clCreateBuffer (context ,
        CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR ,
        Sizeof .cl_float * data .length , pointerA , null );
    cl_mem memoryB = clCreateBuffer (context ,
        CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR ,
        Sizeof .cl_float * col .data .length , pointerB , null );
    cl_mem memoryOut = clCreateBuffer (context ,
        CL_MEM_READ_WRITE ,
        Sizeof .cl_float * result .data .length , null , null );

    // Set the arguments for the kernel: (out, A, col, cols). The argument
    // order here must match the kernel's parameter declaration exactly.
    int argNum = 0 ;
    clSetKernelArg (kernel , argNum ++, Sizeof .cl_mem , Pointer .to (memoryOut ));
    clSetKernelArg (kernel , argNum ++, Sizeof .cl_mem , Pointer .to (memoryA ));
    clSetKernelArg (kernel , argNum ++, Sizeof .cl_mem , Pointer .to (memoryB ));
    clSetKernelArg (kernel , argNum ++, Sizeof .cl_uint , Pointer .to (new int []{cols }));

    // Set the work-item dimensions: one global work-item per matrix row,
    // with work-groups of size 1.
    long local_work_sizes [] = new long []{1 };
    long global_work_sizes [] = new long []{rows };

    // Execute the kernel
    clEnqueueNDRangeKernel (commandQueue , kernel , 1 , null ,
        global_work_sizes , local_work_sizes , 0 , null , null );

    // Read the output data. CL_TRUE makes this a blocking read, so it also
    // serves as the synchronization point for the kernel above.
    clEnqueueReadBuffer (commandQueue , memoryOut , CL_TRUE , 0 ,
        result .data .length * Sizeof .cl_float , pointerOut , 0 , null , null );

    // Release device buffers; host-side arrays remain owned by the matrices.
    clReleaseMemObject (memoryA );
    clReleaseMemObject (memoryB );
    clReleaseMemObject (memoryOut );

    return result ;
}
365+
246366 public Matrix relu (GPU gpu ) {
247367 cl_context context = gpu .getContext ();
248368 cl_command_queue commandQueue = gpu .getCommandQueue ();
0 commit comments