|
| 1 | +/* |
| 2 | + * Copyright 2026 Daniel Cederberg and William Zhang |
| 3 | + * |
| 4 | + * This file is part of the DNLP-differentiation-engine project. |
| 5 | + * |
| 6 | + * Licensed under the Apache License, Version 2.0 (the "License"); |
| 7 | + * you may not use this file except in compliance with the License. |
| 8 | + * You may obtain a copy of the License at |
| 9 | + * |
| 10 | + * http://www.apache.org/licenses/LICENSE-2.0 |
| 11 | + * |
| 12 | + * Unless required by applicable law or agreed to in writing, software |
| 13 | + * distributed under the License is distributed on an "AS IS" BASIS, |
| 14 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 15 | + * See the License for the specific language governing permissions and |
| 16 | + * limitations under the License. |
| 17 | + */ |
| 18 | +#include "utils/cblas_wrapper.h" |
| 19 | +#include "utils/iVec.h" |
| 20 | +#include "utils/matrix.h" |
| 21 | +#include <stdlib.h> |
| 22 | +#include <string.h> |
| 23 | + |
| 24 | +static void dense_block_left_mult_vec(const Matrix *A, const double *x, double *y, |
| 25 | + int p) |
| 26 | +{ |
| 27 | + const Dense_Matrix *dm = (const Dense_Matrix *) A; |
| 28 | + int m = dm->base.m; |
| 29 | + int n = dm->base.n; |
| 30 | + |
| 31 | + /* y = kron(I_p, A) @ x via a single dgemm call: |
| 32 | + Treat x as n x p (column-major blocks) and y as m x p. |
| 33 | + But x and y are stored as p blocks of length n and m respectively |
| 34 | + (i.e. block-interleaved). This is the same as treating them as |
| 35 | + row-major matrices of shape p x n and p x m, so: |
| 36 | + y (p x m) = x (p x n) * A^T (n x m), all row-major. |
| 37 | + cblas with RowMajor: C = alpha * A * B + beta * C |
| 38 | + where A = x (p x n), B = A^T (n x m), C = y (p x m). */ |
| 39 | + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans, p, m, n, 1.0, x, n, dm->x, |
| 40 | + n, 0.0, y, m); |
| 41 | +} |
| 42 | + |
| 43 | +static CSC_Matrix *dense_block_left_mult_sparsity(const Matrix *A, |
| 44 | + const CSC_Matrix *J, int p) |
| 45 | +{ |
| 46 | + int m = A->m; |
| 47 | + int n = A->n; |
| 48 | + int i, j, jj, block, block_start, block_end, block_jj_start, row_offset; |
| 49 | + |
| 50 | + int *Cp = (int *) malloc((J->n + 1) * sizeof(int)); |
| 51 | + iVec *Ci = iVec_new(J->n * m); |
| 52 | + Cp[0] = 0; |
| 53 | + |
| 54 | + /* for each column of J */ |
| 55 | + for (j = 0; j < J->n; j++) |
| 56 | + { |
| 57 | + /* if empty we continue */ |
| 58 | + if (J->p[j] == J->p[j + 1]) |
| 59 | + { |
| 60 | + Cp[j + 1] = Cp[j]; |
| 61 | + continue; |
| 62 | + } |
| 63 | + |
| 64 | + /* process each of p blocks of rows in this column of J */ |
| 65 | + jj = J->p[j]; |
| 66 | + for (block = 0; block < p; block++) |
| 67 | + { |
| 68 | + // ----------------------------------------------------------------- |
| 69 | + // find start and end indices of rows of J in this block |
| 70 | + // ----------------------------------------------------------------- |
| 71 | + block_start = block * n; |
| 72 | + block_end = block_start + n; |
| 73 | + while (jj < J->p[j + 1] && J->i[jj] < block_start) |
| 74 | + { |
| 75 | + jj++; |
| 76 | + } |
| 77 | + |
| 78 | + block_jj_start = jj; |
| 79 | + while (jj < J->p[j + 1] && J->i[jj] < block_end) |
| 80 | + { |
| 81 | + jj++; |
| 82 | + } |
| 83 | + |
| 84 | + /* if no entries in this block, continue */ |
| 85 | + if (jj == block_jj_start) |
| 86 | + { |
| 87 | + continue; |
| 88 | + } |
| 89 | + |
| 90 | + /* dense A: all m rows contribute */ |
| 91 | + row_offset = block * m; |
| 92 | + for (i = 0; i < m; i++) |
| 93 | + { |
| 94 | + iVec_append(Ci, row_offset + i); |
| 95 | + } |
| 96 | + } |
| 97 | + Cp[j + 1] = Ci->len; |
| 98 | + } |
| 99 | + |
| 100 | + CSC_Matrix *C = new_csc_matrix(m * p, J->n, Ci->len); |
| 101 | + memcpy(C->p, Cp, (J->n + 1) * sizeof(int)); |
| 102 | + memcpy(C->i, Ci->data, Ci->len * sizeof(int)); |
| 103 | + free(Cp); |
| 104 | + iVec_free(Ci); |
| 105 | + |
| 106 | + return C; |
| 107 | +} |
| 108 | + |
| 109 | +static void dense_block_left_mult_values(const Matrix *A, const CSC_Matrix *J, |
| 110 | + CSC_Matrix *C) |
| 111 | +{ |
| 112 | + const Dense_Matrix *dm = (const Dense_Matrix *) A; |
| 113 | + int m = dm->base.m; |
| 114 | + int n = dm->base.n; |
| 115 | + int k = J->n; |
| 116 | + |
| 117 | + int i, j, s, block, block_start, block_end, start, end; |
| 118 | + |
| 119 | + double *j_dense = dm->work; |
| 120 | + |
| 121 | + /* for each column of J (and C) */ |
| 122 | + for (j = 0; j < k; j++) |
| 123 | + { |
| 124 | + for (i = C->p[j]; i < C->p[j + 1]; i += m) |
| 125 | + { |
| 126 | + block = C->i[i] / m; |
| 127 | + block_start = block * n; |
| 128 | + block_end = block_start + n; |
| 129 | + |
| 130 | + start = J->p[j]; |
| 131 | + end = J->p[j + 1]; |
| 132 | + |
| 133 | + while (start < J->p[j + 1] && J->i[start] < block_start) |
| 134 | + { |
| 135 | + start++; |
| 136 | + } |
| 137 | + |
| 138 | + while (end > start && J->i[end - 1] >= block_end) |
| 139 | + { |
| 140 | + end--; |
| 141 | + } |
| 142 | + |
| 143 | + /* scatter sparse J column into dense vector and then compute |
| 144 | + A @ j_dense */ |
| 145 | + memset(j_dense, 0, n * sizeof(double)); |
| 146 | + for (s = start; s < end; s++) |
| 147 | + { |
| 148 | + j_dense[J->i[s] - block_start] = J->x[s]; |
| 149 | + } |
| 150 | + |
| 151 | + cblas_dgemv(CblasRowMajor, CblasNoTrans, m, n, 1.0, dm->x, n, j_dense, 1, |
| 152 | + 0.0, C->x + i, 1); |
| 153 | + } |
| 154 | + } |
| 155 | +} |
| 156 | + |
| 157 | +static void dense_free(Matrix *A) |
| 158 | +{ |
| 159 | + Dense_Matrix *dm = (Dense_Matrix *) A; |
| 160 | + free(dm->x); |
| 161 | + free(dm->work); |
| 162 | + free(dm); |
| 163 | +} |
| 164 | + |
| 165 | +Matrix *new_dense_matrix(int m, int n, const double *data) |
| 166 | +{ |
| 167 | + Dense_Matrix *dm = (Dense_Matrix *) calloc(1, sizeof(Dense_Matrix)); |
| 168 | + dm->base.m = m; |
| 169 | + dm->base.n = n; |
| 170 | + dm->base.block_left_mult_vec = dense_block_left_mult_vec; |
| 171 | + dm->base.block_left_mult_sparsity = dense_block_left_mult_sparsity; |
| 172 | + dm->base.block_left_mult_values = dense_block_left_mult_values; |
| 173 | + dm->base.free_fn = dense_free; |
| 174 | + dm->x = (double *) malloc(m * n * sizeof(double)); |
| 175 | + memcpy(dm->x, data, m * n * sizeof(double)); |
| 176 | + dm->work = (double *) malloc(n * sizeof(double)); |
| 177 | + return &dm->base; |
| 178 | +} |
| 179 | + |
| 180 | +Matrix *dense_matrix_trans(const Dense_Matrix *A) |
| 181 | +{ |
| 182 | + int m = A->base.m; |
| 183 | + int n = A->base.n; |
| 184 | + double *AT_x = (double *) malloc(m * n * sizeof(double)); |
| 185 | + |
| 186 | + for (int i = 0; i < m; i++) |
| 187 | + { |
| 188 | + for (int j = 0; j < n; j++) |
| 189 | + { |
| 190 | + AT_x[j * m + i] = A->x[i * n + j]; |
| 191 | + } |
| 192 | + } |
| 193 | + |
| 194 | + Matrix *result = new_dense_matrix(n, m, AT_x); |
| 195 | + free(AT_x); |
| 196 | + return result; |
| 197 | +} |
0 commit comments