Skip to content

Commit 9c23b40

Browse files
Transurgeonclaude
andcommitted
Merge origin/main into parameter-support
Sync parameter-support with main's left matmul 100x perf improvements (PR #44) and right matmul refactor (PR #46). Simplify param matmul to store only the small A matrix instead of block-diagonal — block_left_multiply_* functions handle the rest. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2 parents b8cf436 + 5316598 commit 9c23b40

25 files changed

Lines changed: 2158 additions & 1201 deletions

CMakeLists.txt

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ set(CMAKE_C_STANDARD 99)
44

55
set(DIFF_ENGINE_VERSION_MAJOR 0)
66
set(DIFF_ENGINE_VERSION_MINOR 1)
7-
set(DIFF_ENGINE_VERSION_PATCH 1)
7+
set(DIFF_ENGINE_VERSION_PATCH 2)
88
set(DIFF_ENGINE_VERSION "${DIFF_ENGINE_VERSION_MAJOR}.${DIFF_ENGINE_VERSION_MINOR}.${DIFF_ENGINE_VERSION_PATCH}")
99
add_compile_definitions(DIFF_ENGINE_VERSION="${DIFF_ENGINE_VERSION}")
1010

@@ -81,6 +81,8 @@ set_property(TARGET dnlp_diff PROPERTY POSITION_INDEPENDENT_CODE ON)
8181
# =============================================================================
8282
# C tests (only for standalone builds)
8383
# =============================================================================
84+
option(PROFILE_ONLY "Build only profiling tests" OFF)
85+
8486
if(NOT SKBUILD)
8587
include_directories(${PROJECT_SOURCE_DIR}/tests)
8688
enable_testing()
@@ -90,5 +92,11 @@ if(NOT SKBUILD)
9092
tests/test_helpers.c
9193
)
9294
target_link_libraries(all_tests dnlp_diff)
95+
96+
if(PROFILE_ONLY)
97+
target_compile_definitions(all_tests PRIVATE PROFILE_ONLY)
98+
message(STATUS "Building ONLY profiling tests")
99+
endif()
100+
93101
add_test(NAME AllTests COMMAND all_tests)
94102
endif()

include/subexpr.h

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -117,9 +117,12 @@ typedef struct left_matmul_expr
117117
expr base;
118118
CSR_Matrix *A;
119119
CSR_Matrix *AT;
120-
CSC_Matrix *CSC_work;
121-
expr *param_source;
122-
int src_m, src_n; /* original (non-block-diag) matrix dimensions */
120+
int n_blocks;
121+
CSC_Matrix *Jchild_CSC;
122+
CSC_Matrix *J_CSC;
123+
int *csc_to_csr_workspace;
124+
expr *param_source; /* if non-NULL, A/AT values come from this parameter */
125+
int src_m, src_n; /* original matrix dimensions */
123126
} left_matmul_expr;
124127

125128
/* Right matrix multiplication: y = f(x) * A where f(x) is an expression.

include/utils/CSC_Matrix.h

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -52,15 +52,7 @@ void csc_matvec_fill_values(const CSC_Matrix *A, const double *z, CSR_Matrix *C)
5252
CSC_Matrix *csr_to_csc_fill_sparsity(const CSR_Matrix *A, int *iwork);
5353
void csr_to_csc_fill_values(const CSR_Matrix *A, CSC_Matrix *C, int *iwork);
5454

55-
/* Allocate CSR matrix for C = A @ B where A is CSR, B is CSC
56-
* Precomputes sparsity pattern. No workspace required.
57-
*/
58-
CSR_Matrix *csr_csc_matmul_alloc(const CSR_Matrix *A, const CSC_Matrix *B);
59-
60-
/* Fill values of C = A @ B where A is CSR, B is CSC
61-
* C must have sparsity pattern already computed
62-
*/
63-
void csr_csc_matmul_fill_values(const CSR_Matrix *A, const CSC_Matrix *B,
64-
CSR_Matrix *C);
55+
CSR_Matrix *csc_to_csr_fill_sparsity(const CSC_Matrix *A, int *iwork);
56+
void csc_to_csr_fill_values(const CSC_Matrix *A, CSR_Matrix *C, int *iwork);
6557

6658
#endif /* CSC_MATRIX_H */

include/utils/CSR_Matrix.h

Lines changed: 12 additions & 107 deletions
Original file line numberDiff line numberDiff line change
@@ -25,31 +25,29 @@ typedef struct CSR_Matrix
2525
int nnz;
2626
} CSR_Matrix;
2727

28-
/* Allocate a new CSR matrix with given dimensions and nnz */
28+
/* constructors and destructors */
2929
CSR_Matrix *new_csr_matrix(int m, int n, int nnz);
30-
31-
/* Free a CSR matrix */
30+
CSR_Matrix *new_csr(const CSR_Matrix *A);
3231
void free_csr_matrix(CSR_Matrix *matrix);
33-
34-
/* Copy CSR matrix A to C */
3532
void copy_csr_matrix(const CSR_Matrix *A, CSR_Matrix *C);
3633

37-
/* Build block-diagonal repeat A_blk = I_p kron A. Returns newly allocated CSR
38-
* matrix of size (p*A->m) x (p*A->n) with nnz = p*A->nnz. */
34+
/* transpose functionality (iwork must be of size A->n) */
35+
CSR_Matrix *transpose(const CSR_Matrix *A, int *iwork);
36+
CSR_Matrix *AT_alloc(const CSR_Matrix *A, int *iwork);
37+
void AT_fill_values(const CSR_Matrix *A, CSR_Matrix *AT, int *iwork);
38+
39+
/* Build (I_p kron A) = blkdiag(A, A, ..., A) of size (p*A->m) x (p*A->n) */
3940
CSR_Matrix *block_diag_repeat_csr(const CSR_Matrix *A, int p);
4041

41-
/* Build left-repeated Kronecker A_kron = A kron I_p. Returns newly allocated CSR
42-
* matrix of size (A->m * p) x (A->n * p) with nnz = A->nnz * p. */
42+
/* Build (A kron I_p) of size (A->m * p) x (A->n * p) with nnz = A->nnz * p. */
4343
CSR_Matrix *kron_identity_csr(const CSR_Matrix *A, int p);
4444

45-
/* matvec y = Ax, where A indices minus col_offset gives x indices. Returns y as
46-
* dense. */
45+
/* y = Ax, where y is returned as dense */
4746
void csr_matvec(const CSR_Matrix *A, const double *x, double *y, int col_offset);
4847
void csr_matvec_wo_offset(const CSR_Matrix *A, const double *x, double *y);
4948

50-
/* C = z^T A is assumed to have one row. C must have column indices pre-computed
51-
and transposed matrix AT must be provided. Fills in values of C only.
52-
*/
49+
/* Computes values of the row matrix C = z^T A (column indices must have been
50+
pre-computed) and transposed matrix AT must be provided) */
5351
void csr_matvec_fill_values(const CSR_Matrix *AT, const double *z, CSR_Matrix *C);
5452

5553
/* Insert value into CSR matrix A with just one row at col_idx. Assumes that A
@@ -63,92 +61,6 @@ void csr_insert_value(CSR_Matrix *A, int col_idx, double value);
6361
void diag_csr_mult(const double *d, const CSR_Matrix *A, CSR_Matrix *C);
6462
void diag_csr_mult_fill_values(const double *d, const CSR_Matrix *A, CSR_Matrix *C);
6563

66-
/* Compute C = A + B where A, B, C are CSR matrices
67-
* A and B must have same dimensions
68-
* C must be pre-allocated with sufficient nnz capacity.
69-
* C must be different from A and B */
70-
void sum_csr_matrices(const CSR_Matrix *A, const CSR_Matrix *B, CSR_Matrix *C);
71-
/* Compute sparsity pattern of A + B where A, B, C are CSR matrices.
72-
* Fills C->p, C->i, and C->nnz; does not touch C->x. */
73-
void sum_csr_matrices_fill_sparsity(const CSR_Matrix *A, const CSR_Matrix *B,
74-
CSR_Matrix *C);
75-
76-
/* Fill only the values of C = A + B, assuming C's sparsity pattern (p and i)
77-
* is already filled and matches the union of A and B per row. Does not modify
78-
* C->p, C->i, or C->nnz. */
79-
void sum_csr_matrices_fill_values(const CSR_Matrix *A, const CSR_Matrix *B,
80-
CSR_Matrix *C);
81-
82-
/* Compute C = diag(d1) * A + diag(d2) * B where A, B, C are CSR matrices */
83-
void sum_scaled_csr_matrices(const CSR_Matrix *A, const CSR_Matrix *B, CSR_Matrix *C,
84-
const double *d1, const double *d2);
85-
86-
/* Fill only the values of C = diag(d1) * A + diag(d2) * B, assuming C's sparsity
87-
* pattern (p and i) is already filled and matches the union of A and B per row.
88-
* Does not modify C->p, C->i, or C->nnz. */
89-
void sum_scaled_csr_matrices_fill_values(const CSR_Matrix *A, const CSR_Matrix *B,
90-
CSR_Matrix *C, const double *d1,
91-
const double *d2);
92-
93-
/* Sum all rows of A into a single row matrix C */
94-
void sum_all_rows_csr(const CSR_Matrix *A, CSR_Matrix *C,
95-
struct int_double_pair *pairs);
96-
97-
/* iwork must have size max(C->n, A->nnz), and idx_map must have size A->nnz. */
98-
void sum_all_rows_csr_fill_sparsity_and_idx_map(const CSR_Matrix *A, CSR_Matrix *C,
99-
int *iwork, int *idx_map);
100-
101-
/* Fill values of summed rows using precomputed idx_map and sparsity of C */
102-
// void sum_all_rows_csr_fill_values(const CSR_Matrix *A, CSR_Matrix *C,
103-
// const int *idx_map);
104-
105-
/* Fill accumulator for summing rows using precomputed idx_map for each nnz of A.
106-
Must memset accumulator to zero before calling. */
107-
void idx_map_accumulator(const CSR_Matrix *A, const int *idx_map,
108-
double *accumulator);
109-
void idx_map_accumulator_with_spacing(const CSR_Matrix *A, const int *idx_map,
110-
double *accumulator, int spacing);
111-
112-
/* Sum blocks of rows of A into a matrix C */
113-
void sum_block_of_rows_csr(const CSR_Matrix *A, CSR_Matrix *C,
114-
struct int_double_pair *pairs, int row_block_size);
115-
116-
/* Build sparsity and index map for summing blocks of rows.
117-
* iwork must have size max(A->n, A->nnz), and idx_map must have size A->nnz. */
118-
void sum_block_of_rows_csr_fill_sparsity_and_idx_map(const CSR_Matrix *A,
119-
CSR_Matrix *C,
120-
int row_block_size, int *iwork,
121-
int *idx_map);
122-
123-
/* Fill values for summing blocks of rows using precomputed idx_map */
124-
// void sum_block_of_rows_csr_fill_values(const CSR_Matrix *A, CSR_Matrix *C,
125-
// const int *idx_map);
126-
127-
/* Sum evenly spaced rows of A into a matrix C */
128-
void sum_evenly_spaced_rows_csr(const CSR_Matrix *A, CSR_Matrix *C,
129-
struct int_double_pair *pairs, int row_spacing);
130-
131-
/* Build sparsity and index map for summing evenly spaced rows.
132-
* iwork must have size max(A->n, A->nnz), and idx_map must have size A->nnz. */
133-
void sum_evenly_spaced_rows_csr_fill_sparsity_and_idx_map(const CSR_Matrix *A,
134-
CSR_Matrix *C,
135-
int row_spacing,
136-
int *iwork, int *idx_map);
137-
138-
/* Fill values for summing evenly spaced rows using precomputed idx_map */
139-
// void sum_evenly_spaced_rows_csr_fill_values(const CSR_Matrix *A, CSR_Matrix *C,
140-
// const int *idx_map);
141-
142-
/* Sum evenly spaced rows of A starting at offset into a row matrix C */
143-
void sum_spaced_rows_into_row_csr(const CSR_Matrix *A, CSR_Matrix *C,
144-
struct int_double_pair *pairs, int offset,
145-
int spacing);
146-
/* Fills the sparsity and index map for summing spaced rows into a row matrix */
147-
void sum_spaced_rows_into_row_csr_fill_sparsity_and_idx_map(const CSR_Matrix *A,
148-
CSR_Matrix *C,
149-
int spacing, int *iwork,
150-
int *idx_map);
151-
15264
/* Count number of columns with nonzero entries */
15365
int count_nonzero_cols(const CSR_Matrix *A, bool *col_nz);
15466

@@ -157,13 +69,6 @@ void insert_idx(int idx, int *arr, int len);
15769

15870
double csr_get_value(const CSR_Matrix *A, int row, int col);
15971

160-
/* iwork must be of size A->n*/
161-
CSR_Matrix *transpose(const CSR_Matrix *A, int *iwork);
162-
CSR_Matrix *AT_alloc(const CSR_Matrix *A, int *iwork);
163-
164-
/* Fill values of A^T given sparsity pattern is already computed */
165-
void AT_fill_values(const CSR_Matrix *A, CSR_Matrix *AT, int *iwork);
166-
16772
/* Expand symmetric CSR matrix A to full matrix C. A is assumed to store
16873
only upper triangle. C must be pre-allocated with sufficient nnz */
16974
void symmetrize_csr(const int *Ap, const int *Ai, int m, CSR_Matrix *C);

include/utils/CSR_sum.h

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
#ifndef CSR_SUM_H
2+
#define CSR_SUM_H
3+
4+
#include "utils/CSR_Matrix.h"
5+
6+
/* forward declaration */
7+
struct int_double_pair;
8+
9+
/* Compute C = A + B where A, B, C are CSR matrices
10+
* A and B must have same dimensions
11+
* C must be pre-allocated with sufficient nnz capacity.
12+
* C must be different from A and B */
13+
void sum_csr_matrices(const CSR_Matrix *A, const CSR_Matrix *B, CSR_Matrix *C);
14+
15+
/* Compute sparsity pattern of A + B where A, B, C are CSR matrices.
16+
* Fills C->p, C->i, and C->nnz; does not touch C->x. */
17+
void sum_csr_matrices_fill_sparsity(const CSR_Matrix *A, const CSR_Matrix *B,
18+
CSR_Matrix *C);
19+
20+
/* Fill only the values of C = A + B, assuming C's sparsity pattern (p and i)
21+
* is already filled and matches the union of A and B per row. Does not modify
22+
* C->p, C->i, or C->nnz. */
23+
void sum_csr_matrices_fill_values(const CSR_Matrix *A, const CSR_Matrix *B,
24+
CSR_Matrix *C);
25+
26+
/* Compute C = diag(d1) * A + diag(d2) * B where A, B, C are CSR matrices */
27+
void sum_scaled_csr_matrices(const CSR_Matrix *A, const CSR_Matrix *B, CSR_Matrix *C,
28+
const double *d1, const double *d2);
29+
30+
/* Fill only the values of C = diag(d1) * A + diag(d2) * B, assuming C's sparsity
31+
* pattern (p and i) is already filled and matches the union of A and B per row.
32+
* Does not modify C->p, C->i, or C->nnz. */
33+
void sum_scaled_csr_matrices_fill_values(const CSR_Matrix *A, const CSR_Matrix *B,
34+
CSR_Matrix *C, const double *d1,
35+
const double *d2);
36+
37+
/* Sum all rows of A into a single row matrix C */
38+
void sum_all_rows_csr(const CSR_Matrix *A, CSR_Matrix *C,
39+
struct int_double_pair *pairs);
40+
41+
/* iwork must have size max(C->n, A->nnz), and idx_map must have size A->nnz. */
42+
void sum_all_rows_csr_fill_sparsity_and_idx_map(const CSR_Matrix *A, CSR_Matrix *C,
43+
int *iwork, int *idx_map);
44+
45+
/* Fill values of summed rows using precomputed idx_map and sparsity of C */
46+
// void sum_all_rows_csr_fill_values(const CSR_Matrix *A, CSR_Matrix *C,
47+
// const int *idx_map);
48+
49+
/* Fill accumulator for summing rows using precomputed idx_map for each nnz of A.
50+
Must memset accumulator to zero before calling. */
51+
void idx_map_accumulator(const CSR_Matrix *A, const int *idx_map,
52+
double *accumulator);
53+
void idx_map_accumulator_with_spacing(const CSR_Matrix *A, const int *idx_map,
54+
double *accumulator, int spacing);
55+
56+
/* Sum blocks of rows of A into a matrix C */
57+
void sum_block_of_rows_csr(const CSR_Matrix *A, CSR_Matrix *C,
58+
struct int_double_pair *pairs, int row_block_size);
59+
60+
/* Build sparsity and index map for summing blocks of rows.
61+
* iwork must have size max(A->n, A->nnz), and idx_map must have size A->nnz. */
62+
void sum_block_of_rows_csr_fill_sparsity_and_idx_map(const CSR_Matrix *A,
63+
CSR_Matrix *C,
64+
int row_block_size, int *iwork,
65+
int *idx_map);
66+
67+
/* Sum evenly spaced rows of A into a matrix C */
68+
void sum_evenly_spaced_rows_csr(const CSR_Matrix *A, CSR_Matrix *C,
69+
struct int_double_pair *pairs, int row_spacing);
70+
71+
/* Build sparsity and index map for summing evenly spaced rows.
72+
* iwork must have size max(A->n, A->nnz), and idx_map must have size A->nnz. */
73+
void sum_evenly_spaced_rows_csr_fill_sparsity_and_idx_map(const CSR_Matrix *A,
74+
CSR_Matrix *C,
75+
int row_spacing,
76+
int *iwork, int *idx_map);
77+
78+
/* Sum evenly spaced rows of A starting at offset into a row matrix C */
79+
void sum_spaced_rows_into_row_csr(const CSR_Matrix *A, CSR_Matrix *C,
80+
struct int_double_pair *pairs, int offset,
81+
int spacing);
82+
83+
/* Fills the sparsity and index map for summing spaced rows into a row matrix */
84+
void sum_spaced_rows_into_row_csr_fill_sparsity_and_idx_map(const CSR_Matrix *A,
85+
CSR_Matrix *C,
86+
int spacing, int *iwork,
87+
int *idx_map);
88+
89+
#endif /* CSR_SUM_H */
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
#ifndef LINALG_H
2+
#define LINALG_H
3+
4+
/* Forward declarations */
5+
struct CSR_Matrix;
6+
struct CSC_Matrix;
7+
8+
/* Compute sparsity pattern and values for the matrix-matrix multiplication
9+
C = (I_p kron A) @ J where A is m x n, J is (n*p) x k, and C is (m*p) x k,
10+
without relying on generic sparse matrix-matrix multiplication. Specialized
11+
logic for this is much faster (50-100x) than generic sparse matmul.
12+
13+
* J is provided in CSC format and is split into p blocks of n rows each
14+
* C is returned in CSC format
15+
* Mathematically it corresponds to C = [A @ J1; A @ J2; ...; A @ Jp],
16+
where J = [J1; J2; ...; Jp]
17+
*/
18+
struct CSC_Matrix *block_left_multiply_fill_sparsity(const struct CSR_Matrix *A,
19+
const struct CSC_Matrix *J,
20+
int p);
21+
22+
void block_left_multiply_fill_values(const struct CSR_Matrix *A,
23+
const struct CSC_Matrix *J,
24+
struct CSC_Matrix *C);
25+
26+
/* Compute y = kron(I_p, A) @ x where A is m x n and x is(n*p)-length vector.
27+
The output y is m*p-length vector corresponding to
28+
y = [A @ x1; A @ x2; ...; A @ xp] where x is divided into p blocks of n
29+
elements.
30+
*/
31+
void block_left_multiply_vec(const struct CSR_Matrix *A, const double *x, double *y,
32+
int p);
33+
34+
/* Fill values of C = A @ B where A is CSR, B is CSC.
35+
* C must have sparsity pattern already computed.
36+
*/
37+
void csr_csc_matmul_fill_values(const struct CSR_Matrix *A,
38+
const struct CSC_Matrix *B, struct CSR_Matrix *C);
39+
40+
/* C = A @ B where A is CSR, B is CSC. Result C is CSR.
41+
* Allocates and precomputes sparsity pattern. No workspace required.
42+
*/
43+
struct CSR_Matrix *csr_csc_matmul_alloc(const struct CSR_Matrix *A,
44+
const struct CSC_Matrix *B);
45+
46+
#endif /* LINALG_H */

src/affine/add.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
* limitations under the License.
1717
*/
1818
#include "affine.h"
19+
#include "utils/CSR_sum.h"
1920
#include <assert.h>
2021
#include <stdio.h>
2122
#include <stdlib.h>

0 commit comments

Comments
 (0)