Skip to content

Commit a5d75d7

Browse files
authored
Depthwise convolution implementation (#217)
Squashed commit of the following: commit 79e9f31415cde3ec1430229121751429eb7eff25 Merge: 4d1fd5d 12f93a2 Author: Steven Atkinson <steven@atkinson.mn> Date: Thu Jan 29 00:22:38 2026 -0800 Merge branch 'main' into 215-group-2 commit 4d1fd5d Author: Steven Atkinson <steven@atkinson.mn> Date: Thu Jan 29 00:17:36 2026 -0800 Enhance Conv1x1 and Conv1D classes to support depthwise convolutions. Introduced logic to differentiate between depthwise and non-depthwise configurations, optimizing weight storage and processing methods accordingly. Updated weight setting and processing functions to handle depthwise operations efficiently, ensuring correct handling of input channels and weights. commit 2ad9dec Author: Steven Atkinson <steven@atkinson.mn> Date: Wed Jan 28 23:56:35 2026 -0800 Improve grouped convolutions for Conv1D by...ignoring them for now. commit e3be255 Author: Steven Atkinson <steven@atkinson.mn> Date: Wed Jan 28 23:46:36 2026 -0800 Revert "Implement std::vector grouped_weights" This reverts commit e78e191. commit e78e191 Author: Steven Atkinson <steven@atkinson.mn> Date: Wed Jan 28 23:41:45 2026 -0800 Implement std::vector grouped_weights commit 546f820 Author: Steven Atkinson <steven@atkinson.mn> Date: Wed Jan 28 23:31:28 2026 -0800 Improve speed of small grouped convolutions with single GEMM commit c20fb86 Author: Steven Atkinson <steven@atkinson.mn> Date: Wed Jan 28 23:23:28 2026 -0800 Zero out conv weight matrices after resize
1 parent 12f93a2 commit a5d75d7

4 files changed

Lines changed: 219 additions & 43 deletions

File tree

NAM/conv1d.cpp

Lines changed: 132 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,21 @@ namespace nam
77

88
void Conv1D::set_weights_(std::vector<float>::iterator& weights)
99
{
10-
if (this->_weight.size() > 0)
10+
if (this->_is_depthwise)
11+
{
12+
// Depthwise convolution: one weight per channel per kernel tap
13+
// Weight layout: for each channel c, for each kernel position k
14+
const int channels = this->_channels;
15+
const size_t kernel_size = this->_depthwise_weight.size();
16+
for (int c = 0; c < channels; c++)
17+
{
18+
for (size_t k = 0; k < kernel_size; k++)
19+
{
20+
this->_depthwise_weight[k](c) = *(weights++);
21+
}
22+
}
23+
}
24+
else if (this->_weight.size() > 0)
1125
{
1226
const long out_channels = this->_weight[0].rows();
1327
const long in_channels = this->_weight[0].cols();
@@ -53,21 +67,46 @@ void Conv1D::set_size_(const int in_channels, const int out_channels, const int
5367
}
5468

5569
this->_num_groups = groups;
56-
this->_weight.resize(kernel_size);
57-
for (size_t i = 0; i < this->_weight.size(); i++)
70+
this->_dilation = _dilation;
71+
72+
// Check for depthwise convolution: groups == in_channels == out_channels
73+
// In this case, each channel is processed independently with a single weight per kernel tap,
74+
// so we can use efficient element-wise multiplication instead of matrix multiplication.
75+
this->_is_depthwise = (groups == in_channels && in_channels == out_channels);
76+
77+
if (this->_is_depthwise)
78+
{
79+
// Depthwise: store one weight vector per kernel tap
80+
this->_channels = in_channels;
81+
this->_depthwise_weight.resize(kernel_size);
82+
for (int i = 0; i < kernel_size; i++)
83+
{
84+
this->_depthwise_weight[i].resize(in_channels);
85+
this->_depthwise_weight[i].setZero();
86+
}
87+
this->_weight.clear(); // Not used for depthwise
88+
}
89+
else
5890
{
59-
this->_weight[i].resize(out_channels,
60-
in_channels); // y = Ax, input array (C,L)
61-
this->_weight[i].setZero();
91+
// Non-depthwise: store full weight matrices (block-diagonal for grouped convolutions)
92+
this->_weight.resize(kernel_size);
93+
for (int i = 0; i < kernel_size; i++)
94+
{
95+
this->_weight[i].resize(out_channels,
96+
in_channels); // y = Ax, input array (C,L)
97+
this->_weight[i].setZero();
98+
}
99+
this->_depthwise_weight.clear(); // Not used for non-depthwise
100+
this->_channels = 0;
62101
}
102+
63103
if (do_bias)
64104
{
65105
this->_bias.resize(out_channels);
66106
this->_bias.setZero();
67107
}
68108
else
69109
this->_bias.resize(0);
70-
this->_dilation = _dilation;
71110
}
72111

73112
void Conv1D::set_size_and_weights_(const int in_channels, const int out_channels, const int kernel_size,
@@ -114,18 +153,37 @@ void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames)
114153
// After Write(), data is at positions [_write_pos, _write_pos+num_frames-1]
115154
// For kernel tap k with offset, we need to read from _write_pos + offset
116155
// The offset is negative (looking back), so _write_pos + offset reads from earlier positions
117-
//
118-
// Grouped convolution note: The weight matrices are block-diagonal (zeros off-diagonal),
119-
// so we can use a single GEMM for all cases. A more advanced implementation could store
120-
// compact per-group weight matrices and loop over groups, but at typical model sizes
121-
// (e.g. 8 channels, 4 groups, 64 samples), the GEMM call overhead tends to dominate
122-
// and the single sparse GEMM approach is faster.
123-
for (size_t k = 0; k < this->_weight.size(); k++)
156+
157+
if (this->_is_depthwise)
158+
{
159+
// Depthwise convolution: use efficient element-wise multiplication
160+
// Each channel is processed independently with a single weight per kernel tap.
161+
// output[c, t] = sum_k(weight[k, c] * input[c, t - k*dilation])
162+
const size_t kernel_size = this->_depthwise_weight.size();
163+
for (size_t k = 0; k < kernel_size; k++)
164+
{
165+
const long offset = this->_dilation * (k + 1 - (long)kernel_size);
166+
const long lookback = -offset;
167+
auto input_block = _input_buffer.Read(num_frames, lookback);
168+
// Element-wise multiply: each row of input_block is multiplied by corresponding weight
169+
_output.leftCols(num_frames).noalias() +=
170+
this->_depthwise_weight[k].asDiagonal() * input_block.leftCols(num_frames);
171+
}
172+
}
173+
else
124174
{
125-
const long offset = this->_dilation * (k + 1 - (long)this->_weight.size());
126-
const long lookback = -offset;
127-
auto input_block = _input_buffer.Read(num_frames, lookback);
128-
_output.leftCols(num_frames).noalias() += this->_weight[k] * input_block;
175+
// Grouped convolution note: The weight matrices are block-diagonal (zeros off-diagonal),
176+
// so we can use a single GEMM for all cases. A more advanced implementation could store
177+
// compact per-group weight matrices and loop over groups, but at typical model sizes
178+
// (e.g. 8 channels, 4 groups, 64 samples), the GEMM call overhead tends to dominate
179+
// and the single sparse GEMM approach is faster.
180+
for (size_t k = 0; k < this->_weight.size(); k++)
181+
{
182+
const long offset = this->_dilation * (k + 1 - (long)this->_weight.size());
183+
const long lookback = -offset;
184+
auto input_block = _input_buffer.Read(num_frames, lookback);
185+
_output.leftCols(num_frames).noalias() += this->_weight[k] * input_block;
186+
}
129187
}
130188

131189
// Add bias if present
@@ -141,29 +199,73 @@ void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames)
141199
void Conv1D::process_(const Eigen::MatrixXf& input, Eigen::MatrixXf& output, const long i_start, const long ncols,
142200
const long j_start) const
143201
{
144-
// Grouped convolution note: The weight matrices are block-diagonal (zeros off-diagonal),
145-
// so we can use a single GEMM for all cases. A more advanced implementation could store
146-
// compact per-group weight matrices and loop over groups, but at typical model sizes
147-
// (e.g. 8 channels, 4 groups, 64 samples), the GEMM call overhead tends to dominate
148-
// and the single sparse GEMM approach is faster.
149-
for (size_t k = 0; k < this->_weight.size(); k++)
202+
if (this->_is_depthwise)
150203
{
151-
const long offset = this->_dilation * (k + 1 - this->_weight.size());
152-
if (k == 0)
153-
output.middleCols(j_start, ncols).noalias() = this->_weight[k] * input.middleCols(i_start + offset, ncols);
154-
else
155-
output.middleCols(j_start, ncols).noalias() += this->_weight[k] * input.middleCols(i_start + offset, ncols);
204+
// Depthwise convolution: use efficient element-wise multiplication
205+
const size_t kernel_size = this->_depthwise_weight.size();
206+
for (size_t k = 0; k < kernel_size; k++)
207+
{
208+
const long offset = this->_dilation * (k + 1 - (long)kernel_size);
209+
if (k == 0)
210+
output.middleCols(j_start, ncols).noalias() =
211+
this->_depthwise_weight[k].asDiagonal() * input.middleCols(i_start + offset, ncols);
212+
else
213+
output.middleCols(j_start, ncols).noalias() +=
214+
this->_depthwise_weight[k].asDiagonal() * input.middleCols(i_start + offset, ncols);
215+
}
216+
}
217+
else
218+
{
219+
// Grouped convolution note: The weight matrices are block-diagonal (zeros off-diagonal),
220+
// so we can use a single GEMM for all cases. A more advanced implementation could store
221+
// compact per-group weight matrices and loop over groups, but at typical model sizes
222+
// (e.g. 8 channels, 4 groups, 64 samples), the GEMM call overhead tends to dominate
223+
// and the single sparse GEMM approach is faster.
224+
for (size_t k = 0; k < this->_weight.size(); k++)
225+
{
226+
const long offset = this->_dilation * (k + 1 - this->_weight.size());
227+
if (k == 0)
228+
output.middleCols(j_start, ncols).noalias() = this->_weight[k] * input.middleCols(i_start + offset, ncols);
229+
else
230+
output.middleCols(j_start, ncols).noalias() += this->_weight[k] * input.middleCols(i_start + offset, ncols);
231+
}
156232
}
157233
if (this->_bias.size() > 0)
158234
{
159235
output.middleCols(j_start, ncols).colwise() += this->_bias;
160236
}
161237
}
162238

239+
long Conv1D::get_in_channels() const
240+
{
241+
if (this->_is_depthwise)
242+
return this->_channels;
243+
return this->_weight.size() > 0 ? this->_weight[0].cols() : 0;
244+
}
245+
246+
long Conv1D::get_out_channels() const
247+
{
248+
if (this->_is_depthwise)
249+
return this->_channels;
250+
return this->_weight.size() > 0 ? this->_weight[0].rows() : 0;
251+
}
252+
253+
long Conv1D::get_kernel_size() const
254+
{
255+
if (this->_is_depthwise)
256+
return this->_depthwise_weight.size();
257+
return this->_weight.size();
258+
}
259+
163260
long Conv1D::get_num_weights() const
164261
{
165262
long num_weights = this->_bias.size();
166-
if (this->_weight.size() > 0)
263+
if (this->_is_depthwise)
264+
{
265+
// Depthwise: one weight per channel per kernel tap
266+
num_weights += this->_channels * this->_depthwise_weight.size();
267+
}
268+
else if (this->_weight.size() > 0)
167269
{
168270
const long out_channels = this->_weight[0].rows();
169271
const long in_channels = this->_weight[0].cols();

NAM/conv1d.h

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -95,19 +95,19 @@ class Conv1D
9595
const long j_start) const;
9696
/// \brief Get the number of input channels
9797
/// \return Number of input channels
98-
long get_in_channels() const { return this->_weight.size() > 0 ? this->_weight[0].cols() : 0; };
98+
long get_in_channels() const;
9999

100100
/// \brief Get the kernel size
101101
/// \return Kernel size
102-
long get_kernel_size() const { return this->_weight.size(); };
102+
long get_kernel_size() const;
103103

104104
/// \brief Get the total number of weights
105105
/// \return Total number of weight parameters
106106
long get_num_weights() const;
107107

108108
/// \brief Get the number of output channels
109109
/// \return Number of output channels
110-
long get_out_channels() const { return this->_weight.size() > 0 ? this->_weight[0].rows() : 0; };
110+
long get_out_channels() const;
111111

112112
/// \brief Get the dilation factor
113113
/// \return Dilation factor
@@ -118,8 +118,13 @@ class Conv1D
118118
bool has_bias() const { return this->_bias.size() > 0; };
119119

120120
protected:
121-
// conv[kernel](cout, cin)
121+
// conv[kernel](cout, cin) - used for non-depthwise convolutions
122122
std::vector<Eigen::MatrixXf> _weight;
123+
// For depthwise convolution (groups == in_channels == out_channels):
124+
// stores one weight per channel per kernel tap
125+
std::vector<Eigen::VectorXf> _depthwise_weight;
126+
bool _is_depthwise = false;
127+
int _channels = 0; // Used for depthwise case (in_channels == out_channels)
123128
Eigen::VectorXf _bias;
124129
int _dilation;
125130
int _num_groups;

NAM/dsp.cpp

Lines changed: 70 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -331,9 +331,30 @@ nam::Conv1x1::Conv1x1(const int in_channels, const int out_channels, const bool
331331
}
332332

333333
this->_num_groups = groups;
334-
this->_weight.resize(out_channels, in_channels);
335-
this->_weight.setZero();
336334
this->_do_bias = _bias;
335+
336+
// Check for depthwise convolution: groups == in_channels == out_channels
337+
// In this case, each channel is processed independently with a single weight,
338+
// so we can use efficient element-wise multiplication instead of matrix multiplication.
339+
this->_is_depthwise = (groups == in_channels && in_channels == out_channels);
340+
341+
if (this->_is_depthwise)
342+
{
343+
// Depthwise: store one weight per channel
344+
this->_channels = in_channels;
345+
this->_depthwise_weight.resize(in_channels);
346+
this->_depthwise_weight.setZero();
347+
// Clear the matrix weight (not used)
348+
this->_weight.resize(0, 0);
349+
}
350+
else
351+
{
352+
// Non-depthwise: store full weight matrix (block-diagonal for grouped convolutions)
353+
this->_weight.resize(out_channels, in_channels);
354+
this->_weight.setZero();
355+
this->_channels = 0;
356+
}
357+
337358
if (_bias)
338359
{
339360
this->_bias.resize(out_channels);
@@ -349,7 +370,15 @@ void nam::Conv1x1::SetMaxBufferSize(const int maxBufferSize)
349370

350371
void nam::Conv1x1::set_weights_(std::vector<float>::iterator& weights)
351372
{
352-
if (this->_weight.size() > 0)
373+
if (this->_is_depthwise)
374+
{
375+
// Depthwise convolution: one weight per channel
376+
for (int c = 0; c < this->_channels; c++)
377+
{
378+
this->_depthwise_weight(c) = *(weights++);
379+
}
380+
}
381+
else if (this->_weight.size() > 0)
353382
{
354383
const long out_channels = this->_weight.rows();
355384
const long in_channels = this->_weight.cols();
@@ -376,10 +405,35 @@ void nam::Conv1x1::set_weights_(std::vector<float>::iterator& weights)
376405
this->_bias(i) = *(weights++);
377406
}
378407

408+
long nam::Conv1x1::get_out_channels() const
409+
{
410+
if (this->_is_depthwise)
411+
return this->_channels;
412+
return this->_weight.rows();
413+
}
414+
415+
long nam::Conv1x1::get_in_channels() const
416+
{
417+
if (this->_is_depthwise)
418+
return this->_channels;
419+
return this->_weight.cols();
420+
}
421+
379422
Eigen::MatrixXf nam::Conv1x1::process(const Eigen::MatrixXf& input, const int num_frames) const
380423
{
381-
// Single GEMM for all cases - block-diagonal zero structure handles grouping
382-
Eigen::MatrixXf result = this->_weight * input.leftCols(num_frames);
424+
Eigen::MatrixXf result(get_out_channels(), num_frames);
425+
426+
if (this->_is_depthwise)
427+
{
428+
// Depthwise convolution: efficient element-wise multiplication
429+
// Each channel is scaled by its corresponding weight
430+
result.noalias() = this->_depthwise_weight.asDiagonal() * input.leftCols(num_frames);
431+
}
432+
else
433+
{
434+
// Single GEMM for all cases - block-diagonal zero structure handles grouping
435+
result.noalias() = this->_weight * input.leftCols(num_frames);
436+
}
383437

384438
if (this->_do_bias)
385439
result.colwise() += this->_bias;
@@ -391,8 +445,17 @@ void nam::Conv1x1::process_(const Eigen::Ref<const Eigen::MatrixXf>& input, cons
391445
{
392446
assert(num_frames <= _output.cols());
393447

394-
// Single GEMM for all cases - block-diagonal zero structure handles grouping
395-
_output.leftCols(num_frames).noalias() = this->_weight * input.leftCols(num_frames);
448+
if (this->_is_depthwise)
449+
{
450+
// Depthwise convolution: efficient element-wise multiplication
451+
// Each channel is scaled by its corresponding weight
452+
_output.leftCols(num_frames).noalias() = this->_depthwise_weight.asDiagonal() * input.leftCols(num_frames);
453+
}
454+
else
455+
{
456+
// Single GEMM for all cases - block-diagonal zero structure handles grouping
457+
_output.leftCols(num_frames).noalias() = this->_weight * input.leftCols(num_frames);
458+
}
396459

397460
if (this->_do_bias)
398461
_output.leftCols(num_frames).colwise() += this->_bias;

NAM/dsp.h

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -323,11 +323,17 @@ class Conv1x1
323323
/// \param num_frames Number of frames to process
324324
void process_(const Eigen::Ref<const Eigen::MatrixXf>& input, const int num_frames);
325325

326-
long get_out_channels() const { return this->_weight.rows(); };
327-
long get_in_channels() const { return this->_weight.cols(); };
326+
long get_out_channels() const;
327+
long get_in_channels() const;
328328

329329
protected:
330+
// Non-depthwise: full weight matrix (out_channels x in_channels)
330331
Eigen::MatrixXf _weight;
332+
// For depthwise convolution (groups == in_channels == out_channels):
333+
// stores one weight per channel
334+
Eigen::VectorXf _depthwise_weight;
335+
bool _is_depthwise = false;
336+
int _channels = 0; // Used for depthwise case (in_channels == out_channels)
331337
Eigen::VectorXf _bias;
332338
int _num_groups;
333339

0 commit comments

Comments (0)