@@ -7,7 +7,21 @@ namespace nam
77
88void Conv1D::set_weights_ (std::vector<float >::iterator& weights)
99{
10- if (this ->_weight .size () > 0 )
10+ if (this ->_is_depthwise )
11+ {
12+ // Depthwise convolution: one weight per channel per kernel tap
13+ // Weight layout: for each channel c, for each kernel position k
14+ const int channels = this ->_channels ;
15+ const size_t kernel_size = this ->_depthwise_weight .size ();
16+ for (int c = 0 ; c < channels; c++)
17+ {
18+ for (size_t k = 0 ; k < kernel_size; k++)
19+ {
20+ this ->_depthwise_weight [k](c) = *(weights++);
21+ }
22+ }
23+ }
24+ else if (this ->_weight .size () > 0 )
1125 {
1226 const long out_channels = this ->_weight [0 ].rows ();
1327 const long in_channels = this ->_weight [0 ].cols ();
@@ -53,21 +67,46 @@ void Conv1D::set_size_(const int in_channels, const int out_channels, const int
5367 }
5468
5569 this ->_num_groups = groups;
56- this ->_weight .resize (kernel_size);
57- for (size_t i = 0 ; i < this ->_weight .size (); i++)
70+ this ->_dilation = _dilation;
71+
72+ // Check for depthwise convolution: groups == in_channels == out_channels
73+ // In this case, each channel is processed independently with a single weight per kernel tap,
74+ // so we can use efficient element-wise multiplication instead of matrix multiplication.
75+ this ->_is_depthwise = (groups == in_channels && in_channels == out_channels);
76+
77+ if (this ->_is_depthwise )
78+ {
79+ // Depthwise: store one weight vector per kernel tap
80+ this ->_channels = in_channels;
81+ this ->_depthwise_weight .resize (kernel_size);
82+ for (int i = 0 ; i < kernel_size; i++)
83+ {
84+ this ->_depthwise_weight [i].resize (in_channels);
85+ this ->_depthwise_weight [i].setZero ();
86+ }
87+ this ->_weight .clear (); // Not used for depthwise
88+ }
89+ else
5890 {
59- this ->_weight [i].resize (out_channels,
60- in_channels); // y = Ax, input array (C,L)
61- this ->_weight [i].setZero ();
91+ // Non-depthwise: store full weight matrices (block-diagonal for grouped convolutions)
92+ this ->_weight .resize (kernel_size);
93+ for (int i = 0 ; i < kernel_size; i++)
94+ {
95+ this ->_weight [i].resize (out_channels,
96+ in_channels); // y = Ax, input array (C,L)
97+ this ->_weight [i].setZero ();
98+ }
99+ this ->_depthwise_weight .clear (); // Not used for non-depthwise
100+ this ->_channels = 0 ;
62101 }
102+
63103 if (do_bias)
64104 {
65105 this ->_bias .resize (out_channels);
66106 this ->_bias .setZero ();
67107 }
68108 else
69109 this ->_bias .resize (0 );
70- this ->_dilation = _dilation;
71110}
72111
73112void Conv1D::set_size_and_weights_ (const int in_channels, const int out_channels, const int kernel_size,
@@ -114,18 +153,37 @@ void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames)
114153 // After Write(), data is at positions [_write_pos, _write_pos+num_frames-1]
115154 // For kernel tap k with offset, we need to read from _write_pos + offset
116155 // The offset is negative (looking back), so _write_pos + offset reads from earlier positions
117- //
118- // Grouped convolution note: The weight matrices are block-diagonal (zeros off-diagonal),
119- // so we can use a single GEMM for all cases. A more advanced implementation could store
120- // compact per-group weight matrices and loop over groups, but at typical model sizes
121- // (e.g. 8 channels, 4 groups, 64 samples), the GEMM call overhead tends to dominate
122- // and the single sparse GEMM approach is faster.
123- for (size_t k = 0 ; k < this ->_weight .size (); k++)
156+
157+ if (this ->_is_depthwise )
158+ {
159+ // Depthwise convolution: use efficient element-wise multiplication
160+ // Each channel is processed independently with a single weight per kernel tap.
161+ // output[c, t] = sum_k(weight[k, c] * input[c, t - k*dilation])
162+ const size_t kernel_size = this ->_depthwise_weight .size ();
163+ for (size_t k = 0 ; k < kernel_size; k++)
164+ {
165+ const long offset = this ->_dilation * (k + 1 - (long )kernel_size);
166+ const long lookback = -offset;
167+ auto input_block = _input_buffer.Read (num_frames, lookback);
168+ // Element-wise multiply: each row of input_block is multiplied by corresponding weight
169+ _output.leftCols (num_frames).noalias () +=
170+ this ->_depthwise_weight [k].asDiagonal () * input_block.leftCols (num_frames);
171+ }
172+ }
173+ else
124174 {
125- const long offset = this ->_dilation * (k + 1 - (long )this ->_weight .size ());
126- const long lookback = -offset;
127- auto input_block = _input_buffer.Read (num_frames, lookback);
128- _output.leftCols (num_frames).noalias () += this ->_weight [k] * input_block;
175+ // Grouped convolution note: The weight matrices are block-diagonal (zeros off-diagonal),
176+ // so we can use a single GEMM for all cases. A more advanced implementation could store
177+ // compact per-group weight matrices and loop over groups, but at typical model sizes
178+ // (e.g. 8 channels, 4 groups, 64 samples), the GEMM call overhead tends to dominate
179+ // and the single sparse GEMM approach is faster.
180+ for (size_t k = 0 ; k < this ->_weight .size (); k++)
181+ {
182+ const long offset = this ->_dilation * (k + 1 - (long )this ->_weight .size ());
183+ const long lookback = -offset;
184+ auto input_block = _input_buffer.Read (num_frames, lookback);
185+ _output.leftCols (num_frames).noalias () += this ->_weight [k] * input_block;
186+ }
129187 }
130188
131189 // Add bias if present
@@ -141,29 +199,73 @@ void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames)
141199void Conv1D::process_ (const Eigen::MatrixXf& input, Eigen::MatrixXf& output, const long i_start, const long ncols,
142200 const long j_start) const
143201{
144- // Grouped convolution note: The weight matrices are block-diagonal (zeros off-diagonal),
145- // so we can use a single GEMM for all cases. A more advanced implementation could store
146- // compact per-group weight matrices and loop over groups, but at typical model sizes
147- // (e.g. 8 channels, 4 groups, 64 samples), the GEMM call overhead tends to dominate
148- // and the single sparse GEMM approach is faster.
149- for (size_t k = 0 ; k < this ->_weight .size (); k++)
202+ if (this ->_is_depthwise )
150203 {
151- const long offset = this ->_dilation * (k + 1 - this ->_weight .size ());
152- if (k == 0 )
153- output.middleCols (j_start, ncols).noalias () = this ->_weight [k] * input.middleCols (i_start + offset, ncols);
154- else
155- output.middleCols (j_start, ncols).noalias () += this ->_weight [k] * input.middleCols (i_start + offset, ncols);
204+ // Depthwise convolution: use efficient element-wise multiplication
205+ const size_t kernel_size = this ->_depthwise_weight .size ();
206+ for (size_t k = 0 ; k < kernel_size; k++)
207+ {
208+ const long offset = this ->_dilation * (k + 1 - (long )kernel_size);
209+ if (k == 0 )
210+ output.middleCols (j_start, ncols).noalias () =
211+ this ->_depthwise_weight [k].asDiagonal () * input.middleCols (i_start + offset, ncols);
212+ else
213+ output.middleCols (j_start, ncols).noalias () +=
214+ this ->_depthwise_weight [k].asDiagonal () * input.middleCols (i_start + offset, ncols);
215+ }
216+ }
217+ else
218+ {
219+ // Grouped convolution note: The weight matrices are block-diagonal (zeros off-diagonal),
220+ // so we can use a single GEMM for all cases. A more advanced implementation could store
221+ // compact per-group weight matrices and loop over groups, but at typical model sizes
222+ // (e.g. 8 channels, 4 groups, 64 samples), the GEMM call overhead tends to dominate
223+ // and the single sparse GEMM approach is faster.
224+ for (size_t k = 0 ; k < this ->_weight .size (); k++)
225+ {
226+ const long offset = this ->_dilation * (k + 1 - this ->_weight .size ());
227+ if (k == 0 )
228+ output.middleCols (j_start, ncols).noalias () = this ->_weight [k] * input.middleCols (i_start + offset, ncols);
229+ else
230+ output.middleCols (j_start, ncols).noalias () += this ->_weight [k] * input.middleCols (i_start + offset, ncols);
231+ }
156232 }
157233 if (this ->_bias .size () > 0 )
158234 {
159235 output.middleCols (j_start, ncols).colwise () += this ->_bias ;
160236 }
161237}
162238
239+ long Conv1D::get_in_channels () const
240+ {
241+ if (this ->_is_depthwise )
242+ return this ->_channels ;
243+ return this ->_weight .size () > 0 ? this ->_weight [0 ].cols () : 0 ;
244+ }
245+
246+ long Conv1D::get_out_channels () const
247+ {
248+ if (this ->_is_depthwise )
249+ return this ->_channels ;
250+ return this ->_weight .size () > 0 ? this ->_weight [0 ].rows () : 0 ;
251+ }
252+
253+ long Conv1D::get_kernel_size () const
254+ {
255+ if (this ->_is_depthwise )
256+ return this ->_depthwise_weight .size ();
257+ return this ->_weight .size ();
258+ }
259+
163260long Conv1D::get_num_weights () const
164261{
165262 long num_weights = this ->_bias .size ();
166- if (this ->_weight .size () > 0 )
263+ if (this ->_is_depthwise )
264+ {
265+ // Depthwise: one weight per channel per kernel tap
266+ num_weights += this ->_channels * this ->_depthwise_weight .size ();
267+ }
268+ else if (this ->_weight .size () > 0 )
167269 {
168270 const long out_channels = this ->_weight [0 ].rows ();
169271 const long in_channels = this ->_weight [0 ].cols ();
0 commit comments