Skip to content

Commit a5d75d7

Browse files
authored
Depthwise convolution implementation (#217)
Squashed commit of the following: commit 79e9f31415cde3ec1430229121751429eb7eff25 Merge: 4d1fd5d 12f93a2 Author: Steven Atkinson <steven@atkinson.mn> Date: Thu Jan 29 00:22:38 2026 -0800 Merge branch 'main' into 215-group-2 commit 4d1fd5d Author: Steven Atkinson <steven@atkinson.mn> Date: Thu Jan 29 00:17:36 2026 -0800 Enhance Conv1x1 and Conv1D classes to support depthwise convolutions. Introduced logic to differentiate between depthwise and non-depthwise configurations, optimizing weight storage and processing methods accordingly. Updated weight setting and processing functions to handle depthwise operations efficiently, ensuring correct handling of input channels and weights. commit 2ad9dec Author: Steven Atkinson <steven@atkinson.mn> Date: Wed Jan 28 23:56:35 2026 -0800 Improve grouped convolutions for Conv1D by...ignoring them for now. commit e3be255 Author: Steven Atkinson <steven@atkinson.mn> Date: Wed Jan 28 23:46:36 2026 -0800 Revert "Implement std::vector grouped_weights" This reverts commit e78e191. commit e78e191 Author: Steven Atkinson <steven@atkinson.mn> Date: Wed Jan 28 23:41:45 2026 -0800 Implement std::vector grouped_weights commit 546f820 Author: Steven Atkinson <steven@atkinson.mn> Date: Wed Jan 28 23:31:28 2026 -0800 Improve speed of small grouped convolutions with single GEMM commit c20fb86 Author: Steven Atkinson <steven@atkinson.mn> Date: Wed Jan 28 23:23:28 2026 -0800 Zero out conv weight matrices after resize
1 parent 12f93a2 commit a5d75d7

4 files changed

Lines changed: 219 additions & 43 deletions

File tree

NAM/conv1d.cpp

Lines changed: 132 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,21 @@ namespace nam
77

88
void Conv1D::set_weights_(std::vector<float>::iterator& weights)
99
{
10-
if (this->_weight.size() > 0)
10+
if (this->_is_depthwise)
11+
{
12+
// Depthwise convolution: one weight per channel per kernel tap
13+
// Weight layout: for each channel c, for each kernel position k
14+
const int channels = this->_channels;
15+
const size_t kernel_size = this->_depthwise_weight.size();
16+
for (int c = 0; c < channels; c++)
17+
{
18+
for (size_t k = 0; k < kernel_size; k++)
19+
{
20+
this->_depthwise_weight[k](c) = *(weights++);
21+
}
22+
}
23+
}
24+
else if (this->_weight.size() > 0)
1125
{
1226
const long out_channels = this->_weight[0].rows();
1327
const long in_channels = this->_weight[0].cols();
@@ -53,21 +67,46 @@ void Conv1D::set_size_(const int in_channels, const int out_channels, const int
5367
}
5468

5569
this->_num_groups = groups;
56-
this->_weight.resize(kernel_size);
57-
for (size_t i = 0; i < this->_weight.size(); i++)
70+
this->_dilation = _dilation;
71+
72+
// Check for depthwise convolution: groups == in_channels == out_channels
73+
// In this case, each channel is processed independently with a single weight per kernel tap,
74+
// so we can use efficient element-wise multiplication instead of matrix multiplication.
75+
this->_is_depthwise = (groups == in_channels && in_channels == out_channels);
76+
77+
if (this->_is_depthwise)
78+
{
79+
// Depthwise: store one weight vector per kernel tap
80+
this->_channels = in_channels;
81+
this->_depthwise_weight.resize(kernel_size);
82+
for (int i = 0; i < kernel_size; i++)
83+
{
84+
this->_depthwise_weight[i].resize(in_channels);
85+
this->_depthwise_weight[i].setZero();
86+
}
87+
this->_weight.clear(); // Not used for depthwise
88+
}
89+
else
5890
{
59-
this->_weight[i].resize(out_channels,
60-
in_channels); // y = Ax, input array (C,L)
61-
this->_weight[i].setZero();
91+
// Non-depthwise: store full weight matrices (block-diagonal for grouped convolutions)
92+
this->_weight.resize(kernel_size);
93+
for (int i = 0; i < kernel_size; i++)
94+
{
95+
this->_weight[i].resize(out_channels,
96+
in_channels); // y = Ax, input array (C,L)
97+
this->_weight[i].setZero();
98+
}
99+
this->_depthwise_weight.clear(); // Not used for non-depthwise
100+
this->_channels = 0;
62101
}
102+
63103
if (do_bias)
64104
{
65105
this->_bias.resize(out_channels);
66106
this->_bias.setZero();
67107
}
68108
else
69109
this->_bias.resize(0);
70-
this->_dilation = _dilation;
71110
}
72111

73112
void Conv1D::set_size_and_weights_(const int in_channels, const int out_channels, const int kernel_size,
@@ -114,18 +153,37 @@ void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames)
114153
// After Write(), data is at positions [_write_pos, _write_pos+num_frames-1]
115154
// For kernel tap k with offset, we need to read from _write_pos + offset
116155
// The offset is negative (looking back), so _write_pos + offset reads from earlier positions
117-
//
118-
// Grouped convolution note: The weight matrices are block-diagonal (zeros off-diagonal),
119-
// so we can use a single GEMM for all cases. A more advanced implementation could store
120-
// compact per-group weight matrices and loop over groups, but at typical model sizes
121-
// (e.g. 8 channels, 4 groups, 64 samples), the GEMM call overhead tends to dominate
122-
// and the single sparse GEMM approach is faster.
123-
for (size_t k = 0; k < this->_weight.size(); k++)
156+
157+
if (this->_is_depthwise)
158+
{
159+
// Depthwise convolution: use efficient element-wise multiplication
160+
// Each channel is processed independently with a single weight per kernel tap.
161+
// output[c, t] = sum_k(weight[k, c] * input[c, t - k*dilation])
162+
const size_t kernel_size = this->_depthwise_weight.size();
163+
for (size_t k = 0; k < kernel_size; k++)
164+
{
165+
const long offset = this->_dilation * (k + 1 - (long)kernel_size);
166+
const long lookback = -offset;
167+
auto input_block = _input_buffer.Read(num_frames, lookback);
168+
// Element-wise multiply: each row of input_block is multiplied by corresponding weight
169+
_output.leftCols(num_frames).noalias() +=
170+
this->_depthwise_weight[k].asDiagonal() * input_block.leftCols(num_frames);
171+
}
172+
}
173+
else
124174
{
125-
const long offset = this->_dilation * (k + 1 - (long)this->_weight.size());
126-
const long lookback = -offset;
127-
auto input_block = _input_buffer.Read(num_frames, lookback);
128-
_output.leftCols(num_frames).noalias() += this->_weight[k] * input_block;
175+
// Grouped convolution note: The weight matrices are block-diagonal (zeros off-diagonal),
176+
// so we can use a single GEMM for all cases. A more advanced implementation could store
177+
// compact per-group weight matrices and loop over groups, but at typical model sizes
178+
// (e.g. 8 channels, 4 groups, 64 samples), the GEMM call overhead tends to dominate
179+
// and the single sparse GEMM approach is faster.
180+
for (size_t k = 0; k < this->_weight.size(); k++)
181+
{
182+
const long offset = this->_dilation * (k + 1 - (long)this->_weight.size());
183+
const long lookback = -offset;
184+
auto input_block = _input_buffer.Read(num_frames, lookback);
185+
_output.leftCols(num_frames).noalias() += this->_weight[k] * input_block;
186+
}
129187
}
130188

131189
// Add bias if present
@@ -141,29 +199,73 @@ void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames)
141199
void Conv1D::process_(const Eigen::MatrixXf& input, Eigen::MatrixXf& output, const long i_start, const long ncols,
142200
const long j_start) const
143201
{
144-
// Grouped convolution note: The weight matrices are block-diagonal (zeros off-diagonal),
145-
// so we can use a single GEMM for all cases. A more advanced implementation could store
146-
// compact per-group weight matrices and loop over groups, but at typical model sizes
147-
// (e.g. 8 channels, 4 groups, 64 samples), the GEMM call overhead tends to dominate
148-
// and the single sparse GEMM approach is faster.
149-
for (size_t k = 0; k < this->_weight.size(); k++)
202+
if (this->_is_depthwise)
150203
{
151-
const long offset = this->_dilation * (k + 1 - this->_weight.size());
152-
if (k == 0)
153-
output.middleCols(j_start, ncols).noalias() = this->_weight[k] * input.middleCols(i_start + offset, ncols);
154-
else
155-
output.middleCols(j_start, ncols).noalias() += this->_weight[k] * input.middleCols(i_start + offset, ncols);
204+
// Depthwise convolution: use efficient element-wise multiplication
205+
const size_t kernel_size = this->_depthwise_weight.size();
206+
for (size_t k = 0; k < kernel_size; k++)
207+
{
208+
const long offset = this->_dilation * (k + 1 - (long)kernel_size);
209+
if (k == 0)
210+
output.middleCols(j_start, ncols).noalias() =
211+
this->_depthwise_weight[k].asDiagonal() * input.middleCols(i_start + offset, ncols);
212+
else
213+
output.middleCols(j_start, ncols).noalias() +=
214+
this->_depthwise_weight[k].asDiagonal() * input.middleCols(i_start + offset, ncols);
215+
}
216+
}
217+
else
218+
{
219+
// Grouped convolution note: The weight matrices are block-diagonal (zeros off-diagonal),
220+
// so we can use a single GEMM for all cases. A more advanced implementation could store
221+
// compact per-group weight matrices and loop over groups, but at typical model sizes
222+
// (e.g. 8 channels, 4 groups, 64 samples), the GEMM call overhead tends to dominate
223+
// and the single sparse GEMM approach is faster.
224+
for (size_t k = 0; k < this->_weight.size(); k++)
225+
{
226+
const long offset = this->_dilation * (k + 1 - this->_weight.size());
227+
if (k == 0)
228+
output.middleCols(j_start, ncols).noalias() = this->_weight[k] * input.middleCols(i_start + offset, ncols);
229+
else
230+
output.middleCols(j_start, ncols).noalias() += this->_weight[k] * input.middleCols(i_start + offset, ncols);
231+
}
156232
}
157233
if (this->_bias.size() > 0)
158234
{
159235
output.middleCols(j_start, ncols).colwise() += this->_bias;
160236
}
161237
}
162238

239+
long Conv1D::get_in_channels() const
240+
{
241+
if (this->_is_depthwise)
242+
return this->_channels;
243+
return this->_weight.size() > 0 ? this->_weight[0].cols() : 0;
244+
}
245+
246+
long Conv1D::get_out_channels() const
247+
{
248+
if (this->_is_depthwise)
249+
return this->_channels;
250+
return this->_weight.size() > 0 ? this->_weight[0].rows() : 0;
251+
}
252+
253+
long Conv1D::get_kernel_size() const
254+
{
255+
if (this->_is_depthwise)
256+
return this->_depthwise_weight.size();
257+
return this->_weight.size();
258+
}
259+
163260
long Conv1D::get_num_weights() const
164261
{
165262
long num_weights = this->_bias.size();
166-
if (this->_weight.size() > 0)
263+
if (this->_is_depthwise)
264+
{
265+
// Depthwise: one weight per channel per kernel tap
266+
num_weights += this->_channels * this->_depthwise_weight.size();
267+
}
268+
else if (this->_weight.size() > 0)
167269
{
168270
const long out_channels = this->_weight[0].rows();
169271
const long in_channels = this->_weight[0].cols();

NAM/conv1d.h

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -95,19 +95,19 @@ class Conv1D
9595
const long j_start) const;
9696
/// \brief Get the number of input channels
9797
/// \return Number of input channels
98-
long get_in_channels() const { return this->_weight.size() > 0 ? this->_weight[0].cols() : 0; };
98+
long get_in_channels() const;
9999

100100
/// \brief Get the kernel size
101101
/// \return Kernel size
102-
long get_kernel_size() const { return this->_weight.size(); };
102+
long get_kernel_size() const;
103103

104104
/// \brief Get the total number of weights
105105
/// \return Total number of weight parameters
106106
long get_num_weights() const;
107107

108108
/// \brief Get the number of output channels
109109
/// \return Number of output channels
110-
long get_out_channels() const { return this->_weight.size() > 0 ? this->_weight[0].rows() : 0; };
110+
long get_out_channels() const;
111111

112112
/// \brief Get the dilation factor
113113
/// \return Dilation factor
@@ -118,8 +118,13 @@ class Conv1D
118118
bool has_bias() const { return this->_bias.size() > 0; };
119119

120120
protected:
121-
// conv[kernel](cout, cin)
121+
// conv[kernel](cout, cin) - used for non-depthwise convolutions
122122
std::vector<Eigen::MatrixXf> _weight;
123+
// For depthwise convolution (groups == in_channels == out_channels):
124+
// stores one weight per channel per kernel tap
125+
std::vector<Eigen::VectorXf> _depthwise_weight;
126+
bool _is_depthwise = false;
127+
int _channels = 0; // Used for depthwise case (in_channels == out_channels)
123128
Eigen::VectorXf _bias;
124129
int _dilation;
125130
int _num_groups;

NAM/dsp.cpp

Lines changed: 70 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -331,9 +331,30 @@ nam::Conv1x1::Conv1x1(const int in_channels, const int out_channels, const bool
331331
}
332332

333333
this->_num_groups = groups;
334-
this->_weight.resize(out_channels, in_channels);
335-
this->_weight.setZero();
336334
this->_do_bias = _bias;
335+
336+
// Check for depthwise convolution: groups == in_channels == out_channels
337+
// In this case, each channel is processed independently with a single weight,
338+
// so we can use efficient element-wise multiplication instead of matrix multiplication.
339+
this->_is_depthwise = (groups == in_channels && in_channels == out_channels);
340+
341+
if (this->_is_depthwise)
342+
{
343+
// Depthwise: store one weight per channel
344+
this->_channels = in_channels;
345+
this->_depthwise_weight.resize(in_channels);
346+
this->_depthwise_weight.setZero();
347+
// Clear the matrix weight (not used)
348+
this->_weight.resize(0, 0);
349+
}
350+
else
351+
{
352+
// Non-depthwise: store full weight matrix (block-diagonal for grouped convolutions)
353+
this->_weight.resize(out_channels, in_channels);
354+
this->_weight.setZero();
355+
this->_channels = 0;
356+
}
357+
337358
if (_bias)
338359
{
339360
this->_bias.resize(out_channels);
@@ -349,7 +370,15 @@ void nam::Conv1x1::SetMaxBufferSize(const int maxBufferSize)
349370

350371
void nam::Conv1x1::set_weights_(std::vector<float>::iterator& weights)
351372
{
352-
if (this->_weight.size() > 0)
373+
if (this->_is_depthwise)
374+
{
375+
// Depthwise convolution: one weight per channel
376+
for (int c = 0; c < this->_channels; c++)
377+
{
378+
this->_depthwise_weight(c) = *(weights++);
379+
}
380+
}
381+
else if (this->_weight.size() > 0)
353382
{
354383
const long out_channels = this->_weight.rows();
355384
const long in_channels = this->_weight.cols();
@@ -376,10 +405,35 @@ void nam::Conv1x1::set_weights_(std::vector<float>::iterator& weights)
376405
this->_bias(i) = *(weights++);
377406
}
378407

408+
long nam::Conv1x1::get_out_channels() const
409+
{
410+
if (this->_is_depthwise)
411+
return this->_channels;
412+
return this->_weight.rows();
413+
}
414+
415+
long nam::Conv1x1::get_in_channels() const
416+
{
417+
if (this->_is_depthwise)
418+
return this->_channels;
419+
return this->_weight.cols();
420+
}
421+
379422
Eigen::MatrixXf nam::Conv1x1::process(const Eigen::MatrixXf& input, const int num_frames) const
380423
{
381-
// Single GEMM for all cases - block-diagonal zero structure handles grouping
382-
Eigen::MatrixXf result = this->_weight * input.leftCols(num_frames);
424+
Eigen::MatrixXf result(get_out_channels(), num_frames);
425+
426+
if (this->_is_depthwise)
427+
{
428+
// Depthwise convolution: efficient element-wise multiplication
429+
// Each channel is scaled by its corresponding weight
430+
result.noalias() = this->_depthwise_weight.asDiagonal() * input.leftCols(num_frames);
431+
}
432+
else
433+
{
434+
// Single GEMM for all cases - block-diagonal zero structure handles grouping
435+
result.noalias() = this->_weight * input.leftCols(num_frames);
436+
}
383437

384438
if (this->_do_bias)
385439
result.colwise() += this->_bias;
@@ -391,8 +445,17 @@ void nam::Conv1x1::process_(const Eigen::Ref<const Eigen::MatrixXf>& input, cons
391445
{
392446
assert(num_frames <= _output.cols());
393447

394-
// Single GEMM for all cases - block-diagonal zero structure handles grouping
395-
_output.leftCols(num_frames).noalias() = this->_weight * input.leftCols(num_frames);
448+
if (this->_is_depthwise)
449+
{
450+
// Depthwise convolution: efficient element-wise multiplication
451+
// Each channel is scaled by its corresponding weight
452+
_output.leftCols(num_frames).noalias() = this->_depthwise_weight.asDiagonal() * input.leftCols(num_frames);
453+
}
454+
else
455+
{
456+
// Single GEMM for all cases - block-diagonal zero structure handles grouping
457+
_output.leftCols(num_frames).noalias() = this->_weight * input.leftCols(num_frames);
458+
}
396459

397460
if (this->_do_bias)
398461
_output.leftCols(num_frames).colwise() += this->_bias;

NAM/dsp.h

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -323,11 +323,17 @@ class Conv1x1
323323
/// \param num_frames Number of frames to process
324324
void process_(const Eigen::Ref<const Eigen::MatrixXf>& input, const int num_frames);
325325

326-
long get_out_channels() const { return this->_weight.rows(); };
327-
long get_in_channels() const { return this->_weight.cols(); };
326+
long get_out_channels() const;
327+
long get_in_channels() const;
328328

329329
protected:
330+
// Non-depthwise: full weight matrix (out_channels x in_channels)
330331
Eigen::MatrixXf _weight;
332+
// For depthwise convolution (groups == in_channels == out_channels):
333+
// stores one weight per channel
334+
Eigen::VectorXf _depthwise_weight;
335+
bool _is_depthwise = false;
336+
int _channels = 0; // Used for depthwise case (in_channels == out_channels)
331337
Eigen::VectorXf _bias;
332338
int _num_groups;
333339

0 commit comments

Comments (0)