Skip to content

Commit b33cc8c

Browse files
chen2021673 and claude
committed
fix(lora): skip gradient computation for frozen parameters to reduce memory
Add needs_input_grad_ tracking in autograd Function to skip unnecessary gradient allocation and computation for frozen (requires_grad=false) parameters. For LoRA fine-tuning, this avoids allocating grad_weight tensors for all frozen base model weights, reducing peak GPU memory from ~10.7GB to ~7.7GB. Also consolidate LinearBackward loose params into LinearMeta and LinearGradFlags structs for clarity. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 898d97d commit b33cc8c

6 files changed

Lines changed: 237 additions & 122 deletions

File tree

infini_train/include/autograd/function.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ class Function : public std::enable_shared_from_this<Function> {
4747

4848
protected:
4949
std::vector<std::shared_ptr<Tensor>> saved_tensors_;
50+
std::vector<bool> needs_input_grad_;
5051

5152
private:
5253
std::vector<std::pair<std::shared_ptr<Function>, int>> next_functions_;

infini_train/include/autograd/linear.h

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#pragma once
22

3+
#include <cstdint>
34
#include <memory>
45
#include <vector>
56

@@ -10,6 +11,21 @@ class Tensor;
1011
}
1112

1213
namespace infini_train::autograd {
14+
15+
struct LinearMeta {
16+
bool transpose = false;
17+
bool has_bias = false;
18+
int64_t in_features = 0;
19+
int64_t out_features = 0;
20+
std::vector<int64_t> input_dims;
21+
};
22+
23+
struct LinearGradFlags {
24+
bool input = false;
25+
bool weight = false;
26+
bool bias = false;
27+
};
28+
1329
class Linear : public Function {
1430
public:
1531
static constexpr char kType[] = "LinearFunction";
@@ -22,7 +38,6 @@ class Linear : public Function {
2238
std::vector<std::shared_ptr<Tensor>> Backward(const std::vector<std::shared_ptr<Tensor>> &grad_outputs) override;
2339

2440
private:
25-
int64_t out_features_ = 0;
26-
bool bias_ = true;
41+
LinearMeta meta_;
2742
};
2843
} // namespace infini_train::autograd

infini_train/src/autograd/function.cc

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,16 @@ std::vector<std::shared_ptr<Tensor>> Function::Apply(const std::vector<std::shar
3636
}
3737
}
3838

39+
// Populate needs_input_grad_ before Forward/SetupContext so that
40+
// SetupContext can use it for saved-tensor pruning.
41+
// Must be done before NoGradGuard since it checks GradMode.
42+
if (autograd::GradMode::IsEnabled()) {
43+
needs_input_grad_.resize(input_tensors.size());
44+
for (size_t idx = 0; idx < input_tensors.size(); ++idx) {
45+
needs_input_grad_[idx] = input_tensors[idx]->requires_grad();
46+
}
47+
}
48+
3949
std::vector<std::shared_ptr<Tensor>> output_tensors;
4050
{
4151
autograd::NoGradGuard no_grad;
@@ -129,6 +139,7 @@ void Function::BackwardPartial(const std::shared_ptr<Tensor> &grad_output, int g
129139

130140
saved_tensors_.clear();
131141
grad_outputs_.clear();
142+
needs_input_grad_.clear();
132143
grad_outputs_reached_ = 0;
133144
dependencies_reached_ = 0;
134145

infini_train/src/autograd/linear.cc

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,18 @@ void Linear::SetupContext(const std::vector<std::shared_ptr<Tensor>> &input_tens
2020
const std::vector<std::shared_ptr<Tensor>> &) {
2121
const auto &input = input_tensors[0];
2222
const auto &weight = input_tensors[1];
23-
saved_tensors_ = {input, weight};
24-
bias_ = input_tensors.size() == 3;
25-
out_features_ = weight->Dims()[0];
23+
24+
bool need_input = needs_input_grad_.size() > 0 && needs_input_grad_[0];
25+
bool need_weight = needs_input_grad_.size() > 1 && needs_input_grad_[1];
26+
27+
// grad_input needs weight, grad_weight needs input
28+
saved_tensors_ = {need_weight ? input : nullptr, need_input ? weight : nullptr};
29+
30+
meta_ = {.transpose = true,
31+
.has_bias = input_tensors.size() == 3,
32+
.in_features = weight->Dims()[1],
33+
.out_features = weight->Dims()[0],
34+
.input_dims = input->Dims()};
2635
}
2736

2837
std::vector<std::shared_ptr<Tensor>> Linear::Backward(const std::vector<std::shared_ptr<Tensor>> &grad_outputs) {
@@ -32,13 +41,20 @@ std::vector<std::shared_ptr<Tensor>> Linear::Backward(const std::vector<std::sha
3241
CHECK_EQ(grad_outputs.size(), 1);
3342
const auto &grad_output = grad_outputs[0];
3443

35-
auto device = input->GetDevice().type();
44+
CHECK(!needs_input_grad_.empty()) << "needs_input_grad_ not populated in Linear::Backward";
45+
LinearGradFlags grad_flags = {.input = needs_input_grad_[0],
46+
.weight = needs_input_grad_.size() > 1 && needs_input_grad_[1],
47+
.bias = meta_.has_bias && needs_input_grad_.size() > 2 && needs_input_grad_[2]};
48+
49+
auto device = grad_output->GetDevice().type();
3650
auto [grad_input, grad_weight, grad_bias]
3751
= Dispatcher::Instance()
3852
.Call<std::tuple<std::shared_ptr<Tensor>, std::shared_ptr<Tensor>, std::shared_ptr<Tensor>>>(
39-
{device, "LinearBackward"}, input, weight, true, out_features_, grad_output, bias_);
40-
return bias_ ? std::vector<std::shared_ptr<Tensor>>{grad_input, grad_weight, grad_bias}
41-
: std::vector<std::shared_ptr<Tensor>>{grad_input, grad_weight};
42-
;
53+
{device, "LinearBackward"}, input, weight, meta_, grad_output, grad_flags);
54+
if (meta_.has_bias) {
55+
return {grad_input, grad_weight, grad_bias};
56+
} else {
57+
return {grad_input, grad_weight};
58+
}
4359
}
4460
} // namespace infini_train::autograd

infini_train/src/kernels/cpu/linear.cc

Lines changed: 37 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
#include <cstdint>
2-
#include <fcntl.h>
32
#include <memory>
43
#include <numeric>
54
#include <tuple>
65

76
#include "glog/logging.h"
87

8+
#include "infini_train/include/autograd/linear.h"
99
#include "infini_train/include/dispatcher.h"
1010
#include "infini_train/include/tensor.h"
1111

@@ -70,6 +70,7 @@ MatmulBackward(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Tenso
7070
const int64_t k = input_dims[input_dims.size() - 1];
7171
CHECK_EQ(k, other_dims[other_dims.size() - 2]);
7272
const int64_t n = other_dims[other_dims.size() - 1];
73+
7374
CHECK_EQ(m, grad_output_dims[grad_output_dims.size() - 2]);
7475
CHECK_EQ(n, grad_output_dims[grad_output_dims.size() - 1]);
7576

@@ -147,8 +148,9 @@ std::shared_ptr<Tensor> LinearForward(const std::shared_ptr<Tensor> &input, cons
147148

148149
// TODO(dcj): support linear without bias later
149150
std::tuple<std::shared_ptr<Tensor>, std::shared_ptr<Tensor>, std::shared_ptr<Tensor>>
150-
LinearBackward(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Tensor> &weight, bool transpose,
151-
int64_t out_features, const std::shared_ptr<Tensor> &grad_output, const bool bias) {
151+
LinearBackward(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Tensor> &weight,
152+
infini_train::autograd::LinearMeta meta, const std::shared_ptr<Tensor> &grad_output,
153+
infini_train::autograd::LinearGradFlags grad_flags) {
152154
/*
153155
transpose: grad_input = grad_output * weight
154156
grad_input[*, in_features] = grad_output[*, out_features] * weight[out_features, in_features]
@@ -160,32 +162,46 @@ LinearBackward(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Tenso
160162
grad_weight[in_features, out_features] = input[*, in_features]^T * grad_output[*, out_features]
161163
grad_bias[out_features] = grad_output[*, out_features].sum(axis=0)
162164
*/
165+
const auto &input_dims = meta.input_dims;
166+
const auto in_features = meta.in_features;
167+
const auto out_features = meta.out_features;
168+
const auto transpose = meta.transpose;
169+
const auto bias = meta.has_bias;
170+
const auto compute_grad_input = grad_flags.input;
171+
const auto compute_grad_weight = grad_flags.weight;
172+
const auto compute_grad_bias = grad_flags.bias;
163173

164-
const auto &input_dims = input->Dims();
165174
CHECK_GE(input_dims.size(), 2);
166-
const int64_t bs = std::accumulate(input_dims.rbegin() + 1, input_dims.rend(), 1, std::multiplies<int64_t>{});
167-
const int64_t in_features = *input_dims.rbegin();
168175

169-
const auto &weight_dims = weight->Dims();
170-
CHECK_EQ(weight_dims.size(), 2);
171-
CHECK_EQ(in_features, weight_dims[transpose ? 1 : 0]);
172-
CHECK_EQ(out_features, weight_dims[transpose ? 0 : 1]);
176+
std::vector<int64_t> weight_dims
177+
= transpose ? std::vector<int64_t>{out_features, in_features} : std::vector<int64_t>{in_features, out_features};
173178

174-
auto grad_input = std::make_shared<Tensor>(input_dims, DataType::kFLOAT32);
175-
auto grad_weight = std::make_shared<Tensor>(weight_dims, DataType::kFLOAT32);
179+
std::shared_ptr<Tensor> grad_input = nullptr;
180+
std::shared_ptr<Tensor> grad_weight = nullptr;
176181
std::shared_ptr<Tensor> grad_bias = nullptr;
177-
if (bias) {
178-
grad_bias = std::make_shared<Tensor>(std::vector<int64_t>{out_features}, DataType::kFLOAT32);
182+
183+
if (compute_grad_input) {
184+
CHECK(weight != nullptr) << "compute_grad_input=true but weight is nullptr (selective save mismatch)";
185+
grad_input = std::make_shared<Tensor>(input_dims, DataType::kFLOAT32);
186+
if (transpose) {
187+
grad_input->EigenMatrix() = grad_output->EigenMatrix() * weight->EigenMatrix();
188+
} else {
189+
grad_input->EigenMatrix() = grad_output->EigenMatrix() * weight->EigenMatrix().transpose();
190+
}
179191
}
180192

181-
if (transpose) {
182-
grad_input->EigenMatrix() = grad_output->EigenMatrix() * weight->EigenMatrix();
183-
grad_weight->EigenMatrix() = grad_output->EigenMatrix().transpose() * input->EigenMatrix();
184-
} else {
185-
grad_input->EigenMatrix() = grad_output->EigenMatrix() * weight->EigenMatrix().transpose();
186-
grad_weight->EigenMatrix() = input->EigenMatrix().transpose() * grad_output->EigenMatrix();
193+
if (compute_grad_weight) {
194+
CHECK(input != nullptr) << "compute_grad_weight=true but input is nullptr (selective save mismatch)";
195+
grad_weight = std::make_shared<Tensor>(weight_dims, DataType::kFLOAT32);
196+
if (transpose) {
197+
grad_weight->EigenMatrix() = grad_output->EigenMatrix().transpose() * input->EigenMatrix();
198+
} else {
199+
grad_weight->EigenMatrix() = input->EigenMatrix().transpose() * grad_output->EigenMatrix();
200+
}
187201
}
188-
if (bias) {
202+
203+
if (compute_grad_bias && bias) {
204+
grad_bias = std::make_shared<Tensor>(std::vector<int64_t>{out_features}, DataType::kFLOAT32);
189205
grad_bias->EigenVector() = grad_output->EigenMatrix().colwise().sum();
190206
}
191207

0 commit comments

Comments
 (0)