From 9cf00eb012c808a16911c7b5ed3a843abbb39e1a Mon Sep 17 00:00:00 2001
From: Eren Dogan <erendogan@google.com>
Date: Fri, 5 Jun 2026 12:54:51 -0700
Subject: [PATCH 1/8] rsz: Add global sizing

Signed-off-by: Eren Dogan <erendogan@google.com>
---
 src/rsz/BUILD                              |    4 +
 src/rsz/include/rsz/Resizer.hh             |   17 +
 src/rsz/src/CMakeLists.txt                 |    2 +
 src/rsz/src/LRSubproblem.cc                |  544 ++++++
 src/rsz/src/LRSubproblem.hh                |  221 +++
 src/rsz/src/Optimizer.cc                   |    9 +-
 src/rsz/src/Resizer.cc                     |   41 +-
 src/rsz/src/policy/GlobalSizingPolicy.cc   |  922 +++++++++
 src/rsz/src/policy/GlobalSizingPolicy.hh   |  152 ++
 src/rsz/test/BUILD                         |    3 +
 src/rsz/test/CMakeLists.txt                |    2 +
 src/rsz/test/global_sizing.tcl             |   35 +
 src/rsz/test/global_sizing.vok             | 2019 ++++++++++++++++++++
 src/rsz/test/global_sizing_threads.tcl     |    7 +
 src/rsz/test/repair_setup_invalid_phase.ok |    4 +-
 15 files changed, 3969 insertions(+), 13 deletions(-)
 create mode 100644 src/rsz/src/LRSubproblem.cc
 create mode 100644 src/rsz/src/LRSubproblem.hh
 create mode 100644 src/rsz/src/policy/GlobalSizingPolicy.cc
 create mode 100644 src/rsz/src/policy/GlobalSizingPolicy.hh
 create mode 100644 src/rsz/test/global_sizing.tcl
 create mode 100644 src/rsz/test/global_sizing.vok
 create mode 100644 src/rsz/test/global_sizing_threads.tcl

diff --git a/src/rsz/BUILD b/src/rsz/BUILD
index a862f2a97bb..f816b03e1db 100644
--- a/src/rsz/BUILD
+++ b/src/rsz/BUILD
@@ -23,6 +23,8 @@ cc_library(
         "src/DelayEstimator.hh",
         "src/DelayEstimatorReporter.cc",
         "src/DelayEstimatorReporter.hh",
+        "src/LRSubproblem.cc",
+        "src/LRSubproblem.hh",
         "src/MoveCommitter.cc",
         "src/MoveCommitter.hh",
         "src/MoveTracker.cc",
@@ -104,6 +106,8 @@ cc_library(
         "src/move/VtSwapMtCandidate.hh",
         "src/move/VtSwapMtGenerator.cc",
         "src/move/VtSwapMtGenerator.hh",
+        "src/policy/GlobalSizingPolicy.cc",
+        "src/policy/GlobalSizingPolicy.hh",
         "src/policy/MeasuredVtSwapPolicy.cc",
         "src/policy/MeasuredVtSwapPolicy.hh",
         "src/policy/OptimizationPolicy.cc",
diff --git a/src/rsz/include/rsz/Resizer.hh b/src/rsz/include/rsz/Resizer.hh
index ffe5cc4405d..ad43a93241a 100644
--- a/src/rsz/include/rsz/Resizer.hh
+++ b/src/rsz/include/rsz/Resizer.hh
@@ -690,6 +690,16 @@ class Resizer : public sta::dbStaState, public sta::dbNetworkObserver
                   // Return values.
                   sta::ArcDelay delays[sta::RiseFall::index_count],
                   sta::Slew slews[sta::RiseFall::index_count]);
+  // Worker-safe overload: uses the caller-provided ArcDelayCalc instead of the
+  // shared member, so the table-model lookup can run concurrently.
+  void gateDelays(const sta::LibertyPort* drvr_port,
+                  float load_cap,
+                  const sta::Scene* scene,
+                  const sta::MinMax* min_max,
+                  sta::ArcDelayCalc* arc_delay_calc,
+                  // Return values.
+                  sta::ArcDelay delays[sta::RiseFall::index_count],
+                  sta::Slew slews[sta::RiseFall::index_count]);
   void gateDelays(const sta::LibertyPort* drvr_port,
                   float load_cap,
                   const sta::Slew in_slews[sta::RiseFall::index_count],
@@ -702,6 +712,12 @@ class Resizer : public sta::dbStaState, public sta::dbNetworkObserver
                           float load_cap,
                           const sta::Scene* scene,
                           const sta::MinMax* min_max);
+  // Worker-safe overload (see gateDelays above).
+  sta::ArcDelay gateDelay(const sta::LibertyPort* drvr_port,
+                          float load_cap,
+                          const sta::Scene* scene,
+                          const sta::MinMax* min_max,
+                          sta::ArcDelayCalc* arc_delay_calc);
   sta::ArcDelay gateDelay(const sta::LibertyPort* drvr_port,
                           const sta::RiseFall* rf,
                           float load_cap,
@@ -1028,6 +1044,7 @@ class Resizer : public sta::dbStaState, public sta::dbNetworkObserver
   friend class OdbCallBack;
   friend class SetupLegacyBase;
   friend class RepairTargetCollector;
+  friend class LRSubproblem;
   friend class DelayEstimatorReporter;
 };
 
diff --git a/src/rsz/src/CMakeLists.txt b/src/rsz/src/CMakeLists.txt
index 7ebe9fbceeb..43dbdd2774a 100644
--- a/src/rsz/src/CMakeLists.txt
+++ b/src/rsz/src/CMakeLists.txt
@@ -28,6 +28,7 @@ add_library(rsz_lib
     policy/MeasuredVtSwapPolicy.cc
     DelayEstimator.cc
     DelayEstimatorReporter.cc
+    LRSubproblem.cc
     policy/SetupCritVtSwapPolicy.cc
     policy/SetupDirectionalPolicy.cc
     policy/SetupLastGaspPolicy.cc
@@ -36,6 +37,7 @@ add_library(rsz_lib
     policy/SetupWnsPolicy.cc
     policy/SetupMt1Policy.cc
     policy/SetupReroutePolicy.cc
+    policy/GlobalSizingPolicy.cc
     Resizer.cc
     OdbCallBack.cc
     ConcreteSwapArithModules.cc
diff --git a/src/rsz/src/LRSubproblem.cc b/src/rsz/src/LRSubproblem.cc
new file mode 100644
index 00000000000..34eac8cd2e8
--- /dev/null
+++ b/src/rsz/src/LRSubproblem.cc
@@ -0,0 +1,544 @@
+// SPDX-License-Identifier: BSD-3-Clause
+// Copyright (c) 2026-2026, The OpenROAD Authors
+
+#include "LRSubproblem.hh"
+
+#include <algorithm>
+#include <limits>
+#include <memory>
+#include <optional>
+#include <utility>
+#include <vector>
+
+#include "db_sta/dbNetwork.hh"
+#include "db_sta/dbSta.hh"
+#include "odb/db.h"
+#include "rsz/Resizer.hh"
+#include "sta/Delay.hh"
+#include "sta/Graph.hh"
+#include "sta/GraphDelayCalc.hh"
+#include "sta/Liberty.hh"
+#include "sta/LibertyClass.hh"
+#include "sta/Network.hh"
+#include "sta/NetworkClass.hh"
+#include "sta/PortDirection.hh"
+#include "sta/Scene.hh"
+#include "sta/Sta.hh"
+#include "sta/TimingRole.hh"
+#include "sta/Transition.hh"
+
+namespace rsz {
+
+namespace {
+
+// Resizer::area(Cell*) is protected, so rebuild the area from public calls
+// only: db_network->staToDb for the master, Resizer::dbuToMeters for its width
+// and height. Returns 0 if cell/master is null or !isCoreAutoPlaceable().
+double cellAreaSI(const Resizer& resizer,
+                  sta::dbNetwork* db_network,
+                  sta::LibertyCell* cell)
+{
+  odb::dbMaster* db_master = nullptr;
+  if (cell != nullptr) {
+    db_master = db_network->staToDb(db_network->cell(cell));
+  }
+  if (db_master == nullptr || !db_master->isCoreAutoPlaceable()) {
+    return 0.0;
+  }
+  return resizer.dbuToMeters(db_master->getWidth())
+         * resizer.dbuToMeters(db_master->getHeight());
+}
+
+// File-local output-side DRC predicates, mirroring the file-local statics used
+// by SizeDownGenerator.cc. Each returns true when the proposed replacement
+// WOULD introduce a violation. Both are pure Liberty/SDC reads and so are
+// safe to call from worker threads.
+
+// True when `output_cap` exceeds a positive max-cap limit on `output_port`.
+bool checkOutputMaxCap(sta::LibertyPort* output_port,
+                       const float output_cap,
+                       const sta::MinMax* max_mm)
+{
+  float limit = 0.0f;
+  bool has_limit = false;
+  output_port->capacitanceLimit(max_mm, limit, has_limit);
+  return has_limit && limit > 0.0f && output_cap > limit;
+}
+
+// True when factor * R_drive(candidate) * C_out exceeds the port's slew limit.
+bool checkOutputMaxSlew(sta::dbSta* sta,
+                        sta::LibertyPort* candidate_port,
+                        const float output_slew_factor,
+                        const float output_cap,
+                        const sta::Scene* scene,
+                        const sta::MinMax* max_mm)
+{
+  const auto drive_res = candidate_port->driveResistance();
+  const float est_slew = output_slew_factor * drive_res * output_cap;
+  float limit = 0.0f;
+  bool has_limit = false;
+  sta->findSlewLimit(candidate_port, scene, max_mm, limit, has_limit);
+  return has_limit && est_slew > limit;
+}
+
+}  // namespace
+
+LRSubproblem::LRSubproblem(Resizer* resizer) : resizer_(resizer)
+{
+}
+
+// Idempotent lazy setup: binds logger/STA/network and computes the scale once.
+void LRSubproblem::init()
+{
+  if (!initialized_) {
+    logger_ = resizer_->logger();
+    dbStaState::init(resizer_->sta());
+    db_network_ = resizer_->dbNetwork();
+    computeLeakageScale();
+    initialized_ = true;
+  }
+}
+
+// Sets area_to_leakage_scale_ = median(leakage) / median(area) over the leaf
+// instances whose cell reports leakage and has a positive area.
+void LRSubproblem::computeLeakageScale()
+{
+  std::vector<float> leak_vals;
+  std::vector<float> area_vals;
+  std::unique_ptr<sta::LeafInstanceIterator> leaf_iter(
+      network_->leafInstanceIterator());
+  while (leaf_iter->hasNext()) {
+    sta::LibertyCell* lib_cell = network_->libertyCell(leaf_iter->next());
+    if (lib_cell == nullptr) {
+      continue;
+    }
+    const std::optional<float> cell_leak = resizer_->cellLeakage(lib_cell);
+    if (!cell_leak.has_value()) {
+      continue;
+    }
+    const double cell_area = cellAreaSI(*resizer_, db_network_, lib_cell);
+    if (cell_area <= 0.0) {
+      continue;
+    }
+    leak_vals.push_back(*cell_leak);
+    area_vals.push_back(static_cast<float>(cell_area));
+  }
+
+  if (leak_vals.empty()) {
+    // No instance exposes leakage: a zero scale makes leakageOrArea return
+    // raw area, which is order-preserving within this design.
+    area_to_leakage_scale_ = 0.0f;
+    return;
+  }
+
+  // The vectors grow in lockstep; each median is selected independently.
+  const auto mid = leak_vals.size() / 2;
+  std::nth_element(leak_vals.begin(), leak_vals.begin() + mid, leak_vals.end());
+  const float med_leak = leak_vals[mid];
+  std::nth_element(area_vals.begin(), area_vals.begin() + mid, area_vals.end());
+  const float med_area = area_vals[mid];
+  area_to_leakage_scale_ = med_area > 0.0f ? med_leak / med_area : 0.0f;
+}
+
+// Liberty leakage if present; else footprint area `fp`, scaled when possible.
+float LRSubproblem::leakageOrArea(sta::LibertyCell* cell) const
+{
+  if (const std::optional<float> leak = resizer_->cellLeakage(cell)) {
+    return *leak;
+  }
+  const auto fp = static_cast<float>(cellAreaSI(*resizer_, db_network_, cell));
+  return area_to_leakage_scale_ > 0.0f ? area_to_leakage_scale_ * fp : fp;
+}
+
+// Decides whether `edge` counts as a data arc for λ accounting. An edge is
+// rejected when any of these holds:
+//   - its role is a timing check,
+//   - isDisabledLoop() is set on the edge,
+//   - its role is latch D->Q or latch EN->Q.
+bool LRSubproblem::isDataArc(const sta::Edge* edge) const
+{
+  const sta::TimingRole* const arc_role = edge->role();
+  if (arc_role->isTimingCheck() || edge->isDisabledLoop()) {
+    return false;
+  }
+  const bool latch_d_to_q = arc_role == sta::TimingRole::latchDtoQ();
+  const bool latch_en_to_q = arc_role == sta::TimingRole::latchEnToQ();
+  return !(latch_d_to_q || latch_en_to_q);
+}
+
+// Max over rise/fall of port `port_name`'s capacitance on `cell`, starting
+// from 0; stays 0 when the cell has no port of that name.
+float LRSubproblem::portInputCap(sta::LibertyCell* cell,
+                                 const char* port_name) const
+{
+  float worst_cap = 0.0f;
+  if (sta::LibertyPort* lib_port = cell->findLibertyPort(port_name)) {
+    for (const auto edge : sta::RiseFall::range()) {
+      worst_cap = std::max(worst_cap, lib_port->capacitance(edge, max_));
+    }
+  }
+  return worst_cap;
+}
+
+// Journaled Resizer::replaceCell wrapper; refuses null arguments.
+bool LRSubproblem::applyReplacement(sta::Instance* inst,
+                                    sta::LibertyCell* replacement)
+{
+  const bool have_args = inst != nullptr && replacement != nullptr;
+  return have_args
+         && resizer_->replaceCell(inst, replacement, /*journal=*/true);
+}
+
+bool LRSubproblem::snapshot(sta::Instance* inst,
+                            const float* lambda,
+                            const int lambda_size,
+                            GateSnapshot& snap)
+{
+  init();
+
+  if (resizer_->dontTouch(inst)) {
+    return false;
+  }
+  sta::LibertyCell* cur_cell = network_->libertyCell(inst);
+  if (cur_cell == nullptr) {
+    return false;
+  }
+
+  const sta::Scene* scene = sta_->cmdScene();
+  const sta::MinMax* max_mm = max_;
+
+  snap.inst = inst;
+  snap.cur_cell = cur_cell;
+  snap.scene = scene;
+  snap.outputs.clear();
+  snap.upstream.clear();
+  snap.inputs.clear();
+  snap.candidates.clear();
+
+  std::unique_ptr<sta::InstancePinIterator> pit(network_->pinIterator(inst));
+  while (pit->hasNext()) {
+    sta::Pin* pin = pit->next();
+    const sta::PortDirection* dir = network_->direction(pin);
+    if (dir->isOutput()) {
+      sta::Vertex* v = graph_->pinDrvrVertex(pin);
+      if (v == nullptr) {
+        continue;
+      }
+      const sta::LibertyPort* out_port = network_->libertyPort(pin);
+      if (out_port == nullptr) {
+        continue;
+      }
+      float lam_sum = 0.0f;
+      sta::VertexInEdgeIterator ieit(v, graph_);
+      while (ieit.hasNext()) {
+        sta::Edge* e = ieit.next();
+        if (!isDataArc(e)) {
+          continue;
+        }
+        // Restrict to gate-internal arcs (from a pin on the same instance).
+        const sta::Pin* from_pin = e->from(graph_)->pin();
+        if (network_->instance(from_pin) != inst) {
+          continue;
+        }
+        const sta::EdgeId id = graph_->id(e);
+        if (static_cast<int>(id) >= lambda_size) {
+          continue;
+        }
+        lam_sum += lambda[id];
+      }
+      OutputCtx o;
+      o.port = out_port;
+      o.load_cap = graph_delay_calc_->loadCap(pin, scene, max_mm);
+      o.lambda_sum = lam_sum;
+      // Freeze the Elmore-slew DRC inputs. slew is the STA slew at the output
+      // pin's load vertex, taken over sta_->scenes() (load_cap above uses only
+      // `scene`); it is constant across candidates, so read it once here.
+      sta::Vertex* load_v = graph_->pinLoadVertex(pin);
+      o.slew = (load_v != nullptr)
+                   ? sta::delayAsFloat(sta_->slew(load_v,
+                                                  sta::RiseFallBoth::riseFall(),
+                                                  sta_->scenes(),
+                                                  max_mm))
+                   : 0.0f;
+      o.drive_res = out_port->driveResistance();
+      snap.outputs.push_back(o);
+    } else if (dir->isInput()) {
+      const sta::LibertyPort* in_port = network_->libertyPort(pin);
+
+      // (a) Input-side max-cap DRC context for every input pin: freeze each
+      // fanin driver's current cap-check so workers can replay
+      // Resizer::replacementPreservesMaxCap without touching live STA.
+      if (in_port != nullptr) {
+        sta::PinSet* drivers = network_->drivers(pin);
+        if (drivers != nullptr) {
+          InputMaxCapCtx in_ctx;
+          in_ctx.in_port = in_port;
+          in_ctx.old_cap = portInputCap(cur_cell, in_port->name().c_str());
+          for (const sta::Pin* driver_pin : *drivers) {
+            float cap = 0.0f;
+            float max_cap = 0.0f;
+            float cap_slack = 0.0f;
+            const sta::RiseFall* tr = nullptr;
+            const sta::Scene* corner = nullptr;
+            sta_->checkCapacitance(driver_pin,
+                                   sta_->scenes(),
+                                   max_mm,
+                                   cap,
+                                   max_cap,
+                                   cap_slack,
+                                   tr,
+                                   corner);
+            DriverCapCheck dc;
+            dc.cap = cap;
+            dc.max_cap = max_cap;
+            dc.cap_slack = cap_slack;
+            dc.corner_ok = (max_cap > 0.0f && corner != nullptr);
+            in_ctx.drivers.push_back(dc);
+          }
+          snap.inputs.push_back(std::move(in_ctx));
+        }
+      }
+
+      // (b) Upstream-Cin context: only input pins with real upstream pressure.
+      sta::Vertex* in_v = graph_->pinLoadVertex(pin);
+      if (in_v == nullptr) {
+        continue;
+      }
+      // Locate the driver pin via the wire arc(s) feeding in_v. There's
+      // typically exactly one; take the first valid one.
+      sta::Pin* drv_pin = nullptr;
+      sta::VertexInEdgeIterator wireIt(in_v, graph_);
+      while (wireIt.hasNext()) {
+        sta::Edge* w = wireIt.next();
+        if (w->isDisabledLoop()) {
+          continue;
+        }
+        sta::Pin* candidate_drv = w->from(graph_)->pin();
+        if (candidate_drv != nullptr && candidate_drv != pin) {
+          drv_pin = candidate_drv;
+          break;
+        }
+      }
+      if (drv_pin == nullptr) {
+        continue;  // floating / no driver
+      }
+      sta::Instance* upstream_inst = network_->instance(drv_pin);
+      if (upstream_inst == nullptr || upstream_inst == inst) {
+        continue;
+      }
+      sta::LibertyCell* upstream_cell = network_->libertyCell(upstream_inst);
+      if (upstream_cell == nullptr) {
+        // PI / hierarchical / black box - no Liberty model to evaluate.
+        continue;
+      }
+      sta::LibertyPort* drv_port = network_->libertyPort(drv_pin);
+      if (drv_port == nullptr) {
+        continue;
+      }
+      sta::Vertex* drv_v = graph_->pinDrvrVertex(drv_pin);
+      if (drv_v == nullptr) {
+        continue;
+      }
+      // Sum λ over U's gate-internal data arcs terminating at drv_pin.
+      float lam_U = 0.0f;
+      sta::VertexInEdgeIterator drvIt(drv_v, graph_);
+      while (drvIt.hasNext()) {
+        sta::Edge* e = drvIt.next();
+        if (!isDataArc(e)) {
+          continue;
+        }
+        const sta::Pin* from_pin = e->from(graph_)->pin();
+        if (network_->instance(from_pin) != upstream_inst) {
+          continue;
+        }
+        const sta::EdgeId id = graph_->id(e);
+        if (static_cast<int>(id) >= lambda_size) {
+          continue;
+        }
+        lam_U += lambda[id];
+      }
+      // Skip pins with no real upstream pressure - saves the per-candidate
+      // gateDelay call for arcs whose λ is essentially at floor anyway.
+      if (lam_U <= 0.0f) {
+        continue;
+      }
+      if (in_port == nullptr) {
+        continue;
+      }
+      UpstreamCtx u;
+      u.orig_in_port = in_port;
+      u.drv_port = drv_port;
+      u.load_U_cur = graph_delay_calc_->loadCap(drv_pin, scene, max_mm);
+      u.c_in_cur = portInputCap(cur_cell, in_port->name().c_str());
+      u.lambda_U_drv = lam_U;
+      snap.upstream.push_back(u);
+    }
+  }
+
+  if (snap.outputs.empty()) {
+    return false;
+  }
+
+  // Precompute leakage-equivalent cost for the current cell and every
+  // candidate now, on the main thread - leakageOrArea/getSwappableCells mutate
+  // lazy caches and must not be touched from workers.
+  snap.cur_leakage = leakageOrArea(cur_cell);
+  sta::LibertyCellSeq candidates = resizer_->getSwappableCells(cur_cell);
+  snap.candidates.reserve(candidates.size());
+  for (sta::LibertyCell* cand : candidates) {
+    if (cand == cur_cell) {
+      continue;
+    }
+    Candidate c;
+    c.cell = cand;
+    c.leakage = leakageOrArea(cand);
+    snap.candidates.push_back(c);
+  }
+
+  return true;
+}
+
+// Lagrangian cost of running `cell` at the snapshotted instance:
+//   cell_leakage + timing_weight * Σ λ·d over (a) this gate's output arcs at
+//   their frozen loads and (b) upstream driver arcs at the load obtained by
+//   swapping in the candidate's input cap. Returns +infinity when `cell` has
+//   no matching output port or reports zero cap on a priced input port.
+float LRSubproblem::evaluateCellCost(const GateSnapshot& snap,
+                                     sta::LibertyCell* cell,
+                                     const float cell_leakage,
+                                     const float timing_weight,
+                                     sta::ArcDelayCalc* arc_delay_calc) const
+{
+  constexpr float kReject = std::numeric_limits<float>::infinity();
+  float total = cell_leakage;
+
+  // (a) Own-output term.
+  for (const OutputCtx& out : snap.outputs) {
+    if (out.lambda_sum == 0.0f || out.port == nullptr) {
+      continue;  // no timing pressure on this output pin
+    }
+    sta::LibertyPort* cand_port = cell->findLibertyPort(out.port->name());
+    if (cand_port == nullptr) {
+      return kReject;  // candidate cell is missing this output port
+    }
+    const sta::ArcDelay delay = resizer_->gateDelay(
+        cand_port, out.load_cap, snap.scene, max_, arc_delay_calc);
+    total += timing_weight * out.lambda_sum * sta::delayAsFloat(delay);
+  }
+
+  // (b) Upstream-Cin term: U's arcs into the pin driving one of inst's inputs
+  // see a load that includes inst's input cap on that pin.
+  for (const UpstreamCtx& up : snap.upstream) {
+    if (up.lambda_U_drv == 0.0f || up.drv_port == nullptr
+        || up.orig_in_port == nullptr) {
+      continue;
+    }
+    const float cand_cin = portInputCap(cell, up.orig_in_port->name().c_str());
+    if (cand_cin == 0.0f) {
+      return kReject;  // port missing on the candidate (or zero cap)
+    }
+    // Clamp at zero rather than rejecting: an extreme C_in mismatch can push
+    // the perturbed load slightly negative, and the gateDelay LUT is treated
+    // as well-defined at zero load.
+    const float new_load
+        = std::max(up.load_U_cur - up.c_in_cur + cand_cin, 0.0f);
+    const sta::ArcDelay delay = resizer_->gateDelay(
+        up.drv_port, new_load, snap.scene, max_, arc_delay_calc);
+    total += timing_weight * up.lambda_U_drv * sta::delayAsFloat(delay);
+  }
+  return total;
+}
+
+// Worker-side DRC gate for `replacement` against the frozen snapshot; returns
+// false at the first input-side or output-side check that fails.
+bool LRSubproblem::candidateDrcOkSnapshot(const GateSnapshot& snap,
+                                          sta::LibertyCell* replacement) const
+{
+  // Input side, mirroring Resizer::replacementPreservesMaxCap / checkMaxCapOK
+  // on the per-driver cap checks frozen by snapshot(): a larger input pin cap
+  // must not push a fanin driver past its limit nor worsen an existing
+  // violation.
+  for (const InputMaxCapCtx& in_ctx : snap.inputs) {
+    if (in_ctx.in_port == nullptr) {
+      continue;
+    }
+    const float cap_increase
+        = portInputCap(replacement, in_ctx.in_port->name().c_str())
+          - in_ctx.old_cap;
+    if (cap_increase <= 0.0f) {
+      continue;  // input pin cap does not grow
+    }
+    for (const DriverCapCheck& drv : in_ctx.drivers) {
+      if (!drv.corner_ok) {
+        continue;
+      }
+      // A driver with negative cap slack is compared against its current cap;
+      // otherwise against the limit itself.
+      const float ceiling = drv.cap_slack < 0.0f ? drv.cap : drv.max_cap;
+      if (drv.cap + cap_increase > ceiling) {
+        return false;
+      }
+    }
+  }
+
+  // Output side: per-output-pin check against the candidate port's limits.
+  for (const OutputCtx& out : snap.outputs) {
+    if (out.port == nullptr) {
+      continue;
+    }
+    sta::LibertyPort* cand_port
+        = replacement->findLibertyPort(out.port->name());
+    if (cand_port == nullptr
+        || checkOutputMaxCap(cand_port, out.load_cap, max_)) {
+      return false;  // output port missing on the candidate, or over max-cap
+    }
+    // Slew per unit R*C of the current cell; left at 0 when R or C is not > 0.
+    float slew_factor = 0.0f;
+    if (out.drive_res > 0.0f && out.load_cap > 0.0f) {
+      slew_factor = out.slew / (out.drive_res * out.load_cap);
+    }
+    if (checkOutputMaxSlew(
+            sta_, cand_port, slew_factor, out.load_cap, snap.scene, max_)) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+// Scores the current cell and every DRC-clean candidate of `snap` and keeps
+// the strictly cheapest one. best_cell stays null when no candidate beats the
+// baseline cost of the current cell.
+LRSubproblem::GateDecision LRSubproblem::evaluateSnapshot(
+    const GateSnapshot& snap,
+    const float timing_weight,
+    sta::ArcDelayCalc* arc_delay_calc) const
+{
+  GateDecision decision;
+  decision.inst = snap.inst;
+  // Baseline: cost of keeping the current cell.
+  decision.baseline_cost = evaluateCellCost(
+      snap, snap.cur_cell, snap.cur_leakage, timing_weight, arc_delay_calc);
+  decision.best_cost = decision.baseline_cost;
+
+  float winner_leakage = snap.cur_leakage;
+  for (const Candidate& option : snap.candidates) {
+    // Hard filter: skip any cell that would add a max-cap/max-slew violation.
+    if (candidateDrcOkSnapshot(snap, option.cell)) {
+      const float option_cost = evaluateCellCost(
+          snap, option.cell, option.leakage, timing_weight, arc_delay_calc);
+      if (option_cost < decision.best_cost) {
+        decision.best_cost = option_cost;
+        decision.best_cell = option.cell;
+        winner_leakage = option.leakage;
+      }
+    }
+  }
+
+  // The downsize flag is only meaningful once a replacement was selected.
+  decision.best_is_downsize
+      = decision.best_cell != nullptr && winner_leakage < snap.cur_leakage;
+  return decision;
+}
+
+}  // namespace rsz
diff --git a/src/rsz/src/LRSubproblem.hh b/src/rsz/src/LRSubproblem.hh
new file mode 100644
index 00000000000..008a27b69b0
--- /dev/null
+++ b/src/rsz/src/LRSubproblem.hh
@@ -0,0 +1,221 @@
+// SPDX-License-Identifier: BSD-3-Clause
+// Copyright (c) 2026-2026, The OpenROAD Authors
+
+#pragma once
+
+#include <vector>
+
+#include "db_sta/dbNetwork.hh"
+#include "db_sta/dbSta.hh"
+#include "rsz/Resizer.hh"
+#include "sta/Liberty.hh"
+#include "sta/MinMax.hh"
+#include "sta/NetworkClass.hh"
+#include "utl/Logger.h"
+
+namespace sta {
+class ArcDelayCalc;
+class Edge;
+class Pin;
+class Scene;
+}  // namespace sta
+
+namespace rsz {
+
+class Resizer;
+
+// LRSubproblem: Evaluates the per-gate Lagrangian subproblem
+//
+//     minimize_{x ∈ S_i}
+//       leakage(x)
+//       + Σ_{e ∈ out(i)} λ_e · d_e(x)
+//       + Σ_{p ∈ inputs(i)} Σ_{e ∈ arcs_to_drv(p)} λ_e · d_e(U, load_perturbed)
+//
+// where load_perturbed = load_U - C_in(current x_i, p) + C_in(candidate, p).
+// The first sum prices the gate's own internal arcs; the second prices the
+// upstream driver U's delay change caused by varying the candidate's input
+// capacitance on pin p.
+//
+// === Threading model ========================================================
+// The evaluation is split so a Jacobi sweep can run in parallel:
+//   - snapshot(inst)         : MAIN THREAD ONLY. Reads live STA state (slews,
+//                              load caps, cap checks), fills the lazy Liberty
+//                              caches, and freezes everything needed to score a
+//                              gate into a GateSnapshot.
+//   - evaluateSnapshot(snap) : WORKER SAFE. Reads only `snap`, read-only
+//                              Liberty data, slew limits via findSlewLimit, and
+//                              the caller's per-thread ArcDelayCalc. No graph
+//                              reads, no shared arc_delay_calc_, no mutation.
+// Replacements chosen by evaluateSnapshot are applied later, serially, via
+// applyReplacement.
+class LRSubproblem : public sta::dbStaState
+{
+ public:
+  // Per-input-pin upstream context, captured once per snapshot() call and
+  // reused across every candidate cell. Each entry corresponds to one input
+  // pin of the instance whose driver belongs to a real upstream standard cell.
+  // Pins with no driver (PIs), driverless nets, or whose upstream sum-of-λ is
+  // at the floor are filtered out at build time.
+  struct UpstreamCtx
+  {
+    // Input port of the instance under its current cell. Used to look up
+    // the same port (by name) on each candidate cell so we can read the
+    // candidate's input capacitance for this pin.
+    const sta::LibertyPort* orig_in_port = nullptr;
+    // Output port of the upstream driver U at this pin's driver. Constant
+    // across candidates - only the load it sees changes per candidate.
+    sta::LibertyPort* drv_port = nullptr;
+    // Current load capacitance at U's driver pin (farads). Includes the current
+    // cell's contribution; we subtract C_in(current) and add C_in(candidate) to
+    // get the perturbed load each candidate.
+    float load_U_cur = 0.0f;
+    // Input capacitance on this pin under the instance's CURRENT cell.
+    float c_in_cur = 0.0f;
+    // Σλ over U's gate-internal data arcs that terminate at the driver pin.
+    // These are the arcs whose delay depends on the load U drives.
+    float lambda_U_drv = 0.0f;
+  };
+
+  // Frozen per-output-pin electrical state for one instance.
+  struct OutputCtx
+  {
+    // Output port under the instance's current cell. Candidate ports are looked
+    // up by name on each candidate cell.
+    const sta::LibertyPort* port = nullptr;
+    float load_cap = 0.0f;    // graph_delay_calc_->loadCap (frozen)
+    float lambda_sum = 0.0f;  // Σλ over gate-internal arcs into this pin
+    // Elmore-slew DRC inputs (frozen). slew is the STA graph slew at this pin's
+    // load vertex; drive_res is the current port's drive resistance.
+    // The candidate's output slew is estimated as
+    //   slew/(drive_res*load_cap) * cand_drive_res * load_cap.
+    float slew = 0.0f;
+    float drive_res = 0.0f;
+  };
+
+  // Snapshot of one driver pin's max-cap check on a fanin net.
+  struct DriverCapCheck
+  {
+    float cap = 0.0f;        // current load cap at the driver pin
+    float max_cap = 0.0f;    // cap limit
+    float cap_slack = 0.0f;  // current cap slack
+    bool corner_ok = false;  // max_cap > 0 && a corner was returned
+  };
+
+  // Per-input-pin context for the input-side max-cap DRC
+  // (Resizer::replacementPreservesMaxCap, frozen).
+  struct InputMaxCapCtx
+  {
+    const sta::LibertyPort* in_port = nullptr;  // current cell's input port
+    float old_cap = 0.0f;  // input pin cap under the CURRENT cell
+    std::vector<DriverCapCheck> drivers;
+  };
+
+  // One swappable candidate with its precomputed leakage-equivalent cost.
+  struct Candidate
+  {
+    sta::LibertyCell* cell = nullptr;
+    float leakage = 0.0f;  // leakageOrArea(cell), precomputed on main thread
+  };
+
+  // Everything evaluateSnapshot needs to score one instance, frozen on the
+  // main thread.
+  struct GateSnapshot
+  {
+    sta::Instance* inst = nullptr;
+    sta::LibertyCell* cur_cell = nullptr;
+    float cur_leakage = 0.0f;
+    const sta::Scene* scene = nullptr;
+    std::vector<OutputCtx> outputs;
+    std::vector<UpstreamCtx> upstream;
+    std::vector<InputMaxCapCtx> inputs;
+    std::vector<Candidate> candidates;  // excludes cur_cell
+  };
+
+  // Result of one per-gate evaluation, applied later in serial.
+  struct GateDecision
+  {
+    sta::Instance* inst = nullptr;
+    sta::LibertyCell* best_cell = nullptr;  // nullptr -> keep current
+    float best_cost = 0.0f;                 // leakage + Σλ·d at best_cell
+    float baseline_cost = 0.0f;             // same for current cell
+    // True iff best_cell has strictly lower leakage-equivalent cost than the
+    // current cell. Used by the outer loop to apply asymmetric acceptance:
+    // any cost drop is enough on a downsize, but timing-noise hysteresis
+    // still applies to upsizes. False when best_cell == nullptr.
+    bool best_is_downsize = false;
+  };
+
+  explicit LRSubproblem(Resizer* resizer);
+  ~LRSubproblem() override = default;
+
+  void init();
+
+  // MAIN THREAD ONLY. Capture the frozen state needed to evaluate `inst`.
+  // Returns false (and leaves `snap` unspecified) when `inst` is don't-touch,
+  // has no liberty cell, or has no usable output pin. `lambda` is indexed by
+  // sta::Edge::id (sparse, size `lambda_size`).
+  bool snapshot(sta::Instance* inst,
+                const float* lambda,
+                int lambda_size,
+                GateSnapshot& snap);
+
+  // WORKER SAFE. Evaluate the subproblem for a prepared snapshot using the
+  // caller-provided per-thread ArcDelayCalc. `timing_weight` scales the Σλ·d
+  // timing term against the leakage objective.
+  GateDecision evaluateSnapshot(const GateSnapshot& snap,
+                                float timing_weight,
+                                sta::ArcDelayCalc* arc_delay_calc) const;
+
+  // Leakage-equivalent cost for `cell`. Returns Resizer::cellLeakage when
+  // the Liberty exposes leakage; otherwise returns area · area-to-leakage
+  // scale (computed once at init() from cells that DO have leakage), or raw
+  // area when that scale is zero. Mutates a lazy cache; call only on the
+  // main thread.
+  float leakageOrArea(sta::LibertyCell* cell) const;
+
+  // Apply the LR-chosen replacement at `inst`. Wraps Resizer::replaceCell;
+  // returns true on success. Called from GlobalSizingPolicy in serial inside
+  // an open pass-level journal.
+  bool applyReplacement(sta::Instance* inst, sta::LibertyCell* replacement);
+
+ private:
+  bool isDataArc(const sta::Edge* edge) const;
+  // Walks leaf instances once to populate area_to_leakage_scale_; it stays
+  // zero in the degenerate case where no instance exposes leakage.
+  void computeLeakageScale();
+
+  // Worker-safe cost of running `cell` at the snapshotted instance.
+  // `cell_leakage` is the precomputed leakage-equivalent cost of `cell`.
+  float evaluateCellCost(const GateSnapshot& snap,
+                         sta::LibertyCell* cell,
+                         float cell_leakage,
+                         float timing_weight,
+                         sta::ArcDelayCalc* arc_delay_calc) const;
+
+  // Max over rise/fall of the input capacitance of port `port_name` on `cell`
+  // (farads). Returns 0 if the port is missing. Worker-safe (Liberty read).
+  float portInputCap(sta::LibertyCell* cell, const char* port_name) const;
+
+  // Worker-safe DRC filter over a frozen snapshot. Returns true iff installing
+  // `replacement` would not introduce any max-cap or max-slew violation -
+  // either on the input side (fanin nets due to larger input pin caps) or on
+  // each output pin (frozen load cap against the new cell's cap limit, and
+  // slew estimated from the new cell's drive resistance against its limit).
+  bool candidateDrcOkSnapshot(const GateSnapshot& snap,
+                              sta::LibertyCell* replacement) const;
+
+  Resizer* resizer_ = nullptr;
+  utl::Logger* logger_ = nullptr;
+  sta::dbNetwork* db_network_ = nullptr;
+
+  // Computed at init() from this design's (leakage, area) distribution on
+  // instances whose current cell exposes Liberty leakage. Used by
+  // leakageOrArea() to give area-only cells a leakage-equivalent cost.
+  // Zero when no instance exposes leakage (degenerate area-only case).
+  float area_to_leakage_scale_ = 0.0f;
+
+  const sta::MinMax* max_ = sta::MinMax::max();
+  bool initialized_ = false;
+};
+
+}  // namespace rsz
diff --git a/src/rsz/src/Optimizer.cc b/src/rsz/src/Optimizer.cc
index bd5e362594f..6569ca18023 100644
--- a/src/rsz/src/Optimizer.cc
+++ b/src/rsz/src/Optimizer.cc
@@ -10,6 +10,7 @@
 #include <utility>
 #include <vector>
 
+#include "GlobalSizingPolicy.hh"
 #include "MeasuredVtSwapPolicy.hh"
 #include "OptimizationPolicy.hh"
 #include "OptimizerTypes.hh"
@@ -141,6 +142,10 @@ std::unique_ptr<OptimizationPolicy> Optimizer::makePolicyForPhase(
     return std::make_unique<MeasuredVtSwapPolicy>(
         resizer_, committer_, setup_context, config_);
   }
+  if (phase_name == "GLOBAL_SIZING") {
+    return std::make_unique<GlobalSizingPolicy>(
+        resizer_, committer_, setup_context, config_);
+  }
   // Only public phase names are listed; experimental top-level tokens
   // (LEGACY_MT, MT1, MEASURED_VT_SWAP) are accepted but undocumented.
   resizer_.logger()->error(
@@ -148,7 +153,7 @@ std::unique_ptr<OptimizationPolicy> Optimizer::makePolicyForPhase(
       217,
       "Unknown phase name '{}'. Valid phase names are: LEGACY, WNS, "
       "WNS_PATH, WNS_CONE, TNS, ENDPOINT_FANIN, STARTPOINT_FANOUT, "
-      "LAST_GASP, CRIT_VT_SWAP, REROUTE",
+      "LAST_GASP, CRIT_VT_SWAP, REROUTE, GLOBAL_SIZING",
       phase_name);
   return nullptr;
 }
@@ -182,7 +187,7 @@ bool Optimizer::run()
         223,
         "No phase names specified. Valid phase names are: LEGACY, WNS, "
         "WNS_PATH, WNS_CONE, TNS, ENDPOINT_FANIN, STARTPOINT_FANOUT, "
-        "LAST_GASP, CRIT_VT_SWAP, REROUTE");
+        "LAST_GASP, CRIT_VT_SWAP, REROUTE, GLOBAL_SIZING");
   }
   const int phase_count = phase_names.size();
 
diff --git a/src/rsz/src/Resizer.cc b/src/rsz/src/Resizer.cc
index 0988fcf2876..ad6e4686f7a 100644
--- a/src/rsz/src/Resizer.cc
+++ b/src/rsz/src/Resizer.cc
@@ -4304,6 +4304,19 @@ void Resizer::gateDelays(const sta::LibertyPort* drvr_port,
                          // Return values.
                          sta::ArcDelay delays[sta::RiseFall::index_count],
                          sta::Slew slews[sta::RiseFall::index_count])
+{
+  gateDelays(
+      drvr_port, load_cap, scene, min_max, arc_delay_calc_, delays, slews);
+}
+
+void Resizer::gateDelays(const sta::LibertyPort* drvr_port,
+                         const float load_cap,
+                         const sta::Scene* scene,
+                         const sta::MinMax* min_max,
+                         sta::ArcDelayCalc* arc_delay_calc,
+                         // Return values.
+                         sta::ArcDelay delays[sta::RiseFall::index_count],
+                         sta::Slew slews[sta::RiseFall::index_count])
 {
   for (int rf_index : sta::RiseFall::rangeIndex()) {
     delays[rf_index] = -sta::INF;
@@ -4327,14 +4340,14 @@ void Resizer::gateDelays(const sta::LibertyPort* drvr_port,
         }
         sta::LoadPinIndexMap load_pin_index_map(network_);
         sta::ArcDcalcResult dcalc_result
-            = arc_delay_calc_->gateDelay(nullptr,
-                                         arc,
-                                         in_slew,
-                                         load_cap,
-                                         nullptr,
-                                         load_pin_index_map,
-                                         scene,
-                                         min_max);
+            = arc_delay_calc->gateDelay(nullptr,
+                                        arc,
+                                        in_slew,
+                                        load_cap,
+                                        nullptr,
+                                        load_pin_index_map,
+                                        scene,
+                                        min_max);
 
         const sta::ArcDelay& gate_delay = dcalc_result.gateDelay();
         const sta::Slew& drvr_slew = dcalc_result.drvrSlew();
@@ -4402,10 +4415,20 @@ sta::ArcDelay Resizer::gateDelay(const sta::LibertyPort* drvr_port,
                                  const float load_cap,
                                  const sta::Scene* scene,
                                  const sta::MinMax* min_max)
+{
+  return gateDelay(drvr_port, load_cap, scene, min_max, arc_delay_calc_);
+}
+
+sta::ArcDelay Resizer::gateDelay(const sta::LibertyPort* drvr_port,
+                                 const float load_cap,
+                                 const sta::Scene* scene,
+                                 const sta::MinMax* min_max,
+                                 sta::ArcDelayCalc* arc_delay_calc)
 {
   sta::ArcDelay delays[sta::RiseFall::index_count];
   sta::Slew slews[sta::RiseFall::index_count];
-  gateDelays(drvr_port, load_cap, scene, min_max, delays, slews);
+  gateDelays(
+      drvr_port, load_cap, scene, min_max, arc_delay_calc, delays, slews);
   return max(delays[sta::RiseFall::riseIndex()],
              delays[sta::RiseFall::fallIndex()]);
 }
diff --git a/src/rsz/src/policy/GlobalSizingPolicy.cc b/src/rsz/src/policy/GlobalSizingPolicy.cc
new file mode 100644
index 00000000000..3667e65139a
--- /dev/null
+++ b/src/rsz/src/policy/GlobalSizingPolicy.cc
@@ -0,0 +1,922 @@
+// SPDX-License-Identifier: BSD-3-Clause
+// Copyright (c) 2026-2026, The OpenROAD Authors
+
+#include "GlobalSizingPolicy.hh"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdlib>
+#include <memory>
+#include <optional>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "LRSubproblem.hh"
+#include "OptimizationPolicy.hh"
+#include "OptimizerTypes.hh"
+#include "db_sta/dbNetwork.hh"
+#include "db_sta/dbSta.hh"
+#include "est/EstimateParasitics.h"
+#include "odb/db.h"
+#include "rsz/Resizer.hh"
+#include "sta/ArcDelayCalc.hh"
+#include "sta/Delay.hh"
+#include "sta/Fuzzy.hh"
+#include "sta/Graph.hh"
+#include "sta/GraphClass.hh"
+#include "sta/GraphDelayCalc.hh"
+#include "sta/Liberty.hh"
+#include "sta/Network.hh"
+#include "sta/NetworkClass.hh"
+#include "sta/PortDirection.hh"
+#include "sta/Scene.hh"
+#include "sta/Sta.hh"
+#include "sta/TimingArc.hh"
+#include "sta/TimingRole.hh"
+#include "sta/Transition.hh"
+#include "utl/Logger.h"
+#include "utl/ThreadPool.h"
+
+namespace rsz {
+
+using utl::RSZ;
+
+GlobalSizingPolicy::GlobalSizingPolicy(Resizer& resizer,
+                                       MoveCommitter& committer,
+                                       RepairSetupContext& setup_context,
+                                       const OptimizerRunConfig& config)
+    : OptimizationPolicy(resizer, committer, setup_context, config)
+{
+}
+
+GlobalSizingPolicy::~GlobalSizingPolicy() = default;
+
+// Data-arc filter shared by the lambda_ walks in this file.
+// Rejects timing checks, disabled-loop edges and the latch D->Q / EN->Q
+// roles; every other edge counts as a data arc.
+bool GlobalSizingPolicy::isDataArc(const sta::Edge* edge) const
+{
+  const sta::TimingRole* const edge_role = edge->role();
+  // Role check first, then the disabled-loop flag.
+  if (edge_role->isTimingCheck() || edge->isDisabledLoop()) {
+    return false;
+  }
+  // Latch D->Q and EN->Q arcs are excluded as well.
+  const bool is_latch_arc = edge_role == sta::TimingRole::latchDtoQ()
+                            || edge_role == sta::TimingRole::latchEnToQ();
+  return !is_latch_arc;
+}
+
+// Largest delay over all timing arcs of `edge` at dcalc_ap_, starting from a
+// floor of zero; an edge with no timing arc set yields 0.
+float GlobalSizingPolicy::edgeMaxArcDelay(sta::Edge* edge) const
+{
+  float worst = 0.0f;
+  if (sta::TimingArcSet* const arcs = edge->timingArcSet()) {
+    for (sta::TimingArc* arc : arcs->arcs()) {
+      const float delay
+          = sta::delayAsFloat(graph_->arcDelay(edge, arc, dcalc_ap_));
+      worst = std::max(delay, worst);
+    }
+  }
+  return worst;
+}
+
+// Refreshes timing and rebuilds the multiplier storage from scratch:
+// lambda_ (indexed by edge id) and mu_ (one slot per endpoint), all zero.
+// Must run before seedMultipliers(), which indexes lambda_ unchecked.
+void GlobalSizingPolicy::allocate()
+{
+  // Required times first, so the delays and slacks read later are current.
+  sta_->findRequireds();
+  // Preambles for the DRC checks the per-gate subproblem relies on later.
+  sta_->checkCapacitancesPreamble(sta_->scenes());
+  sta_->checkSlewsPreamble();
+  sta_->checkFanoutPreamble();
+
+  const sta::Scene* scene = sta_->cmdScene();
+  dcalc_ap_ = scene->dcalcAnalysisPtIndex(policy_max_);
+
+  // lambda_ is indexed directly by sta::Edge id and ids are sparse, so one
+  // pass over the data edges finds the largest id (vector size = id + 1).
+  sta::EdgeId highest_id = 0;
+  int num_data_edges = 0;
+  for (sta::VertexIterator vertices(graph_); vertices.hasNext();) {
+    sta::Vertex* vertex = vertices.next();
+    for (sta::VertexOutEdgeIterator edges(vertex, graph_); edges.hasNext();) {
+      sta::Edge* edge = edges.next();
+      if (isDataArc(edge)) {
+        highest_id = std::max<sta::EdgeId>(graph_->id(edge), highest_id);
+        ++num_data_edges;
+      }
+    }
+  }
+
+  lambda_.assign(static_cast<size_t>(highest_id) + 1, 0.0f);
+
+  // Endpoint bookkeeping: a vertex's position in endpoint_vertices_ is also
+  // its index into mu_, and endpoint_index_ maps the vertex back to it.
+  endpoint_vertices_.clear();
+  endpoint_index_.clear();
+  const sta::VertexSet& endpoints = sta_->endpoints();
+  endpoint_vertices_.reserve(endpoints.size());
+  endpoint_index_.reserve(endpoints.size());
+  for (sta::Vertex* endpoint : endpoints) {
+    const int slot = static_cast<int>(endpoint_vertices_.size());
+    endpoint_index_.emplace(endpoint, slot);
+    endpoint_vertices_.push_back(endpoint);
+  }
+  mu_.assign(endpoint_vertices_.size(), 0.0f);
+
+  debugPrint(logger_,
+             RSZ,
+             "global_sizing",
+             2,
+             "LR allocate: edges={} (max_id={}), endpoints={}, dcalc_ap={}",
+             num_data_edges,
+             highest_id,
+             endpoint_vertices_.size(),
+             dcalc_ap_);
+}
+
+void GlobalSizingPolicy::seedMultipliers(const LRParams& params)
+{
+  // λ_e = max(d_e, lambda_floor)  (d_e: max arc delay across rise/fall)
+  float lambda_sum = 0.0f;
+  float lambda_max = 0.0f;
+  int seeded = 0;
+  sta::VertexIterator vit(graph_);
+  while (vit.hasNext()) {
+    sta::Vertex* v = vit.next();
+    sta::VertexOutEdgeIterator eit(v, graph_);
+    while (eit.hasNext()) {
+      sta::Edge* e = eit.next();
+      if (!isDataArc(e)) {
+        continue;
+      }
+      const float d = edgeMaxArcDelay(e);
+      const sta::EdgeId id = graph_->id(e);
+      const float seed = std::max(d, params.lambda_floor);
+      lambda_[id] = seed;
+      lambda_sum += seed;
+      lambda_max = std::max(lambda_max, seed);
+      ++seeded;
+    }
+  }
+
+  // μ_k = max(0, margin - slack_k)^p  (WNS-biased endpoint seed), then
+  // normalized so max(μ) = 1 (skipped when every μ is 0). This decouples the
+  // LR pressure's scale from raw slack units, keeping λ·d terms predictable.
+  float mu_max_raw = 0.0f;
+  int mu_nonzero = 0;
+  const float margin = params.setup_slack_margin;
+  const float p = params.mu_exponent;
+  for (size_t k = 0; k < endpoint_vertices_.size(); ++k) {
+    const sta::Slack slack = sta_->slack(endpoint_vertices_[k], policy_max_);
+    const float slack_f = sta::delayAsFloat(slack);
+    const float gap = margin - slack_f;
+    float mu = 0.0f;
+    if (gap > 0.0f) {
+      mu = std::pow(gap, p);
+      ++mu_nonzero;
+    }
+    mu_[k] = mu;
+    mu_max_raw = std::max(mu_max_raw, mu);
+  }
+  if (mu_max_raw > 0.0f) {
+    for (float& mu : mu_) {
+      mu /= mu_max_raw;
+    }
+  }
+  float mu_sum = 0.0f;
+  float mu_max = 0.0f;
+  for (const float mu : mu_) {
+    mu_sum += mu;
+    mu_max = std::max(mu_max, mu);
+  }
+
+  debugPrint(logger_,
+             RSZ,
+             "global_sizing",
+             2,
+             "LR seed: {} data arcs (λ sum={:.3g}, max={:.3g}, avg={:.3g}); "
+             "{}/{} endpoints violating (μ sum={:.3g}, max={:.3g})",
+             seeded,
+             lambda_sum,
+             lambda_max,
+             seeded ? lambda_sum / seeded : 0.0f,
+             mu_nonzero,
+             endpoint_vertices_.size(),
+             mu_sum,
+             mu_max);
+}
+
+void GlobalSizingPolicy::updateMultipliers(const LRParams& params)
+{
+  // μ: re-seed from current endpoint slacks. Fresh seed (rather than a
+  // multiplicative μ update) avoids the lock-in where an endpoint whose μ
+  // reached the floor can never re-activate when its slack regresses.
+  float mu_max_raw = 0.0f;
+  const float margin = params.setup_slack_margin;
+  const float p = params.mu_exponent;
+  int mu_nonzero = 0;
+  for (size_t k = 0; k < endpoint_vertices_.size(); ++k) {
+    const sta::Slack slack = sta_->slack(endpoint_vertices_[k], policy_max_);
+    const float slack_f = sta::delayAsFloat(slack);
+    const float gap = margin - slack_f;
+    float mu = 0.0f;
+    if (gap > 0.0f) {
+      mu = std::pow(gap, p);
+      ++mu_nonzero;
+    }
+    mu_[k] = mu;
+    mu_max_raw = std::max(mu_max_raw, mu);
+  }
+  if (mu_max_raw > 0.0f) {
+    for (float& mu : mu_) {
+      mu /= mu_max_raw;
+    }
+  }
+
+  // λ: dual-subgradient ascent, with α = clamp(params.beta, 0, 1) and
+  // ε = floor = params.lambda_floor:
+  //   g_e_norm = (d_e - (a_to - a_from)) / max(d_e, ε), clamped to [-1, 0]
+  //   λ_e ← max(floor, λ_e · (1 + α · g_e_norm))
+  //
+  // tight arc (g=0)        → λ unchanged
+  // full slack (g=−1)      → λ ← (1-α)·λ
+  //
+  // Arcs touching unconstrained vertices (sentinel arrivals from no-clock
+  // PIs/POs, |arrival| >= 1e6) are skipped - those have no meaningful slack
+  // and projection alone determines their λ.
+  const float alpha = std::clamp(params.beta, 0.0f, 1.0f);
+  const float kArrivalSentinel = 1e6f;
+  float lam_sum = 0.0f;
+  float lam_max = 0.0f;
+  int updated = 0;
+  int skipped_unconstrained = 0;
+  int tight_arcs = 0;
+  sta::VertexIterator vit(graph_);
+  while (vit.hasNext()) {
+    sta::Vertex* v = vit.next();
+    sta::VertexOutEdgeIterator eit(v, graph_);
+    while (eit.hasNext()) {
+      sta::Edge* e = eit.next();
+      if (!isDataArc(e)) {
+        continue;
+      }
+      const sta::EdgeId id = graph_->id(e);
+      if (static_cast<size_t>(id) >= lambda_.size()) {
+        continue;
+      }
+      const float d = edgeMaxArcDelay(e);
+      sta::Vertex* from_v = e->from(graph_);
+      sta::Vertex* to_v = e->to(graph_);
+      const float a_from = sta::delayAsFloat(sta_->arrival(
+          from_v, sta::RiseFallBoth::riseFall(), sta_->scenes(), policy_max_));
+      const float a_to = sta::delayAsFloat(sta_->arrival(
+          to_v, sta::RiseFallBoth::riseFall(), sta_->scenes(), policy_max_));
+      if (std::fabs(a_from) >= kArrivalSentinel
+          || std::fabs(a_to) >= kArrivalSentinel) {
+        ++skipped_unconstrained;
+        lam_sum += lambda_[id];
+        lam_max = std::max(lam_max, lambda_[id]);
+        continue;
+      }
+      const float arrival_diff = a_to - a_from;
+      const float denom = std::max(d, params.lambda_floor);
+      const float g_norm = (d - arrival_diff) / denom;
+      const float g_clamped = std::clamp(g_norm, -1.0f, 0.0f);
+      if (g_clamped > -1e-6f) {
+        ++tight_arcs;
+      }
+      const float scale = 1.0f + alpha * g_clamped;
+      lambda_[id] = std::max(lambda_[id] * scale, params.lambda_floor);
+      lam_sum += lambda_[id];
+      lam_max = std::max(lam_max, lambda_[id]);
+      ++updated;
+    }
+  }
+
+  debugPrint(logger_,
+             RSZ,
+             "global_sizing",
+             2,
+             "LR update: {} arcs subgradient-stepped "
+             "({} tight, {} unconstrained skipped); "
+             "λ sum={:.3g} max={:.3g}; "
+             "{}/{} endpoints violating",
+             updated,
+             tight_arcs,
+             skipped_unconstrained,
+             lam_sum,
+             lam_max,
+             mu_nonzero,
+             endpoint_vertices_.size());
+}
+
+// Rescales the λ entering each vertex (over data arcs) to match the λ
+// leaving it, or its μ at an endpoint; all writes floor at lambda_floor.
+void GlobalSizingPolicy::projectFlowBalance(const LRParams& params)
+{
+  // Collect all vertices and sort by level (descending) so we visit endpoints
+  // before their predecessors
+  std::vector<sta::Vertex*> vertices;
+  for (sta::VertexIterator vit(graph_); vit.hasNext();) {
+    vertices.push_back(vit.next());
+  }
+  std::sort(vertices.begin(),
+            vertices.end(),
+            [](const sta::Vertex* a, const sta::Vertex* b) {
+              return a->level() > b->level();
+            });
+
+  // allocate() sizes lambda_ once per run, but replacements applied between
+  // sweeps may leave data edges whose id is past that size. Ignore those
+  // edges, as updateMultipliers() does, rather than index out of bounds.
+  const auto tracked = [this](sta::Edge* e) {
+    return isDataArc(e)
+           && static_cast<size_t>(graph_->id(e)) < lambda_.size();
+  };
+
+  int rescaled = 0;
+  int zero_sum_fallback = 0;
+  for (sta::Vertex* v : vertices) {
+    // Target flow into v
+    float target = 0.0f;
+    const auto ep_it = endpoint_index_.find(v);
+    if (ep_it != endpoint_index_.end()) {
+      target = mu_[ep_it->second];
+    } else {
+      sta::VertexOutEdgeIterator oeit(v, graph_);
+      while (oeit.hasNext()) {
+        sta::Edge* e = oeit.next();
+        if (tracked(e)) {
+          target += lambda_[graph_->id(e)];
+        }
+      }
+    }
+
+    // Current flow summed over in-data-edges
+    float in_sum = 0.0f;
+    int in_count = 0;
+    sta::VertexInEdgeIterator sum_it(v, graph_);
+    while (sum_it.hasNext()) {
+      sta::Edge* e = sum_it.next();
+      if (tracked(e)) {
+        in_sum += lambda_[graph_->id(e)];
+        ++in_count;
+      }
+    }
+
+    if (in_count == 0) {
+      continue;
+    }
+
+    if (in_sum > 0.0f) {
+      const float scale = target / in_sum;
+      sta::VertexInEdgeIterator ieit(v, graph_);
+      while (ieit.hasNext()) {
+        sta::Edge* e = ieit.next();
+        if (tracked(e)) {
+          const sta::EdgeId id = graph_->id(e);
+          lambda_[id] = std::max(lambda_[id] * scale, params.lambda_floor);
+        }
+      }
+      ++rescaled;
+    } else if (target > 0.0f) {
+      const float share = target / static_cast<float>(in_count);
+      sta::VertexInEdgeIterator ieit(v, graph_);
+      while (ieit.hasNext()) {
+        sta::Edge* e = ieit.next();
+        if (tracked(e)) {
+          lambda_[graph_->id(e)] = std::max(share, params.lambda_floor);
+        }
+      }
+      ++zero_sum_fallback;
+    }
+  }
+
+  debugPrint(logger_,
+             RSZ,
+             "global_sizing",
+             2,
+             "LR project: {} vertices rescaled ({} zero-sum fallbacks)",
+             rescaled,
+             zero_sum_fallback);
+}
+
+// Design totals over leaf instances bound to a Liberty cell: instance count,
+// leakage (cells that report one) and area of core auto-placeable masters.
+GlobalSizingPolicy::DesignSnap GlobalSizingPolicy::computeDesignSnap() const
+{
+  DesignSnap totals;
+  const std::unique_ptr<sta::LeafInstanceIterator> leaves(
+      network_->leafInstanceIterator());
+  while (leaves->hasNext()) {
+    sta::LibertyCell* lib_cell = network_->libertyCell(leaves->next());
+    if (lib_cell == nullptr) {
+      continue;
+    }
+    ++totals.instances;
+    if (const std::optional<float> leakage = resizer_.cellLeakage(lib_cell)) {
+      totals.total_leakage += *leakage;
+      ++totals.with_leakage;
+    }
+    odb::dbMaster* master = db_network_->staToDb(db_network_->cell(lib_cell));
+    if (master != nullptr && master->isCoreAutoPlaceable()) {
+      totals.total_area += resizer_.dbuToMeters(master->getWidth())
+                           * resizer_.dbuToMeters(master->getHeight());
+    }
+  }
+  return totals;
+}
+
+std::vector<LRSubproblem::GateSnapshot> GlobalSizingPolicy::buildSnapshots()
+{
+  // Phase A (main thread, delays valid): offer every leaf instance to
+  // LRSubproblem::snapshot() and keep the ones it accepts, in iteration
+  // order. snapshot() also reads loadCap/slew and warms the lazy swappable-
+  // cell / leakage / net-driver caches so the parallel phase touches none.
+  std::vector<LRSubproblem::GateSnapshot> frozen;
+  const int num_lambdas = static_cast<int>(lambda_.size());
+  const std::unique_ptr<sta::LeafInstanceIterator> leaves(
+      network_->leafInstanceIterator());
+  while (leaves->hasNext()) {
+    sta::Instance* gate = leaves->next();
+    LRSubproblem::GateSnapshot state;
+    if (subproblem_->snapshot(gate, lambda_.data(), num_lambdas, state)) {
+      frozen.push_back(std::move(state));
+    }
+  }
+  return frozen;
+}
+
+GlobalSizingPolicy::SweepStats GlobalSizingPolicy::applyDecisions(
+    const std::vector<LRSubproblem::GateDecision>& decisions,
+    const int visited)
+{
+  // Phase C (main thread, serial): apply accepted replacements in the snapshot
+  // vector order so the result is independent of worker scheduling. Pure apply
+  // loop - no slack/slew/arrival query may run here, or the single batched
+  // timing update in iterate() would fragment into many.
+  // `visited` only feeds the log line; entries with no best_cell are skipped.
+  // Hysteresis on cost improvement before we commit a move:
+  // - Upsize moves: 2% - filter LR-cost noise that would otherwise churn
+  //   the design without a meaningful timing win.
+  // - Downsize moves: 0% - on a non-critical gate λ is at the floor and
+  //   the cost is dominated by leakage; any drop is a real leakage gain.
+  const float upsize_accept_tol = 0.02f;
+  const float downsize_accept_tol = 0.0f;
+
+  int moves = 0;
+  int evaluated = 0;
+  int downsizes = 0;
+  int upsizes = 0;
+
+  for (const LRSubproblem::GateDecision& r : decisions) {
+    if (r.best_cell == nullptr) {
+      continue;
+    }
+    ++evaluated;
+    const float tol
+        = r.best_is_downsize ? downsize_accept_tol : upsize_accept_tol;
+    if (r.best_cost < r.baseline_cost * (1.0f - tol)) {
+      sta::LibertyCell* prev = network_->libertyCell(r.inst);
+      if (subproblem_->applyReplacement(r.inst, r.best_cell)) {
+        ++moves;
+        const float rel_gain
+            = r.baseline_cost > 0.0f
+                  ? (r.baseline_cost - r.best_cost) / r.baseline_cost
+                  : 0.0f;
+        if (r.best_is_downsize) {
+          ++downsizes;
+        } else {
+          ++upsizes;
+        }
+        debugPrint(logger_,
+                   RSZ,
+                   "global_sizing",
+                   5,
+                   "{} {}: {} -> {} (cost {:.3g} -> {:.3g}, gain {:.2f}%)",
+                   r.best_is_downsize ? "DOWN" : "UP  ",
+                   network_->pathName(r.inst),
+                   prev != nullptr ? prev->name() : "?",
+                   r.best_cell->name(),
+                   r.baseline_cost,
+                   r.best_cost,
+                   100.0f * rel_gain);
+      }
+    }
+  }
+
+  debugPrint(logger_,
+             RSZ,
+             "global_sizing",
+             2,
+             "LR sweep: {} instances visited, "
+             "{} with an improving candidate, "
+             "{} replacements applied ({} upsize, {} downsize)",
+             visited,
+             evaluated,
+             moves,
+             upsizes,
+             downsizes);
+
+  return {.moves = moves, .upsizes = upsizes, .downsizes = downsizes};
+}
+
+GlobalSizingPolicy::SweepStats GlobalSizingPolicy::singleSweep(
+    const float timing_weight)
+{
+  // Phase A: Freeze per-gate state.
+  std::vector<LRSubproblem::GateSnapshot> snapshots = buildSnapshots();
+
+  // Phase B: Score every snapshot independently. Each worker uses its own
+  // ArcDelayCalc copy (arc_delay_calc_ is single-threaded shared state); the
+  // copy is thread_local and re-made when the source pointer differs. With
+  // a zero-worker pool this runs inline on the calling thread.
+  sta::ArcDelayCalc* const src = sta_->arcDelayCalc();
+  const std::vector<LRSubproblem::GateDecision> decisions
+      = thread_pool_->parallelMap(
+          snapshots,
+          [this, timing_weight, src](const LRSubproblem::GateSnapshot& snap) {
+            thread_local sta::ArcDelayCalc* cached_src = nullptr;
+            thread_local std::unique_ptr<sta::ArcDelayCalc> adc;
+            if (adc == nullptr || cached_src != src) {
+              adc.reset(src->copy());
+              cached_src = src;
+            }
+            return subproblem_->evaluateSnapshot(
+                snap, timing_weight, adc.get());
+          });
+
+  // Phase C: Apply accepted moves serially.
+  return applyDecisions(decisions, static_cast<int>(snapshots.size()));
+}
+
+float GlobalSizingPolicy::computeAutoTimingWeight(const LRParams& params) const
+{
+  std::vector<float> leakages;
+  std::vector<float> timings;
+  const sta::Scene* scene = sta_->cmdScene();
+  const int lambda_size = static_cast<int>(lambda_.size());
+
+  std::unique_ptr<sta::LeafInstanceIterator> iit(
+      network_->leafInstanceIterator());
+  while (iit->hasNext()) {
+    sta::Instance* inst = iit->next();
+    if (resizer_.dontTouch(inst)) {
+      continue;
+    }
+    sta::LibertyCell* cell = network_->libertyCell(inst);
+    if (cell == nullptr) {
+      continue;
+    }
+
+    leakages.push_back(subproblem_->leakageOrArea(cell));
+
+    // Per-gate timing pressure used to anchor the leakage<->timing scale.
+    // Output pins whose Σλ is within 4x lambda_floor add no pressure.
+    // The median is taken over ONLY the output-cone term Σλ·d_out, NOT the
+    // full cost function. The upstream-Cin term is deliberately excluded
+    // here even though it is part of evaluateCell's cost.
+    //
+    // Reason: computeAutoTimingWeight calibrates a gain, so it must be
+    // anchored to the *actionable* timing pressure - the part of the
+    // timing cost that varies as a gate is resized and therefore trades
+    // against leakage in the argmin. d_out swings 2-3x across candidate
+    // cells, so its level is a faithful proxy for that actionable swing.
+    // The upstream-Cin term's level is a full upstream gate delay d_U
+    // that is almost entirely a constant w.r.t. this gate's cell choice
+    // (intrinsic delay + the driver's other-fanout load); its level is
+    // a DC offset, not a signal. Folding it in inflated T_med ~2x,
+    // collapsed tw ~2x, and starved the output-cone term - see
+    // notes_lr/08_upstream_cin_tw_regression.md. tw cancels in
+    // evaluateCell's timing-vs-timing comparison, so the upstream-Cin
+    // term still works correctly at the output-cone-anchored tw.
+    float gate_t = 0.0f;
+    bool has_pressure = false;
+    std::unique_ptr<sta::InstancePinIterator> pit(network_->pinIterator(inst));
+    while (pit->hasNext()) {
+      sta::Pin* pin = pit->next();
+      const sta::PortDirection* dir = network_->direction(pin);
+      if (!dir->isOutput()) {
+        continue;
+      }
+      sta::Vertex* v = graph_->pinDrvrVertex(pin);
+      if (v == nullptr) {
+        continue;
+      }
+      float lam_sum = 0.0f;
+      sta::VertexInEdgeIterator ieit(v, graph_);
+      while (ieit.hasNext()) {
+        sta::Edge* e = ieit.next();
+        if (!isDataArc(e)) {
+          continue;
+        }
+        const sta::Pin* from_pin = e->from(graph_)->pin();
+        if (network_->instance(from_pin) != inst) {
+          continue;
+        }
+        const sta::EdgeId id = graph_->id(e);
+        if (static_cast<int>(id) >= lambda_size) {
+          continue;
+        }
+        lam_sum += lambda_[id];
+      }
+      if (lam_sum <= 4.0f * params.lambda_floor) {
+        continue;
+      }
+      sta::LibertyPort* port = network_->libertyPort(pin);
+      if (port == nullptr) {
+        continue;
+      }
+      const float load
+          = sta_->graphDelayCalc()->loadCap(pin, scene, policy_max_);
+      const float d = sta::delayAsFloat(
+          resizer_.gateDelay(port, load, scene, policy_max_));
+      gate_t += lam_sum * d;
+      has_pressure = true;
+    }
+    if (has_pressure) {
+      timings.push_back(gate_t);
+    }
+  }
+
+  float l_med = 0.0f;
+  float t_med = 0.0f;
+  bool degenerate = leakages.empty() || timings.empty();
+  if (!degenerate) {
+    const auto l_mid = leakages.size() / 2;
+    std::nth_element(
+        leakages.begin(), leakages.begin() + l_mid, leakages.end());
+    l_med = leakages[l_mid];
+
+    const auto t_mid = timings.size() / 2;
+    std::nth_element(timings.begin(), timings.begin() + t_mid, timings.end());
+    t_med = timings[t_mid];
+
+    if (l_med <= 0.0f || t_med <= 0.0f) {
+      degenerate = true;
+    }
+  }
+
+  if (degenerate) {
+    debugPrint(logger_,
+               RSZ,
+               "global_sizing",
+               1,
+               "LR auto timing_weight: degenerate "
+               "(leakages={}, timings={}, "
+               "L_med={:.3g}, T_med={:.3g}); using 1.0",
+               leakages.size(),
+               timings.size(),
+               l_med,
+               t_med);
+    return 1.0f;
+  }
+
+  const float tw = params.timing_bias * l_med / t_med;
+  debugPrint(logger_,
+             RSZ,
+             "global_sizing",
+             1,
+             "LR auto timing_weight: bias={:.3g} "
+             "L_med={:.3g} T_med={:.3g} -> tw={:.3g}",
+             params.timing_bias,
+             l_med,
+             t_med,
+             tw);
+  return tw;
+}
+
+bool GlobalSizingPolicy::start()
+{
+  const bool base_ready = OptimizationPolicy::start();
+  if (base_ready) {
+    db_network_ = resizer_.dbNetwork();
+    subproblem_ = std::make_unique<LRSubproblem>(&resizer_);
+    // Worker pool for the parallel scoring phase: threadCount()-1 workers,
+    // and a zero-worker pool runs inline. Workers read only the frozen
+    // snapshots, read-only Liberty/SDC and their own ArcDelayCalc copy, so
+    // results do not depend on the worker count and moves are still applied
+    // in snapshot vector order.
+    thread_pool_ = makeWorkerThreadPool();
+  }
+  return base_ready;
+}
+
+void GlobalSizingPolicy::iterate()
+{
+  if (converged_) {
+    return;
+  }
+
+  allocate();
+  seedMultipliers(lr_params_);
+  projectFlowBalance(lr_params_);
+
+  subproblem_->init();
+
+  const float timing_weight = computeAutoTimingWeight(lr_params_);
+
+  const DesignSnap pre = computeDesignSnap();
+  const float wns_pre = sta::delayAsFloat(sta_->worstSlack(policy_max_));
+  const float tns_pre
+      = sta::delayAsFloat(sta_->totalNegativeSlack(policy_max_));
+  debugPrint(logger_,
+             RSZ,
+             "global_sizing",
+             1,
+             "Pre-global sizing design: instances={} (with leakage={}) "
+             "leakage={:.3g}W area={:.3g}m^2 WNS={} TNS={}",
+             pre.instances,
+             pre.with_leakage,
+             pre.total_leakage,
+             pre.total_area,
+             sta::delayAsString(wns_pre, 3, sta_),
+             sta::delayAsString(tns_pre, 1, sta_));
+
+  const int max_iter = (lr_params_.max_iterations > 0)
+                           ? lr_params_.max_iterations
+                           : LRParams{}.max_iterations;
+  const float wns_eps = 1e-12f;
+  LRParams iter_params = lr_params_;
+
+  float best_wns = wns_pre;
+  int total_committed = 0;
+  int total_attempted = 0;
+  int total_upsizes = 0;
+  int total_downsizes = 0;
+  int accepted_iters = 0;
+  int rejected_iters = 0;
+  int consec_zero = 0;
+  int consec_reject = 0;
+  resizer_.journalBegin();
+  for (int iter = 0; iter < max_iter; ++iter) {
+    if (iter > 0) {
+      updateMultipliers(iter_params);
+      projectFlowBalance(iter_params);
+    }
+
+    const float wns0 = sta::delayAsFloat(sta_->worstSlack(policy_max_));
+
+    const SweepStats sweep = singleSweep(timing_weight);
+    const int iter_moves = sweep.moves;
+    estimate_parasitics_->updateParasitics();
+    sta_->findRequireds();
+    const float wns1 = sta::delayAsFloat(sta_->worstSlack(policy_max_));
+
+    const float wns_delta = wns1 - wns0;
+    const bool no_benefit = (iter_moves == 0);
+    // Small regressions are deliberately allowed
+    const bool reject = sta::fuzzyLess(wns_delta, -wns_eps);
+
+    total_attempted += sweep.moves;
+    total_upsizes += sweep.upsizes;
+    total_downsizes += sweep.downsizes;
+
+    if (reject) {
+      ++consec_reject;
+      ++rejected_iters;
+      iter_params.beta *= 0.5f;
+    } else {
+      total_committed += iter_moves;
+      ++accepted_iters;
+      consec_reject = 0;
+    }
+
+    // Best-so-far: Keep track of the best WNS so far but don't restore a sweep
+    // that worsens WNS just yet to allow oscillation.
+    const float current_wns = sta::delayAsFloat(sta_->worstSlack(policy_max_));
+    if (!reject && sta::fuzzyGreaterEqual(current_wns, best_wns)) {
+      resizer_.journalEnd();  // checkpoint
+      resizer_.journalBegin();
+      best_wns = current_wns;
+    }
+
+    if (logger_->debugCheck(RSZ, "global_sizing", 1)) {
+      const DesignSnap iter_snap = computeDesignSnap();
+      const float tns_iter
+          = sta::delayAsFloat(sta_->totalNegativeSlack(policy_max_));
+      debugPrint(
+          logger_,
+          RSZ,
+          "global_sizing",
+          1,
+          "LR iter {}/{} {}: leakage={:.3g} (Δ={:+.3g}, {:+.2f}%) "
+          "area={:.3g} (Δ={:+.3g}, {:+.2f}%) "
+          "WNS={} TNS={}",
+          iter + 1,
+          max_iter,
+          reject ? "REJ" : "ACC",
+          iter_snap.total_leakage,
+          iter_snap.total_leakage - pre.total_leakage,
+          pre.total_leakage > 0.0
+              ? 100.0 * (iter_snap.total_leakage - pre.total_leakage)
+                    / pre.total_leakage
+              : 0.0,
+          iter_snap.total_area,
+          iter_snap.total_area - pre.total_area,
+          pre.total_area > 0.0
+              ? 100.0 * (iter_snap.total_area - pre.total_area) / pre.total_area
+              : 0.0,
+          sta::delayAsString(wns1, 3, sta_),
+          sta::delayAsString(tns_iter, 1, sta_));
+    }
+
+    if (consec_reject >= 3) {
+      debugPrint(logger_,
+                 RSZ,
+                 "global_sizing",
+                 1,
+                 "LR stop: 3 consecutive rejections");
+      break;
+    }
+    if (no_benefit && !reject) {
+      if (++consec_zero >= 2) {
+        debugPrint(logger_,
+                   RSZ,
+                   "global_sizing",
+                   1,
+                   "LR stop: 2 consecutive zero-move passes");
+        break;
+      }
+    } else {
+      consec_zero = 0;
+    }
+  }
+
+  // Journal will always be open and regardless of how the loop exited,
+  // we need to restore to the best checkpoint here
+  resizer_.journalRestore();
+
+  const DesignSnap post = computeDesignSnap();
+  const float wns_post = sta::delayAsFloat(sta_->worstSlack(policy_max_));
+  const float tns_post
+      = sta::delayAsFloat(sta_->totalNegativeSlack(policy_max_));
+  const auto rel = [](double after, double before) {
+    return before > 0.0 ? 100.0 * (after - before) / before : 0.0;
+  };
+  const int total_iters = accepted_iters + rejected_iters;
+
+  // Headline: kept moves vs. attempted moves. They diverge when sweeps are
+  // rolled back by the catastrophic-WNS guard, or when the end-of-run best-WNS
+  // restore reverts some drift past the best iter.
+  logger_->info(RSZ,
+                400,
+                "GLOBAL_SIZING: {} cells replaced (loop); "
+                "{}/{} sweeps accepted, {} rolled back; "
+                "{} replacements attempted in total "
+                "({} upsize, {} downsize).",
+                total_committed,
+                accepted_iters,
+                total_iters,
+                rejected_iters,
+                total_attempted,
+                total_upsizes,
+                total_downsizes);
+
+  // QoR before -> after. This is the line that answers "what did it improve
+  // and what did it regress" -- read the arrows, not just the deltas.
+  logger_->info(RSZ,
+                409,
+                "GLOBAL_SIZING QoR: "
+                "WNS {} -> {} ({}); "
+                "TNS {} -> {} ({}); "
+                "leakage {:.3g} -> {:.3g}W ({:+.2f}%); "
+                "area {:.3g} -> {:.3g}m^2 ({:+.2f}%).",
+                sta::delayAsString(wns_pre, 3, sta_),
+                sta::delayAsString(wns_post, 3, sta_),
+                sta::delayAsString(wns_post - wns_pre, 3, sta_),
+                sta::delayAsString(tns_pre, 1, sta_),
+                sta::delayAsString(tns_post, 1, sta_),
+                sta::delayAsString(tns_post - tns_pre, 1, sta_),
+                pre.total_leakage,
+                post.total_leakage,
+                rel(post.total_leakage, pre.total_leakage),
+                pre.total_area,
+                post.total_area,
+                rel(post.total_area, pre.total_area));
+
+  // Explain the all-zero summary case explicitly: the design did get
+  // churned, but every sweep blew the WNS guard so every pass was
+  // rolled back and the netlist is back to where it started.
+  if (total_committed == 0 && total_attempted > 0) {
+    logger_->info(RSZ,
+                  412,
+                  "GLOBAL_SIZING: nothing kept -- all {} sweeps tripped the "
+                  "WNS guard and were rolled back; the netlist is unchanged "
+                  "from the start of this phase. "
+                  "The {} attempted replacements were tentative only.",
+                  rejected_iters,
+                  total_attempted);
+  }
+
+  markRunComplete(true);
+}
+
+}  // namespace rsz
diff --git a/src/rsz/src/policy/GlobalSizingPolicy.hh b/src/rsz/src/policy/GlobalSizingPolicy.hh
new file mode 100644
index 00000000000..1b0f1bf1de5
--- /dev/null
+++ b/src/rsz/src/policy/GlobalSizingPolicy.hh
@@ -0,0 +1,152 @@
+// SPDX-License-Identifier: BSD-3-Clause
+// Copyright (c) 2026-2026, The OpenROAD Authors
+
+#pragma once
+
+#include <memory>
+#include <optional>
+#include <unordered_map>
+#include <vector>
+
+#include "LRSubproblem.hh"
+#include "MoveCommitter.hh"
+#include "OptimizationPolicy.hh"
+#include "OptimizerTypes.hh"
+#include "RepairSetupContext.hh"
+#include "rsz/Resizer.hh"
+#include "sta/GraphClass.hh"
+#include "sta/MinMax.hh"
+
+namespace sta {
+class Edge;
+class Vertex;
+class dbNetwork;
+}  // namespace sta
+
+namespace rsz {
+
+// Tunables for the Lagrangian-Relaxation global sizing driver. Internal to the
+// policy - not user-facing through Tcl. They live in a struct so each piece has
+// a stable name and so we can plug in env-var overrides later without rewriting
+// the policy.
+struct LRParams
+{
+  float setup_slack_margin = 0.0f;  // NOTE(review): likely `margin` below
+  int max_iterations = 20;  // NOTE(review): presumably the outer-loop cap
+  // Step size β (`beta`) for the dual-subgradient update on λ.
+  //   λ_e ← max(floor, λ_e · (1 + β · g_e_norm))
+  // with g_e_norm ∈ [-1, 0]. Tight arcs (g=0) are unchanged; arcs at full
+  // slack (g=-1) shrink to (1-β)·λ. Halved on pass rejection.
+  float beta = 0.6f;
+  // Endpoint seed exponent p: mu_k ~ max(0, margin - slack_k)^p.
+  float mu_exponent = 2.0f;
+  // Floor for multipliers (subgradient floor so unused arcs can re-enter).
+  float lambda_floor = 1e-12f;
+  // Dimensionless balance between timing pressure and leakage cost.
+  // bias = 1.0 keeps Σλ·d (scaled) ≈ leakage cost on the median gate.
+  float timing_bias = 64.0f;
+};
+
+// GlobalSizingPolicy: Lagrangian-Relaxation-driven global sizing + Vt
+// assignment, packaged as an OptimizationPolicy phase.
+//
+// Outer loop (in iterate()): allocate λ/μ → seed → project → repeat
+// {update → project → Jacobi sweep over leaf instances → pass-level
+// accept/reject by WNS regression}.
+// Each gate's replacement decision uses LRSubproblem's per-gate cost. Skips the
+// OptimizationPolicy generator/candidate pipeline and the target_collector - LR
+// is not target-driven.
+class GlobalSizingPolicy : public OptimizationPolicy
+{
+ public:
+  GlobalSizingPolicy(Resizer& resizer,
+                     MoveCommitter& committer,
+                     RepairSetupContext& setup_context,
+                     const OptimizerRunConfig& config);
+  ~GlobalSizingPolicy() override;
+
+  const char* name() const override { return "GlobalSizingPolicy"; }
+  bool start() override;
+  void iterate() override;
+
+ private:
+  // === Setup ================================================================
+  // Discover graph size (edges, endpoints), set dcalc_ap_, size vectors.
+  void allocate();
+  // Delay-proportional λ seed + WNS-biased μ seed.
+  void seedMultipliers(const LRParams& params);
+  // Multiplicative λ update via dual-subgradient + re-seed of μ from the
+  // current slack picture. Called at the start of each outer iteration
+  // after iteration 0.
+  void updateMultipliers(const LRParams& params);
+  // Reverse-topological projection onto the KKT flow-balance polytope.
+  // After projection:
+  //   Σλ_in(v) = Σλ_out(v) for internal v
+  //   Σλ_in(k) = μ_k for each endpoint k
+  void projectFlowBalance(const LRParams& params);
+  // Tally of one Jacobi sweep. `moves` is the total cell replacements applied
+  // to the journal this sweep (tentative - the pass-acceptance test in
+  // iterate() may still roll the whole sweep back).
+  struct SweepStats
+  {
+    int moves = 0;
+    int upsizes = 0;
+    int downsizes = 0;
+  };
+
+  // One Jacobi sweep over all leaf instances, in three phases:
+  //   A buildSnapshots()  - main thread: freeze each gate's timing/DRC state
+  //   B parallel evaluate - workers: score every snapshot independently
+  //   C applyDecisions()  - main thread: apply the winning replacements
+  // The per-sweep timing update is done by the caller (iterate()), once,
+  // after this returns.
+  SweepStats singleSweep(float timing_weight);
+
+  // Phase A: Capture the frozen per-gate snapshots for every evaluable leaf
+  // instance, in a stable order. Reads live STA and warms the lazy
+  // Liberty/dbNetwork caches on the main thread.
+  std::vector<LRSubproblem::GateSnapshot> buildSnapshots();
+
+  // Phase C: Apply the accepted replacements in vector order. No timing query
+  // may run here - the single batched update happens in iterate() afterwards.
+  SweepStats applyDecisions(
+      const std::vector<LRSubproblem::GateDecision>& decisions,
+      int visited);  // NOTE(review): presumably #gates evaluated - confirm
+
+  // Auto-scale timing weight so the output-cone timing term is comparable to
+  // the leakage term on the median gate of this design. Anchored to the
+  // output-cone term only (not the upstream-Cin term).
+  float computeAutoTimingWeight(const LRParams& params) const;
+
+  // === Diagnostics ==========================================================
+  struct DesignSnap
+  {
+    double total_leakage = 0.0;  // reported in W by iterate()'s QoR line
+    double total_area = 0.0;     // reported in m^2 by iterate()'s QoR line
+    int instances = 0;
+    int with_leakage = 0;  // NOTE(review): presumably #instances with leakage
+  };
+  DesignSnap computeDesignSnap() const;
+
+  // === Graph helpers ========================================================
+  bool isDataArc(const sta::Edge* edge) const;
+  float edgeMaxArcDelay(sta::Edge* edge) const;
+
+  // === Policy state =========================================================
+  LRParams lr_params_;
+  sta::dbNetwork* db_network_ = nullptr;
+
+  // Per-edge multipliers, indexed by sta::Edge::id (sparse)
+  std::vector<float> lambda_;
+  // Per-endpoint multipliers, indexed by a dense endpoint index
+  std::vector<float> mu_;
+  // Dense endpoint bookkeeping
+  std::vector<sta::Vertex*> endpoint_vertices_;
+  std::unordered_map<const sta::Vertex*, int> endpoint_index_;
+
+  sta::DcalcAPIndex dcalc_ap_ = 0;            // set by allocate()
+  std::unique_ptr<LRSubproblem> subproblem_;  // Per-gate cost evaluator
+  const sta::MinMax* policy_max_ = sta::MinMax::max();  // for WNS/TNS queries
+};
+
+}  // namespace rsz
diff --git a/src/rsz/test/BUILD b/src/rsz/test/BUILD
index 242953d233d..482ec2a91d7 100644
--- a/src/rsz/test/BUILD
+++ b/src/rsz/test/BUILD
@@ -254,6 +254,8 @@ PASSFAIL_TESTS = [
     # "cpp_tests",
     "repair_setup_legacy_mt",
     "repair_setup_mt1",
+    "global_sizing",
+    "global_sizing_threads",
 ]
 
 ALL_TESTS = TESTS + PASSFAIL_TESTS
@@ -278,6 +280,7 @@ filegroup(
 
 # Tests that reference other tests
 extra_deps = {
+    "global_sizing_threads": ["global_sizing.tcl"],
     "repair_fanout6_multi": ["repair_fanout6.tcl"],
     "repair_fanout7_multi": ["repair_fanout7.tcl"],
     "repair_fanout7_skip_pin_swap": ["repair_fanout7.tcl"],
diff --git a/src/rsz/test/CMakeLists.txt b/src/rsz/test/CMakeLists.txt
index 68ad69c731a..4b29def3a16 100644
--- a/src/rsz/test/CMakeLists.txt
+++ b/src/rsz/test/CMakeLists.txt
@@ -236,6 +236,8 @@ or_integration_tests(
   PASSFAIL_TESTS
     repair_setup_legacy_mt
     repair_setup_mt1
+    global_sizing
+    global_sizing_threads
     cpp_tests
 )
 
diff --git a/src/rsz/test/global_sizing.tcl b/src/rsz/test/global_sizing.tcl
new file mode 100644
index 00000000000..3c86b397cb6
--- /dev/null
+++ b/src/rsz/test/global_sizing.tcl
@@ -0,0 +1,35 @@
+# Coverage for rsz GlobalSizingPolicy.
+# Runs the GLOBAL_SIZING phase and checks the resized netlist against a golden.
+#
+# Run on its own, this file is single-threaded (serial Phase-B, inline). The
+# companion global_sizing_threads.tcl runs the identical flow multi-threaded and
+# diffs against the SAME golden (global_sizing.vok), so the pair asserts the
+# parallel Jacobi sweep is deterministic and matches the serial result.
+source "helpers.tcl"
+
+# Thread count and result-file stem are overridable by the _threads variant.
+if { ![info exists global_sizing_threads] } {
+  set global_sizing_threads 1
+}
+if { ![info exists global_sizing_result] } {
+  set global_sizing_result "global_sizing"
+}
+
+read_liberty Nangate45/Nangate45_typ.lib
+read_lef Nangate45/Nangate45.lef
+read_def gcd_nangate45_placed.def
+read_sdc gcd_nangate45.sdc
+
+source Nangate45/Nangate45.rc
+set_wire_rc -layer metal3
+estimate_parasitics -placement
+
+set_thread_count $global_sizing_threads
+repair_timing -setup -phases GLOBAL_SIZING
+
+set verilog_file [make_result_file "${global_sizing_result}.v"]
+write_verilog $verilog_file
+check "global sizing netlist matches golden" \
+  {diff_files global_sizing.vok $verilog_file} 0
+
+exit_summary
diff --git a/src/rsz/test/global_sizing.vok b/src/rsz/test/global_sizing.vok
new file mode 100644
index 00000000000..d2531899023
--- /dev/null
+++ b/src/rsz/test/global_sizing.vok
@@ -0,0 +1,2019 @@
+module gcd (clk,
+    req_rdy,
+    req_val,
+    reset,
+    resp_rdy,
+    resp_val,
+    req_msg,
+    resp_msg);
+ input clk;
+ output req_rdy;
+ input req_val;
+ input reset;
+ input resp_rdy;
+ output resp_val;
+ input [31:0] req_msg;
+ output [15:0] resp_msg;
+
+ wire _000_;
+ wire _001_;
+ wire _002_;
+ wire _003_;
+ wire _004_;
+ wire _005_;
+ wire _006_;
+ wire _007_;
+ wire _008_;
+ wire _009_;
+ wire _010_;
+ wire _011_;
+ wire _012_;
+ wire _013_;
+ wire _014_;
+ wire _015_;
+ wire _016_;
+ wire _017_;
+ wire _018_;
+ wire _019_;
+ wire _020_;
+ wire _021_;
+ wire _022_;
+ wire _023_;
+ wire _024_;
+ wire _025_;
+ wire _026_;
+ wire _027_;
+ wire _028_;
+ wire _029_;
+ wire _030_;
+ wire _031_;
+ wire _032_;
+ wire _033_;
+ wire _034_;
+ wire _035_;
+ wire _036_;
+ wire _037_;
+ wire _038_;
+ wire _039_;
+ wire _040_;
+ wire _041_;
+ wire _042_;
+ wire _043_;
+ wire _044_;
+ wire _045_;
+ wire _046_;
+ wire _047_;
+ wire _048_;
+ wire _049_;
+ wire _050_;
+ wire _051_;
+ wire _052_;
+ wire _053_;
+ wire _054_;
+ wire _055_;
+ wire _056_;
+ wire _057_;
+ wire _058_;
+ wire _059_;
+ wire _060_;
+ wire _061_;
+ wire _062_;
+ wire _063_;
+ wire _064_;
+ wire _065_;
+ wire _066_;
+ wire _067_;
+ wire _068_;
+ wire _069_;
+ wire _070_;
+ wire _071_;
+ wire _072_;
+ wire _073_;
+ wire _074_;
+ wire _075_;
+ wire _076_;
+ wire _077_;
+ wire _078_;
+ wire _079_;
+ wire _080_;
+ wire _081_;
+ wire _082_;
+ wire _083_;
+ wire _084_;
+ wire _085_;
+ wire _086_;
+ wire _087_;
+ wire _088_;
+ wire _089_;
+ wire _090_;
+ wire _091_;
+ wire _092_;
+ wire _093_;
+ wire _094_;
+ wire _095_;
+ wire _096_;
+ wire _097_;
+ wire _098_;
+ wire _099_;
+ wire _100_;
+ wire _101_;
+ wire _102_;
+ wire _103_;
+ wire _104_;
+ wire _105_;
+ wire _106_;
+ wire _107_;
+ wire _108_;
+ wire _109_;
+ wire _110_;
+ wire _111_;
+ wire _112_;
+ wire _113_;
+ wire _114_;
+ wire _115_;
+ wire _116_;
+ wire _117_;
+ wire _118_;
+ wire _119_;
+ wire _120_;
+ wire _121_;
+ wire _122_;
+ wire _123_;
+ wire _124_;
+ wire _125_;
+ wire _126_;
+ wire _127_;
+ wire _128_;
+ wire _129_;
+ wire _130_;
+ wire _131_;
+ wire _132_;
+ wire _133_;
+ wire _134_;
+ wire _135_;
+ wire _136_;
+ wire _137_;
+ wire _138_;
+ wire _139_;
+ wire _140_;
+ wire _141_;
+ wire _142_;
+ wire _143_;
+ wire _144_;
+ wire _145_;
+ wire _146_;
+ wire _147_;
+ wire _148_;
+ wire _149_;
+ wire _150_;
+ wire _151_;
+ wire _152_;
+ wire _153_;
+ wire _154_;
+ wire _155_;
+ wire _156_;
+ wire _157_;
+ wire _158_;
+ wire _159_;
+ wire _160_;
+ wire _161_;
+ wire _162_;
+ wire _163_;
+ wire _164_;
+ wire _165_;
+ wire _166_;
+ wire _167_;
+ wire _168_;
+ wire _169_;
+ wire _170_;
+ wire _171_;
+ wire _172_;
+ wire _173_;
+ wire _174_;
+ wire _175_;
+ wire _176_;
+ wire _177_;
+ wire _178_;
+ wire _179_;
+ wire _180_;
+ wire _181_;
+ wire _182_;
+ wire _183_;
+ wire _184_;
+ wire _185_;
+ wire _186_;
+ wire _187_;
+ wire _188_;
+ wire _189_;
+ wire _190_;
+ wire _191_;
+ wire _192_;
+ wire _193_;
+ wire _194_;
+ wire _195_;
+ wire _196_;
+ wire _197_;
+ wire _198_;
+ wire _199_;
+ wire _200_;
+ wire _201_;
+ wire _202_;
+ wire _203_;
+ wire _204_;
+ wire _205_;
+ wire _206_;
+ wire _207_;
+ wire _208_;
+ wire _209_;
+ wire _210_;
+ wire _211_;
+ wire _212_;
+ wire _213_;
+ wire _214_;
+ wire _215_;
+ wire _216_;
+ wire _217_;
+ wire _218_;
+ wire _219_;
+ wire _220_;
+ wire _221_;
+ wire _222_;
+ wire _223_;
+ wire _224_;
+ wire _225_;
+ wire _226_;
+ wire _227_;
+ wire _228_;
+ wire _229_;
+ wire _230_;
+ wire _231_;
+ wire _232_;
+ wire _233_;
+ wire _234_;
+ wire _235_;
+ wire _236_;
+ wire _237_;
+ wire _238_;
+ wire _239_;
+ wire _240_;
+ wire _241_;
+ wire _242_;
+ wire _243_;
+ wire _244_;
+ wire _245_;
+ wire _246_;
+ wire _247_;
+ wire _248_;
+ wire _249_;
+ wire _250_;
+ wire _251_;
+ wire _252_;
+ wire _253_;
+ wire _254_;
+ wire _255_;
+ wire _256_;
+ wire _257_;
+ wire _258_;
+ wire _259_;
+ wire _260_;
+ wire _261_;
+ wire _262_;
+ wire _263_;
+ wire _264_;
+ wire _265_;
+ wire _266_;
+ wire _267_;
+ wire _268_;
+ wire _269_;
+ wire _270_;
+ wire _271_;
+ wire _272_;
+ wire _273_;
+ wire _274_;
+ wire _275_;
+ wire _276_;
+ wire _277_;
+ wire _278_;
+ wire _279_;
+ wire _280_;
+ wire _281_;
+ wire _282_;
+ wire _283_;
+ wire _284_;
+ wire _285_;
+ wire _286_;
+ wire _287_;
+ wire _288_;
+ wire _289_;
+ wire _290_;
+ wire _291_;
+ wire _292_;
+ wire _293_;
+ wire _294_;
+ wire _295_;
+ wire _296_;
+ wire _297_;
+ wire _298_;
+ wire _299_;
+ wire _300_;
+ wire _301_;
+ wire _302_;
+ wire _303_;
+ wire _304_;
+ wire _305_;
+ wire _306_;
+ wire _307_;
+ wire _308_;
+ wire _309_;
+ wire _310_;
+ wire _311_;
+ wire _312_;
+ wire _313_;
+ wire _314_;
+ wire _315_;
+ wire _316_;
+ wire _317_;
+ wire _318_;
+ wire _319_;
+ wire _320_;
+ wire _321_;
+ wire _322_;
+ wire _323_;
+ wire _324_;
+ wire _325_;
+ wire _326_;
+ wire _327_;
+ wire _328_;
+ wire _329_;
+ wire _330_;
+ wire _331_;
+ wire _332_;
+ wire _333_;
+ wire _334_;
+ wire _335_;
+ wire _336_;
+ wire _337_;
+ wire _338_;
+ wire _339_;
+ wire _340_;
+ wire _341_;
+ wire _342_;
+ wire _343_;
+ wire _344_;
+ wire _345_;
+ wire _346_;
+ wire _347_;
+ wire _348_;
+ wire _349_;
+ wire _350_;
+ wire _351_;
+ wire _352_;
+ wire _353_;
+ wire _354_;
+ wire _355_;
+ wire _356_;
+ wire _357_;
+ wire _358_;
+ wire _359_;
+ wire _360_;
+ wire _361_;
+ wire _362_;
+ wire _363_;
+ wire _364_;
+ wire _365_;
+ wire _366_;
+ wire _367_;
+ wire _368_;
+ wire _369_;
+ wire _370_;
+ wire _371_;
+ wire _372_;
+ wire _373_;
+ wire _374_;
+ wire _375_;
+ wire _376_;
+ wire _377_;
+ wire _378_;
+ wire _379_;
+ wire _380_;
+ wire _381_;
+ wire _382_;
+ wire _383_;
+ wire _384_;
+ wire _385_;
+ wire _386_;
+ wire _387_;
+ wire _388_;
+ wire _389_;
+ wire _390_;
+ wire _391_;
+ wire _392_;
+ wire _393_;
+ wire _394_;
+ wire _395_;
+ wire _396_;
+ wire _397_;
+ wire _398_;
+ wire _399_;
+ wire _400_;
+ wire _401_;
+ wire _402_;
+ wire _403_;
+ wire _404_;
+ wire _405_;
+ wire _406_;
+ wire _407_;
+ wire _408_;
+ wire _409_;
+ wire _410_;
+ wire _411_;
+ wire _412_;
+ wire _413_;
+ wire _414_;
+ wire _415_;
+ wire _416_;
+ wire _417_;
+ wire _418_;
+ wire _419_;
+ wire _420_;
+ wire _421_;
+ wire _422_;
+ wire _423_;
+ wire _424_;
+ wire _425_;
+ wire _426_;
+ wire _427_;
+ wire _428_;
+ wire _429_;
+ wire _430_;
+ wire _431_;
+ wire _432_;
+ wire _433_;
+ wire _434_;
+ wire _435_;
+ wire _436_;
+ wire _437_;
+ wire _438_;
+ wire _439_;
+ wire \ctrl.state.out[1] ;
+ wire \ctrl.state.out[2] ;
+ wire \dpath.a_lt_b$in0[0] ;
+ wire \dpath.a_lt_b$in0[10] ;
+ wire \dpath.a_lt_b$in0[11] ;
+ wire \dpath.a_lt_b$in0[12] ;
+ wire \dpath.a_lt_b$in0[13] ;
+ wire \dpath.a_lt_b$in0[14] ;
+ wire \dpath.a_lt_b$in0[15] ;
+ wire \dpath.a_lt_b$in0[1] ;
+ wire \dpath.a_lt_b$in0[2] ;
+ wire \dpath.a_lt_b$in0[3] ;
+ wire \dpath.a_lt_b$in0[4] ;
+ wire \dpath.a_lt_b$in0[5] ;
+ wire \dpath.a_lt_b$in0[6] ;
+ wire \dpath.a_lt_b$in0[7] ;
+ wire \dpath.a_lt_b$in0[8] ;
+ wire \dpath.a_lt_b$in0[9] ;
+ wire \dpath.a_lt_b$in1[0] ;
+ wire \dpath.a_lt_b$in1[10] ;
+ wire \dpath.a_lt_b$in1[11] ;
+ wire \dpath.a_lt_b$in1[12] ;
+ wire \dpath.a_lt_b$in1[13] ;
+ wire \dpath.a_lt_b$in1[14] ;
+ wire \dpath.a_lt_b$in1[15] ;
+ wire \dpath.a_lt_b$in1[1] ;
+ wire \dpath.a_lt_b$in1[2] ;
+ wire \dpath.a_lt_b$in1[3] ;
+ wire \dpath.a_lt_b$in1[4] ;
+ wire \dpath.a_lt_b$in1[5] ;
+ wire \dpath.a_lt_b$in1[6] ;
+ wire \dpath.a_lt_b$in1[7] ;
+ wire \dpath.a_lt_b$in1[8] ;
+ wire \dpath.a_lt_b$in1[9] ;
+
+ FILLCELL_X1 PHY_0 ();
+ FILLCELL_X1 PHY_1 ();
+ FILLCELL_X1 PHY_10 ();
+ FILLCELL_X1 PHY_100 ();
+ FILLCELL_X1 PHY_101 ();
+ FILLCELL_X1 PHY_102 ();
+ FILLCELL_X1 PHY_103 ();
+ FILLCELL_X1 PHY_104 ();
+ FILLCELL_X1 PHY_105 ();
+ FILLCELL_X1 PHY_106 ();
+ FILLCELL_X1 PHY_107 ();
+ FILLCELL_X1 PHY_108 ();
+ FILLCELL_X1 PHY_109 ();
+ FILLCELL_X1 PHY_11 ();
+ FILLCELL_X1 PHY_110 ();
+ FILLCELL_X1 PHY_111 ();
+ FILLCELL_X1 PHY_112 ();
+ FILLCELL_X1 PHY_113 ();
+ FILLCELL_X1 PHY_12 ();
+ FILLCELL_X1 PHY_13 ();
+ FILLCELL_X1 PHY_14 ();
+ FILLCELL_X1 PHY_15 ();
+ FILLCELL_X1 PHY_16 ();
+ FILLCELL_X1 PHY_17 ();
+ FILLCELL_X1 PHY_18 ();
+ FILLCELL_X1 PHY_19 ();
+ FILLCELL_X1 PHY_2 ();
+ FILLCELL_X1 PHY_20 ();
+ FILLCELL_X1 PHY_21 ();
+ FILLCELL_X1 PHY_22 ();
+ FILLCELL_X1 PHY_23 ();
+ FILLCELL_X1 PHY_24 ();
+ FILLCELL_X1 PHY_25 ();
+ FILLCELL_X1 PHY_26 ();
+ FILLCELL_X1 PHY_27 ();
+ FILLCELL_X1 PHY_28 ();
+ FILLCELL_X1 PHY_29 ();
+ FILLCELL_X1 PHY_3 ();
+ FILLCELL_X1 PHY_30 ();
+ FILLCELL_X1 PHY_31 ();
+ FILLCELL_X1 PHY_32 ();
+ FILLCELL_X1 PHY_33 ();
+ FILLCELL_X1 PHY_34 ();
+ FILLCELL_X1 PHY_35 ();
+ FILLCELL_X1 PHY_36 ();
+ FILLCELL_X1 PHY_37 ();
+ FILLCELL_X1 PHY_38 ();
+ FILLCELL_X1 PHY_39 ();
+ FILLCELL_X1 PHY_4 ();
+ FILLCELL_X1 PHY_40 ();
+ FILLCELL_X1 PHY_41 ();
+ FILLCELL_X1 PHY_42 ();
+ FILLCELL_X1 PHY_43 ();
+ FILLCELL_X1 PHY_44 ();
+ FILLCELL_X1 PHY_45 ();
+ FILLCELL_X1 PHY_46 ();
+ FILLCELL_X1 PHY_47 ();
+ FILLCELL_X1 PHY_48 ();
+ FILLCELL_X1 PHY_49 ();
+ FILLCELL_X1 PHY_5 ();
+ FILLCELL_X1 PHY_50 ();
+ FILLCELL_X1 PHY_51 ();
+ FILLCELL_X1 PHY_52 ();
+ FILLCELL_X1 PHY_53 ();
+ FILLCELL_X1 PHY_54 ();
+ FILLCELL_X1 PHY_55 ();
+ FILLCELL_X1 PHY_56 ();
+ FILLCELL_X1 PHY_57 ();
+ FILLCELL_X1 PHY_58 ();
+ FILLCELL_X1 PHY_59 ();
+ FILLCELL_X1 PHY_6 ();
+ FILLCELL_X1 PHY_60 ();
+ FILLCELL_X1 PHY_61 ();
+ FILLCELL_X1 PHY_62 ();
+ FILLCELL_X1 PHY_63 ();
+ FILLCELL_X1 PHY_64 ();
+ FILLCELL_X1 PHY_65 ();
+ FILLCELL_X1 PHY_66 ();
+ FILLCELL_X1 PHY_67 ();
+ FILLCELL_X1 PHY_68 ();
+ FILLCELL_X1 PHY_69 ();
+ FILLCELL_X1 PHY_7 ();
+ FILLCELL_X1 PHY_70 ();
+ FILLCELL_X1 PHY_71 ();
+ FILLCELL_X1 PHY_72 ();
+ FILLCELL_X1 PHY_73 ();
+ FILLCELL_X1 PHY_74 ();
+ FILLCELL_X1 PHY_75 ();
+ FILLCELL_X1 PHY_76 ();
+ FILLCELL_X1 PHY_77 ();
+ FILLCELL_X1 PHY_78 ();
+ FILLCELL_X1 PHY_79 ();
+ FILLCELL_X1 PHY_8 ();
+ FILLCELL_X1 PHY_80 ();
+ FILLCELL_X1 PHY_81 ();
+ FILLCELL_X1 PHY_82 ();
+ FILLCELL_X1 PHY_83 ();
+ FILLCELL_X1 PHY_84 ();
+ FILLCELL_X1 PHY_85 ();
+ FILLCELL_X1 PHY_86 ();
+ FILLCELL_X1 PHY_87 ();
+ FILLCELL_X1 PHY_88 ();
+ FILLCELL_X1 PHY_89 ();
+ FILLCELL_X1 PHY_9 ();
+ FILLCELL_X1 PHY_90 ();
+ FILLCELL_X1 PHY_91 ();
+ FILLCELL_X1 PHY_92 ();
+ FILLCELL_X1 PHY_93 ();
+ FILLCELL_X1 PHY_94 ();
+ FILLCELL_X1 PHY_95 ();
+ FILLCELL_X1 PHY_96 ();
+ FILLCELL_X1 PHY_97 ();
+ FILLCELL_X1 PHY_98 ();
+ FILLCELL_X1 PHY_99 ();
+ INV_X8 _440_ (.A(_109_),
+    .ZN(_142_));
+ AND3_X1 _441_ (.A1(_142_),
+    .A2(_108_),
+    .A3(_059_),
+    .ZN(_423_));
+ XOR2_X2 _442_ (.A(_110_),
+    .B(_126_),
+    .Z(_406_));
+ NOR4_X1 _443_ (.A1(_139_),
+    .A2(_138_),
+    .A3(_136_),
+    .A4(_137_),
+    .ZN(_143_));
+ NOR2_X1 _444_ (.A1(_135_),
+    .A2(_126_),
+    .ZN(_144_));
+ INV_X2 _445_ (.A(_133_),
+    .ZN(_145_));
+ INV_X2 _446_ (.A(_134_),
+    .ZN(_146_));
+ NAND4_X1 _447_ (.A1(_143_),
+    .A2(_144_),
+    .A3(_145_),
+    .A4(_146_),
+    .ZN(_147_));
+ NOR4_X1 _448_ (.A1(_128_),
+    .A2(_127_),
+    .A3(_140_),
+    .A4(_141_),
+    .ZN(_148_));
+ NOR4_X1 _449_ (.A1(_132_),
+    .A2(_131_),
+    .A3(_129_),
+    .A4(_130_),
+    .ZN(_149_));
+ NAND2_X1 _450_ (.A1(_148_),
+    .A2(_149_),
+    .ZN(_150_));
+ NOR2_X1 _451_ (.A1(_147_),
+    .A2(_150_),
+    .ZN(_151_));
+ INV_X2 _452_ (.A(_405_),
+    .ZN(_152_));
+ INV_X2 _453_ (.A(_058_),
+    .ZN(_153_));
+ NAND3_X1 _454_ (.A1(_151_),
+    .A2(_152_),
+    .A3(_153_),
+    .ZN(_154_));
+ AND2_X1 _455_ (.A1(_423_),
+    .A2(_422_),
+    .ZN(_155_));
+ OR3_X1 _456_ (.A1(_155_),
+    .A2(_057_),
+    .A3(_405_),
+    .ZN(_156_));
+ NAND2_X1 _457_ (.A1(_154_),
+    .A2(_156_),
+    .ZN(_055_));
+ OAI211_X1 _458_ (.A(_152_),
+    .B(_153_),
+    .C1(_147_),
+    .C2(_150_),
+    .ZN(_157_));
+ INV_X4 _459_ (.A(_059_),
+    .ZN(_158_));
+ BUF_X4 _460_ (.A(_158_),
+    .Z(_159_));
+ BUF_X4 _461_ (.A(_403_),
+    .Z(_160_));
+ NAND4_X1 _462_ (.A1(_152_),
+    .A2(_159_),
+    .A3(_160_),
+    .A4(_404_),
+    .ZN(_161_));
+ NAND2_X1 _463_ (.A1(_157_),
+    .A2(_161_),
+    .ZN(_056_));
+ NAND3_X1 _464_ (.A1(_423_),
+    .A2(_152_),
+    .A3(_422_),
+    .ZN(_162_));
+ NOR2_X1 _465_ (.A1(_162_),
+    .A2(_057_),
+    .ZN(_163_));
+ AOI211_X4 _466_ (.A(_405_),
+    .B(_059_),
+    .C1(_403_),
+    .C2(_404_),
+    .ZN(_164_));
+ OR3_X1 _467_ (.A1(_163_),
+    .A2(_164_),
+    .A3(_405_),
+    .ZN(_054_));
+ XNOR2_X2 _468_ (.A(_112_),
+    .B(_128_),
+    .ZN(_165_));
+ XNOR2_X1 _469_ (.A(_111_),
+    .B(_127_),
+    .ZN(_166_));
+ AND2_X1 _470_ (.A1(_165_),
+    .A2(_166_),
+    .ZN(_167_));
+ XNOR2_X2 _471_ (.A(_125_),
+    .B(_141_),
+    .ZN(_168_));
+ XNOR2_X2 _472_ (.A(_124_),
+    .B(_140_),
+    .ZN(_169_));
+ AND3_X2 _473_ (.A1(_167_),
+    .A2(_168_),
+    .A3(_169_),
+    .ZN(_170_));
+ XNOR2_X2 _474_ (.A(_120_),
+    .B(_136_),
+    .ZN(_171_));
+ XNOR2_X2 _475_ (.A(_121_),
+    .B(_137_),
+    .ZN(_172_));
+ AND2_X2 _476_ (.A1(_171_),
+    .A2(_172_),
+    .ZN(_173_));
+ XNOR2_X2 _477_ (.A(_122_),
+    .B(_138_),
+    .ZN(_174_));
+ XNOR2_X2 _478_ (.A(_123_),
+    .B(_139_),
+    .ZN(_175_));
+ AND2_X2 _479_ (.A1(_174_),
+    .A2(_175_),
+    .ZN(_176_));
+ AND2_X2 _480_ (.A1(_173_),
+    .A2(_176_),
+    .ZN(_177_));
+ XNOR2_X1 _481_ (.A(_115_),
+    .B(_131_),
+    .ZN(_178_));
+ XNOR2_X2 _482_ (.A(_116_),
+    .B(_132_),
+    .ZN(_179_));
+ NAND2_X1 _483_ (.A1(_178_),
+    .A2(_179_),
+    .ZN(_180_));
+ XNOR2_X2 _484_ (.A(_114_),
+    .B(_130_),
+    .ZN(_181_));
+ INV_X1 _485_ (.A(_181_),
+    .ZN(_182_));
+ XNOR2_X1 _486_ (.A(_113_),
+    .B(_129_),
+    .ZN(_183_));
+ INV_X1 _487_ (.A(_183_),
+    .ZN(_184_));
+ NOR3_X2 _488_ (.A1(_180_),
+    .A2(_182_),
+    .A3(_184_),
+    .ZN(_185_));
+ XNOR2_X2 _489_ (.A(_118_),
+    .B(_134_),
+    .ZN(_186_));
+ INV_X2 _490_ (.A(_186_),
+    .ZN(_187_));
+ XOR2_X2 _491_ (.A(_119_),
+    .B(_135_),
+    .Z(_188_));
+ NOR2_X2 _492_ (.A1(_187_),
+    .A2(_188_),
+    .ZN(_189_));
+ NAND4_X1 _493_ (.A1(_170_),
+    .A2(_177_),
+    .A3(_185_),
+    .A4(_189_),
+    .ZN(_190_));
+ XNOR2_X2 _494_ (.A(_117_),
+    .B(_133_),
+    .ZN(_191_));
+ INV_X1 _495_ (.A(_191_),
+    .ZN(_192_));
+ NOR3_X2 _496_ (.A1(_190_),
+    .A2(_192_),
+    .A3(_406_),
+    .ZN(_193_));
+ NAND3_X1 _497_ (.A1(_189_),
+    .A2(_176_),
+    .A3(_173_),
+    .ZN(_194_));
+ NOR2_X1 _498_ (.A1(_145_),
+    .A2(_117_),
+    .ZN(_195_));
+ INV_X1 _499_ (.A(_110_),
+    .ZN(_196_));
+ NOR3_X1 _500_ (.A1(_195_),
+    .A2(_126_),
+    .A3(_196_),
+    .ZN(_197_));
+ AND2_X2 _501_ (.A1(_145_),
+    .A2(_117_),
+    .ZN(_198_));
+ OR3_X2 _502_ (.A1(_194_),
+    .A2(_197_),
+    .A3(_198_),
+    .ZN(_199_));
+ INV_X1 _503_ (.A(_120_),
+    .ZN(_200_));
+ AND3_X1 _504_ (.A1(_172_),
+    .A2(_136_),
+    .A3(_200_),
+    .ZN(_201_));
+ INV_X1 _505_ (.A(_121_),
+    .ZN(_202_));
+ AND2_X1 _506_ (.A1(_202_),
+    .A2(_137_),
+    .ZN(_203_));
+ OAI21_X1 _507_ (.A(_176_),
+    .B1(_201_),
+    .B2(_203_),
+    .ZN(_204_));
+ INV_X1 _508_ (.A(_122_),
+    .ZN(_205_));
+ NAND3_X1 _509_ (.A1(_175_),
+    .A2(_205_),
+    .A3(_138_),
+    .ZN(_206_));
+ INV_X1 _510_ (.A(_139_),
+    .ZN(_207_));
+ NOR2_X1 _511_ (.A1(_207_),
+    .A2(_123_),
+    .ZN(_208_));
+ INV_X1 _512_ (.A(_208_),
+    .ZN(_209_));
+ AND3_X1 _513_ (.A1(_204_),
+    .A2(_206_),
+    .A3(_209_),
+    .ZN(_210_));
+ INV_X1 _514_ (.A(_119_),
+    .ZN(_211_));
+ NAND2_X1 _515_ (.A1(_211_),
+    .A2(_135_),
+    .ZN(_212_));
+ OAI21_X1 _516_ (.A(_212_),
+    .B1(_118_),
+    .B2(_146_),
+    .ZN(_213_));
+ OAI211_X1 _517_ (.A(_177_),
+    .B(_213_),
+    .C1(_135_),
+    .C2(_211_),
+    .ZN(_214_));
+ NAND3_X1 _518_ (.A1(_199_),
+    .A2(_210_),
+    .A3(_214_),
+    .ZN(_215_));
+ AND2_X1 _519_ (.A1(_170_),
+    .A2(_185_),
+    .ZN(_216_));
+ NAND2_X1 _520_ (.A1(_215_),
+    .A2(_216_),
+    .ZN(_217_));
+ INV_X1 _521_ (.A(_116_),
+    .ZN(_218_));
+ NAND2_X1 _522_ (.A1(_218_),
+    .A2(_132_),
+    .ZN(_219_));
+ INV_X1 _523_ (.A(_124_),
+    .ZN(_220_));
+ AND3_X1 _524_ (.A1(_168_),
+    .A2(_140_),
+    .A3(_220_),
+    .ZN(_221_));
+ INV_X1 _525_ (.A(_125_),
+    .ZN(_222_));
+ AND2_X1 _526_ (.A1(_222_),
+    .A2(_141_),
+    .ZN(_223_));
+ OAI21_X1 _527_ (.A(_167_),
+    .B1(_221_),
+    .B2(_223_),
+    .ZN(_224_));
+ INV_X2 _528_ (.A(_112_),
+    .ZN(_225_));
+ NOR2_X1 _529_ (.A1(_225_),
+    .A2(_128_),
+    .ZN(_226_));
+ INV_X1 _530_ (.A(_111_),
+    .ZN(_227_));
+ AOI22_X1 _531_ (.A1(_128_),
+    .A2(_225_),
+    .B1(_227_),
+    .B2(_127_),
+    .ZN(_228_));
+ OAI21_X1 _532_ (.A(_224_),
+    .B1(_226_),
+    .B2(_228_),
+    .ZN(_229_));
+ NAND2_X1 _533_ (.A1(_229_),
+    .A2(_185_),
+    .ZN(_230_));
+ INV_X1 _534_ (.A(_115_),
+    .ZN(_231_));
+ NAND3_X1 _535_ (.A1(_179_),
+    .A2(_231_),
+    .A3(_131_),
+    .ZN(_232_));
+ AND4_X2 _536_ (.A1(_217_),
+    .A2(_219_),
+    .A3(_230_),
+    .A4(_232_),
+    .ZN(_233_));
+ INV_X1 _537_ (.A(_114_),
+    .ZN(_234_));
+ NOR2_X1 _538_ (.A1(_234_),
+    .A2(_130_),
+    .ZN(_235_));
+ INV_X1 _539_ (.A(_113_),
+    .ZN(_236_));
+ AOI22_X1 _540_ (.A1(_130_),
+    .A2(_234_),
+    .B1(_236_),
+    .B2(_129_),
+    .ZN(_237_));
+ OR3_X1 _541_ (.A1(_180_),
+    .A2(_235_),
+    .A3(_237_),
+    .ZN(_238_));
+ AOI21_X2 _542_ (.A(_193_),
+    .B1(_233_),
+    .B2(_238_),
+    .ZN(_239_));
+ NOR2_X2 _543_ (.A1(_239_),
+    .A2(_142_),
+    .ZN(_240_));
+ BUF_X4 _544_ (.A(_240_),
+    .Z(_241_));
+ BUF_X4 _545_ (.A(_059_),
+    .Z(_242_));
+ NAND3_X1 _546_ (.A1(_241_),
+    .A2(_242_),
+    .A3(_406_),
+    .ZN(_243_));
+ OAI21_X1 _547_ (.A(_378_),
+    .B1(_142_),
+    .B2(_158_),
+    .ZN(_244_));
+ NAND2_X1 _548_ (.A1(_239_),
+    .A2(_109_),
+    .ZN(_245_));
+ OR2_X2 _549_ (.A1(_245_),
+    .A2(_158_),
+    .ZN(_246_));
+ BUF_X8 _550_ (.A(_246_),
+    .Z(_247_));
+ BUF_X8 _551_ (.A(_247_),
+    .Z(_248_));
+ OAI211_X1 _552_ (.A(_243_),
+    .B(_244_),
+    .C1(_248_),
+    .C2(_060_),
+    .ZN(_249_));
+ OR2_X2 _553_ (.A1(_109_),
+    .A2(_403_),
+    .ZN(_250_));
+ BUF_X8 _554_ (.A(_250_),
+    .Z(_251_));
+ MUX2_X1 _555_ (.A(_110_),
+    .B(_249_),
+    .S(_251_),
+    .Z(_076_));
+ BUF_X8 _556_ (.A(_142_),
+    .Z(_252_));
+ OAI21_X1 _557_ (.A(_379_),
+    .B1(_252_),
+    .B2(_159_),
+    .ZN(_253_));
+ NAND2_X1 _558_ (.A1(_196_),
+    .A2(_126_),
+    .ZN(_254_));
+ XOR2_X1 _559_ (.A(_191_),
+    .B(_254_),
+    .Z(_413_));
+ NAND3_X1 _560_ (.A1(_240_),
+    .A2(_242_),
+    .A3(_413_),
+    .ZN(_255_));
+ OAI211_X1 _561_ (.A(_253_),
+    .B(_255_),
+    .C1(_248_),
+    .C2(_061_),
+    .ZN(_256_));
+ BUF_X8 _562_ (.A(_251_),
+    .Z(_257_));
+ MUX2_X1 _563_ (.A(_117_),
+    .B(_256_),
+    .S(_257_),
+    .Z(_083_));
+ OAI21_X1 _564_ (.A(_380_),
+    .B1(_252_),
+    .B2(_159_),
+    .ZN(_258_));
+ AOI21_X2 _565_ (.A(_198_),
+    .B1(_191_),
+    .B2(_254_),
+    .ZN(_259_));
+ XNOR2_X1 _566_ (.A(_259_),
+    .B(_186_),
+    .ZN(_414_));
+ NAND3_X1 _567_ (.A1(_241_),
+    .A2(_242_),
+    .A3(_414_),
+    .ZN(_260_));
+ OAI211_X1 _568_ (.A(_258_),
+    .B(_260_),
+    .C1(_248_),
+    .C2(_062_),
+    .ZN(_261_));
+ MUX2_X1 _569_ (.A(_118_),
+    .B(_261_),
+    .S(_251_),
+    .Z(_084_));
+ OAI21_X1 _570_ (.A(_381_),
+    .B1(_252_),
+    .B2(_159_),
+    .ZN(_262_));
+ NOR2_X1 _571_ (.A1(_259_),
+    .A2(_187_),
+    .ZN(_263_));
+ AND2_X1 _572_ (.A1(_146_),
+    .A2(_118_),
+    .ZN(_264_));
+ OR2_X1 _573_ (.A1(_263_),
+    .A2(_264_),
+    .ZN(_265_));
+ XNOR2_X1 _574_ (.A(_265_),
+    .B(_188_),
+    .ZN(_415_));
+ NAND3_X1 _575_ (.A1(_241_),
+    .A2(_059_),
+    .A3(_415_),
+    .ZN(_266_));
+ OAI211_X1 _576_ (.A(_262_),
+    .B(_266_),
+    .C1(_248_),
+    .C2(_063_),
+    .ZN(_267_));
+ MUX2_X1 _577_ (.A(_119_),
+    .B(_267_),
+    .S(_257_),
+    .Z(_085_));
+ OAI21_X2 _578_ (.A(_383_),
+    .B1(_252_),
+    .B2(_159_),
+    .ZN(_268_));
+ NOR3_X2 _579_ (.A1(_259_),
+    .A2(_187_),
+    .A3(_188_),
+    .ZN(_269_));
+ NAND3_X1 _580_ (.A1(_212_),
+    .A2(_118_),
+    .A3(_146_),
+    .ZN(_270_));
+ OAI21_X1 _581_ (.A(_270_),
+    .B1(_211_),
+    .B2(_135_),
+    .ZN(_271_));
+ OR2_X2 _582_ (.A1(_269_),
+    .A2(_271_),
+    .ZN(_272_));
+ XOR2_X1 _583_ (.A(_272_),
+    .B(_171_),
+    .Z(_416_));
+ NAND3_X1 _584_ (.A1(_241_),
+    .A2(_059_),
+    .A3(_416_),
+    .ZN(_273_));
+ OAI211_X1 _585_ (.A(_268_),
+    .B(_273_),
+    .C1(_247_),
+    .C2(_064_),
+    .ZN(_274_));
+ MUX2_X1 _586_ (.A(_120_),
+    .B(_274_),
+    .S(_251_),
+    .Z(_086_));
+ OAI21_X1 _587_ (.A(_384_),
+    .B1(_252_),
+    .B2(_158_),
+    .ZN(_275_));
+ AND2_X1 _588_ (.A1(_272_),
+    .A2(_171_),
+    .ZN(_276_));
+ NOR2_X2 _589_ (.A1(_200_),
+    .A2(_136_),
+    .ZN(_277_));
+ NOR3_X1 _590_ (.A1(_276_),
+    .A2(_277_),
+    .A3(_172_),
+    .ZN(_278_));
+ AOI221_X4 _591_ (.A(_278_),
+    .B1(_277_),
+    .B2(_172_),
+    .C1(_272_),
+    .C2(_173_),
+    .ZN(_417_));
+ NAND3_X1 _592_ (.A1(_241_),
+    .A2(_242_),
+    .A3(_417_),
+    .ZN(_279_));
+ OAI211_X1 _593_ (.A(_275_),
+    .B(_279_),
+    .C1(_247_),
+    .C2(_065_),
+    .ZN(_280_));
+ MUX2_X1 _594_ (.A(_121_),
+    .B(_280_),
+    .S(_257_),
+    .Z(_087_));
+ OAI21_X1 _595_ (.A(_385_),
+    .B1(_252_),
+    .B2(_159_),
+    .ZN(_281_));
+ NAND2_X1 _596_ (.A1(_172_),
+    .A2(_277_),
+    .ZN(_282_));
+ OAI21_X2 _597_ (.A(_282_),
+    .B1(_202_),
+    .B2(_137_),
+    .ZN(_283_));
+ AOI21_X1 _598_ (.A(_283_),
+    .B1(_272_),
+    .B2(_173_),
+    .ZN(_284_));
+ XNOR2_X1 _599_ (.A(_284_),
+    .B(_174_),
+    .ZN(_418_));
+ NAND3_X1 _600_ (.A1(_241_),
+    .A2(_242_),
+    .A3(_418_),
+    .ZN(_285_));
+ OAI211_X1 _601_ (.A(_281_),
+    .B(_285_),
+    .C1(_247_),
+    .C2(_066_),
+    .ZN(_286_));
+ MUX2_X1 _602_ (.A(_122_),
+    .B(_286_),
+    .S(_257_),
+    .Z(_088_));
+ OAI21_X1 _603_ (.A(_386_),
+    .B1(_142_),
+    .B2(_159_),
+    .ZN(_287_));
+ NOR2_X2 _604_ (.A1(_205_),
+    .A2(_138_),
+    .ZN(_288_));
+ AND2_X1 _605_ (.A1(_205_),
+    .A2(_138_),
+    .ZN(_289_));
+ NOR3_X1 _606_ (.A1(_284_),
+    .A2(_288_),
+    .A3(_289_),
+    .ZN(_290_));
+ NOR2_X1 _607_ (.A1(_290_),
+    .A2(_288_),
+    .ZN(_291_));
+ XNOR2_X1 _608_ (.A(_291_),
+    .B(_175_),
+    .ZN(_419_));
+ NAND3_X1 _609_ (.A1(_240_),
+    .A2(_242_),
+    .A3(_419_),
+    .ZN(_292_));
+ OAI211_X1 _610_ (.A(_287_),
+    .B(_292_),
+    .C1(_248_),
+    .C2(_067_),
+    .ZN(_293_));
+ MUX2_X1 _611_ (.A(_123_),
+    .B(_293_),
+    .S(_257_),
+    .Z(_089_));
+ OAI21_X1 _612_ (.A(_387_),
+    .B1(_252_),
+    .B2(_158_),
+    .ZN(_294_));
+ AND2_X1 _613_ (.A1(_175_),
+    .A2(_288_),
+    .ZN(_295_));
+ AOI221_X1 _614_ (.A(_295_),
+    .B1(_207_),
+    .B2(_123_),
+    .C1(_283_),
+    .C2(_176_),
+    .ZN(_296_));
+ OAI21_X1 _615_ (.A(_177_),
+    .B1(_269_),
+    .B2(_271_),
+    .ZN(_297_));
+ NAND2_X1 _616_ (.A1(_296_),
+    .A2(_297_),
+    .ZN(_298_));
+ XOR2_X1 _617_ (.A(_298_),
+    .B(_169_),
+    .Z(_420_));
+ NAND3_X1 _618_ (.A1(_241_),
+    .A2(_242_),
+    .A3(_420_),
+    .ZN(_299_));
+ OAI211_X1 _619_ (.A(_294_),
+    .B(_299_),
+    .C1(_247_),
+    .C2(_068_),
+    .ZN(_300_));
+ MUX2_X1 _620_ (.A(_124_),
+    .B(_300_),
+    .S(_257_),
+    .Z(_090_));
+ AND3_X1 _621_ (.A1(_298_),
+    .A2(_169_),
+    .A3(_168_),
+    .ZN(_301_));
+ AND2_X1 _622_ (.A1(_298_),
+    .A2(_169_),
+    .ZN(_302_));
+ NOR2_X2 _623_ (.A1(_220_),
+    .A2(_140_),
+    .ZN(_303_));
+ NOR3_X1 _624_ (.A1(_302_),
+    .A2(_303_),
+    .A3(_168_),
+    .ZN(_304_));
+ AOI211_X4 _625_ (.A(_301_),
+    .B(_304_),
+    .C1(_303_),
+    .C2(_168_),
+    .ZN(_421_));
+ NAND3_X1 _626_ (.A1(_241_),
+    .A2(_421_),
+    .A3(_059_),
+    .ZN(_305_));
+ OAI21_X1 _627_ (.A(_388_),
+    .B1(_142_),
+    .B2(_158_),
+    .ZN(_306_));
+ OAI211_X1 _628_ (.A(_305_),
+    .B(_306_),
+    .C1(_247_),
+    .C2(_069_),
+    .ZN(_307_));
+ MUX2_X1 _629_ (.A(_125_),
+    .B(_307_),
+    .S(_251_),
+    .Z(_091_));
+ NOR2_X1 _630_ (.A1(_222_),
+    .A2(_141_),
+    .ZN(_308_));
+ AOI21_X1 _631_ (.A(_308_),
+    .B1(_168_),
+    .B2(_303_),
+    .ZN(_309_));
+ INV_X1 _632_ (.A(_309_),
+    .ZN(_310_));
+ OR3_X1 _633_ (.A1(_301_),
+    .A2(_310_),
+    .A3(_166_),
+    .ZN(_311_));
+ OAI21_X1 _634_ (.A(_166_),
+    .B1(_301_),
+    .B2(_310_),
+    .ZN(_312_));
+ AND2_X1 _635_ (.A1(_311_),
+    .A2(_312_),
+    .ZN(_407_));
+ NAND3_X1 _636_ (.A1(_241_),
+    .A2(_242_),
+    .A3(_407_),
+    .ZN(_313_));
+ OAI21_X1 _637_ (.A(_389_),
+    .B1(_142_),
+    .B2(_158_),
+    .ZN(_314_));
+ OAI211_X1 _638_ (.A(_313_),
+    .B(_314_),
+    .C1(_248_),
+    .C2(_070_),
+    .ZN(_315_));
+ MUX2_X1 _639_ (.A(_111_),
+    .B(_315_),
+    .S(_251_),
+    .Z(_077_));
+ OAI21_X2 _640_ (.A(_390_),
+    .B1(_252_),
+    .B2(_159_),
+    .ZN(_316_));
+ OAI21_X1 _641_ (.A(_316_),
+    .B1(_248_),
+    .B2(_071_),
+    .ZN(_317_));
+ NOR2_X1 _642_ (.A1(_227_),
+    .A2(_127_),
+    .ZN(_318_));
+ INV_X1 _643_ (.A(_318_),
+    .ZN(_319_));
+ NAND2_X1 _644_ (.A1(_312_),
+    .A2(_319_),
+    .ZN(_320_));
+ INV_X1 _645_ (.A(_165_),
+    .ZN(_321_));
+ XNOR2_X1 _646_ (.A(_320_),
+    .B(_321_),
+    .ZN(_408_));
+ AND3_X1 _647_ (.A1(_408_),
+    .A2(_241_),
+    .A3(_242_),
+    .ZN(_322_));
+ OAI21_X1 _648_ (.A(_257_),
+    .B1(_317_),
+    .B2(_322_),
+    .ZN(_323_));
+ OAI21_X1 _649_ (.A(_323_),
+    .B1(_225_),
+    .B2(_257_),
+    .ZN(_078_));
+ OAI21_X2 _650_ (.A(_391_),
+    .B1(_252_),
+    .B2(_159_),
+    .ZN(_324_));
+ AND2_X1 _651_ (.A1(_298_),
+    .A2(_170_),
+    .ZN(_325_));
+ AND3_X1 _652_ (.A1(_310_),
+    .A2(_165_),
+    .A3(_166_),
+    .ZN(_326_));
+ NAND2_X1 _653_ (.A1(_165_),
+    .A2(_318_),
+    .ZN(_327_));
+ OAI21_X1 _654_ (.A(_327_),
+    .B1(_225_),
+    .B2(_128_),
+    .ZN(_328_));
+ NOR3_X2 _655_ (.A1(_325_),
+    .A2(_326_),
+    .A3(_328_),
+    .ZN(_329_));
+ XNOR2_X1 _656_ (.A(_329_),
+    .B(_183_),
+    .ZN(_409_));
+ NAND3_X1 _657_ (.A1(_240_),
+    .A2(_242_),
+    .A3(_409_),
+    .ZN(_330_));
+ OAI211_X1 _658_ (.A(_324_),
+    .B(_330_),
+    .C1(_248_),
+    .C2(_072_),
+    .ZN(_331_));
+ MUX2_X1 _659_ (.A(_113_),
+    .B(_331_),
+    .S(_251_),
+    .Z(_079_));
+ NOR2_X1 _660_ (.A1(_329_),
+    .A2(_184_),
+    .ZN(_332_));
+ NOR2_X1 _661_ (.A1(_236_),
+    .A2(_129_),
+    .ZN(_333_));
+ NOR2_X1 _662_ (.A1(_332_),
+    .A2(_333_),
+    .ZN(_334_));
+ XNOR2_X1 _663_ (.A(_334_),
+    .B(_181_),
+    .ZN(_410_));
+ AND2_X1 _664_ (.A1(_240_),
+    .A2(_059_),
+    .ZN(_335_));
+ NAND2_X1 _665_ (.A1(_410_),
+    .A2(_335_),
+    .ZN(_336_));
+ OAI21_X1 _666_ (.A(_392_),
+    .B1(_142_),
+    .B2(_158_),
+    .ZN(_337_));
+ OAI211_X1 _667_ (.A(_336_),
+    .B(_337_),
+    .C1(_248_),
+    .C2(_073_),
+    .ZN(_338_));
+ MUX2_X1 _668_ (.A(_114_),
+    .B(_338_),
+    .S(_251_),
+    .Z(_080_));
+ OR3_X1 _669_ (.A1(_329_),
+    .A2(_184_),
+    .A3(_182_),
+    .ZN(_339_));
+ INV_X1 _670_ (.A(_178_),
+    .ZN(_340_));
+ AOI22_X1 _671_ (.A1(_181_),
+    .A2(_333_),
+    .B1(_073_),
+    .B2(_114_),
+    .ZN(_341_));
+ AND3_X1 _672_ (.A1(_339_),
+    .A2(_340_),
+    .A3(_341_),
+    .ZN(_342_));
+ AOI21_X1 _673_ (.A(_340_),
+    .B1(_339_),
+    .B2(_341_),
+    .ZN(_343_));
+ NOR2_X1 _674_ (.A1(_342_),
+    .A2(_343_),
+    .ZN(_411_));
+ NAND2_X1 _675_ (.A1(_411_),
+    .A2(_335_),
+    .ZN(_344_));
+ OAI21_X1 _676_ (.A(_394_),
+    .B1(_142_),
+    .B2(_158_),
+    .ZN(_345_));
+ OAI211_X1 _677_ (.A(_344_),
+    .B(_345_),
+    .C1(_248_),
+    .C2(_074_),
+    .ZN(_346_));
+ MUX2_X1 _678_ (.A(_115_),
+    .B(_346_),
+    .S(_251_),
+    .Z(_081_));
+ NOR2_X1 _679_ (.A1(_231_),
+    .A2(_131_),
+    .ZN(_347_));
+ NOR2_X1 _680_ (.A1(_343_),
+    .A2(_347_),
+    .ZN(_348_));
+ XNOR2_X1 _681_ (.A(_348_),
+    .B(_179_),
+    .ZN(_412_));
+ AND2_X1 _682_ (.A1(_412_),
+    .A2(_335_),
+    .ZN(_349_));
+ OAI21_X1 _683_ (.A(_395_),
+    .B1(_252_),
+    .B2(_159_),
+    .ZN(_350_));
+ OAI21_X1 _684_ (.A(_350_),
+    .B1(_247_),
+    .B2(_075_),
+    .ZN(_351_));
+ OAI21_X1 _685_ (.A(_257_),
+    .B1(_349_),
+    .B2(_351_),
+    .ZN(_352_));
+ OAI21_X1 _686_ (.A(_352_),
+    .B1(_218_),
+    .B2(_257_),
+    .ZN(_082_));
+ MUX2_X1 _687_ (.A(_110_),
+    .B(_371_),
+    .S(_160_),
+    .Z(_353_));
+ NAND2_X2 _688_ (.A1(_245_),
+    .A2(_059_),
+    .ZN(_354_));
+ BUF_X4 _689_ (.A(_354_),
+    .Z(_355_));
+ MUX2_X1 _690_ (.A(_126_),
+    .B(_353_),
+    .S(_355_),
+    .Z(_092_));
+ MUX2_X1 _691_ (.A(_117_),
+    .B(_382_),
+    .S(_160_),
+    .Z(_356_));
+ MUX2_X1 _692_ (.A(_133_),
+    .B(_356_),
+    .S(_355_),
+    .Z(_099_));
+ MUX2_X1 _693_ (.A(_118_),
+    .B(_393_),
+    .S(_160_),
+    .Z(_357_));
+ MUX2_X1 _694_ (.A(_134_),
+    .B(_357_),
+    .S(_355_),
+    .Z(_100_));
+ MUX2_X1 _695_ (.A(_119_),
+    .B(_396_),
+    .S(_403_),
+    .Z(_358_));
+ MUX2_X1 _696_ (.A(_135_),
+    .B(_358_),
+    .S(_355_),
+    .Z(_101_));
+ MUX2_X1 _697_ (.A(_120_),
+    .B(_397_),
+    .S(_160_),
+    .Z(_359_));
+ MUX2_X1 _698_ (.A(_136_),
+    .B(_359_),
+    .S(_355_),
+    .Z(_102_));
+ MUX2_X1 _699_ (.A(_121_),
+    .B(_398_),
+    .S(_403_),
+    .Z(_360_));
+ MUX2_X1 _700_ (.A(_137_),
+    .B(_360_),
+    .S(_355_),
+    .Z(_103_));
+ MUX2_X1 _701_ (.A(_122_),
+    .B(_399_),
+    .S(_403_),
+    .Z(_361_));
+ MUX2_X1 _702_ (.A(_138_),
+    .B(_361_),
+    .S(_355_),
+    .Z(_104_));
+ MUX2_X1 _703_ (.A(_123_),
+    .B(_400_),
+    .S(_403_),
+    .Z(_362_));
+ MUX2_X1 _704_ (.A(_139_),
+    .B(_362_),
+    .S(_355_),
+    .Z(_105_));
+ MUX2_X1 _705_ (.A(_124_),
+    .B(_401_),
+    .S(_403_),
+    .Z(_363_));
+ MUX2_X1 _706_ (.A(_140_),
+    .B(_363_),
+    .S(_354_),
+    .Z(_106_));
+ MUX2_X1 _707_ (.A(_125_),
+    .B(_402_),
+    .S(_403_),
+    .Z(_364_));
+ MUX2_X1 _708_ (.A(_141_),
+    .B(_364_),
+    .S(_355_),
+    .Z(_107_));
+ MUX2_X1 _709_ (.A(_111_),
+    .B(_372_),
+    .S(_403_),
+    .Z(_365_));
+ MUX2_X1 _710_ (.A(_127_),
+    .B(_365_),
+    .S(_354_),
+    .Z(_093_));
+ MUX2_X1 _711_ (.A(_112_),
+    .B(_373_),
+    .S(_160_),
+    .Z(_366_));
+ MUX2_X1 _712_ (.A(_128_),
+    .B(_366_),
+    .S(_354_),
+    .Z(_094_));
+ MUX2_X1 _713_ (.A(_113_),
+    .B(_374_),
+    .S(_160_),
+    .Z(_367_));
+ MUX2_X1 _714_ (.A(_129_),
+    .B(_367_),
+    .S(_354_),
+    .Z(_095_));
+ MUX2_X1 _715_ (.A(_114_),
+    .B(_375_),
+    .S(_160_),
+    .Z(_368_));
+ MUX2_X1 _716_ (.A(_130_),
+    .B(_368_),
+    .S(_354_),
+    .Z(_096_));
+ MUX2_X1 _717_ (.A(_115_),
+    .B(_376_),
+    .S(_160_),
+    .Z(_369_));
+ MUX2_X1 _718_ (.A(_131_),
+    .B(_369_),
+    .S(_354_),
+    .Z(_097_));
+ MUX2_X1 _719_ (.A(_116_),
+    .B(_377_),
+    .S(_160_),
+    .Z(_370_));
+ MUX2_X1 _720_ (.A(_132_),
+    .B(_370_),
+    .S(_355_),
+    .Z(_098_));
+ BUF_X4 _721_ (.A(reset),
+    .Z(_405_));
+ BUF_X4 _722_ (.A(\ctrl.state.out[2] ),
+    .Z(_109_));
+ BUF_X1 _723_ (.A(\ctrl.state.out[1] ),
+    .Z(_108_));
+ BUF_X4 _724_ (.A(_005_),
+    .Z(_059_));
+ BUF_X1 _725_ (.A(_423_),
+    .Z(resp_val));
+ BUF_X2 _726_ (.A(resp_rdy),
+    .Z(_422_));
+ BUF_X2 _727_ (.A(\dpath.a_lt_b$in0[15] ),
+    .Z(_116_));
+ BUF_X2 _728_ (.A(\dpath.a_lt_b$in1[15] ),
+    .Z(_132_));
+ BUF_X2 _729_ (.A(\dpath.a_lt_b$in0[14] ),
+    .Z(_115_));
+ BUF_X2 _730_ (.A(\dpath.a_lt_b$in1[14] ),
+    .Z(_131_));
+ BUF_X4 _731_ (.A(\dpath.a_lt_b$in0[13] ),
+    .Z(_114_));
+ BUF_X4 _732_ (.A(\dpath.a_lt_b$in1[13] ),
+    .Z(_130_));
+ BUF_X2 _733_ (.A(\dpath.a_lt_b$in0[12] ),
+    .Z(_113_));
+ BUF_X2 _734_ (.A(\dpath.a_lt_b$in1[12] ),
+    .Z(_129_));
+ BUF_X2 _735_ (.A(\dpath.a_lt_b$in0[11] ),
+    .Z(_112_));
+ BUF_X4 _736_ (.A(\dpath.a_lt_b$in1[11] ),
+    .Z(_128_));
+ BUF_X2 _737_ (.A(\dpath.a_lt_b$in0[10] ),
+    .Z(_111_));
+ BUF_X2 _738_ (.A(\dpath.a_lt_b$in1[10] ),
+    .Z(_127_));
+ BUF_X2 _739_ (.A(\dpath.a_lt_b$in0[9] ),
+    .Z(_125_));
+ BUF_X4 _740_ (.A(\dpath.a_lt_b$in1[9] ),
+    .Z(_141_));
+ BUF_X2 _741_ (.A(\dpath.a_lt_b$in0[8] ),
+    .Z(_124_));
+ BUF_X4 _742_ (.A(\dpath.a_lt_b$in1[8] ),
+    .Z(_140_));
+ BUF_X4 _743_ (.A(\dpath.a_lt_b$in0[7] ),
+    .Z(_123_));
+ BUF_X4 _744_ (.A(\dpath.a_lt_b$in1[7] ),
+    .Z(_139_));
+ BUF_X2 _745_ (.A(\dpath.a_lt_b$in0[6] ),
+    .Z(_122_));
+ BUF_X4 _746_ (.A(\dpath.a_lt_b$in1[6] ),
+    .Z(_138_));
+ BUF_X2 _747_ (.A(\dpath.a_lt_b$in0[5] ),
+    .Z(_121_));
+ BUF_X4 _748_ (.A(\dpath.a_lt_b$in1[5] ),
+    .Z(_137_));
+ BUF_X2 _749_ (.A(\dpath.a_lt_b$in0[4] ),
+    .Z(_120_));
+ BUF_X4 _750_ (.A(\dpath.a_lt_b$in1[4] ),
+    .Z(_136_));
+ BUF_X2 _751_ (.A(\dpath.a_lt_b$in0[3] ),
+    .Z(_119_));
+ BUF_X4 _752_ (.A(\dpath.a_lt_b$in1[3] ),
+    .Z(_135_));
+ BUF_X4 _753_ (.A(\dpath.a_lt_b$in0[2] ),
+    .Z(_118_));
+ BUF_X2 _754_ (.A(\dpath.a_lt_b$in1[2] ),
+    .Z(_134_));
+ BUF_X2 _755_ (.A(\dpath.a_lt_b$in0[1] ),
+    .Z(_117_));
+ BUF_X2 _756_ (.A(\dpath.a_lt_b$in1[1] ),
+    .Z(_133_));
+ BUF_X2 _757_ (.A(\dpath.a_lt_b$in0[0] ),
+    .Z(_110_));
+ BUF_X4 _758_ (.A(\dpath.a_lt_b$in1[0] ),
+    .Z(_126_));
+ CLKBUF_X1 _759_ (.A(_406_),
+    .Z(resp_msg[0]));
+ BUF_X1 _760_ (.A(_004_),
+    .Z(_058_));
+ BUF_X1 _761_ (.A(_003_),
+    .Z(_057_));
+ BUF_X1 _762_ (.A(_055_),
+    .Z(_001_));
+ BUF_X4 _763_ (.A(req_rdy),
+    .Z(_403_));
+ BUF_X2 _764_ (.A(req_val),
+    .Z(_404_));
+ BUF_X1 _765_ (.A(_056_),
+    .Z(_002_));
+ BUF_X1 _766_ (.A(_054_),
+    .Z(_000_));
+ BUF_X1 _767_ (.A(_006_),
+    .Z(_060_));
+ BUF_X2 _768_ (.A(req_msg[16]),
+    .Z(_378_));
+ CLKBUF_X1 _769_ (.A(_076_),
+    .Z(_022_));
+ BUF_X1 _770_ (.A(_007_),
+    .Z(_061_));
+ BUF_X2 _771_ (.A(req_msg[17]),
+    .Z(_379_));
+ CLKBUF_X1 _772_ (.A(_083_),
+    .Z(_029_));
+ BUF_X1 _773_ (.A(_008_),
+    .Z(_062_));
+ BUF_X2 _774_ (.A(req_msg[18]),
+    .Z(_380_));
+ CLKBUF_X1 _775_ (.A(_084_),
+    .Z(_030_));
+ BUF_X1 _776_ (.A(_009_),
+    .Z(_063_));
+ BUF_X2 _777_ (.A(req_msg[19]),
+    .Z(_381_));
+ CLKBUF_X1 _778_ (.A(_085_),
+    .Z(_031_));
+ BUF_X1 _779_ (.A(_010_),
+    .Z(_064_));
+ BUF_X2 _780_ (.A(req_msg[20]),
+    .Z(_383_));
+ CLKBUF_X1 _781_ (.A(_086_),
+    .Z(_032_));
+ BUF_X1 _782_ (.A(_011_),
+    .Z(_065_));
+ BUF_X2 _783_ (.A(req_msg[21]),
+    .Z(_384_));
+ CLKBUF_X1 _784_ (.A(_087_),
+    .Z(_033_));
+ BUF_X1 _785_ (.A(_012_),
+    .Z(_066_));
+ BUF_X2 _786_ (.A(req_msg[22]),
+    .Z(_385_));
+ CLKBUF_X1 _787_ (.A(_088_),
+    .Z(_034_));
+ BUF_X1 _788_ (.A(_013_),
+    .Z(_067_));
+ BUF_X2 _789_ (.A(req_msg[23]),
+    .Z(_386_));
+ CLKBUF_X1 _790_ (.A(_089_),
+    .Z(_035_));
+ BUF_X1 _791_ (.A(_014_),
+    .Z(_068_));
+ BUF_X2 _792_ (.A(req_msg[24]),
+    .Z(_387_));
+ CLKBUF_X1 _793_ (.A(_090_),
+    .Z(_036_));
+ BUF_X1 _794_ (.A(_015_),
+    .Z(_069_));
+ BUF_X2 _795_ (.A(req_msg[25]),
+    .Z(_388_));
+ CLKBUF_X1 _796_ (.A(_091_),
+    .Z(_037_));
+ BUF_X1 _797_ (.A(_016_),
+    .Z(_070_));
+ BUF_X2 _798_ (.A(req_msg[26]),
+    .Z(_389_));
+ CLKBUF_X1 _799_ (.A(_077_),
+    .Z(_023_));
+ BUF_X1 _800_ (.A(_017_),
+    .Z(_071_));
+ BUF_X2 _801_ (.A(req_msg[27]),
+    .Z(_390_));
+ CLKBUF_X1 _802_ (.A(_078_),
+    .Z(_024_));
+ BUF_X1 _803_ (.A(_018_),
+    .Z(_072_));
+ BUF_X2 _804_ (.A(req_msg[28]),
+    .Z(_391_));
+ CLKBUF_X1 _805_ (.A(_079_),
+    .Z(_025_));
+ BUF_X2 _806_ (.A(_019_),
+    .Z(_073_));
+ BUF_X2 _807_ (.A(req_msg[29]),
+    .Z(_392_));
+ CLKBUF_X1 _808_ (.A(_080_),
+    .Z(_026_));
+ BUF_X1 _809_ (.A(_020_),
+    .Z(_074_));
+ BUF_X2 _810_ (.A(req_msg[30]),
+    .Z(_394_));
+ CLKBUF_X1 _811_ (.A(_081_),
+    .Z(_027_));
+ BUF_X1 _812_ (.A(_021_),
+    .Z(_075_));
+ BUF_X2 _813_ (.A(req_msg[31]),
+    .Z(_395_));
+ CLKBUF_X1 _814_ (.A(_082_),
+    .Z(_028_));
+ BUF_X1 _815_ (.A(req_msg[0]),
+    .Z(_371_));
+ CLKBUF_X1 _816_ (.A(_092_),
+    .Z(_038_));
+ BUF_X2 _817_ (.A(req_msg[1]),
+    .Z(_382_));
+ CLKBUF_X1 _818_ (.A(_099_),
+    .Z(_045_));
+ BUF_X1 _819_ (.A(req_msg[2]),
+    .Z(_393_));
+ CLKBUF_X1 _820_ (.A(_100_),
+    .Z(_046_));
+ BUF_X1 _821_ (.A(req_msg[3]),
+    .Z(_396_));
+ CLKBUF_X1 _822_ (.A(_101_),
+    .Z(_047_));
+ BUF_X2 _823_ (.A(req_msg[4]),
+    .Z(_397_));
+ CLKBUF_X1 _824_ (.A(_102_),
+    .Z(_048_));
+ BUF_X2 _825_ (.A(req_msg[5]),
+    .Z(_398_));
+ CLKBUF_X1 _826_ (.A(_103_),
+    .Z(_049_));
+ BUF_X1 _827_ (.A(req_msg[6]),
+    .Z(_399_));
+ CLKBUF_X1 _828_ (.A(_104_),
+    .Z(_050_));
+ BUF_X2 _829_ (.A(req_msg[7]),
+    .Z(_400_));
+ CLKBUF_X1 _830_ (.A(_105_),
+    .Z(_051_));
+ BUF_X2 _831_ (.A(req_msg[8]),
+    .Z(_401_));
+ CLKBUF_X1 _832_ (.A(_106_),
+    .Z(_052_));
+ BUF_X2 _833_ (.A(req_msg[9]),
+    .Z(_402_));
+ CLKBUF_X1 _834_ (.A(_107_),
+    .Z(_053_));
+ BUF_X2 _835_ (.A(req_msg[10]),
+    .Z(_372_));
+ CLKBUF_X1 _836_ (.A(_093_),
+    .Z(_039_));
+ BUF_X1 _837_ (.A(req_msg[11]),
+    .Z(_373_));
+ CLKBUF_X1 _838_ (.A(_094_),
+    .Z(_040_));
+ BUF_X2 _839_ (.A(req_msg[12]),
+    .Z(_374_));
+ CLKBUF_X1 _840_ (.A(_095_),
+    .Z(_041_));
+ BUF_X2 _841_ (.A(req_msg[13]),
+    .Z(_375_));
+ CLKBUF_X1 _842_ (.A(_096_),
+    .Z(_042_));
+ BUF_X2 _843_ (.A(req_msg[14]),
+    .Z(_376_));
+ CLKBUF_X1 _844_ (.A(_097_),
+    .Z(_043_));
+ BUF_X2 _845_ (.A(req_msg[15]),
+    .Z(_377_));
+ CLKBUF_X1 _846_ (.A(_098_),
+    .Z(_044_));
+ BUF_X1 _847_ (.A(_413_),
+    .Z(resp_msg[1]));
+ BUF_X1 _848_ (.A(_414_),
+    .Z(resp_msg[2]));
+ BUF_X1 _849_ (.A(_415_),
+    .Z(resp_msg[3]));
+ BUF_X1 _850_ (.A(_416_),
+    .Z(resp_msg[4]));
+ BUF_X1 _851_ (.A(_417_),
+    .Z(resp_msg[5]));
+ BUF_X1 _852_ (.A(_418_),
+    .Z(resp_msg[6]));
+ BUF_X1 _853_ (.A(_419_),
+    .Z(resp_msg[7]));
+ BUF_X1 _854_ (.A(_420_),
+    .Z(resp_msg[8]));
+ BUF_X1 _855_ (.A(_421_),
+    .Z(resp_msg[9]));
+ BUF_X1 _856_ (.A(_407_),
+    .Z(resp_msg[10]));
+ BUF_X1 _857_ (.A(_408_),
+    .Z(resp_msg[11]));
+ BUF_X1 _858_ (.A(_409_),
+    .Z(resp_msg[12]));
+ BUF_X1 _859_ (.A(_410_),
+    .Z(resp_msg[13]));
+ BUF_X1 _860_ (.A(_411_),
+    .Z(resp_msg[14]));
+ BUF_X1 _861_ (.A(_412_),
+    .Z(resp_msg[15]));
+ DFF_X1 _862_ (.D(_000_),
+    .CK(clk),
+    .Q(req_rdy),
+    .QN(_005_));
+ DFF_X1 _863_ (.D(_001_),
+    .CK(clk),
+    .Q(\ctrl.state.out[1] ),
+    .QN(_003_));
+ DFF_X1 _864_ (.D(_002_),
+    .CK(clk),
+    .Q(\ctrl.state.out[2] ),
+    .QN(_004_));
+ DFF_X1 _865_ (.D(_022_),
+    .CK(clk),
+    .Q(\dpath.a_lt_b$in0[0] ),
+    .QN(_424_));
+ DFF_X1 _866_ (.D(_029_),
+    .CK(clk),
+    .Q(\dpath.a_lt_b$in0[1] ),
+    .QN(_425_));
+ DFF_X1 _867_ (.D(_030_),
+    .CK(clk),
+    .Q(\dpath.a_lt_b$in0[2] ),
+    .QN(_426_));
+ DFF_X1 _868_ (.D(_031_),
+    .CK(clk),
+    .Q(\dpath.a_lt_b$in0[3] ),
+    .QN(_427_));
+ DFF_X1 _869_ (.D(_032_),
+    .CK(clk),
+    .Q(\dpath.a_lt_b$in0[4] ),
+    .QN(_428_));
+ DFF_X1 _870_ (.D(_033_),
+    .CK(clk),
+    .Q(\dpath.a_lt_b$in0[5] ),
+    .QN(_429_));
+ DFF_X1 _871_ (.D(_034_),
+    .CK(clk),
+    .Q(\dpath.a_lt_b$in0[6] ),
+    .QN(_430_));
+ DFF_X1 _872_ (.D(_035_),
+    .CK(clk),
+    .Q(\dpath.a_lt_b$in0[7] ),
+    .QN(_431_));
+ DFF_X1 _873_ (.D(_036_),
+    .CK(clk),
+    .Q(\dpath.a_lt_b$in0[8] ),
+    .QN(_432_));
+ DFF_X1 _874_ (.D(_037_),
+    .CK(clk),
+    .Q(\dpath.a_lt_b$in0[9] ),
+    .QN(_433_));
+ DFF_X1 _875_ (.D(_023_),
+    .CK(clk),
+    .Q(\dpath.a_lt_b$in0[10] ),
+    .QN(_434_));
+ DFF_X1 _876_ (.D(_024_),
+    .CK(clk),
+    .Q(\dpath.a_lt_b$in0[11] ),
+    .QN(_435_));
+ DFF_X1 _877_ (.D(_025_),
+    .CK(clk),
+    .Q(\dpath.a_lt_b$in0[12] ),
+    .QN(_436_));
+ DFF_X1 _878_ (.D(_026_),
+    .CK(clk),
+    .Q(\dpath.a_lt_b$in0[13] ),
+    .QN(_437_));
+ DFF_X1 _879_ (.D(_027_),
+    .CK(clk),
+    .Q(\dpath.a_lt_b$in0[14] ),
+    .QN(_438_));
+ DFF_X1 _880_ (.D(_028_),
+    .CK(clk),
+    .Q(\dpath.a_lt_b$in0[15] ),
+    .QN(_439_));
+ DFF_X1 _881_ (.D(_038_),
+    .CK(clk),
+    .Q(\dpath.a_lt_b$in1[0] ),
+    .QN(_006_));
+ DFF_X1 _882_ (.D(_045_),
+    .CK(clk),
+    .Q(\dpath.a_lt_b$in1[1] ),
+    .QN(_007_));
+ DFF_X1 _883_ (.D(_046_),
+    .CK(clk),
+    .Q(\dpath.a_lt_b$in1[2] ),
+    .QN(_008_));
+ DFF_X1 _884_ (.D(_047_),
+    .CK(clk),
+    .Q(\dpath.a_lt_b$in1[3] ),
+    .QN(_009_));
+ DFF_X1 _885_ (.D(_048_),
+    .CK(clk),
+    .Q(\dpath.a_lt_b$in1[4] ),
+    .QN(_010_));
+ DFF_X1 _886_ (.D(_049_),
+    .CK(clk),
+    .Q(\dpath.a_lt_b$in1[5] ),
+    .QN(_011_));
+ DFF_X1 _887_ (.D(_050_),
+    .CK(clk),
+    .Q(\dpath.a_lt_b$in1[6] ),
+    .QN(_012_));
+ DFF_X1 _888_ (.D(_051_),
+    .CK(clk),
+    .Q(\dpath.a_lt_b$in1[7] ),
+    .QN(_013_));
+ DFF_X1 _889_ (.D(_052_),
+    .CK(clk),
+    .Q(\dpath.a_lt_b$in1[8] ),
+    .QN(_014_));
+ DFF_X1 _890_ (.D(_053_),
+    .CK(clk),
+    .Q(\dpath.a_lt_b$in1[9] ),
+    .QN(_015_));
+ DFF_X1 _891_ (.D(_039_),
+    .CK(clk),
+    .Q(\dpath.a_lt_b$in1[10] ),
+    .QN(_016_));
+ DFF_X1 _892_ (.D(_040_),
+    .CK(clk),
+    .Q(\dpath.a_lt_b$in1[11] ),
+    .QN(_017_));
+ DFF_X1 _893_ (.D(_041_),
+    .CK(clk),
+    .Q(\dpath.a_lt_b$in1[12] ),
+    .QN(_018_));
+ DFF_X1 _894_ (.D(_042_),
+    .CK(clk),
+    .Q(\dpath.a_lt_b$in1[13] ),
+    .QN(_019_));
+ DFF_X1 _895_ (.D(_043_),
+    .CK(clk),
+    .Q(\dpath.a_lt_b$in1[14] ),
+    .QN(_020_));
+ DFF_X1 _896_ (.D(_044_),
+    .CK(clk),
+    .Q(\dpath.a_lt_b$in1[15] ),
+    .QN(_021_));
+endmodule
diff --git a/src/rsz/test/global_sizing_threads.tcl b/src/rsz/test/global_sizing_threads.tcl
new file mode 100644
index 00000000000..19899e1b68d
--- /dev/null
+++ b/src/rsz/test/global_sizing_threads.tcl
@@ -0,0 +1,7 @@
+# Multi-threaded run of the global sizing coverage flow (see global_sizing.tcl).
+# Exercises the parallel Phase-B worker path and asserts it produces the same
+# netlist as the serial golden (global_sizing.vok).
+source "helpers.tcl"
+set global_sizing_threads 8
+set global_sizing_result "global_sizing_threads"
+source "global_sizing.tcl"
diff --git a/src/rsz/test/repair_setup_invalid_phase.ok b/src/rsz/test/repair_setup_invalid_phase.ok
index 03b9b776074..5f4ab09d837 100644
--- a/src/rsz/test/repair_setup_invalid_phase.ok
+++ b/src/rsz/test/repair_setup_invalid_phase.ok
@@ -4,7 +4,7 @@
 [INFO ODB-0131]     Created 3 components and 15 component-terminals.
 [INFO ODB-0132]     Created 2 special nets and 0 connections.
 [INFO ODB-0133]     Created 7 nets and 9 connections.
-[ERROR RSZ-0223] No phase names specified. Valid phase names are: LEGACY, WNS, WNS_PATH, WNS_CONE, TNS, ENDPOINT_FANIN, STARTPOINT_FANOUT, LAST_GASP, CRIT_VT_SWAP, REROUTE
+[ERROR RSZ-0223] No phase names specified. Valid phase names are: LEGACY, WNS, WNS_PATH, WNS_CONE, TNS, ENDPOINT_FANIN, STARTPOINT_FANOUT, LAST_GASP, CRIT_VT_SWAP, REROUTE, GLOBAL_SIZING
 Caught expected empty phase error: RSZ-0223
 [INFO RSZ-0100] Repair move sequence: UnbufferMove SizeUpMove SwapPinsMove BufferMove CloneMove SplitLoadMove 
 [INFO RSZ-0094] Found 2 endpoints with setup violations.
@@ -16,5 +16,5 @@ Caught expected empty phase error: RSZ-0223
        0* |       0 |       0 |        0 |      0 |     0 |    +0.0% |   -0.278 |       -0.7 |       -0.5 |      2 | y2
       10* |       0 |       3 |        0 |      0 |     1 |   +53.8% |   -0.195 |       -0.5 |       -0.3 |      2 | y2
       14* |       0 |       4 |        0 |      0 |     0 |   +84.6% |   -0.157 |       -0.4 |       -0.3 |      2 | y2
-[ERROR RSZ-0217] Unknown phase name 'BAD_PHASE'. Valid phase names are: LEGACY, WNS, WNS_PATH, WNS_CONE, TNS, ENDPOINT_FANIN, STARTPOINT_FANOUT, LAST_GASP, CRIT_VT_SWAP, REROUTE
+[ERROR RSZ-0217] Unknown phase name 'BAD_PHASE'. Valid phase names are: LEGACY, WNS, WNS_PATH, WNS_CONE, TNS, ENDPOINT_FANIN, STARTPOINT_FANOUT, LAST_GASP, CRIT_VT_SWAP, REROUTE, GLOBAL_SIZING
 Caught expected invalid phase error: RSZ-0217

From 4c3e619379c87b78a0c8a1c4e48ffbfe86e9653f Mon Sep 17 00:00:00 2001
From: Eren Dogan <erendogan@google.com>
Date: Fri, 5 Jun 2026 20:06:31 -0700
Subject: [PATCH 2/8] rsz: clang-tidy

Signed-off-by: Eren Dogan <erendogan@google.com>
---
 src/rsz/src/LRSubproblem.cc              | 18 ++++++++----------
 src/rsz/src/policy/GlobalSizingPolicy.cc | 16 +++++++---------
 2 files changed, 15 insertions(+), 19 deletions(-)

diff --git a/src/rsz/src/LRSubproblem.cc b/src/rsz/src/LRSubproblem.cc
index 34eac8cd2e8..95263899a1f 100644
--- a/src/rsz/src/LRSubproblem.cc
+++ b/src/rsz/src/LRSubproblem.cc
@@ -153,7 +153,7 @@ float LRSubproblem::leakageOrArea(sta::LibertyCell* cell) const
 bool LRSubproblem::isDataArc(const sta::Edge* edge) const
 {
   const sta::TimingRole* role = edge->role();
-  if (role->isTimingCheck()) {
+  if (role != nullptr && role->isTimingCheck()) {
     return false;
   }
   if (edge->isDisabledLoop()) {
@@ -241,7 +241,7 @@ bool LRSubproblem::snapshot(sta::Instance* inst,
           continue;
         }
         const sta::EdgeId id = graph_->id(e);
-        if (static_cast<int>(id) >= lambda_size) {
+        if (std::cmp_greater_equal(id, lambda_size)) {
           continue;
         }
         lam_sum += lambda[id];
@@ -352,7 +352,7 @@ bool LRSubproblem::snapshot(sta::Instance* inst,
           continue;
         }
         const sta::EdgeId id = graph_->id(e);
-        if (static_cast<int>(id) >= lambda_size) {
+        if (std::cmp_greater_equal(id, lambda_size)) {
           continue;
         }
         lam_U += lambda[id];
@@ -435,13 +435,11 @@ float LRSubproblem::evaluateCellCost(const GateSnapshot& snap,
       // Candidate missing this input port - incompatible.
       return std::numeric_limits<float>::infinity();
     }
-    float load_pert = u.load_U_cur - u.c_in_cur + c_in_cand;
-    if (load_pert < 0.0f) {
-      // Numerical safety: extreme C_in mismatches can push the perturbed
-      // load slightly negative. Clamp at zero rather than rejecting; the
-      // gateDelay LUT is well-defined at zero load.
-      load_pert = 0.0f;
-    }
+    // Numerical safety: extreme C_in mismatches can push the perturbed load
+    // slightly negative. Clamp at zero rather than rejecting; the gateDelay LUT
+    // is well-defined at zero load.
+    const float load_pert
+        = std::max(u.load_U_cur - u.c_in_cur + c_in_cand, 0.0f);
     const float d_U = sta::delayAsFloat(resizer_->gateDelay(
         u.drv_port, load_pert, scene, max_, arc_delay_calc));
     cost += timing_weight * u.lambda_U_drv * d_U;
diff --git a/src/rsz/src/policy/GlobalSizingPolicy.cc b/src/rsz/src/policy/GlobalSizingPolicy.cc
index 3667e65139a..1cccf6c7e1c 100644
--- a/src/rsz/src/policy/GlobalSizingPolicy.cc
+++ b/src/rsz/src/policy/GlobalSizingPolicy.cc
@@ -56,7 +56,7 @@ GlobalSizingPolicy::~GlobalSizingPolicy() = default;
 bool GlobalSizingPolicy::isDataArc(const sta::Edge* edge) const
 {
   const sta::TimingRole* role = edge->role();
-  if (role->isTimingCheck()) {
+  if (role != nullptr && role->isTimingCheck()) {
     return false;
   }
   if (edge->isDisabledLoop()) {
@@ -328,11 +328,9 @@ void GlobalSizingPolicy::projectFlowBalance(const LRParams& params)
       vertices.push_back(vit.next());
     }
   }
-  std::sort(vertices.begin(),
-            vertices.end(),
-            [](const sta::Vertex* a, const sta::Vertex* b) {
-              return a->level() > b->level();
-            });
+  std::ranges::sort(vertices, [](const sta::Vertex* a, const sta::Vertex* b) {
+    return a->level() > b->level();
+  });
 
   int rescaled = 0;
   int zero_sum_fallback = 0;
@@ -543,8 +541,8 @@ GlobalSizingPolicy::SweepStats GlobalSizingPolicy::singleSweep(
       = thread_pool_->parallelMap(
           snapshots,
           [this, timing_weight, src](const LRSubproblem::GateSnapshot& snap) {
-            thread_local sta::ArcDelayCalc* cached_src = nullptr;
-            thread_local std::unique_ptr<sta::ArcDelayCalc> adc;
+            static thread_local sta::ArcDelayCalc* cached_src = nullptr;
+            static thread_local std::unique_ptr<sta::ArcDelayCalc> adc;
             if (adc == nullptr || cached_src != src) {
               adc.reset(src->copy());
               cached_src = src;
@@ -622,7 +620,7 @@ float GlobalSizingPolicy::computeAutoTimingWeight(const LRParams& params) const
           continue;
         }
         const sta::EdgeId id = graph_->id(e);
-        if (static_cast<int>(id) >= lambda_size) {
+        if (std::cmp_greater_equal(id, lambda_size)) {
           continue;
         }
         lam_sum += lambda_[id];

From 9a56ff0eb4fadae934bcd17d0e004006c8182757 Mon Sep 17 00:00:00 2001
From: Eren Dogan <erendogan@google.com>
Date: Sat, 6 Jun 2026 12:29:44 -0700
Subject: [PATCH 3/8] rsz: Fix edge id overflow

Signed-off-by: Eren Dogan <erendogan@google.com>
---
 src/rsz/src/policy/GlobalSizingPolicy.cc | 26 +++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/src/rsz/src/policy/GlobalSizingPolicy.cc b/src/rsz/src/policy/GlobalSizingPolicy.cc
index 1cccf6c7e1c..4ca57ecaced 100644
--- a/src/rsz/src/policy/GlobalSizingPolicy.cc
+++ b/src/rsz/src/policy/GlobalSizingPolicy.cc
@@ -348,7 +348,16 @@ void GlobalSizingPolicy::projectFlowBalance(const LRParams& params)
         if (!isDataArc(e)) {
           continue;
         }
-        target += lambda_[graph_->id(e)];
+        // lambda_ is sized to the edge-id space captured in allocate(). A
+        // sweep can replace cells and the subsequent updateParasitics()/
+        // findRequireds() rebuild arcs, minting edge ids beyond that space, so
+        // an id may now be >= lambda_.size(). Such arcs carry no multiplier;
+        // skip them, matching the guard in updateMultipliers().
+        const sta::EdgeId id = graph_->id(e);
+        if (static_cast<size_t>(id) >= lambda_.size()) {
+          continue;
+        }
+        target += lambda_[id];
       }
     }
 
@@ -362,7 +371,11 @@ void GlobalSizingPolicy::projectFlowBalance(const LRParams& params)
         if (!isDataArc(e)) {
           continue;
         }
-        in_sum += lambda_[graph_->id(e)];
+        const sta::EdgeId id = graph_->id(e);
+        if (static_cast<size_t>(id) >= lambda_.size()) {
+          continue;
+        }
+        in_sum += lambda_[id];
         ++in_count;
       }
     }
@@ -380,6 +393,9 @@ void GlobalSizingPolicy::projectFlowBalance(const LRParams& params)
           continue;
         }
         const sta::EdgeId id = graph_->id(e);
+        if (static_cast<size_t>(id) >= lambda_.size()) {
+          continue;
+        }
         lambda_[id] = std::max(lambda_[id] * scale, params.lambda_floor);
       }
       ++rescaled;
@@ -391,7 +407,11 @@ void GlobalSizingPolicy::projectFlowBalance(const LRParams& params)
         if (!isDataArc(e)) {
           continue;
         }
-        lambda_[graph_->id(e)] = std::max(share, params.lambda_floor);
+        const sta::EdgeId id = graph_->id(e);
+        if (static_cast<size_t>(id) >= lambda_.size()) {
+          continue;
+        }
+        lambda_[id] = std::max(share, params.lambda_floor);
       }
       ++zero_sum_fallback;
     }

From 2dd38548ad4b1f84a3ef7b8e86eb0ff2eefd63bb Mon Sep 17 00:00:00 2001
From: Eren Dogan <erendogan@google.com>
Date: Sat, 6 Jun 2026 17:45:39 -0700
Subject: [PATCH 4/8] rsz: Prevent max area overflow

Signed-off-by: Eren Dogan <erendogan@google.com>
---
 src/rsz/src/policy/GlobalSizingPolicy.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/rsz/src/policy/GlobalSizingPolicy.cc b/src/rsz/src/policy/GlobalSizingPolicy.cc
index 4ca57ecaced..93aeb2f6dd7 100644
--- a/src/rsz/src/policy/GlobalSizingPolicy.cc
+++ b/src/rsz/src/policy/GlobalSizingPolicy.cc
@@ -810,7 +810,8 @@ void GlobalSizingPolicy::iterate()
     // Best-so-far: Keep track of the best WNS so far but don't restore a sweep
     // that worsens WNS just yet to allow oscillation.
     const float current_wns = sta::delayAsFloat(sta_->worstSlack(policy_max_));
-    if (!reject && sta::fuzzyGreaterEqual(current_wns, best_wns)) {
+    if (!reject && sta::fuzzyGreaterEqual(current_wns, best_wns)
+        && !resizer_.overMaxArea()) {
       resizer_.journalEnd();  // checkpoint
       resizer_.journalBegin();
       best_wns = current_wns;

From 75c93ef5b3a732f3e8ed259c24b1b24b85549047 Mon Sep 17 00:00:00 2001
From: Eren Dogan <erendogan@google.com>
Date: Sat, 6 Jun 2026 18:32:52 -0700
Subject: [PATCH 5/8] rsz: Early exit when timing is met

Signed-off-by: Eren Dogan <erendogan@google.com>
---
 src/rsz/src/policy/GlobalSizingPolicy.cc | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/src/rsz/src/policy/GlobalSizingPolicy.cc b/src/rsz/src/policy/GlobalSizingPolicy.cc
index 93aeb2f6dd7..74b1e7b6549 100644
--- a/src/rsz/src/policy/GlobalSizingPolicy.cc
+++ b/src/rsz/src/policy/GlobalSizingPolicy.cc
@@ -775,6 +775,21 @@ void GlobalSizingPolicy::iterate()
   int consec_reject = 0;
   resizer_.journalBegin();
   for (int iter = 0; iter < max_iter; ++iter) {
+    // Global sizing only drives WNS upward; once it meets the setup margin
+    // there is no timing left to recover and further sweeps would only spend
+    // area and leakage.
+    const float wns_now = sta::delayAsFloat(sta_->worstSlack(policy_max_));
+    if (sta::fuzzyGreaterEqual(wns_now, lr_params_.setup_slack_margin)) {
+      debugPrint(logger_,
+                 RSZ,
+                 "global_sizing",
+                 1,
+                 "LR stop: WNS {} meets setup margin {}",
+                 sta::delayAsString(wns_now, 3, sta_),
+                 sta::delayAsString(lr_params_.setup_slack_margin, 3, sta_));
+      break;
+    }
+
     if (iter > 0) {
       updateMultipliers(iter_params);
       projectFlowBalance(iter_params);

From 844d2cff936ec476b824c2dff44fb8df04edfd60 Mon Sep 17 00:00:00 2001
From: Eren Dogan <erendogan@google.com>
Date: Sun, 7 Jun 2026 17:38:02 -0700
Subject: [PATCH 6/8] rsz: Exclude clock drivers

Signed-off-by: Eren Dogan <erendogan@google.com>
---
 src/rsz/src/LRSubproblem.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/rsz/src/LRSubproblem.cc b/src/rsz/src/LRSubproblem.cc
index 95263899a1f..8315f9c8170 100644
--- a/src/rsz/src/LRSubproblem.cc
+++ b/src/rsz/src/LRSubproblem.cc
@@ -220,6 +220,9 @@ bool LRSubproblem::snapshot(sta::Instance* inst,
     sta::Pin* pin = pit->next();
     const sta::PortDirection* dir = network_->direction(pin);
     if (dir->isOutput()) {
+      if (sta_->isClock(pin, sta_->cmdMode())) {
+        return false;
+      }
       sta::Vertex* v = graph_->pinDrvrVertex(pin);
       if (v == nullptr) {
         continue;

From 7cabee91a286e678e1ab9df8dc5a5ab0aeb8b29f Mon Sep 17 00:00:00 2001
From: Eren Dogan <erendogan@google.com>
Date: Mon, 8 Jun 2026 20:05:16 -0700
Subject: [PATCH 7/8] rsz: Add downsizing budget

Signed-off-by: Eren Dogan <erendogan@google.com>
---
 src/rsz/src/LRSubproblem.cc              |  60 +++++++++++++
 src/rsz/src/LRSubproblem.hh              |  30 ++++++-
 src/rsz/src/policy/GlobalSizingPolicy.cc | 102 ++++++++++++++++++++++-
 src/rsz/src/policy/GlobalSizingPolicy.hh |  16 ++++
 4 files changed, 202 insertions(+), 6 deletions(-)

diff --git a/src/rsz/src/LRSubproblem.cc b/src/rsz/src/LRSubproblem.cc
index 8315f9c8170..a17c1b1b0ed 100644
--- a/src/rsz/src/LRSubproblem.cc
+++ b/src/rsz/src/LRSubproblem.cc
@@ -29,6 +29,8 @@
 
 namespace rsz {
 
+using utl::RSZ;
+
 namespace {
 
 // Resizer::area(Cell*) is protected. Compute the same value through the public
@@ -192,6 +194,8 @@ bool LRSubproblem::applyReplacement(sta::Instance* inst,
 bool LRSubproblem::snapshot(sta::Instance* inst,
                             const float* lambda,
                             const int lambda_size,
+                            const float* budget,
+                            const int budget_size,
                             GateSnapshot& snap)
 {
   init();
@@ -215,6 +219,12 @@ bool LRSubproblem::snapshot(sta::Instance* inst,
   snap.inputs.clear();
   snap.candidates.clear();
 
+  // Min depth-normalized downsize budget over the kept output pins. The policy
+  // precomputes a per-vertex budget (computeSlackBudgets) from the live slacks;
+  // we just freeze the gate's worst (min) value so workers never touch the STA
+  // graph.
+  float worst_budget = std::numeric_limits<float>::max();
+
   std::unique_ptr<sta::InstancePinIterator> pit(network_->pinIterator(inst));
   while (pit->hasNext()) {
     sta::Pin* pin = pit->next();
@@ -264,6 +274,11 @@ bool LRSubproblem::snapshot(sta::Instance* inst,
                                                   max_mm))
                    : 0.0f;
       o.drive_res = out_port->driveResistance();
+      const sta::VertexId vid = graph_->id(v);
+      const float vbudget = std::cmp_less(vid, budget_size)
+                                ? budget[vid]
+                                : std::numeric_limits<float>::max();
+      worst_budget = std::min(worst_budget, vbudget);
       snap.outputs.push_back(o);
     } else if (dir->isInput()) {
       const sta::LibertyPort* in_port = network_->libertyPort(pin);
@@ -381,6 +396,7 @@ bool LRSubproblem::snapshot(sta::Instance* inst,
   if (snap.outputs.empty()) {
     return false;
   }
+  snap.budget = worst_budget;
 
   // Precompute leakage-equivalent cost for the current cell and every
   // candidate now, on the main thread - leakageOrArea/getSwappableCells mutate
@@ -507,9 +523,44 @@ bool LRSubproblem::candidateDrcOkSnapshot(const GateSnapshot& snap,
   return true;
 }
 
+bool LRSubproblem::downsizeFitsSlackBudget(
+    const GateSnapshot& snap,
+    sta::LibertyCell* replacement,
+    const float safety,
+    sta::ArcDelayCalc* arc_delay_calc) const
+{
+  // snap.budget is the depth-normalized, distributed slack budget.
+  const float budget = safety * snap.budget;
+  if (budget <= 0.0f) {
+    return false;
+  }
+  const sta::Scene* scene = snap.scene;
+  for (const OutputCtx& o : snap.outputs) {
+    if (o.port == nullptr) {
+      continue;
+    }
+    sta::LibertyPort* cand_port = replacement->findLibertyPort(o.port->name());
+    if (cand_port == nullptr) {
+      return false;  // candidate missing this output port - reject
+    }
+    // Δd at the frozen load: extra gate delay the downsize adds on this pin.
+    // Increasing the gate delay by Δd reduces the slack on every path through
+    // the pin by Δd, so Δd must fit the budget.
+    const float d_cur = sta::delayAsFloat(
+        resizer_->gateDelay(o.port, o.load_cap, scene, max_, arc_delay_calc));
+    const float d_cand = sta::delayAsFloat(resizer_->gateDelay(
+        cand_port, o.load_cap, scene, max_, arc_delay_calc));
+    if (d_cand - d_cur > budget) {
+      return false;
+    }
+  }
+  return true;
+}
+
 LRSubproblem::GateDecision LRSubproblem::evaluateSnapshot(
     const GateSnapshot& snap,
     const float timing_weight,
+    const float budget_safety,
     sta::ArcDelayCalc* arc_delay_calc) const
 {
   GateDecision result;
@@ -527,6 +578,15 @@ LRSubproblem::GateDecision LRSubproblem::evaluateSnapshot(
     if (!candidateDrcOkSnapshot(snap, cand.cell)) {
       continue;
     }
+    // Downsize slack-budget guard: a candidate with lower leakage than the
+    // current cell is a downsize; only take it if its added delay fits the
+    // gate's distributed slack budget. Upsizes are unconstrained - they only
+    // improve setup.
+    if (cand.leakage < snap.cur_leakage
+        && !downsizeFitsSlackBudget(
+            snap, cand.cell, budget_safety, arc_delay_calc)) {
+      continue;
+    }
     const float cost = evaluateCellCost(
         snap, cand.cell, cand.leakage, timing_weight, arc_delay_calc);
     if (cost < result.best_cost) {
diff --git a/src/rsz/src/LRSubproblem.hh b/src/rsz/src/LRSubproblem.hh
index 008a27b69b0..fe9bcd88649 100644
--- a/src/rsz/src/LRSubproblem.hh
+++ b/src/rsz/src/LRSubproblem.hh
@@ -125,6 +125,14 @@ class LRSubproblem : public sta::dbStaState
     sta::LibertyCell* cur_cell = nullptr;
     float cur_leakage = 0.0f;
     const sta::Scene* scene = nullptr;
+    // Distributed downsize budget for this gate: the min over its output pins
+    // of the depth-normalized slack budget  max(0, slack - margin) / depth,
+    // frozen on the main thread (computed by the policy's computeSlackBudgets).
+    // A downsize may add at most this much delay on any output pin (times a
+    // safety factor). Because the per-path sum of these budgets is <= the path
+    // slack, simultaneous (Jacobi) downsizes within budget cannot overshoot a
+    // path.
+    float budget = 0.0f;
     std::vector<OutputCtx> outputs;
     std::vector<UpstreamCtx> upstream;
     std::vector<InputMaxCapCtx> inputs;
@@ -153,17 +161,23 @@ class LRSubproblem : public sta::dbStaState
   // MAIN THREAD ONLY. Capture the frozen state needed to evaluate `inst`.
   // Returns false (and leaves `snap` unspecified) when `inst` is don't-touch,
   // has no liberty cell, or has no usable output pin. `lambda` is indexed by
-  // sta::Edge::id (sparse, size `lambda_size`).
+  // sta::Edge::id (sparse, size `lambda_size`). `budget` is the per-vertex
+  // depth-normalized downsize budget indexed by sta::Graph vertex id (size
+  // `budget_size`); the gate's frozen budget is the min over its output pins.
   bool snapshot(sta::Instance* inst,
                 const float* lambda,
                 int lambda_size,
+                const float* budget,
+                int budget_size,
                 GateSnapshot& snap);
 
   // WORKER SAFE. Evaluate the subproblem for a prepared snapshot using the
   // caller-provided per-thread ArcDelayCalc. `timing_weight` scales the Σλ·d
-  // timing term against the leakage objective.
+  // timing term against the leakage objective. `budget_safety` (<= 1) scales
+  // the gate's frozen downsize budget in the feasibility guard.
   GateDecision evaluateSnapshot(const GateSnapshot& snap,
                                 float timing_weight,
+                                float budget_safety,
                                 sta::ArcDelayCalc* arc_delay_calc) const;
 
   // Leakage-equivalent cost for `cell`. Returns Resizer::cellLeakage when
@@ -204,6 +218,18 @@ class LRSubproblem : public sta::dbStaState
   bool candidateDrcOkSnapshot(const GateSnapshot& snap,
                               sta::LibertyCell* replacement) const;
 
+  // Worker-safe downsize feasibility guard over a frozen snapshot. Returns true
+  // iff installing the (lower-leakage) `replacement` adds, on every output pin,
+  // no more delay than `safety * snap.budget`. snap.budget is the depth-
+  // normalized, distributed slack budget frozen by the policy: because the
+  // per-path sum of gate budgets is <= path slack, simultaneous downsizes
+  // within budget cannot overshoot, so no per-gate discount is needed. A gate
+  // with no budget (<= 0) cannot be downsized.
+  bool downsizeFitsSlackBudget(const GateSnapshot& snap,
+                               sta::LibertyCell* replacement,
+                               float safety,
+                               sta::ArcDelayCalc* arc_delay_calc) const;
+
   Resizer* resizer_ = nullptr;
   utl::Logger* logger_ = nullptr;
   sta::dbNetwork* db_network_ = nullptr;
diff --git a/src/rsz/src/policy/GlobalSizingPolicy.cc b/src/rsz/src/policy/GlobalSizingPolicy.cc
index 74b1e7b6549..d92e6865d7e 100644
--- a/src/rsz/src/policy/GlobalSizingPolicy.cc
+++ b/src/rsz/src/policy/GlobalSizingPolicy.cc
@@ -8,6 +8,7 @@
 #include <cstdlib>
 #include <memory>
 #include <optional>
+#include <ranges>
 #include <string>
 #include <unordered_map>
 #include <utility>
@@ -452,6 +453,89 @@ GlobalSizingPolicy::DesignSnap GlobalSizingPolicy::computeDesignSnap() const
   return s;
 }
 
+void GlobalSizingPolicy::computeSlackBudgets()
+{
+  // Per-vertex downsize budget = max(0, slack(v) - margin) / depth(v), where
+  // depth(v) is the gate count on the longest path through v. Distributing by
+  // depth bounds the per-path budget sum by the path slack; using v's own
+  // (worst-path) slack keeps each gate safe on all its paths. Recomputed each
+  // sweep from the live slacks.
+  const size_t n = static_cast<size_t>(graph_->vertexCount()) + 1;
+
+  std::vector<sta::Vertex*> vertices;
+  {
+    sta::VertexIterator vit(graph_);
+    while (vit.hasNext()) {
+      vertices.push_back(vit.next());
+    }
+  }
+  std::ranges::sort(vertices, [](const sta::Vertex* a, const sta::Vertex* b) {
+    return a->level() < b->level();
+  });
+
+  // A gate-internal (cell) arc has both pins on the same leaf instance; only
+  // these add a gate-delay term to a path, so only these increment the depth.
+  auto is_gate_arc = [this](sta::Edge* e) {
+    const sta::Instance* fi = network_->instance(e->from(graph_)->pin());
+    const sta::Instance* ti = network_->instance(e->to(graph_)->pin());
+    return fi != nullptr && fi == ti;
+  };
+
+  // Forward pass (increasing level): gates from a source up to and including v.
+  std::vector<int> fwd(n, 0);
+  for (sta::Vertex* v : vertices) {
+    int best = 0;
+    sta::VertexInEdgeIterator ieit(v, graph_);
+    while (ieit.hasNext()) {
+      sta::Edge* e = ieit.next();
+      if (!isDataArc(e)) {
+        continue;
+      }
+      const sta::VertexId uid = graph_->id(e->from(graph_));
+      best = std::max(best, fwd[uid] + (is_gate_arc(e) ? 1 : 0));
+    }
+    fwd[graph_->id(v)] = best;
+  }
+
+  // Backward pass (decreasing level): gates from v (exclusive) to a sink.
+  std::vector<int> bwd(n, 0);
+  for (sta::Vertex* v : std::views::reverse(vertices)) {
+    int best = 0;
+    sta::VertexOutEdgeIterator oeit(v, graph_);
+    while (oeit.hasNext()) {
+      sta::Edge* e = oeit.next();
+      if (!isDataArc(e)) {
+        continue;
+      }
+      const sta::VertexId wid = graph_->id(e->to(graph_));
+      best = std::max(best, bwd[wid] + (is_gate_arc(e) ? 1 : 0));
+    }
+    bwd[graph_->id(v)] = best;
+  }
+
+  const float margin = lr_params_.setup_slack_margin;
+  const float kSlackSentinel = 1e6f;
+  vertex_budget_.assign(n, 0.0f);
+  for (sta::Vertex* v : vertices) {
+    const sta::VertexId vid = graph_->id(v);
+    const int depth = std::max(1, fwd[vid] + bwd[vid]);
+    const float slack = sta::delayAsFloat(sta_->slack(v, policy_max_));
+    // Unconstrained vertices (no real required time) report a sentinel slack;
+    // treat their budget as unlimited so genuinely free gates can downsize.
+    vertex_budget_[vid]
+        = (slack >= kSlackSentinel)
+              ? kSlackSentinel
+              : std::max(0.0f, slack - margin) / static_cast<float>(depth);
+  }
+  debugPrint(logger_,
+             RSZ,
+             "global_sizing",
+             2,
+             "LR budgets: {} vertices, margin={}",
+             n - 1,
+             sta::delayAsString(margin, 3, sta_));
+}
+
 std::vector<LRSubproblem::GateSnapshot> GlobalSizingPolicy::buildSnapshots()
 {
   // Phase A (main thread, delays valid): freeze each evaluable gate's
@@ -459,13 +543,19 @@ std::vector<LRSubproblem::GateSnapshot> GlobalSizingPolicy::buildSnapshots()
   // getSwappableCells / cellLeakage / net-driver caches, so the subsequent
   // parallel phase touches none of them.
   const int lambda_size = static_cast<int>(lambda_.size());
+  const int budget_size = static_cast<int>(vertex_budget_.size());
   std::vector<LRSubproblem::GateSnapshot> snapshots;
   std::unique_ptr<sta::LeafInstanceIterator> iit(
       network_->leafInstanceIterator());
   while (iit->hasNext()) {
     sta::Instance* inst = iit->next();
     LRSubproblem::GateSnapshot snap;
-    if (subproblem_->snapshot(inst, lambda_.data(), lambda_size, snap)) {
+    if (subproblem_->snapshot(inst,
+                              lambda_.data(),
+                              lambda_size,
+                              vertex_budget_.data(),
+                              budget_size,
+                              snap)) {
       snapshots.push_back(std::move(snap));
     }
   }
@@ -549,18 +639,22 @@ GlobalSizingPolicy::SweepStats GlobalSizingPolicy::applyDecisions(
 GlobalSizingPolicy::SweepStats GlobalSizingPolicy::singleSweep(
     const float timing_weight)
 {
-  // Phase A: Freeze per-gate state.
+  // Phase A: Distribute the slack into per-vertex budgets, then freeze per-gate
+  // state (which reads those budgets).
+  computeSlackBudgets();
   std::vector<LRSubproblem::GateSnapshot> snapshots = buildSnapshots();
 
   // Phase B: Score every snapshot independently. Each worker uses its own
   // ArcDelayCalc copy (arc_delay_calc_ is single-threaded shared state); the
   // copy is cached per worker thread and refreshed if the source changes. With
   // a zero-worker pool this runs inline on the calling thread.
+  const float safety = lr_params_.budget_safety_factor;
   sta::ArcDelayCalc* const src = sta_->arcDelayCalc();
   const std::vector<LRSubproblem::GateDecision> decisions
       = thread_pool_->parallelMap(
           snapshots,
-          [this, timing_weight, src](const LRSubproblem::GateSnapshot& snap) {
+          [this, timing_weight, safety, src](
+              const LRSubproblem::GateSnapshot& snap) {
             static thread_local sta::ArcDelayCalc* cached_src = nullptr;
             static thread_local std::unique_ptr<sta::ArcDelayCalc> adc;
             if (adc == nullptr || cached_src != src) {
@@ -568,7 +662,7 @@ GlobalSizingPolicy::SweepStats GlobalSizingPolicy::singleSweep(
               cached_src = src;
             }
             return subproblem_->evaluateSnapshot(
-                snap, timing_weight, adc.get());
+                snap, timing_weight, safety, adc.get());
           });
 
   // Phase C: Apply accepted moves serially.
diff --git a/src/rsz/src/policy/GlobalSizingPolicy.hh b/src/rsz/src/policy/GlobalSizingPolicy.hh
index 1b0f1bf1de5..3ffdb5a8df7 100644
--- a/src/rsz/src/policy/GlobalSizingPolicy.hh
+++ b/src/rsz/src/policy/GlobalSizingPolicy.hh
@@ -45,6 +45,11 @@ struct LRParams
   // Dimensionless balance between timing pressure and leakage cost.
   // bias = 1.0 keeps Σλ·d (scaled) ≈ leakage cost on the median gate.
   float timing_bias = 64.0f;
+  // Safety derate (<= 1) on the per-gate distributed downsize budget. The
+  // depth-normalized distribution already guarantees per-path budget sums
+  // <= path slack, so 1.0 is feasible in theory; a value < 1 adds margin for
+  // the un-modeled slew cascade / estimated-vs-routed parasitic gap.
+  float budget_safety_factor = 1.0f;
 };
 
 // GlobalSizingPolicy: Lagrangian-Relaxation-driven global sizing + Vt
@@ -102,6 +107,14 @@ class GlobalSizingPolicy : public OptimizationPolicy
   // after this returns.
   SweepStats singleSweep(float timing_weight);
 
+  // Phase A pre-pass: Compute the per-vertex depth-normalized downsize budget
+  //   budget(v) = max(0, slack(v) - margin) / depth(v)
+  // where depth(v) is the gate count on the longest path through v.
+  // Distributing by depth guarantees the per-path sum of budgets <= path slack,
+  // while using each vertex's own (worst-path) slack keeps every gate safe on
+  // all its paths.
+  void computeSlackBudgets();
+
   // Phase A: Capture the frozen per-gate snapshots for every evaluable leaf
   // instance, in a stable order. Reads live STA and warms the lazy
   // Liberty/dbNetwork caches on the main thread.
@@ -138,6 +151,9 @@ class GlobalSizingPolicy : public OptimizationPolicy
 
   // Per-edge multipliers, indexed by sta::Edge::id (sparse)
   std::vector<float> lambda_;
+  // Per-vertex depth-normalized downsize budget, indexed by sta::Graph vertex
+  // id. Rebuilt each sweep by computeSlackBudgets().
+  std::vector<float> vertex_budget_;
   // Per-endpoint multipliers, indexed by a dense endpoint index
   std::vector<float> mu_;
   // Dense endpoint bookkeeping

From 0b9d52c05557d81a0890631c0103507bcc270b5c Mon Sep 17 00:00:00 2001
From: Eren Dogan <erendogan@google.com>
Date: Mon, 8 Jun 2026 21:57:48 -0700
Subject: [PATCH 8/8] rsz: Fix vertex id overflow

Signed-off-by: Eren Dogan <erendogan@google.com>
---
 src/rsz/src/policy/GlobalSizingPolicy.cc | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/rsz/src/policy/GlobalSizingPolicy.cc b/src/rsz/src/policy/GlobalSizingPolicy.cc
index d92e6865d7e..5ffb06a0fe7 100644
--- a/src/rsz/src/policy/GlobalSizingPolicy.cc
+++ b/src/rsz/src/policy/GlobalSizingPolicy.cc
@@ -460,15 +460,17 @@ void GlobalSizingPolicy::computeSlackBudgets()
   // depth bounds the per-path budget sum by the path slack; using v's own
   // (worst-path) slack keeps each gate safe on all its paths. Recomputed each
   // sweep from the live slacks.
-  const size_t n = static_cast<size_t>(graph_->vertexCount()) + 1;
-
   std::vector<sta::Vertex*> vertices;
+  size_t max_id = 0;
   {
     sta::VertexIterator vit(graph_);
     while (vit.hasNext()) {
-      vertices.push_back(vit.next());
+      sta::Vertex* v = vit.next();
+      vertices.push_back(v);
+      max_id = std::max(max_id, static_cast<size_t>(graph_->id(v)));
     }
   }
+  const size_t n = max_id + 1;
   std::ranges::sort(vertices, [](const sta::Vertex* a, const sta::Vertex* b) {
     return a->level() < b->level();
   });
@@ -531,8 +533,9 @@ void GlobalSizingPolicy::computeSlackBudgets()
              RSZ,
              "global_sizing",
              2,
-             "LR budgets: {} vertices, margin={}",
-             n - 1,
+             "LR budgets: {} vertices (max id {}), margin={}",
+             vertices.size(),
+             max_id,
              sta::delayAsString(margin, 3, sta_));
 }