From 9cf00eb012c808a16911c7b5ed3a843abbb39e1a Mon Sep 17 00:00:00 2001 From: Eren Dogan Date: Fri, 5 Jun 2026 12:54:51 -0700 Subject: [PATCH 1/8] rsz: Add global sizing Signed-off-by: Eren Dogan --- src/rsz/BUILD | 4 + src/rsz/include/rsz/Resizer.hh | 17 + src/rsz/src/CMakeLists.txt | 2 + src/rsz/src/LRSubproblem.cc | 544 ++++++ src/rsz/src/LRSubproblem.hh | 221 +++ src/rsz/src/Optimizer.cc | 9 +- src/rsz/src/Resizer.cc | 41 +- src/rsz/src/policy/GlobalSizingPolicy.cc | 922 +++++++++ src/rsz/src/policy/GlobalSizingPolicy.hh | 152 ++ src/rsz/test/BUILD | 3 + src/rsz/test/CMakeLists.txt | 2 + src/rsz/test/global_sizing.tcl | 35 + src/rsz/test/global_sizing.vok | 2019 ++++++++++++++++++++ src/rsz/test/global_sizing_threads.tcl | 7 + src/rsz/test/repair_setup_invalid_phase.ok | 4 +- 15 files changed, 3969 insertions(+), 13 deletions(-) create mode 100644 src/rsz/src/LRSubproblem.cc create mode 100644 src/rsz/src/LRSubproblem.hh create mode 100644 src/rsz/src/policy/GlobalSizingPolicy.cc create mode 100644 src/rsz/src/policy/GlobalSizingPolicy.hh create mode 100644 src/rsz/test/global_sizing.tcl create mode 100644 src/rsz/test/global_sizing.vok create mode 100644 src/rsz/test/global_sizing_threads.tcl diff --git a/src/rsz/BUILD b/src/rsz/BUILD index a862f2a97bb..f816b03e1db 100644 --- a/src/rsz/BUILD +++ b/src/rsz/BUILD @@ -23,6 +23,8 @@ cc_library( "src/DelayEstimator.hh", "src/DelayEstimatorReporter.cc", "src/DelayEstimatorReporter.hh", + "src/LRSubproblem.cc", + "src/LRSubproblem.hh", "src/MoveCommitter.cc", "src/MoveCommitter.hh", "src/MoveTracker.cc", @@ -104,6 +106,8 @@ cc_library( "src/move/VtSwapMtCandidate.hh", "src/move/VtSwapMtGenerator.cc", "src/move/VtSwapMtGenerator.hh", + "src/policy/GlobalSizingPolicy.cc", + "src/policy/GlobalSizingPolicy.hh", "src/policy/MeasuredVtSwapPolicy.cc", "src/policy/MeasuredVtSwapPolicy.hh", "src/policy/OptimizationPolicy.cc", diff --git a/src/rsz/include/rsz/Resizer.hh b/src/rsz/include/rsz/Resizer.hh index 
ffe5cc4405d..ad43a93241a 100644 --- a/src/rsz/include/rsz/Resizer.hh +++ b/src/rsz/include/rsz/Resizer.hh @@ -690,6 +690,16 @@ class Resizer : public sta::dbStaState, public sta::dbNetworkObserver // Return values. sta::ArcDelay delays[sta::RiseFall::index_count], sta::Slew slews[sta::RiseFall::index_count]); + // Worker-safe overload: uses the caller-provided ArcDelayCalc instead of the + // shared member, so the table-model lookup can run concurrently. + void gateDelays(const sta::LibertyPort* drvr_port, + float load_cap, + const sta::Scene* scene, + const sta::MinMax* min_max, + sta::ArcDelayCalc* arc_delay_calc, + // Return values. + sta::ArcDelay delays[sta::RiseFall::index_count], + sta::Slew slews[sta::RiseFall::index_count]); void gateDelays(const sta::LibertyPort* drvr_port, float load_cap, const sta::Slew in_slews[sta::RiseFall::index_count], @@ -702,6 +712,12 @@ class Resizer : public sta::dbStaState, public sta::dbNetworkObserver float load_cap, const sta::Scene* scene, const sta::MinMax* min_max); + // Worker-safe overload (see gateDelays above). 
+ sta::ArcDelay gateDelay(const sta::LibertyPort* drvr_port, + float load_cap, + const sta::Scene* scene, + const sta::MinMax* min_max, + sta::ArcDelayCalc* arc_delay_calc); sta::ArcDelay gateDelay(const sta::LibertyPort* drvr_port, const sta::RiseFall* rf, float load_cap, @@ -1028,6 +1044,7 @@ class Resizer : public sta::dbStaState, public sta::dbNetworkObserver friend class OdbCallBack; friend class SetupLegacyBase; friend class RepairTargetCollector; + friend class LRSubproblem; friend class DelayEstimatorReporter; }; diff --git a/src/rsz/src/CMakeLists.txt b/src/rsz/src/CMakeLists.txt index 7ebe9fbceeb..43dbdd2774a 100644 --- a/src/rsz/src/CMakeLists.txt +++ b/src/rsz/src/CMakeLists.txt @@ -28,6 +28,7 @@ add_library(rsz_lib policy/MeasuredVtSwapPolicy.cc DelayEstimator.cc DelayEstimatorReporter.cc + LRSubproblem.cc policy/SetupCritVtSwapPolicy.cc policy/SetupDirectionalPolicy.cc policy/SetupLastGaspPolicy.cc @@ -36,6 +37,7 @@ add_library(rsz_lib policy/SetupWnsPolicy.cc policy/SetupMt1Policy.cc policy/SetupReroutePolicy.cc + policy/GlobalSizingPolicy.cc Resizer.cc OdbCallBack.cc ConcreteSwapArithModules.cc diff --git a/src/rsz/src/LRSubproblem.cc b/src/rsz/src/LRSubproblem.cc new file mode 100644 index 00000000000..34eac8cd2e8 --- /dev/null +++ b/src/rsz/src/LRSubproblem.cc @@ -0,0 +1,544 @@ +// SPDX-License-Identifier: BSD-3-Clause +// Copyright (c) 2026-2026, The OpenROAD Authors + +#include "LRSubproblem.hh" + +#include +#include +#include +#include +#include +#include + +#include "db_sta/dbNetwork.hh" +#include "db_sta/dbSta.hh" +#include "odb/db.h" +#include "rsz/Resizer.hh" +#include "sta/Delay.hh" +#include "sta/Graph.hh" +#include "sta/GraphDelayCalc.hh" +#include "sta/Liberty.hh" +#include "sta/LibertyClass.hh" +#include "sta/Network.hh" +#include "sta/NetworkClass.hh" +#include "sta/PortDirection.hh" +#include "sta/Scene.hh" +#include "sta/Sta.hh" +#include "sta/TimingRole.hh" +#include "sta/Transition.hh" + +namespace rsz { + +namespace { + +// 
Resizer::area(Cell*) is protected. Compute the same value through the public
+// dbuToMeters + db_network->staToDb pair so we don't friend-pierce the Resizer
+// class. Matches Resizer::area(dbMaster*) exactly.
+double cellAreaSI(const Resizer& resizer,
+                  sta::dbNetwork* db_network,
+                  sta::LibertyCell* cell)
+{
+  if (cell == nullptr) {
+    return 0.0;
+  }
+  odb::dbMaster* master = db_network->staToDb(db_network->cell(cell));
+  if (master == nullptr || !master->isCoreAutoPlaceable()) {
+    return 0.0;
+  }
+  return resizer.dbuToMeters(master->getWidth())
+         * resizer.dbuToMeters(master->getHeight());
+}
+
+// File-local output-side DRC helpers. Mirrors the pattern used by
+// SizeDownGenerator.cc where similar checks live as file-local statics.
+// Polarity: return true when the proposed replacement would introduce a
+// violation. Both are pure Liberty/SDC reads and so are safe to call from
+// worker threads.
+
+bool checkOutputMaxCap(sta::LibertyPort* output_port,
+                       const float output_cap,
+                       const sta::MinMax* max_mm)
+{
+  float max_cap = 0.0f;
+  bool cap_limit_exists = false;
+  output_port->capacitanceLimit(max_mm, max_cap, cap_limit_exists);
+  return cap_limit_exists && max_cap > 0.0f && output_cap > max_cap;
+}
+
+bool checkOutputMaxSlew(sta::dbSta* sta,
+                        sta::LibertyPort* candidate_port,
+                        const float output_slew_factor,
+                        const float output_cap,
+                        const sta::Scene* scene,
+                        const sta::MinMax* max_mm)
+{
+  const float new_slew
+      = output_slew_factor * candidate_port->driveResistance() * output_cap;
+  float max_slew = 0.0f;
+  bool slew_limit_exists = false;
+  sta->findSlewLimit(
+      candidate_port, scene, max_mm, max_slew, slew_limit_exists);
+  return slew_limit_exists && new_slew > max_slew;
+}
+
+}  // namespace
+
+LRSubproblem::LRSubproblem(Resizer* resizer) : resizer_(resizer)
+{
+}
+
+void LRSubproblem::init()
+{
+  if (initialized_) {
+    return;
+  }
+  logger_ = resizer_->logger();
+  dbStaState::init(resizer_->sta());
+  db_network_ = resizer_->dbNetwork();
+  computeLeakageScale();
+  initialized_ = true;
+}
+
+void LRSubproblem::computeLeakageScale()
+{
+  // Build (leakage, area) pairs for instances whose current cell has both.
+  std::vector<float> leakages;
+  std::vector<float> areas;
+  std::unique_ptr<sta::LeafInstanceIterator> iit(
+      network_->leafInstanceIterator());
+  while (iit->hasNext()) {
+    sta::Instance* inst = iit->next();
+    sta::LibertyCell* cell = network_->libertyCell(inst);
+    if (cell == nullptr) {
+      continue;
+    }
+    const std::optional<float> leak = resizer_->cellLeakage(cell);
+    if (!leak.has_value()) {
+      continue;
+    }
+    const double a = cellAreaSI(*resizer_, db_network_, cell);
+    if (a <= 0.0) {
+      continue;
+    }
+    leakages.push_back(*leak);
+    areas.push_back(static_cast<float>(a));
+  }
+
+  if (leakages.empty()) {
+    // Degenerate: no instance exposes leakage. leakageOrArea will return
+    // raw area, which is order-preserving within this design.
+    area_to_leakage_scale_ = 0.0f;
+    return;
+  }
+
+  const auto mid = leakages.size() / 2;
+  std::nth_element(leakages.begin(), leakages.begin() + mid, leakages.end());
+  const float l_med = leakages[mid];
+  std::nth_element(areas.begin(), areas.begin() + mid, areas.end());
+  const float a_med = areas[mid];
+
+  area_to_leakage_scale_ = (a_med > 0.0f) ? (l_med / a_med) : 0.0f;
+}
+
+float LRSubproblem::leakageOrArea(sta::LibertyCell* cell) const
+{
+  const std::optional<float> leak = resizer_->cellLeakage(cell);
+  if (leak.has_value()) {
+    return *leak;
+  }
+  const float a = static_cast<float>(cellAreaSI(*resizer_, db_network_, cell));
+  return area_to_leakage_scale_ > 0.0f ?
area_to_leakage_scale_ * a : a;
+}
+
+bool LRSubproblem::isDataArc(const sta::Edge* edge) const
+{
+  const sta::TimingRole* role = edge->role();
+  if (role->isTimingCheck()) {
+    return false;
+  }
+  if (edge->isDisabledLoop()) {
+    return false;
+  }
+  if (role == sta::TimingRole::latchDtoQ()
+      || role == sta::TimingRole::latchEnToQ()) {
+    return false;
+  }
+  return true;
+}
+
+float LRSubproblem::portInputCap(sta::LibertyCell* cell,
+                                 const char* port_name) const
+{
+  sta::LibertyPort* port = cell->findLibertyPort(port_name);
+  if (port == nullptr) {
+    return 0.0f;
+  }
+  float cap = 0.0f;
+  for (auto rf : sta::RiseFall::range()) {
+    cap = std::max(cap, port->capacitance(rf, max_));
+  }
+  return cap;
+}
+
+bool LRSubproblem::applyReplacement(sta::Instance* inst,
+                                    sta::LibertyCell* replacement)
+{
+  if (inst == nullptr || replacement == nullptr) {
+    return false;
+  }
+  return resizer_->replaceCell(inst, replacement, /*journal=*/true);
+}
+
+bool LRSubproblem::snapshot(sta::Instance* inst,
+                            const float* lambda,
+                            const int lambda_size,
+                            GateSnapshot& snap)
+{
+  init();
+
+  if (resizer_->dontTouch(inst)) {
+    return false;
+  }
+  sta::LibertyCell* cur_cell = network_->libertyCell(inst);
+  if (cur_cell == nullptr) {
+    return false;
+  }
+
+  const sta::Scene* scene = sta_->cmdScene();
+  const sta::MinMax* max_mm = max_;
+
+  snap.inst = inst;
+  snap.cur_cell = cur_cell;
+  snap.scene = scene;
+  snap.outputs.clear();
+  snap.upstream.clear();
+  snap.inputs.clear();
+  snap.candidates.clear();
+
+  std::unique_ptr<sta::InstancePinIterator> pit(network_->pinIterator(inst));
+  while (pit->hasNext()) {
+    sta::Pin* pin = pit->next();
+    const sta::PortDirection* dir = network_->direction(pin);
+    if (dir->isOutput()) {
+      sta::Vertex* v = graph_->pinDrvrVertex(pin);
+      if (v == nullptr) {
+        continue;
+      }
+      const sta::LibertyPort* out_port = network_->libertyPort(pin);
+      if (out_port == nullptr) {
+        continue;
+      }
+      float lam_sum = 0.0f;
+      sta::VertexInEdgeIterator ieit(v, graph_);
+      while (ieit.hasNext()) {
+        sta::Edge* e = ieit.next();
+        if (!isDataArc(e)) {
+          continue;
+        }
+        // Restrict to gate-internal arcs (from a pin on the same instance).
+        const sta::Pin* from_pin = e->from(graph_)->pin();
+        if (network_->instance(from_pin) != inst) {
+          continue;
+        }
+        const sta::EdgeId id = graph_->id(e);
+        if (static_cast<int>(id) >= lambda_size) {
+          continue;
+        }
+        lam_sum += lambda[id];
+      }
+      OutputCtx o;
+      o.port = out_port;
+      o.load_cap = graph_delay_calc_->loadCap(pin, scene, max_mm);
+      o.lambda_sum = lam_sum;
+      // Freeze the Elmore-slew DRC inputs. slew is the STA graph slew at the
+      // output pin's load vertex; it is constant across candidates, so we read
+      // it once here on the main thread.
+      sta::Vertex* load_v = graph_->pinLoadVertex(pin);
+      o.slew = (load_v != nullptr)
+                   ? sta::delayAsFloat(sta_->slew(load_v,
+                                                  sta::RiseFallBoth::riseFall(),
+                                                  sta_->scenes(),
+                                                  max_mm))
+                   : 0.0f;
+      o.drive_res = out_port->driveResistance();
+      snap.outputs.push_back(o);
+    } else if (dir->isInput()) {
+      const sta::LibertyPort* in_port = network_->libertyPort(pin);
+
+      // (a) Input-side max-cap DRC context for every input pin: freeze each
+      // fanin driver's current cap-check so workers can replay
+      // Resizer::replacementPreservesMaxCap without touching live STA.
+      if (in_port != nullptr) {
+        sta::PinSet* drivers = network_->drivers(pin);
+        if (drivers != nullptr) {
+          InputMaxCapCtx in_ctx;
+          in_ctx.in_port = in_port;
+          in_ctx.old_cap = portInputCap(cur_cell, in_port->name().c_str());
+          for (const sta::Pin* driver_pin : *drivers) {
+            float cap = 0.0f;
+            float max_cap = 0.0f;
+            float cap_slack = 0.0f;
+            const sta::RiseFall* tr = nullptr;
+            const sta::Scene* corner = nullptr;
+            sta_->checkCapacitance(driver_pin,
+                                   sta_->scenes(),
+                                   max_mm,
+                                   cap,
+                                   max_cap,
+                                   cap_slack,
+                                   tr,
+                                   corner);
+            DriverCapCheck dc;
+            dc.cap = cap;
+            dc.max_cap = max_cap;
+            dc.cap_slack = cap_slack;
+            dc.corner_ok = (max_cap > 0.0f && corner != nullptr);
+            in_ctx.drivers.push_back(dc);
+          }
+          snap.inputs.push_back(std::move(in_ctx));
+        }
+      }
+
+      // (b) Upstream-Cin context: only input pins with real upstream pressure.
+      sta::Vertex* in_v = graph_->pinLoadVertex(pin);
+      if (in_v == nullptr) {
+        continue;
+      }
+      // Locate the driver pin via the wire arc(s) feeding in_v. There's
+      // typically exactly one; take the first valid one.
+      sta::Pin* drv_pin = nullptr;
+      sta::VertexInEdgeIterator wireIt(in_v, graph_);
+      while (wireIt.hasNext()) {
+        sta::Edge* w = wireIt.next();
+        if (w->isDisabledLoop()) {
+          continue;
+        }
+        sta::Pin* candidate_drv = w->from(graph_)->pin();
+        if (candidate_drv != nullptr && candidate_drv != pin) {
+          drv_pin = candidate_drv;
+          break;
+        }
+      }
+      if (drv_pin == nullptr) {
+        continue;  // floating / no driver
+      }
+      sta::Instance* upstream_inst = network_->instance(drv_pin);
+      if (upstream_inst == nullptr || upstream_inst == inst) {
+        continue;
+      }
+      sta::LibertyCell* upstream_cell = network_->libertyCell(upstream_inst);
+      if (upstream_cell == nullptr) {
+        // PI / hierarchical / black box - no Liberty model to evaluate.
+        continue;
+      }
+      sta::LibertyPort* drv_port = network_->libertyPort(drv_pin);
+      if (drv_port == nullptr) {
+        continue;
+      }
+      sta::Vertex* drv_v = graph_->pinDrvrVertex(drv_pin);
+      if (drv_v == nullptr) {
+        continue;
+      }
+      // Sum λ over U's gate-internal data arcs terminating at drv_pin.
+      float lam_U = 0.0f;
+      sta::VertexInEdgeIterator drvIt(drv_v, graph_);
+      while (drvIt.hasNext()) {
+        sta::Edge* e = drvIt.next();
+        if (!isDataArc(e)) {
+          continue;
+        }
+        const sta::Pin* from_pin = e->from(graph_)->pin();
+        if (network_->instance(from_pin) != upstream_inst) {
+          continue;
+        }
+        const sta::EdgeId id = graph_->id(e);
+        if (static_cast<int>(id) >= lambda_size) {
+          continue;
+        }
+        lam_U += lambda[id];
+      }
+      // Skip pins with no real upstream pressure - saves the per-candidate
+      // gateDelay call for arcs whose λ is essentially at floor anyway.
+      if (lam_U <= 0.0f) {
+        continue;
+      }
+      if (in_port == nullptr) {
+        continue;
+      }
+      UpstreamCtx u;
+      u.orig_in_port = in_port;
+      u.drv_port = drv_port;
+      u.load_U_cur = graph_delay_calc_->loadCap(drv_pin, scene, max_mm);
+      u.c_in_cur = portInputCap(cur_cell, in_port->name().c_str());
+      u.lambda_U_drv = lam_U;
+      snap.upstream.push_back(u);
+    }
+  }
+
+  if (snap.outputs.empty()) {
+    return false;
+  }
+
+  // Precompute leakage-equivalent cost for the current cell and every
+  // candidate now, on the main thread - leakageOrArea/getSwappableCells mutate
+  // lazy caches and must not be touched from workers.
+  snap.cur_leakage = leakageOrArea(cur_cell);
+  sta::LibertyCellSeq candidates = resizer_->getSwappableCells(cur_cell);
+  snap.candidates.reserve(candidates.size());
+  for (sta::LibertyCell* cand : candidates) {
+    if (cand == cur_cell) {
+      continue;
+    }
+    Candidate c;
+    c.cell = cand;
+    c.leakage = leakageOrArea(cand);
+    snap.candidates.push_back(c);
+  }
+
+  return true;
+}
+
+float LRSubproblem::evaluateCellCost(const GateSnapshot& snap,
+                                     sta::LibertyCell* cell,
+                                     const float cell_leakage,
+                                     const float timing_weight,
+                                     sta::ArcDelayCalc* arc_delay_calc) const
+{
+  float cost = cell_leakage;
+  const sta::Scene* scene = snap.scene;
+  // Output-cone term: arcs that terminate at this instance's output pins.
+  for (const OutputCtx& o : snap.outputs) {
+    if (o.lambda_sum == 0.0f || o.port == nullptr) {
+      continue;  // no timing pressure on this output pin
+    }
+    sta::LibertyPort* cand_port = cell->findLibertyPort(o.port->name());
+    if (cand_port == nullptr) {
+      // Candidate cell missing this output port - reject via huge cost.
+      return std::numeric_limits<float>::infinity();
+    }
+    const float d = sta::delayAsFloat(resizer_->gateDelay(
+        cand_port, o.load_cap, scene, max_, arc_delay_calc));
+    cost += timing_weight * o.lambda_sum * d;
+  }
+  // Upstream-Cin term: arcs inside each upstream driver U that terminate
+  // at the driver pin feeding one of inst's input pins. Their delay
+  // depends on the load U drives, which includes inst's input capacitance
+  // on that pin. Substituting the candidate's input cap perturbs the
+  // upstream's load and shifts its delay.
+  for (const UpstreamCtx& u : snap.upstream) {
+    if (u.lambda_U_drv == 0.0f || u.drv_port == nullptr
+        || u.orig_in_port == nullptr) {
+      continue;
+    }
+    const float c_in_cand = portInputCap(cell, u.orig_in_port->name().c_str());
+    if (c_in_cand == 0.0f) {
+      // Candidate missing this input port - incompatible.
+      return std::numeric_limits<float>::infinity();
+    }
+    float load_pert = u.load_U_cur - u.c_in_cur + c_in_cand;
+    if (load_pert < 0.0f) {
+      // Numerical safety: extreme C_in mismatches can push the perturbed
+      // load slightly negative. Clamp at zero rather than rejecting; the
+      // gateDelay LUT is well-defined at zero load.
+      load_pert = 0.0f;
+    }
+    const float d_U = sta::delayAsFloat(resizer_->gateDelay(
+        u.drv_port, load_pert, scene, max_, arc_delay_calc));
+    cost += timing_weight * u.lambda_U_drv * d_U;
+  }
+  return cost;
+}
+
+bool LRSubproblem::candidateDrcOkSnapshot(const GateSnapshot& snap,
+                                          sta::LibertyCell* replacement) const
+{
+  // Input-side: reject if a fanin net's max-cap would be violated (or made
+  // worse) by the new cell's larger input pin cap. Mirrors
+  // Resizer::replacementPreservesMaxCap / checkMaxCapOK against the frozen
+  // per-driver cap checks captured in snapshot().
+  for (const InputMaxCapCtx& in : snap.inputs) {
+    if (in.in_port == nullptr) {
+      continue;
+    }
+    const float new_cap = portInputCap(replacement, in.in_port->name().c_str());
+    const float cap_delta = new_cap - in.old_cap;
+    if (cap_delta <= 0.0f) {
+      continue;
+    }
+    for (const DriverCapCheck& dc : in.drivers) {
+      if (!dc.corner_ok) {
+        continue;
+      }
+      const float ncap = dc.cap + cap_delta;
+      if (dc.cap_slack < 0.0f) {
+        if (ncap > dc.cap) {
+          return false;
+        }
+      } else if (ncap > dc.max_cap) {
+        return false;
+      }
+    }
+  }
+
+  // Output-side: per-output-pin check against the new cell's cap/slew limits.
+  for (const OutputCtx& o : snap.outputs) {
+    if (o.port == nullptr) {
+      continue;
+    }
+    sta::LibertyPort* cand_port = replacement->findLibertyPort(o.port->name());
+    if (cand_port == nullptr) {
+      return false;  // candidate missing this output port - reject
+    }
+
+    if (checkOutputMaxCap(cand_port, o.load_cap, max_)) {
+      return false;
+    }
+
+    const float slew_factor = (o.drive_res > 0.0f && o.load_cap > 0.0f)
+                                  ? o.slew / (o.drive_res * o.load_cap)
+                                  : 0.0f;
+    if (checkOutputMaxSlew(
+            sta_, cand_port, slew_factor, o.load_cap, snap.scene, max_)) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+LRSubproblem::GateDecision LRSubproblem::evaluateSnapshot(
+    const GateSnapshot& snap,
+    const float timing_weight,
+    sta::ArcDelayCalc* arc_delay_calc) const
+{
+  GateDecision result;
+  result.inst = snap.inst;
+
+  // Baseline cost with the current cell.
+  result.baseline_cost = evaluateCellCost(
+      snap, snap.cur_cell, snap.cur_leakage, timing_weight, arc_delay_calc);
+  result.best_cost = result.baseline_cost;
+  float best_leak = snap.cur_leakage;
+
+  for (const Candidate& cand : snap.candidates) {
+    // Hard DRC filter: reject any candidate that would introduce a max-cap
+    // or max-slew violation.
+    if (!candidateDrcOkSnapshot(snap, cand.cell)) {
+      continue;
+    }
+    const float cost = evaluateCellCost(
+        snap, cand.cell, cand.leakage, timing_weight, arc_delay_calc);
+    if (cost < result.best_cost) {
+      result.best_cost = cost;
+      result.best_cell = cand.cell;
+      best_leak = cand.leakage;
+    }
+  }
+
+  if (result.best_cell != nullptr) {
+    result.best_is_downsize = best_leak < snap.cur_leakage;
+  }
+  return result;
+}
+
+}  // namespace rsz
diff --git a/src/rsz/src/LRSubproblem.hh b/src/rsz/src/LRSubproblem.hh
new file mode 100644
index 00000000000..008a27b69b0
--- /dev/null
+++ b/src/rsz/src/LRSubproblem.hh
@@ -0,0 +1,221 @@
+// SPDX-License-Identifier: BSD-3-Clause
+// Copyright (c) 2026-2026, The OpenROAD Authors
+
+#pragma once
+
+#include <vector>
+
+#include "db_sta/dbNetwork.hh"
+#include "db_sta/dbSta.hh"
+#include "rsz/Resizer.hh"
+#include "sta/Liberty.hh"
+#include "sta/MinMax.hh"
+#include "sta/NetworkClass.hh"
+#include "utl/Logger.h"
+
+namespace sta {
+class ArcDelayCalc;
+class Edge;
+class Pin;
+class Scene;
+}  // namespace sta
+
+namespace rsz {
+
+class Resizer;
+
+// LRSubproblem: Evaluates the per-gate Lagrangian subproblem
+//
+//   minimize_{x ∈ S_i}
+//       leakage(x)
+//     + Σ_{e ∈
out(i)} λ_e · d_e(x)
+//     + Σ_{p ∈ inputs(i)} Σ_{e ∈ arcs_to_drv(p)} λ_e · d_e(U, load_perturbed)
+//
+// where load_perturbed = load_U - C_in(current x_i, p) + C_in(candidate, p).
+// The first sum prices the gate's own internal arcs; the second prices the
+// upstream driver U's delay change caused by varying the candidate's input
+// capacitance on pin p.
+//
+// === Threading model ========================================================
+// The evaluation is split so a Jacobi sweep can run in parallel:
+//   - snapshot(inst)         : MAIN THREAD ONLY. Reads live STA state (slews,
+//                              load caps, cap checks), fills the lazy Liberty
+//                              caches, and freezes everything needed to score a
+//                              gate into a GateSnapshot.
+//   - evaluateSnapshot(snap) : WORKER SAFE. Reads only `snap` + read-only
+//                              Liberty data and the caller-provided per-thread
+//                              ArcDelayCalc. No STA graph reads, no shared
+//                              arc_delay_calc_, no cache writes, no mutation.
+// Replacements chosen by evaluateSnapshot are applied later, serially, via
+// applyReplacement.
+class LRSubproblem : public sta::dbStaState
+{
+ public:
+  // Per-input-pin upstream context, captured once per snapshot() call and
+  // reused across every candidate cell. Each entry corresponds to one input
+  // pin of the instance whose driver belongs to a real upstream standard cell.
+  // Pins with no driver (PIs), driverless nets, or whose upstream sum-of-λ is
+  // at the floor are filtered out at build time.
+  struct UpstreamCtx
+  {
+    // Input port of the instance under its current cell. Used to look up
+    // the same port (by name) on each candidate cell so we can read the
+    // candidate's input capacitance for this pin.
+    const sta::LibertyPort* orig_in_port = nullptr;
+    // Output port of the upstream driver U at this pin's driver. Constant
+    // across candidates - only the load it sees changes per candidate.
+    sta::LibertyPort* drv_port = nullptr;
+    // Current load capacitance at U's driver pin (farads). Includes the current
+    // cell's contribution; we subtract C_in(current) and add C_in(candidate) to
+    // get the perturbed load for each candidate.
+    float load_U_cur = 0.0f;
+    // Input capacitance on this pin under the instance's CURRENT cell.
+    float c_in_cur = 0.0f;
+    // Σλ over U's gate-internal data arcs that terminate at the driver pin.
+    // These are the arcs whose delay depends on the load U drives.
+    float lambda_U_drv = 0.0f;
+  };
+
+  // Frozen per-output-pin electrical state for one instance.
+  struct OutputCtx
+  {
+    // Output port under the instance's current cell. Candidate ports are looked
+    // up by name on each candidate cell.
+    const sta::LibertyPort* port = nullptr;
+    float load_cap = 0.0f;    // graph_delay_calc_->loadCap (frozen)
+    float lambda_sum = 0.0f;  // Σλ over gate-internal arcs into this pin
+    // Elmore-slew DRC inputs (frozen). slew is the STA graph slew at this pin's
+    // load vertex; drive_res is the current port's drive resistance.
+    // The candidate's output slew is estimated as
+    //   slew/(drive_res*load_cap) * cand_drive_res * load_cap.
+    float slew = 0.0f;
+    float drive_res = 0.0f;
+  };
+
+  // Snapshot of one driver pin's max-cap check on a fanin net.
+  struct DriverCapCheck
+  {
+    float cap = 0.0f;        // current load cap at the driver pin
+    float max_cap = 0.0f;    // cap limit
+    float cap_slack = 0.0f;  // current cap slack
+    bool corner_ok = false;  // max_cap > 0 && a corner was returned
+  };
+
+  // Per-input-pin context for the input-side max-cap DRC
+  // (Resizer::replacementPreservesMaxCap, frozen).
+  struct InputMaxCapCtx
+  {
+    const sta::LibertyPort* in_port = nullptr;  // current cell's input port
+    float old_cap = 0.0f;  // input pin cap under the CURRENT cell
+    std::vector<DriverCapCheck> drivers;
+  };
+
+  // One swappable candidate with its precomputed leakage-equivalent cost.
+  struct Candidate
+  {
+    sta::LibertyCell* cell = nullptr;
+    float leakage = 0.0f;  // leakageOrArea(cell), precomputed on main thread
+  };
+
+  // Everything evaluateSnapshot needs to score one instance, frozen on the
+  // main thread.
+  struct GateSnapshot
+  {
+    sta::Instance* inst = nullptr;
+    sta::LibertyCell* cur_cell = nullptr;
+    float cur_leakage = 0.0f;
+    const sta::Scene* scene = nullptr;
+    std::vector<OutputCtx> outputs;
+    std::vector<UpstreamCtx> upstream;
+    std::vector<InputMaxCapCtx> inputs;
+    std::vector<Candidate> candidates;  // excludes cur_cell
+  };
+
+  // Result of one per-gate evaluation, applied later in serial.
+  struct GateDecision
+  {
+    sta::Instance* inst = nullptr;
+    sta::LibertyCell* best_cell = nullptr;  // nullptr -> keep current
+    float best_cost = 0.0f;                 // leakage + Σλ·d at best_cell
+    float baseline_cost = 0.0f;             // same for current cell
+    // True iff best_cell has strictly lower leakage-equivalent cost than the
+    // current cell. Used by the outer loop to apply asymmetric acceptance:
+    // any cost drop is enough on a downsize, but timing-noise hysteresis
+    // still applies to upsizes. False when best_cell == nullptr.
+    bool best_is_downsize = false;
+  };
+
+  explicit LRSubproblem(Resizer* resizer);
+  ~LRSubproblem() override = default;
+
+  void init();
+
+  // MAIN THREAD ONLY. Capture the frozen state needed to evaluate `inst`.
+  // Returns false (and leaves `snap` unspecified) when `inst` is don't-touch,
+  // has no liberty cell, or has no usable output pin. `lambda` is indexed by
+  // sta::Edge::id (sparse, size `lambda_size`).
+  bool snapshot(sta::Instance* inst,
+                const float* lambda,
+                int lambda_size,
+                GateSnapshot& snap);
+
+  // WORKER SAFE. Evaluate the subproblem for a prepared snapshot using the
+  // caller-provided per-thread ArcDelayCalc. `timing_weight` scales the Σλ·d
+  // timing term against the leakage objective.
+  GateDecision evaluateSnapshot(const GateSnapshot& snap,
+                                float timing_weight,
+                                sta::ArcDelayCalc* arc_delay_calc) const;
+
+  // Leakage-equivalent cost for `cell`. Returns Resizer::cellLeakage when
+  // the Liberty exposes leakage; otherwise returns area · area-to-leakage
+  // scale (computed once at init() from the current design's distribution
+  // of leakage and area on cells that DO have leakage). Mutates a lazy cache;
+  // call only on the main thread.
+  float leakageOrArea(sta::LibertyCell* cell) const;
+
+  // Apply the LR-chosen replacement at `inst`. Wraps Resizer::replaceCell;
+  // returns true on success. Called from GlobalSizingPolicy in serial inside
+  // an open pass-level journal.
+  bool applyReplacement(sta::Instance* inst, sta::LibertyCell* replacement);
+
+ private:
+  bool isDataArc(const sta::Edge* edge) const;
+  // Walks leaf instances once to populate area_to_leakage_scale_ and
+  // expose any pure-area-only-library degenerate case.
+  void computeLeakageScale();
+
+  // Worker-safe cost of running `cell` at the snapshotted instance.
+  // `cell_leakage` is the precomputed leakage-equivalent cost of `cell`.
+  float evaluateCellCost(const GateSnapshot& snap,
+                         sta::LibertyCell* cell,
+                         float cell_leakage,
+                         float timing_weight,
+                         sta::ArcDelayCalc* arc_delay_calc) const;
+
+  // Read the max-rise/fall input capacitance of `port_name` on `cell` (farads).
+  // Returns 0 if the port is missing on the cell. Worker-safe (Liberty read).
+  float portInputCap(sta::LibertyCell* cell, const char* port_name) const;
+
+  // Worker-safe DRC filter over a frozen snapshot. Returns true iff installing
+  // `replacement` would not introduce any max-cap or max-slew violation -
+  // either on the input side (fanin nets due to larger input pin caps) or on
+  // each output pin (current load cap against the new cell's cap limit, and
+  // output slew estimated from the new cell's drive resistance vs. slew limit).
+  bool candidateDrcOkSnapshot(const GateSnapshot& snap,
+                              sta::LibertyCell* replacement) const;
+
+  Resizer* resizer_ = nullptr;
+  utl::Logger* logger_ = nullptr;
+  sta::dbNetwork* db_network_ = nullptr;
+
+  // Computed at init() from this design's (leakage, area) distribution on
+  // instances whose current cell exposes Liberty leakage. Used by
+  // leakageOrArea() to give area-only cells a leakage-equivalent cost.
+  // Zero when no instance exposes leakage (degenerate area-only case).
+  float area_to_leakage_scale_ = 0.0f;
+
+  const sta::MinMax* max_ = sta::MinMax::max();
+  bool initialized_ = false;
+};
+
+}  // namespace rsz
diff --git a/src/rsz/src/Optimizer.cc b/src/rsz/src/Optimizer.cc index bd5e362594f..6569ca18023 100644 --- a/src/rsz/src/Optimizer.cc +++ b/src/rsz/src/Optimizer.cc @@ -10,6 +10,7 @@ #include #include +#include "GlobalSizingPolicy.hh" #include "MeasuredVtSwapPolicy.hh" #include "OptimizationPolicy.hh" #include "OptimizerTypes.hh" @@ -141,6 +142,10 @@ std::unique_ptr Optimizer::makePolicyForPhase( return std::make_unique( resizer_, committer_, setup_context, config_); } + if (phase_name == "GLOBAL_SIZING") { + return std::make_unique( + resizer_, committer_, setup_context, config_); + } // Only public phase names are listed; experimental top-level tokens // (LEGACY_MT, MT1, MEASURED_VT_SWAP) are accepted but undocumented. resizer_.logger()->error( @@ -148,7 +153,7 @@ std::unique_ptr Optimizer::makePolicyForPhase( 217, "Unknown phase name '{}'. Valid phase names are: LEGACY, WNS, " "WNS_PATH, WNS_CONE, TNS, ENDPOINT_FANIN, STARTPOINT_FANOUT, " - "LAST_GASP, CRIT_VT_SWAP, REROUTE", + "LAST_GASP, CRIT_VT_SWAP, REROUTE, GLOBAL_SIZING", phase_name); return nullptr; } @@ -182,7 +187,7 @@ bool Optimizer::run() 223, "No phase names specified. 
Valid phase names are: LEGACY, WNS, " "WNS_PATH, WNS_CONE, TNS, ENDPOINT_FANIN, STARTPOINT_FANOUT, " - "LAST_GASP, CRIT_VT_SWAP, REROUTE"); + "LAST_GASP, CRIT_VT_SWAP, REROUTE, GLOBAL_SIZING"); } const int phase_count = phase_names.size(); diff --git a/src/rsz/src/Resizer.cc b/src/rsz/src/Resizer.cc index 0988fcf2876..ad6e4686f7a 100644 --- a/src/rsz/src/Resizer.cc +++ b/src/rsz/src/Resizer.cc @@ -4304,6 +4304,19 @@ void Resizer::gateDelays(const sta::LibertyPort* drvr_port, // Return values. sta::ArcDelay delays[sta::RiseFall::index_count], sta::Slew slews[sta::RiseFall::index_count]) +{ + gateDelays( + drvr_port, load_cap, scene, min_max, arc_delay_calc_, delays, slews); +} + +void Resizer::gateDelays(const sta::LibertyPort* drvr_port, + const float load_cap, + const sta::Scene* scene, + const sta::MinMax* min_max, + sta::ArcDelayCalc* arc_delay_calc, + // Return values. + sta::ArcDelay delays[sta::RiseFall::index_count], + sta::Slew slews[sta::RiseFall::index_count]) { for (int rf_index : sta::RiseFall::rangeIndex()) { delays[rf_index] = -sta::INF; @@ -4327,14 +4340,14 @@ void Resizer::gateDelays(const sta::LibertyPort* drvr_port, } sta::LoadPinIndexMap load_pin_index_map(network_); sta::ArcDcalcResult dcalc_result - = arc_delay_calc_->gateDelay(nullptr, - arc, - in_slew, - load_cap, - nullptr, - load_pin_index_map, - scene, - min_max); + = arc_delay_calc->gateDelay(nullptr, + arc, + in_slew, + load_cap, + nullptr, + load_pin_index_map, + scene, + min_max); const sta::ArcDelay& gate_delay = dcalc_result.gateDelay(); const sta::Slew& drvr_slew = dcalc_result.drvrSlew(); @@ -4402,10 +4415,20 @@ sta::ArcDelay Resizer::gateDelay(const sta::LibertyPort* drvr_port, const float load_cap, const sta::Scene* scene, const sta::MinMax* min_max) +{ + return gateDelay(drvr_port, load_cap, scene, min_max, arc_delay_calc_); +} + +sta::ArcDelay Resizer::gateDelay(const sta::LibertyPort* drvr_port, + const float load_cap, + const sta::Scene* scene, + const sta::MinMax* min_max, + 
sta::ArcDelayCalc* arc_delay_calc) { sta::ArcDelay delays[sta::RiseFall::index_count]; sta::Slew slews[sta::RiseFall::index_count]; - gateDelays(drvr_port, load_cap, scene, min_max, delays, slews); + gateDelays( + drvr_port, load_cap, scene, min_max, arc_delay_calc, delays, slews); return max(delays[sta::RiseFall::riseIndex()], delays[sta::RiseFall::fallIndex()]); } diff --git a/src/rsz/src/policy/GlobalSizingPolicy.cc b/src/rsz/src/policy/GlobalSizingPolicy.cc new file mode 100644 index 00000000000..3667e65139a --- /dev/null +++ b/src/rsz/src/policy/GlobalSizingPolicy.cc @@ -0,0 +1,922 @@ +// SPDX-License-Identifier: BSD-3-Clause +// Copyright (c) 2026-2026, The OpenROAD Authors + +#include "GlobalSizingPolicy.hh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "LRSubproblem.hh" +#include "OptimizationPolicy.hh" +#include "OptimizerTypes.hh" +#include "db_sta/dbNetwork.hh" +#include "db_sta/dbSta.hh" +#include "est/EstimateParasitics.h" +#include "odb/db.h" +#include "rsz/Resizer.hh" +#include "sta/ArcDelayCalc.hh" +#include "sta/Delay.hh" +#include "sta/Fuzzy.hh" +#include "sta/Graph.hh" +#include "sta/GraphClass.hh" +#include "sta/GraphDelayCalc.hh" +#include "sta/Liberty.hh" +#include "sta/Network.hh" +#include "sta/NetworkClass.hh" +#include "sta/PortDirection.hh" +#include "sta/Scene.hh" +#include "sta/Sta.hh" +#include "sta/TimingArc.hh" +#include "sta/TimingRole.hh" +#include "sta/Transition.hh" +#include "utl/Logger.h" +#include "utl/ThreadPool.h" + +namespace rsz { + +using utl::RSZ; + +GlobalSizingPolicy::GlobalSizingPolicy(Resizer& resizer, + MoveCommitter& committer, + RepairSetupContext& setup_context, + const OptimizerRunConfig& config) + : OptimizationPolicy(resizer, committer, setup_context, config) +{ +} + +GlobalSizingPolicy::~GlobalSizingPolicy() = default; + +bool GlobalSizingPolicy::isDataArc(const sta::Edge* edge) const +{ + const sta::TimingRole* role = edge->role(); + if 
(role->isTimingCheck()) { + return false; + } + if (edge->isDisabledLoop()) { + return false; + } + if (role == sta::TimingRole::latchDtoQ() + || role == sta::TimingRole::latchEnToQ()) { + return false; + } + return true; +} + +float GlobalSizingPolicy::edgeMaxArcDelay(sta::Edge* edge) const +{ + sta::TimingArcSet* arc_set = edge->timingArcSet(); + if (arc_set == nullptr) { + return 0.0f; + } + float max_d = 0.0f; + for (sta::TimingArc* arc : arc_set->arcs()) { + const sta::ArcDelay d = graph_->arcDelay(edge, arc, dcalc_ap_); + const float df = sta::delayAsFloat(d); + max_d = std::max(df, max_d); + } + return max_d; +} + +void GlobalSizingPolicy::allocate() +{ + // Ensure arc delays and endpoint slacks are up to date before we seed + sta_->findRequireds(); + // DRC preambles the per-gate subproblem relies on later + sta_->checkCapacitancesPreamble(sta_->scenes()); + sta_->checkSlewsPreamble(); + sta_->checkFanoutPreamble(); + + const sta::Scene* scene = sta_->cmdScene(); + dcalc_ap_ = scene->dcalcAnalysisPtIndex(policy_max_); + + // Walk the graph once to discover max EdgeId (lambda_ is keyed by + // sta::Edge::id, which is sparse - size to max_id + 1) + sta::EdgeId max_edge_id = 0; + int data_edge_count = 0; + sta::VertexIterator vit(graph_); + while (vit.hasNext()) { + sta::Vertex* v = vit.next(); + sta::VertexOutEdgeIterator eit(v, graph_); + while (eit.hasNext()) { + sta::Edge* e = eit.next(); + if (!isDataArc(e)) { + continue; + } + const sta::EdgeId id = graph_->id(e); + max_edge_id = std::max(id, max_edge_id); + ++data_edge_count; + } + } + + const size_t n_edges = static_cast(max_edge_id) + 1; + lambda_.assign(n_edges, 0.0f); + + // Endpoint bookkeeping + endpoint_vertices_.clear(); + endpoint_index_.clear(); + const sta::VertexSet& eps = sta_->endpoints(); + endpoint_vertices_.reserve(eps.size()); + endpoint_index_.reserve(eps.size()); + for (sta::Vertex* v : eps) { + endpoint_index_.emplace(v, static_cast(endpoint_vertices_.size())); + 
endpoint_vertices_.push_back(v); + } + mu_.assign(endpoint_vertices_.size(), 0.0f); + + debugPrint(logger_, + RSZ, + "global_sizing", + 2, + "LR allocate: edges={} (max_id={}), endpoints={}, dcalc_ap={}", + data_edge_count, + max_edge_id, + endpoint_vertices_.size(), + dcalc_ap_); +} + +void GlobalSizingPolicy::seedMultipliers(const LRParams& params) +{ + // λ_e = d_e (delay-proportional seed, max arc delay across rise/fall) + float lambda_sum = 0.0f; + float lambda_max = 0.0f; + int seeded = 0; + sta::VertexIterator vit(graph_); + while (vit.hasNext()) { + sta::Vertex* v = vit.next(); + sta::VertexOutEdgeIterator eit(v, graph_); + while (eit.hasNext()) { + sta::Edge* e = eit.next(); + if (!isDataArc(e)) { + continue; + } + const float d = edgeMaxArcDelay(e); + const sta::EdgeId id = graph_->id(e); + const float seed = std::max(d, params.lambda_floor); + lambda_[id] = seed; + lambda_sum += seed; + lambda_max = std::max(lambda_max, seed); + ++seeded; + } + } + + // μ_k = max(0, margin - slack_k)^p (WNS-biased endpoint seed). + // Then normalize so max(μ) = 1 - this decouples the LR pressure's scale + // from the raw slack units so that downstream λ·d terms are predictable. 
+ float mu_max_raw = 0.0f; + int mu_nonzero = 0; + const float margin = params.setup_slack_margin; + const float p = params.mu_exponent; + for (size_t k = 0; k < endpoint_vertices_.size(); ++k) { + const sta::Slack slack = sta_->slack(endpoint_vertices_[k], policy_max_); + const float slack_f = sta::delayAsFloat(slack); + const float gap = margin - slack_f; + float mu = 0.0f; + if (gap > 0.0f) { + mu = std::pow(gap, p); + ++mu_nonzero; + } + mu_[k] = mu; + mu_max_raw = std::max(mu_max_raw, mu); + } + if (mu_max_raw > 0.0f) { + for (float& mu : mu_) { + mu /= mu_max_raw; + } + } + float mu_sum = 0.0f; + float mu_max = 0.0f; + for (const float mu : mu_) { + mu_sum += mu; + mu_max = std::max(mu_max, mu); + } + + debugPrint(logger_, + RSZ, + "global_sizing", + 2, + "LR seed: {} data arcs (λ sum={:.3g}, max={:.3g}, avg={:.3g}); " + "{}/{} endpoints violating (μ sum={:.3g}, max={:.3g})", + seeded, + lambda_sum, + lambda_max, + seeded ? lambda_sum / seeded : 0.0f, + mu_nonzero, + endpoint_vertices_.size(), + mu_sum, + mu_max); +} + +void GlobalSizingPolicy::updateMultipliers(const LRParams& params) +{ + // μ: re-seed from current endpoint slacks. Fresh seed (rather than a + // multiplicative μ update) avoids the lock-in where an endpoint whose μ + // reached the floor can never re-activate when its slack regresses. + float mu_max_raw = 0.0f; + const float margin = params.setup_slack_margin; + const float p = params.mu_exponent; + int mu_nonzero = 0; + for (size_t k = 0; k < endpoint_vertices_.size(); ++k) { + const sta::Slack slack = sta_->slack(endpoint_vertices_[k], policy_max_); + const float slack_f = sta::delayAsFloat(slack); + const float gap = margin - slack_f; + float mu = 0.0f; + if (gap > 0.0f) { + mu = std::pow(gap, p); + ++mu_nonzero; + } + mu_[k] = mu; + mu_max_raw = std::max(mu_max_raw, mu); + } + if (mu_max_raw > 0.0f) { + for (float& mu : mu_) { + mu /= mu_max_raw; + } + } + + // λ: dual-subgradient ascent. 
+ // + // g_e_norm = (d_e - (a_to - a_from)) / max(d_e, ε) ∈ [-1, 0] + // λ_e ← max(floor, λ_e · (1 + α · g_e_norm)) + // + // tight arc (g=0) → λ unchanged + // full slack (g=−1) → λ ← (1-α)·λ + // + // Arcs touching unconstrained vertices (sentinel arrivals from no-clock + // PIs/POs) are skipped - those have no meaningful slack and projection + // alone determines their λ. + const float alpha = std::clamp(params.beta, 0.0f, 1.0f); + const float kArrivalSentinel = 1e6f; + float lam_sum = 0.0f; + float lam_max = 0.0f; + int updated = 0; + int skipped_unconstrained = 0; + int tight_arcs = 0; + sta::VertexIterator vit(graph_); + while (vit.hasNext()) { + sta::Vertex* v = vit.next(); + sta::VertexOutEdgeIterator eit(v, graph_); + while (eit.hasNext()) { + sta::Edge* e = eit.next(); + if (!isDataArc(e)) { + continue; + } + const sta::EdgeId id = graph_->id(e); + if (static_cast(id) >= lambda_.size()) { + continue; + } + const float d = edgeMaxArcDelay(e); + sta::Vertex* from_v = e->from(graph_); + sta::Vertex* to_v = e->to(graph_); + const float a_from = sta::delayAsFloat(sta_->arrival( + from_v, sta::RiseFallBoth::riseFall(), sta_->scenes(), policy_max_)); + const float a_to = sta::delayAsFloat(sta_->arrival( + to_v, sta::RiseFallBoth::riseFall(), sta_->scenes(), policy_max_)); + if (std::fabs(a_from) >= kArrivalSentinel + || std::fabs(a_to) >= kArrivalSentinel) { + ++skipped_unconstrained; + lam_sum += lambda_[id]; + lam_max = std::max(lam_max, lambda_[id]); + continue; + } + const float arrival_diff = a_to - a_from; + const float denom = std::max(d, params.lambda_floor); + const float g_norm = (d - arrival_diff) / denom; + const float g_clamped = std::clamp(g_norm, -1.0f, 0.0f); + if (g_clamped > -1e-6f) { + ++tight_arcs; + } + const float scale = 1.0f + alpha * g_clamped; + lambda_[id] = std::max(lambda_[id] * scale, params.lambda_floor); + lam_sum += lambda_[id]; + lam_max = std::max(lam_max, lambda_[id]); + ++updated; + } + } + + debugPrint(logger_, + RSZ, + 
"global_sizing", + 2, + "LR update: {} arcs subgradient-stepped " + "({} tight, {} unconstrained skipped); " + "λ sum={:.3g} max={:.3g}; " + "{}/{} endpoints violating", + updated, + tight_arcs, + skipped_unconstrained, + lam_sum, + lam_max, + mu_nonzero, + endpoint_vertices_.size()); +} + +void GlobalSizingPolicy::projectFlowBalance(const LRParams& params) +{ + // Collect all vertices and sort by level (descending) so we visit endpoints + // before their predecessors + std::vector vertices; + { + sta::VertexIterator vit(graph_); + while (vit.hasNext()) { + vertices.push_back(vit.next()); + } + } + std::sort(vertices.begin(), + vertices.end(), + [](const sta::Vertex* a, const sta::Vertex* b) { + return a->level() > b->level(); + }); + + int rescaled = 0; + int zero_sum_fallback = 0; + for (sta::Vertex* v : vertices) { + // Target flow into v + float target = 0.0f; + auto ep_it = endpoint_index_.find(v); + const bool is_endpoint = ep_it != endpoint_index_.end(); + if (is_endpoint) { + target = mu_[ep_it->second]; + } else { + sta::VertexOutEdgeIterator oeit(v, graph_); + while (oeit.hasNext()) { + sta::Edge* e = oeit.next(); + if (!isDataArc(e)) { + continue; + } + target += lambda_[graph_->id(e)]; + } + } + + // Current flow summed over in-data-edges + float in_sum = 0.0f; + int in_count = 0; + { + sta::VertexInEdgeIterator ieit(v, graph_); + while (ieit.hasNext()) { + sta::Edge* e = ieit.next(); + if (!isDataArc(e)) { + continue; + } + in_sum += lambda_[graph_->id(e)]; + ++in_count; + } + } + + if (in_count == 0) { + continue; + } + + if (in_sum > 0.0f) { + const float scale = target / in_sum; + sta::VertexInEdgeIterator ieit(v, graph_); + while (ieit.hasNext()) { + sta::Edge* e = ieit.next(); + if (!isDataArc(e)) { + continue; + } + const sta::EdgeId id = graph_->id(e); + lambda_[id] = std::max(lambda_[id] * scale, params.lambda_floor); + } + ++rescaled; + } else if (target > 0.0f) { + const float share = target / static_cast(in_count); + 
sta::VertexInEdgeIterator ieit(v, graph_); + while (ieit.hasNext()) { + sta::Edge* e = ieit.next(); + if (!isDataArc(e)) { + continue; + } + lambda_[graph_->id(e)] = std::max(share, params.lambda_floor); + } + ++zero_sum_fallback; + } + } + + debugPrint(logger_, + RSZ, + "global_sizing", + 2, + "LR project: {} vertices rescaled ({} zero-sum fallbacks)", + rescaled, + zero_sum_fallback); +} + +GlobalSizingPolicy::DesignSnap GlobalSizingPolicy::computeDesignSnap() const +{ + DesignSnap s; + std::unique_ptr iit( + network_->leafInstanceIterator()); + while (iit->hasNext()) { + sta::Instance* inst = iit->next(); + sta::LibertyCell* cell = network_->libertyCell(inst); + if (cell == nullptr) { + continue; + } + ++s.instances; + const std::optional leak = resizer_.cellLeakage(cell); + if (leak.has_value()) { + s.total_leakage += *leak; + ++s.with_leakage; + } + odb::dbMaster* master = db_network_->staToDb(db_network_->cell(cell)); + if (master != nullptr && master->isCoreAutoPlaceable()) { + s.total_area += resizer_.dbuToMeters(master->getWidth()) + * resizer_.dbuToMeters(master->getHeight()); + } + } + return s; +} + +std::vector GlobalSizingPolicy::buildSnapshots() +{ + // Phase A (main thread, delays valid): freeze each evaluable gate's + // timing/DRC state. snapshot() also reads loadCap/slew and warms the lazy + // getSwappableCells / cellLeakage / net-driver caches, so the subsequent + // parallel phase touches none of them. 
+ const int lambda_size = static_cast(lambda_.size()); + std::vector snapshots; + std::unique_ptr iit( + network_->leafInstanceIterator()); + while (iit->hasNext()) { + sta::Instance* inst = iit->next(); + LRSubproblem::GateSnapshot snap; + if (subproblem_->snapshot(inst, lambda_.data(), lambda_size, snap)) { + snapshots.push_back(std::move(snap)); + } + } + return snapshots; +} + +GlobalSizingPolicy::SweepStats GlobalSizingPolicy::applyDecisions( + const std::vector& decisions, + const int visited) +{ + // Phase C (main thread, serial): apply accepted replacements in the snapshot + // vector order so the result is independent of worker scheduling. Pure apply + // loop - no slack/slew/arrival query may run here, or the single batched + // timing update in iterate() would fragment into many. + // + // Hysteresis on cost improvement before we commit a move: + // - Upsize moves: 2% - filter LR-cost noise that would otherwise churn + // the design without a meaningful timing win. + // - Downsize moves: 0% - on a non-critical gate λ is at the floor and + // the cost is dominated by leakage; any drop is a real leakage gain. + const float upsize_accept_tol = 0.02f; + const float downsize_accept_tol = 0.0f; + + int moves = 0; + int evaluated = 0; + int downsizes = 0; + int upsizes = 0; + + for (const LRSubproblem::GateDecision& r : decisions) { + if (r.best_cell == nullptr) { + continue; + } + ++evaluated; + const float tol + = r.best_is_downsize ? downsize_accept_tol : upsize_accept_tol; + if (r.best_cost < r.baseline_cost * (1.0f - tol)) { + sta::LibertyCell* prev = network_->libertyCell(r.inst); + if (subproblem_->applyReplacement(r.inst, r.best_cell)) { + ++moves; + const float rel_gain + = r.baseline_cost > 0.0f + ? 
(r.baseline_cost - r.best_cost) / r.baseline_cost + : 0.0f; + if (r.best_is_downsize) { + ++downsizes; + } else { + ++upsizes; + } + debugPrint(logger_, + RSZ, + "global_sizing", + 5, + "{} {}: {} -> {} (cost {:.3g} -> {:.3g}, gain {:.2f}%)", + r.best_is_downsize ? "DOWN" : "UP ", + network_->pathName(r.inst), + prev != nullptr ? prev->name() : "?", + r.best_cell->name(), + r.baseline_cost, + r.best_cost, + 100.0f * rel_gain); + } + } + } + + debugPrint(logger_, + RSZ, + "global_sizing", + 2, + "LR sweep: {} instances visited, " + "{} with an improving candidate, " + "{} replacements applied ({} upsize, {} downsize)", + visited, + evaluated, + moves, + upsizes, + downsizes); + + return {.moves = moves, .upsizes = upsizes, .downsizes = downsizes}; +} + +GlobalSizingPolicy::SweepStats GlobalSizingPolicy::singleSweep( + const float timing_weight) +{ + // Phase A: Freeze per-gate state. + std::vector snapshots = buildSnapshots(); + + // Phase B: Score every snapshot independently. Each worker uses its own + // ArcDelayCalc copy (arc_delay_calc_ is single-threaded shared state); the + // copy is cached per worker thread and refreshed if the source changes. With + // a zero-worker pool this runs inline on the calling thread. + sta::ArcDelayCalc* const src = sta_->arcDelayCalc(); + const std::vector decisions + = thread_pool_->parallelMap( + snapshots, + [this, timing_weight, src](const LRSubproblem::GateSnapshot& snap) { + thread_local sta::ArcDelayCalc* cached_src = nullptr; + thread_local std::unique_ptr adc; + if (adc == nullptr || cached_src != src) { + adc.reset(src->copy()); + cached_src = src; + } + return subproblem_->evaluateSnapshot( + snap, timing_weight, adc.get()); + }); + + // Phase C: Apply accepted moves serially. 
+ return applyDecisions(decisions, static_cast(snapshots.size())); +} + +float GlobalSizingPolicy::computeAutoTimingWeight(const LRParams& params) const +{ + std::vector leakages; + std::vector timings; + const sta::Scene* scene = sta_->cmdScene(); + const int lambda_size = static_cast(lambda_.size()); + + std::unique_ptr iit( + network_->leafInstanceIterator()); + while (iit->hasNext()) { + sta::Instance* inst = iit->next(); + if (resizer_.dontTouch(inst)) { + continue; + } + sta::LibertyCell* cell = network_->libertyCell(inst); + if (cell == nullptr) { + continue; + } + + leakages.push_back(subproblem_->leakageOrArea(cell)); + + // Per-gate timing pressure used to anchor the leakage<->timing scale. + // + // This medians ONLY the output-cone term Σλ·d_out, NOT the full cost + // function. The upstream-Cin term is deliberately excluded here even + // though it is part of evaluateCell's cost. + // + // Reason: computeAutoTimingWeight calibrates a gain, so it must be + // anchored to the *actionable* timing pressure - the part of the + // timing cost that varies as a gate is resized and therefore trades + // against leakage in the argmin. d_out swings 2-3x across candidate + // cells, so its level is a faithful proxy for that actionable swing. + // The upstream-Cin term's level is a full upstream gate delay d_U + // that is almost entirely a constant w.r.t. this gate's cell choice + // (intrinsic delay + the driver's other-fanout load); its level is + // a DC offset, not a signal. Folding it in inflated T_med ~2x, + // collapsed tw ~2x, and starved the output-cone term - see + // notes_lr/08_upstream_cin_tw_regression.md. tw cancels in + // evaluateCell's timing-vs-timing comparison, so the upstream-Cin + // term still works correctly at the output-cone-anchored tw. 
+ float gate_t = 0.0f; + bool has_pressure = false; + std::unique_ptr pit(network_->pinIterator(inst)); + while (pit->hasNext()) { + sta::Pin* pin = pit->next(); + const sta::PortDirection* dir = network_->direction(pin); + if (!dir->isOutput()) { + continue; + } + sta::Vertex* v = graph_->pinDrvrVertex(pin); + if (v == nullptr) { + continue; + } + float lam_sum = 0.0f; + sta::VertexInEdgeIterator ieit(v, graph_); + while (ieit.hasNext()) { + sta::Edge* e = ieit.next(); + if (!isDataArc(e)) { + continue; + } + const sta::Pin* from_pin = e->from(graph_)->pin(); + if (network_->instance(from_pin) != inst) { + continue; + } + const sta::EdgeId id = graph_->id(e); + if (static_cast(id) >= lambda_size) { + continue; + } + lam_sum += lambda_[id]; + } + if (lam_sum <= 4.0f * params.lambda_floor) { + continue; + } + sta::LibertyPort* port = network_->libertyPort(pin); + if (port == nullptr) { + continue; + } + const float load + = sta_->graphDelayCalc()->loadCap(pin, scene, policy_max_); + const float d = sta::delayAsFloat( + resizer_.gateDelay(port, load, scene, policy_max_)); + gate_t += lam_sum * d; + has_pressure = true; + } + if (has_pressure) { + timings.push_back(gate_t); + } + } + + float l_med = 0.0f; + float t_med = 0.0f; + bool degenerate = leakages.empty() || timings.empty(); + if (!degenerate) { + const auto l_mid = leakages.size() / 2; + std::nth_element( + leakages.begin(), leakages.begin() + l_mid, leakages.end()); + l_med = leakages[l_mid]; + + const auto t_mid = timings.size() / 2; + std::nth_element(timings.begin(), timings.begin() + t_mid, timings.end()); + t_med = timings[t_mid]; + + if (l_med <= 0.0f || t_med <= 0.0f) { + degenerate = true; + } + } + + if (degenerate) { + debugPrint(logger_, + RSZ, + "global_sizing", + 1, + "LR auto timing_weight: degenerate " + "(leakages={}, timings={}, " + "L_med={:.3g}, T_med={:.3g}); using 1.0", + leakages.size(), + timings.size(), + l_med, + t_med); + return 1.0f; + } + + const float tw = params.timing_bias * 
l_med / t_med; + debugPrint(logger_, + RSZ, + "global_sizing", + 1, + "LR auto timing_weight: bias={:.3g} " + "L_med={:.3g} T_med={:.3g} -> tw={:.3g}", + params.timing_bias, + l_med, + t_med, + tw); + return tw; +} + +bool GlobalSizingPolicy::start() +{ + if (!OptimizationPolicy::start()) { + return false; + } + db_network_ = resizer_.dbNetwork(); + subproblem_ = std::make_unique(&resizer_); + // Phase B fans the per-gate evaluations across the OpenROAD thread budget + // (threadCount()-1 workers; a zero-worker pool runs inline). Each worker + // reads only the frozen snapshots, read-only Liberty/SDC, and its own + // ArcDelayCalc copy, so results are independent of worker count and the + // apply order stays the snapshot vector order. + thread_pool_ = makeWorkerThreadPool(); + return true; +} + +void GlobalSizingPolicy::iterate() +{ + if (converged_) { + return; + } + + allocate(); + seedMultipliers(lr_params_); + projectFlowBalance(lr_params_); + + subproblem_->init(); + + const float timing_weight = computeAutoTimingWeight(lr_params_); + + const DesignSnap pre = computeDesignSnap(); + const float wns_pre = sta::delayAsFloat(sta_->worstSlack(policy_max_)); + const float tns_pre + = sta::delayAsFloat(sta_->totalNegativeSlack(policy_max_)); + debugPrint(logger_, + RSZ, + "global_sizing", + 1, + "Pre-global sizing design: instances={} (with leakage={}) " + "leakage={:.3g}W area={:.3g}m^2 WNS={} TNS={}", + pre.instances, + pre.with_leakage, + pre.total_leakage, + pre.total_area, + sta::delayAsString(wns_pre, 3, sta_), + sta::delayAsString(tns_pre, 1, sta_)); + + const int max_iter = (lr_params_.max_iterations > 0) + ? 
lr_params_.max_iterations + : LRParams{}.max_iterations; + const float wns_eps = 1e-12f; + LRParams iter_params = lr_params_; + + float best_wns = wns_pre; + int total_committed = 0; + int total_attempted = 0; + int total_upsizes = 0; + int total_downsizes = 0; + int accepted_iters = 0; + int rejected_iters = 0; + int consec_zero = 0; + int consec_reject = 0; + resizer_.journalBegin(); + for (int iter = 0; iter < max_iter; ++iter) { + if (iter > 0) { + updateMultipliers(iter_params); + projectFlowBalance(iter_params); + } + + const float wns0 = sta::delayAsFloat(sta_->worstSlack(policy_max_)); + + const SweepStats sweep = singleSweep(timing_weight); + const int iter_moves = sweep.moves; + estimate_parasitics_->updateParasitics(); + sta_->findRequireds(); + const float wns1 = sta::delayAsFloat(sta_->worstSlack(policy_max_)); + + const float wns_delta = wns1 - wns0; + const bool no_benefit = (iter_moves == 0); + // Small regressions are deliberately allowed + const bool reject = sta::fuzzyLess(wns_delta, -wns_eps); + + total_attempted += sweep.moves; + total_upsizes += sweep.upsizes; + total_downsizes += sweep.downsizes; + + if (reject) { + ++consec_reject; + ++rejected_iters; + iter_params.beta *= 0.5f; + } else { + total_committed += iter_moves; + ++accepted_iters; + consec_reject = 0; + } + + // Best-so-far: Keep track of the best WNS so far but don't restore a sweep + // that worsens WNS just yet to allow oscillation. 
+ const float current_wns = sta::delayAsFloat(sta_->worstSlack(policy_max_)); + if (!reject && sta::fuzzyGreaterEqual(current_wns, best_wns)) { + resizer_.journalEnd(); // checkpoint + resizer_.journalBegin(); + best_wns = current_wns; + } + + if (logger_->debugCheck(RSZ, "global_sizing", 1)) { + const DesignSnap iter_snap = computeDesignSnap(); + const float tns_iter + = sta::delayAsFloat(sta_->totalNegativeSlack(policy_max_)); + debugPrint( + logger_, + RSZ, + "global_sizing", + 1, + "LR iter {}/{} {}: leakage={:.3g} (Δ={:+.3g}, {:+.2f}%) " + "area={:.3g} (Δ={:+.3g}, {:+.2f}%) " + "WNS={} TNS={}", + iter + 1, + max_iter, + reject ? "REJ" : "ACC", + iter_snap.total_leakage, + iter_snap.total_leakage - pre.total_leakage, + pre.total_leakage > 0.0 + ? 100.0 * (iter_snap.total_leakage - pre.total_leakage) + / pre.total_leakage + : 0.0, + iter_snap.total_area, + iter_snap.total_area - pre.total_area, + pre.total_area > 0.0 + ? 100.0 * (iter_snap.total_area - pre.total_area) / pre.total_area + : 0.0, + sta::delayAsString(wns1, 3, sta_), + sta::delayAsString(tns_iter, 1, sta_)); + } + + if (consec_reject >= 3) { + debugPrint(logger_, + RSZ, + "global_sizing", + 1, + "LR stop: 3 consecutive rejections"); + break; + } + if (no_benefit && !reject) { + if (++consec_zero >= 2) { + debugPrint(logger_, + RSZ, + "global_sizing", + 1, + "LR stop: 2 consecutive zero-move passes"); + break; + } + } else { + consec_zero = 0; + } + } + + // Journal will always be open and regardless of how the loop exited, + // we need to restore to the best checkpoint here + resizer_.journalRestore(); + + const DesignSnap post = computeDesignSnap(); + const float wns_post = sta::delayAsFloat(sta_->worstSlack(policy_max_)); + const float tns_post + = sta::delayAsFloat(sta_->totalNegativeSlack(policy_max_)); + const auto rel = [](double after, double before) { + return before > 0.0 ? 
100.0 * (after - before) / before : 0.0; + }; + const int total_iters = accepted_iters + rejected_iters; + + // Headline: kept moves vs. attempted moves. They diverge when sweeps are + // rolled back by the catastrophic-WNS guard, or when the end-of-run best-WNS + // restore reverts some drift past the best iter. + logger_->info(RSZ, + 400, + "GLOBAL_SIZING: {} cells replaced (loop); " + "{}/{} sweeps accepted, {} rolled back; " + "{} replacements attempted in total " + "({} upsize, {} downsize).", + total_committed, + accepted_iters, + total_iters, + rejected_iters, + total_attempted, + total_upsizes, + total_downsizes); + + // QoR before -> after. This is the line that answers "what did it improve + // and what did it regress" -- read the arrows, not just the deltas. + logger_->info(RSZ, + 409, + "GLOBAL_SIZING QoR: " + "WNS {} -> {} ({}); " + "TNS {} -> {} ({}); " + "leakage {:.3g} -> {:.3g}W ({:+.2f}%); " + "area {:.3g} -> {:.3g}m^2 ({:+.2f}%).", + sta::delayAsString(wns_pre, 3, sta_), + sta::delayAsString(wns_post, 3, sta_), + sta::delayAsString(wns_post - wns_pre, 3, sta_), + sta::delayAsString(tns_pre, 1, sta_), + sta::delayAsString(tns_post, 1, sta_), + sta::delayAsString(tns_post - tns_pre, 1, sta_), + pre.total_leakage, + post.total_leakage, + rel(post.total_leakage, pre.total_leakage), + pre.total_area, + post.total_area, + rel(post.total_area, pre.total_area)); + + // Explain the all-zero summary case explicitly: the design did get + // churned, but every sweep blew the WNS guard so every pass was + // rolled back and the netlist is back to where it started. + if (total_committed == 0 && total_attempted > 0) { + logger_->info(RSZ, + 412, + "GLOBAL_SIZING: nothing kept -- all {} sweeps tripped the " + "WNS guard and were rolled back; the netlist is unchanged " + "from the start of this phase. 
" + "The {} attempted replacements were tentative only.", + rejected_iters, + total_attempted); + } + + markRunComplete(true); +} + +} // namespace rsz diff --git a/src/rsz/src/policy/GlobalSizingPolicy.hh b/src/rsz/src/policy/GlobalSizingPolicy.hh new file mode 100644 index 00000000000..1b0f1bf1de5 --- /dev/null +++ b/src/rsz/src/policy/GlobalSizingPolicy.hh @@ -0,0 +1,152 @@ +// SPDX-License-Identifier: BSD-3-Clause +// Copyright (c) 2026-2026, The OpenROAD Authors + +#pragma once + +#include +#include +#include +#include + +#include "LRSubproblem.hh" +#include "MoveCommitter.hh" +#include "OptimizationPolicy.hh" +#include "OptimizerTypes.hh" +#include "RepairSetupContext.hh" +#include "rsz/Resizer.hh" +#include "sta/GraphClass.hh" +#include "sta/MinMax.hh" + +namespace sta { +class Edge; +class Vertex; +class dbNetwork; +} // namespace sta + +namespace rsz { + +// Tunables for the Lagrangian-Relaxation global sizing driver. Internal to the +// policy - not user-facing through Tcl. Live here as a struct so each piece has +// a stable name and so we can plug in env-var overrides later without rewriting +// the policy. +struct LRParams +{ + float setup_slack_margin = 0.0f; + int max_iterations = 20; + // Step size α for the dual-subgradient update on λ. + // λ_e ← max(floor, λ_e · (1 + α · g_e_norm)) + // with g_e_norm ∈ [-1, 0]. Tight arcs (g=0) are unchanged; arcs at full + // slack (g=-1) shrink to (1-α)·λ. Halved on pass rejection. + float beta = 0.6f; + // Endpoint seed exponent: mu_k ~ max(0, margin - slack_k)^p. + float mu_exponent = 2.0f; + // Floor for multipliers (subgradient floor so unused arcs can re-enter). + float lambda_floor = 1e-12f; + // Dimensionless balance between timing pressure and leakage cost. + // bias = 1.0 keeps Σλ·d (scaled) ≈ leakage cost on the median gate. + float timing_bias = 64.0f; +}; + +// GlobalSizingPolicy: Lagrangian-Relaxation-driven global sizing + Vt +// assignment, packaged as an OptimizationPolicy phase. 
+// +// Outer loop (in iterate()): allocate λ/μ → seed → project → repeat +// {update → project → Jacobi sweep over leaf instances → pass-level +// accept/reject by WNS regression}. +// Each gate's replacement decision uses LRSubproblem's per-gate cost. Skips the +// OptimizationPolicy generator/candidate pipeline and the target_collector - LR +// is not target-driven. +class GlobalSizingPolicy : public OptimizationPolicy +{ + public: + GlobalSizingPolicy(Resizer& resizer, + MoveCommitter& committer, + RepairSetupContext& setup_context, + const OptimizerRunConfig& config); + ~GlobalSizingPolicy() override; + + const char* name() const override { return "GlobalSizingPolicy"; } + bool start() override; + void iterate() override; + + private: + // === Setup ================================================================ + // Discover graph size (edges, endpoints), set dcalc_ap_, size vectors. + void allocate(); + // Delay-proportional λ seed + WNS-biased μ seed. + void seedMultipliers(const LRParams& params); + // Multiplicative λ update via dual-subgradient + re-seed of μ from the + // current slack picture. Called at the start of each outer iteration + // after iteration 0. + void updateMultipliers(const LRParams& params); + // Reverse-topological projection onto the KKT flow-balance polytope. + // After projection: + // Σλ_in(v) = Σλ_out(v) for internal v + // Σλ_in(k) = μ_k for each endpoint k + void projectFlowBalance(const LRParams& params); + // Tally of one Jacobi sweep. `moves` is the total cell replacements applied + // to the journal this sweep (tentative - the pass-acceptance test in + // iterate() may still roll the whole sweep back). 
+ struct SweepStats + { + int moves = 0; + int upsizes = 0; + int downsizes = 0; + }; + + // One Jacobi sweep over all leaf instances, in three phases: + // A buildSnapshots() - main thread: freeze each gate's timing/DRC state + // B parallel evaluate - workers: score every snapshot independently + // C applyDecisions() - main thread: apply the winning replacements + // The per-sweep timing update is done by the caller (iterate()), once, + // after this returns. + SweepStats singleSweep(float timing_weight); + + // Phase A: Capture the frozen per-gate snapshots for every evaluable leaf + // instance, in a stable order. Reads live STA and warms the lazy + // Liberty/dbNetwork caches on the main thread. + std::vector buildSnapshots(); + + // Phase C: Apply the accepted replacements in vector order. No timing query + // may run here - the single batched update happens in iterate() afterwards. + SweepStats applyDecisions( + const std::vector& decisions, + int visited); + + // Auto-scale timing weight so the output-cone timing term is comparable to + // the leakage term on the median gate of this design. Anchored to the + // output-cone term only (not the upstream-Cin term). 
+ float computeAutoTimingWeight(const LRParams& params) const; + + // === Diagnostics ========================================================== + struct DesignSnap + { + double total_leakage = 0.0; + double total_area = 0.0; + int instances = 0; + int with_leakage = 0; + }; + DesignSnap computeDesignSnap() const; + + // === Graph helpers ======================================================== + bool isDataArc(const sta::Edge* edge) const; + float edgeMaxArcDelay(sta::Edge* edge) const; + + // === Policy state ========================================================= + LRParams lr_params_; + sta::dbNetwork* db_network_ = nullptr; + + // Per-edge multipliers, indexed by sta::Edge::id (sparse) + std::vector lambda_; + // Per-endpoint multipliers, indexed by a dense endpoint index + std::vector mu_; + // Dense endpoint bookkeeping + std::vector endpoint_vertices_; + std::unordered_map endpoint_index_; + + sta::DcalcAPIndex dcalc_ap_ = 0; + std::unique_ptr subproblem_; // Per-gate cost evaluator + const sta::MinMax* policy_max_ = sta::MinMax::max(); +}; + +} // namespace rsz diff --git a/src/rsz/test/BUILD b/src/rsz/test/BUILD index 242953d233d..482ec2a91d7 100644 --- a/src/rsz/test/BUILD +++ b/src/rsz/test/BUILD @@ -254,6 +254,8 @@ PASSFAIL_TESTS = [ # "cpp_tests", "repair_setup_legacy_mt", "repair_setup_mt1", + "global_sizing", + "global_sizing_threads", ] ALL_TESTS = TESTS + PASSFAIL_TESTS @@ -278,6 +280,7 @@ filegroup( # Tests that reference other tests extra_deps = { + "global_sizing_threads": ["global_sizing.tcl"], "repair_fanout6_multi": ["repair_fanout6.tcl"], "repair_fanout7_multi": ["repair_fanout7.tcl"], "repair_fanout7_skip_pin_swap": ["repair_fanout7.tcl"], diff --git a/src/rsz/test/CMakeLists.txt b/src/rsz/test/CMakeLists.txt index 68ad69c731a..4b29def3a16 100644 --- a/src/rsz/test/CMakeLists.txt +++ b/src/rsz/test/CMakeLists.txt @@ -236,6 +236,8 @@ or_integration_tests( PASSFAIL_TESTS repair_setup_legacy_mt repair_setup_mt1 + global_sizing + 
global_sizing_threads cpp_tests ) diff --git a/src/rsz/test/global_sizing.tcl b/src/rsz/test/global_sizing.tcl new file mode 100644 index 00000000000..3c86b397cb6 --- /dev/null +++ b/src/rsz/test/global_sizing.tcl @@ -0,0 +1,35 @@ +# Coverage for rsz GlobalSizingPolicy. +# Runs the GLOBAL_SIZING phase and checks the resized netlist against a golden. +# +# This file runs single-threaded (serial Phase-B, inline). The companion +# global_sizing_threads.tcl runs the identical flow multi-threaded and diffs the +# SAME golden, so the pair asserts the parallel Jacobi sweep is deterministic +# and matches the serial result. +source "helpers.tcl" + +# Thread count and result-file stem are overridable by the _threads variant. +if { ![info exists global_sizing_threads] } { + set global_sizing_threads 1 +} +if { ![info exists global_sizing_result] } { + set global_sizing_result "global_sizing" +} + +read_liberty Nangate45/Nangate45_typ.lib +read_lef Nangate45/Nangate45.lef +read_def gcd_nangate45_placed.def +read_sdc gcd_nangate45.sdc + +source Nangate45/Nangate45.rc +set_wire_rc -layer metal3 +estimate_parasitics -placement + +set_thread_count $global_sizing_threads +repair_timing -setup -phases GLOBAL_SIZING + +set verilog_file [make_result_file "${global_sizing_result}.v"] +write_verilog $verilog_file +check "global sizing netlist matches golden" \ + {diff_files global_sizing.vok $verilog_file} 0 + +exit_summary diff --git a/src/rsz/test/global_sizing.vok b/src/rsz/test/global_sizing.vok new file mode 100644 index 00000000000..d2531899023 --- /dev/null +++ b/src/rsz/test/global_sizing.vok @@ -0,0 +1,2019 @@ +module gcd (clk, + req_rdy, + req_val, + reset, + resp_rdy, + resp_val, + req_msg, + resp_msg); + input clk; + output req_rdy; + input req_val; + input reset; + input resp_rdy; + output resp_val; + input [31:0] req_msg; + output [15:0] resp_msg; + + wire _000_; + wire _001_; + wire _002_; + wire _003_; + wire _004_; + wire _005_; + wire _006_; + wire _007_; + wire 
_008_; + wire _009_; + wire _010_; + wire _011_; + wire _012_; + wire _013_; + wire _014_; + wire _015_; + wire _016_; + wire _017_; + wire _018_; + wire _019_; + wire _020_; + wire _021_; + wire _022_; + wire _023_; + wire _024_; + wire _025_; + wire _026_; + wire _027_; + wire _028_; + wire _029_; + wire _030_; + wire _031_; + wire _032_; + wire _033_; + wire _034_; + wire _035_; + wire _036_; + wire _037_; + wire _038_; + wire _039_; + wire _040_; + wire _041_; + wire _042_; + wire _043_; + wire _044_; + wire _045_; + wire _046_; + wire _047_; + wire _048_; + wire _049_; + wire _050_; + wire _051_; + wire _052_; + wire _053_; + wire _054_; + wire _055_; + wire _056_; + wire _057_; + wire _058_; + wire _059_; + wire _060_; + wire _061_; + wire _062_; + wire _063_; + wire _064_; + wire _065_; + wire _066_; + wire _067_; + wire _068_; + wire _069_; + wire _070_; + wire _071_; + wire _072_; + wire _073_; + wire _074_; + wire _075_; + wire _076_; + wire _077_; + wire _078_; + wire _079_; + wire _080_; + wire _081_; + wire _082_; + wire _083_; + wire _084_; + wire _085_; + wire _086_; + wire _087_; + wire _088_; + wire _089_; + wire _090_; + wire _091_; + wire _092_; + wire _093_; + wire _094_; + wire _095_; + wire _096_; + wire _097_; + wire _098_; + wire _099_; + wire _100_; + wire _101_; + wire _102_; + wire _103_; + wire _104_; + wire _105_; + wire _106_; + wire _107_; + wire _108_; + wire _109_; + wire _110_; + wire _111_; + wire _112_; + wire _113_; + wire _114_; + wire _115_; + wire _116_; + wire _117_; + wire _118_; + wire _119_; + wire _120_; + wire _121_; + wire _122_; + wire _123_; + wire _124_; + wire _125_; + wire _126_; + wire _127_; + wire _128_; + wire _129_; + wire _130_; + wire _131_; + wire _132_; + wire _133_; + wire _134_; + wire _135_; + wire _136_; + wire _137_; + wire _138_; + wire _139_; + wire _140_; + wire _141_; + wire _142_; + wire _143_; + wire _144_; + wire _145_; + wire _146_; + wire _147_; + wire _148_; + wire _149_; + wire _150_; + 
wire _151_; + wire _152_; + wire _153_; + wire _154_; + wire _155_; + wire _156_; + wire _157_; + wire _158_; + wire _159_; + wire _160_; + wire _161_; + wire _162_; + wire _163_; + wire _164_; + wire _165_; + wire _166_; + wire _167_; + wire _168_; + wire _169_; + wire _170_; + wire _171_; + wire _172_; + wire _173_; + wire _174_; + wire _175_; + wire _176_; + wire _177_; + wire _178_; + wire _179_; + wire _180_; + wire _181_; + wire _182_; + wire _183_; + wire _184_; + wire _185_; + wire _186_; + wire _187_; + wire _188_; + wire _189_; + wire _190_; + wire _191_; + wire _192_; + wire _193_; + wire _194_; + wire _195_; + wire _196_; + wire _197_; + wire _198_; + wire _199_; + wire _200_; + wire _201_; + wire _202_; + wire _203_; + wire _204_; + wire _205_; + wire _206_; + wire _207_; + wire _208_; + wire _209_; + wire _210_; + wire _211_; + wire _212_; + wire _213_; + wire _214_; + wire _215_; + wire _216_; + wire _217_; + wire _218_; + wire _219_; + wire _220_; + wire _221_; + wire _222_; + wire _223_; + wire _224_; + wire _225_; + wire _226_; + wire _227_; + wire _228_; + wire _229_; + wire _230_; + wire _231_; + wire _232_; + wire _233_; + wire _234_; + wire _235_; + wire _236_; + wire _237_; + wire _238_; + wire _239_; + wire _240_; + wire _241_; + wire _242_; + wire _243_; + wire _244_; + wire _245_; + wire _246_; + wire _247_; + wire _248_; + wire _249_; + wire _250_; + wire _251_; + wire _252_; + wire _253_; + wire _254_; + wire _255_; + wire _256_; + wire _257_; + wire _258_; + wire _259_; + wire _260_; + wire _261_; + wire _262_; + wire _263_; + wire _264_; + wire _265_; + wire _266_; + wire _267_; + wire _268_; + wire _269_; + wire _270_; + wire _271_; + wire _272_; + wire _273_; + wire _274_; + wire _275_; + wire _276_; + wire _277_; + wire _278_; + wire _279_; + wire _280_; + wire _281_; + wire _282_; + wire _283_; + wire _284_; + wire _285_; + wire _286_; + wire _287_; + wire _288_; + wire _289_; + wire _290_; + wire _291_; + wire _292_; + wire _293_; 
+ wire _294_; + wire _295_; + wire _296_; + wire _297_; + wire _298_; + wire _299_; + wire _300_; + wire _301_; + wire _302_; + wire _303_; + wire _304_; + wire _305_; + wire _306_; + wire _307_; + wire _308_; + wire _309_; + wire _310_; + wire _311_; + wire _312_; + wire _313_; + wire _314_; + wire _315_; + wire _316_; + wire _317_; + wire _318_; + wire _319_; + wire _320_; + wire _321_; + wire _322_; + wire _323_; + wire _324_; + wire _325_; + wire _326_; + wire _327_; + wire _328_; + wire _329_; + wire _330_; + wire _331_; + wire _332_; + wire _333_; + wire _334_; + wire _335_; + wire _336_; + wire _337_; + wire _338_; + wire _339_; + wire _340_; + wire _341_; + wire _342_; + wire _343_; + wire _344_; + wire _345_; + wire _346_; + wire _347_; + wire _348_; + wire _349_; + wire _350_; + wire _351_; + wire _352_; + wire _353_; + wire _354_; + wire _355_; + wire _356_; + wire _357_; + wire _358_; + wire _359_; + wire _360_; + wire _361_; + wire _362_; + wire _363_; + wire _364_; + wire _365_; + wire _366_; + wire _367_; + wire _368_; + wire _369_; + wire _370_; + wire _371_; + wire _372_; + wire _373_; + wire _374_; + wire _375_; + wire _376_; + wire _377_; + wire _378_; + wire _379_; + wire _380_; + wire _381_; + wire _382_; + wire _383_; + wire _384_; + wire _385_; + wire _386_; + wire _387_; + wire _388_; + wire _389_; + wire _390_; + wire _391_; + wire _392_; + wire _393_; + wire _394_; + wire _395_; + wire _396_; + wire _397_; + wire _398_; + wire _399_; + wire _400_; + wire _401_; + wire _402_; + wire _403_; + wire _404_; + wire _405_; + wire _406_; + wire _407_; + wire _408_; + wire _409_; + wire _410_; + wire _411_; + wire _412_; + wire _413_; + wire _414_; + wire _415_; + wire _416_; + wire _417_; + wire _418_; + wire _419_; + wire _420_; + wire _421_; + wire _422_; + wire _423_; + wire _424_; + wire _425_; + wire _426_; + wire _427_; + wire _428_; + wire _429_; + wire _430_; + wire _431_; + wire _432_; + wire _433_; + wire _434_; + wire _435_; + wire 
_436_; + wire _437_; + wire _438_; + wire _439_; + wire \ctrl.state.out[1] ; + wire \ctrl.state.out[2] ; + wire \dpath.a_lt_b$in0[0] ; + wire \dpath.a_lt_b$in0[10] ; + wire \dpath.a_lt_b$in0[11] ; + wire \dpath.a_lt_b$in0[12] ; + wire \dpath.a_lt_b$in0[13] ; + wire \dpath.a_lt_b$in0[14] ; + wire \dpath.a_lt_b$in0[15] ; + wire \dpath.a_lt_b$in0[1] ; + wire \dpath.a_lt_b$in0[2] ; + wire \dpath.a_lt_b$in0[3] ; + wire \dpath.a_lt_b$in0[4] ; + wire \dpath.a_lt_b$in0[5] ; + wire \dpath.a_lt_b$in0[6] ; + wire \dpath.a_lt_b$in0[7] ; + wire \dpath.a_lt_b$in0[8] ; + wire \dpath.a_lt_b$in0[9] ; + wire \dpath.a_lt_b$in1[0] ; + wire \dpath.a_lt_b$in1[10] ; + wire \dpath.a_lt_b$in1[11] ; + wire \dpath.a_lt_b$in1[12] ; + wire \dpath.a_lt_b$in1[13] ; + wire \dpath.a_lt_b$in1[14] ; + wire \dpath.a_lt_b$in1[15] ; + wire \dpath.a_lt_b$in1[1] ; + wire \dpath.a_lt_b$in1[2] ; + wire \dpath.a_lt_b$in1[3] ; + wire \dpath.a_lt_b$in1[4] ; + wire \dpath.a_lt_b$in1[5] ; + wire \dpath.a_lt_b$in1[6] ; + wire \dpath.a_lt_b$in1[7] ; + wire \dpath.a_lt_b$in1[8] ; + wire \dpath.a_lt_b$in1[9] ; + + FILLCELL_X1 PHY_0 (); + FILLCELL_X1 PHY_1 (); + FILLCELL_X1 PHY_10 (); + FILLCELL_X1 PHY_100 (); + FILLCELL_X1 PHY_101 (); + FILLCELL_X1 PHY_102 (); + FILLCELL_X1 PHY_103 (); + FILLCELL_X1 PHY_104 (); + FILLCELL_X1 PHY_105 (); + FILLCELL_X1 PHY_106 (); + FILLCELL_X1 PHY_107 (); + FILLCELL_X1 PHY_108 (); + FILLCELL_X1 PHY_109 (); + FILLCELL_X1 PHY_11 (); + FILLCELL_X1 PHY_110 (); + FILLCELL_X1 PHY_111 (); + FILLCELL_X1 PHY_112 (); + FILLCELL_X1 PHY_113 (); + FILLCELL_X1 PHY_12 (); + FILLCELL_X1 PHY_13 (); + FILLCELL_X1 PHY_14 (); + FILLCELL_X1 PHY_15 (); + FILLCELL_X1 PHY_16 (); + FILLCELL_X1 PHY_17 (); + FILLCELL_X1 PHY_18 (); + FILLCELL_X1 PHY_19 (); + FILLCELL_X1 PHY_2 (); + FILLCELL_X1 PHY_20 (); + FILLCELL_X1 PHY_21 (); + FILLCELL_X1 PHY_22 (); + FILLCELL_X1 PHY_23 (); + FILLCELL_X1 PHY_24 (); + FILLCELL_X1 PHY_25 (); + FILLCELL_X1 PHY_26 (); + FILLCELL_X1 PHY_27 (); + FILLCELL_X1 PHY_28 (); + 
FILLCELL_X1 PHY_29 (); + FILLCELL_X1 PHY_3 (); + FILLCELL_X1 PHY_30 (); + FILLCELL_X1 PHY_31 (); + FILLCELL_X1 PHY_32 (); + FILLCELL_X1 PHY_33 (); + FILLCELL_X1 PHY_34 (); + FILLCELL_X1 PHY_35 (); + FILLCELL_X1 PHY_36 (); + FILLCELL_X1 PHY_37 (); + FILLCELL_X1 PHY_38 (); + FILLCELL_X1 PHY_39 (); + FILLCELL_X1 PHY_4 (); + FILLCELL_X1 PHY_40 (); + FILLCELL_X1 PHY_41 (); + FILLCELL_X1 PHY_42 (); + FILLCELL_X1 PHY_43 (); + FILLCELL_X1 PHY_44 (); + FILLCELL_X1 PHY_45 (); + FILLCELL_X1 PHY_46 (); + FILLCELL_X1 PHY_47 (); + FILLCELL_X1 PHY_48 (); + FILLCELL_X1 PHY_49 (); + FILLCELL_X1 PHY_5 (); + FILLCELL_X1 PHY_50 (); + FILLCELL_X1 PHY_51 (); + FILLCELL_X1 PHY_52 (); + FILLCELL_X1 PHY_53 (); + FILLCELL_X1 PHY_54 (); + FILLCELL_X1 PHY_55 (); + FILLCELL_X1 PHY_56 (); + FILLCELL_X1 PHY_57 (); + FILLCELL_X1 PHY_58 (); + FILLCELL_X1 PHY_59 (); + FILLCELL_X1 PHY_6 (); + FILLCELL_X1 PHY_60 (); + FILLCELL_X1 PHY_61 (); + FILLCELL_X1 PHY_62 (); + FILLCELL_X1 PHY_63 (); + FILLCELL_X1 PHY_64 (); + FILLCELL_X1 PHY_65 (); + FILLCELL_X1 PHY_66 (); + FILLCELL_X1 PHY_67 (); + FILLCELL_X1 PHY_68 (); + FILLCELL_X1 PHY_69 (); + FILLCELL_X1 PHY_7 (); + FILLCELL_X1 PHY_70 (); + FILLCELL_X1 PHY_71 (); + FILLCELL_X1 PHY_72 (); + FILLCELL_X1 PHY_73 (); + FILLCELL_X1 PHY_74 (); + FILLCELL_X1 PHY_75 (); + FILLCELL_X1 PHY_76 (); + FILLCELL_X1 PHY_77 (); + FILLCELL_X1 PHY_78 (); + FILLCELL_X1 PHY_79 (); + FILLCELL_X1 PHY_8 (); + FILLCELL_X1 PHY_80 (); + FILLCELL_X1 PHY_81 (); + FILLCELL_X1 PHY_82 (); + FILLCELL_X1 PHY_83 (); + FILLCELL_X1 PHY_84 (); + FILLCELL_X1 PHY_85 (); + FILLCELL_X1 PHY_86 (); + FILLCELL_X1 PHY_87 (); + FILLCELL_X1 PHY_88 (); + FILLCELL_X1 PHY_89 (); + FILLCELL_X1 PHY_9 (); + FILLCELL_X1 PHY_90 (); + FILLCELL_X1 PHY_91 (); + FILLCELL_X1 PHY_92 (); + FILLCELL_X1 PHY_93 (); + FILLCELL_X1 PHY_94 (); + FILLCELL_X1 PHY_95 (); + FILLCELL_X1 PHY_96 (); + FILLCELL_X1 PHY_97 (); + FILLCELL_X1 PHY_98 (); + FILLCELL_X1 PHY_99 (); + INV_X8 _440_ (.A(_109_), + .ZN(_142_)); + AND3_X1 _441_ 
(.A1(_142_), + .A2(_108_), + .A3(_059_), + .ZN(_423_)); + XOR2_X2 _442_ (.A(_110_), + .B(_126_), + .Z(_406_)); + NOR4_X1 _443_ (.A1(_139_), + .A2(_138_), + .A3(_136_), + .A4(_137_), + .ZN(_143_)); + NOR2_X1 _444_ (.A1(_135_), + .A2(_126_), + .ZN(_144_)); + INV_X2 _445_ (.A(_133_), + .ZN(_145_)); + INV_X2 _446_ (.A(_134_), + .ZN(_146_)); + NAND4_X1 _447_ (.A1(_143_), + .A2(_144_), + .A3(_145_), + .A4(_146_), + .ZN(_147_)); + NOR4_X1 _448_ (.A1(_128_), + .A2(_127_), + .A3(_140_), + .A4(_141_), + .ZN(_148_)); + NOR4_X1 _449_ (.A1(_132_), + .A2(_131_), + .A3(_129_), + .A4(_130_), + .ZN(_149_)); + NAND2_X1 _450_ (.A1(_148_), + .A2(_149_), + .ZN(_150_)); + NOR2_X1 _451_ (.A1(_147_), + .A2(_150_), + .ZN(_151_)); + INV_X2 _452_ (.A(_405_), + .ZN(_152_)); + INV_X2 _453_ (.A(_058_), + .ZN(_153_)); + NAND3_X1 _454_ (.A1(_151_), + .A2(_152_), + .A3(_153_), + .ZN(_154_)); + AND2_X1 _455_ (.A1(_423_), + .A2(_422_), + .ZN(_155_)); + OR3_X1 _456_ (.A1(_155_), + .A2(_057_), + .A3(_405_), + .ZN(_156_)); + NAND2_X1 _457_ (.A1(_154_), + .A2(_156_), + .ZN(_055_)); + OAI211_X1 _458_ (.A(_152_), + .B(_153_), + .C1(_147_), + .C2(_150_), + .ZN(_157_)); + INV_X4 _459_ (.A(_059_), + .ZN(_158_)); + BUF_X4 _460_ (.A(_158_), + .Z(_159_)); + BUF_X4 _461_ (.A(_403_), + .Z(_160_)); + NAND4_X1 _462_ (.A1(_152_), + .A2(_159_), + .A3(_160_), + .A4(_404_), + .ZN(_161_)); + NAND2_X1 _463_ (.A1(_157_), + .A2(_161_), + .ZN(_056_)); + NAND3_X1 _464_ (.A1(_423_), + .A2(_152_), + .A3(_422_), + .ZN(_162_)); + NOR2_X1 _465_ (.A1(_162_), + .A2(_057_), + .ZN(_163_)); + AOI211_X4 _466_ (.A(_405_), + .B(_059_), + .C1(_403_), + .C2(_404_), + .ZN(_164_)); + OR3_X1 _467_ (.A1(_163_), + .A2(_164_), + .A3(_405_), + .ZN(_054_)); + XNOR2_X2 _468_ (.A(_112_), + .B(_128_), + .ZN(_165_)); + XNOR2_X1 _469_ (.A(_111_), + .B(_127_), + .ZN(_166_)); + AND2_X1 _470_ (.A1(_165_), + .A2(_166_), + .ZN(_167_)); + XNOR2_X2 _471_ (.A(_125_), + .B(_141_), + .ZN(_168_)); + XNOR2_X2 _472_ (.A(_124_), + .B(_140_), + .ZN(_169_)); + AND3_X2 
_473_ (.A1(_167_), + .A2(_168_), + .A3(_169_), + .ZN(_170_)); + XNOR2_X2 _474_ (.A(_120_), + .B(_136_), + .ZN(_171_)); + XNOR2_X2 _475_ (.A(_121_), + .B(_137_), + .ZN(_172_)); + AND2_X2 _476_ (.A1(_171_), + .A2(_172_), + .ZN(_173_)); + XNOR2_X2 _477_ (.A(_122_), + .B(_138_), + .ZN(_174_)); + XNOR2_X2 _478_ (.A(_123_), + .B(_139_), + .ZN(_175_)); + AND2_X2 _479_ (.A1(_174_), + .A2(_175_), + .ZN(_176_)); + AND2_X2 _480_ (.A1(_173_), + .A2(_176_), + .ZN(_177_)); + XNOR2_X1 _481_ (.A(_115_), + .B(_131_), + .ZN(_178_)); + XNOR2_X2 _482_ (.A(_116_), + .B(_132_), + .ZN(_179_)); + NAND2_X1 _483_ (.A1(_178_), + .A2(_179_), + .ZN(_180_)); + XNOR2_X2 _484_ (.A(_114_), + .B(_130_), + .ZN(_181_)); + INV_X1 _485_ (.A(_181_), + .ZN(_182_)); + XNOR2_X1 _486_ (.A(_113_), + .B(_129_), + .ZN(_183_)); + INV_X1 _487_ (.A(_183_), + .ZN(_184_)); + NOR3_X2 _488_ (.A1(_180_), + .A2(_182_), + .A3(_184_), + .ZN(_185_)); + XNOR2_X2 _489_ (.A(_118_), + .B(_134_), + .ZN(_186_)); + INV_X2 _490_ (.A(_186_), + .ZN(_187_)); + XOR2_X2 _491_ (.A(_119_), + .B(_135_), + .Z(_188_)); + NOR2_X2 _492_ (.A1(_187_), + .A2(_188_), + .ZN(_189_)); + NAND4_X1 _493_ (.A1(_170_), + .A2(_177_), + .A3(_185_), + .A4(_189_), + .ZN(_190_)); + XNOR2_X2 _494_ (.A(_117_), + .B(_133_), + .ZN(_191_)); + INV_X1 _495_ (.A(_191_), + .ZN(_192_)); + NOR3_X2 _496_ (.A1(_190_), + .A2(_192_), + .A3(_406_), + .ZN(_193_)); + NAND3_X1 _497_ (.A1(_189_), + .A2(_176_), + .A3(_173_), + .ZN(_194_)); + NOR2_X1 _498_ (.A1(_145_), + .A2(_117_), + .ZN(_195_)); + INV_X1 _499_ (.A(_110_), + .ZN(_196_)); + NOR3_X1 _500_ (.A1(_195_), + .A2(_126_), + .A3(_196_), + .ZN(_197_)); + AND2_X2 _501_ (.A1(_145_), + .A2(_117_), + .ZN(_198_)); + OR3_X2 _502_ (.A1(_194_), + .A2(_197_), + .A3(_198_), + .ZN(_199_)); + INV_X1 _503_ (.A(_120_), + .ZN(_200_)); + AND3_X1 _504_ (.A1(_172_), + .A2(_136_), + .A3(_200_), + .ZN(_201_)); + INV_X1 _505_ (.A(_121_), + .ZN(_202_)); + AND2_X1 _506_ (.A1(_202_), + .A2(_137_), + .ZN(_203_)); + OAI21_X1 _507_ (.A(_176_), + 
.B1(_201_), + .B2(_203_), + .ZN(_204_)); + INV_X1 _508_ (.A(_122_), + .ZN(_205_)); + NAND3_X1 _509_ (.A1(_175_), + .A2(_205_), + .A3(_138_), + .ZN(_206_)); + INV_X1 _510_ (.A(_139_), + .ZN(_207_)); + NOR2_X1 _511_ (.A1(_207_), + .A2(_123_), + .ZN(_208_)); + INV_X1 _512_ (.A(_208_), + .ZN(_209_)); + AND3_X1 _513_ (.A1(_204_), + .A2(_206_), + .A3(_209_), + .ZN(_210_)); + INV_X1 _514_ (.A(_119_), + .ZN(_211_)); + NAND2_X1 _515_ (.A1(_211_), + .A2(_135_), + .ZN(_212_)); + OAI21_X1 _516_ (.A(_212_), + .B1(_118_), + .B2(_146_), + .ZN(_213_)); + OAI211_X1 _517_ (.A(_177_), + .B(_213_), + .C1(_135_), + .C2(_211_), + .ZN(_214_)); + NAND3_X1 _518_ (.A1(_199_), + .A2(_210_), + .A3(_214_), + .ZN(_215_)); + AND2_X1 _519_ (.A1(_170_), + .A2(_185_), + .ZN(_216_)); + NAND2_X1 _520_ (.A1(_215_), + .A2(_216_), + .ZN(_217_)); + INV_X1 _521_ (.A(_116_), + .ZN(_218_)); + NAND2_X1 _522_ (.A1(_218_), + .A2(_132_), + .ZN(_219_)); + INV_X1 _523_ (.A(_124_), + .ZN(_220_)); + AND3_X1 _524_ (.A1(_168_), + .A2(_140_), + .A3(_220_), + .ZN(_221_)); + INV_X1 _525_ (.A(_125_), + .ZN(_222_)); + AND2_X1 _526_ (.A1(_222_), + .A2(_141_), + .ZN(_223_)); + OAI21_X1 _527_ (.A(_167_), + .B1(_221_), + .B2(_223_), + .ZN(_224_)); + INV_X2 _528_ (.A(_112_), + .ZN(_225_)); + NOR2_X1 _529_ (.A1(_225_), + .A2(_128_), + .ZN(_226_)); + INV_X1 _530_ (.A(_111_), + .ZN(_227_)); + AOI22_X1 _531_ (.A1(_128_), + .A2(_225_), + .B1(_227_), + .B2(_127_), + .ZN(_228_)); + OAI21_X1 _532_ (.A(_224_), + .B1(_226_), + .B2(_228_), + .ZN(_229_)); + NAND2_X1 _533_ (.A1(_229_), + .A2(_185_), + .ZN(_230_)); + INV_X1 _534_ (.A(_115_), + .ZN(_231_)); + NAND3_X1 _535_ (.A1(_179_), + .A2(_231_), + .A3(_131_), + .ZN(_232_)); + AND4_X2 _536_ (.A1(_217_), + .A2(_219_), + .A3(_230_), + .A4(_232_), + .ZN(_233_)); + INV_X1 _537_ (.A(_114_), + .ZN(_234_)); + NOR2_X1 _538_ (.A1(_234_), + .A2(_130_), + .ZN(_235_)); + INV_X1 _539_ (.A(_113_), + .ZN(_236_)); + AOI22_X1 _540_ (.A1(_130_), + .A2(_234_), + .B1(_236_), + .B2(_129_), + .ZN(_237_)); + 
OR3_X1 _541_ (.A1(_180_), + .A2(_235_), + .A3(_237_), + .ZN(_238_)); + AOI21_X2 _542_ (.A(_193_), + .B1(_233_), + .B2(_238_), + .ZN(_239_)); + NOR2_X2 _543_ (.A1(_239_), + .A2(_142_), + .ZN(_240_)); + BUF_X4 _544_ (.A(_240_), + .Z(_241_)); + BUF_X4 _545_ (.A(_059_), + .Z(_242_)); + NAND3_X1 _546_ (.A1(_241_), + .A2(_242_), + .A3(_406_), + .ZN(_243_)); + OAI21_X1 _547_ (.A(_378_), + .B1(_142_), + .B2(_158_), + .ZN(_244_)); + NAND2_X1 _548_ (.A1(_239_), + .A2(_109_), + .ZN(_245_)); + OR2_X2 _549_ (.A1(_245_), + .A2(_158_), + .ZN(_246_)); + BUF_X8 _550_ (.A(_246_), + .Z(_247_)); + BUF_X8 _551_ (.A(_247_), + .Z(_248_)); + OAI211_X1 _552_ (.A(_243_), + .B(_244_), + .C1(_248_), + .C2(_060_), + .ZN(_249_)); + OR2_X2 _553_ (.A1(_109_), + .A2(_403_), + .ZN(_250_)); + BUF_X8 _554_ (.A(_250_), + .Z(_251_)); + MUX2_X1 _555_ (.A(_110_), + .B(_249_), + .S(_251_), + .Z(_076_)); + BUF_X8 _556_ (.A(_142_), + .Z(_252_)); + OAI21_X1 _557_ (.A(_379_), + .B1(_252_), + .B2(_159_), + .ZN(_253_)); + NAND2_X1 _558_ (.A1(_196_), + .A2(_126_), + .ZN(_254_)); + XOR2_X1 _559_ (.A(_191_), + .B(_254_), + .Z(_413_)); + NAND3_X1 _560_ (.A1(_240_), + .A2(_242_), + .A3(_413_), + .ZN(_255_)); + OAI211_X1 _561_ (.A(_253_), + .B(_255_), + .C1(_248_), + .C2(_061_), + .ZN(_256_)); + BUF_X8 _562_ (.A(_251_), + .Z(_257_)); + MUX2_X1 _563_ (.A(_117_), + .B(_256_), + .S(_257_), + .Z(_083_)); + OAI21_X1 _564_ (.A(_380_), + .B1(_252_), + .B2(_159_), + .ZN(_258_)); + AOI21_X2 _565_ (.A(_198_), + .B1(_191_), + .B2(_254_), + .ZN(_259_)); + XNOR2_X1 _566_ (.A(_259_), + .B(_186_), + .ZN(_414_)); + NAND3_X1 _567_ (.A1(_241_), + .A2(_242_), + .A3(_414_), + .ZN(_260_)); + OAI211_X1 _568_ (.A(_258_), + .B(_260_), + .C1(_248_), + .C2(_062_), + .ZN(_261_)); + MUX2_X1 _569_ (.A(_118_), + .B(_261_), + .S(_251_), + .Z(_084_)); + OAI21_X1 _570_ (.A(_381_), + .B1(_252_), + .B2(_159_), + .ZN(_262_)); + NOR2_X1 _571_ (.A1(_259_), + .A2(_187_), + .ZN(_263_)); + AND2_X1 _572_ (.A1(_146_), + .A2(_118_), + .ZN(_264_)); + OR2_X1 
_573_ (.A1(_263_), + .A2(_264_), + .ZN(_265_)); + XNOR2_X1 _574_ (.A(_265_), + .B(_188_), + .ZN(_415_)); + NAND3_X1 _575_ (.A1(_241_), + .A2(_059_), + .A3(_415_), + .ZN(_266_)); + OAI211_X1 _576_ (.A(_262_), + .B(_266_), + .C1(_248_), + .C2(_063_), + .ZN(_267_)); + MUX2_X1 _577_ (.A(_119_), + .B(_267_), + .S(_257_), + .Z(_085_)); + OAI21_X2 _578_ (.A(_383_), + .B1(_252_), + .B2(_159_), + .ZN(_268_)); + NOR3_X2 _579_ (.A1(_259_), + .A2(_187_), + .A3(_188_), + .ZN(_269_)); + NAND3_X1 _580_ (.A1(_212_), + .A2(_118_), + .A3(_146_), + .ZN(_270_)); + OAI21_X1 _581_ (.A(_270_), + .B1(_211_), + .B2(_135_), + .ZN(_271_)); + OR2_X2 _582_ (.A1(_269_), + .A2(_271_), + .ZN(_272_)); + XOR2_X1 _583_ (.A(_272_), + .B(_171_), + .Z(_416_)); + NAND3_X1 _584_ (.A1(_241_), + .A2(_059_), + .A3(_416_), + .ZN(_273_)); + OAI211_X1 _585_ (.A(_268_), + .B(_273_), + .C1(_247_), + .C2(_064_), + .ZN(_274_)); + MUX2_X1 _586_ (.A(_120_), + .B(_274_), + .S(_251_), + .Z(_086_)); + OAI21_X1 _587_ (.A(_384_), + .B1(_252_), + .B2(_158_), + .ZN(_275_)); + AND2_X1 _588_ (.A1(_272_), + .A2(_171_), + .ZN(_276_)); + NOR2_X2 _589_ (.A1(_200_), + .A2(_136_), + .ZN(_277_)); + NOR3_X1 _590_ (.A1(_276_), + .A2(_277_), + .A3(_172_), + .ZN(_278_)); + AOI221_X4 _591_ (.A(_278_), + .B1(_277_), + .B2(_172_), + .C1(_272_), + .C2(_173_), + .ZN(_417_)); + NAND3_X1 _592_ (.A1(_241_), + .A2(_242_), + .A3(_417_), + .ZN(_279_)); + OAI211_X1 _593_ (.A(_275_), + .B(_279_), + .C1(_247_), + .C2(_065_), + .ZN(_280_)); + MUX2_X1 _594_ (.A(_121_), + .B(_280_), + .S(_257_), + .Z(_087_)); + OAI21_X1 _595_ (.A(_385_), + .B1(_252_), + .B2(_159_), + .ZN(_281_)); + NAND2_X1 _596_ (.A1(_172_), + .A2(_277_), + .ZN(_282_)); + OAI21_X2 _597_ (.A(_282_), + .B1(_202_), + .B2(_137_), + .ZN(_283_)); + AOI21_X1 _598_ (.A(_283_), + .B1(_272_), + .B2(_173_), + .ZN(_284_)); + XNOR2_X1 _599_ (.A(_284_), + .B(_174_), + .ZN(_418_)); + NAND3_X1 _600_ (.A1(_241_), + .A2(_242_), + .A3(_418_), + .ZN(_285_)); + OAI211_X1 _601_ (.A(_281_), + .B(_285_), + 
.C1(_247_), + .C2(_066_), + .ZN(_286_)); + MUX2_X1 _602_ (.A(_122_), + .B(_286_), + .S(_257_), + .Z(_088_)); + OAI21_X1 _603_ (.A(_386_), + .B1(_142_), + .B2(_159_), + .ZN(_287_)); + NOR2_X2 _604_ (.A1(_205_), + .A2(_138_), + .ZN(_288_)); + AND2_X1 _605_ (.A1(_205_), + .A2(_138_), + .ZN(_289_)); + NOR3_X1 _606_ (.A1(_284_), + .A2(_288_), + .A3(_289_), + .ZN(_290_)); + NOR2_X1 _607_ (.A1(_290_), + .A2(_288_), + .ZN(_291_)); + XNOR2_X1 _608_ (.A(_291_), + .B(_175_), + .ZN(_419_)); + NAND3_X1 _609_ (.A1(_240_), + .A2(_242_), + .A3(_419_), + .ZN(_292_)); + OAI211_X1 _610_ (.A(_287_), + .B(_292_), + .C1(_248_), + .C2(_067_), + .ZN(_293_)); + MUX2_X1 _611_ (.A(_123_), + .B(_293_), + .S(_257_), + .Z(_089_)); + OAI21_X1 _612_ (.A(_387_), + .B1(_252_), + .B2(_158_), + .ZN(_294_)); + AND2_X1 _613_ (.A1(_175_), + .A2(_288_), + .ZN(_295_)); + AOI221_X1 _614_ (.A(_295_), + .B1(_207_), + .B2(_123_), + .C1(_283_), + .C2(_176_), + .ZN(_296_)); + OAI21_X1 _615_ (.A(_177_), + .B1(_269_), + .B2(_271_), + .ZN(_297_)); + NAND2_X1 _616_ (.A1(_296_), + .A2(_297_), + .ZN(_298_)); + XOR2_X1 _617_ (.A(_298_), + .B(_169_), + .Z(_420_)); + NAND3_X1 _618_ (.A1(_241_), + .A2(_242_), + .A3(_420_), + .ZN(_299_)); + OAI211_X1 _619_ (.A(_294_), + .B(_299_), + .C1(_247_), + .C2(_068_), + .ZN(_300_)); + MUX2_X1 _620_ (.A(_124_), + .B(_300_), + .S(_257_), + .Z(_090_)); + AND3_X1 _621_ (.A1(_298_), + .A2(_169_), + .A3(_168_), + .ZN(_301_)); + AND2_X1 _622_ (.A1(_298_), + .A2(_169_), + .ZN(_302_)); + NOR2_X2 _623_ (.A1(_220_), + .A2(_140_), + .ZN(_303_)); + NOR3_X1 _624_ (.A1(_302_), + .A2(_303_), + .A3(_168_), + .ZN(_304_)); + AOI211_X4 _625_ (.A(_301_), + .B(_304_), + .C1(_303_), + .C2(_168_), + .ZN(_421_)); + NAND3_X1 _626_ (.A1(_241_), + .A2(_421_), + .A3(_059_), + .ZN(_305_)); + OAI21_X1 _627_ (.A(_388_), + .B1(_142_), + .B2(_158_), + .ZN(_306_)); + OAI211_X1 _628_ (.A(_305_), + .B(_306_), + .C1(_247_), + .C2(_069_), + .ZN(_307_)); + MUX2_X1 _629_ (.A(_125_), + .B(_307_), + .S(_251_), + .Z(_091_)); 
+ NOR2_X1 _630_ (.A1(_222_), + .A2(_141_), + .ZN(_308_)); + AOI21_X1 _631_ (.A(_308_), + .B1(_168_), + .B2(_303_), + .ZN(_309_)); + INV_X1 _632_ (.A(_309_), + .ZN(_310_)); + OR3_X1 _633_ (.A1(_301_), + .A2(_310_), + .A3(_166_), + .ZN(_311_)); + OAI21_X1 _634_ (.A(_166_), + .B1(_301_), + .B2(_310_), + .ZN(_312_)); + AND2_X1 _635_ (.A1(_311_), + .A2(_312_), + .ZN(_407_)); + NAND3_X1 _636_ (.A1(_241_), + .A2(_242_), + .A3(_407_), + .ZN(_313_)); + OAI21_X1 _637_ (.A(_389_), + .B1(_142_), + .B2(_158_), + .ZN(_314_)); + OAI211_X1 _638_ (.A(_313_), + .B(_314_), + .C1(_248_), + .C2(_070_), + .ZN(_315_)); + MUX2_X1 _639_ (.A(_111_), + .B(_315_), + .S(_251_), + .Z(_077_)); + OAI21_X2 _640_ (.A(_390_), + .B1(_252_), + .B2(_159_), + .ZN(_316_)); + OAI21_X1 _641_ (.A(_316_), + .B1(_248_), + .B2(_071_), + .ZN(_317_)); + NOR2_X1 _642_ (.A1(_227_), + .A2(_127_), + .ZN(_318_)); + INV_X1 _643_ (.A(_318_), + .ZN(_319_)); + NAND2_X1 _644_ (.A1(_312_), + .A2(_319_), + .ZN(_320_)); + INV_X1 _645_ (.A(_165_), + .ZN(_321_)); + XNOR2_X1 _646_ (.A(_320_), + .B(_321_), + .ZN(_408_)); + AND3_X1 _647_ (.A1(_408_), + .A2(_241_), + .A3(_242_), + .ZN(_322_)); + OAI21_X1 _648_ (.A(_257_), + .B1(_317_), + .B2(_322_), + .ZN(_323_)); + OAI21_X1 _649_ (.A(_323_), + .B1(_225_), + .B2(_257_), + .ZN(_078_)); + OAI21_X2 _650_ (.A(_391_), + .B1(_252_), + .B2(_159_), + .ZN(_324_)); + AND2_X1 _651_ (.A1(_298_), + .A2(_170_), + .ZN(_325_)); + AND3_X1 _652_ (.A1(_310_), + .A2(_165_), + .A3(_166_), + .ZN(_326_)); + NAND2_X1 _653_ (.A1(_165_), + .A2(_318_), + .ZN(_327_)); + OAI21_X1 _654_ (.A(_327_), + .B1(_225_), + .B2(_128_), + .ZN(_328_)); + NOR3_X2 _655_ (.A1(_325_), + .A2(_326_), + .A3(_328_), + .ZN(_329_)); + XNOR2_X1 _656_ (.A(_329_), + .B(_183_), + .ZN(_409_)); + NAND3_X1 _657_ (.A1(_240_), + .A2(_242_), + .A3(_409_), + .ZN(_330_)); + OAI211_X1 _658_ (.A(_324_), + .B(_330_), + .C1(_248_), + .C2(_072_), + .ZN(_331_)); + MUX2_X1 _659_ (.A(_113_), + .B(_331_), + .S(_251_), + .Z(_079_)); + NOR2_X1 _660_ 
(.A1(_329_), + .A2(_184_), + .ZN(_332_)); + NOR2_X1 _661_ (.A1(_236_), + .A2(_129_), + .ZN(_333_)); + NOR2_X1 _662_ (.A1(_332_), + .A2(_333_), + .ZN(_334_)); + XNOR2_X1 _663_ (.A(_334_), + .B(_181_), + .ZN(_410_)); + AND2_X1 _664_ (.A1(_240_), + .A2(_059_), + .ZN(_335_)); + NAND2_X1 _665_ (.A1(_410_), + .A2(_335_), + .ZN(_336_)); + OAI21_X1 _666_ (.A(_392_), + .B1(_142_), + .B2(_158_), + .ZN(_337_)); + OAI211_X1 _667_ (.A(_336_), + .B(_337_), + .C1(_248_), + .C2(_073_), + .ZN(_338_)); + MUX2_X1 _668_ (.A(_114_), + .B(_338_), + .S(_251_), + .Z(_080_)); + OR3_X1 _669_ (.A1(_329_), + .A2(_184_), + .A3(_182_), + .ZN(_339_)); + INV_X1 _670_ (.A(_178_), + .ZN(_340_)); + AOI22_X1 _671_ (.A1(_181_), + .A2(_333_), + .B1(_073_), + .B2(_114_), + .ZN(_341_)); + AND3_X1 _672_ (.A1(_339_), + .A2(_340_), + .A3(_341_), + .ZN(_342_)); + AOI21_X1 _673_ (.A(_340_), + .B1(_339_), + .B2(_341_), + .ZN(_343_)); + NOR2_X1 _674_ (.A1(_342_), + .A2(_343_), + .ZN(_411_)); + NAND2_X1 _675_ (.A1(_411_), + .A2(_335_), + .ZN(_344_)); + OAI21_X1 _676_ (.A(_394_), + .B1(_142_), + .B2(_158_), + .ZN(_345_)); + OAI211_X1 _677_ (.A(_344_), + .B(_345_), + .C1(_248_), + .C2(_074_), + .ZN(_346_)); + MUX2_X1 _678_ (.A(_115_), + .B(_346_), + .S(_251_), + .Z(_081_)); + NOR2_X1 _679_ (.A1(_231_), + .A2(_131_), + .ZN(_347_)); + NOR2_X1 _680_ (.A1(_343_), + .A2(_347_), + .ZN(_348_)); + XNOR2_X1 _681_ (.A(_348_), + .B(_179_), + .ZN(_412_)); + AND2_X1 _682_ (.A1(_412_), + .A2(_335_), + .ZN(_349_)); + OAI21_X1 _683_ (.A(_395_), + .B1(_252_), + .B2(_159_), + .ZN(_350_)); + OAI21_X1 _684_ (.A(_350_), + .B1(_247_), + .B2(_075_), + .ZN(_351_)); + OAI21_X1 _685_ (.A(_257_), + .B1(_349_), + .B2(_351_), + .ZN(_352_)); + OAI21_X1 _686_ (.A(_352_), + .B1(_218_), + .B2(_257_), + .ZN(_082_)); + MUX2_X1 _687_ (.A(_110_), + .B(_371_), + .S(_160_), + .Z(_353_)); + NAND2_X2 _688_ (.A1(_245_), + .A2(_059_), + .ZN(_354_)); + BUF_X4 _689_ (.A(_354_), + .Z(_355_)); + MUX2_X1 _690_ (.A(_126_), + .B(_353_), + .S(_355_), + .Z(_092_)); 
+ MUX2_X1 _691_ (.A(_117_), + .B(_382_), + .S(_160_), + .Z(_356_)); + MUX2_X1 _692_ (.A(_133_), + .B(_356_), + .S(_355_), + .Z(_099_)); + MUX2_X1 _693_ (.A(_118_), + .B(_393_), + .S(_160_), + .Z(_357_)); + MUX2_X1 _694_ (.A(_134_), + .B(_357_), + .S(_355_), + .Z(_100_)); + MUX2_X1 _695_ (.A(_119_), + .B(_396_), + .S(_403_), + .Z(_358_)); + MUX2_X1 _696_ (.A(_135_), + .B(_358_), + .S(_355_), + .Z(_101_)); + MUX2_X1 _697_ (.A(_120_), + .B(_397_), + .S(_160_), + .Z(_359_)); + MUX2_X1 _698_ (.A(_136_), + .B(_359_), + .S(_355_), + .Z(_102_)); + MUX2_X1 _699_ (.A(_121_), + .B(_398_), + .S(_403_), + .Z(_360_)); + MUX2_X1 _700_ (.A(_137_), + .B(_360_), + .S(_355_), + .Z(_103_)); + MUX2_X1 _701_ (.A(_122_), + .B(_399_), + .S(_403_), + .Z(_361_)); + MUX2_X1 _702_ (.A(_138_), + .B(_361_), + .S(_355_), + .Z(_104_)); + MUX2_X1 _703_ (.A(_123_), + .B(_400_), + .S(_403_), + .Z(_362_)); + MUX2_X1 _704_ (.A(_139_), + .B(_362_), + .S(_355_), + .Z(_105_)); + MUX2_X1 _705_ (.A(_124_), + .B(_401_), + .S(_403_), + .Z(_363_)); + MUX2_X1 _706_ (.A(_140_), + .B(_363_), + .S(_354_), + .Z(_106_)); + MUX2_X1 _707_ (.A(_125_), + .B(_402_), + .S(_403_), + .Z(_364_)); + MUX2_X1 _708_ (.A(_141_), + .B(_364_), + .S(_355_), + .Z(_107_)); + MUX2_X1 _709_ (.A(_111_), + .B(_372_), + .S(_403_), + .Z(_365_)); + MUX2_X1 _710_ (.A(_127_), + .B(_365_), + .S(_354_), + .Z(_093_)); + MUX2_X1 _711_ (.A(_112_), + .B(_373_), + .S(_160_), + .Z(_366_)); + MUX2_X1 _712_ (.A(_128_), + .B(_366_), + .S(_354_), + .Z(_094_)); + MUX2_X1 _713_ (.A(_113_), + .B(_374_), + .S(_160_), + .Z(_367_)); + MUX2_X1 _714_ (.A(_129_), + .B(_367_), + .S(_354_), + .Z(_095_)); + MUX2_X1 _715_ (.A(_114_), + .B(_375_), + .S(_160_), + .Z(_368_)); + MUX2_X1 _716_ (.A(_130_), + .B(_368_), + .S(_354_), + .Z(_096_)); + MUX2_X1 _717_ (.A(_115_), + .B(_376_), + .S(_160_), + .Z(_369_)); + MUX2_X1 _718_ (.A(_131_), + .B(_369_), + .S(_354_), + .Z(_097_)); + MUX2_X1 _719_ (.A(_116_), + .B(_377_), + .S(_160_), + .Z(_370_)); + MUX2_X1 _720_ (.A(_132_), 
+ .B(_370_), + .S(_355_), + .Z(_098_)); + BUF_X4 _721_ (.A(reset), + .Z(_405_)); + BUF_X4 _722_ (.A(\ctrl.state.out[2] ), + .Z(_109_)); + BUF_X1 _723_ (.A(\ctrl.state.out[1] ), + .Z(_108_)); + BUF_X4 _724_ (.A(_005_), + .Z(_059_)); + BUF_X1 _725_ (.A(_423_), + .Z(resp_val)); + BUF_X2 _726_ (.A(resp_rdy), + .Z(_422_)); + BUF_X2 _727_ (.A(\dpath.a_lt_b$in0[15] ), + .Z(_116_)); + BUF_X2 _728_ (.A(\dpath.a_lt_b$in1[15] ), + .Z(_132_)); + BUF_X2 _729_ (.A(\dpath.a_lt_b$in0[14] ), + .Z(_115_)); + BUF_X2 _730_ (.A(\dpath.a_lt_b$in1[14] ), + .Z(_131_)); + BUF_X4 _731_ (.A(\dpath.a_lt_b$in0[13] ), + .Z(_114_)); + BUF_X4 _732_ (.A(\dpath.a_lt_b$in1[13] ), + .Z(_130_)); + BUF_X2 _733_ (.A(\dpath.a_lt_b$in0[12] ), + .Z(_113_)); + BUF_X2 _734_ (.A(\dpath.a_lt_b$in1[12] ), + .Z(_129_)); + BUF_X2 _735_ (.A(\dpath.a_lt_b$in0[11] ), + .Z(_112_)); + BUF_X4 _736_ (.A(\dpath.a_lt_b$in1[11] ), + .Z(_128_)); + BUF_X2 _737_ (.A(\dpath.a_lt_b$in0[10] ), + .Z(_111_)); + BUF_X2 _738_ (.A(\dpath.a_lt_b$in1[10] ), + .Z(_127_)); + BUF_X2 _739_ (.A(\dpath.a_lt_b$in0[9] ), + .Z(_125_)); + BUF_X4 _740_ (.A(\dpath.a_lt_b$in1[9] ), + .Z(_141_)); + BUF_X2 _741_ (.A(\dpath.a_lt_b$in0[8] ), + .Z(_124_)); + BUF_X4 _742_ (.A(\dpath.a_lt_b$in1[8] ), + .Z(_140_)); + BUF_X4 _743_ (.A(\dpath.a_lt_b$in0[7] ), + .Z(_123_)); + BUF_X4 _744_ (.A(\dpath.a_lt_b$in1[7] ), + .Z(_139_)); + BUF_X2 _745_ (.A(\dpath.a_lt_b$in0[6] ), + .Z(_122_)); + BUF_X4 _746_ (.A(\dpath.a_lt_b$in1[6] ), + .Z(_138_)); + BUF_X2 _747_ (.A(\dpath.a_lt_b$in0[5] ), + .Z(_121_)); + BUF_X4 _748_ (.A(\dpath.a_lt_b$in1[5] ), + .Z(_137_)); + BUF_X2 _749_ (.A(\dpath.a_lt_b$in0[4] ), + .Z(_120_)); + BUF_X4 _750_ (.A(\dpath.a_lt_b$in1[4] ), + .Z(_136_)); + BUF_X2 _751_ (.A(\dpath.a_lt_b$in0[3] ), + .Z(_119_)); + BUF_X4 _752_ (.A(\dpath.a_lt_b$in1[3] ), + .Z(_135_)); + BUF_X4 _753_ (.A(\dpath.a_lt_b$in0[2] ), + .Z(_118_)); + BUF_X2 _754_ (.A(\dpath.a_lt_b$in1[2] ), + .Z(_134_)); + BUF_X2 _755_ (.A(\dpath.a_lt_b$in0[1] ), + .Z(_117_)); + BUF_X2 _756_ 
(.A(\dpath.a_lt_b$in1[1] ), + .Z(_133_)); + BUF_X2 _757_ (.A(\dpath.a_lt_b$in0[0] ), + .Z(_110_)); + BUF_X4 _758_ (.A(\dpath.a_lt_b$in1[0] ), + .Z(_126_)); + CLKBUF_X1 _759_ (.A(_406_), + .Z(resp_msg[0])); + BUF_X1 _760_ (.A(_004_), + .Z(_058_)); + BUF_X1 _761_ (.A(_003_), + .Z(_057_)); + BUF_X1 _762_ (.A(_055_), + .Z(_001_)); + BUF_X4 _763_ (.A(req_rdy), + .Z(_403_)); + BUF_X2 _764_ (.A(req_val), + .Z(_404_)); + BUF_X1 _765_ (.A(_056_), + .Z(_002_)); + BUF_X1 _766_ (.A(_054_), + .Z(_000_)); + BUF_X1 _767_ (.A(_006_), + .Z(_060_)); + BUF_X2 _768_ (.A(req_msg[16]), + .Z(_378_)); + CLKBUF_X1 _769_ (.A(_076_), + .Z(_022_)); + BUF_X1 _770_ (.A(_007_), + .Z(_061_)); + BUF_X2 _771_ (.A(req_msg[17]), + .Z(_379_)); + CLKBUF_X1 _772_ (.A(_083_), + .Z(_029_)); + BUF_X1 _773_ (.A(_008_), + .Z(_062_)); + BUF_X2 _774_ (.A(req_msg[18]), + .Z(_380_)); + CLKBUF_X1 _775_ (.A(_084_), + .Z(_030_)); + BUF_X1 _776_ (.A(_009_), + .Z(_063_)); + BUF_X2 _777_ (.A(req_msg[19]), + .Z(_381_)); + CLKBUF_X1 _778_ (.A(_085_), + .Z(_031_)); + BUF_X1 _779_ (.A(_010_), + .Z(_064_)); + BUF_X2 _780_ (.A(req_msg[20]), + .Z(_383_)); + CLKBUF_X1 _781_ (.A(_086_), + .Z(_032_)); + BUF_X1 _782_ (.A(_011_), + .Z(_065_)); + BUF_X2 _783_ (.A(req_msg[21]), + .Z(_384_)); + CLKBUF_X1 _784_ (.A(_087_), + .Z(_033_)); + BUF_X1 _785_ (.A(_012_), + .Z(_066_)); + BUF_X2 _786_ (.A(req_msg[22]), + .Z(_385_)); + CLKBUF_X1 _787_ (.A(_088_), + .Z(_034_)); + BUF_X1 _788_ (.A(_013_), + .Z(_067_)); + BUF_X2 _789_ (.A(req_msg[23]), + .Z(_386_)); + CLKBUF_X1 _790_ (.A(_089_), + .Z(_035_)); + BUF_X1 _791_ (.A(_014_), + .Z(_068_)); + BUF_X2 _792_ (.A(req_msg[24]), + .Z(_387_)); + CLKBUF_X1 _793_ (.A(_090_), + .Z(_036_)); + BUF_X1 _794_ (.A(_015_), + .Z(_069_)); + BUF_X2 _795_ (.A(req_msg[25]), + .Z(_388_)); + CLKBUF_X1 _796_ (.A(_091_), + .Z(_037_)); + BUF_X1 _797_ (.A(_016_), + .Z(_070_)); + BUF_X2 _798_ (.A(req_msg[26]), + .Z(_389_)); + CLKBUF_X1 _799_ (.A(_077_), + .Z(_023_)); + BUF_X1 _800_ (.A(_017_), + .Z(_071_)); + BUF_X2 
_801_ (.A(req_msg[27]), + .Z(_390_)); + CLKBUF_X1 _802_ (.A(_078_), + .Z(_024_)); + BUF_X1 _803_ (.A(_018_), + .Z(_072_)); + BUF_X2 _804_ (.A(req_msg[28]), + .Z(_391_)); + CLKBUF_X1 _805_ (.A(_079_), + .Z(_025_)); + BUF_X2 _806_ (.A(_019_), + .Z(_073_)); + BUF_X2 _807_ (.A(req_msg[29]), + .Z(_392_)); + CLKBUF_X1 _808_ (.A(_080_), + .Z(_026_)); + BUF_X1 _809_ (.A(_020_), + .Z(_074_)); + BUF_X2 _810_ (.A(req_msg[30]), + .Z(_394_)); + CLKBUF_X1 _811_ (.A(_081_), + .Z(_027_)); + BUF_X1 _812_ (.A(_021_), + .Z(_075_)); + BUF_X2 _813_ (.A(req_msg[31]), + .Z(_395_)); + CLKBUF_X1 _814_ (.A(_082_), + .Z(_028_)); + BUF_X1 _815_ (.A(req_msg[0]), + .Z(_371_)); + CLKBUF_X1 _816_ (.A(_092_), + .Z(_038_)); + BUF_X2 _817_ (.A(req_msg[1]), + .Z(_382_)); + CLKBUF_X1 _818_ (.A(_099_), + .Z(_045_)); + BUF_X1 _819_ (.A(req_msg[2]), + .Z(_393_)); + CLKBUF_X1 _820_ (.A(_100_), + .Z(_046_)); + BUF_X1 _821_ (.A(req_msg[3]), + .Z(_396_)); + CLKBUF_X1 _822_ (.A(_101_), + .Z(_047_)); + BUF_X2 _823_ (.A(req_msg[4]), + .Z(_397_)); + CLKBUF_X1 _824_ (.A(_102_), + .Z(_048_)); + BUF_X2 _825_ (.A(req_msg[5]), + .Z(_398_)); + CLKBUF_X1 _826_ (.A(_103_), + .Z(_049_)); + BUF_X1 _827_ (.A(req_msg[6]), + .Z(_399_)); + CLKBUF_X1 _828_ (.A(_104_), + .Z(_050_)); + BUF_X2 _829_ (.A(req_msg[7]), + .Z(_400_)); + CLKBUF_X1 _830_ (.A(_105_), + .Z(_051_)); + BUF_X2 _831_ (.A(req_msg[8]), + .Z(_401_)); + CLKBUF_X1 _832_ (.A(_106_), + .Z(_052_)); + BUF_X2 _833_ (.A(req_msg[9]), + .Z(_402_)); + CLKBUF_X1 _834_ (.A(_107_), + .Z(_053_)); + BUF_X2 _835_ (.A(req_msg[10]), + .Z(_372_)); + CLKBUF_X1 _836_ (.A(_093_), + .Z(_039_)); + BUF_X1 _837_ (.A(req_msg[11]), + .Z(_373_)); + CLKBUF_X1 _838_ (.A(_094_), + .Z(_040_)); + BUF_X2 _839_ (.A(req_msg[12]), + .Z(_374_)); + CLKBUF_X1 _840_ (.A(_095_), + .Z(_041_)); + BUF_X2 _841_ (.A(req_msg[13]), + .Z(_375_)); + CLKBUF_X1 _842_ (.A(_096_), + .Z(_042_)); + BUF_X2 _843_ (.A(req_msg[14]), + .Z(_376_)); + CLKBUF_X1 _844_ (.A(_097_), + .Z(_043_)); + BUF_X2 _845_ (.A(req_msg[15]), + 
.Z(_377_)); + CLKBUF_X1 _846_ (.A(_098_), + .Z(_044_)); + BUF_X1 _847_ (.A(_413_), + .Z(resp_msg[1])); + BUF_X1 _848_ (.A(_414_), + .Z(resp_msg[2])); + BUF_X1 _849_ (.A(_415_), + .Z(resp_msg[3])); + BUF_X1 _850_ (.A(_416_), + .Z(resp_msg[4])); + BUF_X1 _851_ (.A(_417_), + .Z(resp_msg[5])); + BUF_X1 _852_ (.A(_418_), + .Z(resp_msg[6])); + BUF_X1 _853_ (.A(_419_), + .Z(resp_msg[7])); + BUF_X1 _854_ (.A(_420_), + .Z(resp_msg[8])); + BUF_X1 _855_ (.A(_421_), + .Z(resp_msg[9])); + BUF_X1 _856_ (.A(_407_), + .Z(resp_msg[10])); + BUF_X1 _857_ (.A(_408_), + .Z(resp_msg[11])); + BUF_X1 _858_ (.A(_409_), + .Z(resp_msg[12])); + BUF_X1 _859_ (.A(_410_), + .Z(resp_msg[13])); + BUF_X1 _860_ (.A(_411_), + .Z(resp_msg[14])); + BUF_X1 _861_ (.A(_412_), + .Z(resp_msg[15])); + DFF_X1 _862_ (.D(_000_), + .CK(clk), + .Q(req_rdy), + .QN(_005_)); + DFF_X1 _863_ (.D(_001_), + .CK(clk), + .Q(\ctrl.state.out[1] ), + .QN(_003_)); + DFF_X1 _864_ (.D(_002_), + .CK(clk), + .Q(\ctrl.state.out[2] ), + .QN(_004_)); + DFF_X1 _865_ (.D(_022_), + .CK(clk), + .Q(\dpath.a_lt_b$in0[0] ), + .QN(_424_)); + DFF_X1 _866_ (.D(_029_), + .CK(clk), + .Q(\dpath.a_lt_b$in0[1] ), + .QN(_425_)); + DFF_X1 _867_ (.D(_030_), + .CK(clk), + .Q(\dpath.a_lt_b$in0[2] ), + .QN(_426_)); + DFF_X1 _868_ (.D(_031_), + .CK(clk), + .Q(\dpath.a_lt_b$in0[3] ), + .QN(_427_)); + DFF_X1 _869_ (.D(_032_), + .CK(clk), + .Q(\dpath.a_lt_b$in0[4] ), + .QN(_428_)); + DFF_X1 _870_ (.D(_033_), + .CK(clk), + .Q(\dpath.a_lt_b$in0[5] ), + .QN(_429_)); + DFF_X1 _871_ (.D(_034_), + .CK(clk), + .Q(\dpath.a_lt_b$in0[6] ), + .QN(_430_)); + DFF_X1 _872_ (.D(_035_), + .CK(clk), + .Q(\dpath.a_lt_b$in0[7] ), + .QN(_431_)); + DFF_X1 _873_ (.D(_036_), + .CK(clk), + .Q(\dpath.a_lt_b$in0[8] ), + .QN(_432_)); + DFF_X1 _874_ (.D(_037_), + .CK(clk), + .Q(\dpath.a_lt_b$in0[9] ), + .QN(_433_)); + DFF_X1 _875_ (.D(_023_), + .CK(clk), + .Q(\dpath.a_lt_b$in0[10] ), + .QN(_434_)); + DFF_X1 _876_ (.D(_024_), + .CK(clk), + .Q(\dpath.a_lt_b$in0[11] ), + .QN(_435_)); + 
DFF_X1 _877_ (.D(_025_), + .CK(clk), + .Q(\dpath.a_lt_b$in0[12] ), + .QN(_436_)); + DFF_X1 _878_ (.D(_026_), + .CK(clk), + .Q(\dpath.a_lt_b$in0[13] ), + .QN(_437_)); + DFF_X1 _879_ (.D(_027_), + .CK(clk), + .Q(\dpath.a_lt_b$in0[14] ), + .QN(_438_)); + DFF_X1 _880_ (.D(_028_), + .CK(clk), + .Q(\dpath.a_lt_b$in0[15] ), + .QN(_439_)); + DFF_X1 _881_ (.D(_038_), + .CK(clk), + .Q(\dpath.a_lt_b$in1[0] ), + .QN(_006_)); + DFF_X1 _882_ (.D(_045_), + .CK(clk), + .Q(\dpath.a_lt_b$in1[1] ), + .QN(_007_)); + DFF_X1 _883_ (.D(_046_), + .CK(clk), + .Q(\dpath.a_lt_b$in1[2] ), + .QN(_008_)); + DFF_X1 _884_ (.D(_047_), + .CK(clk), + .Q(\dpath.a_lt_b$in1[3] ), + .QN(_009_)); + DFF_X1 _885_ (.D(_048_), + .CK(clk), + .Q(\dpath.a_lt_b$in1[4] ), + .QN(_010_)); + DFF_X1 _886_ (.D(_049_), + .CK(clk), + .Q(\dpath.a_lt_b$in1[5] ), + .QN(_011_)); + DFF_X1 _887_ (.D(_050_), + .CK(clk), + .Q(\dpath.a_lt_b$in1[6] ), + .QN(_012_)); + DFF_X1 _888_ (.D(_051_), + .CK(clk), + .Q(\dpath.a_lt_b$in1[7] ), + .QN(_013_)); + DFF_X1 _889_ (.D(_052_), + .CK(clk), + .Q(\dpath.a_lt_b$in1[8] ), + .QN(_014_)); + DFF_X1 _890_ (.D(_053_), + .CK(clk), + .Q(\dpath.a_lt_b$in1[9] ), + .QN(_015_)); + DFF_X1 _891_ (.D(_039_), + .CK(clk), + .Q(\dpath.a_lt_b$in1[10] ), + .QN(_016_)); + DFF_X1 _892_ (.D(_040_), + .CK(clk), + .Q(\dpath.a_lt_b$in1[11] ), + .QN(_017_)); + DFF_X1 _893_ (.D(_041_), + .CK(clk), + .Q(\dpath.a_lt_b$in1[12] ), + .QN(_018_)); + DFF_X1 _894_ (.D(_042_), + .CK(clk), + .Q(\dpath.a_lt_b$in1[13] ), + .QN(_019_)); + DFF_X1 _895_ (.D(_043_), + .CK(clk), + .Q(\dpath.a_lt_b$in1[14] ), + .QN(_020_)); + DFF_X1 _896_ (.D(_044_), + .CK(clk), + .Q(\dpath.a_lt_b$in1[15] ), + .QN(_021_)); +endmodule diff --git a/src/rsz/test/global_sizing_threads.tcl b/src/rsz/test/global_sizing_threads.tcl new file mode 100644 index 00000000000..19899e1b68d --- /dev/null +++ b/src/rsz/test/global_sizing_threads.tcl @@ -0,0 +1,7 @@ +# Multi-threaded run of the global sizing coverage flow (see global_sizing.tcl). 
+# Exercises the parallel Phase-B worker path and asserts it produces the same +# netlist as the serial golden (global_sizing.vok). +source "helpers.tcl" +set global_sizing_threads 8 +set global_sizing_result "global_sizing_threads" +source "global_sizing.tcl" diff --git a/src/rsz/test/repair_setup_invalid_phase.ok b/src/rsz/test/repair_setup_invalid_phase.ok index 03b9b776074..5f4ab09d837 100644 --- a/src/rsz/test/repair_setup_invalid_phase.ok +++ b/src/rsz/test/repair_setup_invalid_phase.ok @@ -4,7 +4,7 @@ [INFO ODB-0131] Created 3 components and 15 component-terminals. [INFO ODB-0132] Created 2 special nets and 0 connections. [INFO ODB-0133] Created 7 nets and 9 connections. -[ERROR RSZ-0223] No phase names specified. Valid phase names are: LEGACY, WNS, WNS_PATH, WNS_CONE, TNS, ENDPOINT_FANIN, STARTPOINT_FANOUT, LAST_GASP, CRIT_VT_SWAP, REROUTE +[ERROR RSZ-0223] No phase names specified. Valid phase names are: LEGACY, WNS, WNS_PATH, WNS_CONE, TNS, ENDPOINT_FANIN, STARTPOINT_FANOUT, LAST_GASP, CRIT_VT_SWAP, REROUTE, GLOBAL_SIZING Caught expected empty phase error: RSZ-0223 [INFO RSZ-0100] Repair move sequence: UnbufferMove SizeUpMove SwapPinsMove BufferMove CloneMove SplitLoadMove [INFO RSZ-0094] Found 2 endpoints with setup violations. @@ -16,5 +16,5 @@ Caught expected empty phase error: RSZ-0223 0* | 0 | 0 | 0 | 0 | 0 | +0.0% | -0.278 | -0.7 | -0.5 | 2 | y2 10* | 0 | 3 | 0 | 0 | 1 | +53.8% | -0.195 | -0.5 | -0.3 | 2 | y2 14* | 0 | 4 | 0 | 0 | 0 | +84.6% | -0.157 | -0.4 | -0.3 | 2 | y2 -[ERROR RSZ-0217] Unknown phase name 'BAD_PHASE'. Valid phase names are: LEGACY, WNS, WNS_PATH, WNS_CONE, TNS, ENDPOINT_FANIN, STARTPOINT_FANOUT, LAST_GASP, CRIT_VT_SWAP, REROUTE +[ERROR RSZ-0217] Unknown phase name 'BAD_PHASE'. 
Valid phase names are: LEGACY, WNS, WNS_PATH, WNS_CONE, TNS, ENDPOINT_FANIN, STARTPOINT_FANOUT, LAST_GASP, CRIT_VT_SWAP, REROUTE, GLOBAL_SIZING Caught expected invalid phase error: RSZ-0217 From 4c3e619379c87b78a0c8a1c4e48ffbfe86e9653f Mon Sep 17 00:00:00 2001 From: Eren Dogan Date: Fri, 5 Jun 2026 20:06:31 -0700 Subject: [PATCH 2/8] rsz: clang-tidy Signed-off-by: Eren Dogan --- src/rsz/src/LRSubproblem.cc | 18 ++++++++---------- src/rsz/src/policy/GlobalSizingPolicy.cc | 16 +++++++--------- 2 files changed, 15 insertions(+), 19 deletions(-) diff --git a/src/rsz/src/LRSubproblem.cc b/src/rsz/src/LRSubproblem.cc index 34eac8cd2e8..95263899a1f 100644 --- a/src/rsz/src/LRSubproblem.cc +++ b/src/rsz/src/LRSubproblem.cc @@ -153,7 +153,7 @@ float LRSubproblem::leakageOrArea(sta::LibertyCell* cell) const bool LRSubproblem::isDataArc(const sta::Edge* edge) const { const sta::TimingRole* role = edge->role(); - if (role->isTimingCheck()) { + if (role != nullptr && role->isTimingCheck()) { return false; } if (edge->isDisabledLoop()) { @@ -241,7 +241,7 @@ bool LRSubproblem::snapshot(sta::Instance* inst, continue; } const sta::EdgeId id = graph_->id(e); - if (static_cast(id) >= lambda_size) { + if (std::cmp_greater_equal(id, lambda_size)) { continue; } lam_sum += lambda[id]; @@ -352,7 +352,7 @@ bool LRSubproblem::snapshot(sta::Instance* inst, continue; } const sta::EdgeId id = graph_->id(e); - if (static_cast(id) >= lambda_size) { + if (std::cmp_greater_equal(id, lambda_size)) { continue; } lam_U += lambda[id]; @@ -435,13 +435,11 @@ float LRSubproblem::evaluateCellCost(const GateSnapshot& snap, // Candidate missing this input port - incompatible. return std::numeric_limits::infinity(); } - float load_pert = u.load_U_cur - u.c_in_cur + c_in_cand; - if (load_pert < 0.0f) { - // Numerical safety: extreme C_in mismatches can push the perturbed - // load slightly negative. Clamp at zero rather than rejecting; the - // gateDelay LUT is well-defined at zero load. 
- load_pert = 0.0f; - } + // Numerical safety: extreme C_in mismatches can push the perturbed load + // slightly negative. Clamp at zero rather than rejecting; the gateDelay LUT + // is well-defined at zero load. + const float load_pert + = std::max(u.load_U_cur - u.c_in_cur + c_in_cand, 0.0f); const float d_U = sta::delayAsFloat(resizer_->gateDelay( u.drv_port, load_pert, scene, max_, arc_delay_calc)); cost += timing_weight * u.lambda_U_drv * d_U; diff --git a/src/rsz/src/policy/GlobalSizingPolicy.cc b/src/rsz/src/policy/GlobalSizingPolicy.cc index 3667e65139a..1cccf6c7e1c 100644 --- a/src/rsz/src/policy/GlobalSizingPolicy.cc +++ b/src/rsz/src/policy/GlobalSizingPolicy.cc @@ -56,7 +56,7 @@ GlobalSizingPolicy::~GlobalSizingPolicy() = default; bool GlobalSizingPolicy::isDataArc(const sta::Edge* edge) const { const sta::TimingRole* role = edge->role(); - if (role->isTimingCheck()) { + if (role != nullptr && role->isTimingCheck()) { return false; } if (edge->isDisabledLoop()) { @@ -328,11 +328,9 @@ void GlobalSizingPolicy::projectFlowBalance(const LRParams& params) vertices.push_back(vit.next()); } } - std::sort(vertices.begin(), - vertices.end(), - [](const sta::Vertex* a, const sta::Vertex* b) { - return a->level() > b->level(); - }); + std::ranges::sort(vertices, [](const sta::Vertex* a, const sta::Vertex* b) { + return a->level() > b->level(); + }); int rescaled = 0; int zero_sum_fallback = 0; @@ -543,8 +541,8 @@ GlobalSizingPolicy::SweepStats GlobalSizingPolicy::singleSweep( = thread_pool_->parallelMap( snapshots, [this, timing_weight, src](const LRSubproblem::GateSnapshot& snap) { - thread_local sta::ArcDelayCalc* cached_src = nullptr; - thread_local std::unique_ptr adc; + static thread_local sta::ArcDelayCalc* cached_src = nullptr; + static thread_local std::unique_ptr adc; if (adc == nullptr || cached_src != src) { adc.reset(src->copy()); cached_src = src; @@ -622,7 +620,7 @@ float GlobalSizingPolicy::computeAutoTimingWeight(const LRParams& params) const 
continue; } const sta::EdgeId id = graph_->id(e); - if (static_cast(id) >= lambda_size) { + if (std::cmp_greater_equal(id, lambda_size)) { continue; } lam_sum += lambda_[id]; From 9a56ff0eb4fadae934bcd17d0e004006c8182757 Mon Sep 17 00:00:00 2001 From: Eren Dogan Date: Sat, 6 Jun 2026 12:29:44 -0700 Subject: [PATCH 3/8] rsz: Fix edge id overflow Signed-off-by: Eren Dogan --- src/rsz/src/policy/GlobalSizingPolicy.cc | 26 +++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/src/rsz/src/policy/GlobalSizingPolicy.cc b/src/rsz/src/policy/GlobalSizingPolicy.cc index 1cccf6c7e1c..4ca57ecaced 100644 --- a/src/rsz/src/policy/GlobalSizingPolicy.cc +++ b/src/rsz/src/policy/GlobalSizingPolicy.cc @@ -348,7 +348,16 @@ void GlobalSizingPolicy::projectFlowBalance(const LRParams& params) if (!isDataArc(e)) { continue; } - target += lambda_[graph_->id(e)]; + // lambda_ is sized to the edge-id space captured in allocate(). A + // sweep can replace cells and the subsequent updateParasitics()/ + // findRequireds() rebuild arcs, minting edge ids beyond that space, so + // an id may now be >= lambda_.size(). Such arcs carry no multiplier; + // skip them, matching the guard in updateMultipliers(). 
+ const sta::EdgeId id = graph_->id(e); + if (static_cast(id) >= lambda_.size()) { + continue; + } + target += lambda_[id]; } } @@ -362,7 +371,11 @@ void GlobalSizingPolicy::projectFlowBalance(const LRParams& params) if (!isDataArc(e)) { continue; } - in_sum += lambda_[graph_->id(e)]; + const sta::EdgeId id = graph_->id(e); + if (static_cast(id) >= lambda_.size()) { + continue; + } + in_sum += lambda_[id]; ++in_count; } } @@ -380,6 +393,9 @@ void GlobalSizingPolicy::projectFlowBalance(const LRParams& params) continue; } const sta::EdgeId id = graph_->id(e); + if (static_cast(id) >= lambda_.size()) { + continue; + } lambda_[id] = std::max(lambda_[id] * scale, params.lambda_floor); } ++rescaled; @@ -391,7 +407,11 @@ void GlobalSizingPolicy::projectFlowBalance(const LRParams& params) if (!isDataArc(e)) { continue; } - lambda_[graph_->id(e)] = std::max(share, params.lambda_floor); + const sta::EdgeId id = graph_->id(e); + if (static_cast(id) >= lambda_.size()) { + continue; + } + lambda_[id] = std::max(share, params.lambda_floor); } ++zero_sum_fallback; } From 2dd38548ad4b1f84a3ef7b8e86eb0ff2eefd63bb Mon Sep 17 00:00:00 2001 From: Eren Dogan Date: Sat, 6 Jun 2026 17:45:39 -0700 Subject: [PATCH 4/8] rsz: Prevent max area overflow Signed-off-by: Eren Dogan --- src/rsz/src/policy/GlobalSizingPolicy.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/rsz/src/policy/GlobalSizingPolicy.cc b/src/rsz/src/policy/GlobalSizingPolicy.cc index 4ca57ecaced..93aeb2f6dd7 100644 --- a/src/rsz/src/policy/GlobalSizingPolicy.cc +++ b/src/rsz/src/policy/GlobalSizingPolicy.cc @@ -810,7 +810,8 @@ void GlobalSizingPolicy::iterate() // Best-so-far: Keep track of the best WNS so far but don't restore a sweep // that worsens WNS just yet to allow oscillation. 
const float current_wns = sta::delayAsFloat(sta_->worstSlack(policy_max_)); - if (!reject && sta::fuzzyGreaterEqual(current_wns, best_wns)) { + if (!reject && sta::fuzzyGreaterEqual(current_wns, best_wns) + && !resizer_.overMaxArea()) { resizer_.journalEnd(); // checkpoint resizer_.journalBegin(); best_wns = current_wns; From 75c93ef5b3a732f3e8ed259c24b1b24b85549047 Mon Sep 17 00:00:00 2001 From: Eren Dogan Date: Sat, 6 Jun 2026 18:32:52 -0700 Subject: [PATCH 5/8] rsz: Early exit when timing is met Signed-off-by: Eren Dogan --- src/rsz/src/policy/GlobalSizingPolicy.cc | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/rsz/src/policy/GlobalSizingPolicy.cc b/src/rsz/src/policy/GlobalSizingPolicy.cc index 93aeb2f6dd7..74b1e7b6549 100644 --- a/src/rsz/src/policy/GlobalSizingPolicy.cc +++ b/src/rsz/src/policy/GlobalSizingPolicy.cc @@ -775,6 +775,21 @@ void GlobalSizingPolicy::iterate() int consec_reject = 0; resizer_.journalBegin(); for (int iter = 0; iter < max_iter; ++iter) { + // Global sizing only drives WNS upward; once it meets the setup margin + // there is no timing left to recover and further sweeps would only spend + // area and leakage. 
+ const float wns_now = sta::delayAsFloat(sta_->worstSlack(policy_max_)); + if (sta::fuzzyGreaterEqual(wns_now, lr_params_.setup_slack_margin)) { + debugPrint(logger_, + RSZ, + "global_sizing", + 1, + "LR stop: WNS {} meets setup margin {}", + sta::delayAsString(wns_now, 3, sta_), + sta::delayAsString(lr_params_.setup_slack_margin, 3, sta_)); + break; + } + if (iter > 0) { updateMultipliers(iter_params); projectFlowBalance(iter_params); From 844d2cff936ec476b824c2dff44fb8df04edfd60 Mon Sep 17 00:00:00 2001 From: Eren Dogan Date: Sun, 7 Jun 2026 17:38:02 -0700 Subject: [PATCH 6/8] rsz: Exclude clock drivers Signed-off-by: Eren Dogan --- src/rsz/src/LRSubproblem.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/rsz/src/LRSubproblem.cc b/src/rsz/src/LRSubproblem.cc index 95263899a1f..8315f9c8170 100644 --- a/src/rsz/src/LRSubproblem.cc +++ b/src/rsz/src/LRSubproblem.cc @@ -220,6 +220,9 @@ bool LRSubproblem::snapshot(sta::Instance* inst, sta::Pin* pin = pit->next(); const sta::PortDirection* dir = network_->direction(pin); if (dir->isOutput()) { + if (sta_->isClock(pin, sta_->cmdMode())) { + return false; + } sta::Vertex* v = graph_->pinDrvrVertex(pin); if (v == nullptr) { continue; From 7cabee91a286e678e1ab9df8dc5a5ab0aeb8b29f Mon Sep 17 00:00:00 2001 From: Eren Dogan Date: Mon, 8 Jun 2026 20:05:16 -0700 Subject: [PATCH 7/8] rsz: Add downsizing budget Signed-off-by: Eren Dogan --- src/rsz/src/LRSubproblem.cc | 60 +++++++++++++ src/rsz/src/LRSubproblem.hh | 30 ++++++- src/rsz/src/policy/GlobalSizingPolicy.cc | 102 ++++++++++++++++++++++- src/rsz/src/policy/GlobalSizingPolicy.hh | 16 ++++ 4 files changed, 202 insertions(+), 6 deletions(-) diff --git a/src/rsz/src/LRSubproblem.cc b/src/rsz/src/LRSubproblem.cc index 8315f9c8170..a17c1b1b0ed 100644 --- a/src/rsz/src/LRSubproblem.cc +++ b/src/rsz/src/LRSubproblem.cc @@ -29,6 +29,8 @@ namespace rsz { +using utl::RSZ; + namespace { // Resizer::area(Cell*) is protected. 
Compute the same value through the public @@ -192,6 +194,8 @@ bool LRSubproblem::applyReplacement(sta::Instance* inst, bool LRSubproblem::snapshot(sta::Instance* inst, const float* lambda, const int lambda_size, + const float* budget, + const int budget_size, GateSnapshot& snap) { init(); @@ -215,6 +219,12 @@ bool LRSubproblem::snapshot(sta::Instance* inst, snap.inputs.clear(); snap.candidates.clear(); + // Min depth-normalized downsize budget over the kept output pins. The policy + // precomputes a per-vertex budget (computeSlackBudgets) from the live slacks; + // we just freeze the gate's worst (min) value so workers never touch the STA + // graph. + float worst_budget = std::numeric_limits::max(); + std::unique_ptr pit(network_->pinIterator(inst)); while (pit->hasNext()) { sta::Pin* pin = pit->next(); @@ -264,6 +274,11 @@ bool LRSubproblem::snapshot(sta::Instance* inst, max_mm)) : 0.0f; o.drive_res = out_port->driveResistance(); + const sta::VertexId vid = graph_->id(v); + const float vbudget = std::cmp_less(vid, budget_size) + ? budget[vid] + : std::numeric_limits::max(); + worst_budget = std::min(worst_budget, vbudget); snap.outputs.push_back(o); } else if (dir->isInput()) { const sta::LibertyPort* in_port = network_->libertyPort(pin); @@ -381,6 +396,7 @@ bool LRSubproblem::snapshot(sta::Instance* inst, if (snap.outputs.empty()) { return false; } + snap.budget = worst_budget; // Precompute leakage-equivalent cost for the current cell and every // candidate now, on the main thread - leakageOrArea/getSwappableCells mutate @@ -507,9 +523,44 @@ bool LRSubproblem::candidateDrcOkSnapshot(const GateSnapshot& snap, return true; } +bool LRSubproblem::downsizeFitsSlackBudget( + const GateSnapshot& snap, + sta::LibertyCell* replacement, + const float safety, + sta::ArcDelayCalc* arc_delay_calc) const +{ + // snap.budget is the depth-normalized, distributed slack budget. 
+ const float budget = safety * snap.budget; + if (budget <= 0.0f) { + return false; + } + const sta::Scene* scene = snap.scene; + for (const OutputCtx& o : snap.outputs) { + if (o.port == nullptr) { + continue; + } + sta::LibertyPort* cand_port = replacement->findLibertyPort(o.port->name()); + if (cand_port == nullptr) { + return false; // candidate missing this output port - reject + } + // Δd at the frozen load: extra gate delay the downsize adds on this pin. + // Increasing the gate delay by Δd reduces the slack on every path through + // the pin by Δd, so Δd must fit the budget. + const float d_cur = sta::delayAsFloat( + resizer_->gateDelay(o.port, o.load_cap, scene, max_, arc_delay_calc)); + const float d_cand = sta::delayAsFloat(resizer_->gateDelay( + cand_port, o.load_cap, scene, max_, arc_delay_calc)); + if (d_cand - d_cur > budget) { + return false; + } + } + return true; +} + LRSubproblem::GateDecision LRSubproblem::evaluateSnapshot( const GateSnapshot& snap, const float timing_weight, + const float budget_safety, sta::ArcDelayCalc* arc_delay_calc) const { GateDecision result; @@ -527,6 +578,15 @@ LRSubproblem::GateDecision LRSubproblem::evaluateSnapshot( if (!candidateDrcOkSnapshot(snap, cand.cell)) { continue; } + // Downsize slack-budget guard: a candidate with lower leakage than the + // current cell is a downsize; only take it if its added delay fits the + // gate's distributed slack budget. Upsizes are unconstrained - they only + // improve setup. 
+ if (cand.leakage < snap.cur_leakage + && !downsizeFitsSlackBudget( + snap, cand.cell, budget_safety, arc_delay_calc)) { + continue; + } const float cost = evaluateCellCost( snap, cand.cell, cand.leakage, timing_weight, arc_delay_calc); if (cost < result.best_cost) { diff --git a/src/rsz/src/LRSubproblem.hh b/src/rsz/src/LRSubproblem.hh index 008a27b69b0..fe9bcd88649 100644 --- a/src/rsz/src/LRSubproblem.hh +++ b/src/rsz/src/LRSubproblem.hh @@ -125,6 +125,14 @@ class LRSubproblem : public sta::dbStaState sta::LibertyCell* cur_cell = nullptr; float cur_leakage = 0.0f; const sta::Scene* scene = nullptr; + // Distributed downsize budget for this gate: the min over its output pins + // of the depth-normalized slack budget max(0, slack - margin) / depth, + // frozen on the main thread (computed by the policy's computeSlackBudgets). + // A downsize may add at most this much delay on any output pin (times a + // safety factor). Because the per-path sum of these budgets is <= the path + // slack, simultaneous (Jacobi) downsizes within budget cannot overshoot a + // path. + float budget = 0.0f; std::vector outputs; std::vector upstream; std::vector inputs; @@ -153,17 +161,23 @@ class LRSubproblem : public sta::dbStaState // MAIN THREAD ONLY. Capture the frozen state needed to evaluate `inst`. // Returns false (and leaves `snap` unspecified) when `inst` is don't-touch, // has no liberty cell, or has no usable output pin. `lambda` is indexed by - // sta::Edge::id (sparse, size `lambda_size`). + // sta::Edge::id (sparse, size `lambda_size`). `budget` is the per-vertex + // depth-normalized downsize budget indexed by sta::Graph vertex id (size + // `budget_size`); the gate's frozen budget is the min over its output pins. bool snapshot(sta::Instance* inst, const float* lambda, int lambda_size, + const float* budget, + int budget_size, GateSnapshot& snap); // WORKER SAFE. Evaluate the subproblem for a prepared snapshot using the // caller-provided per-thread ArcDelayCalc. 
`timing_weight` scales the Σλ·d - // timing term against the leakage objective. + // timing term against the leakage objective. `budget_safety` (<= 1) scales + // the gate's frozen downsize budget in the feasibility guard. GateDecision evaluateSnapshot(const GateSnapshot& snap, float timing_weight, + float budget_safety, sta::ArcDelayCalc* arc_delay_calc) const; // Leakage-equivalent cost for `cell`. Returns Resizer::cellLeakage when @@ -204,6 +218,18 @@ class LRSubproblem : public sta::dbStaState bool candidateDrcOkSnapshot(const GateSnapshot& snap, sta::LibertyCell* replacement) const; + // Worker-safe downsize feasibility guard over a frozen snapshot. Returns true + // iff installing the (lower-leakage) `replacement` adds, on every output pin, + // no more delay than `safety * snap.budget`. snap.budget is the depth- + // normalized, distributed slack budget frozen by the policy: because the + // per-path sum of gate budgets is <= path slack, simultaneous downsizes + // within budget cannot overshoot, so no per-gate discount is needed. A gate + // with no budget (<= 0) cannot be downsized. + bool downsizeFitsSlackBudget(const GateSnapshot& snap, + sta::LibertyCell* replacement, + float safety, + sta::ArcDelayCalc* arc_delay_calc) const; + Resizer* resizer_ = nullptr; utl::Logger* logger_ = nullptr; sta::dbNetwork* db_network_ = nullptr; diff --git a/src/rsz/src/policy/GlobalSizingPolicy.cc b/src/rsz/src/policy/GlobalSizingPolicy.cc index 74b1e7b6549..d92e6865d7e 100644 --- a/src/rsz/src/policy/GlobalSizingPolicy.cc +++ b/src/rsz/src/policy/GlobalSizingPolicy.cc @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -452,6 +453,89 @@ GlobalSizingPolicy::DesignSnap GlobalSizingPolicy::computeDesignSnap() const return s; } +void GlobalSizingPolicy::computeSlackBudgets() +{ + // Per-vertex downsize budget = max(0, slack(v) - margin) / depth(v), where + // depth(v) is the gate count on the longest path through v. 
Distributing by + // depth bounds the per-path budget sum by the path slack; using v's own + // (worst-path) slack keeps each gate safe on all its paths. Recomputed each + // sweep from the live slacks. + const size_t n = static_cast(graph_->vertexCount()) + 1; + + std::vector vertices; + { + sta::VertexIterator vit(graph_); + while (vit.hasNext()) { + vertices.push_back(vit.next()); + } + } + std::ranges::sort(vertices, [](const sta::Vertex* a, const sta::Vertex* b) { + return a->level() < b->level(); + }); + + // A gate-internal (cell) arc has both pins on the same leaf instance; only + // these add a gate-delay term to a path, so only these increment the depth. + auto is_gate_arc = [this](sta::Edge* e) { + const sta::Instance* fi = network_->instance(e->from(graph_)->pin()); + const sta::Instance* ti = network_->instance(e->to(graph_)->pin()); + return fi != nullptr && fi == ti; + }; + + // Forward pass (increasing level): gates from a source up to and including v. + std::vector fwd(n, 0); + for (sta::Vertex* v : vertices) { + int best = 0; + sta::VertexInEdgeIterator ieit(v, graph_); + while (ieit.hasNext()) { + sta::Edge* e = ieit.next(); + if (!isDataArc(e)) { + continue; + } + const sta::VertexId uid = graph_->id(e->from(graph_)); + best = std::max(best, fwd[uid] + (is_gate_arc(e) ? 1 : 0)); + } + fwd[graph_->id(v)] = best; + } + + // Backward pass (decreasing level): gates from v (exclusive) to a sink. + std::vector bwd(n, 0); + for (sta::Vertex* v : std::views::reverse(vertices)) { + int best = 0; + sta::VertexOutEdgeIterator oeit(v, graph_); + while (oeit.hasNext()) { + sta::Edge* e = oeit.next(); + if (!isDataArc(e)) { + continue; + } + const sta::VertexId wid = graph_->id(e->to(graph_)); + best = std::max(best, bwd[wid] + (is_gate_arc(e) ? 
1 : 0)); + } + bwd[graph_->id(v)] = best; + } + + const float margin = lr_params_.setup_slack_margin; + const float kSlackSentinel = 1e6f; + vertex_budget_.assign(n, 0.0f); + for (sta::Vertex* v : vertices) { + const sta::VertexId vid = graph_->id(v); + const int depth = std::max(1, fwd[vid] + bwd[vid]); + const float slack = sta::delayAsFloat(sta_->slack(v, policy_max_)); + // Unconstrained vertices (no real required time) report a sentinel slack; + // leave them effectively unbudgeted so genuinely free gates can downsize. + vertex_budget_[vid] + = (slack >= kSlackSentinel) + ? kSlackSentinel + : std::max(0.0f, slack - margin) / static_cast(depth); + } + debugPrint(logger_, + RSZ, + "global_sizing", + 2, + "LR budgets: {} vertices, margin={}", + n - 1, + sta::delayAsString(margin, 3, sta_)); +} + std::vector GlobalSizingPolicy::buildSnapshots() { // Phase A (main thread, delays valid): freeze each evaluable gate's @@ -459,13 +543,19 @@ std::vector GlobalSizingPolicy::buildSnapshots() // getSwappableCells / cellLeakage / net-driver caches, so the subsequent // parallel phase touches none of them. const int lambda_size = static_cast(lambda_.size()); + const int budget_size = static_cast(vertex_budget_.size()); std::vector snapshots; std::unique_ptr iit( network_->leafInstanceIterator()); while (iit->hasNext()) { sta::Instance* inst = iit->next(); LRSubproblem::GateSnapshot snap; - if (subproblem_->snapshot(inst, lambda_.data(), lambda_size, snap)) { + if (subproblem_->snapshot(inst, + lambda_.data(), + lambda_size, + vertex_budget_.data(), + budget_size, + snap)) { snapshots.push_back(std::move(snap)); } } @@ -549,18 +639,22 @@ GlobalSizingPolicy::SweepStats GlobalSizingPolicy::applyDecisions( GlobalSizingPolicy::SweepStats GlobalSizingPolicy::singleSweep( const float timing_weight) { - // Phase A: Freeze per-gate state. + // Phase A: Distribute the slack into per-vertex budgets, then freeze per-gate + // state (which reads those budgets). 
+ computeSlackBudgets(); std::vector snapshots = buildSnapshots(); // Phase B: Score every snapshot independently. Each worker uses its own // ArcDelayCalc copy (arc_delay_calc_ is single-threaded shared state); the // copy is cached per worker thread and refreshed if the source changes. With // a zero-worker pool this runs inline on the calling thread. + const float safety = lr_params_.budget_safety_factor; sta::ArcDelayCalc* const src = sta_->arcDelayCalc(); const std::vector decisions = thread_pool_->parallelMap( snapshots, - [this, timing_weight, src](const LRSubproblem::GateSnapshot& snap) { + [this, timing_weight, safety, src]( + const LRSubproblem::GateSnapshot& snap) { static thread_local sta::ArcDelayCalc* cached_src = nullptr; static thread_local std::unique_ptr adc; if (adc == nullptr || cached_src != src) { @@ -568,7 +662,7 @@ GlobalSizingPolicy::SweepStats GlobalSizingPolicy::singleSweep( cached_src = src; } return subproblem_->evaluateSnapshot( - snap, timing_weight, adc.get()); + snap, timing_weight, safety, adc.get()); }); // Phase C: Apply accepted moves serially. diff --git a/src/rsz/src/policy/GlobalSizingPolicy.hh b/src/rsz/src/policy/GlobalSizingPolicy.hh index 1b0f1bf1de5..3ffdb5a8df7 100644 --- a/src/rsz/src/policy/GlobalSizingPolicy.hh +++ b/src/rsz/src/policy/GlobalSizingPolicy.hh @@ -45,6 +45,11 @@ struct LRParams // Dimensionless balance between timing pressure and leakage cost. // bias = 1.0 keeps Σλ·d (scaled) ≈ leakage cost on the median gate. float timing_bias = 64.0f; + // Safety derate (<= 1) on the per-gate distributed downsize budget. The + // depth-normalized distribution already guarantees per-path budget sums + // <= path slack, so 1.0 is feasible in theory; a value < 1 adds margin for + // the un-modeled slew cascade / estimated-vs-routed parasitic gap. 
+ float budget_safety_factor = 1.0f; }; // GlobalSizingPolicy: Lagrangian-Relaxation-driven global sizing + Vt @@ -102,6 +107,14 @@ class GlobalSizingPolicy : public OptimizationPolicy // after this returns. SweepStats singleSweep(float timing_weight); + // Phase A pre-pass: Compute the per-vertex depth-normalized downsize budget + // budget(v) = max(0, slack(v) - margin) / depth(v) + // where depth(v) is the gate count on the longest path through v. + // Distributing by depth guarantees the per-path sum of budgets <= path slack, + // while using each vertex's own (worst-path) slack keeps every gate within + // all its paths. + void computeSlackBudgets(); + // Phase A: Capture the frozen per-gate snapshots for every evaluable leaf // instance, in a stable order. Reads live STA and warms the lazy // Liberty/dbNetwork caches on the main thread. @@ -138,6 +151,9 @@ class GlobalSizingPolicy : public OptimizationPolicy // Per-edge multipliers, indexed by sta::Edge::id (sparse) std::vector lambda_; + // Per-vertex depth-normalized downsize budget, indexed by sta::Graph vertex + // id. Rebuilt each sweep by computeSlackBudgets(). 
+ std::vector vertex_budget_; // Per-endpoint multipliers, indexed by a dense endpoint index std::vector mu_; // Dense endpoint bookkeeping From 0b9d52c05557d81a0890631c0103507bcc270b5c Mon Sep 17 00:00:00 2001 From: Eren Dogan Date: Mon, 8 Jun 2026 21:57:48 -0700 Subject: [PATCH 8/8] rsz: Fix vertex id overflow Signed-off-by: Eren Dogan --- src/rsz/src/policy/GlobalSizingPolicy.cc | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/rsz/src/policy/GlobalSizingPolicy.cc b/src/rsz/src/policy/GlobalSizingPolicy.cc index d92e6865d7e..5ffb06a0fe7 100644 --- a/src/rsz/src/policy/GlobalSizingPolicy.cc +++ b/src/rsz/src/policy/GlobalSizingPolicy.cc @@ -460,15 +460,17 @@ void GlobalSizingPolicy::computeSlackBudgets() // depth bounds the per-path budget sum by the path slack; using v's own // (worst-path) slack keeps each gate safe on all its paths. Recomputed each // sweep from the live slacks. - const size_t n = static_cast(graph_->vertexCount()) + 1; - std::vector vertices; + size_t max_id = 0; { sta::VertexIterator vit(graph_); while (vit.hasNext()) { - vertices.push_back(vit.next()); + sta::Vertex* v = vit.next(); + vertices.push_back(v); + max_id = std::max(max_id, static_cast(graph_->id(v))); } } + const size_t n = max_id + 1; std::ranges::sort(vertices, [](const sta::Vertex* a, const sta::Vertex* b) { return a->level() < b->level(); }); @@ -531,8 +533,9 @@ void GlobalSizingPolicy::computeSlackBudgets() RSZ, "global_sizing", 2, - "LR budgets: {} vertices, margin={}", - n - 1, + "LR budgets: {} vertices (max id {}), margin={}", + vertices.size(), + max_id, sta::delayAsString(margin, 3, sta_)); }