Skip to content

Commit 7427aee

Browse files
nvidia-gpu: add ConnectX Ethernet Port Metrics
Add xyz.openbmc_project.Metric.Value interface for each of the following Ethernet port metrics of a ConnectX device. - TXBytes - RXBytes - RXMulticastFrames - TXMulticastFrames - RXUnicastFrames - TXUnicastFrames - RXBroadcastFrames - TXBroadcastFrames - RXFCSErrors - RXFrameAlignmentErrors - RXFalseCarrierErrors - RXUndersizeFrames - RXOversizeFrames - RXPauseXONFrames - RXPauseXOFFFrames - TXPauseXONFrames - TXPauseXOFFFrames - TXSingleCollisions - TXMultipleCollisions - TXLateCollisions - TXExcessiveCollisions PDI Patch - https://gerrit.openbmc.org/c/openbmc/phosphor-dbus-interfaces/+/84847 Tested: Build an image for nvl32-obmc machine with the following patch cherry picked. https://gerrit.openbmc.org/c/openbmc/entity-manager/+/84257 https://gerrit.openbmc.org/c/openbmc/openbmc/+/85490 The openbmc patch cherry-picks the following patches that are currently under review. ``` 1. device tree https://lore.kernel.org/all/aRbLqH8pLWCQryhu@molberding.nvidia.com/ 2. mctpd patches CodeConstruct/mctp#85 3. u-boot changes https://lore.kernel.org/openbmc/20251121-msx4-v1-0-fc0118b666c1@nvidia.com/T/#t 4. kernel changes as specified in the openbmc patch (for espi) 5. entity-manager changes https://gerrit.openbmc.org/c/openbmc/entity-manager/+/85455 6. platform-init changes https://gerrit.openbmc.org/c/openbmc/platform-init/+/85456 7. 
spi changes https://lore.kernel.org/all/20251121-w25q01jv_fixup-v1-1-3d175050db73@nvidia.com/ ``` ``` root@nvl32-obmc:~# busctl tree xyz.openbmc_project.GpuSensor `- /xyz `- /xyz/openbmc_project |- /xyz/openbmc_project/inventory | |- /xyz/openbmc_project/inventory/Nvidia_ConnectX_0_NIC | | |- /xyz/openbmc_project/inventory/Nvidia_ConnectX_0_NIC/Port_1 | | `- /xyz/openbmc_project/inventory/Nvidia_ConnectX_0_NIC/Port_2 | |- /xyz/openbmc_project/inventory/Nvidia_ConnectX_0_PCIe | | |- /xyz/openbmc_project/inventory/Nvidia_ConnectX_0_PCIe/DOWN_0 | | |- /xyz/openbmc_project/inventory/Nvidia_ConnectX_0_PCIe/DOWN_1 | | `- /xyz/openbmc_project/inventory/Nvidia_ConnectX_0_PCIe/UP_0 | |- /xyz/openbmc_project/inventory/Nvidia_ConnectX_2_NIC | | |- /xyz/openbmc_project/inventory/Nvidia_ConnectX_2_NIC/Port_1 | | `- /xyz/openbmc_project/inventory/Nvidia_ConnectX_2_NIC/Port_2 | |- /xyz/openbmc_project/inventory/Nvidia_ConnectX_2_PCIe | | |- /xyz/openbmc_project/inventory/Nvidia_ConnectX_2_PCIe/DOWN_0 | | |- /xyz/openbmc_project/inventory/Nvidia_ConnectX_2_PCIe/DOWN_1 | | `- /xyz/openbmc_project/inventory/Nvidia_ConnectX_2_PCIe/UP_0 | |- /xyz/openbmc_project/inventory/Nvidia_ConnectX_3_NIC | | |- /xyz/openbmc_project/inventory/Nvidia_ConnectX_3_NIC/Port_1 | | `- /xyz/openbmc_project/inventory/Nvidia_ConnectX_3_NIC/Port_2 | `- /xyz/openbmc_project/inventory/Nvidia_ConnectX_3_PCIe | |- /xyz/openbmc_project/inventory/Nvidia_ConnectX_3_PCIe/DOWN_0 | |- /xyz/openbmc_project/inventory/Nvidia_ConnectX_3_PCIe/DOWN_1 | `- /xyz/openbmc_project/inventory/Nvidia_ConnectX_3_PCIe/UP_0 |- /xyz/openbmc_project/metric | |- /xyz/openbmc_project/metric/port_Nvidia_ConnectX_0_NIC_Port_1 | | `- /xyz/openbmc_project/metric/port_Nvidia_ConnectX_0_NIC_Port_1/nic | | |- /xyz/openbmc_project/metric/port_Nvidia_ConnectX_0_NIC_Port_1/nic/rx_broadcast_frames | | |- /xyz/openbmc_project/metric/port_Nvidia_ConnectX_0_NIC_Port_1/nic/rx_bytes | | |- 
/xyz/openbmc_project/metric/port_Nvidia_ConnectX_0_NIC_Port_1/nic/rx_false_carrier_errors | | |- /xyz/openbmc_project/metric/port_Nvidia_ConnectX_0_NIC_Port_1/nic/rx_fcs_errors | | |- /xyz/openbmc_project/metric/port_Nvidia_ConnectX_0_NIC_Port_1/nic/rx_frame_alignment_errors | | |- /xyz/openbmc_project/metric/port_Nvidia_ConnectX_0_NIC_Port_1/nic/rx_multicast_frames | | |- /xyz/openbmc_project/metric/port_Nvidia_ConnectX_0_NIC_Port_1/nic/rx_oversize_frames | | |- /xyz/openbmc_project/metric/port_Nvidia_ConnectX_0_NIC_Port_1/nic/rx_pause_xoff_frames | | |- /xyz/openbmc_project/metric/port_Nvidia_ConnectX_0_NIC_Port_1/nic/rx_pause_xon_frames | | |- /xyz/openbmc_project/metric/port_Nvidia_ConnectX_0_NIC_Port_1/nic/rx_undersize_frames | | |- /xyz/openbmc_project/metric/port_Nvidia_ConnectX_0_NIC_Port_1/nic/rx_unicast_frames | | |- /xyz/openbmc_project/metric/port_Nvidia_ConnectX_0_NIC_Port_1/nic/tx_broadcast_frames | | |- /xyz/openbmc_project/metric/port_Nvidia_ConnectX_0_NIC_Port_1/nic/tx_bytes | | |- /xyz/openbmc_project/metric/port_Nvidia_ConnectX_0_NIC_Port_1/nic/tx_excessive_collisions | | |- /xyz/openbmc_project/metric/port_Nvidia_ConnectX_0_NIC_Port_1/nic/tx_late_collisions | | |- /xyz/openbmc_project/metric/port_Nvidia_ConnectX_0_NIC_Port_1/nic/tx_multicast_frames | | |- /xyz/openbmc_project/metric/port_Nvidia_ConnectX_0_NIC_Port_1/nic/tx_multiple_collisions | | |- /xyz/openbmc_project/metric/port_Nvidia_ConnectX_0_NIC_Port_1/nic/tx_pause_xoff_frames | | |- /xyz/openbmc_project/metric/port_Nvidia_ConnectX_0_NIC_Port_1/nic/tx_pause_xon_frames | | |- /xyz/openbmc_project/metric/port_Nvidia_ConnectX_0_NIC_Port_1/nic/tx_single_collisions | | `- /xyz/openbmc_project/metric/port_Nvidia_ConnectX_0_NIC_Port_1/nic/tx_unicast_frames root@nvl32-obmc:~# busctl introspect xyz.openbmc_project.GpuSensor /xyz/openbmc_project/metric/port_Nvidia_ConnectX_3_NIC_Port_2/nic/rx_bytes NAME TYPE SIGNATURE RESULT/VALUE FLAGS org.freedesktop.DBus.Introspectable interface - - - 
.Introspect method - s - org.freedesktop.DBus.Peer interface - - - .GetMachineId method - s - .Ping method - - - org.freedesktop.DBus.Properties interface - - - .Get method ss v - .GetAll method s a{sv} - .Set method ssv - - .PropertiesChanged signal sa{sv}as - - xyz.openbmc_project.Association.Definitions interface - - - .Associations property a(sss) 1 "measuring" "measured_by" "/xyz/ope... emits-change xyz.openbmc_project.Metric.Value interface - - - .Unit property s "xyz.openbmc_project.Metric.Value.Uni... emits-change .Value property d 0 emits-change ``` Change-Id: I30123e35b759182039cb6f25526fafe733c0f354 Signed-off-by: Harshit Aghera <haghera@nvidia.com>
1 parent 1180ed4 commit 7427aee

12 files changed

Lines changed: 1335 additions & 0 deletions

src/nvidia-gpu/NvidiaDeviceDiscovery.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -373,6 +373,17 @@ void processSensorConfigs(
373373
configs.pollRate = sensorPollRateMs;
374374
}
375375

376+
try
377+
{
378+
configs.nicNetworkPortCount =
379+
loadVariant<uint64_t>(cfg, "NicNetworkPortCount");
380+
}
381+
catch (const std::invalid_argument&)
382+
{
383+
// NicNetworkPortCount is an optional config
384+
configs.nicNetworkPortCount = 0;
385+
}
386+
376387
discoverDevices(io, objectServer, gpuDevices, smaDevices,
377388
pcieDevices, dbusConnection, mctpRequester, configs,
378389
path);

src/nvidia-gpu/NvidiaDeviceDiscovery.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ struct SensorConfigs
2424
{
2525
std::string name;
2626
uint64_t pollRate{};
27+
uint64_t nicNetworkPortCount{};
2728
};
2829

2930
class GpuDevice;

src/nvidia-gpu/NvidiaEthPort.cpp

Lines changed: 201 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,201 @@
1+
/*
2+
* SPDX-FileCopyrightText: Copyright OpenBMC Authors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
#include "NvidiaEthPort.hpp"
7+
8+
#include "NvidiaUtils.hpp"
9+
#include "Utils.hpp"
10+
11+
#include <bits/basic_string.h>
12+
13+
#include <MctpRequester.hpp>
14+
#include <NvidiaGpuMctpVdm.hpp>
15+
#include <NvidiaPcieDevice.hpp>
16+
#include <OcpMctpVdm.hpp>
17+
#include <phosphor-logging/lg2.hpp>
18+
#include <sdbusplus/asio/connection.hpp>
19+
#include <sdbusplus/asio/object_server.hpp>
20+
#include <sdbusplus/message/native_types.hpp>
21+
22+
#include <array>
23+
#include <cstdint>
24+
#include <format>
25+
#include <functional>
26+
#include <memory>
27+
#include <span>
28+
#include <string>
29+
#include <system_error>
30+
#include <utility>
31+
#include <vector>
32+
33+
using std::string;
34+
35+
using namespace std::literals;
36+
37+
/**
 * Create the D-Bus objects that represent one Ethernet port of a NIC.
 *
 * The following objects are added to @p objectServer:
 *  - a port object at `nicPathPrefix/<deviceName>/<name>` carrying the
 *    Inventory.Connector.Port interface and an association to the device
 *    object at `nicPathPrefix/<deviceName>`;
 *  - one object per telemetry counter below
 *    `metricPath + "port_<deviceName>_<name>"`, each carrying a Metric.Value
 *    interface (Unit = Count, Value = 0.0) and an association back to the port.
 *
 * A failure to initialize any interface is logged and does not abort
 * construction.
 *
 * @param conn          D-Bus connection, stored in the object.
 * @param mctpRequester Requester used by update() to query the device.
 * @param name          Port name, used as the last element of the port path.
 * @param deviceName    Name of the NIC device that owns the port.
 * @param path          Stored in the object; not otherwise used here.
 * @param eid           MCTP endpoint ID the telemetry requests are sent to.
 * @param portNumber    Port number placed in the telemetry request.
 * @param objectServer  Object server the interfaces are added to.
 */
NvidiaEthPortMetrics::NvidiaEthPortMetrics(
    std::shared_ptr<sdbusplus::asio::connection>& conn,
    mctp::MctpRequester& mctpRequester, const std::string& name,
    const std::string& deviceName, const std::string& path, uint8_t eid,
    uint16_t portNumber, sdbusplus::asio::object_server& objectServer) :
    eid(eid), portNumber(portNumber), path(path), conn(conn),
    mctpRequester(mctpRequester)
{
    const sdbusplus::message::object_path deviceDbusPath =
        sdbusplus::message::object_path(nicPathPrefix) / deviceName;

    const sdbusplus::message::object_path portDbusPath =
        sdbusplus::message::object_path(nicPathPrefix) / deviceName / name;

    const std::string metricsDbusPathPrefix =
        metricPath + std::format("port_{}_{}", deviceName, name);

    portInterface = objectServer.add_interface(
        portDbusPath, "xyz.openbmc_project.Inventory.Connector.Port");

    std::vector<Association> portAssociations;
    portAssociations.emplace_back("connected_to", "connecting", deviceDbusPath);

    associationInterface =
        objectServer.add_interface(portDbusPath, association::interface);

    associationInterface->register_property("Associations", portAssociations);

    // Telemetry tag -> D-Bus path suffix of the metric object. The tag is
    // also the index into metricValueInterface / metricAssociationInterfaces.
    constexpr std::array<std::pair<uint8_t, const char*>, 21> telemetryMetrics =
        {{
            {0, "/nic/rx_bytes"},
            {1, "/nic/tx_bytes"},
            {2, "/nic/rx_unicast_frames"},
            {3, "/nic/rx_multicast_frames"},
            {4, "/nic/rx_broadcast_frames"},
            {5, "/nic/tx_unicast_frames"},
            {6, "/nic/tx_multicast_frames"},
            {7, "/nic/tx_broadcast_frames"},
            {8, "/nic/rx_fcs_errors"},
            {9, "/nic/rx_frame_alignment_errors"},
            {10, "/nic/rx_false_carrier_errors"},
            {11, "/nic/rx_undersize_frames"},
            {12, "/nic/rx_oversize_frames"},
            {13, "/nic/rx_pause_xon_frames"},
            {14, "/nic/rx_pause_xoff_frames"},
            {15, "/nic/tx_pause_xon_frames"},
            {16, "/nic/tx_pause_xoff_frames"},
            {17, "/nic/tx_single_collisions"},
            {18, "/nic/tx_multiple_collisions"},
            {19, "/nic/tx_late_collisions"},
            {20, "/nic/tx_excessive_collisions"},
        }};

    // The table is listed in ascending tag order, so bounding the last tag
    // bounds every index used on the interface arrays below.
    static_assert(telemetryMetrics.back().first < maxTelemetryValues,
                  "telemetry tag does not fit the metric interface arrays");

    // Every metric object points back at the same port, so the association
    // list is built once and reused for each metric.
    std::vector<Association> metricAssociations;
    metricAssociations.emplace_back("measuring", "measured_by", portDbusPath);

    for (const auto& [tag, metricName] : telemetryMetrics)
    {
        const std::string metricDbusPath = metricsDbusPathPrefix + metricName;

        metricValueInterface[tag] = objectServer.add_interface(
            metricDbusPath, "xyz.openbmc_project.Metric.Value");

        metricValueInterface[tag]->register_property(
            "Unit", "xyz.openbmc_project.Metric.Value.Unit.Count"s);
        metricValueInterface[tag]->register_property("Value", 0.0);

        metricAssociationInterfaces[tag] =
            objectServer.add_interface(metricDbusPath, association::interface);
        metricAssociationInterfaces[tag]->register_property(
            "Associations", metricAssociations);

        if (!metricValueInterface[tag]->initialize())
        {
            lg2::error(
                "Error initializing Ethernet Port Metric Interface for EID={EID}, PortNumber={PN}, Metric={MN}",
                "EID", eid, "PN", portNumber, "MN", metricName);
        }

        if (!metricAssociationInterfaces[tag]->initialize())
        {
            lg2::error(
                "Error initializing Ethernet Port Metric Association Interface for EID={EID}, PortNumber={PN}, Metric={MN}",
                "EID", eid, "PN", portNumber, "MN", metricName);
        }
    }

    if (!portInterface->initialize())
    {
        lg2::error(
            "Error initializing Ethernet Port Interface for EID={EID}, PortNumber={PN}",
            "EID", eid, "PN", portNumber);
    }

    if (!associationInterface->initialize())
    {
        lg2::error(
            "Error initializing Association Interface for Ethernet Port for EID={EID}, PortNumber={PN}",
            "EID", eid, "PN", portNumber);
    }
}
136+
137+
void NvidiaEthPortMetrics::processResponse(
138+
const std::error_code& sendRecvMsgResult, std::span<const uint8_t> response)
139+
{
140+
if (sendRecvMsgResult)
141+
{
142+
lg2::error(
143+
"Error updating Ethernet Port Metrics: sending message over MCTP failed, "
144+
"rc={RC}, EID={EID}, PortNumber={PN}",
145+
"RC", sendRecvMsgResult.message(), "EID", eid, "PN", portNumber);
146+
return;
147+
}
148+
149+
ocp::accelerator_management::CompletionCode cc{};
150+
uint16_t reasonCode = 0;
151+
std::vector<std::pair<uint8_t, uint64_t>> telemetryValues;
152+
153+
const int rc = gpu::decodeGetEthernetPortTelemetryCountersResponse(
154+
response, cc, reasonCode, telemetryValues);
155+
156+
if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
157+
{
158+
lg2::error(
159+
"Error updating Ethernet Port Metrics: decode failed, "
160+
"rc={RC}, cc={CC}, reasonCode={RESC}, EID={EID}, PortNumber={PN}",
161+
"RC", rc, "CC", static_cast<uint8_t>(cc), "RESC", reasonCode, "EID",
162+
eid, "PN", portNumber);
163+
return;
164+
}
165+
166+
for (const auto& [tag, value] : telemetryValues)
167+
{
168+
if (tag < maxTelemetryValues && metricValueInterface[tag])
169+
{
170+
metricValueInterface[tag]->set_property("Value",
171+
static_cast<double>(value));
172+
}
173+
}
174+
}
175+
176+
void NvidiaEthPortMetrics::update()
177+
{
178+
const int rc = gpu::encodeGetEthernetPortTelemetryCountersRequest(
179+
0, portNumber, request);
180+
181+
if (rc != 0)
182+
{
183+
lg2::error(
184+
"Error updating Ethernet Port Metrics: encode failed, rc={RC}, EID={EID}, PortNumber={PN}",
185+
"RC", rc, "EID", eid, "PN", portNumber);
186+
return;
187+
}
188+
189+
mctpRequester.sendRecvMsg(
190+
eid, request,
191+
[weak{weak_from_this()}](const std::error_code& ec,
192+
std::span<const uint8_t> buffer) {
193+
std::shared_ptr<NvidiaEthPortMetrics> self = weak.lock();
194+
if (!self)
195+
{
196+
lg2::error("Invalid reference to NvidiaEthPortMetrics");
197+
return;
198+
}
199+
self->processResponse(ec, buffer);
200+
});
201+
}

src/nvidia-gpu/NvidiaEthPort.hpp

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
/*
2+
* SPDX-FileCopyrightText: Copyright OpenBMC Authors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
#pragma once
7+
8+
#include "MctpRequester.hpp"

#include <NvidiaGpuMctpVdm.hpp>
#include <sdbusplus/asio/connection.hpp>
#include <sdbusplus/asio/object_server.hpp>

#include <array>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <span>
#include <string>
#include <system_error>
19+
20+
struct NvidiaEthPortMetrics :
21+
public std::enable_shared_from_this<NvidiaEthPortMetrics>
22+
{
23+
public:
24+
NvidiaEthPortMetrics(
25+
std::shared_ptr<sdbusplus::asio::connection>& conn,
26+
mctp::MctpRequester& mctpRequester, const std::string& name,
27+
const std::string& deviceName, const std::string& path, uint8_t eid,
28+
uint16_t portNumber, sdbusplus::asio::object_server& objectServer);
29+
30+
void update();
31+
32+
private:
33+
static constexpr size_t maxTelemetryValues = 64;
34+
35+
void processResponse(const std::error_code& sendRecvMsgResult,
36+
std::span<const uint8_t> response);
37+
38+
static double mapPcieGenToLinkSpeedGbps(uint32_t value);
39+
40+
uint8_t eid;
41+
42+
uint16_t portNumber;
43+
44+
std::string path;
45+
46+
std::shared_ptr<sdbusplus::asio::connection> conn;
47+
48+
mctp::MctpRequester& mctpRequester;
49+
50+
std::array<uint8_t, sizeof(gpu::GetEthernetPortTelemetryCountersRequest)>
51+
request{};
52+
53+
std::shared_ptr<sdbusplus::asio::dbus_interface> portInterface;
54+
55+
std::shared_ptr<sdbusplus::asio::dbus_interface> associationInterface;
56+
57+
std::array<std::shared_ptr<sdbusplus::asio::dbus_interface>,
58+
maxTelemetryValues>
59+
metricValueInterface{};
60+
61+
std::array<std::shared_ptr<sdbusplus::asio::dbus_interface>,
62+
maxTelemetryValues>
63+
metricAssociationInterfaces;
64+
};

0 commit comments

Comments
 (0)