Skip to content

Commit

Permalink
GT/GTI: fix_hgx_gpu_model_missing_issue
Browse files Browse the repository at this point in the history
Summary:
# Description
    Per Nv doc#1069815 NVIDIA HGX Baseboard Redfish Design Collaterals
    and doc#1108844 Blackwell HGX 8-GPU Redfish Design Collateral, gpu
    model name should be retrieved with the following Redfish command:

    curl -X GET http://192.168.31.1/redfish/v1/Chassis/HGX_Chassis_0/Assembly

    Here are the changes in the PR:
    1. Use the above command to retrieve the GPU model
    2. Store the GPU model name in the kv file once it has been read successfully
    3. If the GPU model can't be retrieved, return the default model (H100)

    Example:
    root@bmc-oob:~# curl -X GET http://192.168.31.1/redfish/v1/Chassis/HGX_Chassis_0/Assembly
    {
      "odata.id": "/redfish/v1/Chassis/HGX_Chassis_0/Assembly",
      "odata.type": "#Assembly.v1_3_0.Assembly",
      "Assemblies": [
        {
          "odata.id": "/redfish/v1/Chassis/HGX_Chassis_0/Assembly#/Assemblies/0",
          "Location": {
            "PartLocation": {
              "LocationType": "Embedded"
            }
          },
          "MemberId": "0",
          "Model": "P5612-B00",
          "Name": "Board FRU Assembly",
          "PartNumber": "699-25612-0002-204",
          "PhysicalContext": "GPUSubsystem",
          "ProductionDate": "2022-12-17T08:54:00Z",
          "SerialNumber": "1665022550432",
          "Vendor": "NVIDIA"
        },
        {
          "odata.id": "/redfish/v1/Chassis/HGX_Chassis_0/Assembly#/Assemblies/1",
          "Location": {
            "PartLocation": {
              "LocationType": "Embedded"
            }
          },
          "MemberId": "1",
          "Model": "NVIDIA HGX H100 8-GPU",
    .
    .
    .

X-link: facebookexternal/openbmc.quanta#4429

Test Plan:
Build and tests pass on a GTI system with FW bundles v1.3, v1.4 and v1.5

    root@bmc-oob:~# rm /tmp/cache_store/gpu_model

    root@bmc-oob:~# fw-util hgx --version
    erot-fpga Version: 00.02.0150.0000_n00

    root@bmc-oob:~# kv get gpu_model
    "NVIDIA HGX H100 8-GPU"

    root@bmc-oob:~# sensor-util hgx
    hgx:
    HGX_HSC_0_Power_W            (0x1) :  88.292 Watts | (ok)
    HGX_HSC_1_Power_W            (0x

Reviewed By: amithash

Differential Revision: D62107276

fbshipit-source-id: 0cf70a50647340a4124431a589b4574cb00248fe
  • Loading branch information
JimmyHuang777 authored and facebook-github-bot committed Sep 5, 2024
1 parent 4566b37 commit c6aa0e0
Showing 1 changed file with 40 additions and 27 deletions.
67 changes: 40 additions & 27 deletions meta-facebook/meta-grandteton/recipes-grandteton/hgx/files/hgx.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -316,52 +316,65 @@ std::string updateNonBlocking(const std::string& comp, const std::string& path,
}

HMCPhase getHMCPhase() {
static HMCPhase phase = HMCPhase::HMC_FW_UNKNOWN;
// TODO Try to do this with one Get at the root.
HMCPhase phase = HMCPhase::BMC_FW_DVT;
try {
std::string strModel = kv::get("gpu_model");
if (containStr(strModel, {"H100", "H200"})) {
return HMCPhase::BMC_FW_DVT;
}
else if (containStr(strModel, {"B100", "B200"})) {
return HMCPhase::BMC_FW_B100;
}
else if (containStr(strModel, {"HGX_DVT"})) {
return HMCPhase::HMC_FW_DVT;
}
else if (containStr(strModel, {"HGX_EVT"})) {
return HMCPhase::HMC_FW_EVT;
}
} catch (std::exception& e) {
//do nothing
}

auto tryPhase = [](const std::string& url) {
try {
hgx.get(url);
return true;
} catch (std::exception&) {
} catch (std::exception& e) {
return false;
}
};
if (phase != HMCPhase::HMC_FW_UNKNOWN) {
return phase;
}

if (tryPhase(HMC_FW_INVENTORY + "HGX_FW_BMC_0")) {
std::vector<std::string> urls = {
HMC_URL + "Chassis/HGX_Chassis_0",
HMC_URL + "Chassis/HGX_BMC_0"
};
nlohmann::json jurl;

for (const auto& url : urls) {
jurl = nlohmann::json::parse(hgx.get(url));
if (jurl.contains("Model")) {
auto model = jurl["Model"].dump();
if (containStr(model, {"H100"})) {
phase = HMCPhase::BMC_FW_DVT;
break;
}
else if (containStr(model, {"B100"})) {
phase = HMCPhase::BMC_FW_B100;
break;
std::string url = HMC_URL + "Chassis/HGX_Chassis_0/Assembly";
phase = HMCPhase::BMC_FW_DVT;
json jurl = json::parse(hgx.get(url));

if (jurl.contains("Assemblies")) {
json &tempArray = jurl["Assemblies"];
for (auto &x : tempArray) {
if (x.contains("Model")) {
auto model = x["Model"].dump();
if (containStr(model, {"H100", "H200"})) {
kv::set("gpu_model", model);
return HMCPhase::BMC_FW_DVT;
}
else if (containStr(model, {"B100", "B200"})) {
kv::set("gpu_model", model);
return HMCPhase::BMC_FW_B100;
}
}
}
phase = HMC_FW_UNKNOWN;
}
}
else if (tryPhase(HMC_FW_INVENTORY + "HGX_FW_HMC_0")) {
kv::set("gpu_model", "HGX_DVT");
phase = HMCPhase::HMC_FW_DVT;
}
else if (tryPhase(HMC_FW_INVENTORY + "HMC_Firmware")) {
kv::set("gpu_model", "HGX_EVT");
phase = HMCPhase::HMC_FW_EVT;
}
else {
phase = HMC_FW_UNKNOWN;
}

return phase;
}

Expand Down

0 comments on commit c6aa0e0

Please sign in to comment.