Skip to content

Commit

Permalink
libvirt: call get_capabilities() with all CPUs online
Browse files Browse the repository at this point in the history
While we do cache the hosts's capabilities in self._caps in the
libvirt Host object, if we happen to fist call get_capabilities() with
some of our dedicated CPUs offline, libvirt erroneously reports them
as being on socket 0 regardless of their real socket. We would then
cache that topology, thus breaking pretty much all of our NUMA
accounting.

To fix this, this patch makes sure to call get_capabilities()
immediately upon host init, and to power up all our dedicated CPUs
before doing so. That way, we cache their real socket ID.

For testing, because we don't really want to implement a libvirt bug
in our Python libvirt fixture, we make due with a simple unit tests
that asserts that init_host() has powered on the correct CPUs.

Closes-bug: 2077228
Change-Id: I9a2a7614313297f11a55d99fb94916d3583a9504
  • Loading branch information
notartom committed Aug 17, 2024
1 parent 7399728 commit 79d1f06
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 5 deletions.
10 changes: 10 additions & 0 deletions nova/tests/unit/virt/libvirt/test_driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -895,6 +895,16 @@ def test_driver_capabilities_secure_boot(self, mock_supports):
)
mock_supports.assert_called_once_with()

@mock.patch.object(hardware, 'get_cpu_dedicated_set',
return_value=set([0, 42, 1337]))
@mock.patch.object(libvirt_driver.LibvirtDriver,
'_register_all_undefined_instance_details')
def test_init_host_topology(self, mock_get_cpu_dedicated_set, _):
driver = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
with mock.patch.object(driver.cpu_api, 'power_up') as mock_power_up:
driver.init_host('goat')
mock_power_up.assert_called_with(set([0, 42, 1337]))

@mock.patch.object(
libvirt_driver.LibvirtDriver,
'_register_all_undefined_instance_details',
Expand Down
6 changes: 3 additions & 3 deletions nova/virt/libvirt/cpu/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ def core(self, i):
"""
return Core(i)

def _power_up(self, cpus: ty.Set[int]) -> None:
def power_up(self, cpus: ty.Set[int]) -> None:
if not CONF.libvirt.cpu_power_management:
return
cpu_dedicated_set = hardware.get_cpu_dedicated_set_nozero() or set()
Expand All @@ -111,7 +111,7 @@ def power_up_for_instance(self, instance: objects.Instance) -> None:
return
pcpus = instance.numa_topology.cpu_pinning.union(
instance.numa_topology.cpuset_reserved)
self._power_up(pcpus)
self.power_up(pcpus)

def power_up_for_migration(
self, dst_numa_info: objects.LibvirtLiveMigrateNUMAInfo
Expand All @@ -121,7 +121,7 @@ def power_up_for_migration(
pcpus = dst_numa_info.emulator_pins
for pins in dst_numa_info.cpu_pins.values():
pcpus = pcpus.union(pins)
self._power_up(pcpus)
self.power_up(pcpus)

def _power_down(self, cpus: ty.Set[int]) -> None:
if not CONF.libvirt.cpu_power_management:
Expand Down
23 changes: 21 additions & 2 deletions nova/virt/libvirt/driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -751,12 +751,31 @@ def _handle_conn_event(self, enabled, reason):
{'enabled': enabled, 'reason': reason})
self._set_host_enabled(enabled, reason)

def _init_host_topology(self):
"""To work around a bug in libvirt that reports offline CPUs as always
being on socket 0 regardless of their real socket, power up all
dedicated CPUs (the only ones whose socket we actually care about),
then call get_capabilities() to initialize the topology with the
correct socket values. get_capabilities()'s implementation will reuse
these initial socket value, and avoid clobbering them with 0 for
offline CPUs.
"""
cpus = hardware.get_cpu_dedicated_set()
if cpus:
self.cpu_api.power_up(cpus)
self._host.get_capabilities()

def init_host(self, host):
self._host.initialize()

self._update_host_specific_capabilities()

# NOTE(artom) Do this first to make sure our first call to
# get_capabilities() happens with all dedicated CPUs online and caches
# their correct socket ID. Unused dedicated CPUs will be powered down
# further down in this method.
self._check_cpu_set_configuration()
self._init_host_topology()

self._update_host_specific_capabilities()

self._do_quality_warnings()

Expand Down

0 comments on commit 79d1f06

Please sign in to comment.