Skip to content

Commit c1bed7b

Browse files
committed
--report-bindings on whole machine, plus numa markers
I combined a few related features here: 1. --report-bindings on whole machine 2. mark unallowed (eg cgroup) parts of the whole machine with ~ 3. numa markers 4. allow hwloc tree to not have sockets/cores 1. whole machine: The incoming topology to the pretty-print functions probably didn't use the WHOLE_SYSTEM flag. In general we don't want WHOLE_SYSTEM, but for pretty printing I think it makes more sense. So I'm caching a WHOLE_SYSTEM load of the current machine and using it if the incoming topology also identifies as the current machine. Examples of what pretty-printing looks like with/without whole system: Suppose the machine is [..../..../..../....][..../..../..../....] 0 4 8 12 16 20 24 28 and we run with cgset -r cpuset.cpus=24,25,28,29 mycgroup1 cgset -r cpuset.cpus=26,27,30,31 mycgroup2 to leave only these hardware threads active: mycgroup1: [~~~~/~~~~/~~~~/~~~~][~~~~/~~~~/..~~/..~~] mycgroup2: [~~~~/~~~~/~~~~/~~~~][~~~~/~~~~/~~../~~..] Without whole-system the printout (for both of the above) would be (-np 2) MCW rank 0 bound to socket 1[core 0[hwt 0-1]]: [][BB/..] MCW rank 1 bound to socket 1[core 1[hwt 0-1]]: [][../BB] With whole-system the output is this, which I think is more informative mycgroup1 (-np 2): MCW rank 0 bound to socket 1[core 6[hwt 0-1]]: [~~~~/~~~~/~~~~/~~~~][~~~~/~~~~/BB~~/..~~] MCW rank 1 bound to socket 1[core 7[hwt 0-1]]: [~~~~/~~~~/~~~~/~~~~][~~~~/~~~~/..~~/BB~~] mycgroup2 (-np 2): MCW rank 0 bound to socket 1[core 6[hwt 2-3]]: [~~~~/~~~~/~~~~/~~~~][~~~~/~~~~/~~BB/~~..] MCW rank 1 bound to socket 1[core 7[hwt 2-3]]: [~~~~/~~~~/~~~~/~~~~][~~~~/~~~~/~~../~~BB] 2. mark unallowed (~) When using the whole-machine option there's a bitmask available to identify the allowed PUs, eg omitting PUs not in our cgroup. To distinguish those PUs I'm using "~" 3. numa markers (<>) I like having numa markers as well as the existing separators between sockets and cores. 
They're a little harder since the numas are more fluid, eg sockets always contain cores not vice versa, so you can hard code a loop over sockets followed by a loop over cores. But numas might be above or below sockets in the tree. This code identifies which level should be considered the child of the numas, and has each of the hard coded loops capable of adding numa markers. Currently I don't have any tunable to turn off the numa markers. A lot of machines have fairly simple numa output where each socket contains one numa, and that ends up looking like this: [<..../..../..../....>][<..../..../..../....>] If others feel that's too cluttered I'm okay with having some tunable so people have to ask for numa markers. 4. allow hwloc tree to not have sockets/cores I may be behind the times on hwloc development, but as far as I know hwloc trees aren't guaranteed to have sockets and cores, just a MACHINE at the top and PU at the bottom. So I added a little code to the loops so it would still print the PUs on a hypothetical machine that lacked any structuring of the PUs into cores/sockets. Signed-off-by: Mark Allen <[email protected]>
1 parent 7c3aeb3 commit c1bed7b

File tree

1 file changed

+228
-9
lines changed

1 file changed

+228
-9
lines changed

opal/mca/hwloc/base/hwloc_base_util.c

Lines changed: 228 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1704,6 +1704,8 @@ static int build_map(int *num_sockets_arg, int *num_cores_arg,
17041704
return OPAL_SUCCESS;
17051705
}
17061706

1707+
/*
 * Cached topology of the current machine, loaded with
 * HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM.  It starts out NULL and is initialized
 * and loaded on first use by the pretty-print functions in this file, which
 * substitute it for the caller's topology when that topology reports
 * hwloc_topology_is_thissystem().
 * NOTE(review): no matching hwloc_topology_destroy() is visible in this
 * part of the file -- confirm whether it is released anywhere.
 */
static hwloc_topology_t whole_system = NULL;
1708+
17071709
/*
17081710
* Make a prettyprint string for a hwloc_cpuset_t
17091711
*/
@@ -1720,6 +1722,15 @@ int opal_hwloc_base_cset2str(char *str, int len,
17201722
hwloc_obj_t root;
17211723
opal_hwloc_topo_data_t *sum;
17221724

1725+
if (!whole_system) {
1726+
hwloc_topology_init(&whole_system);
1727+
hwloc_topology_set_flags(whole_system, HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM);
1728+
hwloc_topology_load(whole_system);
1729+
}
1730+
if (hwloc_topology_is_thissystem(topo) && whole_system) {
1731+
topo = whole_system;
1732+
}
1733+
17231734
str[0] = tmp[stmp] = '\0';
17241735

17251736
/* if the cpuset is all zero, then not bound */
@@ -1769,13 +1780,107 @@ int opal_hwloc_base_cset2str(char *str, int len,
17691780
return OPAL_SUCCESS;
17701781
}
17711782

1783+
// given an input obj somewhere in the hwloc tree, look for a
1784+
// numa object that contains it
1785+
static hwloc_obj_t
1786+
find_my_numa(hwloc_obj_t obj)
1787+
{
1788+
hwloc_obj_t p, numa;
1789+
int i;
1790+
1791+
p = obj;
1792+
while (p && p->memory_arity == 0) {
1793+
p = p->parent;
1794+
}
1795+
// p should have either found a level that contains numas or reached NULL
1796+
if (p == NULL) { return NULL; }
1797+
for (i=0; i<p->memory_arity; ++i) {
1798+
numa = &(p->memory_first_child[i]);
1799+
1800+
if (hwloc_bitmap_isincluded(obj->cpuset, numa->cpuset)) {
1801+
return numa;
1802+
}
1803+
}
1804+
return NULL;
1805+
}
1806+
1807+
// I added numa-markers to the --report_bindings output but I'm
1808+
// considering the possibility it might look cluttered for the
1809+
// trivial cases. Eg
1810+
// [<../../../..>][<../../../..>]
1811+
// So I'm adding a check we could enable for whether the numa
1812+
// markers are interesting enough to print.
1813+
//
1814+
// Using multiple numa nodes under a direct parent is a possible
1815+
// criteria. For now I'm always returning 1.
1816+
static int
1817+
is_numa_output_worth_printing(hwloc_topology_t topo)
1818+
{
1819+
hwloc_obj_t p;
1820+
1821+
// keeping all the numa output for now:
1822+
return 1;
1823+
1824+
p = hwloc_get_obj_by_type(topo, HWLOC_OBJ_PU, 0);
1825+
while (p && p->memory_arity == 0) {
1826+
p = p->parent;
1827+
}
1828+
if (p && p->memory_arity > 1) {
1829+
return 1;
1830+
}
1831+
1832+
return 0;
1833+
}
1834+
1835+
// which level from the set {socket, core, pu} has
1836+
// the first descendent underneath the lowest numa level.
1837+
// returns MACHINE if there is no numa level
1838+
//
1839+
// Eg if an hwloc tree had numas containing sockets like this
1840+
// <[../..][../..]><[../..][../..]>
1841+
// the tree would be
1842+
// mach +memory_children: n n
1843+
// s s s s
1844+
// c c c c c c c c
1845+
// pppppppppppppppp
1846+
// so this should return SOCKET
1847+
static hwloc_obj_type_t
1848+
first_type_under_a_numa(hwloc_topology_t topo)
1849+
{
1850+
hwloc_obj_t p;
1851+
hwloc_obj_type_t type;
1852+
1853+
p = hwloc_get_obj_by_type(topo, HWLOC_OBJ_PU, 0);
1854+
while (p && p->memory_arity == 0) {
1855+
if (p->type == HWLOC_OBJ_PU ||
1856+
p->type == HWLOC_OBJ_CORE ||
1857+
p->type == HWLOC_OBJ_SOCKET)
1858+
{
1859+
type = p->type;
1860+
}
1861+
p = p->parent;
1862+
}
1863+
if (p && p->memory_arity > 0) {
1864+
return type;
1865+
}
1866+
1867+
return HWLOC_OBJ_MACHINE;
1868+
}
1869+
17721870
/*
17731871
* Make a prettyprint string for a cset in a map format.
17741872
* Example: [B./..]
17751873
* Key: [] - signifies socket
1874+
* <> - signifies numa
17761875
* / - divider between cores
17771876
* . - signifies PU a process not bound to
17781877
* B - signifies PU a process is bound to
1878+
* ~ - signifies PU that is disallowed, eg not in our cgroup:
1879+
*
1880+
* The incoming topo is expected to be from a topology_load without
1881+
* WHOLE_SYSTEM. For the purpose of printing though we want a whole-system
1882+
* version. So we'll cache one here for the current system and use that
1883+
* if the incoming topo is also the current system.
17791884
*/
17801885
int opal_hwloc_base_cset2mapstr(char *str, int len,
17811886
hwloc_topology_t topo,
@@ -1787,6 +1892,27 @@ int opal_hwloc_base_cset2mapstr(char *str, int len,
17871892
hwloc_obj_t socket, core, pu;
17881893
hwloc_obj_t root;
17891894
opal_hwloc_topo_data_t *sum;
1895+
hwloc_cpuset_t allowed;
1896+
hwloc_obj_t prev_numa = NULL;
1897+
hwloc_obj_t cur_numa = NULL;
1898+
hwloc_obj_type_t type_under_numa;
1899+
int a_numa_marker_is_open = 0;
1900+
int print_numa_markers = 1;
1901+
1902+
if (!whole_system) {
1903+
hwloc_topology_init(&whole_system);
1904+
hwloc_topology_set_flags(whole_system, HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM);
1905+
hwloc_topology_load(whole_system);
1906+
}
1907+
if (hwloc_topology_is_thissystem(topo) && whole_system) {
1908+
topo = whole_system;
1909+
}
1910+
1911+
allowed = hwloc_topology_get_allowed_cpuset(topo);
1912+
print_numa_markers = is_numa_output_worth_printing(topo);
1913+
if (print_numa_markers) {
1914+
type_under_numa = first_type_under_a_numa(topo);
1915+
}
17901916

17911917
str[0] = tmp[stmp] = '\0';
17921918

@@ -1807,44 +1933,137 @@ int opal_hwloc_base_cset2mapstr(char *str, int len,
18071933
}
18081934
}
18091935

1936+
// As far as I know hwloc trees aren't required to have sockets and cores,
1937+
// just a MACHINE at the top and PU at the bottom. The 'fake_*' vars make
1938+
// the loops always iterate at least once, even if the initial socket = ...
1939+
// etc lookup is NULL.
1940+
1941+
int fake_on_first_socket;
1942+
int fake_on_first_core;
1943+
hwloc_cpuset_t cpuset_for_socket; // can be fake, eg the machine's
1944+
// cpuset if there are no sockets
1945+
hwloc_cpuset_t cpuset_for_core;
1946+
18101947
/* Iterate over all existing sockets */
1948+
fake_on_first_socket = 1;
18111949
for (socket = hwloc_get_obj_by_type(topo, HWLOC_OBJ_SOCKET, 0);
1812-
NULL != socket;
1950+
NULL != socket || fake_on_first_socket;
18131951
socket = socket->next_cousin) {
1814-
strncat(str, "[", len - strlen(str) - 1);
1952+
fake_on_first_socket = 0;
1953+
1954+
// if numas contain sockets, example output <[../..][../..]><[../..][../..]>
1955+
if (print_numa_markers && type_under_numa == HWLOC_OBJ_SOCKET) {
1956+
prev_numa = cur_numa;
1957+
cur_numa = find_my_numa(socket);
1958+
if (cur_numa && cur_numa != prev_numa) {
1959+
if (a_numa_marker_is_open) {
1960+
strncat(str, ">", len - strlen(str) - 1);
1961+
}
1962+
strncat(str, "<", len - strlen(str) - 1);
1963+
a_numa_marker_is_open = 1;
1964+
}
1965+
}
1966+
1967+
if (socket != NULL) { strncat(str, "[", len - strlen(str) - 1); }
1968+
1969+
if (socket != NULL) {
1970+
cpuset_for_socket = socket->cpuset;
1971+
} else {
1972+
cpuset_for_socket = root->cpuset;
1973+
}
18151974

18161975
/* Iterate over all existing cores in this socket */
1976+
fake_on_first_core = 1;
18171977
core_index = 0;
18181978
for (core = hwloc_get_obj_inside_cpuset_by_type(topo,
1819-
socket->cpuset,
1979+
cpuset_for_socket,
18201980
HWLOC_OBJ_CORE, core_index);
1821-
NULL != core;
1981+
NULL != core || fake_on_first_core;
18221982
core = hwloc_get_obj_inside_cpuset_by_type(topo,
1823-
socket->cpuset,
1983+
cpuset_for_socket,
18241984
HWLOC_OBJ_CORE, ++core_index)) {
1985+
fake_on_first_core = 0;
1986+
1987+
// if numas contain cores and are contained by sockets,
1988+
// example output [<../..><../..>][<../../../..>]
1989+
if (print_numa_markers && type_under_numa == HWLOC_OBJ_CORE) {
1990+
prev_numa = cur_numa;
1991+
cur_numa = find_my_numa(core);
1992+
if (cur_numa && cur_numa != prev_numa) {
1993+
if (a_numa_marker_is_open) {
1994+
strncat(str, ">", len - strlen(str) - 1);
1995+
}
1996+
strncat(str, "<", len - strlen(str) - 1);
1997+
a_numa_marker_is_open = 1;
1998+
}
1999+
}
2000+
2001+
18252002
if (core_index > 0) {
18262003
strncat(str, "/", len - strlen(str) - 1);
18272004
}
18282005

2006+
if (core != NULL) {
2007+
cpuset_for_core = core->cpuset;
2008+
} else {
2009+
cpuset_for_core = cpuset_for_socket;
2010+
}
2011+
18292012
/* Iterate over all existing PUs in this core */
18302013
pu_index = 0;
18312014
for (pu = hwloc_get_obj_inside_cpuset_by_type(topo,
1832-
core->cpuset,
2015+
cpuset_for_core,
18332016
HWLOC_OBJ_PU, pu_index);
18342017
NULL != pu;
18352018
pu = hwloc_get_obj_inside_cpuset_by_type(topo,
1836-
core->cpuset,
2019+
cpuset_for_core,
18372020
HWLOC_OBJ_PU, ++pu_index)) {
18382021

2022+
// if numas contain PU and are contained by cores (seems unlikely)
2023+
// example output [<..../....>/<..../....>/<..../....>/<..../....>]
2024+
if (print_numa_markers && type_under_numa == HWLOC_OBJ_PU) {
2025+
prev_numa = cur_numa;
2026+
cur_numa = find_my_numa(pu);
2027+
if (cur_numa && cur_numa != prev_numa) {
2028+
if (a_numa_marker_is_open) {
2029+
strncat(str, ">", len - strlen(str) - 1);
2030+
}
2031+
strncat(str, "<", len - strlen(str) - 1);
2032+
a_numa_marker_is_open = 1;
2033+
}
2034+
}
2035+
18392036
/* Is this PU in the cpuset? */
18402037
if (hwloc_bitmap_isset(cpuset, pu->os_index)) {
18412038
strncat(str, "B", len - strlen(str) - 1);
18422039
} else {
1843-
strncat(str, ".", len - strlen(str) - 1);
2040+
if (hwloc_bitmap_isset(allowed, pu->os_index)) {
2041+
strncat(str, ".", len - strlen(str) - 1);
2042+
} else {
2043+
strncat(str, "~", len - strlen(str) - 1);
2044+
}
2045+
}
2046+
}
2047+
if (print_numa_markers && type_under_numa == HWLOC_OBJ_PU) {
2048+
if (a_numa_marker_is_open) {
2049+
strncat(str, ">", len - strlen(str) - 1);
2050+
a_numa_marker_is_open = 0;
18442051
}
18452052
}
18462053
}
1847-
strncat(str, "]", len - strlen(str) - 1);
2054+
if (print_numa_markers && type_under_numa == HWLOC_OBJ_CORE) {
2055+
if (a_numa_marker_is_open) {
2056+
strncat(str, ">", len - strlen(str) - 1);
2057+
a_numa_marker_is_open = 0;
2058+
}
2059+
}
2060+
if (socket != NULL) { strncat(str, "]", len - strlen(str) - 1); }
2061+
}
2062+
if (print_numa_markers && type_under_numa == HWLOC_OBJ_SOCKET) {
2063+
if (a_numa_marker_is_open) {
2064+
strncat(str, ">", len - strlen(str) - 1);
2065+
a_numa_marker_is_open = 0;
2066+
}
18482067
}
18492068

18502069
return OPAL_SUCCESS;

0 commit comments

Comments
 (0)