* Added analysis tool to summarize CPUs or group compute nodes based
on their CPUs.
@@ -62,9 +62,12 @@ class sh(object):
        return 0



globals().setdefault("NODE_LIST", [])
globals().setdefault("NODE_BAD_LIST", set())
_g = globals()
_g.setdefault("NODE_LIST", [])
#_g.setdefault("NODE_BAD_LIST", set())
_g.setdefault("NODE_BAD_LIST", [])
_g.setdefault("NODE_GOOD_LIST", [])
_g.setdefault("ROOT_DIR", "cluster-info")


def get_node_list():
@@ -95,14 +98,17 @@ def rhost_run(host, cmdline):
    return rslt


def rhosts_pipe_out(cmdline, filename, hosts=None, rootdir="cluster-info"):
def rhosts_pipe_out(cmdline, filename, hosts=None, rootdir=None):
    """Executes cmdline on each remote host (the list is given in `hosts`)
    and pipes the output to a per-host file named `filename` under `rootdir`.
    """
    global ROOT_DIR
    from os.path import dirname, join, isdir
    path_join = join
    Verb = 100
    if hosts is None:
        hosts = node_list()
    if rootdir is None:
        rootdir = ROOT_DIR
    for H in hosts:
        host_base = H.split(".")[0]
        outfname = path_join(rootdir, host_base, filename)
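For illustration, a usage sketch of rhosts_pipe_out with hypothetical host names (it assumes the module's functions are already in scope, e.g. in an interactive session):

# Hypothetical two-node example: collect memory info into per-host files.
rhosts_pipe_out(
    ("cat", "/proc/meminfo"),     # command given as an argv-style tuple
    "meminfo.txt",                # output file name, one copy per host
    hosts=["node01.example.org", "node02.example.org"],   # made-up host names
)
# With the default ROOT_DIR ("cluster-info") and the split on the first ".",
# output lands in cluster-info/node01/meminfo.txt and cluster-info/node02/meminfo.txt.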
@@ -137,12 +143,69 @@ def test_accessible_hosts(hosts=None):
    return good_hosts, bad_hosts


def cpuinfo_extract_processor_names(fn, ht=False):
    # REFS:
    # https://access.redhat.com/discussions/480953
    """Extracts the names of processors from /proc/cpuinfo.
    Returns it as a list of processor names.

    WARNING: Hyperthreading is detected with a lame methodology,
    and only half of the number of cores are reported (i.e. only
    physical cores)"""
    A = []
    siblings_on_socket = None
    cores_on_socket = None
    with open(fn, "r") as F:
        for L in F:
            if L.startswith("model name"):
                modelname = L.split(":", 1)[1].strip()
                A.append(modelname)
            elif L.startswith("siblings"):
                siblings_on_socket = int(L.split(":", 1)[1].strip())
            elif L.startswith("cpu cores"):
                cores_on_socket = int(L.split(":", 1)[1].strip())

    #print "siblings: ", siblings_on_socket
    #print "cores: ", cores_on_socket

    # FIXME: Quick-and-dirty solution for hyperthreading;
    # see Red Hat site above; not 100% reliable if there are several
    # kinds of CPU models, which I don't think I'll ever encounter.
    if (not ht) \
       and siblings_on_socket is not None \
       and cores_on_socket is not None \
       and siblings_on_socket != cores_on_socket:
        assert cores_on_socket*2 == siblings_on_socket
        # ^^otherwise it's not Hyperthreading, the code has to be fixed!

        A = A[0:len(A)//2]  ### HACK!!! keep only the physical cores
        print("Warning: hyperthreading detected in %s" % fn)

    return A
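For illustration, a minimal self-contained check of the parser; the cpuinfo text below is a fabricated two-core sample (no hyperthreading, made-up model name), not real gathered data, and it assumes the module's functions are in scope:

import tempfile, os

# Fabricated /proc/cpuinfo fragment: two logical CPUs, siblings == cpu cores.
_sample = """\
processor\t: 0
model name\t: Example(R) CPU E5-0000 @ 2.00GHz
siblings\t: 2
cpu cores\t: 2

processor\t: 1
model name\t: Example(R) CPU E5-0000 @ 2.00GHz
siblings\t: 2
cpu cores\t: 2
"""

with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as tmp:
    tmp.write(_sample)

names = cpuinfo_extract_processor_names(tmp.name)
print(names)   # ['Example(R) CPU E5-0000 @ 2.00GHz', 'Example(R) CPU E5-0000 @ 2.00GHz']
os.unlink(tmp.name)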
def agg_count_names(namelist):
    """Aggregates the names in namelist to names->count mapping, as a dict.
    Useful, e.g. for counting number of unique elements in a list.
    """
    A = {}
    for C in namelist:
        try:
            A[C] = A[C] + 1
        except KeyError:
            A[C] = 1
    return A

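A quick example of the aggregation, with made-up model names:

print(agg_count_names(["Xeon E5-2680", "Xeon E5-2680", "Opteron 6174"]))
# -> {'Xeon E5-2680': 2, 'Opteron 6174': 1}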
# Below are the main gather tools

def gather_cpuinfo(hosts=None):
    """Gather tool: for cpuinfo"""
    rhosts_pipe_out(("cat", "/proc/cpuinfo"), "cpuinfo.txt", hosts=hosts)

def gather_lscpu(hosts=None):
    """Gather tool: for lscpu"""
    rhosts_pipe_out(("lscpu",), "lscpu.txt", hosts=hosts)

def gather_lspci(hosts=None):
    """Gather tool: for lspci"""
@@ -152,5 +215,95 @@ def gather_free(hosts=None):
    """Gather tool: for free"""
    rhosts_pipe_out(("free",), "free.txt", hosts=hosts)

def gather_uname_a(hosts=None):
    """Gather tool: for uname -a"""
    rhosts_pipe_out(("uname", "-a"), "uname-a.txt", hosts=hosts)
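The gather_* functions are thin wrappers around rhosts_pipe_out; a new gatherer follows the same pattern. A hypothetical addition (not part of the commit) might look like:

def gather_df(hosts=None):
    """Gather tool: for df (hypothetical example of adding another gatherer)"""
    rhosts_pipe_out(("df", "-h"), "df-h.txt", hosts=hosts)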


#def dict_str_sorted(d):
#    return "{" + ", ".

def summarize_cpu(hosts=None):
    from pprint import pformat
    global ROOT_DIR
    hosts_base = [ H.split(".")[0] for H in hosts ]
    getfile = lambda H, bn: os.path.join(ROOT_DIR, H, bn)
    cpu_info = []

    px_hosts_by_type = {}

    for H in hosts_base:
        px_names = cpuinfo_extract_processor_names(getfile(H, "cpuinfo.txt"))
        px_group = agg_count_names(px_names)
        #print("%s : %s" % (H, px_group))

        px_group_key = pformat(px_group)  # use pretty representation

        try:
            px_hosts_by_type[px_group_key]["hosts"] += [ H ]
        except KeyError:
            px_hosts_by_type[px_group_key] = {
                "cpu_count": px_group,
                "hosts": [ H ]
            }

    return px_hosts_by_type
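The return value groups hosts by their CPU composition, keyed by the pformat of the per-node model->count dict; with fabricated host and CPU names it looks roughly like this (purely illustrative), and this is the structure that print_summarize_cpu below consumes:

example_summary = {
    "{'Example(R) CPU E5-0000 @ 2.00GHz': 16}": {
        "cpu_count": {"Example(R) CPU E5-0000 @ 2.00GHz": 16},
        "hosts": ["node01", "node02"],       # made-up host names
    },
    "{'Example(R) CPU X7-0000 @ 2.40GHz': 32}": {
        "cpu_count": {"Example(R) CPU X7-0000 @ 2.40GHz": 32},
        "hosts": ["node03"],
    },
}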
def print_summarize_cpu(summary):
    host_types = sorted(summary.keys())
    nproc_grand_total = 0
    nnode_grand_total = 0
    for T in host_types:
        rec = summary[T]
        nproc_per_node = sum(rec["cpu_count"].values())
        print("%s:: %d hosts, %d procs/node, total %d procs" \
              % (T,
                 len(rec["hosts"]),
                 nproc_per_node,
                 len(rec["hosts"]) * nproc_per_node,
                ))
        print("")
        print(" " + " ".join(sorted(rec["hosts"])))
        print("")
        nproc_grand_total += len(rec["hosts"]) * nproc_per_node
        nnode_grand_total += len(rec["hosts"])

    print("Grand total %d procs" % nproc_grand_total)
    print("Grand total %d nodes" % nnode_grand_total)


def tally_summarize_cpu(summary):
    """Tallies up the total number of processors
    """


def analyze_cpu_composition():
    summ = summarize_cpu(NODE_GOOD_LIST)
    print_summarize_cpu(summ)


def Gather_all():
    """Master gathering routine, to gather everything all at once.
    It will take some time to gather every bit of information.
    """
    global NODE_GOOD_LIST, NODE_BAD_LIST, NODE_LIST
    print("Testing node accessibility...")
    NODE_GOOD_LIST, NODE_BAD_LIST = test_accessible_hosts()

    print("\nGathering cpuinfo...")
    gather_cpuinfo(NODE_GOOD_LIST)

    print("\nGathering lscpu...")
    gather_lscpu(NODE_GOOD_LIST)

    print("\nGathering lspci...")
    gather_lspci(NODE_GOOD_LIST)

    print("\nGathering free mem...")
    gather_free(NODE_GOOD_LIST)

    print("\nGathering uname...")
    gather_uname_a(NODE_GOOD_LIST)
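Putting it together, an end-to-end sketch of a typical session, assuming the module has been loaded (e.g. exec'd or imported):

Gather_all()               # test reachability, then ssh out and write cluster-info/<host>/*.txt
analyze_cpu_composition()  # group the reachable nodes by CPU composition and print the tally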