* Added "hoststats" subcommand for summarizing host occupancy statistics
irrespective of queue.
This commit is contained in:
@@ -50,6 +50,52 @@ def node_slot_stats_raw(qstat_f, show_disabled_nodes=False):
|
||||
print(L)
|
||||
|
||||
|
||||
def node_slot_stats(qstat_f, show_disabled_nodes=False):
    """Prints the slot availability of every individual host, irrespective
    of the queue(s) the host belongs to.

    In SGE terminology, "slot" means a CPU core.

    The per-host numbers are gathered by `collect_host_stats` and printed
    by `print_host_stats`, one row per host, under the columns:

        HOST  CORES  used  free  resv  load  load/used

    (the rows change depending on what's disabled and the load of the
    cluster)

    Arguments:

    - qstat_f: passed through unchanged to `collect_host_stats`
      (presumably the `qstat -f` output to analyze -- confirm against
      that function).

    - show_disabled_nodes: passed through unchanged to
      `collect_host_stats`.

    Notes on double counting (retained from the original documentation of
    this function; the accounting itself is not done here -- presumably
    it lives in `collect_host_stats`, confirm there):

    FIXME: If a machine is covered by more than one queue, this will
    cause the counts to be overestimated. Must register if a machine has
    been encountered and not re-account that machine.
    However this may not be the best approach as queues are overlapping
    on machines. Since on Turing, the practice is not to further split a
    machine to multiple queues (i.e. a 32-core node has all the 32
    cores assignable to both main and timed-main queues, rather than
    dedicating 16 for main and 16 for timed-main), we use a particular
    way to avoid the double-counting:

    - slots_resv: total number of reserved slots in a node (for whatever
      the sysadmin designates) -- sum them up

    - slots_used: total number of slots currently used (i.e.,
      occupied by jobs) -- sum them up

    - slots_tot: total number of slots in a node -- take the maximum
      value encountered.
      Had the nodes been split-dedicated to a particular queue, we would
      have to take the sum of the values instead.

    """
    # The unused function-local `from pprint import pprint` was dropped:
    # nothing in this function ever called it.
    host_stats = collect_host_stats(qstat_f, show_disabled_nodes)
    print_host_stats(host_stats)
|
||||
|
||||
|
||||
def node_slot_stats_per_machine_type(qstat_f, show_disabled_nodes=False):
|
||||
"""Prints status of slot availability per machine type (defined as
|
||||
host with the same base hostname (e.g. "c6-", or "c8-").
|
||||
@@ -174,6 +220,35 @@ def collect_host_stats(qstat_f, show_disabled_nodes=None):
|
||||
return host_stats
|
||||
|
||||
|
||||
def node_load_ratio(node_load, slots_used):
    """Returns how much load a node carries per slot that is claimed as used.

    A value near one means the claimed slots are kept busy (jobs use their
    CPUs efficiently); a value near zero means the slots are mostly idle
    (e.g. interactive jobs).

    When no slot is in use the ratio is undefined, so:
    - a load below 0.75 is reported as 0.0;
    - any other load is reported as NaN.
    """
    if slots_used != 0:
        return node_load / slots_used
    # No slots claimed: only a small load is reported as a plain zero.
    if node_load < 0.75:
        return 0.0
    return float('nan')
|
||||
|
||||
|
||||
def print_host_stats(host_stats):
    """Prints one table row per host from the statistics gathered by
    `collect_host_stats`.

    `host_stats` is a mapping keyed by hostname; each value is read through
    the keys 'slots_tot', 'slots_used', 'slots_resv' and 'node_load'.
    Rows are printed in sorted hostname order, below a header line.
    The "free" column is slots_tot - slots_used - slots_resv, and the
    "load/used" column comes from `node_load_ratio`.
    """
    header_fmt = "%-16s %5s %5s %5s %5s %7s %9s"
    row_fmt = "%-16s %5d %5d %5d %5d %7.2f %9.3f"

    # Sort before printing anything, so a failing sort emits no header.
    ordered_hosts = sorted(host_stats)

    print(header_fmt
          % ("HOST", "CORES", "used", "free", "resv", "load", "load/used"))

    for host in ordered_hosts:
        entry = host_stats[host]
        total = entry['slots_tot']
        used = entry['slots_used']
        reserved = entry['slots_resv']
        load = entry['node_load']
        row = (
            host,
            total,
            used,
            total - used - reserved,
            reserved,
            load,
            node_load_ratio(load, used),
        )
        print(row_fmt % row)
|
||||
|
||||
|
||||
def summarize_hosttype_stats(host_stats):
|
||||
"""Further summarize the host stats by the host type (denoted by the
|
||||
prefix of the hostname before the dash character, i.e. "c8" for
|
||||
@@ -226,6 +301,7 @@ def print_hosttype_stats(hosttype_stats):
|
||||
|
||||
|
||||
|
||||
|
||||
def help():
|
||||
msg = """\
|
||||
%(CMD)s - Shows node status from SGE information
|
||||
@@ -263,7 +339,9 @@ def main_default(argv):
|
||||
elif argv[1] in ('--raw', 'raw'):
|
||||
cmd = "raw"
|
||||
elif argv[1] in ('--stats', 'stats', 'stat'):
|
||||
cmd = "stats"
|
||||
cmd = "stats" # old stats, a.k.a. hosttype_stats
|
||||
elif re.search(r'^(--)?host-?stat', argv[1]):
|
||||
cmd = "hoststats"
|
||||
elif argv[1] in ('--help', 'help', '-h'):
|
||||
help()
|
||||
return 0
|
||||
@@ -311,6 +389,10 @@ def main_default(argv):
|
||||
node_slot_stats_raw(qstat_f_current,
|
||||
show_disabled_nodes=show_disabled_nodes,
|
||||
)
|
||||
elif cmd == "hoststats":
|
||||
node_slot_stats(qstat_f_current,
|
||||
show_disabled_nodes=show_disabled_nodes,
|
||||
)
|
||||
elif cmd == "stats":
|
||||
node_slot_stats_per_machine_type(qstat_f_current,
|
||||
show_disabled_nodes=show_disabled_nodes,
|
||||
|
||||
Reference in New Issue
Block a user