* Added "hoststats" subcommand for summarizing host occupancy statistics
irrespective of queue.
This commit is contained in:
@@ -50,6 +50,52 @@ def node_slot_stats_raw(qstat_f, show_disabled_nodes=False):
|
|||||||
print(L)
|
print(L)
|
||||||
|
|
||||||
|
|
||||||
|
def node_slot_stats(qstat_f, show_disabled_nodes=False):
    """Prints status of slot availability per host, irrespective of queue.

    The per-host statistics are gathered by `collect_host_stats` and
    printed by `print_host_stats`, one line per host, with the columns:

        HOST  CORES  used  free  resv  load  load/used

    In SGE terminology, "slot" means a CPU core.

    Both arguments are passed unchanged to `collect_host_stats`:

    - qstat_f: the qstat data to summarize (presumably the output of
      `qstat -f`; confirm against `collect_host_stats`).
    - show_disabled_nodes: whether disabled nodes are included
      (the exact semantics are defined by `collect_host_stats`).

    NOTE on machines covered by more than one queue: naively adding up
    the per-queue numbers would overestimate the counts.
    On Turing, the practice is not to further split a machine to
    multiple queues (i.e. a 32-core node has all the 32 cores assignable
    to both main and timed-main queues, rather than dedicating 16 for
    main and 16 for timed-main), so the intended way to avoid the
    double-counting is:

    - slots_resv: total number of reserved slots in a node (for whatever
      the sysadmin designates) -- sum them up

    - slots_used: total number of slots currently used (i.e.,
      occupied by jobs) -- sum them up

    - slots_tot: total number of slots in a node -- take the maximum
      value encountered.
      Had the nodes been split-dedicated to particular queues, we would
      have to take the sum of the values instead.

    NOTE(review): the aggregation itself lives in `collect_host_stats`;
    confirm there that it follows the scheme described above.
    """
    host_stats = collect_host_stats(qstat_f, show_disabled_nodes)
    print_host_stats(host_stats)
|
||||||
|
|
||||||
|
|
||||||
def node_slot_stats_per_machine_type(qstat_f, show_disabled_nodes=False):
|
def node_slot_stats_per_machine_type(qstat_f, show_disabled_nodes=False):
|
||||||
"""Prints status of slot availability per machine type (defined as
|
"""Prints status of slot availability per machine type (defined as
|
||||||
host with the same base hostname (e.g. "c6-", or "c8-").
|
host with the same base hostname (e.g. "c6-", or "c8-").
|
||||||
@@ -174,6 +220,35 @@ def collect_host_stats(qstat_f, show_disabled_nodes=None):
|
|||||||
return host_stats
|
return host_stats
|
||||||
|
|
||||||
|
|
||||||
|
def node_load_ratio(node_load, slots_used):
    """Returns the ratio of the node load to the number of slots in use.

    A value close to one suggests the jobs use their CPUs efficiently;
    a value near zero suggests mostly idle (e.g. interactive) jobs.

    When no slot is in use the ratio is undefined, so:
    - a node load below 0.75 is reported as 0.0;
    - any other node load is reported as NaN.
    """
    if slots_used != 0:
        return node_load / slots_used
    # No slots in use: distinguish a (nearly) idle node from one that
    # carries load without any slot accounted for.
    if node_load < 0.75:
        return 0.0
    return float('nan')
|
||||||
|
|
||||||
|
|
||||||
|
def print_host_stats(host_stats):
    """Prints one line of slot and load statistics per host.

    `host_stats` is the mapping produced by `collect_host_stats`: it is
    keyed by hostname, and each value is a mapping that provides the
    keys 'slots_tot', 'slots_used', 'slots_resv' and 'node_load'.

    A header line is printed first, then the hosts in sorted hostname
    order.  The "free" column is the total slots minus the used and the
    reserved slots; the last column is the load-to-used-slots ratio as
    computed by `node_load_ratio`.
    """
    header_fmt = "%-16s %5s %5s %5s %5s %7s %9s"
    row_fmt = "%-16s %5d %5d %5d %5d %7.2f %9.3f"

    print(header_fmt
          % ("HOST", "CORES", "used", "free", "resv", "load", "load/used"))

    for hostname in sorted(host_stats.keys()):
        stats = host_stats[hostname]
        total = stats['slots_tot']
        used = stats['slots_used']
        reserved = stats['slots_resv']
        load = stats['node_load']
        free = total - used - reserved
        ratio = node_load_ratio(load, used)
        print(row_fmt % (hostname, total, used, free, reserved, load, ratio))
|
||||||
|
|
||||||
|
|
||||||
def summarize_hosttype_stats(host_stats):
|
def summarize_hosttype_stats(host_stats):
|
||||||
"""Further summarize the host stats by the host type (denoted by the
|
"""Further summarize the host stats by the host type (denoted by the
|
||||||
prefix of the hostname before the dash character, i.e. "c8" for
|
prefix of the hostname before the dash character, i.e. "c8" for
|
||||||
@@ -226,6 +301,7 @@ def print_hosttype_stats(hosttype_stats):
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def help():
|
def help():
|
||||||
msg = """\
|
msg = """\
|
||||||
%(CMD)s - Shows node status from SGE information
|
%(CMD)s - Shows node status from SGE information
|
||||||
@@ -263,7 +339,9 @@ def main_default(argv):
|
|||||||
elif argv[1] in ('--raw', 'raw'):
|
elif argv[1] in ('--raw', 'raw'):
|
||||||
cmd = "raw"
|
cmd = "raw"
|
||||||
elif argv[1] in ('--stats', 'stats', 'stat'):
|
elif argv[1] in ('--stats', 'stats', 'stat'):
|
||||||
cmd = "stats"
|
cmd = "stats" # old stats, a.k.a. hosttype_stats
|
||||||
|
elif re.search(r'^(--)?host-?stat', argv[1]):
|
||||||
|
cmd = "hoststats"
|
||||||
elif argv[1] in ('--help', 'help', '-h'):
|
elif argv[1] in ('--help', 'help', '-h'):
|
||||||
help()
|
help()
|
||||||
return 0
|
return 0
|
||||||
@@ -311,6 +389,10 @@ def main_default(argv):
|
|||||||
node_slot_stats_raw(qstat_f_current,
|
node_slot_stats_raw(qstat_f_current,
|
||||||
show_disabled_nodes=show_disabled_nodes,
|
show_disabled_nodes=show_disabled_nodes,
|
||||||
)
|
)
|
||||||
|
elif cmd == "hoststats":
|
||||||
|
node_slot_stats(qstat_f_current,
|
||||||
|
show_disabled_nodes=show_disabled_nodes,
|
||||||
|
)
|
||||||
elif cmd == "stats":
|
elif cmd == "stats":
|
||||||
node_slot_stats_per_machine_type(qstat_f_current,
|
node_slot_stats_per_machine_type(qstat_f_current,
|
||||||
show_disabled_nodes=show_disabled_nodes,
|
show_disabled_nodes=show_disabled_nodes,
|
||||||
|
|||||||
Reference in New Issue
Block a user