import agent_util
import time
import sys
import platform
import os
import socket
from agent_util import float
try:
import psutil
except:
psutil = None
try:
import distro
except:
distro = None
def search_esxtop(headers, search_string):
    """Locate the first esxtop column whose header contains a substring.

    headers: sequence of header strings.
    search_string: substring to look for in each header.

    Returns the zero-based position of the first matching header, or None
    when no header contains search_string.
    """
    matching_positions = (
        position
        for position, header in enumerate(headers)
        if search_string in header
    )
    return next(matching_positions, None)
def get_cpu_metrics(cls):
    """Parse the per-CPU counters reported by ``cat /proc/stat``.

    cls: object exposing a ``log`` logger (the plugin class passes itself).

    Returns a dict keyed by CPU name. The aggregate ``cpu`` line is stored
    under "Total"; per-core lines keep their own name (``cpu0``, ``cpu1``,
    ...). Each value is a dict mapping counter names ("user", "nice",
    "system", "idle", "iowait", "irq", "softirq" and, when the kernel
    reports them, "steal", "guest", "guest_nice") to integers. An empty dict
    is returned when the command produced no output.

    Raises ValueError if a counter on a ``cpu`` line is not an integer.
    """
    retcode, output = agent_util.execute_command("cat /proc/stat")
    cls.log.debug("cat /proc/stat output: %s" % str(output))

    # Column order of the counters on a "cpu" line, per proc(5).
    stat_fields = ("user", "nice", "system", "idle", "iowait",
                   "irq", "softirq", "steal", "guest", "guest_nice")
    # Lines carrying fewer counters than user..softirq are ignored.
    required_counters = 7

    cpus = {}
    if not output:
        # Nothing to parse (e.g. the command failed on a platform that has
        # no /proc/stat).
        return cpus

    for line in output.splitlines():
        if not line.startswith("cpu"):
            continue
        # str.split() with no argument already drops the empty tokens
        # produced by the double space after the aggregate "cpu" label.
        parts = line.split()
        counters = parts[1:1 + len(stat_fields)]
        if len(counters) < required_counters:
            continue
        core = parts[0]
        if core == "cpu":
            core = "Total"
        # zip() stops at the shorter sequence, so kernels that report only
        # 7, 8 or 9 counters simply yield fewer keys.
        cpus[core] = dict(zip(stat_fields, map(int, counters)))
    return cpus
# Labels for the percentage metrics, shared by every platform's metadata.
# NOTE: "softirg" is misspelled, but it is the textkey used by both
# get_metadata() and check(), so it is kept unchanged.
_CPU_PERCENT_LABELS = {
    "user_usage_percentage": "User usage percentage",
    "system_usage_percentage": "System usage percentage",
    "idle_usage_percentage": "Idle usage percentage",
    "iowait_usage_percentage": "I/O Wait usage percentage",
    "cpu_entitlement_percentage": "CPU entitlement percentage",
    "irq_usage_percentage": "Hardware IRQ usage percentage",
    "softirg_usage_percentage": "Software IRQ usage percentage",
    "stealtime_usage_percentage": "Steal Time usage percentage",
    "nice_usage_percentage": "Nice usage percentage",
}

# Attribute of psutil.cpu_times() that backs each detailed Linux textkey.
_PSUTIL_FIELD_BY_TEXTKEY = {
    'user_usage_percentage': 'user',
    'system_usage_percentage': 'system',
    'idle_usage_percentage': 'idle',
    'iowait_usage_percentage': 'iowait',
    'irq_usage_percentage': 'irq',
    'softirg_usage_percentage': 'softirq',
    'stealtime_usage_percentage': 'steal',
    'nice_usage_percentage': 'nice',
}

# Position (from the end of the line) of each load average in the output of
# `uptime` and in the summary line of `top -b`.
_LOAD_AVERAGE_POSITIONS = {
    'load_average.1': -3,
    'load_average.5': -2,
    'load_average.15': -1,
}


def _metric_entry(label, status, msg, unit="percent", options=None, **extra):
    """Build one metadata entry; extra keyword arguments become extra keys."""
    entry = {
        "label": label,
        "options": options,
        "status": status,
        "error_message": msg,
        "unit": unit,
    }
    entry.update(extra)
    return entry


def _cpu_metadata(status, msg, usage_options, detail_keys,
                  usage_label="Usage percentage"):
    """Build the metadata dict: load averages, usage_percentage, detail_keys."""
    metadata = {}
    for window in ("1", "5", "15"):
        metadata["load_average.%s" % window] = _metric_entry(
            "%s minute CPU load average" % window, status, msg, unit="avg")
    metadata["usage_percentage"] = _metric_entry(
        usage_label, status, msg, options=usage_options)
    for key in detail_keys:
        metadata[key] = _metric_entry(_CPU_PERCENT_LABELS[key], status, msg)
    return metadata


def _load_average_from_uptime(textkey):
    """Run `uptime` and return the load average selected by textkey, or None."""
    retcode, load = agent_util.execute_command('uptime')
    fields = load.strip().split()
    position = _LOAD_AVERAGE_POSITIONS.get(textkey)
    if position is None:
        return None
    return float(fields[position].strip(','))


def _busy_percentage(cur_cpu, last_cpu, elapsed, num_cores):
    """Return 100 minus the idle+steal share of elapsed, or None if unusable.

    None is returned when a counter went backwards (the cached sample most
    likely predates a reboot) or when elapsed is zero/empty.
    """
    idle_diff = (cur_cpu.idle - last_cpu.idle) / num_cores
    steal_diff = (cur_cpu.steal - last_cpu.steal) / num_cores
    if idle_diff < 0 or steal_diff < 0:
        return None
    if not elapsed:
        return None
    return 100 - (((idle_diff + steal_diff) / elapsed) * 100.)


class CPUUsagePlugin(agent_util.Plugin):
    """Agent plugin reporting CPU load averages and CPU usage percentages.

    Platform is selected from sys.platform: AIX, SunOS, FreeBSD/Darwin,
    HP-UX and VMware shell out to system tools; every other platform is
    handled as Linux and relies on psutil plus `top`.
    """
    textkey = "cpu_usage"
    label = "CPU"

    @classmethod
    def get_metadata(cls, config):
        """Describe the metrics this plugin can collect on this platform.

        config: not read by this method.

        Returns a dict mapping each metric textkey to a dict with "label",
        "options", "status", "error_message" and "unit" (plus "min_value"
        and "max_value" for the VMware percentage metrics).

        Raises ValueError on the Linux path when platform.dist() is not
        available and the `distro` package could not be imported.
        """
        status = agent_util.SUPPORTED
        msg = None

        if 'aix' in sys.platform:
            return _cpu_metadata(
                status, msg, sorted(get_cpu_metrics(cls).keys()),
                ("user_usage_percentage", "system_usage_percentage",
                 "idle_usage_percentage", "iowait_usage_percentage",
                 "cpu_entitlement_percentage"))

        if 'sunos' in sys.platform:
            return _cpu_metadata(
                status, msg, sorted(get_cpu_metrics(cls).keys()),
                ("user_usage_percentage", "system_usage_percentage",
                 "idle_usage_percentage", "iowait_usage_percentage"))

        if 'freebsd' in sys.platform or "darwin" in sys.platform:
            return _cpu_metadata(
                status, msg, ["Total"],
                ("user_usage_percentage", "system_usage_percentage",
                 "idle_usage_percentage"))

        if 'hp-ux' in sys.platform:
            return _cpu_metadata(
                status, msg, ['Total'],
                ("user_usage_percentage", "system_usage_percentage",
                 "idle_usage_percentage"),
                usage_label="Total Usage percentage")

        if 'vmware' in sys.platform:
            # Offer every CPU listed by esxcli plus a "Total" aggregation.
            ret, out = agent_util.execute_command('esxcli hardware cpu list | grep "CPU:"')
            cpus = ["Cpu (%s)" % row.split(':')[1]
                    for row in out.split('\n') if row != '']
            cpus.append("Total")
            data = _cpu_metadata(status, msg, cpus, ())
            data["usage_percentage"].update(min_value=0, max_value=100)
            data["idle_usage_percentage"] = _metric_entry(
                "Idle usage percentage", status, msg, options=cpus,
                min_value=0, max_value=100)
            return data

        # Everything else is treated as Linux.
        if psutil is None:
            cls.log.info("Unable to import psutil library, no process metrics available")
            status = agent_util.UNSUPPORTED
            msg = "Unable to import psutil library, please install and rebuild metadata"
        if not agent_util.which("top", exc=False):
            cls.log.info("top binary not found")
            status = agent_util.UNSUPPORTED
            msg = "top binary not found"

        try:
            distro_parts = platform.dist()
        except AttributeError:
            # platform.dist() does not exist on newer Pythons; fall back to
            # the optional `distro` package.
            if not distro:
                raise ValueError('Unable to grab distribution information. Please verify dependencies. Distro for Python3.8')
            distro_parts = distro.linux_distribution()
        # Compare case-insensitively: distro reports names such as
        # "CentOS Linux" or "Red Hat Enterprise Linux", which the lowercase
        # markers would otherwise never match.
        distro_info = '. '.join(distro_parts).lower()
        needs_sysstat = any(marker in distro_info
                            for marker in ('centos', 'redhat', 'red hat', 'oracle'))
        if needs_sysstat and not agent_util.which('iostat', exc=False):
            cls.log.info('Missing sysstat package.')
            status = agent_util.UNSUPPORTED
            msg = "iostat/sysstat binary not found. Please install"

        return _cpu_metadata(
            status, msg, sorted(get_cpu_metrics(cls).keys()),
            ("user_usage_percentage", "system_usage_percentage",
             "idle_usage_percentage", "iowait_usage_percentage",
             "irq_usage_percentage", "softirg_usage_percentage",
             "stealtime_usage_percentage", "nice_usage_percentage"))

    def check(self, textkey, data, config=None):
        """Collect the current value of one metric.

        textkey: metric textkey, one of the keys returned by get_metadata().
        data: selected option for the metric (a CPU name or "Total"), used
            by "usage_percentage" on Linux and by the VMware metrics.
        config: optional dict; only its "debug" key is read (Linux load
            averages).

        Returns a number, or None when the value cannot be determined. On
        the Linux path 0 is returned for a textkey that matches no known
        metric family.
        """
        config = config or {}
        current_platform = sys.platform

        if ('aix' in current_platform or 'darwin' in current_platform
                or "freebsd" in current_platform):
            if textkey.startswith('load_average'):
                return _load_average_from_uptime(textkey)
            return self._check_iostat(textkey)

        if 'sunos' in current_platform:
            if textkey.startswith('load_average'):
                return _load_average_from_uptime(textkey)
            return self._check_mpstat(textkey)

        if 'vmware' in current_platform:
            return self._check_esxtop(textkey, data)

        if 'hp-ux' in current_platform:
            return self._check_hpux_top(textkey)

        return self._check_linux(textkey, data, config)

    def _check_iostat(self, textkey):
        """AIX / Darwin / FreeBSD: read CPU percentages from iostat."""
        iostat = str(agent_util.which('iostat'))
        if 'aix' in sys.platform:
            command = iostat + ' | grep -p tty'
        else:
            command = iostat + ' -C -c 2 | tail -1'
        retcode, output = agent_util.execute_command(command)
        output = output.strip().split('\n')
        self.log.debug("iostat output: %s" % output)

        has_entitlement = False
        parsed = False
        entc = 0
        user = 0
        system = 0
        idle = 0
        iowait = 0
        for line in output:
            fields = line.split()
            if not fields:
                continue
            if line.startswith('tty'):
                # Header line; a trailing "entc" column means the data line
                # ends with the entitlement value.
                if 'entc' in fields[-1]:
                    has_entitlement = True
                continue
            if "darwin" in sys.platform:
                user = float(fields[-6])
                system = float(fields[-5])
                idle = float(fields[-4])
            elif 'freebsd' in sys.platform:
                user = float(fields[-5])
                system = float(fields[-3])
                idle = float(fields[-1])
            else:
                user = float(fields[2])
                system = float(fields[3])
                idle = float(fields[4])
                iowait = float(fields[5])
                if has_entitlement:
                    entc = float(fields[-1])
            parsed = True

        if not parsed:
            # Without a data line the zero defaults would be reported as
            # 100% usage.
            self.log.error("Unable to parse iostat output")
            return None

        inuse = 100. - idle
        if textkey == 'usage_percentage':
            return inuse
        elif textkey == 'user_usage_percentage':
            return user
        elif textkey == 'system_usage_percentage':
            return system
        elif textkey == 'idle_usage_percentage':
            return idle
        elif textkey == 'iowait_usage_percentage':
            return iowait
        elif textkey == 'cpu_entitlement_percentage' and has_entitlement:
            return entc
        # Unknown metric for these platforms.
        return None

    def _check_mpstat(self, textkey):
        """SunOS: read CPU percentages from the first data line of mpstat."""
        retcode, output = agent_util.execute_command('mpstat')
        # Position from the end of the line for each metric; the bool says
        # whether the value is reported as 100 minus the column.
        columns = {
            'usage_percentage': (-1, True),
            'user_usage_percentage': (-4, False),
            'system_usage_percentage': (-3, False),
            'idle_usage_percentage': (-1, False),
            'iowait_usage_percentage': (-2, False),
        }
        column = columns.get(textkey)
        if column is None:
            # Unknown metric for Solaris.
            return None
        position, invert = column
        for line in output.split('\n'):
            if 'CPU' in line:
                continue
            fields = line.split()
            if not fields:
                continue
            value = float(fields[position])
            if invert:
                return 100. - value
            return value
        return None

    def _check_esxtop(self, textkey, data):
        """VMware: look up one column in the last sample of `esxtop -b`."""
        hostname = socket.gethostname()
        search_string = '\\\\%s\\Physical ' % hostname

        ret, out = agent_util.execute_command("esxtop -b -n 2 -d 2", cache_timeout=agent_util.DEFAULT_CACHE_TIMEOUT)
        out_list = out.split('\n')
        headers = out_list[0].replace('"', '').split(',')
        # The last non-empty line is the most recent sample.
        samples = [row for row in out_list if row]
        esxtop_data = samples[-1].replace('"', '').split(',') if samples else []

        # Finish building the header to search for.
        if textkey.startswith('load_average'):
            search_string += 'Cpu Load\\Cpu Load (%s Minute Avg)' % textkey.split('.')[-1]
        elif data and (textkey == 'usage_percentage' or textkey == 'idle_usage_percentage'):
            if data == "Total":
                search_string += 'Cpu(_Total)'
            else:
                search_string += data
            search_string += '\\% Processor Time'

        search_idx = search_esxtop(headers, search_string)
        # Compare against None explicitly: index 0 is a valid column.
        if search_idx is None or search_idx >= len(esxtop_data):
            self.log.error("Unable to parse ESXTOP output for %s" % search_string)
            return None

        if textkey == 'idle_usage_percentage':
            return 100 - float(esxtop_data[search_idx])
        return float(esxtop_data[search_idx])

    def _check_hpux_top(self, textkey):
        """HP-UX: parse the 'avg' and 'load averages' lines of top."""
        # top needs a terminal type to be set.
        os.environ['TERM'] = "xterm"
        # !!! applicable to HP-UX 11.31 !!!
        ret, out = agent_util.execute_command("top -s2 -d2", env=os.environ)
        top = out.strip().splitlines()
        self.log.debug(top)

        cpu_str = ''
        load_str = ''
        for line in top:
            lowered = line.lower()
            if lowered.startswith('avg'):
                cpu_str = line
            elif lowered.startswith('load averages'):
                load_str = line

        metric_mapping = {}
        cpu = cpu_str.replace('%', '').split()
        self.log.debug(cpu)
        metric_mapping['user_usage_percentage'] = float(cpu[2])
        metric_mapping['system_usage_percentage'] = float(cpu[4])
        metric_mapping['idle_usage_percentage'] = float(cpu[5])
        metric_mapping['usage_percentage'] = 100. - metric_mapping['idle_usage_percentage']

        load = load_str.strip().replace(',', '').split()
        self.log.debug(load)
        self.log.debug("'%s'" % load[4][:4])
        metric_mapping['load_average.1'] = float(load[2])
        metric_mapping['load_average.5'] = float(load[3])
        metric_mapping['load_average.15'] = float(load[4][:4])

        value = metric_mapping.get(textkey)
        if value is None:
            # Unknown metric: report "no value" instead of calling
            # float(None).
            return None
        return float(value)

    def _check_linux(self, textkey, data, config):
        """Linux: load averages from top, percentages from psutil deltas."""
        if psutil is None:
            self.log.error("PSUTIL PACKAGE MISSING! UNABLE TO COLLECT CPU METRICS")
            return None

        if textkey.startswith("load_average"):
            return self._linux_load_average(textkey, config)

        if textkey.endswith('usage_percentage') and textkey != 'usage_percentage':
            return self._linux_detailed_usage(textkey)

        if textkey == 'usage_percentage':
            if data is None:
                self.log.error("No CPU option supplied for '%s'" % textkey)
                return None
            if str(data).lower() == 'total':
                return self._linux_total_usage()
            return self._linux_core_usage(data)

        return 0

    def _linux_load_average(self, textkey, config):
        """Read one load average from the second iteration of `top -b`."""
        retcode, output = agent_util.execute_command("top -b -n 2 -d 0.5")
        if config.get("debug", False):
            banner = '#####################################################'
            self.log.debug(banner)
            self.log.debug("CPU usage command 'top -b -n 2 -d 0.5:")
            self.log.debug(str(output))
            self.log.debug(banner)
        self.log.debug("top -b -n 2 -d 0.5: %s" % str(output))

        lines = output.splitlines()
        blank_positions = [pos for pos, line in enumerate(lines) if line == ""]
        # The text after the second blank line is taken as the second top
        # iteration; with fewer blank lines fall back to the start of the
        # output.
        start = blank_positions[1] if len(blank_positions) > 1 else 0
        summary = [line for line in lines[start:] if line.strip()]

        position = _LOAD_AVERAGE_POSITIONS.get(textkey)
        if position is None or not summary:
            self.log.error("Unable to determine '%s' from top output" % textkey)
            return None
        return float(summary[0].split()[position].strip(","))

    def _read_cached_sample(self, cache_key):
        """Return (elapsed, previous sample) from the 'psutil' cache.

        Both are None when nothing is cached. The first cached entry is
        read as an (elapsed, sample) pair -- presumably elapsed is the
        number of seconds since the sample was stored; confirm against
        agent_util.Plugin.
        """
        cached = self.get_cache_results('psutil', cache_key)
        self.log.debug("Retrieved cached value:\n%s" % cached)
        if not cached:
            return None, None
        return cached[0][0], cached[0][1] or None

    def _linux_detailed_usage(self, textkey):
        """Percentage of elapsed time spent in one psutil.cpu_times() field."""
        key_name = _PSUTIL_FIELD_BY_TEXTKEY.get(textkey)
        if key_name is None:
            self.log.error("Unknown resource textkey '%s'!" % textkey)
            return None

        num_cores = psutil.cpu_count()
        elapsed, last_cpu = self._read_cached_sample('detailed_cpu_usage')
        cur_cpu = psutil.cpu_times()
        self.log.debug("Retrieved instant value:\n%s" % getattr(cur_cpu, key_name))
        self.cache_result('psutil', 'detailed_cpu_usage', cur_cpu, replace=True)
        if last_cpu is None:
            return None

        use_diff = (getattr(cur_cpu, key_name) - getattr(last_cpu, key_name)) / num_cores
        if use_diff < 0:
            # The system was likely rebooted, so the cached counters are no
            # longer relevant. The new sample is already cached; report
            # nothing this round.
            return None
        if not elapsed:
            return None
        return (use_diff / elapsed) * 100.

    def _linux_total_usage(self):
        """Overall usage: 100 minus the idle+steal share, averaged over cores."""
        num_cores = psutil.cpu_count()
        elapsed, last_cpu = self._read_cached_sample('total_cpu_usage')
        cur_cpu = psutil.cpu_times()
        self.log.debug("Retrieved instant value:\n%s" % cur_cpu.idle)
        self.cache_result('psutil', 'total_cpu_usage', cur_cpu, replace=True)
        if last_cpu is None:
            return None
        return _busy_percentage(cur_cpu, last_cpu, elapsed, num_cores)

    def _linux_core_usage(self, data):
        """Usage of a single core named like 'cpu3', clamped at 0."""
        self.log.debug("Checking for core %s" % data)
        cache_key = '%s_cpu_usage' % data
        elapsed, last_cpu = self._read_cached_sample(cache_key)
        try:
            cur_cpu = psutil.cpu_times(percpu=True)[int(str(data).strip('cpu'))]
        except (IndexError, ValueError):
            # Either the core does not exist or the option is not of the
            # form "cpu<N>".
            self.log.critical("UNABLE TO FIND CPU #%s" % data)
            return None
        self.log.debug("Retrieved instant value:\n%s" % cur_cpu.idle)
        self.cache_result('psutil', cache_key, cur_cpu, replace=True)
        if last_cpu is None:
            return None

        # Per-core counters are not averaged, hence a divisor of 1.
        usage_time = _busy_percentage(cur_cpu, last_cpu, elapsed, 1)
        if usage_time is None:
            return None
        if usage_time < 0:
            return 0
        return usage_time