EVOLUTION-MANAGER
Edit File: check_cpu
#!/usr/bin/env python
##############################################################################
# Copyright (c) Members of the EGEE Collaboration. 2011.
# See http://www.eu-egee.org/partners/ for details on the copyright
# holders.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS
# OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##############################################################################
#
# NAME :        check_cpu
#
# DESCRIPTION : Checks the CPU activity
#
# AUTHORS :     Alejandro.Alvarez.Ayllon@cern.ch
#
##############################################################################

import commands
import os
import time
from lcgdmcommon import *


class cpu:
    """
    Stores one sample of the CPU counters.

    Every attribute assigned on an instance is coerced to an integer, so the
    textual columns read from the stat file can be assigned directly.
    """

    # Field names, in the order they are unpacked from the stat line
    FIELDS = ["user", "nice", "system", "idle", "iowait", "irq", "softirq"]
    NFIELDS = len(FIELDS)

    def __setattr__(self, key, value):
        """
        Stores the attribute converted to an integer.

        @param key   Attribute name
        @param value Anything accepted by int(); raises ValueError otherwise
        """
        # Write through __dict__ to avoid recursing into __setattr__
        self.__dict__[key] = int(value)

    def __sub__(self, other):
        """
        Subtracts every field of other from the matching field of self.

        @param other Another cpu sample with all FIELDS set
        @return A list with one difference per entry of FIELDS, in FIELDS order
        """
        return [self.__dict__[f] - other.__dict__[f] for f in self.FIELDS]


class check_cpu:
    "Checks the CPU activity"
    __version__ = "0.0.1"
    __nagios_id__ = "DM-CPU"

    # Constants
    PROC_STAT = "/proc/stat"

    # Defaults (one threshold per entry of cpu.FIELDS, same order)
    DEFAULT_WARNING = "60,60,60,100,80,60,60"
    DEFAULT_CRITICAL = "70,70,70,100,90,70,70"
    DEFAULT_INTERVAL = 5

    # Specific parameters, where key = short, value = long (i.e. {"h":"help", "C:":"command="})
    # getopt format. The long version will be the one passed even when the short is specified
    __additional_opts__ = {"w:": "warning=",
                           "c:": "critical=",
                           "i:": "interval="}

    # Specific usage information
    __usage__ = """
\t-w, --warning\tSets the warning values. Default: %s
\t-c, --critical\tSets the critical values. Default: %s
\t-i, --interval\tMeasure interval. Default: %d

The warning and critical value order is %s.
The value will be considered percentage over the total. There is no need to use the %% symbol.

The result identify the percentage of time the CPU has spent performing the following operation:
\tuser:\t\tnormal processes executing in user mode
\tnice:\t\tniced processes executing in user mode
\tsystem:\t\tprocesses executing in kernel mode
\tidle:\t\ttwiddling thumbs
\tiowait:\t\twaiting for I/O to complete
\tirq:\t\tservicing interrupts
\tsoftirq:\tservicing softirqs

Description of work executed by the probe:

\t1. Get informations about cpu activity using command /proc/stat
\t2. wait 5 second
\t3. Get new informations about cpu activity
\t4. Considering these two values, compute the percentage the cpu spent in each mode
\t5. Return values to nagios
\t\t Warning alert is triggered if a cpu state reach is corresponding threshold (in term of percentage of total usage)
\t\t Critical alert is triggered if a cpu state reach is corresponding threshold (in term of percentage of total usage)
""" % (DEFAULT_WARNING, DEFAULT_CRITICAL, DEFAULT_INTERVAL, ",".join(cpu.FIELDS))

    # Methods

    def __init__(self, opt=None, args=None):
        """
        Constructor

        @param opt  Contains a dictionary with the long option name as the key, and the argument as value
        @param args Contains the arguments not associated with any option (unused by this probe)
        """
        # None instead of {} / [] so no mutable default is shared between calls
        if opt is None:
            opt = {}

        # Parameters
        opt_warning = opt.get("warning", self.DEFAULT_WARNING)
        opt_critical = opt.get("critical", self.DEFAULT_CRITICAL)

        # NOTE(review): get_tuple and real_value come from lcgdmcommon; presumably
        # they split the comma-separated string into cpu.NFIELDS items and turn
        # each item into a number relative to 100 -- confirm against that module.
        opt_warning = get_tuple(opt_warning, cpu.NFIELDS)
        opt_critical = get_tuple(opt_critical, cpu.NFIELDS)

        self.warning = [real_value(opt_warning[i], 100) for i in range(0, cpu.NFIELDS)]
        self.critical = [real_value(opt_critical[i], 100) for i in range(0, cpu.NFIELDS)]

        if "interval" in opt:
            self.interval = int(opt["interval"])
        else:
            self.interval = self.DEFAULT_INTERVAL

    def get_status(self):
        """
        Reads the current CPU counters.

        @return A cpu instance filled from columns 1 to 7 of the first
                entry returned by cat(PROC_STAT)
        """
        # NOTE(review): cat() comes from lcgdmcommon; presumably it returns the
        # file content as a list of lines, the first one being the aggregate
        # "cpu" line -- confirm against that module.
        stat_line = cat(self.PROC_STAT)[0]

        c = cpu()
        # Column 0 is the line label, the counters start at column 1
        (c.user, c.nice, c.system, c.idle, c.iowait, c.irq, c.softirq) = stat_line.split()[1:8]

        return c

    def main(self):
        """
        Test code itself. May raise exceptions.

        Takes two samples separated by self.interval seconds, converts the
        per-field deltas into a percentage of the total delta and compares
        each percentage with its warning and critical threshold.

        @return A tuple (exit code, message, performance)
        """
        debug("Using %s as source" % self.PROC_STAT)

        initial = self.get_status()
        time.sleep(self.interval)
        final = self.get_status()

        diff = final - initial
        total = sum(diff)
        if total == 0:
            # Happens when no tick elapsed between the samples (e.g. interval 0).
            # Same exception type as the division would raise, but self-explanatory.
            raise ZeroDivisionError("No CPU time elapsed between the two samples of %s" % self.PROC_STAT)

        status = EX_OK
        for i in range(0, cpu.NFIELDS):
            diff[i] = (diff[i] * 100.0) / total
            # Critical is tested first so a value above both thresholds is
            # never reported as a mere warning
            if diff[i] > self.critical[i]:
                status = EX_CRITICAL
            elif diff[i] > self.warning[i] and status == EX_OK:
                status = EX_WARNING

        # Message string: every field in extended verbosity, only "idle" otherwise
        if verbosity(V_EXTENDED):
            indexes = range(0, cpu.NFIELDS)
        else:
            indexes = [3,]

        msg = " ".join(["%s=%.2f%%" % (cpu.FIELDS[i].capitalize(), diff[i])
                        for i in indexes])

        # Performance data: value;warning;critical for every field
        performance = " ".join(["%s=%.2f%%;%d;%d" % (cpu.FIELDS[i], diff[i], self.warning[i], self.critical[i])
                                for i in range(0, cpu.NFIELDS)])

        return (status, msg, performance)


# When called directly
if __name__ == "__main__":
    run(check_cpu)