EVOLUTION-MANAGER
Edit File: check_process
#!/usr/bin/env python
##############################################################################
# Copyright (c) Members of the EGEE Collaboration. 2011.
# See http://www.eu-egee.org/partners/ for details on the copyright
# holders.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS
# OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##############################################################################
#
# NAME :        check_process
#
# DESCRIPTION : Checks one or several processes performance data
#
# AUTHORS :     Alejandro.Alvarez.Ayllon@cern.ch
#
##############################################################################

import commands
import os
import pwd
from lcgdmcommon import *


class process:
    """
    Aggregated information about all running instances of one command.

    Attributes:
        comm: command name, as reported by ps
        pid:  list with the pid of every instance found
        cpu:  sum of the CPU percentages of all instances
        mem:  sum of the memory percentages of all instances
        thr:  sum of the thread counts of all instances
        conn: sum of the lsof network connection counts of all instances
        fd:   sum of the lsof file descriptor counts of all instances
    """

    def __init__(self, comm):
        """
        Constructor

        @param comm The command name
        """
        self.comm = comm
        self.pid = []
        self.cpu = 0
        self.mem = 0
        self.thr = 0
        self.conn = 0
        self.fd = 0

    def __str__(self):
        """
        String representation for performance

        Dots in the command name are replaced with dashes to build the labels.
        """
        procname = self.comm.replace('.', '-')
        return "'%s instances'=%d '%s cpu'=%.2f%% '%s mem'=%.2f%% '%s thr'=%d '%s conn'=%d '%s fd'=%d" % \
               (procname, len(self.pid),
                procname, self.cpu,
                procname, self.mem,
                procname, self.thr,
                procname, self.conn,
                procname, self.fd)


class check_process:
    "Checks a process activity"
    __version__ = "0.0.1"
    __nagios_id__ = "DM-PROC"

    # Defaults
    # Order: instances,cpu%,mem%,threads,connections,descriptors
    DEFAULT_WARNING = "10,80%,50%,100,100,800"
    DEFAULT_CRITICAL = "20,95%,80%,200,200,950"

    # Specific parameters, where key = short, value = long (i.e. {"h":"help", "C:":"command="})
    # getopt format. The long version will be the one passed even when the short is specified
    __additional_opts__ = {"p:": "processes=",
                           "w:": "warning=",
                           "c:": "critical="}

    # Specific usage information
    __usage__ = """
\t-p, --processes\tA list of processes separated by commas (e.g. gridftp,rfiod)
\t-w, --warning\tWarning limits. It is a tuple with the values of instances,cpu%%,mem%%,threads,connections,descriptors. Default: %s
\t-c, --critical\tCritical limits. Same format as warning. Default: %s

For a given process (specified with -p option), the probe should return the following information:
\tprocess instance:\tNumber of running instance
\tprocess cpu:\t\tPercentage of CPU usage dedicated
\tprocess mem:\t\tPercentage of memory dedicated
\tprocess thread:\t\tNumber of running thread
\tprocess conn:\t\tNumber of connection opened
\tprocess fd:\t\tNumber of file descriptor associated

Description of work executed by the probe:

\t1. Get informations about process activity using ps command
\t2. Retrieve basic informations about the process:
\t\tNumber of instance
\t\tCPU / memory usage dedicated to it
\t\tNumber of running thread
\t3. Execute a "lsof" command to figure out how many network connection are currently opened for this process
\t4. Execute a "lsof" command to figure out how many file descriptor are currently linked to this process
\t5. Return values to nagios
\t\tWarning or critical alerts are triggered if there is too much ressources dedicated to one process
""" % (DEFAULT_WARNING, DEFAULT_CRITICAL)

    # Methods

    def __init__(self, opt=None, args=None):
        """
        Constructor

        @param opt  Contains a dictionary with the long option name as the key, and the argument as value
        @param args Contains the arguments not associated with any option

        Raises KeyError if "processes" is missing from opt, and ValueError if
        fewer than six warning or critical values are given.
        """
        if opt is None:
            opt = {}

        self.proc_info = {}
        self.pnames = opt["processes"].split(',')

        self.warning = opt.get("warning", self.DEFAULT_WARNING).split(",")
        self.critical = opt.get("critical", self.DEFAULT_CRITICAL).split(",")

        if len(self.critical) < 6:
            raise ValueError("Six values are needed for --critical")
        if len(self.warning) < 6:
            raise ValueError("Six values are needed for --warning")

        # real_value comes from lcgdmcommon; presumably it resolves values
        # such as "80%" against the given total (100) -- TODO confirm
        for i in range(6):
            self.critical[i] = real_value(self.critical[i], 100)
            self.warning[i] = real_value(self.warning[i], 100)

    def check_margin(self, index, value, message, exit, msg):
        """
        Checks the margin and sets the exit code and message properly

        @param index   Position of the limit inside self.warning / self.critical
        @param value   The measured value
        @param message Text appended to msg when a limit is exceeded
        @param exit    The exit code accumulated so far
        @param msg     The message accumulated so far
        @return A tuple (exit code, message)
        """
        if value > self.critical[index]:
            exit = EX_CRITICAL
            msg += "(C) %s " % message
        elif value > self.warning[index]:
            # A breach is reported once, at its highest severity, and a
            # previous critical status is never downgraded to warning
            if exit != EX_CRITICAL:
                exit = EX_WARNING
            msg += "(W) %s " % message

        return (exit, msg)

    def _lsof_count(self, selection, what, proc, pid):
        """
        Counts the lsof entries of one process instance.

        @param selection lsof selection options (e.g. "-i4 -i6")
        @param what      Description of what is counted, used in the debug message
        @param proc      The process object the pid belongs to
        @param pid       The pid to inspect
        @return The number of output lines minus the header, or 0 if lsof failed
        """
        (status, lsof_out) = commands.getstatusoutput("sudo -n /usr/sbin/lsof -a %s -p %d" % (selection, pid))
        if status != 0:
            debug("Could not retrieve %s for %s (%d): Maybe died or sudo not configured properly" % (what, proc.comm, pid))
            return 0
        # The first line is the lsof header
        return len(lsof_out.split('\n')) - 1

    def main(self):
        """
        Test code itself. May raise exceptions.

        @return A tuple (exit code, message, performance)
        """
        # NOTE(review): the process names are interpolated into a shell command
        # line unquoted; they are expected to come from the probe configuration.
        # NOTE(review): ps presumably also exits non-zero when none of the
        # processes is running, which ends up reported as UNKNOWN -- confirm
        (status, output) = commands.getstatusoutput("ps -o \"comm pid %%cpu %%mem thcount\" -C %s" % ','.join(self.pnames))
        if status != 0:
            return (EX_UNKNOWN, "Could not execute ps", None)

        # Skip the header line
        for line in output.split('\n')[1:]:
            # Split from the right, so a command name with blanks stays whole
            fields = line.strip().rsplit(None, 4)
            if not fields:
                continue
            if len(fields) != 5:
                debug("Ignoring unexpected ps output: %s" % line)
                continue

            # Returned by ps
            (comm, pid, cpu, mem, threads) = fields
            pid = int(pid)

            if comm not in self.proc_info:
                self.proc_info[comm] = process(comm)
            proc = self.proc_info[comm]

            proc.pid.append(pid)
            proc.cpu += float(cpu)
            proc.mem += float(mem)
            proc.thr += int(threads)

            # Network connections
            proc.conn += self._lsof_count("-i4 -i6", "sockets", proc, pid)

            # File descriptors, accumulated over all the instances like the
            # rest of the counters
            proc.fd += self._lsof_count("-d 0-99999", "file descriptors", proc, pid)

        # Performance
        performance = " ".join([str(proc) for proc in self.proc_info.values()])

        # Check margins
        msg = ""
        exit_code = EX_OK

        for proc in self.proc_info.values():
            instances = len(proc.pid)
            checks = (
                (instances, "%s has %d instances" % (proc.comm, instances)),
                (proc.cpu, "%s CPU usage is %.2f%%" % (proc.comm, proc.cpu)),
                (proc.mem, "%s memory usage is %.2f%%" % (proc.comm, proc.mem)),
                (proc.thr, "%s has %d threads" % (proc.comm, proc.thr)),
                (proc.conn, "%s has %d connections" % (proc.comm, proc.conn)),
                (proc.fd, "%s is using %d file descriptors" % (proc.comm, proc.fd)),
            )
            # The position in the tuple matches the position of the limits
            for index, (value, message) in enumerate(checks):
                (exit_code, msg) = self.check_margin(index, value, message, exit_code, msg)

        if exit_code == EX_OK:
            msg = "All parameters OK"

        # End
        return (exit_code, msg, performance)


# When called directly
if __name__ == "__main__":
    run(check_process)