]>
Commit | Line | Data |
---|---|---|
11fdf7f2 | 1 | #!/usr/bin/env bash |
7c673cae FG |
2 | # |
3 | #(c) 2004-present, Facebook, all rights reserved. | |
4 | # See the LICENSE file for usage and distribution rights. | |
5 | # | |
6 | ||
# On HUP/INT/QUIT/TERM report and stop with an explicit failure status.
# A bare `exit` inside a trap action would return whatever status happened
# to be current when the signal arrived.
trap 'echo "Caught exception, dying"; exit 1' 1 2 3 15

# Quote $0 so a script path containing whitespace is handled correctly.
ME=$(basename -- "$0")
SERVER=$(hostname)

#
# Parameters used. Everything starts empty so that values cannot leak in
# from the caller's environment; defaults are applied after option parsing.
#
Dump_Config=0        # set to 1 by -d: print the configuration and exit
DEBUG=               # non-empty enables verbose() output (-v)
OS=$(/bin/uname -s)  # kernel name, consumed by oscheck()
VMEM=                # ps column names, filled in by oscheck()
RSS=
CPU=
VERBOSE=
PID=                 # process to monitor (-p)
VAR=                 # metric to monitor (-x)
LIMIT=               # threshold for the metric (-l)
ACTION=              # what to do on a breach (-a)
N=                   # number of monitoring cycles (-n)
WAIT=                # seconds between checks (-w)
26 | ||
27 | # | |
28 | #supported OS: Linux only for now. Easy to add | |
29 | # | |
# oscheck: pick the ps(1) output-column names for the running OS and resolve
# the symbolic metric names accepted by -x.
#
# Reads : OS  - kernel name
#         VAR - metric requested with -x (may be empty)
# Sets  : VMEM, RSS, CPU - ps column names for this OS
#         VAR            - rewritten to the ps column when it holds one of the
#                          symbolic names VMEM / RSS / CPU
# Exits : via die() when the OS is not supported.
oscheck() {
  case "${OS}" in
    Linux)
      VMEM=vsz
      RSS=rss
      CPU=bsdtime
      ;;
    *)
      die "Unsupported OS ${OS}. Send a bug report with OS you need supported. Thanks."
      ;;
  esac

  # The usage text advertises "-x {VMEM|RSS|CPU}", but VAR is handed to
  # `ps -o` verbatim. Translate the symbolic names here; anything else
  # (for example a raw ps column such as "vsz") passes through untouched.
  case "${VAR}" in
    VMEM|vmem) VAR=${VMEM} ;;
    RSS)       VAR=${RSS} ;;
    CPU|cpu)   VAR=${CPU} ;;
  esac
}
42 | ||
43 | ||
# verbose: echo the arguments on stderr, but only when DEBUG is non-empty
# (DEBUG is switched on by -v). Stays silent and returns 0 otherwise.
verbose() {
  [ -z "$DEBUG" ] || echo "$@" >&2
}
49 | ||
# warn: unconditionally echo the arguments on stderr.
warn() {
  >&2 echo "$@"
}
53 | ||
# die: print an error message on stderr and terminate the script with a
# failure status.
#
# A bare `exit` here would propagate the status of the preceding echo, i.e. 0,
# so callers could not tell a fatal error from success.
die() {
  echo "ERROR: " "$@" >&2
  exit 1
}
58 | ||
# dump_config: print the effective configuration for this run on stdout.
# Reads ME, HOSTNAME, PID, VAR, LIMIT, WAIT, N and ACTION; changes nothing.
dump_config() {
  printf '%s\n' \
    "$ME running on ${HOSTNAME} at $(date)" \
    "" \
    "Configuration for this run:" \
    "PID to monitor : ${PID}" \
    "Resource monitored : ${VAR}" \
    "Resource limit : ${LIMIT}" \
    "Check every : ${WAIT} seconds" \
    "No. of times run : ${N}" \
    "What to do : ${ACTION}"
}
73 | ||
# usage: print an optional message followed by the help text, then exit.
#
# Arguments : an optional error message describing what was wrong with the
#             invocation; it is printed as the first line of the output.
# Exit code : 1 when a message was supplied (the caller is reporting a bad
#             invocation), 0 when called without arguments (plain help).
usage() {
  cat <<USAGE
$@

Usage ${ME} -p pid [-x {VMEM|RSS|CPU}] -l limit [-a {warn|die|kill}] [-n cycles] [-w wait]

Monitor a process for set of violations. Options:

-p: PID of process to monitor

-x: metric to sense. Currently only VMEM/RSS/CPU are supported. Defaults to VMEM

-l: what is the threshold/limit for the metric that is being sensed.
Examples: "-l 100m", "-l 1.5g" (for VMEM/RSS), "-l 5:04" 5:04 in BSDTIME for CPU
NOTE: defaults to 1GB

-a: action. Currently {warn|die|kill} are supported.
The default action is to 'warn'. Here is the behavior:

warn: complain if usage exceeds threshold, but continue monitoring
kill: complain, kill the db_bench process and exit
die: if usage exceeds threshold, die immediately

-n: number of cycles to monitor. Default is to monitor until PID no longer exists.

-w: wait time per cycle of monitoring. Default is 5 seconds.

-v: verbose messaging

USAGE

  # A message means we were invoked because of a usage error; make that
  # visible to the calling process instead of always exiting 0.
  if [ $# -gt 0 ]; then
    exit 1
  fi
  exit 0
}
106 | ||
107 | #set default values if none given | |
# set_defaults_if_noopt_given: give every tunable that is still unset or empty
# after option parsing its default value. Values that are already set are
# left untouched.
set_defaults_if_noopt_given() {
  VAR=${VAR:-vsz}
  LIMIT=${LIMIT:-1024000}
  WAIT=${WAIT:-5}
  N=${N:-999999}
  ACTION=${ACTION:-warn}
}
116 | ||
# validate_options: reject invocations that cannot work.
#
# - A PID is mandatory unless only a configuration dump (-d) was requested.
# - ACTION, when set, must be one of warn / die / kill; any other value would
#   make the monitor stop without doing anything once the limit is breached.
#
# Problems are reported through usage(), which does not return.
validate_options() {
  # Two separate tests joined with && instead of the obsolescent `-a`, and
  # quoted operands so an empty value cannot break the test syntax.
  if [ -z "$PID" ] && [ "${Dump_Config:-0}" -ne 1 ]; then
    usage "PID is mandatory"
  fi

  case "${ACTION}" in
    ''|warn|die|kill)
      ;;
    *)
      usage "Unknown action '${ACTION}': expected warn, die or kill"
      ;;
  esac
}
122 | ||
123 | ###### START | |
124 | ||
125 | ||
# Parse the command line.
#
# The leading ':' in the optstring selects getopts' silent mode: an unknown
# option yields opt='?' and a missing option argument yields opt=':', with the
# offending option letter in OPTARG in both cases.
#
# 'w:' has to be declared for the documented -w option to be accepted at all.
# 't:' stays in the optstring only so that command lines that already pass -t
# keep parsing; it never had any effect.
while getopts ":p:x:l:a:n:w:t:vhd" opt; do
  case $opt in
    d)
      Dump_Config=1
      ;;
    h)
      usage
      ;;
    a)
      ACTION=${OPTARG}
      ;;
    v)
      DEBUG=1
      ;;
    p)
      PID=$OPTARG
      ;;
    x)
      VAR=$OPTARG
      ;;
    l)
      LIMIT=$OPTARG
      ;;
    w)
      WAIT=$OPTARG
      ;;
    t)
      # accepted and ignored, see above
      :
      ;;
    n)
      N=$OPTARG
      ;;
    :)
      usage "Option -${OPTARG} requires an argument"
      ;;
    \?)
      usage "Unknown option: -${OPTARG}"
      ;;
  esac
done
160 | ||
# _to_units: normalise a metric value or limit to a plain integer so that it
# can be used with the shell's integer comparisons.
#
#   "100m"  -> 102400    (megabytes to KB)
#   "1.5g"  -> 1572864   (gigabytes to KB)
#   "5:04"  -> 304       (minutes:seconds to seconds)
#   "1234"  -> 1234      (plain numbers pass through, fractions truncated)
#
# Only the first line of the input is considered; unparseable input yields 0.
_to_units() {
  echo "$1" | awk 'NR == 1 {
    v = tolower($1)
    if (v ~ /:/) {
      n = split(v, part, ":")
      v = 0
      for (i = 1; i <= n; i++) v = v * 60 + part[i]
    } else if (v ~ /g$/) {
      sub(/g$/, "", v); v = v * 1024 * 1024
    } else if (v ~ /m$/) {
      sub(/m$/, "", v); v = v * 1024
    }
    printf "%.0f\n", int(v)
  }'
}

oscheck
set_defaults_if_noopt_given
validate_options

if [ "$Dump_Config" -eq 1 ]; then
  dump_config
  exit
fi

# The limit may carry a size suffix or be a min:sec time (see usage); convert
# it once so the comparison below always sees two integers.
LIMIT=$(_to_units "$LIMIT")

verbose "Trying ${N} times, Waiting ${WAIT} seconds each iteration"

Cycles=0
while :; do
  # Honour -n: stop after the requested number of checks.
  if [ "$Cycles" -ge "$N" ]; then
    verbose "Reached ${N} monitoring cycles, stopping."
    break
  fi
  Cycles=$((Cycles + 1))

  # `ps h` prints no header, so empty output means the process is gone.
  # Testing for emptiness (rather than for the value 0) keeps a live process
  # that reports 0 from being mistaken for a finished one.
  RAW=$(/bin/ps h -p "$PID" -o "${VAR}")
  if [ -z "${RAW// /}" ]; then
    warn "Process $PID ended without incident."
    break
  fi
  VAL=$(_to_units "$RAW")

  if [ "$VAL" -lt "$LIMIT" ]; then
    echo "Value of '${VAR}' (${VAL}) is less than ${LIMIT} for PID ${PID}"
    sleep "$WAIT"
    continue
  fi

  # Threshold breached: act according to -a.
  case "$ACTION" in
    kill)
      warn "$(date) WARNING: ${VAR} breached threshold ${LIMIT}, actual is ${VAL}; killing ${PID}"
      kill "${PID}" || kill -3 "${PID}"
      exit
      ;;

    warn)
      warn "$(date) WARNING: ${VAR} breached threshold ${LIMIT}, actual is ${VAL}"
      # Keep monitoring, but wait first: without this pause a sustained
      # breach turns the loop into a busy loop that floods stderr.
      sleep "$WAIT"
      ;;

    die)
      warn "WARNING: dying without killing process ${PID} on ${SERVER}"
      warn "The process details are below: "
      warn "$(ps -p "${PID}" -o pid,ppid,bsdtime,rss,vsz,cmd,args)"
      warn ""

      #should we send email/notify someone? TODO... for now, bail.

      # 255 is the status `exit -1` produces, spelled portably.
      exit 255
      ;;

    *)
      warn "ERROR: unknown action '${ACTION}' (expected warn, die or kill)"
      exit 1
      ;;
  esac
done
217 |