1#! /system/bin/sh
2#
3# Monitors PSI and detects system readiness post CUJ completion.
4#
5# Setup required for the script to run:
6#   adb push psi_monitor.sh /data/local/tmp/psi_monitor.sh
7#   adb shell chmod 755 /data/local/tmp/psi_monitor.sh
8#   adb shell mkdir -p /data/local/tmp/psi_monitor
9#   adb root; adb shell setenforce 0;
10#
11# Examples:
#   adb shell /data/local/tmp/psi_monitor.sh --no_detect_baseline --out_dir /data/local/tmp/psi_monitor
#   adb shell /data/local/tmp/psi_monitor.sh -d 120 -t 30 --exit_on_psi_stabilized
#   adb shell /data/local/tmp/psi_monitor.sh -d 120 -t 30 -b 5 -m 5
15#
16# Incoming signal:
17#   - PSI monitor waits for persist.debug.psi_monitor.cuj_completed to be true before starting
18#     the monitoring for PSI changes.
19#
20# Outgoing signals:
#   - PSI monitor sets persist.debug.psi_monitor.threshold_met to true when the PSI exceeds the
#     threshold and then drops below the threshold again after the CUJ is completed.
23#   - PSI monitor sets persist.debug.psi_monitor.baseline_met to true when the PSI is stable
24#     (defined by MAX_PSI_BASE_POINT_DIFF) across the last N PSI entries (defined by
25#     TOTAL_PSI_ENTRIES_TO_MONITOR_BASELINE) after the CUJ is completed.
26
# Field index (space separated) of the avg10 entry within a "cpu:some ..." PSI line.
PSI_AVG10_POSITION=2
readonly PSI_AVG10_POSITION

# Defaults for the values that parse_arguments may override from the command line.
SHOULD_DYNAMICALLY_DETECT_BASELINE=true
EXIT_ON_PSI_STABILIZED=false
OUTPUT_DIR=/data/local/tmp/psi_monitor
MAX_DURATION_SECONDS=120  # 2 minutes
PSI_AVG10_THRESHOLD=80
TOTAL_PSI_ENTRIES_TO_MONITOR_BASELINE=10
MAX_PSI_BASE_POINT_DIFF=5

# The baseline is only determined from PSI values that do not exceed this limit.
MAX_BASELINE_PSI=50

# Initialised here; not read anywhere else in this file.
DID_CUJ_COMPLETE=false
39
# Thin wrappers around the platform commands used for signalling.
#
# These are functions rather than aliases: aliases are expanded only while the
# script text is being parsed and are not expanded at all by non-interactive
# bash, whereas functions are resolved at call time by every shell.

# Forwards all arguments to the `log` command.
function logcat_log() {
  log "$@"
}

# Prints the current value of the incoming "CUJ completed" property.
function did_cuj_complete() {
  getprop persist.debug.psi_monitor.cuj_completed
}

# Sets the outgoing "threshold met" property to false.
function reset_threshold_met() {
  setprop persist.debug.psi_monitor.threshold_met false
}

# Sets the outgoing "threshold met" property to true.
function set_threshold_met() {
  setprop persist.debug.psi_monitor.threshold_met true
}

# Sets the outgoing "baseline met" property to false.
function reset_baseline_met() {
  setprop persist.debug.psi_monitor.baseline_met false
}

# Sets the outgoing "baseline met" property to true.
function set_baseline_met() {
  setprop persist.debug.psi_monitor.baseline_met true
}
46
47set -e
48function trap_error() {
49    local return_value=$?
50    local line_no=$1
51    echo "Error at line ${line_no}: \"${BASH_COMMAND}\"" > ${LOG_FILE}
52    echo "Return value ${return_value}" > ${LOG_FILE}
53    exit ${return_value}
54}
55trap 'trap_error $LINENO' ERR
56
57
# Prints the system uptime in milliseconds as an integer.
#
# The first field of /proc/uptime (seconds with a fractional part) is scaled by
# 1000; the final "/1" makes bc drop the fractional digits because bc's default
# scale for division is 0.
function uptime_millis() {
  # Keep the scratch value out of the global namespace.
  local seconds
  seconds=$(cut -f1 -d" " /proc/uptime)
  echo "(${seconds}*1000)/1" | bc
}
62
# Reports an error: writes a timestamped line to stderr, appends the same line
# to LOG_FILE when that is set, and forwards the message to logcat_log with
# priority "e" and the script name as tag.
#
# Arguments:
#   $@ - message words; they are joined with single spaces for the text line.
# Globals:
#   LOG_FILE (read) - only assigned by check_arguments. Until then tee receives
#                     no file operand and the line reaches stderr only.
function err() {
  # ${LOG_FILE:+"..."} expands to nothing while LOG_FILE is unset or empty and
  # to exactly one word otherwise, so paths containing whitespace stay intact.
  echo -e "[$(date +'%Y-%m-%d %H:%M:%S%z')] [$(uptime_millis)]: $*" \
    | tee -a ${LOG_FILE:+"${LOG_FILE}"} >&2
  logcat_log -t "$0" -p e "$@"
}
67
# Logs an informational message: writes a timestamped line to stdout, appends
# the same line to LOG_FILE when that is set, and forwards the message to
# logcat_log with priority "i" and the script name as tag.
#
# Arguments:
#   $@ - message words; they are joined with single spaces for the text line.
# Globals:
#   LOG_FILE (read) - only assigned by check_arguments. Until then tee receives
#                     no file operand and the line reaches stdout only.
function print_log() {
  # ${LOG_FILE:+"..."} expands to nothing while LOG_FILE is unset or empty and
  # to exactly one word otherwise, so paths containing whitespace stay intact.
  echo -e "[$(date +'%Y-%m-%d %H:%M:%S%z')] [$(uptime_millis)]: $*" \
    | tee -a ${LOG_FILE:+"${LOG_FILE}"}
  logcat_log -t "$0" -p i "$@"
}
72
# Prints the command-line help text to stdout.
#
# The option names listed here mirror the ones accepted by parse_arguments.
function usage() {
  echo "Monitors PSI and detects system readiness post CUJ completion."
  echo "Usage: psi_monitor.sh [--no_detect_baseline] [--out_dir <output_dir>] "\
       "[--max_duration_seconds <max_duration_seconds>] "\
       "[--psi_avg10_threshold <psi_avg10_threshold>] "\
       "[--total_psi_entries_to_monitor_baseline <total_psi_entries_to_monitor_baseline>] "\
       "[--max_psi_base_point_diff <max_psi_base_point_diff>] [--exit_on_psi_stabilized]"
  echo "-h|--help: Print this help text and exit"
  echo "--no_detect_baseline: Instruct the monitor to not wait for the PSI to reach "\
       "a baseline value"
  echo "-o|--out_dir: Location to output the psi dump and logs"
  echo "-d|--max_duration_seconds: Maximum duration to monitor for PSI changes"
  echo "-t|--psi_avg10_threshold: PSI threshold level for peak CPU activity during post CUJ"
  echo "-b|--total_psi_entries_to_monitor_baseline: Last N PSI entries to monitor to determine "\
       "baseline"
  echo "-m|--max_psi_base_point_diff: Max PSI diff between the min and max PSI values to "\
       "determine baseline"
  echo "--exit_on_psi_stabilized: Exit the monitor when the PSI is stabilized"
}
91
# Parses the command line and overrides the default threshold / duration /
# out_dir / baseline-detection settings accordingly.
#
# Arguments:
#   $@ - the script's command-line arguments.
# Globals written:
#   SHOULD_DYNAMICALLY_DETECT_BASELINE, EXIT_ON_PSI_STABILIZED, OUTPUT_DIR,
#   MAX_DURATION_SECONDS, PSI_AVG10_THRESHOLD,
#   TOTAL_PSI_ENTRIES_TO_MONITOR_BASELINE, MAX_PSI_BASE_POINT_DIFF
# Value options accept both "--option value" and "--option=value"; short
# options only accept the "-x value" form.
# Prints the usage text and exits with status 1 on -h/--help, on an unknown
# option and on a value option that has no value.
function parse_arguments() {
  local arg key value
  while [[ $# -gt 0 ]]; do
    arg="$1"
    shift
    case "${arg}" in
      -h|--help)
        usage
        exit 1;;
      # Flags take no value, so nothing beyond the flag itself is consumed.
      --no_detect_baseline)
        SHOULD_DYNAMICALLY_DETECT_BASELINE=false
        continue;;
      --exit_on_psi_stabilized)
        EXIT_ON_PSI_STABILIZED=true
        continue;;
      --*=*)
        key="${arg%%=*}"
        value="${arg#*=}"
        ;;
      -o|--out_dir|-d|--max_duration_seconds|-t|--psi_avg10_threshold \
        |-b|--total_psi_entries_to_monitor_baseline|-m|--max_psi_base_point_diff)
        if [[ $# -eq 0 ]]; then
          err "Option ${arg} requires a value"
          usage
          exit 1
        fi
        key="${arg}"
        value="$1"
        shift;;
      *)
        err "Invalid option ${arg}"
        usage
        exit 1;;
    esac

    case "${key}" in
      -o|--out_dir)
        OUTPUT_DIR="${value}";;
      -d|--max_duration_seconds)
        MAX_DURATION_SECONDS="${value}";;
      -t|--psi_avg10_threshold)
        PSI_AVG10_THRESHOLD="${value}";;
      -b|--total_psi_entries_to_monitor_baseline)
        TOTAL_PSI_ENTRIES_TO_MONITOR_BASELINE="${value}";;
      -m|--max_psi_base_point_diff)
        MAX_PSI_BASE_POINT_DIFF="${value}";;
      *)
        # Only reachable for an unknown "--name=value" argument.
        err "Invalid option ${arg}"
        usage
        exit 1;;
    esac
  done
}
130
# Logs the effective value of every configurable setting, one per log line,
# after a "Command line args:" heading.
function print_arguments() {
  local setting
  print_log "Command line args:"
  for setting in \
      "SHOULD_DYNAMICALLY_DETECT_BASELINE=${SHOULD_DYNAMICALLY_DETECT_BASELINE}" \
      "OUTPUT_DIR=${OUTPUT_DIR}" \
      "MAX_DURATION_SECONDS=${MAX_DURATION_SECONDS}" \
      "PSI_AVG10_THRESHOLD=${PSI_AVG10_THRESHOLD}" \
      "TOTAL_PSI_ENTRIES_TO_MONITOR_BASELINE=${TOTAL_PSI_ENTRIES_TO_MONITOR_BASELINE}" \
      "MAX_PSI_BASE_POINT_DIFF=${MAX_PSI_BASE_POINT_DIFF}" \
      "EXIT_ON_PSI_STABILIZED=${EXIT_ON_PSI_STABILIZED}"; do
    print_log "\t ${setting}"
  done
}
141
# Succeeds when $1 consists only of decimal digits and lies within [$2, $3].
#
# Arguments:
#   $1 - value to check
#   $2 - smallest accepted value
#   $3 - largest accepted value
function is_integer_in_range() {
  [[ $1 == +([[:digit:]]) && $1 -ge $2 && $1 -le $3 ]]
}

# Validates the settings, freezes them as read-only and prepares the output
# files.
#
# Globals written:
#   DUMP_FILE, LOG_FILE - created empty inside OUTPUT_DIR (any previous copy is
#                         removed first) and marked read-only.
#   MAX_BASELINE_PSI    - lowered to PSI_AVG10_THRESHOLD when the threshold is
#                         below it.
# Accepted ranges: PSI_AVG10_THRESHOLD 1-99, MAX_DURATION_SECONDS 10-3600,
# TOTAL_PSI_ENTRIES_TO_MONITOR_BASELINE 5-10, MAX_PSI_BASE_POINT_DIFF 1-10.
# Logs an error and exits with status 1 on the first failed check.
function check_arguments() {
  readonly OUTPUT_DIR
  if [[ ! -d "${OUTPUT_DIR}" ]]; then
    err "Out dir ${OUTPUT_DIR} does not exist"
    exit 1
  fi

  readonly DUMP_FILE="${OUTPUT_DIR}/psi_dump.txt"
  readonly LOG_FILE="${OUTPUT_DIR}/log.txt"
  # Quoted so that an output directory containing whitespace or glob characters
  # is handled as a single path.
  rm -f "${DUMP_FILE}"; touch "${DUMP_FILE}"
  rm -f "${LOG_FILE}"; touch "${LOG_FILE}"

  if [[ ! -w "${DUMP_FILE}" ]]; then
    err "Dump file ${DUMP_FILE} is not writable"
    exit 1
  fi
  if [[ ! -w "${LOG_FILE}" ]]; then
    err "Log file ${LOG_FILE} is not writable"
    exit 1
  fi

  readonly PSI_AVG10_THRESHOLD
  if ! is_integer_in_range "${PSI_AVG10_THRESHOLD}" 1 99; then
    err "PSI Avg10 threshold ${PSI_AVG10_THRESHOLD} is not a valid number. The value should be "\
        "between 1 and 99"
    exit 1
  fi

  # The baseline limit can never be above the threshold itself.
  if [[ ${PSI_AVG10_THRESHOLD} -lt ${MAX_BASELINE_PSI} ]]; then
    print_log "Setting max baseline PSI to ${PSI_AVG10_THRESHOLD}, which is the PSI threshold"
    MAX_BASELINE_PSI=${PSI_AVG10_THRESHOLD}
  fi
  readonly MAX_BASELINE_PSI

  readonly MAX_DURATION_SECONDS
  if ! is_integer_in_range "${MAX_DURATION_SECONDS}" 10 3600; then
    err "Max duration seconds ${MAX_DURATION_SECONDS} is not a valid number. The value should be"\
        "between 10 and 3600"
    exit 1
  fi

  readonly TOTAL_PSI_ENTRIES_TO_MONITOR_BASELINE
  if ! is_integer_in_range "${TOTAL_PSI_ENTRIES_TO_MONITOR_BASELINE}" 5 10; then
    err "Last N PSI entries to monitor baseline ${TOTAL_PSI_ENTRIES_TO_MONITOR_BASELINE} is not "\
        "a valid number. The value should be between 5 and 10"
    exit 1
  fi

  readonly MAX_PSI_BASE_POINT_DIFF
  if ! is_integer_in_range "${MAX_PSI_BASE_POINT_DIFF}" 1 10; then
    err "Max PSI base point diff ${MAX_PSI_BASE_POINT_DIFF} is not a valid number. The value "\
        "should be between 1 and 10"
    exit 1
  fi
}
202
# Most recent integer part of the "cpu:some" avg10 value, and the raw PSI
# snapshot it was taken from. Both are refreshed by read_psi_avg.
latest_psi_avg_10=0
latest_psi_line=""

# Takes a snapshot of every file under /proc/pressure and extracts the CPU
# "some" avg10 reading from it.
#
# Globals written:
#   latest_psi_line   - all PSI lines joined with spaces, each prefixed with
#                       "<file>:" (the /proc/pressure/ part is stripped).
#   latest_psi_avg_10 - digits before the decimal point of the avg10 value.
# Logs an error and exits with status 1 when the extracted value is not a
# plain sequence of digits.
function read_psi_avg() {
  latest_psi_line=$(grep . /proc/pressure/* | tr '\n' ' ' | sed 's|/proc/pressure/||g')

  # Split the joined snapshot back into one record per "total=<n>" and keep the
  # cpu:some record.
  local cpu_some_record=$(echo ${latest_psi_line} \
                          | sed 's/\(total=[0-9]*\) /\1\n/g' \
                          | grep "cpu:some")

  # "<name> avg10=<int>.<frac> ..." -> "<int>"
  latest_psi_avg_10=$(echo ${cpu_some_record} \
                      | cut -d' ' -f${PSI_AVG10_POSITION} \
                      | cut -d'=' -f2 \
                      | cut -d'.' -f1)

  if [[ ${latest_psi_avg_10} == +([[:digit:]]) ]]; then
    return 0
  fi
  err "Error reading PSI. Read value ${latest_psi_avg_10}"
  exit 1
}
216
# Uptime (ms) at which the PSI first reached the threshold and at which it was
# later seen below it again; -1 while not yet observed.
EXCEEDED_THRESHOLD_UPTIME_MILLIS=-1
DROPPED_BELOW_THRESHOLD_UPTIME_MILLIS=-1
readonly PSI_TYPE_CPU_SOME="cpu:some avg10"

# Records the threshold crossings for one PSI sample.
#
# Arguments:
#   $1 - latest PSI avg10 value (integer).
# A sample at or above PSI_AVG10_THRESHOLD stamps
# EXCEEDED_THRESHOLD_UPTIME_MILLIS if it is still unset; a sample below the
# threshold stamps DROPPED_BELOW_THRESHOLD_UPTIME_MILLIS once the threshold has
# been exceeded. Each event also appends a quoted marker to the current
# DUMP_FILE line.
function populate_exceeded_and_dropped_below_threshold() {
  local sample=${1}
  if [[ ${sample} -ge ${PSI_AVG10_THRESHOLD} ]]; then
    if [[ ${EXCEEDED_THRESHOLD_UPTIME_MILLIS} -lt 0 ]]; then
      EXCEEDED_THRESHOLD_UPTIME_MILLIS=$(uptime_millis)
      echo -n " \"PSI exceeded threshold: ${PSI_AVG10_THRESHOLD}% ${PSI_TYPE_CPU_SOME}\"" \
        >> "${DUMP_FILE}"
    fi
  elif [[ ${EXCEEDED_THRESHOLD_UPTIME_MILLIS} -gt 0 ]]; then
    DROPPED_BELOW_THRESHOLD_UPTIME_MILLIS=$(uptime_millis)
    echo -n " \"PSI dropped below threshold: ${PSI_AVG10_THRESHOLD}% ${PSI_TYPE_CPU_SOME}\"" \
      >> "${DUMP_FILE}"
  fi
}
236
# Feeds one PSI sample into the threshold tracking and signals completion.
#
# Arguments:
#   $@ - forwarded to populate_exceeded_and_dropped_below_threshold (the latest
#        PSI avg10 value).
# Does nothing once both the "exceeded" and the "dropped below" timestamps are
# recorded. When the current sample completes the pair, the event is logged and
# set_threshold_met is invoked.
function check_exceed_and_drop_below_threshold() {
  # Already detected on an earlier sample.
  if [[ ${EXCEEDED_THRESHOLD_UPTIME_MILLIS} -gt 0 \
        && ${DROPPED_BELOW_THRESHOLD_UPTIME_MILLIS} -gt 0 ]]; then
    return 0
  fi

  populate_exceeded_and_dropped_below_threshold "$@"

  # Still waiting for at least one of the two events.
  if [[ ${EXCEEDED_THRESHOLD_UPTIME_MILLIS} -le 0 \
        || ${DROPPED_BELOW_THRESHOLD_UPTIME_MILLIS} -le 0 ]]; then
    return 0
  fi

  print_log "CPU PSI exceeded threshold ${PSI_AVG10_THRESHOLD} at" \
      "${EXCEEDED_THRESHOLD_UPTIME_MILLIS} and dropped below at" \
      "${DROPPED_BELOW_THRESHOLD_UPTIME_MILLIS}"
  set_threshold_met
}
252
# Sliding window of the most recent accepted PSI samples, the uptime (ms) at
# which the baseline was detected (-1 while undetected) and the index of the
# oldest window element, which is the next one to be dropped.
LAST_N_PSI_AVG10_ARRAY=()
BASELINE_UPTIME_MILLIS=-1
NEXT_ELEMENT_TO_REMOVE=0

# Feeds one PSI sample into the baseline detection.
#
# Arguments:
#   $1 - latest PSI avg10 value (integer).
# The sample is ignored when it is above MAX_BASELINE_PSI, when baseline
# detection is disabled, or when the baseline was already found. Otherwise it
# is added to a window holding the last TOTAL_PSI_ENTRIES_TO_MONITOR_BASELINE
# accepted samples. Once the window is full and the spread between its minimum
# and maximum is at most MAX_PSI_BASE_POINT_DIFF, the baseline is recorded in
# BASELINE_UPTIME_MILLIS, a marker is appended to the current DUMP_FILE line
# and set_baseline_met is invoked.
function monitor_baseline_psi() {
  local latest_psi=${1}
  if [[ ${latest_psi} -gt ${MAX_BASELINE_PSI} \
        || ${SHOULD_DYNAMICALLY_DETECT_BASELINE} == false \
        || ${BASELINE_UPTIME_MILLIS} -gt 0 ]]; then
    return
  fi

  LAST_N_PSI_AVG10_ARRAY+=("${latest_psi}")
  local length=${#LAST_N_PSI_AVG10_ARRAY[@]}
  if [[ ${length} -lt ${TOTAL_PSI_ENTRIES_TO_MONITOR_BASELINE} ]]; then
    return
  elif [[ ${length} -gt ${TOTAL_PSI_ENTRIES_TO_MONITOR_BASELINE} ]]; then
    # Drop the oldest sample. Indices are never reused, so the oldest element
    # always lives at NEXT_ELEMENT_TO_REMOVE.
    unset 'LAST_N_PSI_AVG10_ARRAY[NEXT_ELEMENT_TO_REMOVE]'
    NEXT_ELEMENT_TO_REMOVE=$((NEXT_ELEMENT_TO_REMOVE + 1))
  fi

  # Single pass over the window instead of spawning sort/head/tail twice.
  local psi_min=${latest_psi}
  local psi_max=${latest_psi}
  local entry
  for entry in "${LAST_N_PSI_AVG10_ARRAY[@]}"; do
    if [[ ${entry} -lt ${psi_min} ]]; then
      psi_min=${entry}
    fi
    if [[ ${entry} -gt ${psi_max} ]]; then
      psi_max=${entry}
    fi
  done

  if [[ $((psi_max - psi_min)) -gt ${MAX_PSI_BASE_POINT_DIFF} ]]; then
    return
  fi
  BASELINE_UPTIME_MILLIS=$(uptime_millis)
  print_log "PSI baseline is stable across ${TOTAL_PSI_ENTRIES_TO_MONITOR_BASELINE} entries. "\
            "Min / Max / Latest PSI: [${psi_min}, ${psi_max}, ${latest_psi}]"
  echo -n " \"PSI reached baseline across latest ${TOTAL_PSI_ENTRIES_TO_MONITOR_BASELINE}" \
          "entries\"" >> "${DUMP_FILE}"
  set_baseline_met
  return
}
283
# Runs the monitor: parses and validates the arguments, resets the outgoing
# properties, then samples the PSI once per second until MAX_DURATION_SECONDS
# have elapsed (or, with --exit_on_psi_stabilized, until the PSI is considered
# stabilized) and finally logs a summary.
#
# Arguments:
#   $@ - the script's command-line arguments.
# Every sample is written to DUMP_FILE as one line: uptime in ms, wall-clock
# time, the raw PSI snapshot and optional quoted event markers. Threshold and
# baseline detection only start once did_cuj_complete reports "true".
function main() {
  parse_arguments "$@"
  print_arguments
  check_arguments
  reset_threshold_met
  reset_baseline_met

  if [[ ${EXIT_ON_PSI_STABILIZED} == true ]]; then
    print_log "Starting CPU PSI monitoring. Will exit when PSI is stabilized or after"\
              "${MAX_DURATION_SECONDS} seconds"
  else
    print_log "Starting CPU PSI monitoring. Will exit after ${MAX_DURATION_SECONDS} seconds"
  fi

  local start_uptime_millis
  local max_uptime_millis
  local cuj_completion_uptime_millis=-1
  start_uptime_millis=$(uptime_millis)
  max_uptime_millis=$(echo "${start_uptime_millis} + (${MAX_DURATION_SECONDS} * 1000)" | bc)

  while [[ $(uptime_millis) -lt ${max_uptime_millis} ]]; do
    read_psi_avg

    echo -n "$(uptime_millis) $(date '+%Y-%m-%d %H:%M:%S.%N') ${latest_psi_line}" \
      >> "${DUMP_FILE}"

    # Once completion has been seen, the property is not polled again.
    if [[ ${cuj_completion_uptime_millis} -gt 0 || $(did_cuj_complete) == true ]]; then
      if [[ ${cuj_completion_uptime_millis} == -1 ]]; then
        cuj_completion_uptime_millis=$(uptime_millis)
        echo -n " \"CUJ completed\"" >> "${DUMP_FILE}"
      fi
      check_exceed_and_drop_below_threshold ${latest_psi_avg_10}
      monitor_baseline_psi ${latest_psi_avg_10}
    fi

    # Terminate the record before a possible early exit so that the final
    # sample is a complete line as well.
    echo "" >> "${DUMP_FILE}"

    if [[ ${EXCEEDED_THRESHOLD_UPTIME_MILLIS} -gt 0 \
          && ${DROPPED_BELOW_THRESHOLD_UPTIME_MILLIS} -gt 0 \
          && ( ${SHOULD_DYNAMICALLY_DETECT_BASELINE} == false \
               || ${BASELINE_UPTIME_MILLIS} -gt 0 ) \
          && ${EXIT_ON_PSI_STABILIZED} == true ]]; then
      print_log "Stopping on psi stabilized"
      break
    fi
    sleep 1
  done

  if [[ ${cuj_completion_uptime_millis} -le 0 ]]; then
    print_log "CUJ did not complete"
  else
    print_log "CUJ completed at ${cuj_completion_uptime_millis}"
    if [[ ${EXCEEDED_THRESHOLD_UPTIME_MILLIS} -gt 0 \
          && ${DROPPED_BELOW_THRESHOLD_UPTIME_MILLIS} -gt 0 ]]; then
      print_log "CPU PSI exceeded threshold at ${EXCEEDED_THRESHOLD_UPTIME_MILLIS} and dropped"\
                "below threshold at ${DROPPED_BELOW_THRESHOLD_UPTIME_MILLIS}"
    fi
    if [[ ${SHOULD_DYNAMICALLY_DETECT_BASELINE} == true && ${BASELINE_UPTIME_MILLIS} -gt 0 ]]; then
      print_log "CPU PSI reached baseline at ${BASELINE_UPTIME_MILLIS}"
    elif [[ ${SHOULD_DYNAMICALLY_DETECT_BASELINE} == true ]]; then
      print_log "CPU PSI did not reach baseline. Last N PSI values: ${LAST_N_PSI_AVG10_ARRAY[@]}"
    fi
  fi
}
342
343main "$@"
344