#! /system/bin/sh
#
# Monitors PSI and detects system readiness post CUJ completion.
#
# Setup required for the script to run:
#   adb push psi_monitor.sh /data/local/tmp/psi_monitor.sh
#   adb shell chmod 755 /data/local/tmp/psi_monitor.sh
#   adb shell mkdir -p /data/local/tmp/psi_monitor
#   adb root; adb shell setenforce 0;
#
# Examples:
#   adb shell psi_monitor.sh --no_detect_baseline --out_dir=/data/local/tmp/psi_monitor
#   adb shell psi_monitor.sh -d 120 -t 30 --exit_on_psi_stabilized
#   adb shell psi_monitor.sh -d 120 -t 30 -b 5 -m 5
#
# Options that take a value accept both "--option value" and "--option=value".
#
# Incoming signal:
#   - PSI monitor waits for persist.debug.psi_monitor.cuj_completed to be true before starting
#     the monitoring for PSI changes.
#
# Outgoing signals:
#   - PSI monitor sets persist.debug.psi_monitor.threshold_met to true when the PSI exceeds the
#     threshold and drops below the threshold again after the CUJ is completed.
#   - PSI monitor sets persist.debug.psi_monitor.baseline_met to true when the PSI is stable
#     (defined by MAX_PSI_BASE_POINT_DIFF) across the last N PSI entries (defined by
#     TOTAL_PSI_ENTRIES_TO_MONITOR_BASELINE) after the CUJ is completed.

# Space-separated field index of "avg10=<value>" within a "cpu:some ..." PSI line.
readonly PSI_AVG10_POSITION=2
PSI_AVG10_THRESHOLD=80
# Baseline should be determined only when the PSI is below 50.
MAX_BASELINE_PSI=50
TOTAL_PSI_ENTRIES_TO_MONITOR_BASELINE=10
MAX_PSI_BASE_POINT_DIFF=5

SHOULD_DYNAMICALLY_DETECT_BASELINE=true
MAX_DURATION_SECONDS=120 # 2 minutes
OUTPUT_DIR=/data/local/tmp/psi_monitor
DID_CUJ_COMPLETE=false
EXIT_ON_PSI_STABILIZED=false

# Thin wrappers around the logging and system property commands. These are functions rather
# than aliases so that they resolve regardless of whether the shell expands aliases in scripts.
function logcat_log() {
  log "$@"
}

function did_cuj_complete() {
  getprop persist.debug.psi_monitor.cuj_completed
}

function reset_threshold_met() {
  setprop persist.debug.psi_monitor.threshold_met false
}

function set_threshold_met() {
  setprop persist.debug.psi_monitor.threshold_met true
}

function reset_baseline_met() {
  setprop persist.debug.psi_monitor.baseline_met false
}

function set_baseline_met() {
  setprop persist.debug.psi_monitor.baseline_met true
}

set -e

# ERR trap handler: reports the failing line and exit status, then exits with that status.
# Arguments: $1 - line number at which the error was trapped.
function trap_error() {
  local return_value=$?
  local line_no=$1
  local message="Error at line ${line_no}: \"${BASH_COMMAND}\""
  echo "${message}" >&2
  echo "Return value ${return_value}" >&2
  # Append (never truncate) to the log, and only once the log file path is known.
  if [[ -n "${LOG_FILE:-}" ]]; then
    {
      echo "${message}"
      echo "Return value ${return_value}"
    } >> "${LOG_FILE}" 2>/dev/null || true
  fi
  exit ${return_value}
}
trap 'trap_error $LINENO' ERR

# Prints the system uptime in milliseconds, truncated to an integer.
function uptime_millis() {
  local seconds
  seconds=$(cut -f1 -d" " /proc/uptime)
  echo "(${seconds}*1000)/1" | bc
}

# Logs an error message to stderr, to the log file (once it is set up) and to logcat.
function err() {
  local message
  message="[$(date +'%Y-%m-%d %H:%M:%S%z')] [$(uptime_millis)]: $*"
  if [[ -n "${LOG_FILE:-}" ]]; then
    echo -e "${message}" | tee -a "${LOG_FILE}" >&2
  else
    echo -e "${message}" >&2
  fi
  logcat_log -t "$0" -p e "$@"
}

# Logs an info message to stdout, to the log file (once it is set up) and to logcat.
function print_log() {
  local message
  message="[$(date +'%Y-%m-%d %H:%M:%S%z')] [$(uptime_millis)]: $*"
  if [[ -n "${LOG_FILE:-}" ]]; then
    echo -e "${message}" | tee -a "${LOG_FILE}"
  else
    echo -e "${message}"
  fi
  logcat_log -t "$0" -p i "$@"
}

function usage() {
  echo "Monitors PSI and detects system readiness post CUJ completion."
  echo "Usage: psi_monitor.sh [--no_detect_baseline] [--out_dir <output_dir>] "\
       "[--max_duration_seconds <max_duration_seconds>] "\
       "[--psi_avg10_threshold <psi_avg10_threshold>] "\
       "[--total_psi_entries_to_monitor_baseline <total_psi_entries_to_monitor_baseline>] "\
       "[--max_psi_base_point_diff <max_psi_base_point_diff>] [--exit_on_psi_stabilized]"
  echo "--no_detect_baseline: Instruct the monitor to not wait for the PSI to reach "\
       "a baseline value"
  echo "-o|--out_dir: Location to output the psi dump and logs"
  echo "-d|--max_duration_seconds: Maximum duration to monitor for PSI changes"
  echo "-t|--psi_avg10_threshold: PSI threshold level for peak CPU activity during post CUJ"
  echo "-b|--total_psi_entries_to_monitor_baseline: Last N PSI entries to monitor to determine "\
       "baseline"
  echo "-m|--max_psi_base_point_diff: Max PSI diff between the min and max PSI values to "\
       "determine baseline"
  echo "--exit_on_psi_stabilized: Exit the monitor when the PSI is stabilized"
}

# Exits with an error unless the option in $1 is followed by a value in $2.
function require_option_value() {
  if [[ $# -lt 2 ]]; then
    err "Option ${1} requires a value"
    usage
    exit 1
  fi
}

function parse_arguments() {
  # Set the above threshold / duration / out_dir / dynamically detect baseline by parsing
  # the arguments.
  local key
  local value
  while [[ $# -gt 0 ]]; do
    key="$1"
    # Accept "--option=value" by splitting it into "--option" "value".
    if [[ "${key}" == --*=* ]]; then
      value="${key#*=}"
      key="${key%%=*}"
      shift
      set -- "${key}" "${value}" "$@"
    fi
    case "${key}" in
      -h|--help)
        usage
        exit 1;;
      --no_detect_baseline)
        SHOULD_DYNAMICALLY_DETECT_BASELINE=false
        ;;
      -o|--out_dir)
        require_option_value "$@"
        OUTPUT_DIR=$2
        shift;;
      -d|--max_duration_seconds)
        require_option_value "$@"
        MAX_DURATION_SECONDS=$2
        shift;;
      -t|--psi_avg10_threshold)
        require_option_value "$@"
        PSI_AVG10_THRESHOLD=$2
        shift;;
      -b|--total_psi_entries_to_monitor_baseline)
        require_option_value "$@"
        TOTAL_PSI_ENTRIES_TO_MONITOR_BASELINE=$2
        shift;;
      -m|--max_psi_base_point_diff)
        require_option_value "$@"
        MAX_PSI_BASE_POINT_DIFF=$2
        shift;;
      --exit_on_psi_stabilized)
        # Flag without a value: only the common shift below applies.
        EXIT_ON_PSI_STABILIZED=true
        ;;
      *)
        err "Invalid option ${1}"
        usage
        exit 1;;
    esac
    shift # past argument or value
  done
}

function print_arguments() {
  print_log "Command line args:"
  print_log "\t SHOULD_DYNAMICALLY_DETECT_BASELINE=${SHOULD_DYNAMICALLY_DETECT_BASELINE}"
  print_log "\t OUTPUT_DIR=${OUTPUT_DIR}"
  print_log "\t MAX_DURATION_SECONDS=${MAX_DURATION_SECONDS}"
  print_log "\t PSI_AVG10_THRESHOLD=${PSI_AVG10_THRESHOLD}"
  print_log "\t TOTAL_PSI_ENTRIES_TO_MONITOR_BASELINE=${TOTAL_PSI_ENTRIES_TO_MONITOR_BASELINE}"
  print_log "\t MAX_PSI_BASE_POINT_DIFF=${MAX_PSI_BASE_POINT_DIFF}"
  print_log "\t EXIT_ON_PSI_STABILIZED=${EXIT_ON_PSI_STABILIZED}"
}

# Validates the parsed arguments, freezes them as readonly, and (re)creates the dump and log
# files inside OUTPUT_DIR. Exits with status 1 on any invalid value.
function check_arguments() {
  readonly OUTPUT_DIR
  if [[ ! -d "${OUTPUT_DIR}" ]]; then
    err "Out dir ${OUTPUT_DIR} does not exist"
    exit 1
  fi

  readonly DUMP_FILE=${OUTPUT_DIR}/psi_dump.txt
  readonly LOG_FILE=${OUTPUT_DIR}/log.txt
  rm -f "${DUMP_FILE}"; touch "${DUMP_FILE}"
  rm -f "${LOG_FILE}"; touch "${LOG_FILE}"

  if [[ ! -w "${DUMP_FILE}" ]]; then
    err "Dump file ${DUMP_FILE} is not writable"
    exit 1
  fi
  if [[ ! -w "${LOG_FILE}" ]]; then
    err "Log file ${LOG_FILE} is not writable"
    exit 1
  fi

  readonly PSI_AVG10_THRESHOLD
  if [[ ${PSI_AVG10_THRESHOLD} != +([[:digit:]]) || ${PSI_AVG10_THRESHOLD} -le 0 \
        || ${PSI_AVG10_THRESHOLD} -ge 100 ]]; then
    err "PSI Avg10 threshold ${PSI_AVG10_THRESHOLD} is not a valid number. The value should be "\
        "between 1 and 99"
    exit 1
  fi

  # The baseline ceiling can never be above the threshold itself.
  if [[ ${PSI_AVG10_THRESHOLD} -lt ${MAX_BASELINE_PSI} ]]; then
    print_log "Setting max baseline PSI to ${PSI_AVG10_THRESHOLD}, which is the PSI threshold"
    MAX_BASELINE_PSI=${PSI_AVG10_THRESHOLD}
  fi
  readonly MAX_BASELINE_PSI

  readonly MAX_DURATION_SECONDS
  if [[ ${MAX_DURATION_SECONDS} != +([[:digit:]]) || ${MAX_DURATION_SECONDS} -lt 10 \
        || ${MAX_DURATION_SECONDS} -gt 3600 ]]; then
    err "Max duration seconds ${MAX_DURATION_SECONDS} is not a valid number. The value should be"\
        "between 10 and 3600"
    exit 1
  fi

  readonly TOTAL_PSI_ENTRIES_TO_MONITOR_BASELINE
  if [[ ${TOTAL_PSI_ENTRIES_TO_MONITOR_BASELINE} != +([[:digit:]]) \
        || ${TOTAL_PSI_ENTRIES_TO_MONITOR_BASELINE} -lt 5 \
        || ${TOTAL_PSI_ENTRIES_TO_MONITOR_BASELINE} -gt 10 ]]; then
    err "Last N PSI entries to monitor baseline ${TOTAL_PSI_ENTRIES_TO_MONITOR_BASELINE} is not "\
        "a valid number. The value should be between 5 and 10"
    exit 1
  fi

  readonly MAX_PSI_BASE_POINT_DIFF
  if [[ ${MAX_PSI_BASE_POINT_DIFF} != +([[:digit:]]) || ${MAX_PSI_BASE_POINT_DIFF} -lt 1 \
        || ${MAX_PSI_BASE_POINT_DIFF} -gt 10 ]]; then
    err "Max PSI base point diff ${MAX_PSI_BASE_POINT_DIFF} is not a valid number. The value "\
        "should be between 1 and 10"
    exit 1
  fi
}

latest_psi_avg_10=0
latest_psi_line=""
# Reads every /proc/pressure/* file into latest_psi_line (joined onto a single line with the
# directory prefix stripped) and stores the integer part of the "cpu:some" avg10 value in
# latest_psi_avg_10. Exits with status 1 when that value cannot be parsed.
function read_psi_avg() {
  latest_psi_line=$(grep . /proc/pressure/* | tr '\n' ' ' | sed 's|/proc/pressure/||g')
  local cpu_some_line
  # Re-split the joined line after every "total=<n>" and keep the cpu:some record. A missing
  # record must not abort here under 'set -e'; it is reported by the digit check below.
  cpu_some_line=$(echo "${latest_psi_line}" | sed 's/\(total=[0-9]*\) /\1\n/g' \
      | grep "cpu:some") || true
  latest_psi_avg_10=$(echo "${cpu_some_line}" | cut -f${PSI_AVG10_POSITION} -d' ' \
      | cut -f2 -d'=' | cut -f1 -d'.')
  if [[ ${latest_psi_avg_10} != +([[:digit:]]) ]]; then
    err "Error reading PSI. Read value ${latest_psi_avg_10}"
    exit 1
  fi
}

EXCEEDED_THRESHOLD_UPTIME_MILLIS=-1
DROPPED_BELOW_THRESHOLD_UPTIME_MILLIS=-1
readonly PSI_TYPE_CPU_SOME="cpu:some avg10"
# Records the uptime at which the PSI first reached the threshold and, on a later call, the
# uptime at which it fell below the threshold again. Annotates the current dump line.
# Arguments: $1 - latest PSI avg10 value (integer).
function populate_exceeded_and_dropped_below_threshold() {
  local psi_avg10=${1}
  if [[ ${EXCEEDED_THRESHOLD_UPTIME_MILLIS} -lt 0 && ${psi_avg10} -ge ${PSI_AVG10_THRESHOLD} ]]
  then
    EXCEEDED_THRESHOLD_UPTIME_MILLIS=$(uptime_millis)
    echo -n " \"PSI exceeded threshold: ${PSI_AVG10_THRESHOLD}% ${PSI_TYPE_CPU_SOME}\"" \
        >> "${DUMP_FILE}"
    return
  fi
  if [[ ${EXCEEDED_THRESHOLD_UPTIME_MILLIS} -gt 0 && ${psi_avg10} -lt ${PSI_AVG10_THRESHOLD} ]]
  then
    DROPPED_BELOW_THRESHOLD_UPTIME_MILLIS=$(uptime_millis)
    echo -n " \"PSI dropped below threshold: ${PSI_AVG10_THRESHOLD}% ${PSI_TYPE_CPU_SOME}\"" \
        >> "${DUMP_FILE}"
  fi
}

# Updates the exceeded / dropped-below state with the latest PSI value and signals
# threshold_met once both events have been observed. No-op after both are recorded.
# Arguments: $1 - latest PSI avg10 value (integer).
function check_exceed_and_drop_below_threshold() {
  if [[ ${EXCEEDED_THRESHOLD_UPTIME_MILLIS} -gt 0 \
        && ${DROPPED_BELOW_THRESHOLD_UPTIME_MILLIS} -gt 0 ]]; then
    return
  fi
  populate_exceeded_and_dropped_below_threshold "$@"
  if [[ ${EXCEEDED_THRESHOLD_UPTIME_MILLIS} -gt 0 \
        && ${DROPPED_BELOW_THRESHOLD_UPTIME_MILLIS} -gt 0 ]]; then
    print_log "CPU PSI exceeded threshold ${PSI_AVG10_THRESHOLD} at" \
        "${EXCEEDED_THRESHOLD_UPTIME_MILLIS} and dropped below at" \
        "${DROPPED_BELOW_THRESHOLD_UPTIME_MILLIS}"
    set_threshold_met
  fi
}

LAST_N_PSI_AVG10_ARRAY=()
BASELINE_UPTIME_MILLIS=-1
NEXT_ELEMENT_TO_REMOVE=0
# Keeps a sliding window of the last TOTAL_PSI_ENTRIES_TO_MONITOR_BASELINE PSI values that are
# at or below MAX_BASELINE_PSI, and signals baseline_met once the spread (max - min) of a full
# window is within MAX_PSI_BASE_POINT_DIFF. Values above MAX_BASELINE_PSI are skipped.
# Arguments: $1 - latest PSI avg10 value (integer).
function monitor_baseline_psi() {
  if [[ ${1} -gt ${MAX_BASELINE_PSI} || ${SHOULD_DYNAMICALLY_DETECT_BASELINE} == false \
        || ${BASELINE_UPTIME_MILLIS} -gt 0 ]]; then
    return
  fi
  LAST_N_PSI_AVG10_ARRAY+=("$1")
  local length=${#LAST_N_PSI_AVG10_ARRAY[@]}
  if [[ ${length} -lt ${TOTAL_PSI_ENTRIES_TO_MONITOR_BASELINE} ]]; then
    return
  elif [[ ${length} -gt ${TOTAL_PSI_ENTRIES_TO_MONITOR_BASELINE} ]]; then
    # Drop the oldest entry; indices are never reused, so the oldest is tracked by a cursor.
    unset 'LAST_N_PSI_AVG10_ARRAY[NEXT_ELEMENT_TO_REMOVE]'
    NEXT_ELEMENT_TO_REMOVE=$((NEXT_ELEMENT_TO_REMOVE + 1))
  fi

  local psi_min=""
  local psi_max=""
  local entry
  for entry in "${LAST_N_PSI_AVG10_ARRAY[@]}"; do
    if [[ -z "${psi_min}" || ${entry} -lt ${psi_min} ]]; then
      psi_min=${entry}
    fi
    if [[ -z "${psi_max}" || ${entry} -gt ${psi_max} ]]; then
      psi_max=${entry}
    fi
  done

  if [[ $((psi_max - psi_min)) -gt ${MAX_PSI_BASE_POINT_DIFF} ]]; then
    return
  fi
  BASELINE_UPTIME_MILLIS=$(uptime_millis)
  print_log "PSI baseline is stable across ${TOTAL_PSI_ENTRIES_TO_MONITOR_BASELINE} entries. "\
      "Min / Max / Latest PSI: [${psi_min}, ${psi_max}, ${1}]"
  echo -n " \"PSI reached baseline across latest ${TOTAL_PSI_ENTRIES_TO_MONITOR_BASELINE}" \
      "entries\"" >> "${DUMP_FILE}"
  set_baseline_met
}

# Entry point: parses and validates the arguments, resets the outgoing properties, then samples
# PSI once per second until MAX_DURATION_SECONDS elapses (or, with --exit_on_psi_stabilized,
# until the PSI has stabilized), and finally logs a summary.
function main() {
  parse_arguments "$@"
  print_arguments
  check_arguments
  reset_threshold_met
  reset_baseline_met

  if [[ ${EXIT_ON_PSI_STABILIZED} == true ]]; then
    print_log "Starting CPU PSI monitoring. Will exit when PSI is stabilized or after"\
        "${MAX_DURATION_SECONDS} seconds"
  else
    print_log "Starting CPU PSI monitoring. Will exit after ${MAX_DURATION_SECONDS} seconds"
  fi

  local start_uptime_millis
  local max_uptime_millis
  local cuj_completion_uptime_millis=-1
  start_uptime_millis=$(uptime_millis)
  # NOTE(review): bc is kept for the millisecond maths, presumably to stay clear of narrow
  # shell integer arithmetic on the target shell - confirm before replacing with $(( )).
  max_uptime_millis=$(echo "${start_uptime_millis} + (${MAX_DURATION_SECONDS} * 1000)" | bc)

  while [[ $(uptime_millis) -lt ${max_uptime_millis} ]]; do
    read_psi_avg

    # One dump record per iteration; annotations are appended to it before the newline.
    echo -n "$(uptime_millis) $(date '+%Y-%m-%d %H:%M:%S.%N') ${latest_psi_line}" \
        >> "${DUMP_FILE}"

    if [[ ${cuj_completion_uptime_millis} -gt 0 || $(did_cuj_complete) == true ]]; then
      if [[ ${cuj_completion_uptime_millis} == -1 ]]; then
        cuj_completion_uptime_millis=$(uptime_millis)
        echo -n " \"CUJ completed\"" >> "${DUMP_FILE}"
      fi
      check_exceed_and_drop_below_threshold "${latest_psi_avg_10}"
      monitor_baseline_psi "${latest_psi_avg_10}"
    fi
    if [[ ${EXCEEDED_THRESHOLD_UPTIME_MILLIS} -gt 0
          && ${DROPPED_BELOW_THRESHOLD_UPTIME_MILLIS} -gt 0
          && ( ${SHOULD_DYNAMICALLY_DETECT_BASELINE} == false || ${BASELINE_UPTIME_MILLIS} -gt 0 )
          && ${EXIT_ON_PSI_STABILIZED} == true ]]; then
      # Terminate the current dump record before leaving the loop.
      echo "" >> "${DUMP_FILE}"
      print_log "Stopping on psi stabilized"
      break
    fi
    echo "" >> "${DUMP_FILE}"
    sleep 1
  done

  if [[ ${cuj_completion_uptime_millis} -le 0 ]]; then
    print_log "CUJ did not complete"
  else
    print_log "CUJ completed at ${cuj_completion_uptime_millis}"
    if [[ ${EXCEEDED_THRESHOLD_UPTIME_MILLIS} -gt 0
          && ${DROPPED_BELOW_THRESHOLD_UPTIME_MILLIS} -gt 0 ]]; then
      print_log "CPU PSI exceeded threshold at ${EXCEEDED_THRESHOLD_UPTIME_MILLIS} and dropped"\
          "below threshold at ${DROPPED_BELOW_THRESHOLD_UPTIME_MILLIS}"
    fi
    if [[ ${SHOULD_DYNAMICALLY_DETECT_BASELINE} == true && ${BASELINE_UPTIME_MILLIS} -gt 0 ]]; then
      print_log "CPU PSI reached baseline at ${BASELINE_UPTIME_MILLIS}"
    elif [[ ${SHOULD_DYNAMICALLY_DETECT_BASELINE} == true ]]; then
      print_log "CPU PSI did not reach baseline. Last N PSI values: ${LAST_N_PSI_AVG10_ARRAY[*]}"
    fi
  fi
}

main "$@"