xref: /aosp_15_r20/frameworks/av/media/utils/TimeCheck.cpp (revision ec779b8e0859a360c3d303172224686826e6e0e1)
1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <csignal>
18 #include "mediautils/TimerThread.h"
19 #define LOG_TAG "TimeCheck"
20 
21 #include <optional>
22 
23 #include <android-base/logging.h>
24 #include <android-base/strings.h>
25 #include <audio_utils/clock.h>
26 #include <cutils/properties.h>
27 #include <mediautils/EventLog.h>
28 #include <mediautils/FixedString.h>
29 #include <mediautils/MethodStatistics.h>
30 #include <mediautils/TimeCheck.h>
31 #include <mediautils/TidWrapper.h>
32 #include <utils/Log.h>
33 
34 #if defined(__ANDROID__)
35 #include "debuggerd/handler.h"
36 #endif
37 
38 
39 namespace android::mediautils {
40 
41 // Note: The sum of kDefaultTimeOutDurationMs and kDefaultSecondChanceDurationMs
42 // should be no less than 2 seconds, otherwise spurious timeouts
43 // may occur with system suspend.
44 static constexpr int kDefaultTimeoutDurationMs = 3000;
45 
46 // Due to suspend abort not incrementing the monotonic clock,
47 // we allow another second chance timeout after the first timeout expires.
48 //
49 // The total timeout is therefore kDefaultTimeoutDuration + kDefaultSecondChanceDuration,
50 // and the result is more stable when the monotonic clock increments during suspend.
51 //
52 static constexpr int kDefaultSecondChanceDurationMs = 2000;
53 
54 /* static */
getDefaultTimeoutDuration()55 TimeCheck::Duration TimeCheck::getDefaultTimeoutDuration() {
56     static constinit std::atomic<int> defaultTimeoutDurationMs{};
57     auto defaultMs = defaultTimeoutDurationMs.load(std::memory_order_relaxed);
58     if (defaultMs == 0) {
59         defaultMs = property_get_int32(
60                 "audio.timecheck.timeout_duration_ms", kDefaultTimeoutDurationMs);
61         if (defaultMs < 1) defaultMs = kDefaultTimeoutDurationMs;
62         defaultTimeoutDurationMs.store(defaultMs, std::memory_order_relaxed);
63     }
64     return std::chrono::milliseconds(defaultMs);
65 }
66 
67 /* static */
getDefaultSecondChanceDuration()68 TimeCheck::Duration TimeCheck::getDefaultSecondChanceDuration() {
69     static constinit std::atomic<int> defaultSecondChanceDurationMs{};
70     auto defaultMs = defaultSecondChanceDurationMs.load(std::memory_order_relaxed);
71     if (defaultMs == 0) {
72         defaultMs = property_get_int32(
73                 "audio.timecheck.second_chance_duration_ms", kDefaultSecondChanceDurationMs);
74         if (defaultMs < 1) defaultMs = kDefaultSecondChanceDurationMs;
75         defaultSecondChanceDurationMs.store(defaultMs, std::memory_order_relaxed);
76     }
77     return std::chrono::milliseconds(defaultMs);
78 }
79 
80 // This function appropriately signals a pid to dump a backtrace if we are
81 // running on device (and the HAL exists). If we are not running on an Android
82 // device, there is no HAL to signal (so we do nothing).
signalAudioHAL(pid_t pid)83 static inline void signalAudioHAL([[maybe_unused]] pid_t pid) {
84 #if defined(__ANDROID__)
85     sigqueue(pid, DEBUGGER_SIGNAL, {.sival_int = 0});
86 #endif
87 }
88 
89 /**
90  * Returns the std::string "HH:MM:SS.MSc" from a system_clock time_point.
91  */
formatTime(std::chrono::system_clock::time_point t)92 std::string formatTime(std::chrono::system_clock::time_point t) {
93     auto time_string = audio_utils_time_string_from_ns(
94             std::chrono::nanoseconds(t.time_since_epoch()).count());
95 
96     // The time string is 19 characters (including null termination).
97     // Example: "03-27 16:47:06.187"
98     //           MM DD HH MM SS MS
99     // We offset by 6 to get HH:MM:SS.MSc
100     //
101     return time_string.time + 6; // offset to remove month/day.
102 }
103 
104 /**
105  * Finds the end of the common time prefix.
106  *
107  * This is as an option to remove the common time prefix to avoid
108  * unnecessary duplicated strings.
109  *
110  * \param time1 a time string
111  * \param time2 a time string
112  * \return      the position where the common time prefix ends. For abbreviated
113  *              printing of time2, offset the character pointer by this position.
114  */
commonTimePrefixPosition(std::string_view time1,std::string_view time2)115 static size_t commonTimePrefixPosition(std::string_view time1, std::string_view time2) {
116     const size_t endPos = std::min(time1.size(), time2.size());
117     size_t i;
118 
119     // Find location of the first mismatch between strings
120     for (i = 0; ; ++i) {
121         if (i == endPos) {
122             return i; // strings match completely to the length of one of the strings.
123         }
124         if (time1[i] != time2[i]) {
125             break;
126         }
127         if (time1[i] == '\0') {
128             return i; // "printed" strings match completely.  No need to check further.
129         }
130     }
131 
132     // Go backwards until we find a delimeter or space.
133     for (; i > 0
134            && isdigit(time1[i]) // still a number
135            && time1[i - 1] != ' '
136          ; --i) {
137     }
138     return i;
139 }
140 
141 /**
142  * Returns the unique suffix of time2 that isn't present in time1.
143  *
144  * If time2 is identical to time1, then an empty string_view is returned.
145  * This method is used to elide the common prefix when printing times.
146  */
timeSuffix(std::string_view time1,std::string_view time2)147 std::string_view timeSuffix(std::string_view time1, std::string_view time2) {
148     const size_t pos = commonTimePrefixPosition(time1, time2);
149     return time2.substr(pos);
150 }
151 
152 // Audio HAL server pids vector used to generate audio HAL processes tombstone
153 // when audioserver watchdog triggers.
154 // We use a lockless storage to avoid potential deadlocks in the context of watchdog
155 // trigger.
156 // Protection again simultaneous writes is not needed given one update takes place
157 // during AudioFlinger construction and other comes necessarily later once the IAudioFlinger
158 // interface is available.
159 // The use of an atomic index just guaranties that current vector is fully initialized
160 // when read.
161 /* static */
accessAudioHalPids(std::vector<pid_t> * pids,bool update)162 void TimeCheck::accessAudioHalPids(std::vector<pid_t>* pids, bool update) {
163     static constexpr int kNumAudioHalPidsVectors = 3;
164     static std::vector<pid_t> audioHalPids[kNumAudioHalPidsVectors];
165     static std::atomic<unsigned> curAudioHalPids = 0;
166 
167     if (update) {
168         audioHalPids[(curAudioHalPids++ + 1) % kNumAudioHalPidsVectors] = *pids;
169     } else {
170         *pids = audioHalPids[curAudioHalPids % kNumAudioHalPidsVectors];
171     }
172 }
173 
174 /* static */
setAudioHalPids(const std::vector<pid_t> & pids)175 void TimeCheck::setAudioHalPids(const std::vector<pid_t>& pids) {
176     accessAudioHalPids(&(const_cast<std::vector<pid_t>&>(pids)), true);
177 }
178 
179 /* static */
getAudioHalPids()180 std::vector<pid_t> TimeCheck::getAudioHalPids() {
181     std::vector<pid_t> pids;
182     accessAudioHalPids(&pids, false);
183     return pids;
184 }
185 
186 /* static */
signalAudioHals()187 std::string TimeCheck::signalAudioHals() {
188     std::vector<pid_t> pids = getAudioHalPids();
189     std::string halPids;
190     if (pids.size() != 0) {
191         for (const auto& pid : pids) {
192             ALOGI("requesting tombstone for pid: %d", pid);
193             halPids.append(std::to_string(pid)).append(" ");
194             signalAudioHAL(pid);
195         }
196         // Allow time to complete, usually the caller is forcing restart afterwards.
197         sleep(1);
198     }
199     return halPids;
200 }
201 
202 /* static */
getTimeCheckThread()203 TimerThread& TimeCheck::getTimeCheckThread() {
204     static TimerThread sTimeCheckThread{};
205     return sTimeCheckThread;
206 }
207 
208 /* static */
toString()209 std::string TimeCheck::toString() {
210     // note pending and retired are individually locked for maximum concurrency,
211     // snapshot is not instantaneous at a single time.
212     return getTimeCheckThread().getSnapshotAnalysis().toString();
213 }
214 
TimeCheck(std::string_view tag,OnTimerFunc && onTimer,Duration requestedTimeoutDuration,Duration secondChanceDuration,bool crashOnTimeout)215 TimeCheck::TimeCheck(std::string_view tag, OnTimerFunc&& onTimer, Duration requestedTimeoutDuration,
216         Duration secondChanceDuration, bool crashOnTimeout)
217     : mTimeCheckHandler{ std::make_shared<TimeCheckHandler>(
218             tag, std::move(onTimer), crashOnTimeout, requestedTimeoutDuration,
219             secondChanceDuration, std::chrono::system_clock::now(), getThreadIdWrapper()) }
220     , mTimerHandle(requestedTimeoutDuration.count() == 0
221               /* for TimeCheck we don't consider a non-zero secondChanceDuration here */
222               ? getTimeCheckThread().trackTask(mTimeCheckHandler->tag)
223               : getTimeCheckThread().scheduleTask(
224                       mTimeCheckHandler->tag,
225                       // Pass in all the arguments by value to this task for safety.
226                       // The thread could call the callback before the constructor is finished.
227                       // The destructor is not blocked on callback.
__anon2392a1780102(TimerThread::Handle timerHandle) 228                       [ timeCheckHandler = mTimeCheckHandler ](TimerThread::Handle timerHandle) {
229                           timeCheckHandler->onTimeout(timerHandle);
230                       },
231                       requestedTimeoutDuration,
232                       secondChanceDuration)) {}
233 
~TimeCheck()234 TimeCheck::~TimeCheck() {
235     if (mTimeCheckHandler) {
236         mTimeCheckHandler->onCancel(mTimerHandle);
237     }
238 }
239 
240 /* static */
analyzeTimeouts(float requestedTimeoutMs,float secondChanceMs,float elapsedSteadyMs,float elapsedSystemMs)241 std::string TimeCheck::analyzeTimeouts(
242         float requestedTimeoutMs, float secondChanceMs,
243         float elapsedSteadyMs, float elapsedSystemMs) {
244     // Track any OS clock issues with suspend.
245     // It is possible that the elapsedSystemMs is much greater than elapsedSteadyMs if
246     // a suspend occurs; however, we always expect the timeout ms should always be slightly
247     // less than the elapsed steady ms regardless of whether a suspend occurs or not.
248 
249     const float totalTimeoutMs = requestedTimeoutMs + secondChanceMs;
250     std::string s = std::format(
251             "Timeout ms {:.2f} ({:.2f} + {:.2f})"
252             " elapsed steady ms {:.4f} elapsed system ms {:.4f}",
253             totalTimeoutMs, requestedTimeoutMs, secondChanceMs, elapsedSteadyMs, elapsedSystemMs);
254 
255     // Is there something unusual?
256     static constexpr float TOLERANCE_CONTEXT_SWITCH_MS = 200.f;
257 
258     if (totalTimeoutMs > elapsedSteadyMs || totalTimeoutMs > elapsedSystemMs) {
259         s.append("\nError: early expiration - "
260                 "totalTimeoutMs should be less than elapsed time");
261     }
262 
263     if (elapsedSteadyMs > elapsedSystemMs + TOLERANCE_CONTEXT_SWITCH_MS) {
264         s.append("\nWarning: steady time should not advance faster than system time");
265     }
266 
267     // This has been found in suspend stress testing.
268     if (elapsedSteadyMs > totalTimeoutMs + TOLERANCE_CONTEXT_SWITCH_MS) {
269         s.append("\nWarning: steady time significantly exceeds timeout "
270                 "- possible thread stall or aborted suspend");
271     }
272 
273     // This has been found in suspend stress testing.
274     if (elapsedSystemMs > totalTimeoutMs + TOLERANCE_CONTEXT_SWITCH_MS) {
275         s.append("\nInformation: system time significantly exceeds timeout "
276                 "- possible suspend");
277     }
278     return s;
279 }
280 
281 // To avoid any potential race conditions, the timer handle
282 // (expiration = clock steady start + timeout) is passed into the callback.
onCancel(TimerThread::Handle timerHandle) const283 void TimeCheck::TimeCheckHandler::onCancel(TimerThread::Handle timerHandle) const
284 {
285     if (TimeCheck::getTimeCheckThread().cancelTask(timerHandle) && onTimer) {
286         const std::chrono::steady_clock::time_point endSteadyTime =
287                 std::chrono::steady_clock::now();
288         const float elapsedSteadyMs = std::chrono::duration_cast<FloatMs>(
289                 endSteadyTime - timerHandle + timeoutDuration).count();
290         // send the elapsed steady time for statistics.
291         onTimer(false /* timeout */, elapsedSteadyMs);
292     }
293 }
294 
295 // To avoid any potential race conditions, the timer handle
296 // (expiration = clock steady start + timeout) is passed into the callback.
onTimeout(TimerThread::Handle timerHandle) const297 void TimeCheck::TimeCheckHandler::onTimeout(TimerThread::Handle timerHandle) const
298 {
299     const std::chrono::steady_clock::time_point endSteadyTime = std::chrono::steady_clock::now();
300     const std::chrono::system_clock::time_point endSystemTime = std::chrono::system_clock::now();
301     // timerHandle incorporates the timeout
302     const float elapsedSteadyMs = std::chrono::duration_cast<FloatMs>(
303             endSteadyTime - (timerHandle - timeoutDuration)).count();
304     const float elapsedSystemMs = std::chrono::duration_cast<FloatMs>(
305             endSystemTime - startSystemTime).count();
306     const float requestedTimeoutMs = std::chrono::duration_cast<FloatMs>(
307             timeoutDuration).count();
308     const float secondChanceMs = std::chrono::duration_cast<FloatMs>(
309             secondChanceDuration).count();
310 
311     if (onTimer) {
312         onTimer(true /* timeout */, elapsedSteadyMs);
313     }
314 
315     if (!crashOnTimeout) return;
316 
317     // Generate the TimerThread summary string early before sending signals to the
318     // HAL processes which can affect thread behavior.
319     const auto snapshotAnalysis = getTimeCheckThread().getSnapshotAnalysis(4 /* retiredCount */);
320 
321     // Generate audio HAL processes tombstones.
322     std::string halPids = signalAudioHals();
323     if (!halPids.empty()) {
324         halPids = "HAL pids [ " + halPids + "]";
325     } else {
326         halPids = "No HAL process pids available";
327         ALOGI("%s", (halPids + ", skipping tombstones").c_str());
328     }
329 
330     LOG_EVENT_STRING(LOGTAG_AUDIO_BINDER_TIMEOUT, tag.c_str());
331 
332     // Create abort message string - caution: this can be very large.
333     const std::string abortMessage = std::string("TimeCheck timeout for ")
334             .append(tag)
335             .append(" scheduled ").append(formatTime(startSystemTime))
336             .append(" on thread ").append(std::to_string(tid)).append("\n")
337             .append(analyzeTimeouts(requestedTimeoutMs, secondChanceMs,
338                     elapsedSteadyMs, elapsedSystemMs)).append("\n")
339             .append(halPids).append("\n")
340             .append(snapshotAnalysis.toString());
341 
342     // In many cases, the initial timeout stack differs from the abort backtrace because
343     // (1) the time difference between initial timeout and the final abort signal
344     // and (2) signalling the HAL audio service may cause
345     // the thread to unblock and continue.
346 
347     // Note: LOG_ALWAYS_FATAL limits the size of the string - per log/log.h:
348     // Log message text may be truncated to less than an
349     // implementation-specific limit (1023 bytes).
350     //
351     // Here, we send the string through android-base/logging.h LOG()
352     // to avoid the size limitation. LOG(FATAL) does an abort whereas
353     // LOG(FATAL_WITHOUT_ABORT) does not abort.
354 
355     static constexpr pid_t invalidPid = TimerThread::SnapshotAnalysis::INVALID_PID;
356     pid_t tidToAbort = invalidPid;
357     if (snapshotAnalysis.suspectTid != invalidPid) {
358         tidToAbort = snapshotAnalysis.suspectTid;
359     } else if (snapshotAnalysis.timeoutTid != invalidPid) {
360         tidToAbort = snapshotAnalysis.timeoutTid;
361     }
362 
363     LOG(FATAL_WITHOUT_ABORT) << abortMessage;
364     const auto ret = abortTid(tidToAbort);
365     if (ret < 0) {
366         LOG(FATAL) << "TimeCheck thread signal failed, aborting process. "
367                        "errno: " << errno << base::ErrnoNumberAsString(errno);
368     }
369 }
370 
371 // Automatically create a TimeCheck class for a class and method.
372 // This is used for Audio HAL support.
makeTimeCheckStatsForClassMethod(std::string_view className,std::string_view methodName)373 mediautils::TimeCheck makeTimeCheckStatsForClassMethod(
374         std::string_view className, std::string_view methodName) {
375     std::shared_ptr<MethodStatistics<std::string>> statistics =
376             mediautils::getStatisticsForClass(className);
377     if (!statistics) return {}; // empty TimeCheck.
378     return mediautils::TimeCheck(
379             FixedString62(className).append("::").append(methodName),
380             [ safeMethodName = FixedString30(methodName),
381               stats = std::move(statistics) ]
382             (bool timeout, float elapsedMs) {
383                     if (timeout) {
384                         ; // ignored, there is no timeout value.
385                     } else {
386                         stats->event(safeMethodName.asStringView(), elapsedMs);
387                     }
388             }, {} /* timeoutDuration */, {} /* secondChanceDuration */, false /* crashOnTimeout */);
389 }
390 
391 }  // namespace android::mediautils
392