1 /** 2 * Copyright (c) 2020, The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #pragma once 18 19 #include "PressureMonitor.h" 20 #include "ProcDiskStatsCollector.h" 21 #include "ProcStatCollector.h" 22 #include "UidStatsCollector.h" 23 #include "WatchdogPerfService.h" 24 25 #include <android-base/chrono_utils.h> 26 #include <android-base/result.h> 27 #include <android/util/ProtoOutputStream.h> 28 #include <cutils/multiuser.h> 29 #include <gtest/gtest_prod.h> 30 #include <meminfo/procmeminfo.h> 31 #include <utils/Errors.h> 32 #include <utils/Mutex.h> 33 #include <utils/RefBase.h> 34 #include <utils/SystemClock.h> 35 36 #include <android_car_feature.h> 37 38 #include <ctime> 39 #include <string> 40 #include <unordered_set> 41 #include <variant> 42 #include <vector> 43 44 namespace android { 45 namespace automotive { 46 namespace watchdog { 47 48 // Number of periodic collection records to cache in memory. 49 constexpr int32_t kDefaultPeriodicCollectionBufferSize = 180; 50 constexpr const char kEmptyCollectionMessage[] = "No collection recorded\n"; 51 52 // Forward declaration for testing use only. 53 namespace internal { 54 55 class PerformanceProfilerPeer; 56 57 } // namespace internal 58 59 // Below classes, structs and enums should be used only by the implementation and unit tests. 60 enum ProcStatType { 61 IO_BLOCKED_TASKS_COUNT = 0, 62 MAJOR_FAULTS, 63 CPU_TIME, 64 MEMORY_STATS, 65 PROC_STAT_TYPES, 66 }; 67 68 // UserPackageStats represents the user package performance stats. 69 class UserPackageStats { 70 public: 71 struct UidIoSingleOpStats { 72 int64_t bytes[UID_STATES] = {0}; 73 int64_t fsync[UID_STATES] = {0}; 74 totalBytesUidIoSingleOpStats75 int64_t totalBytes() const { 76 return std::numeric_limits<int64_t>::max() - bytes[UidState::FOREGROUND] > 77 bytes[UidState::BACKGROUND] 78 ? bytes[UidState::FOREGROUND] + bytes[UidState::BACKGROUND] 79 : std::numeric_limits<int64_t>::max(); 80 } 81 }; 82 struct UidSingleStats { 83 uint64_t value = 0; 84 struct ProcessSingleStats { 85 std::string comm = ""; 86 uint64_t value = 0; 87 }; 88 std::vector<ProcessSingleStats> topNProcesses = {}; 89 }; 90 struct UidCpuStats { 91 int64_t cpuTimeMillis = 0; 92 int64_t cpuCycles = 0; 93 struct ProcessCpuStats { 94 int32_t pid = -1; 95 std::string comm = ""; 96 int64_t cpuTimeMillis = 0; 97 int64_t cpuCycles = 0; 98 }; 99 std::vector<ProcessCpuStats> topNProcesses = {}; 100 }; 101 struct MemoryStats { 102 uint64_t rssKb = 0; 103 uint64_t pssKb = 0; 104 uint64_t ussKb = 0; 105 uint64_t swapPssKb = 0; 106 }; 107 struct UidMemoryStats { 108 MemoryStats memoryStats; 109 bool isSmapsRollupSupported; 110 struct ProcessMemoryStats { 111 std::string comm = ""; 112 MemoryStats memoryStats; 113 }; 114 std::vector<ProcessMemoryStats> topNProcesses = {}; 115 }; 116 117 UserPackageStats(MetricType metricType, const UidStats& uidStats); 118 UserPackageStats(ProcStatType procStatType, const UidStats& uidStats, int topNProcessCount, 119 bool isSmapsRollupSupported); 120 121 // Class must be DefaultInsertable for std::vector<T>::resize to work UserPackageStats()122 UserPackageStats() : uid(0), genericPackageName("") {} 123 // For unit test case only UserPackageStats(uid_t uid,std::string genericPackageName,std::variant<std::monostate,UidIoSingleOpStats,UidSingleStats,UidCpuStats,UidMemoryStats> statsVariant)124 UserPackageStats(uid_t uid, std::string genericPackageName, 125 std::variant<std::monostate, UidIoSingleOpStats, UidSingleStats, 126 UidCpuStats, UidMemoryStats> 127 statsVariant) : 128 uid(uid), 129 genericPackageName(std::move(genericPackageName)), 130 statsVariant(std::move(statsVariant)) {} 131 132 // Returns the primary value of the current UidStats. If the variant has value 133 // |std::monostate|, returns 0. 134 // 135 // This value should be used to sort the UidStats. 136 uint64_t getValue() const; 137 std::string toString(MetricType metricsType, const int64_t totalIoStats[][UID_STATES]) const; 138 std::string toString(int64_t totalValue) const; 139 std::string toString(int64_t totalRssKb, int64_t totalPssKb) const; 140 141 uid_t uid; 142 std::string genericPackageName; 143 std::variant<std::monostate, 144 UidIoSingleOpStats, 145 UidSingleStats, 146 UidCpuStats, 147 UidMemoryStats> 148 statsVariant; 149 150 private: 151 void cacheTopNProcessSingleStats( 152 ProcStatType procStatType, const UidStats& uidStats, int topNProcessCount, 153 std::vector<UserPackageStats::UidSingleStats::ProcessSingleStats>* 154 topNProcesses); 155 void cacheTopNProcessCpuStats( 156 const UidStats& uidStats, int topNProcessCount, 157 std::vector<UserPackageStats::UidCpuStats::ProcessCpuStats>* 158 topNProcesses); 159 void cacheTopNProcessMemStats( 160 const UidStats& uidStats, int topNProcessCount, bool isSmapsRollupSupported, 161 std::vector<UserPackageStats::UidMemoryStats::ProcessMemoryStats>* topNProcesses); 162 }; 163 164 /** 165 * User package summary performance stats collected from the `/proc/uid_io/stats`, 166 * `/proc/[pid]/stat`, `/proc/[pid]/task/[tid]/stat`, and /proc/[pid]/status` files. 167 */ 168 struct UserPackageSummaryStats { 169 std::vector<UserPackageStats> topNCpuTimes = {}; 170 std::vector<UserPackageStats> topNIoReads = {}; 171 std::vector<UserPackageStats> topNIoWrites = {}; 172 std::vector<UserPackageStats> topNIoBlocked = {}; 173 std::vector<UserPackageStats> topNMajorFaults = {}; 174 std::vector<UserPackageStats> topNMemStats = {}; 175 int64_t totalIoStats[METRIC_TYPES][UID_STATES] = {{0}}; 176 std::unordered_map<uid_t, uint64_t> taskCountByUid = {}; 177 // TODO(b/337115923): Clean up below duplicate fields and report `totalMajorFaults`, 178 // `totalRssKb`, `totalPssKb`, and `majorFaultsPercentChange` as part of `SystemSummaryStats`. 179 int64_t totalCpuTimeMillis = 0; 180 uint64_t totalCpuCycles = 0; 181 uint64_t totalMajorFaults = 0; 182 uint64_t totalRssKb = 0; 183 uint64_t totalPssKb = 0; 184 // Percentage of increase/decrease in the major page faults since last collection. 185 double majorFaultsPercentChange = 0.0; 186 std::string toString() const; 187 }; 188 189 // TODO(b/268402964): Calculate the total CPU cycles using the per-UID BPF tool. 190 // System performance stats collected from the `/proc/stat` file. 191 struct SystemSummaryStats { 192 int64_t cpuIoWaitTimeMillis = 0; 193 int64_t cpuIdleTimeMillis = 0; 194 int64_t totalCpuTimeMillis = 0; 195 uint64_t totalCpuCycles = 0; 196 uint64_t contextSwitchesCount = 0; 197 uint32_t ioBlockedProcessCount = 0; 198 uint32_t totalProcessCount = 0; 199 std::string toString() const; 200 }; 201 202 // Performance record collected during a sampling/collection period. 203 struct PerfStatsRecord { 204 time_point_millis collectionTimeMillis; 205 SystemSummaryStats systemSummaryStats; 206 UserPackageSummaryStats userPackageSummaryStats; 207 std::unordered_map<PressureMonitorInterface::PressureLevel, std::chrono::milliseconds> 208 memoryPressureLevelDurations; 209 std::string toString() const; 210 }; 211 212 // Group of performance records collected for a collection event. 213 struct CollectionInfo { 214 size_t maxCacheSize = 0; // Maximum cache size for the collection. 215 std::vector<PerfStatsRecord> records; // Cache of collected performance records. 216 std::string toString() const; 217 }; 218 219 // Group of performance records collected for a user switch collection event. 220 struct UserSwitchCollectionInfo : CollectionInfo { 221 userid_t from = 0; 222 userid_t to = 0; 223 }; 224 225 // PerformanceProfiler implements the I/O performance data collection module. 226 class PerformanceProfiler final : 227 public DataProcessorInterface, 228 public PressureMonitorInterface::PressureChangeCallbackInterface { 229 public: 230 PerformanceProfiler( 231 const android::sp<PressureMonitorInterface>& pressureMonitor, 232 const std::function<int64_t()>& getElapsedTimeSinceBootMillisFunc = &elapsedRealtime) : kPressureMonitor(pressureMonitor)233 kPressureMonitor(pressureMonitor), 234 kGetElapsedTimeSinceBootMillisFunc(getElapsedTimeSinceBootMillisFunc), 235 mTopNStatsPerCategory(0), 236 mTopNStatsPerSubcategory(0), 237 mMaxUserSwitchEvents(0), 238 mSystemEventDataCacheDurationSec(0), 239 // TODO(b/333722043): Once carwatchdogd has sys_ptrace capability, set 240 // mIsSmapsRollupSupported field from `android::meminfo::IsSmapsRollupSupported()`. 241 // Disabling smaps_rollup support because this file cannot be read without sys_ptrace 242 // capability. 243 mIsSmapsRollupSupported(false), 244 mIsMemoryProfilingEnabled(android::car::feature::car_watchdog_memory_profiling()), 245 mBoottimeCollection({}), 246 mPeriodicCollection({}), 247 mUserSwitchCollections({}), 248 mWakeUpCollection({}), 249 mCustomCollection({}), 250 mLastMajorFaults(0), 251 mDoSendResourceUsageStats(false), 252 mMemoryPressureLevelDeltaInfo(PressureLevelDeltaInfo(getElapsedTimeSinceBootMillisFunc)) { 253 } 254 ~PerformanceProfiler()255 ~PerformanceProfiler() { terminate(); } 256 name()257 std::string name() const override { return "PerformanceProfiler"; } 258 259 // Implements DataProcessorInterface. 260 android::base::Result<void> onSystemStartup() override; 261 262 void onCarWatchdogServiceRegistered() override; 263 264 android::base::Result<void> onBoottimeCollection( 265 time_point_millis time, 266 const android::wp<UidStatsCollectorInterface>& uidStatsCollector, 267 const android::wp<ProcStatCollectorInterface>& procStatCollector, 268 aidl::android::automotive::watchdog::internal::ResourceStats* resourceStats) override; 269 270 android::base::Result<void> onWakeUpCollection( 271 time_point_millis time, 272 const android::wp<UidStatsCollectorInterface>& uidStatsCollector, 273 const android::wp<ProcStatCollectorInterface>& procStatCollector) override; 274 275 android::base::Result<void> onPeriodicCollection( 276 time_point_millis time, SystemState systemState, 277 const android::wp<UidStatsCollectorInterface>& uidStatsCollector, 278 const android::wp<ProcStatCollectorInterface>& procStatCollector, 279 aidl::android::automotive::watchdog::internal::ResourceStats* resourceStats) override; 280 281 android::base::Result<void> onUserSwitchCollection( 282 time_point_millis time, userid_t from, userid_t to, 283 const android::wp<UidStatsCollectorInterface>& uidStatsCollector, 284 const android::wp<ProcStatCollectorInterface>& procStatCollector) override; 285 286 android::base::Result<void> onCustomCollection( 287 time_point_millis time, SystemState systemState, 288 const std::unordered_set<std::string>& filterPackages, 289 const android::wp<UidStatsCollectorInterface>& uidStatsCollector, 290 const android::wp<ProcStatCollectorInterface>& procStatCollector, 291 aidl::android::automotive::watchdog::internal::ResourceStats* resourceStats) override; 292 onPeriodicMonitor(time_t time,const android::wp<ProcDiskStatsCollectorInterface> & procDiskStatsCollector,const std::function<void ()> & alertHandler)293 android::base::Result<void> onPeriodicMonitor( 294 [[maybe_unused]] time_t time, 295 [[maybe_unused]] const android::wp<ProcDiskStatsCollectorInterface>& 296 procDiskStatsCollector, 297 [[maybe_unused]] const std::function<void()>& alertHandler) override { 298 // No monitoring done here as this DataProcessor only collects I/O performance records. 299 return {}; 300 } 301 302 android::base::Result<void> onDump(int fd) const override; 303 304 android::base::Result<void> onDumpProto( 305 const CollectionIntervals& collectionIntervals, 306 android::util::ProtoOutputStream& outProto) const override; 307 308 android::base::Result<void> onCustomCollectionDump(int fd) override; 309 310 void onPressureChanged(PressureMonitorInterface::PressureLevel) override; 311 312 protected: 313 android::base::Result<void> init(); 314 315 // Clears in-memory cache. 316 void terminate(); 317 318 private: 319 class PressureLevelDeltaInfo { 320 public: PressureLevelDeltaInfo(const std::function<int64_t ()> & getElapsedTimeSinceBootMillisFunc)321 explicit PressureLevelDeltaInfo( 322 const std::function<int64_t()>& getElapsedTimeSinceBootMillisFunc) : 323 kGetElapsedTimeSinceBootMillisFunc(getElapsedTimeSinceBootMillisFunc), 324 mLatestPressureLevel(PressureMonitorInterface::PRESSURE_LEVEL_NONE), 325 mLatestPressureLevelElapsedRealtimeMillis(getElapsedTimeSinceBootMillisFunc()) {} 326 327 // Calculates the duration for the previously reported pressure level, updates it in 328 // mPressureLevelDurations, and sets the latest pressure level and its elapsed realtime. 329 void setLatestPressureLevelLocked(PressureMonitorInterface::PressureLevel pressureLevel); 330 331 // Returns the latest pressure stats and flushes stats to mPressureLevelDurations. 332 std::unordered_map<PressureMonitorInterface::PressureLevel, std::chrono::milliseconds> 333 onCollectionLocked(); 334 335 private: 336 // Updated by test for mocking elapsed time. 337 const std::function<int64_t()> kGetElapsedTimeSinceBootMillisFunc; 338 339 // Latest pressure level reported by the PressureMonitor. 340 PressureMonitorInterface::PressureLevel mLatestPressureLevel; 341 342 // Time when the latest pressure level was recorded. Used to calculate 343 // pressureLevelDurations. 344 int64_t mLatestPressureLevelElapsedRealtimeMillis = 0; 345 346 // Duration spent in different pressure levels since the last poll. 347 std::unordered_map<PressureMonitorInterface::PressureLevel, std::chrono::milliseconds> 348 mPressureLevelDurations = {}; 349 }; 350 351 // Processes the collected data. 352 android::base::Result<void> processLocked( 353 time_point_millis time, SystemState systemState, 354 const std::unordered_set<std::string>& filterPackages, 355 const android::sp<UidStatsCollectorInterface>& uidStatsCollector, 356 const android::sp<ProcStatCollectorInterface>& procStatCollector, 357 CollectionInfo* collectionInfo, 358 aidl::android::automotive::watchdog::internal::ResourceStats* resourceStats); 359 360 // Processes per-UID performance data. 361 void processUidStatsLocked( 362 bool isGarageModeActive, int64_t totalCpuTimeMillis, 363 const std::unordered_set<std::string>& filterPackages, 364 const android::sp<UidStatsCollectorInterface>& uidStatsCollector, 365 std::vector<aidl::android::automotive::watchdog::internal::UidResourceUsageStats>* 366 uidResourceUsageStats, 367 UserPackageSummaryStats* userPackageSummaryStats); 368 369 // Processes system performance data from the `/proc/stat` file. 370 void processProcStatLocked(const android::sp<ProcStatCollectorInterface>& procStatCollector, 371 SystemSummaryStats* systemSummaryStats) const; 372 373 // Dump the user switch collection 374 android::base::Result<void> onUserSwitchCollectionDump(int fd) const; 375 376 void clearExpiredSystemEventCollections(time_point_millis time); 377 378 void dumpStatsRecordsProto(const CollectionInfo& collection, 379 android::util::ProtoOutputStream& outProto) const; 380 381 void dumpPackageCpuStatsProto(const std::vector<UserPackageStats>& userPackageStats, 382 android::util::ProtoOutputStream& outProto) const; 383 384 void dumpPackageStorageIoStatsProto(const std::vector<UserPackageStats>& userPackageStats, 385 const uint64_t storageStatsFieldId, 386 android::util::ProtoOutputStream& outProto) const; 387 388 void dumpPackageTaskStateStatsProto(const std::vector<UserPackageStats>& userPackageStats, 389 const std::unordered_map<uid_t, uint64_t>& taskCountByUid, 390 android::util::ProtoOutputStream& outProto) const; 391 392 void dumpPackageMajorPageFaultsProto(const std::vector<UserPackageStats>& userPackageStats, 393 android::util::ProtoOutputStream& outProto) const; 394 395 // Pressure monitor instance. 396 const android::sp<PressureMonitorInterface> kPressureMonitor; 397 398 // Updated by test for mocking elapsed time. 399 const std::function<int64_t()> kGetElapsedTimeSinceBootMillisFunc; 400 401 // Top N per-UID stats per category. 402 int mTopNStatsPerCategory; 403 404 // Top N per-process stats per subcategory. 405 int mTopNStatsPerSubcategory; 406 407 // Max amount of user switch events cached in |mUserSwitchCollections|. 408 size_t mMaxUserSwitchEvents; 409 410 // Amount of seconds before a system event's cache is cleared. 411 std::chrono::seconds mSystemEventDataCacheDurationSec; 412 413 // Smaps rollup is supported by kernel or not. 414 bool mIsSmapsRollupSupported; 415 416 // Memory Profiling feature flag is enabled or not. 417 bool mIsMemoryProfilingEnabled; 418 419 // Makes sure only one collection is running at any given time. 420 mutable Mutex mMutex; 421 422 // Info for the boot-time collection event. The cache is persisted until system shutdown/reboot 423 // or a wake-up collection occurs. 424 CollectionInfo mBoottimeCollection GUARDED_BY(mMutex); 425 426 // Info for the periodic collection event. The cache size is limited by 427 // |ro.carwatchdog.periodic_collection_buffer_size|. 428 CollectionInfo mPeriodicCollection GUARDED_BY(mMutex); 429 430 // Cache for user switch collection events. Events are cached from oldest to newest. 431 std::vector<UserSwitchCollectionInfo> mUserSwitchCollections GUARDED_BY(mMutex); 432 433 // Info for the wake-up collection event. Only the latest wake-up collection is cached. 434 CollectionInfo mWakeUpCollection GUARDED_BY(mMutex); 435 436 // Info for the custom collection event. The info is cleared at the end of every custom 437 // collection. 438 CollectionInfo mCustomCollection GUARDED_BY(mMutex); 439 440 // Major faults delta from last collection. Useful when calculating the percentage change in 441 // major faults since last collection. 442 uint64_t mLastMajorFaults GUARDED_BY(mMutex); 443 444 // Enables the sending of resource usage stats to CarService. 445 bool mDoSendResourceUsageStats GUARDED_BY(mMutex); 446 447 // Aggregated pressure level changes occurred since the last collection. 448 PressureLevelDeltaInfo mMemoryPressureLevelDeltaInfo GUARDED_BY(mMutex); 449 450 friend class WatchdogPerfService; 451 452 // For unit tests. 453 friend class internal::PerformanceProfilerPeer; 454 }; 455 456 } // namespace watchdog 457 } // namespace automotive 458 } // namespace android 459