1 /**
2  * Copyright (c) 2020, The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include "PressureMonitor.h"
20 #include "ProcDiskStatsCollector.h"
21 #include "ProcStatCollector.h"
22 #include "UidStatsCollector.h"
23 #include "WatchdogPerfService.h"
24 
25 #include <android-base/chrono_utils.h>
26 #include <android-base/result.h>
27 #include <android/util/ProtoOutputStream.h>
28 #include <cutils/multiuser.h>
29 #include <gtest/gtest_prod.h>
30 #include <meminfo/procmeminfo.h>
31 #include <utils/Errors.h>
32 #include <utils/Mutex.h>
33 #include <utils/RefBase.h>
34 #include <utils/SystemClock.h>
35 
36 #include <android_car_feature.h>
37 
38 #include <ctime>
39 #include <string>
40 #include <unordered_set>
41 #include <variant>
42 #include <vector>
43 
44 namespace android {
45 namespace automotive {
46 namespace watchdog {
47 
// Default number of periodic collection records kept in the in-memory cache.
constexpr int32_t kDefaultPeriodicCollectionBufferSize = 180;
// Message reported when a collection has no recorded data.
constexpr char kEmptyCollectionMessage[] = "No collection recorded\n";
51 
// Test-only peer class; forward declared here so it can be named as a friend further down.
namespace internal {
class PerformanceProfilerPeer;
}  // namespace internal
58 
// The classes, structs, and enums below are intended for use only by the implementation and
// its unit tests.
// Kinds of per-process stats that a UserPackageStats instance can be built from.
// The enumerators are numbered consecutively from zero, so PROC_STAT_TYPES (always last)
// equals the number of stat types declared before it.
enum ProcStatType {
    IO_BLOCKED_TASKS_COUNT = 0,
    MAJOR_FAULTS = 1,
    CPU_TIME = 2,
    MEMORY_STATS = 3,
    PROC_STAT_TYPES = 4,
};
67 
68 // UserPackageStats represents the user package performance stats.
69 class UserPackageStats {
70 public:
71     struct UidIoSingleOpStats {
72         int64_t bytes[UID_STATES] = {0};
73         int64_t fsync[UID_STATES] = {0};
74 
totalBytesUidIoSingleOpStats75         int64_t totalBytes() const {
76             return std::numeric_limits<int64_t>::max() - bytes[UidState::FOREGROUND] >
77                             bytes[UidState::BACKGROUND]
78                     ? bytes[UidState::FOREGROUND] + bytes[UidState::BACKGROUND]
79                     : std::numeric_limits<int64_t>::max();
80         }
81     };
82     struct UidSingleStats {
83         uint64_t value = 0;
84         struct ProcessSingleStats {
85             std::string comm = "";
86             uint64_t value = 0;
87         };
88         std::vector<ProcessSingleStats> topNProcesses = {};
89     };
90     struct UidCpuStats {
91         int64_t cpuTimeMillis = 0;
92         int64_t cpuCycles = 0;
93         struct ProcessCpuStats {
94             int32_t pid = -1;
95             std::string comm = "";
96             int64_t cpuTimeMillis = 0;
97             int64_t cpuCycles = 0;
98         };
99         std::vector<ProcessCpuStats> topNProcesses = {};
100     };
101     struct MemoryStats {
102         uint64_t rssKb = 0;
103         uint64_t pssKb = 0;
104         uint64_t ussKb = 0;
105         uint64_t swapPssKb = 0;
106     };
107     struct UidMemoryStats {
108         MemoryStats memoryStats;
109         bool isSmapsRollupSupported;
110         struct ProcessMemoryStats {
111             std::string comm = "";
112             MemoryStats memoryStats;
113         };
114         std::vector<ProcessMemoryStats> topNProcesses = {};
115     };
116 
117     UserPackageStats(MetricType metricType, const UidStats& uidStats);
118     UserPackageStats(ProcStatType procStatType, const UidStats& uidStats, int topNProcessCount,
119                      bool isSmapsRollupSupported);
120 
121     // Class must be DefaultInsertable for std::vector<T>::resize to work
UserPackageStats()122     UserPackageStats() : uid(0), genericPackageName("") {}
123     // For unit test case only
UserPackageStats(uid_t uid,std::string genericPackageName,std::variant<std::monostate,UidIoSingleOpStats,UidSingleStats,UidCpuStats,UidMemoryStats> statsVariant)124     UserPackageStats(uid_t uid, std::string genericPackageName,
125                      std::variant<std::monostate, UidIoSingleOpStats, UidSingleStats,
126                                   UidCpuStats, UidMemoryStats>
127                              statsVariant) :
128           uid(uid),
129           genericPackageName(std::move(genericPackageName)),
130           statsVariant(std::move(statsVariant)) {}
131 
132     // Returns the primary value of the current UidStats. If the variant has value
133     // |std::monostate|, returns 0.
134     //
135     // This value should be used to sort the UidStats.
136     uint64_t getValue() const;
137     std::string toString(MetricType metricsType, const int64_t totalIoStats[][UID_STATES]) const;
138     std::string toString(int64_t totalValue) const;
139     std::string toString(int64_t totalRssKb, int64_t totalPssKb) const;
140 
141     uid_t uid;
142     std::string genericPackageName;
143     std::variant<std::monostate,
144                 UidIoSingleOpStats,
145                 UidSingleStats,
146                 UidCpuStats,
147                 UidMemoryStats>
148             statsVariant;
149 
150 private:
151     void cacheTopNProcessSingleStats(
152           ProcStatType procStatType, const UidStats& uidStats, int topNProcessCount,
153           std::vector<UserPackageStats::UidSingleStats::ProcessSingleStats>*
154               topNProcesses);
155     void cacheTopNProcessCpuStats(
156             const UidStats& uidStats, int topNProcessCount,
157             std::vector<UserPackageStats::UidCpuStats::ProcessCpuStats>*
158                 topNProcesses);
159     void cacheTopNProcessMemStats(
160             const UidStats& uidStats, int topNProcessCount, bool isSmapsRollupSupported,
161             std::vector<UserPackageStats::UidMemoryStats::ProcessMemoryStats>* topNProcesses);
162 };
163 
164 /**
165  * User package summary performance stats collected from the `/proc/uid_io/stats`,
166  * `/proc/[pid]/stat`, `/proc/[pid]/task/[tid]/stat`, and /proc/[pid]/status` files.
167  */
168 struct UserPackageSummaryStats {
169     std::vector<UserPackageStats> topNCpuTimes = {};
170     std::vector<UserPackageStats> topNIoReads = {};
171     std::vector<UserPackageStats> topNIoWrites = {};
172     std::vector<UserPackageStats> topNIoBlocked = {};
173     std::vector<UserPackageStats> topNMajorFaults = {};
174     std::vector<UserPackageStats> topNMemStats = {};
175     int64_t totalIoStats[METRIC_TYPES][UID_STATES] = {{0}};
176     std::unordered_map<uid_t, uint64_t> taskCountByUid = {};
177     // TODO(b/337115923): Clean up below duplicate fields and report `totalMajorFaults`,
178     //  `totalRssKb`, `totalPssKb`, and `majorFaultsPercentChange` as part of `SystemSummaryStats`.
179     int64_t totalCpuTimeMillis = 0;
180     uint64_t totalCpuCycles = 0;
181     uint64_t totalMajorFaults = 0;
182     uint64_t totalRssKb = 0;
183     uint64_t totalPssKb = 0;
184     // Percentage of increase/decrease in the major page faults since last collection.
185     double majorFaultsPercentChange = 0.0;
186     std::string toString() const;
187 };
188 
// TODO(b/268402964): Calculate the total CPU cycles using the per-UID BPF tool.
// System-wide performance stats collected from the `/proc/stat` file. Every counter starts
// at zero.
struct SystemSummaryStats {
    int64_t cpuIoWaitTimeMillis{0};
    int64_t cpuIdleTimeMillis{0};
    int64_t totalCpuTimeMillis{0};
    uint64_t totalCpuCycles{0};
    uint64_t contextSwitchesCount{0};
    uint32_t ioBlockedProcessCount{0};
    uint32_t totalProcessCount{0};

    // Returns a string representation of the system stats.
    std::string toString() const;
};
201 
202 // Performance record collected during a sampling/collection period.
203 struct PerfStatsRecord {
204     time_point_millis collectionTimeMillis;
205     SystemSummaryStats systemSummaryStats;
206     UserPackageSummaryStats userPackageSummaryStats;
207     std::unordered_map<PressureMonitorInterface::PressureLevel, std::chrono::milliseconds>
208             memoryPressureLevelDurations;
209     std::string toString() const;
210 };
211 
212 // Group of performance records collected for a collection event.
213 struct CollectionInfo {
214     size_t maxCacheSize = 0;               // Maximum cache size for the collection.
215     std::vector<PerfStatsRecord> records;  // Cache of collected performance records.
216     std::string toString() const;
217 };
218 
219 // Group of performance records collected for a user switch collection event.
220 struct UserSwitchCollectionInfo : CollectionInfo {
221     userid_t from = 0;
222     userid_t to = 0;
223 };
224 
225 // PerformanceProfiler implements the I/O performance data collection module.
226 class PerformanceProfiler final :
227       public DataProcessorInterface,
228       public PressureMonitorInterface::PressureChangeCallbackInterface {
229 public:
230     PerformanceProfiler(
231             const android::sp<PressureMonitorInterface>& pressureMonitor,
232             const std::function<int64_t()>& getElapsedTimeSinceBootMillisFunc = &elapsedRealtime) :
kPressureMonitor(pressureMonitor)233           kPressureMonitor(pressureMonitor),
234           kGetElapsedTimeSinceBootMillisFunc(getElapsedTimeSinceBootMillisFunc),
235           mTopNStatsPerCategory(0),
236           mTopNStatsPerSubcategory(0),
237           mMaxUserSwitchEvents(0),
238           mSystemEventDataCacheDurationSec(0),
239           // TODO(b/333722043): Once carwatchdogd has sys_ptrace capability, set
240           // mIsSmapsRollupSupported field from `android::meminfo::IsSmapsRollupSupported()`.
241           // Disabling smaps_rollup support because this file cannot be read without sys_ptrace
242           // capability.
243           mIsSmapsRollupSupported(false),
244           mIsMemoryProfilingEnabled(android::car::feature::car_watchdog_memory_profiling()),
245           mBoottimeCollection({}),
246           mPeriodicCollection({}),
247           mUserSwitchCollections({}),
248           mWakeUpCollection({}),
249           mCustomCollection({}),
250           mLastMajorFaults(0),
251           mDoSendResourceUsageStats(false),
252           mMemoryPressureLevelDeltaInfo(PressureLevelDeltaInfo(getElapsedTimeSinceBootMillisFunc)) {
253     }
254 
~PerformanceProfiler()255     ~PerformanceProfiler() { terminate(); }
256 
name()257     std::string name() const override { return "PerformanceProfiler"; }
258 
259     // Implements DataProcessorInterface.
260     android::base::Result<void> onSystemStartup() override;
261 
262     void onCarWatchdogServiceRegistered() override;
263 
264     android::base::Result<void> onBoottimeCollection(
265             time_point_millis time,
266             const android::wp<UidStatsCollectorInterface>& uidStatsCollector,
267             const android::wp<ProcStatCollectorInterface>& procStatCollector,
268             aidl::android::automotive::watchdog::internal::ResourceStats* resourceStats) override;
269 
270     android::base::Result<void> onWakeUpCollection(
271             time_point_millis time,
272             const android::wp<UidStatsCollectorInterface>& uidStatsCollector,
273             const android::wp<ProcStatCollectorInterface>& procStatCollector) override;
274 
275     android::base::Result<void> onPeriodicCollection(
276             time_point_millis time, SystemState systemState,
277             const android::wp<UidStatsCollectorInterface>& uidStatsCollector,
278             const android::wp<ProcStatCollectorInterface>& procStatCollector,
279             aidl::android::automotive::watchdog::internal::ResourceStats* resourceStats) override;
280 
281     android::base::Result<void> onUserSwitchCollection(
282             time_point_millis time, userid_t from, userid_t to,
283             const android::wp<UidStatsCollectorInterface>& uidStatsCollector,
284             const android::wp<ProcStatCollectorInterface>& procStatCollector) override;
285 
286     android::base::Result<void> onCustomCollection(
287             time_point_millis time, SystemState systemState,
288             const std::unordered_set<std::string>& filterPackages,
289             const android::wp<UidStatsCollectorInterface>& uidStatsCollector,
290             const android::wp<ProcStatCollectorInterface>& procStatCollector,
291             aidl::android::automotive::watchdog::internal::ResourceStats* resourceStats) override;
292 
onPeriodicMonitor(time_t time,const android::wp<ProcDiskStatsCollectorInterface> & procDiskStatsCollector,const std::function<void ()> & alertHandler)293     android::base::Result<void> onPeriodicMonitor(
294             [[maybe_unused]] time_t time,
295             [[maybe_unused]] const android::wp<ProcDiskStatsCollectorInterface>&
296                     procDiskStatsCollector,
297             [[maybe_unused]] const std::function<void()>& alertHandler) override {
298         // No monitoring done here as this DataProcessor only collects I/O performance records.
299         return {};
300     }
301 
302     android::base::Result<void> onDump(int fd) const override;
303 
304     android::base::Result<void> onDumpProto(
305             const CollectionIntervals& collectionIntervals,
306             android::util::ProtoOutputStream& outProto) const override;
307 
308     android::base::Result<void> onCustomCollectionDump(int fd) override;
309 
310     void onPressureChanged(PressureMonitorInterface::PressureLevel) override;
311 
312 protected:
313     android::base::Result<void> init();
314 
315     // Clears in-memory cache.
316     void terminate();
317 
318 private:
319     class PressureLevelDeltaInfo {
320     public:
PressureLevelDeltaInfo(const std::function<int64_t ()> & getElapsedTimeSinceBootMillisFunc)321         explicit PressureLevelDeltaInfo(
322                 const std::function<int64_t()>& getElapsedTimeSinceBootMillisFunc) :
323               kGetElapsedTimeSinceBootMillisFunc(getElapsedTimeSinceBootMillisFunc),
324               mLatestPressureLevel(PressureMonitorInterface::PRESSURE_LEVEL_NONE),
325               mLatestPressureLevelElapsedRealtimeMillis(getElapsedTimeSinceBootMillisFunc()) {}
326 
327         // Calculates the duration for the previously reported pressure level, updates it in
328         // mPressureLevelDurations, and sets the latest pressure level and its elapsed realtime.
329         void setLatestPressureLevelLocked(PressureMonitorInterface::PressureLevel pressureLevel);
330 
331         // Returns the latest pressure stats and flushes stats to mPressureLevelDurations.
332         std::unordered_map<PressureMonitorInterface::PressureLevel, std::chrono::milliseconds>
333         onCollectionLocked();
334 
335     private:
336         // Updated by test for mocking elapsed time.
337         const std::function<int64_t()> kGetElapsedTimeSinceBootMillisFunc;
338 
339         // Latest pressure level reported by the PressureMonitor.
340         PressureMonitorInterface::PressureLevel mLatestPressureLevel;
341 
342         // Time when the latest pressure level was recorded. Used to calculate
343         // pressureLevelDurations.
344         int64_t mLatestPressureLevelElapsedRealtimeMillis = 0;
345 
346         // Duration spent in different pressure levels since the last poll.
347         std::unordered_map<PressureMonitorInterface::PressureLevel, std::chrono::milliseconds>
348                 mPressureLevelDurations = {};
349     };
350 
351     // Processes the collected data.
352     android::base::Result<void> processLocked(
353             time_point_millis time, SystemState systemState,
354             const std::unordered_set<std::string>& filterPackages,
355             const android::sp<UidStatsCollectorInterface>& uidStatsCollector,
356             const android::sp<ProcStatCollectorInterface>& procStatCollector,
357             CollectionInfo* collectionInfo,
358             aidl::android::automotive::watchdog::internal::ResourceStats* resourceStats);
359 
360     // Processes per-UID performance data.
361     void processUidStatsLocked(
362             bool isGarageModeActive, int64_t totalCpuTimeMillis,
363             const std::unordered_set<std::string>& filterPackages,
364             const android::sp<UidStatsCollectorInterface>& uidStatsCollector,
365             std::vector<aidl::android::automotive::watchdog::internal::UidResourceUsageStats>*
366                     uidResourceUsageStats,
367             UserPackageSummaryStats* userPackageSummaryStats);
368 
369     // Processes system performance data from the `/proc/stat` file.
370     void processProcStatLocked(const android::sp<ProcStatCollectorInterface>& procStatCollector,
371                                SystemSummaryStats* systemSummaryStats) const;
372 
373     // Dump the user switch collection
374     android::base::Result<void> onUserSwitchCollectionDump(int fd) const;
375 
376     void clearExpiredSystemEventCollections(time_point_millis time);
377 
378     void dumpStatsRecordsProto(const CollectionInfo& collection,
379                                android::util::ProtoOutputStream& outProto) const;
380 
381     void dumpPackageCpuStatsProto(const std::vector<UserPackageStats>& userPackageStats,
382                                   android::util::ProtoOutputStream& outProto) const;
383 
384     void dumpPackageStorageIoStatsProto(const std::vector<UserPackageStats>& userPackageStats,
385                                         const uint64_t storageStatsFieldId,
386                                         android::util::ProtoOutputStream& outProto) const;
387 
388     void dumpPackageTaskStateStatsProto(const std::vector<UserPackageStats>& userPackageStats,
389                                         const std::unordered_map<uid_t, uint64_t>& taskCountByUid,
390                                         android::util::ProtoOutputStream& outProto) const;
391 
392     void dumpPackageMajorPageFaultsProto(const std::vector<UserPackageStats>& userPackageStats,
393                                          android::util::ProtoOutputStream& outProto) const;
394 
395     // Pressure monitor instance.
396     const android::sp<PressureMonitorInterface> kPressureMonitor;
397 
398     // Updated by test for mocking elapsed time.
399     const std::function<int64_t()> kGetElapsedTimeSinceBootMillisFunc;
400 
401     // Top N per-UID stats per category.
402     int mTopNStatsPerCategory;
403 
404     // Top N per-process stats per subcategory.
405     int mTopNStatsPerSubcategory;
406 
407     // Max amount of user switch events cached in |mUserSwitchCollections|.
408     size_t mMaxUserSwitchEvents;
409 
410     // Amount of seconds before a system event's cache is cleared.
411     std::chrono::seconds mSystemEventDataCacheDurationSec;
412 
413     // Smaps rollup is supported by kernel or not.
414     bool mIsSmapsRollupSupported;
415 
416     // Memory Profiling feature flag is enabled or not.
417     bool mIsMemoryProfilingEnabled;
418 
419     // Makes sure only one collection is running at any given time.
420     mutable Mutex mMutex;
421 
422     // Info for the boot-time collection event. The cache is persisted until system shutdown/reboot
423     // or a wake-up collection occurs.
424     CollectionInfo mBoottimeCollection GUARDED_BY(mMutex);
425 
426     // Info for the periodic collection event. The cache size is limited by
427     // |ro.carwatchdog.periodic_collection_buffer_size|.
428     CollectionInfo mPeriodicCollection GUARDED_BY(mMutex);
429 
430     // Cache for user switch collection events. Events are cached from oldest to newest.
431     std::vector<UserSwitchCollectionInfo> mUserSwitchCollections GUARDED_BY(mMutex);
432 
433     // Info for the wake-up collection event. Only the latest wake-up collection is cached.
434     CollectionInfo mWakeUpCollection GUARDED_BY(mMutex);
435 
436     // Info for the custom collection event. The info is cleared at the end of every custom
437     // collection.
438     CollectionInfo mCustomCollection GUARDED_BY(mMutex);
439 
440     // Major faults delta from last collection. Useful when calculating the percentage change in
441     // major faults since last collection.
442     uint64_t mLastMajorFaults GUARDED_BY(mMutex);
443 
444     // Enables the sending of resource usage stats to CarService.
445     bool mDoSendResourceUsageStats GUARDED_BY(mMutex);
446 
447     // Aggregated pressure level changes occurred since the last collection.
448     PressureLevelDeltaInfo mMemoryPressureLevelDeltaInfo GUARDED_BY(mMutex);
449 
450     friend class WatchdogPerfService;
451 
452     // For unit tests.
453     friend class internal::PerformanceProfilerPeer;
454 };
455 
456 }  // namespace watchdog
457 }  // namespace automotive
458 }  // namespace android
459