# Copyright 2017 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""This throttler removes repeated files sharing the same prefix, for example,
screenshots or dumps in the same folder. The dedupe logic does not compare
file content; instead, it sorts the files with the same prefix by modification
time and removes the files in the middle.
"""
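# A minimal sketch of the middle-slice idea (hypothetical file names; the real
# logic lives in _dedupe_files below):
#
#     files = ['a_1.png', 'a_2.png', 'a_3.png', 'a_4.png', 'a_5.png']
#     files[2:-1]  # -> ['a_3.png', 'a_4.png'] are the deletion candidates
#
# The slice bounds come from OLDEST_FILES_TO_KEEP_COUNT (2) and
# NEWEST_FILES_TO_KEEP_COUNT (1) defined below.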

import os
import re

try:
    from autotest_lib.client.bin.result_tools import delete_file_throttler
    from autotest_lib.client.bin.result_tools import result_info_lib
    from autotest_lib.client.bin.result_tools import throttler_lib
    from autotest_lib.client.bin.result_tools import utils_lib
except ImportError:
    # delete_file_throttler is also needed in this fallback path: throttle()
    # below references its NON_DELETABLE_FILE_PATH_PATTERNS.
    import delete_file_throttler
    import result_info_lib
    import throttler_lib
    import utils_lib


# Number of the oldest files to keep in each group.
OLDEST_FILES_TO_KEEP_COUNT = 2
# Number of the newest files to keep in each group.
NEWEST_FILES_TO_KEEP_COUNT = 1
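# With these defaults, at most three files per (directory, prefix) group
# survive deduplication: the two oldest plus the single newest.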
# Files with paths matching the following patterns should not be deduped.
NO_DEDUPE_FILE_PATTERNS = [
        'debug/.*',
        r'.*perf\.data$',      # Performance test data.
        '.*/debug/.*',
        r'.*dir_summary_\d+\.json',
        ]
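# For example, 'sysinfo/var/log/debug/messages' matches '.*/debug/.*' above
# and is never deduped, while a hypothetical 'screenshots/screenshot_0001.png'
# matches none of the patterns and stays eligible.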

# Regex pattern to extract the prefix of a file name.
PREFIX_PATTERN = '([a-zA-Z_-]*).*'
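# For example (illustrative file names):
#     re.match(PREFIX_PATTERN, 'screenshot_20170101.png').group(1)
#     # -> 'screenshot_' (the first digit ends the prefix)
#     re.match(PREFIX_PATTERN, 'netlog.DEBUG').group(1)
#     # -> 'netlog'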
def _group_by(file_infos, keys):
    """Group the file infos by the given keys.

    @param file_infos: A list of ResultInfo objects.
    @param keys: A list of attribute names to group the file infos by.
    @return: A dictionary of grouped_key: [ResultInfo].
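
    Example (illustrative; SimpleNamespace stands in for ResultInfo, and a
    POSIX os.sep is assumed):

    >>> from types import SimpleNamespace
    >>> a = SimpleNamespace(parent_dir='/res', prefix='screenshot_')
    >>> b = SimpleNamespace(parent_dir='/res', prefix='screenshot_')
    >>> c = SimpleNamespace(parent_dir='/res', prefix='netlog')
    >>> sorted(_group_by([a, b, c], ['parent_dir', 'prefix']))
    ['/res/netlog', '/res/screenshot_']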
    """
    grouped_infos = {}
    for info in file_infos:
        key_values = []
        for key in keys:
            key_values.append(getattr(info, key))
        grouped_key = os.sep.join(key_values)
        if grouped_key not in grouped_infos:
            grouped_infos[grouped_key] = []
        grouped_infos[grouped_key].append(info)
    return grouped_infos


def _dedupe_files(summary, file_infos, max_result_size_KB):
    """Delete the middle slice of the given files and update the summary.

    @param summary: A ResultInfo object containing result summary.
    @param file_infos: A list of ResultInfo objects to be de-duplicated.
    @param max_result_size_KB: Maximum test result size in KB.
    """
    # Sort file infos by the last modification time of each file.
    file_infos.sort(
            key=lambda f: result_info_lib.get_last_modification_time(f.path))
    file_infos_to_delete = file_infos[
            OLDEST_FILES_TO_KEEP_COUNT:-NEWEST_FILES_TO_KEEP_COUNT]
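    # With the default keep counts (2 oldest, 1 newest), a sorted list of five
    # files [f0, f1, f2, f3, f4] is sliced to [f2, f3]; f0, f1 and f4 survive.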

    for file_info in file_infos_to_delete:
        if throttler_lib.try_delete_file_on_disk(file_info.path):
            file_info.trimmed_size = 0

            if throttler_lib.check_throttle_limit(summary, max_result_size_KB):
                return


def throttle(summary, max_result_size_KB):
    """Throttle the files in summary by de-duplicating files.

    Throttling stops once all files are processed or the result size has been
    reduced below the given max_result_size_KB.

    @param summary: A ResultInfo object containing result summary.
    @param max_result_size_KB: Maximum test result size in KB.
    """
    _, grouped_files = throttler_lib.sort_result_files(summary)
    # Respect the non-deletable patterns used by the delete throttler.
    keep_patterns = (NO_DEDUPE_FILE_PATTERNS +
                     delete_file_throttler.NON_DELETABLE_FILE_PATH_PATTERNS)
    for pattern in throttler_lib.RESULT_THROTTLE_PRIORITY:
        throttleable_files = list(
                throttler_lib.get_throttleable_files(grouped_files[pattern],
                                                     keep_patterns))

        for info in throttleable_files:
            info.parent_dir = os.path.dirname(info.path)
            info.prefix = re.match(PREFIX_PATTERN, info.name).group(1)
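            # E.g. a hypothetical info with path 'sysinfo/screenshot_1.png'
            # gets parent_dir 'sysinfo' and prefix 'screenshot_'.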

        # Group files by parent directory and file name prefix.
        grouped_infos = _group_by(throttleable_files, ['parent_dir', 'prefix'])

        for infos in grouped_infos.values():
            if (len(infos) <=
                OLDEST_FILES_TO_KEEP_COUNT + NEWEST_FILES_TO_KEEP_COUNT):
                # Not enough files in the group to be worth de-duplicating.
                continue

            # Delete the files that can be deduped.
            utils_lib.LOG('De-duplicating files in %s with the same prefix of '
                          '"%s"' % (infos[0].parent_dir, infos[0].prefix))
            _dedupe_files(summary, infos, max_result_size_KB)

            if throttler_lib.check_throttle_limit(summary, max_result_size_KB):
                return
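

# A minimal usage sketch (an assumption, not part of the original module):
# `summary` must be a ResultInfo for the results directory, built by the
# result_tools utilities; 20 * 1024 caps results at 20 MB.
#
#     throttle(summary, max_result_size_KB=20 * 1024)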
120