xref: /aosp_15_r20/external/toolchain-utils/llvm_tools/revert_checker.py (revision 760c253c1ed00ce9abd48f8546f08516e57485fe)
1#!/usr/bin/env python3
2# -*- coding: utf-8 -*-
3# ===----------------------------------------------------------------------===##
4#
5# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
6# See https://llvm.org/LICENSE.txt for license information.
7# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
8#
9# ===----------------------------------------------------------------------===##
10#
11# !!!!!!!!!!!! NOTE !!!!!!!!!!!!
12# This is copied directly from upstream LLVM. Please make any changes upstream,
13# rather than to this file directly. Once changes are made there, you're free
14# to integrate them here.
15
16"""Checks for reverts of commits across a given git commit.
17
18To clarify the meaning of 'across' with an example, if we had the following
19commit history (where `a -> b` notes that `b` is a direct child of `a`):
20
21123abc -> 223abc -> 323abc -> 423abc -> 523abc
22
23And where 423abc is a revert of 223abc, this revert is considered to be 'across'
24323abc. More generally, a revert A of a parent commit B is considered to be
25'across' a commit C if C is a parent of A and B is a parent of C.
26
27Please note that revert detection in general is really difficult, since merge
28conflicts/etc always introduce _some_ amount of fuzziness. This script just
29uses a bundle of heuristics, and is bound to ignore / incorrectly flag some
30reverts. The hope is that it'll easily catch the vast majority (>90%) of them,
31though.
32
33This is designed to be used in one of two ways: an import in Python, or run
34directly from a shell. If you want to import this, the `find_reverts`
35function is the thing to look at. If you'd rather use this from a shell, have a
36usage example:
37
38```
39./revert_checker.py c47f97169 origin/main origin/release/12.x
40```
41
42This checks for all reverts from the tip of origin/main to c47f97169, which are
43across the latter. It then does the same for origin/release/12.x to c47f97169.
44Duplicate reverts discovered when walking both roots (origin/main and
45origin/release/12.x) are deduplicated in output.
46"""
47
48import argparse
49import collections
50import logging
51import re
52import subprocess
53import sys
54from typing import Generator, Iterable, List, NamedTuple
55
56
57assert sys.version_info >= (3, 6), "Only Python 3.6+ is supported."
58
59# People are creative with their reverts, and heuristics are a bit difficult.
60# Like 90% of of reverts have "This reverts commit ${full_sha}".
61# Some lack that entirely, while others have many of them specified in ad-hoc
62# ways, while others use short SHAs and whatever.
63#
64# The 90% case is trivial to handle (and 100% free + automatic). The extra 10%
65# starts involving human intervention, which is probably not worth it for now.
66
67
68def _try_parse_reverts_from_commit_message(commit_message: str) -> List[str]:
69    if not commit_message:
70        return []
71
72    results = re.findall(
73        r"This reverts commit ([a-f0-9]{40})\b", commit_message
74    )
75
76    first_line = commit_message.splitlines()[0]
77    initial_revert = re.match(r'Revert ([a-f0-9]{6,}) "', first_line)
78    if initial_revert:
79        results.append(initial_revert.group(1))
80    return results
81
82
83def _stream_stdout(command: List[str]) -> Generator[str, None, None]:
84    with subprocess.Popen(
85        command, stdout=subprocess.PIPE, encoding="utf-8", errors="replace"
86    ) as p:
87        assert p.stdout is not None  # for mypy's happiness.
88        yield from p.stdout
89
90
91def _resolve_sha(git_dir: str, sha: str) -> str:
92    if len(sha) == 40:
93        return sha
94
95    return subprocess.check_output(
96        ["git", "-C", git_dir, "rev-parse", sha],
97        encoding="utf-8",
98        stderr=subprocess.DEVNULL,
99    ).strip()
100
101
102_LogEntry = NamedTuple(
103    "_LogEntry",
104    [
105        ("sha", str),
106        ("commit_message", str),
107    ],
108)
109
110
111def _log_stream(
112    git_dir: str, root_sha: str, end_at_sha: str
113) -> Iterable[_LogEntry]:
114    sep = 50 * "<>"
115    log_command = [
116        "git",
117        "-C",
118        git_dir,
119        "log",
120        "^" + end_at_sha,
121        root_sha,
122        "--format=" + sep + "%n%H%n%B%n",
123    ]
124
125    stdout_stream = iter(_stream_stdout(log_command))
126
127    # Find the next separator line. If there's nothing to log, it may not exist.
128    # It might not be the first line if git feels complainy.
129    found_commit_header = False
130    for line in stdout_stream:
131        if line.rstrip() == sep:
132            found_commit_header = True
133            break
134
135    while found_commit_header:
136        sha = next(stdout_stream, None)
137        assert sha is not None, "git died?"
138        sha = sha.rstrip()
139
140        commit_message = []
141
142        found_commit_header = False
143        for line in stdout_stream:
144            line = line.rstrip()
145            if line.rstrip() == sep:
146                found_commit_header = True
147                break
148            commit_message.append(line)
149
150        yield _LogEntry(sha, "\n".join(commit_message).rstrip())
151
152
153def _shas_between(git_dir: str, base_ref: str, head_ref: str) -> Iterable[str]:
154    rev_list = [
155        "git",
156        "-C",
157        git_dir,
158        "rev-list",
159        "--first-parent",
160        f"{base_ref}..{head_ref}",
161    ]
162    return (x.strip() for x in _stream_stdout(rev_list))
163
164
165def _rev_parse(git_dir: str, ref: str) -> str:
166    return subprocess.check_output(
167        ["git", "-C", git_dir, "rev-parse", ref],
168        encoding="utf-8",
169    ).strip()
170
171
172Revert = NamedTuple(
173    "Revert",
174    [
175        ("sha", str),
176        ("reverted_sha", str),
177    ],
178)
179
180
181def _find_common_parent_commit(git_dir: str, ref_a: str, ref_b: str) -> str:
182    """Finds the closest common parent commit between `ref_a` and `ref_b`."""
183    return subprocess.check_output(
184        ["git", "-C", git_dir, "merge-base", ref_a, ref_b],
185        encoding="utf-8",
186    ).strip()
187
188
189def find_reverts(git_dir: str, across_ref: str, root: str) -> List[Revert]:
190    """Finds reverts across `across_ref` in `git_dir`, starting from `root`.
191
192    These reverts are returned in order of oldest reverts first.
193    """
194    across_sha = _rev_parse(git_dir, across_ref)
195    root_sha = _rev_parse(git_dir, root)
196
197    common_ancestor = _find_common_parent_commit(git_dir, across_sha, root_sha)
198    if common_ancestor != across_sha:
199        raise ValueError(
200            f"{across_sha} isn't an ancestor of {root_sha} "
201            "(common ancestor: {common_ancestor})"
202        )
203
204    intermediate_commits = set(_shas_between(git_dir, across_sha, root_sha))
205    assert across_sha not in intermediate_commits
206
207    logging.debug(
208        "%d commits appear between %s and %s",
209        len(intermediate_commits),
210        across_sha,
211        root_sha,
212    )
213
214    all_reverts = []
215    for sha, commit_message in _log_stream(git_dir, root_sha, across_sha):
216        reverts = _try_parse_reverts_from_commit_message(commit_message)
217        if not reverts:
218            continue
219
220        resolved_reverts = sorted(
221            set(_resolve_sha(git_dir, x) for x in reverts)
222        )
223        for reverted_sha in resolved_reverts:
224            if reverted_sha in intermediate_commits:
225                logging.debug(
226                    "Commit %s reverts %s, which happened after %s",
227                    sha,
228                    reverted_sha,
229                    across_sha,
230                )
231                continue
232
233            try:
234                object_type = subprocess.check_output(
235                    ["git", "-C", git_dir, "cat-file", "-t", reverted_sha],
236                    encoding="utf-8",
237                    stderr=subprocess.DEVNULL,
238                ).strip()
239            except subprocess.CalledProcessError:
240                logging.warning(
241                    "Failed to resolve reverted object %s (claimed to be reverted "
242                    "by sha %s)",
243                    reverted_sha,
244                    sha,
245                )
246                continue
247
248            if object_type == "commit":
249                all_reverts.append(Revert(sha, reverted_sha))
250                continue
251
252            logging.error(
253                "%s claims to revert %s -- which isn't a commit -- %s",
254                sha,
255                object_type,
256                reverted_sha,
257            )
258
259    # Since `all_reverts` contains reverts in log order (e.g., newer comes before
260    # older), we need to reverse this to keep with our guarantee of older =
261    # earlier in the result.
262    all_reverts.reverse()
263    return all_reverts
264
265
266def _main() -> None:
267    parser = argparse.ArgumentParser(
268        description=__doc__,
269        formatter_class=argparse.RawDescriptionHelpFormatter,
270    )
271    parser.add_argument(
272        "base_ref", help="Git ref or sha to check for reverts around."
273    )
274    parser.add_argument(
275        "-C", "--git_dir", default=".", help="Git directory to use."
276    )
277    parser.add_argument(
278        "root", nargs="+", help="Root(s) to search for commits from."
279    )
280    parser.add_argument("--debug", action="store_true")
281    parser.add_argument(
282        "-u",
283        "--review_url",
284        action="store_true",
285        help="Format SHAs as llvm review URLs",
286    )
287    opts = parser.parse_args()
288
289    logging.basicConfig(
290        format="%(asctime)s: %(levelname)s: %(filename)s:%(lineno)d: %(message)s",
291        level=logging.DEBUG if opts.debug else logging.INFO,
292    )
293
294    # `root`s can have related history, so we want to filter duplicate commits
295    # out. The overwhelmingly common case is also to have one root, and it's way
296    # easier to reason about output that comes in an order that's meaningful to
297    # git.
298    seen_reverts = set()
299    all_reverts = []
300    for root in opts.root:
301        for revert in find_reverts(opts.git_dir, opts.base_ref, root):
302            if revert not in seen_reverts:
303                seen_reverts.add(revert)
304                all_reverts.append(revert)
305
306    for revert in all_reverts:
307        sha_fmt = (
308            f"https://reviews.llvm.org/rG{revert.sha}"
309            if opts.review_url
310            else revert.sha
311        )
312        reverted_sha_fmt = (
313            f"https://reviews.llvm.org/rG{revert.reverted_sha}"
314            if opts.review_url
315            else revert.reverted_sha
316        )
317        print(f"{sha_fmt} claims to revert {reverted_sha_fmt}")
318
319
320if __name__ == "__main__":
321    _main()
322