xref: /aosp_15_r20/external/toolchain-utils/llvm_tools/werror_logs.py (revision 760c253c1ed00ce9abd48f8546f08516e57485fe)
1#!/usr/bin/env python3
2# Copyright 2024 The ChromiumOS Authors
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Helps reason about -Werror logs emitted by the compiler wrapper.
7
8Specifically, this works with the -Werror reports produced by the compiler
9wrapper in FORCE_DISABLE_WERROR mode. It's intended to be run on trees of these
10reports, so devs can run roughly the following commands:
11
12$ apply_force_disable_werror  # (There's no actual script to do this today.)
13$ build_packages --board=foo --nousepkg
14$ ./werror_logs.py aggregate --directory=/build/foo/var/lib/chromeos
15
16And see a full aggregation of all warnings that were suppressed in that
17`build_packages` invocation.
18
19It can also be used to fetch warnings reports from CQ runs, for instance,
20$ ./werror_logs.py fetch-cq --cq-orchestrator-id=123456
21
22In this case, it downloads _all -Werror logs_ from children of the given
23cq-orchestrator, and prints the parent directory of all of these reports. If
24you run `aggregate` on this directory, it's highly recommended to use the
25`--canonicalize-board-roots` flag.
26"""
27
28import argparse
29import collections
30import dataclasses
31import json
32import logging
33import multiprocessing.pool
34import os
35from pathlib import Path
36import re
37import shutil
38import subprocess
39import sys
40import tempfile
41import threading
42from typing import Any, Counter, DefaultDict, Dict, IO, Iterable, List, Optional
43
44import cros_cls
45
46
# Default parent directory for `fetch-cq` downloads; each run is stored in a
# subdirectory named after its cq-orchestrator build ID.
_DEFAULT_FETCH_DIRECTORY = Path("/tmp/werror_logs")
48
49
def canonicalize_file_path_board_root(file_path: str) -> str:
    """Replaces a leading board root (e.g., /build/atlas) with /build/{board}.

    The path is normalized first (duplicate slashes, `foo/../bar`, etc. are
    collapsed), which also makes the result easier to read.
    """
    normalized = os.path.normpath(file_path)
    prefix = "/build/"
    if not normalized.startswith(prefix):
        return normalized
    board_end = normalized.find("/", len(prefix))
    if board_end == -1:
        return normalized
    return f"/build/{{board}}/{normalized[board_end + 1:]}"
59
60
@dataclasses.dataclass(frozen=True, eq=True, order=True)
class ClangWarningLocation:
    """A single source location (file, line, column) of a Clang warning."""

    file: str
    line: int
    column: int

    @classmethod
    def parse(
        cls, location: str, canonicalize_board_root: bool = False
    ) -> "ClangWarningLocation":
        """Parses a `file:line:column` string into a ClangWarningLocation.

        Raises:
            ValueError: if `location` isn't of the form `file:line:column`.
        """
        pieces = location.rsplit(":", 2)
        if len(pieces) != 3:
            raise ValueError(f"Invalid location: {location!r}")
        file, line, column = pieces
        if canonicalize_board_root:
            file = canonicalize_file_path_board_root(file)
        return cls(file=file, line=int(line), column=int(column))
80
81
@dataclasses.dataclass(frozen=True, eq=True)
class ClangWarning:
    """A single Clang diagnostic, keyed by flag, message, and location."""

    # Name of the controlling flag, e.g., -Wunused-variable.
    name: str
    # Diagnostic text, e.g., "'allocate' is deprecated."
    message: str
    # Where the diagnostic fired; None for driver/frontend diagnostics.
    location: Optional[ClangWarningLocation]

    # Matches both shapes of -Werror output:
    # 1. `clang-17: error: foo [-W...]`
    # 2. `/file/path:123:45: error: foo [-W...]`
    _WARNING_RE = re.compile(
        # Capture the location on its own, since `clang-\d+` is unused below.
        r"^(?:([^:]*:\d+:\d+)|clang-\d+)"
        r": error: "
        # Capture the message
        r"(.*?)\s+"
        r"\[(-W[^\][]+)]\s*$"
    )

    @classmethod
    def try_parse_line(
        cls, line: str, canonicalize_board_root: bool = False
    ) -> Optional["ClangWarning"]:
        """Parses one line of compiler output; None if it isn't a warning.

        Raises:
            ValueError: if the line carries anything other than exactly one
                warning flag once -Werror is excluded.
        """
        # Cheap rejection: every line we care about contains "error: ", so
        # skip the regex machinery for everything else.
        if "error: " not in line:
            return None

        match = cls._WARNING_RE.fullmatch(line)
        if match is None:
            return None

        location, message, all_flags = match.groups()
        flags = [x for x in all_flags.split(",") if x != "-Werror"]

        # Multiple flags aren't impossible to handle in theory, just
        # unexpected. Complain about it.
        if len(flags) != 1:
            raise ValueError(
                f"Weird: parsed warnings {flags} out "
                f"of {line}"
            )

        parsed_location = (
            None
            if location is None
            else ClangWarningLocation.parse(location, canonicalize_board_root)
        )
        return cls(name=flags[0], message=message, location=parsed_location)
142
143
@dataclasses.dataclass(frozen=True, eq=True)
class WarningInfo:
    """Bookkeeping for one distinct warning.

    `packages` maps a package name to the number of times this warning was
    emitted while building that package.
    """

    packages: DefaultDict[str, int] = dataclasses.field(
        default_factory=lambda: collections.defaultdict(int)
    )
151
152
class UnknownPackageNameError(ValueError):
    """Signals that a report's originating package couldn't be identified."""
155
156
@dataclasses.dataclass
class AggregatedWarnings:
    """Accumulates warnings across many -Werror report files."""

    # Total number of report files fed into this aggregation.
    num_reports: int = 0
    # Warning -> info about which packages hit it (and how often). A warning
    # in a shared header may show up under several packages.
    warnings: DefaultDict[ClangWarning, WarningInfo] = dataclasses.field(
        default_factory=lambda: collections.defaultdict(WarningInfo)
    )

    _CWD_PACKAGE_RE = re.compile(
        r"^(?:/build/[^/]+)?/var/(?:cache|tmp)/portage/([^/]+/[^/]+)/"
    )

    @classmethod
    def _guess_package_name(cls, report: Dict[str, Any]) -> str:
        """Infers which portage package `report` came from, via its cwd.

        Raises:
            UnknownPackageNameError if the package's name couldn't be
            determined.
        """
        match = cls._CWD_PACKAGE_RE.match(report.get("cwd", ""))
        if match is None:
            raise UnknownPackageNameError()
        return match.group(1)

    def add_report_json(
        self, report_json: Dict[str, Any], canonicalize_board_root: bool = False
    ) -> int:
        """Folds one parsed report into this aggregation.

        Returns:
            The number of warnings parsed out of the report.

        Raises:
            UnknownPackageNameError if the package's name couldn't be
            determined.
        """
        self.num_reports += 1
        package = self._guess_package_name(report_json)

        parsed = 0
        for line in report_json.get("stdout", "").splitlines():
            warning = ClangWarning.try_parse_line(line, canonicalize_board_root)
            if warning is not None:
                self.warnings[warning].packages[package] += 1
                parsed += 1
        return parsed

    def add_report(
        self, report_file: Path, canonicalize_board_root: bool = False
    ) -> None:
        """Loads the JSON report at `report_file` and aggregates it.

        Reports whose package can't be guessed are logged and skipped rather
        than aborting the whole aggregation.
        """
        with report_file.open(encoding="utf-8") as f:
            report = json.load(f)

        try:
            num_parsed = self.add_report_json(report, canonicalize_board_root)
        except UnknownPackageNameError:
            logging.warning(
                "Failed guessing package name for report at %r; ignoring file",
                report_file,
            )
            return

        if not num_parsed:
            logging.warning(
                "Report at %r had no parseable warnings", report_file
            )
226
227
def print_aligned_counts(
    name_count_map: Dict[str, int], file: Optional[IO[str]] = None
) -> None:
    """Prints tab-indented `name: count` rows, highest count first.

    Counts are comma-grouped and right-aligned; ties on count are broken
    alphabetically by name. `name_count_map` must be non-empty.
    """
    assert name_count_map
    rows = sorted(name_count_map.items(), key=lambda kv: (-kv[1], kv[0]))
    # Column widths come from the widest entries: the first row carries the
    # largest count, and any name might be the longest one.
    count_width = len(f"{rows[0][1]:,}")
    name_width = max(map(len, name_count_map))
    for name, count in rows:
        print(
            f"\t{name.rjust(name_width)}: {f'{count:,}'.rjust(count_width)}",
            file=file,
        )
240
241
def summarize_per_package_warnings(
    warning_infos: Iterable[WarningInfo],
    file: Optional[IO[str]] = None,
) -> None:
    """Prints the total number of suppressed warnings per package.

    Prints nothing at all when there are no warnings.
    """
    totals: DefaultDict[str, int] = collections.defaultdict(int)
    for info in warning_infos:
        for package, count in info.packages.items():
            totals[package] += count

    if totals:
        print("## Per-package warning counts:", file=file)
        print_aligned_counts(totals, file=file)
256
257
def summarize_warnings_by_flag(
    warnings: Dict[ClangWarning, WarningInfo],
    file: Optional[IO[str]] = None,
) -> None:
    """Prints how often each -W flag fired, summed across all packages.

    Prints nothing at all when there are no warnings.
    """
    if not warnings:
        return

    per_flag: Counter[str] = collections.Counter()
    for warning, info in warnings.items():
        per_flag[warning.name] += sum(info.packages.values())

    print("## Instances of each fatal warning:", file=file)
    print_aligned_counts(per_flag, file=file)
271
272
def aggregate_reports(opts: argparse.Namespace) -> None:
    """Implements the `aggregate` subcommand.

    Recursively discovers warnings_report*.json files under
    `opts.directory`, aggregates them, and prints summaries to stdout.

    Raises:
        ValueError: if no report files were found at all.
    """
    aggregated = AggregatedWarnings()
    for report_file in opts.directory.glob("**/warnings_report*.json"):
        logging.debug("Discovered report %s", report_file)
        aggregated.add_report(report_file, opts.canonicalize_board_roots)

    if not aggregated.num_reports:
        raise ValueError(f"Found no warnings report under {opts.directory}")

    logging.info("Discovered %d report files in total", aggregated.num_reports)
    summarize_per_package_warnings(aggregated.warnings.values())
    summarize_warnings_by_flag(aggregated.warnings)
286
287
def fetch_werror_tarball_links(
    child_builders: Dict[str, cros_cls.BuildID]
) -> List[str]:
    """Finds -Werror tarballs in the artifacts of the given builders.

    Args:
        child_builders: map of builder name -> build ID to inspect.

    Returns:
        gs:// URLs of every `.fatal_clang_warnings.tar.xz` artifact found.
        Builders without artifacts are skipped with a log message.
    """
    outputs = cros_cls.CQBoardBuilderOutput.fetch_many(child_builders.values())
    artifacts_links = []
    for builder_name, out in zip(child_builders, outputs):
        if out.artifacts_link:
            artifacts_links.append(out.artifacts_link)
        else:
            logging.info("%s had no output artifacts; ignoring", builder_name)

    # Guard against no builder having artifacts: `gsutil ls` with no URL
    # arguments falls back to listing buckets in the default project rather
    # than listing nothing.
    if not artifacts_links:
        return []

    gsutil_stdout = subprocess.run(
        ["gsutil", "-m", "ls"] + artifacts_links,
        check=True,
        encoding="utf-8",
        stdin=subprocess.DEVNULL,
        stdout=subprocess.PIPE,
    ).stdout

    return [
        x
        for x in gsutil_stdout.splitlines()
        if x.endswith(".fatal_clang_warnings.tar.xz")
    ]
312
313
def cq_builder_name_from_werror_logs_path(werror_logs: str) -> str:
    """Extracts the CQ builder name from a -Werror logs gs:// path.

    The builder name is the grandparent directory of the tarball, e.g.,

    >>> cq_builder_name_from_werror_logs_path(
            "gs://chromeos-image-archive/staryu-cq/"
            "R123-15771.0.0-94466-8756713501925941617/"
            "staryu.20240207.fatal_clang_warnings.tar.xz"
        )
    "staryu-cq"
    """
    grandparent_dir = os.path.dirname(os.path.dirname(werror_logs))
    return os.path.basename(grandparent_dir)
325
326
def download_and_unpack_werror_tarballs(
    unpack_dir: Path, download_dir: Path, gs_urls: List[str]
) -> None:
    """Downloads every tarball in `gs_urls` and unpacks it.

    Each URL gets its own subdirectory, named after its CQ builder, under
    both `download_dir` (the tarball itself) and `unpack_dir` (its unpacked
    contents). Both top-level directories are created here and must not
    already exist.

    Raises:
        ValueError: if any download or unpack step failed.
    """
    # This is necessary below when we're untarring files. It should trivially
    # always be the case, and assuming it makes testing easier.
    assert download_dir.is_absolute(), download_dir

    unpack_dir.mkdir()
    download_dir.mkdir()

    logging.info(
        "Fetching and unpacking %d -Werror reports; this may take a bit",
        len(gs_urls),
    )
    # Run the download in a threadpool since we can have >100 logs, and all of
    # this is heavily I/O-bound.
    # Max 8 downloads at a time is arbitrary, but should minimize the chance of
    # rate-limiting. Don't limit `tar xaf`, since those should be short-lived.
    download_limiter = threading.BoundedSemaphore(8)

    def download_one_url(
        unpack_dir: Path, download_dir: Path, gs_url: str
    ) -> Optional[subprocess.CalledProcessError]:
        """Downloads and unpacks -Werror logs from the given gs_url.

        Leaves the tarball in `download_dir`, and the unpacked version in
        `unpack_dir`.

        Returns:
            None if all went well; otherwise, returns the command that failed.
            All commands have stderr data piped in.
        """
        file_targ = download_dir / os.path.basename(gs_url)
        try:
            # Hold the semaphore only for the gsutil fetch; untarring is
            # local work and needn't be throttled.
            with download_limiter:
                subprocess.run(
                    ["gsutil", "cp", gs_url, file_targ],
                    check=True,
                    stdin=subprocess.DEVNULL,
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.PIPE,
                    encoding="utf-8",
                    errors="replace",
                )

            # N.B., file_targ is absolute, so running with `file_targ` while
            # changing `cwd` is safe.
            subprocess.run(
                ["tar", "xaf", file_targ],
                check=True,
                cwd=unpack_dir,
                stdin=subprocess.DEVNULL,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.PIPE,
                encoding="utf-8",
                errors="replace",
            )
        except subprocess.CalledProcessError as e:
            return e
        return None

    with multiprocessing.pool.ThreadPool() as thread_pool:
        download_futures = []
        for gs_url in gs_urls:
            # Per-builder subdirectories; mkdir() raises if two URLs map to
            # the same builder name, rather than silently mixing contents.
            name = cq_builder_name_from_werror_logs_path(gs_url)
            unpack_to = unpack_dir / name
            unpack_to.mkdir()
            download_to = download_dir / name
            download_to.mkdir()
            download_futures.append(
                (
                    name,
                    thread_pool.apply_async(
                        download_one_url, (unpack_to, download_to, gs_url)
                    ),
                )
            )

        # Collect all failures before raising, so every broken download is
        # logged in one pass.
        num_failures = 0
        for name, future in download_futures:
            result = future.get()
            if not result:
                continue

            num_failures += 1
            logging.error(
                "Downloading %s failed: running %r. Stderr: %r",
                name,
                result.cmd,
                result.stderr,
            )
    if num_failures:
        raise ValueError(f"{num_failures} download(s) failed.")
420
421
def fetch_cq_reports(opts: argparse.Namespace) -> None:
    """Implements the `fetch-cq` subcommand.

    Resolves a cq-orchestrator build (directly via --cq-orchestrator-id, or
    the newest completed run on --cl), downloads the -Werror tarballs of all
    of its child builders, and unpacks everything under one directory whose
    path is logged (and printed via logging) at the end.

    Raises:
        ValueError: if no cq-orchestrator, child builders, or -Werror logs
            could be found.
    """
    if opts.cl:
        logging.info(
            "Fetching most recent completed CQ orchestrator from %s", opts.cl
        )
        all_ids = cros_cls.fetch_cq_orchestrator_ids(opts.cl)
        if not all_ids:
            raise ValueError(
                f"No CQ orchestrators found under {opts.cl}. See --help for "
                "how to pass a build ID directly."
            )
        # Note that these cq-orchestrator runs are returned in oldest-to-newest
        # order. The user probably wants the newest run.
        cq_orchestrator_id = all_ids[-1]
        cq_orchestrator_url = cros_cls.builder_url(cq_orchestrator_id)
        logging.info("Checking CQ run %s", cq_orchestrator_url)
    else:
        cq_orchestrator_id = opts.cq_orchestrator_id
        cq_orchestrator_url = cros_cls.builder_url(cq_orchestrator_id)

    # This is the earliest point at which we can compute this directory with
    # certainty. Figure it out now and fail early if it exists.
    output_directory = opts.directory
    if not output_directory:
        output_directory = _DEFAULT_FETCH_DIRECTORY / str(cq_orchestrator_id)

    if output_directory.exists():
        if not opts.force:
            sys.exit(
                f"Directory at {output_directory} exists; not overwriting. "
                "Pass --force to overwrite."
            )
        # Actually _remove_ it when we have all logs unpacked and are able to
        # create the output directory with confidence.

    logging.info("Fetching info on child builders of %s", cq_orchestrator_url)
    child_builders = cros_cls.CQOrchestratorOutput.fetch(
        cq_orchestrator_id
    ).child_builders
    if not child_builders:
        raise ValueError(f"No child builders found for {cq_orchestrator_url}")

    logging.info(
        "%d child builders found; finding associated tarball links",
        len(child_builders),
    )
    werror_links = fetch_werror_tarball_links(child_builders)
    if not werror_links:
        raise ValueError(
            f"No -Werror logs found in children of {cq_orchestrator_url}"
        )

    logging.info("%d -Werror logs found", len(werror_links))
    # Download/unpack into a tempdir first, so a partial fetch never leaves a
    # half-populated output directory behind.
    with tempfile.TemporaryDirectory("werror_logs_fetch_cq") as t:
        tempdir = Path(t)
        unpack_dir = tempdir / "unpacked"
        download_and_unpack_werror_tarballs(
            unpack_dir=unpack_dir,
            download_dir=tempdir / "tarballs",
            gs_urls=werror_links,
        )

        if output_directory.exists():
            logging.info("Removing output directory at %s", output_directory)
            shutil.rmtree(output_directory)
        output_directory.parent.mkdir(parents=True, exist_ok=True)
        # (Convert these to strs to keep mypy happy.)
        shutil.move(str(unpack_dir), str(output_directory))
        logging.info(
            "CQ logs from %s stored in %s",
            cq_orchestrator_url,
            output_directory,
        )
495
496
def main(argv: List[str]) -> None:
    """Parses `argv`, sets up logging, and dispatches to a subcommand."""
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--debug", action="store_true", help="Enable debug logging"
    )
    subparsers = parser.add_subparsers(required=True)
    # b/318833638: `aggregate` and `fetch-cq` are the subcommands available
    # today; more may be added over time.
    aggregate = subparsers.add_parser(
        "aggregate",
        help="""
        Aggregate all -Werror reports beneath a directory. Note that this will
        traverse all children of the directory, so can be used either on
        unpacked -Werror reports from CQ builders, or can be used on e.g.,
        /build/cherry/var/lib/chromeos.
        """,
    )
    aggregate.set_defaults(func=aggregate_reports)
    aggregate.add_argument(
        "--canonicalize-board-roots",
        action="store_true",
        help="""
        Converts warnings paths starting with a board root (e.g., /build/atlas)
        to a form consistent across many boards.
        """,
    )
    aggregate.add_argument(
        "--directory", type=Path, required=True, help="Directory to inspect."
    )

    fetch_cq = subparsers.add_parser(
        "fetch-cq",
        help="Fetch all -Werror reports for a CQ run.",
    )
    fetch_cq.set_defaults(func=fetch_cq_reports)
    # Exactly one way to identify the CQ run must be given.
    cl_or_cq_orchestrator = fetch_cq.add_mutually_exclusive_group(required=True)
    cl_or_cq_orchestrator.add_argument(
        "--cl",
        type=cros_cls.ChangeListURL.parse_with_patch_set,
        help="Link to a CL to get the most recent cq-orchestrator from",
    )
    cl_or_cq_orchestrator.add_argument(
        "--cq-orchestrator-id",
        type=cros_cls.BuildID,
        help="""
        Build number for a cq-orchestrator run. Builders invoked by this are
        examined for -Werror logs.
        """,
    )
    fetch_cq.add_argument(
        "--directory",
        type=Path,
        help=f"""
        Directory to put downloaded -Werror logs in. Default is a subdirectory
        of {_DEFAULT_FETCH_DIRECTORY}.
        """,
    )
    fetch_cq.add_argument(
        "-f",
        "--force",
        action="store_true",
        help="Remove the directory at `--directory` if it exists",
    )

    opts = parser.parse_args(argv)

    logging.basicConfig(
        format=">> %(asctime)s: %(levelname)s: %(filename)s:%(lineno)d: "
        "%(message)s",
        level=logging.DEBUG if opts.debug else logging.INFO,
    )

    # `required=True` on the subparsers should make this unreachable.
    assert getattr(opts, "func", None), "Unknown subcommand?"
    opts.func(opts)
574
575
if __name__ == "__main__":
    # Skip argv[0] (the program name); main expects only the arguments.
    main(sys.argv[1:])
578