xref: /aosp_15_r20/external/bazelbuild-rules_python/python/private/pypi/simpleapi_download.bzl (revision 60517a1edbc8ecf509223e9af94a7adec7d736b8)
1# Copyright 2024 The Bazel Authors. All rights reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7#     http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14
15"""
16A file that houses private functions used in the `bzlmod` extension with the same name.
17"""
18
19load("@bazel_features//:features.bzl", "bazel_features")
20load("//python/private:auth.bzl", "get_auth")
21load("//python/private:envsubst.bzl", "envsubst")
22load("//python/private:normalize_name.bzl", "normalize_name")
23load(":parse_simpleapi_html.bzl", "parse_simpleapi_html")
24
def simpleapi_download(ctx, *, attr, cache, parallel_download = True):
    """Download Simple API HTML.

    Every package in `attr.sources` is looked up on its candidate index URLs in
    priority order (the main index first, then the extra ones) and the first
    index that yields usable contents wins. Results from several indexes are
    never merged. A package that has an entry in `index_url_overrides` is only
    looked up on that override URL.

    Args:
        ctx: The module_ctx or repository_ctx.
        attr: Contains the parameters for the download. They are grouped into a
          struct for better clarity. It must have attributes:
           * index_url: str, the index.
           * index_url_overrides: dict[str, str], the index overrides for
             separate packages.
           * extra_index_urls: Extra index URLs that will be looked up after
             the main is looked up.
           * sources: list[str], the sources to download things for. Each value is
             the contents of requirements files.
           * envsubst: list[str], the envsubst vars for performing substitution in index url.
           * netrc: The netrc parameter for ctx.download, see http_file for docs.
           * auth_patterns: The auth_patterns parameter for ctx.download, see
               http_file for docs.
        cache: A dictionary that can be used as a cache between calls during a
            single evaluation of the extension. We use a dictionary as a cache
            so that we can reuse calls to the simple API when evaluating the
            extension. Using the canonical_id parameter of the module_ctx would
            deposit the simple API responses to the bazel cache and that is
            undesirable because additions to the PyPI index would not be
            reflected when re-evaluating the extension unless we do
            `bazel clean --expunge`.
        parallel_download: A boolean to enable usage of bazel 7.1 non-blocking downloads.

    Returns:
        dict of normalized pkg name to the parsed HTML contents - a list of structs.
        Calls `fail` if a package cannot be read from any of its candidate URLs.
    """
    index_url_overrides = {
        normalize_name(p): i
        for p, i in (attr.index_url_overrides or {}).items()
    }

    # Only pass `block` when the running Bazel version understands it.
    download_kwargs = {}
    if bazel_features.external_deps.download_has_block_param:
        download_kwargs["block"] = not parallel_download

    # NOTE @aignas 2024-03-31: we are not merging results from multiple indexes
    # to replicate how `pip` would handle this case.
    index_urls = [attr.index_url] + (attr.extra_index_urls or [])

    contents = {}

    # pkg_normalized -> results in index priority order. An entry is either a
    # token with a `wait` method or an already available result struct.
    pending = {}

    # pkg_normalized -> the URLs that were tried, used for error reporting.
    pending_urls = {}

    for pkg in attr.sources:
        pkg_normalized = normalize_name(pkg)

        # An override replaces all of the indexes for the package, so query it
        # exactly once instead of once per configured index URL.
        if pkg_normalized in index_url_overrides:
            candidate_urls = [index_url_overrides[pkg_normalized]]
        else:
            candidate_urls = index_urls

        queued = []
        success = False
        for index_url in candidate_urls:
            result = _read_simpleapi(
                ctx = ctx,
                url = "{}/{}/".format(index_url.rstrip("/"), pkg),
                attr = attr,
                cache = cache,
                **download_kwargs
            )
            if hasattr(result, "wait"):
                # Non-blocking download, it is collected in the second loop.
                queued.append(result)
                continue

            if not result.success:
                continue

            if queued:
                # Downloads from higher priority indexes are still in flight.
                # Keep this result behind them so that the index order decides
                # the winner, exactly as it does for blocking downloads.
                queued.append(result)
            else:
                contents[pkg_normalized] = result.output

            # Lower priority indexes do not need to be queried any more.
            success = True
            break

        if queued:
            pending.setdefault(pkg_normalized, []).extend(queued)
            pending_urls.setdefault(pkg_normalized, []).extend(candidate_urls)
        elif not success:
            fail("Failed to download metadata for package '{}' from urls: {}".format(
                pkg,
                ", ".join(candidate_urls),
            ))

    # If we use `block` == False, then we need to have a second loop that is
    # collecting all of the results as they were being downloaded in parallel.
    for pkg_normalized, results in pending.items():
        # The package may already have been resolved without a download.
        resolved = pkg_normalized in contents
        for item in results:
            # Wait on every token, even after a success, so that each started
            # download is collected and its parsed contents end up in `cache`.
            result = item.wait() if hasattr(item, "wait") else item
            if result.success and not resolved:
                contents[pkg_normalized] = result.output
                resolved = True

        if not resolved:
            fail("Failed to download metadata for package '{}' from urls: {}".format(
                pkg_normalized,
                ", ".join(pending_urls[pkg_normalized]),
            ))

    return contents
125
def _read_simpleapi(ctx, url, attr, cache, **download_kwargs):
    """Fetch one Simple API page and parse it, reusing `cache` when possible.

    Args:
        ctx: The module_ctx or repository_ctx.
        url: str, the url of the page; it may contain envsubst placeholders.
        attr: The attribute that contains necessary info for downloading. The
          following attributes must be present:
           * envsubst: The envsubst values for performing substitutions in the URL.
           * netrc: The netrc parameter for ctx.download, see http_file for docs.
           * auth_patterns: The auth_patterns parameter for ctx.download, see
               http_file for docs.
        cache: A dict for storing the results, keyed by the substituted URL.
        **download_kwargs: Any extra params to ctx.download.
            Note that url, output, auth and allow_fail will be passed for you.

    Returns:
        Either a struct with `success` and, on success, `output` holding the
        parsed simple api contents, or, when `block = False` is passed in
        `download_kwargs`, a struct with a single `wait` function that returns
        such a struct once the download has finished.
    """
    # NOTE @aignas 2024-03-31: some of the simple APIs use relative URLs for
    # the whl location and we cannot handle multiple URLs at once by passing
    # them to ctx.download if we want to correctly handle the relative URLs.
    # TODO: Add a test that env subbed index urls do not leak into the lock file.

    getenv = ctx.getenv if hasattr(ctx, "getenv") else ctx.os.environ.get
    resolved_url = envsubst(url, attr.envsubst, getenv)

    # The substituted URL doubles as the cache key.
    if resolved_url in cache:
        return struct(success = True, output = cache[resolved_url])

    # Build the file name from the URL with the variable *names* substituted
    # in, not their values. Use env names in the subst values - this will be
    # unique over the lifetime of the execution of this function and we also
    # use `~` as the separator to ensure that we don't get clashes.
    placeholders = {name: "~{}~".format(name) for name in attr.envsubst}
    file_stem = envsubst(url, attr.envsubst, placeholders.get)

    # Transform the URL into a valid filename
    for separator in [".", ":", "/", "\\", "-"]:
        file_stem = file_stem.replace(separator, "_")
    html_path = ctx.path(file_stem.strip("_").lower() + ".html")

    # NOTE: this may have block = True or block = False in the download_kwargs
    token = ctx.download(
        url = [resolved_url],
        output = html_path,
        auth = get_auth(ctx, [resolved_url], ctx_attr = attr),
        allow_fail = True,
        **download_kwargs
    )

    if download_kwargs.get("block") != False:
        # Blocking download: `token` already is the download result.
        return _read_index_result(ctx, token, html_path, resolved_url, cache, resolved_url)

    # Simulate the same API as ctx.download has
    return struct(
        wait = lambda: _read_index_result(ctx, token.wait(), html_path, resolved_url, cache, resolved_url),
    )
192
def _read_index_result(ctx, result, output, url, cache, cache_key):
    """Turn a finished download into parsed Simple API contents.

    Args:
        ctx: The module_ctx or repository_ctx, used to read the downloaded file.
        result: The finished download result; only its `success` field is read.
        output: The path that the page was downloaded to.
        url: The URL the page was downloaded from, forwarded to the parser.
        cache: A dict for storing the parsed results.
        cache_key: The key under which the parsed result is stored in `cache`.

    Returns:
        `struct(success = False)` if the download failed or the parser returned
        a falsy value, otherwise a struct with `success = True`, the parsed
        contents in `output` and the `cache_key`.
    """
    if not result.success:
        return struct(success = False)

    parsed = parse_simpleapi_html(url = url, content = ctx.read(output))
    if not parsed:
        return struct(success = False)

    # setdefault keeps an entry that is already stored under the same key.
    cache.setdefault(cache_key, parsed)
    return struct(success = True, output = parsed, cache_key = cache_key)
205