xref: /aosp_15_r20/external/bazelbuild-rules_python/python/private/pypi/parse_simpleapi_html.bzl (revision 60517a1edbc8ecf509223e9af94a7adec7d736b8)
1# Copyright 2024 The Bazel Authors. All rights reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7#     http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14
15"""
16Parse SimpleAPI HTML in Starlark.
17"""
18
def parse_simpleapi_html(*, url, content):
    """Extract the downloadable artifacts listed in a Simple API HTML page.

    The page is cut at every `<a href="` occurrence. The text before the first
    anchor is searched for the `pypi:repository-version` meta tag; every later
    piece is treated as one anchor describing a single artifact.

    Args:
        url(str): The URL that the HTML content can be downloaded from. It is
            used to turn the links found in the page into absolute URLs.
        content(str): The Simple API HTML content.

    Returns:
        A struct with two dicts, `sdists` and `whls`, both keyed by the sha256
        found in the link. A file whose name ends with `.whl` goes into `whls`,
        anything else goes into `sdists`. Each value is a struct with:
        * filename: The filename of the artifact.
        * url: The URL to download the artifact.
        * sha256: The sha256 of the artifact.
        * metadata_sha256: The whl METADATA sha256 if the anchor advertises it.
          If this is present, then the 'metadata_url' is also present.
          Defaults to "" and is always "" for sdists.
        * metadata_url: The URL for the METADATA if the anchor advertises it.
          Defaults to "" and is always "" for sdists.
        * yanked: True if the anchor text contains `data-yanked`.

        Fails the evaluation if the page declares an API version of 2.0 or
        newer.
    """
    chunks = content.split("<a href=\"")
    preamble = chunks[0]
    anchors = chunks[1:]

    version_text = preamble.partition("name=\"pypi:repository-version\" content=\"")[2]
    version_text = version_text.partition("\"")[0]

    # We must assume the 1.0 if it is not present
    # See https://packaging.python.org/en/latest/specifications/simple-repository-api/#clients
    if not version_text:
        version_text = "1.0"
    api_version = tuple([int(component) for component in version_text.split(".")])

    if api_version >= (2, 0):
        # We don't expect to have version 2.0 here, but have this check in place just in case.
        # https://packaging.python.org/en/latest/specifications/simple-repository-api/#versioning-pypi-s-simple-api
        fail("Unsupported API version: {}".format(api_version))

    whls = {}
    sdists = {}

    # Each anchor follows the following pattern
    # <a href="https://...#sha256=..." attribute1="foo" ... attributeN="bar">filename</a><br />
    for anchor in anchors:
        href, _, rest = anchor.partition("#sha256=")
        digest, _, rest = rest.partition("\"")

        # See https://packaging.python.org/en/latest/specifications/simple-repository-api/#adding-yank-support-to-the-simple-api
        is_yanked = "data-yanked" in anchor

        # Everything up to the last `</a>` holds the remaining attributes, the
        # closing `>` of the opening tag and finally the link text (the filename).
        opening_and_text = rest.rpartition("</a>")[0]
        attributes, _, artifact_name = opening_and_text.rpartition(">")

        artifact_url = _absolute_url(url, href)

        if not artifact_name.endswith(".whl"):
            # Metadata is only recorded for wheels.
            sdists[digest] = struct(
                filename = artifact_name,
                url = artifact_url,
                sha256 = digest,
                metadata_sha256 = "",
                metadata_url = "",
                yanked = is_yanked,
            )
            continue

        core_metadata_sha256 = ""
        core_metadata_url = ""
        for attribute_name in ["data-core-metadata", "data-dist-info-metadata"]:
            marker = attribute_name + "=\"sha256="
            if marker in attributes:
                # Implement https://peps.python.org/pep-0714/
                core_metadata_sha256 = attributes.partition(marker)[2].partition("\"")[0]
                core_metadata_url = _absolute_url(url, href + ".metadata")
                break

        whls[digest] = struct(
            filename = artifact_name,
            url = artifact_url,
            sha256 = digest,
            metadata_sha256 = core_metadata_sha256,
            metadata_url = core_metadata_url,
            yanked = is_yanked,
        )

    return struct(
        sdists = sdists,
        whls = whls,
    )
98
def _get_root_directory(url):
    """Return the `scheme://host` prefix of a URL.

    Args:
        url(str): A URL containing `://`.

    Returns:
        The scheme, the `://` separator and everything up to (not including)
        the first `/` that follows the separator. Fails if `://` is absent.
    """
    scheme, separator, remainder = url.partition("://")
    if not separator:
        fail("Invalid URL format")

    # The host is whatever precedes the first path separator, or the whole
    # remainder when the URL has no path at all.
    host = remainder.partition("/")[0]

    return scheme + "://" + host
111
def _is_downloadable(url):
    """Checks if the URL would be accepted by the Bazel downloader.

    This is based on Bazel's HttpUtils::isUrlSupportedByDownloader

    Args:
        url(str): The URL to check.

    Returns:
        True if the URL starts with `http://`, `https://` or `file://`,
        False otherwise.
    """
    for supported_prefix in ["http://", "https://", "file://"]:
        if url.startswith(supported_prefix):
            return True
    return False
118
def _absolute_url(index_url, candidate):
    """Turn a link found in an index page into an absolute URL.

    The index URL is treated as a directory: relative links are resolved
    beneath it (or above it for `..` references) regardless of whether it ends
    with a `/`.

    Args:
        index_url(str): The URL of the index page the link was found on.
        candidate(str): The link target. It may be empty, an already
            downloadable URL, a host-absolute path starting with `/`, or a
            relative path that may start with `..` references.

    Returns:
        The candidate unchanged if it is empty or already starts with a scheme
        accepted by `_is_downloadable`, otherwise the candidate resolved
        against `index_url`.
    """
    if candidate == "":
        return candidate

    if _is_downloadable(candidate):
        return candidate

    if candidate.startswith("/"):
        # absolute path: only the scheme and host of the index URL are reused
        root_directory = _get_root_directory(index_url)
        return "{}{}".format(root_directory, candidate)

    if candidate.startswith(".."):
        # relative path with up references: drop one trailing path segment of
        # the index URL for every `..` found in the candidate.
        # NOTE: the candidate is split on every `..`, so a `..` appearing
        # inside a later path segment or in the filename is counted as well.
        candidate_parts = candidate.split("..")
        last = candidate_parts[-1]
        for _ in range(len(candidate_parts) - 1):
            index_url = index_url.rstrip("/").rpartition("/")[0]

        return "{}/{}".format(index_url, last.strip("/"))

    # relative path without up-references. Strip the trailing slash of the
    # index URL first so that joining does not produce `//` in the result.
    return "{}/{}".format(index_url.rstrip("/"), candidate)
142