# Copyright 2021 Google LLC

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     https://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from enum import IntEnum
import json
from multiprocessing import Pool
import pathlib

import numpy as np
import pandas as pd

BRANCH_ARTIFACTS_DIR = (
    pathlib.Path(__file__).parent.resolve()
    / "googleapiclient"
    / "discovery_cache"
    / "documents"
)
MAIN_ARTIFACTS_DIR = (
    pathlib.Path(__file__).parent.resolve()
    / ".."
    / "main"
    / "googleapiclient"
    / "discovery_cache"
    / "documents"
)

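# These constants tune the multiprocessing pool used in
# `detect_discovery_changes` below: NUM_AGENTS is the number of worker
# processes, and NUM_PER_BATCH is the chunk size handed to `Pool.map`, i.e.
# how many discovery files each worker receives per dispatch.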
MULTIPROCESSING_NUM_PER_BATCH = 5
MULTIPROCESSING_NUM_AGENTS = 10


class ChangeType(IntEnum):
    UNKNOWN = 0
    DELETED = 1
    ADDED = 2
    CHANGED = 3


class DirectoryDoesNotExist(ValueError):
    """Raised when the specified directory does not exist."""


class ChangeSummary:
    """Represents the change summary between two directories containing
    artifacts.
    """

    def __init__(self, new_artifacts_dir, current_artifacts_dir, temp_dir, file_list):
        """Initializes an instance of a ChangeSummary.

        Args:
            new_artifacts_dir (str): The relative path to the directory with the
                new discovery artifacts.
            current_artifacts_dir (str): The relative path to the directory with
                the current discovery artifacts.
            temp_dir (str): The relative path to the directory used for
                temporary storage where intermediate files will be stored.
            file_list (list): A list of strings containing files to analyze.
        """

        self._file_list = file_list
        self._new_artifacts_dir = pathlib.Path(new_artifacts_dir)
        self._current_artifacts_dir = pathlib.Path(current_artifacts_dir)
        self._temp_dir = pathlib.Path(temp_dir)

        # Sanity checks to ensure the directories exist
        self._raise_if_directory_not_found(self._new_artifacts_dir)
        self._raise_if_directory_not_found(self._current_artifacts_dir)
        self._raise_if_directory_not_found(self._temp_dir)

    def _raise_if_directory_not_found(self, directory):
        """Raises if the `directory` doesn't exist.

        Args:
            directory (str): The relative path to the `directory`.
        """

        if not pathlib.Path(directory).exists():
            raise DirectoryDoesNotExist(
                "Directory does not exist: {0}".format(directory)
            )

    def _load_json_to_dataframe(self, file_path):
        """Returns a pandas dataframe from the json file provided.

        Args:
            file_path (str): The relative path to the discovery artifact to
                parse.
        """

        # Create an empty dataframe as we will need to return it if the file
        # doesn't exist
        dataframe_doc = pd.DataFrame()

        if pathlib.Path(file_path).is_file():
            with open(file_path, "r") as f:
                # Now load the json file into a pandas dataframe as a flat table
                dataframe_doc = pd.json_normalize(json.load(f))
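                # Illustrative example (not from a real artifact): a nested
                # document such as {"auth": {"oauth2": {"scopes": {...}}}}
                # becomes a single-row dataframe whose columns are
                # dot-delimited keys such as "auth.oauth2.scopes".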
        return dataframe_doc

    def _get_discovery_differences(self, filename):
        """Returns a pandas dataframe which contains the differences between
        the current and new discovery artifact directories, corresponding to
        the file name provided.

        Args:
            filename (str): The name of the discovery artifact to parse.
        """
        # The paths of the 2 discovery artifacts to compare
        current_artifact_path = self._current_artifacts_dir / filename
        new_artifact_path = self._new_artifacts_dir / filename

        # Use a helper function to load the discovery artifacts into pandas
        # dataframes
        current_doc = self._load_json_to_dataframe(current_artifact_path)
        new_doc = self._load_json_to_dataframe(new_artifact_path)

        # Concatenate the 2 dataframes, transpose them, and create
        # a new dataframe called combined_docs with columns
        # `Key`, `CurrentValue`, `NewValue`.
        combined_docs = (
            pd.concat([current_doc, new_doc], keys=["CurrentValue", "NewValue"])
            # Drop the integer level of the multi-index, keeping only the
            # `CurrentValue`/`NewValue` labels
            .reset_index(drop=True, level=1)
            # Transpose the dataframe; the resulting columns should be
            # ["Key", "CurrentValue", "NewValue"]
            .rename_axis(["Key"], axis=1).transpose()
            # Move the keys out of the index and into the `Key` column
            .reset_index()
        )
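
        # Illustrative example of `combined_docs` at this point (values made
        # up for clarity):
        #
        #      Key               CurrentValue    NewValue
        #   0  kind              discovery#rest  discovery#rest
        #   1  schemas.Foo.type  object          object
        #   2  schemas.Foo.id    NaN             Foo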

        # When discovery documents are added, the column `CurrentValue` will
        # not exist. In that case, we'll just populate with `np.nan`.
        if "CurrentValue" not in combined_docs.columns:
            combined_docs["CurrentValue"] = np.nan

        # When discovery documents are deleted, the column `NewValue` will
        # not exist. In that case, we'll just populate with `np.nan`.
        if "NewValue" not in combined_docs.columns:
            combined_docs["NewValue"] = np.nan

        # Split the Key into 2 columns, `Parent` and `Child`, in order
        # to group keys with the same parent together and summarize the
        # changes by parent.
        parent_child_df = combined_docs["Key"].str.rsplit(".", n=1, expand=True)
        # Rename the columns and join them with the combined_docs dataframe.
        # If we only have a `Parent` column, it means that the Key doesn't have
        # any children.
        if len(parent_child_df.columns) == 1:
            parent_child_df.columns = ["Parent"]
        else:
            parent_child_df.columns = ["Parent", "Child"]
        combined_docs = combined_docs.join(parent_child_df)
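        # Illustrative example: the key "schemas.Foo.id" splits into
        # Parent="schemas.Foo" and Child="id", while a top-level key such as
        # "kind" ends up with Parent="kind" and no Child.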

        # Create a new column `Added` to identify rows which have new keys.
        combined_docs["Added"] = np.where(
            combined_docs["CurrentValue"].isnull(), True, False
        )

        # Create a new column `Deleted` to identify rows which have deleted keys.
        combined_docs["Deleted"] = np.where(
            combined_docs["NewValue"].isnull(), True, False
        )

        # Aggregate the keys added by grouping keys with the same parents
        # together to summarize the changes by parent rather than by key.
        parent_added_agg = (
            combined_docs.groupby("Parent")
            .Added.value_counts(normalize=True)
            .reset_index(name="Proportion")
        )
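
        # Illustrative example: if "schemas.Foo" has three child keys and all
        # three are new, the group yields a single row with Added=True and
        # Proportion=1.0; if only one of the three is new, we instead get
        # Added=True with Proportion of 1/3 and Added=False with 2/3.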

        # Add a column NumLevels to indicate the number of levels in the tree,
        # which will allow us to sort the parents in hierarchical order.
        parent_added_agg["NumLevels"] = (
            parent_added_agg["Parent"].str.split(".").apply(lambda x: len(x))
        )

        # Aggregate the keys deleted by grouping keys with the same parents
        # together to summarize the changes by parent rather than by key.
        parent_deleted_agg = (
            combined_docs.groupby("Parent")
            .Deleted.value_counts(normalize=True)
            .reset_index(name="Proportion")
        )

        # Add a column NumLevels to indicate the number of levels in the tree,
        # which will allow us to sort the parents in hierarchical order.
        parent_deleted_agg["NumLevels"] = (
            parent_deleted_agg["Parent"].str.split(".").apply(lambda x: len(x))
        )

        # Create a list of all parents that have been added, in hierarchical
        # order. When `Proportion` is 1, it means that the parent is new as all
        # children keys have been added.
        all_added = (
            parent_added_agg[
                (parent_added_agg["Proportion"] == 1)
                & (parent_added_agg["Added"] == True)
            ][["Parent", "NumLevels"]]
            .sort_values("NumLevels", ascending=True)
            .Parent.to_list()
        )

        # Create a list of all parents that have been deleted, in hierarchical
        # order. When `Proportion` is 1, it means that the parent is deleted as
        # all children keys have been deleted.
        all_deleted = (
            parent_deleted_agg[
                (parent_deleted_agg["Proportion"] == 1)
                & (parent_deleted_agg["Deleted"] == True)
            ][["Parent", "NumLevels"]]
            .sort_values("NumLevels", ascending=True)
            .Parent.to_list()
        )

        # Go through the list of parents that have been added. If we find any
        # keys whose parent starts with a parent in this list, then the entire
        # parent is new. We don't need verbose information about the children,
        # so we replace the parent.
        for word in all_added:
            combined_docs.Parent = np.where(
                combined_docs["Parent"].str.startswith(word), word, combined_docs.Parent
            )

        # Go through the list of parents that have been deleted. If we find any
        # keys whose parent starts with a parent in this list, then the entire
        # parent is deleted. We don't need verbose information about the
        # children, so we replace the parent.
        for word in all_deleted:
            combined_docs.Parent = np.where(
                combined_docs["Parent"].str.startswith(word), word, combined_docs.Parent
            )
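
        # Illustrative example: if "schemas.Foo" is entirely new, rows whose
        # Parent is "schemas.Foo.bar" or "schemas.Foo.baz" are relabeled with
        # Parent="schemas.Foo", so the summary reports one new parent rather
        # than every child key.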

        # Create a new dataframe with only the keys which have changed
        docs_diff = combined_docs[
            combined_docs["CurrentValue"] != combined_docs["NewValue"]
        ].copy(deep=False)

        # Get the API and Version from the file name but exclude the extension.
        api_version_string = filename.split(".")[:-1]
        # Create columns `Name` and `Version` using the version string
        docs_diff["Name"] = api_version_string[0]
        docs_diff["Version"] = ".".join(api_version_string[1:])
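        # Illustrative example: "bigquery.v2.json" yields Name="bigquery" and
        # Version="v2", and "sqladmin.v1beta4.json" yields Name="sqladmin" and
        # Version="v1beta4".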

        # These conditions are used as arguments in the `np.where` function
        # below.
        deleted_condition = docs_diff["NewValue"].isnull()
        added_condition = docs_diff["CurrentValue"].isnull()

        # Create a new `ChangeType` column. The `np.where()` function is like a
        # ternary operator. When the `deleted_condition` is `True`, the
        # `ChangeType` will be `ChangeType.DELETED`. If the `added_condition` is
        # `True`, the `ChangeType` will be `ChangeType.ADDED`; otherwise the
        # `ChangeType` will be `ChangeType.CHANGED`.
        docs_diff["ChangeType"] = np.where(
            deleted_condition,
            ChangeType.DELETED,
            np.where(added_condition, ChangeType.ADDED, ChangeType.CHANGED),
        )

        # Filter out keys which rarely affect functionality, for example:
        # {"description", "documentation", "enum", "etag", "revision", "title",
        # "url", "rootUrl"}
        docs_diff = docs_diff[
            ~docs_diff["Key"].str.contains(
                "|".join(self._get_keys_to_ignore()), case=False
            )
        ]

        # Group keys with similar parents together and create a new column
        # called 'Count' which indicates the number of keys that have been
        # grouped together. The reason for the count column is that when keys
        # have the same parent, we group them together to improve readability.
        docs_diff_with_count = (
            docs_diff.groupby(
                ["Parent", "Added", "Deleted", "Name", "Version", "ChangeType"]
            )
            .size()
            .reset_index(name="Count")
        )
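
        # Illustrative example: three changed keys that share the parent
        # "schemas.Foo" produce one group with Count=3, which is later
        # reported as a single consolidated row.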

        # Add the counts column
        docs_diff = docs_diff.merge(docs_diff_with_count)

        # When the count is greater than 1, update the key with the name of the
        # parent since we are consolidating keys with the same parent.
        docs_diff.loc[docs_diff["Count"] > 1, "Key"] = docs_diff["Parent"]

        return docs_diff[
            ["Key", "Added", "Deleted", "Name", "Version", "ChangeType", "Count"]
        ].drop_duplicates()

    def _build_summary_message(self, api_name, is_feature):
        """Returns a string containing the summary for a given api. The string
        returned will be in the format `fix(<api_name>): update the api` when
        `is_feature=False` and `feat(<api_name>): update the api` when
        `is_feature=True`.

        Args:
            api_name (str): The name of the api to include in the summary.
            is_feature (bool): If True, use the prefix `feat`; otherwise use
                `fix`.
        """
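
        # Illustrative examples of the returned value:
        #   _build_summary_message("bigquery", is_feature=True)
        #       -> "feat(bigquery): update the api"
        #   _build_summary_message("compute", is_feature=False)
        #       -> "fix(compute): update the api"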

        # Build the conventional commit string based on the arguments provided
        commit_type = "feat" if is_feature else "fix"
        return "{0}({1}): update the api".format(commit_type, api_name)

    def _get_keys_to_ignore(self):
        """Returns a list of strings with keys to ignore because they rarely
        affect functionality.

        Args: None
        """
        keys_to_ignore = [
            "description",
            "documentation",
            "enum",
            "etag",
            "revision",
            "title",
            "url",
            "rootUrl",
        ]
        return keys_to_ignore

    def _get_stable_versions(self, versions):
        """Returns a pandas series `pd.Series()` of boolean values,
        corresponding to the given series, indicating whether the version is
        considered stable or not.

        Args:
            versions (object): a pandas series containing version
                information for all discovery artifacts.
        """
        # Use a regex on the version to find versions with the pattern
        # <v><0-9>.<0-9>.<0-9>. Any api that matches this pattern will be
        # labeled as stable. In other words, v1, v1.4 and v1.4.5 are stable,
        # but v1b1, v1alpha and v1beta1 are not stable.
        return versions.str.extract(r"(v\d?\.?\d?\.?\d+$)").notnull()

    def _get_summary_and_write_to_disk(self, dataframe, directory):
        """Writes summary information to file about changes made to discovery
        artifacts based on the provided dataframe and returns the dataframe
        with the summary information added. The file `'allapis.dataframe'` is
        saved to the directory provided.

        Args:
            dataframe (object): a pandas dataframe containing summary change
                information for all discovery artifacts
            directory (str): path where the summary file should be saved
        """

        dataframe["IsStable"] = self._get_stable_versions(dataframe["Version"])

        # Create a filter for features, which contains only rows which have keys
        # that have been deleted or added, that will be used as an argument in
        # the `np.where()` call below.
        filter_features = (dataframe["ChangeType"] == ChangeType.DELETED) | (
            dataframe["ChangeType"] == ChangeType.ADDED
        )

        # Create a new column `IsFeature` to indicate which rows should be
        # considered as features.
        dataframe["IsFeature"] = np.where(filter_features, True, np.nan)

        # Create a new column `IsFeatureAggregate` which will be used to
        # summarize the api changes. We can either have feature or fix but not
        # both.
        dataframe["IsFeatureAggregate"] = dataframe.groupby("Name").IsFeature.transform(
            lambda x: x.any()
        )
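
        # Illustrative example: if any row for "bigquery" is a key addition or
        # deletion, every "bigquery" row gets IsFeatureAggregate=True and the
        # whole API is summarized as a `feat` commit; otherwise it is a `fix`.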

        # Create a new column `Summary`, which will contain a string with the
        # conventional commit message.
        dataframe["Summary"] = np.vectorize(self._build_summary_message)(
            dataframe["Name"], dataframe["IsFeatureAggregate"]
        )

        # Write the final dataframe to disk as it will be used in the
        # buildprbody.py script
        dataframe.to_csv(directory / "allapis.dataframe")
        return dataframe

    def _write_verbose_changes_to_disk(self, dataframe, directory, summary_df):
        """Writes verbose information to file about changes made to discovery
        artifacts based on the provided dataframe. A separate file is saved
        for each api in the directory provided. The extension of the files
        will be `'.verbose'`.

        Args:
            dataframe (object): a pandas dataframe containing verbose change
                information for all discovery artifacts
            directory (str): path where the verbose files should be saved
            summary_df (object): A dataframe containing a summary of the changes
        """
        # List of strings which will contain verbose change information for
        # each api
        verbose_changes = []

        # Sort the dataframe to minimize file operations below.
        dataframe.sort_values(
            by=["Name", "Version", "ChangeType"], ascending=True, inplace=True
        )

        # Select only the relevant columns. We need to create verbose output
        # by API Name, Version and ChangeType, so we group by these columns.
        change_type_groups = dataframe[
            ["Name", "Version", "ChangeType", "Key", "Count"]
        ].groupby(["Name", "Version", "ChangeType"])

        lastApi = ""
        lastVersion = ""
        lastType = ChangeType.UNKNOWN

        f = None
        for name, group in change_type_groups:
            currentApi = name[0]
            currentVersion = name[1]
            currentType = name[2]

            # We need to handle opening and closing files when processing an
            # API which is different from the previous one
            if lastApi != currentApi:
                # If we are processing a new api, close the file used for
                # processing the previous API
                if f is not None:
                    f.writelines(verbose_changes)
                    f.close()
                    f = None
                # Clear the list of strings with information from the previous
                # api and reset the last version
                verbose_changes = []
                lastVersion = ""
                # Create a file which contains verbose changes for the current
                # API being processed
                filename = "{0}.verbose".format(currentApi)
                f = open(pathlib.Path(directory / filename), "a")
                lastApi = currentApi

                # Create a filter with only the rows for the current API
                current_api_filter = summary_df["Name"] == currentApi

                # Get the string in the `Summary` column for the current api and
                # append it to `verbose_changes`. The `Summary` column contains
                # the conventional commit message. Use pandas.Series.iloc[0] to
                # retrieve only the first element, since all the values in the
                # summary column are the same for a given API.
                verbose_changes.append(summary_df[current_api_filter].Summary.iloc[0])

            # If the version has changed, we need to append a new heading
            # in the verbose summary which contains the api and version.
            if lastVersion != currentVersion:
                # Append a header string with the API and version
                verbose_changes.append(
                    "\n\n#### {0}:{1}\n\n".format(currentApi, currentVersion)
                )

                lastVersion = currentVersion
                lastType = ChangeType.UNKNOWN

            # Whenever the change type is different, we need to create a new
            # heading for the group of keys with the same change type.
            if currentType != lastType:
                if currentType == ChangeType.DELETED:
                    verbose_changes.append("\nThe following keys were deleted:\n")
                elif currentType == ChangeType.ADDED:
                    verbose_changes.append("\nThe following keys were added:\n")
                else:
                    verbose_changes.append("\nThe following keys were changed:\n")

                lastType = currentType

                # Append the keys, and corresponding count, in the same change
                # type group.
                verbose_changes.extend(
                    [
                        "- {0} (Total Keys: {1})\n".format(row["Key"], row["Count"])
                        for index, row in group[["Key", "Count"]].iterrows()
                    ]
                )

        # Make sure to close the last file and write the changes.
        if f is not None:
            f.writelines(verbose_changes)
            f.close()
            f = None

    def detect_discovery_changes(self):
        """Writes a summary of the changes to the discovery artifacts to disk
        at the path specified in `temp_dir`.

        Args: None
        """
        # Process files in parallel to improve performance
        with Pool(processes=MULTIPROCESSING_NUM_AGENTS) as pool:
            frames = pool.map(
                self._get_discovery_differences,
                self._file_list,
                chunksize=MULTIPROCESSING_NUM_PER_BATCH,
            )

        # `DataFrame.append` was removed in pandas 2.0, so concatenate the
        # per-file dataframes instead; fall back to an empty dataframe when
        # there are no files to analyze.
        result = pd.concat(frames) if frames else pd.DataFrame()

        if len(result):
            # Sort the resulting dataframe by `Name`, `Version`, `ChangeType`
            # and `Key`
            sort_columns = ["Name", "Version", "ChangeType", "Key"]
            result.sort_values(by=sort_columns, ascending=True, inplace=True)

            # Create a folder which will be used by the `createcommits.sh` and
            # `buildprbody.py` scripts.
            pathlib.Path(self._temp_dir).mkdir(exist_ok=True)

            # Create a summary which contains a conventional commit message
            # for each API and write it to disk.
            summary_df = self._get_summary_and_write_to_disk(result, self._temp_dir)

            # Create verbose change information for each API which contains
            # a list of changes by key and write it to disk.
            self._write_verbose_changes_to_disk(result, self._temp_dir, summary_df)
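

# A minimal usage sketch (illustrative; the temp directory name and file list
# below are assumptions, not part of this module):
#
#   ChangeSummary(
#       new_artifacts_dir=BRANCH_ARTIFACTS_DIR,
#       current_artifacts_dir=MAIN_ARTIFACTS_DIR,
#       temp_dir="temp",
#       file_list=["bigquery.v2.json"],
#   ).detect_discovery_changes()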