# Copyright 2021 Google LLC

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

# https://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from enum import IntEnum
import json
from multiprocessing import Pool
import pandas as pd
import pathlib
import numpy as np

BRANCH_ARTIFACTS_DIR = (
    pathlib.Path(__file__).parent.resolve()
    / "googleapiclient"
    / "discovery_cache"
    / "documents"
)
MAIN_ARTIFACTS_DIR = (
    pathlib.Path(__file__).parent.resolve()
    / ".."
    / "main"
    / "googleapiclient"
    / "discovery_cache"
    / "documents"
)

MULTIPROCESSING_NUM_PER_BATCH = 5
MULTIPROCESSING_NUM_AGENTS = 10


class ChangeType(IntEnum):
    UNKNOWN = 0
    DELETED = 1
    ADDED = 2
    CHANGED = 3


class DirectoryDoesNotExist(ValueError):
    """Raised when the specified directory does not exist."""

    pass


class ChangeSummary:
    """Represents the change summary between 2 directories containing
    artifacts.
    """

    def __init__(self, new_artifacts_dir, current_artifacts_dir, temp_dir, file_list):
        """Initializes an instance of a ChangeSummary.

        Args:
            new_artifacts_dir (str): The relative path to the directory with the
                new discovery artifacts.
            current_artifacts_dir (str): The relative path to the directory with
                the current discovery artifacts.
            temp_dir (str): The relative path to the directory used for
                temporary storage where intermediate files will be stored.
            file_list (list): A list of strings containing files to analyze.
        """

        self._file_list = file_list
        self._new_artifacts_dir = pathlib.Path(new_artifacts_dir)
        self._current_artifacts_dir = pathlib.Path(current_artifacts_dir)
        self._temp_dir = pathlib.Path(temp_dir)

        # Sanity checks to ensure directories exist
        self._raise_if_directory_not_found(self._new_artifacts_dir)
        self._raise_if_directory_not_found(self._current_artifacts_dir)
        self._raise_if_directory_not_found(self._temp_dir)

    def _raise_if_directory_not_found(self, directory):
        """Raises if the `directory` doesn't exist.

        args:
            directory (str): The relative path to the `directory`.
        """

        if not pathlib.Path(directory).exists():
            raise DirectoryDoesNotExist(
                "Directory does not exist : {0}".format(directory)
            )

    def _load_json_to_dataframe(self, file_path):
        """Returns a pandas dataframe from the json file provided.

        args:
            file_path (str): The relative path to the discovery artifact to
                parse.
        """

        # Create an empty dataframe as we will need to return it if the file
        # doesn't exist
        dataframe_doc = pd.DataFrame()

        if pathlib.Path(file_path).is_file():
            with open(file_path, "r") as f:
                # Now load the json file into a pandas dataframe as a flat table
                dataframe_doc = pd.json_normalize(json.load(f))
        return dataframe_doc

    def _get_discovery_differences(self, filename):
        """Returns a pandas dataframe which contains the differences between
        the current and new discovery artifact directories, corresponding to
        the file name provided.

        args:
            filename (str): The name of the discovery artifact to parse.
        """
        # The paths of the 2 discovery artifacts to compare
        current_artifact_path = self._current_artifacts_dir / filename
        new_artifact_path = self._new_artifacts_dir / filename

        # Use a helper function to load the discovery artifacts into pandas
        # dataframes
        current_doc = self._load_json_to_dataframe(current_artifact_path)
        new_doc = self._load_json_to_dataframe(new_artifact_path)

        # Concatenate the 2 dataframes, transpose them, and create
        # a new dataframe called combined_docs with columns
        # `Key`, `CurrentValue`, `NewValue`.
        combined_docs = (
            pd.concat([current_doc, new_doc], keys=["CurrentValue", "NewValue"])
            # Drop the index column
            .reset_index(drop=True, level=1)
            # Transpose the dataframe. The resulting columns should be
            # ["Key", "CurrentValue", "NewValue"]
            .rename_axis(["Key"], axis=1).transpose()
            # Drop the index column
            .reset_index()
        )

        # When discovery documents are added, the column `CurrentValue` will
        # not exist. In that case, we'll just populate with `np.nan`.
        if "CurrentValue" not in combined_docs.columns:
            combined_docs["CurrentValue"] = np.nan

        # When discovery documents are deleted, the column `NewValue` will
        # not exist. In that case, we'll just populate with `np.nan`.
        if "NewValue" not in combined_docs.columns:
            combined_docs["NewValue"] = np.nan

        # Split the Key into 2 columns for `Parent` and `Child` in order
        # to group keys with the same parents together to summarize the changes
        # by parent.
        parent_child_df = combined_docs["Key"].str.rsplit(".", n=1, expand=True)
        # Rename the columns and join them with the combined_docs dataframe.
        # If we only have a `Parent` column, it means that the Key doesn't have
        # any children.
        if len(parent_child_df.columns) == 1:
            parent_child_df.columns = ["Parent"]
        else:
            parent_child_df.columns = ["Parent", "Child"]
        combined_docs = combined_docs.join(parent_child_df)

        # Create a new column `Added` to identify rows which have new keys.
        combined_docs["Added"] = np.where(
            combined_docs["CurrentValue"].isnull(), True, False
        )

        # Create a new column `Deleted` to identify rows which have deleted keys.
        combined_docs["Deleted"] = np.where(
            combined_docs["NewValue"].isnull(), True, False
        )

        # Aggregate the keys added by grouping keys with the same parents
        # together to summarize the changes by parent rather than by key.
        parent_added_agg = (
            combined_docs.groupby("Parent")
            .Added.value_counts(normalize=True)
            .reset_index(name="Proportion")
        )

        # Add a column NumLevels to indicate the number of levels in the tree
        # which will allow us to sort the parents in hierarchical order.
        parent_added_agg["NumLevels"] = (
            parent_added_agg["Parent"].str.split(".").apply(lambda x: len(x))
        )

        # Aggregate the keys deleted by grouping keys with the same parents
        # together to summarize the changes by parent rather than by key.
        parent_deleted_agg = (
            combined_docs.groupby("Parent")
            .Deleted.value_counts(normalize=True)
            .reset_index(name="Proportion")
        )

        # Add a column NumLevels to indicate the number of levels in the tree
        # which will allow us to sort the parents in hierarchical order.
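        # For illustration only (hypothetical key): a parent such as
        # "schemas.Thing.properties" has NumLevels == 3, while a top level
        # parent such as "kind" has NumLevels == 1.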
        parent_deleted_agg["NumLevels"] = (
            parent_deleted_agg["Parent"].str.split(".").apply(lambda x: len(x))
        )

        # Create a list of all parents that have been added in hierarchical
        # order. When `Proportion` is 1, it means that the parent is new as all
        # children keys have been added.
        all_added = (
            parent_added_agg[
                (parent_added_agg["Proportion"] == 1)
                & (parent_added_agg["Added"] == True)
            ][["Parent", "NumLevels"]]
            .sort_values("NumLevels", ascending=True)
            .Parent.to_list()
        )

        # Create a list of all parents that have been deleted in hierarchical
        # order. When `Proportion` is 1, it means that the entire parent has
        # been deleted as all children keys have been deleted.
        all_deleted = (
            parent_deleted_agg[
                (parent_deleted_agg["Proportion"] == 1)
                & (parent_deleted_agg["Deleted"] == True)
            ][["Parent", "NumLevels"]]
            .sort_values("NumLevels", ascending=True)
            .Parent.to_list()
        )

        # Go through the list of parents that have been added. If we find any
        # keys with parents which are a substring of the parent in this list,
        # then it means that the entire parent is new. We don't need verbose
        # information about the children, so we replace the parent.
        for i in range(0, len(all_added)):
            word = all_added[i]
            combined_docs.Parent = np.where(
                combined_docs["Parent"].str.startswith(word), word, combined_docs.Parent
            )

        # Go through the list of parents that have been deleted. If we find any
        # keys with parents which are a substring of the parent in this list,
        # then it means that the entire parent is deleted. We don't need verbose
        # information about the children, so we replace the parent.
        for i in range(0, len(all_deleted)):
            word = all_deleted[i]
            combined_docs.Parent = np.where(
                combined_docs["Parent"].str.startswith(word), word, combined_docs.Parent
            )

        # Create a new dataframe with only the keys which have changed
        docs_diff = combined_docs[
            combined_docs["CurrentValue"] != combined_docs["NewValue"]
        ].copy(deep=False)

        # Get the API and Version from the file name but exclude the extension.
        api_version_string = filename.split(".")[:-1]
        # Create columns `Name` and `Version` using the version string
        docs_diff["Name"] = api_version_string[0]
        docs_diff["Version"] = ".".join(api_version_string[1:])

        # These conditions are used as arguments in the `np.where` function
        # below.
        deleted_condition = docs_diff["NewValue"].isnull()
        added_condition = docs_diff["CurrentValue"].isnull()

        # Create a new `ChangeType` column. The `np.where()` function is like a
        # ternary operator. When the `deleted_condition` is `True`, the
        # `ChangeType` will be `ChangeType.DELETED`. If the `added_condition` is
        # `True`, the `ChangeType` will be `ChangeType.ADDED`, otherwise the
        # `ChangeType` will be `ChangeType.CHANGED`.
        docs_diff["ChangeType"] = np.where(
            deleted_condition,
            ChangeType.DELETED,
            np.where(added_condition, ChangeType.ADDED, ChangeType.CHANGED),
        )

        # Filter out keys which rarely affect functionality.
        # For example:
        # {"description", "documentation", "enum", "etag", "revision", "title",
        # "url", "rootUrl"}
        docs_diff = docs_diff[
            ~docs_diff["Key"].str.contains(
                "|".join(self._get_keys_to_ignore()), case=False
            )
        ]

        # Group keys with similar parents together and create a new column
        # called 'Count' which indicates the number of keys that have been
        # grouped together. The reason for the count column is that when keys
        # have the same parent, we group them together to improve readability.
        docs_diff_with_count = (
            docs_diff.groupby(
                ["Parent", "Added", "Deleted", "Name", "Version", "ChangeType"]
            )
            .size()
            .reset_index(name="Count")
        )

        # Add counts column
        docs_diff = docs_diff.merge(docs_diff_with_count)

        # When the count is greater than 1, update the key with the name of the
        # parent since we are consolidating keys with the same parent.
        docs_diff.loc[docs_diff["Count"] > 1, "Key"] = docs_diff["Parent"]

        return docs_diff[
            ["Key", "Added", "Deleted", "Name", "Version", "ChangeType", "Count"]
        ].drop_duplicates()

    def _build_summary_message(self, api_name, is_feature):
        """Returns a string containing the summary for a given api. The string
        returned will be in the format `fix(<api_name>): update the api`
        when `is_feature=False` and `feat(<api_name>): update the api`
        when `is_feature=True`.

        args:
            api_name (str): The name of the api to include in the summary.
            is_feature (bool): If True, use the prefix `feat`, otherwise use
                `fix`.
        """

        # Build the conventional commit string based on the arguments provided
        commit_type = "feat" if is_feature else "fix"
        return "{0}({1}): update the api".format(commit_type, api_name)

    def _get_keys_to_ignore(self):
        """Returns a list of strings with keys to ignore because they rarely
        affect functionality.

        args: None
        """
        keys_to_ignore = [
            "description",
            "documentation",
            "enum",
            "etag",
            "revision",
            "title",
            "url",
            "rootUrl",
        ]
        return keys_to_ignore

    def _get_stable_versions(self, versions):
        """Returns a pandas series `pd.Series()` of boolean values,
        corresponding to the given series, indicating whether the version is
        considered stable or not.

        args:
            versions (object): a pandas series containing version
                information for all discovery artifacts.
        """
        # Use a regex on the version to find versions with the pattern
        # <v>.<0-9>.<0-9>.<0-9> . Any api that matches this pattern will be
        # labeled as stable. In other words, v1, v1.4 and v1.4.5 are stable,
        # but v1b1, v1alpha and v1beta1 are not stable.
        return versions.str.extract(r"(v\d?\.?\d?\.?\d+$)").notnull()

    def _get_summary_and_write_to_disk(self, dataframe, directory):
        """Writes summary information to file about changes made to discovery
        artifacts based on the provided dataframe and returns a dataframe
        with the same. The file `'allapis.dataframe'` is saved to the provided
        directory.
        args:
            dataframe (object): a pandas dataframe containing summary change
                information for all discovery artifacts
            directory (str): path where the summary file should be saved
        """

        dataframe["IsStable"] = self._get_stable_versions(dataframe["Version"])

        # Create a filter for features, which contains only rows which have keys
        # that have been deleted or added, that will be used as an argument in
        # the `np.where()` call below.
        filter_features = (dataframe["ChangeType"] == ChangeType.DELETED) | (
            dataframe["ChangeType"] == ChangeType.ADDED
        )

        # Create a new column `IsFeature` to indicate which rows should be
        # considered as features.
        dataframe["IsFeature"] = np.where(filter_features, True, np.nan)

        # Create a new column `IsFeatureAggregate` which will be used to
        # summarize the api changes. We can either have a feature or a fix, but
        # not both.
        dataframe["IsFeatureAggregate"] = dataframe.groupby("Name").IsFeature.transform(
            lambda x: x.any()
        )

        # Create a new column `Summary`, which will contain a string with the
        # conventional commit message.
        dataframe["Summary"] = np.vectorize(self._build_summary_message)(
            dataframe["Name"], dataframe["IsFeatureAggregate"]
        )

        # Write the final dataframe to disk as it will be used in the
        # buildprbody.py script
        dataframe.to_csv(directory / "allapis.dataframe")
        return dataframe

    def _write_verbose_changes_to_disk(self, dataframe, directory, summary_df):
        """Writes verbose information to file about changes made to discovery
        artifacts based on the provided dataframe. A separate file is saved
        for each api in the provided directory. The extension of the
        files will be `'.verbose'`.

        args:
            dataframe (object): a pandas dataframe containing verbose change
                information for all discovery artifacts
            directory (str): path where the verbose files should be saved
            summary_df (object): A dataframe containing a summary of the changes
        """
        # Array of strings which will contain verbose change information for
        # each api
        verbose_changes = []

        # Sort the dataframe to minimize file operations below.
        dataframe.sort_values(
            by=["Name", "Version", "ChangeType"], ascending=True, inplace=True
        )

        # Select only the relevant columns. We need to create verbose output
        # by Api Name, Version and ChangeType so we need to group by these
        # columns.
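        # Each group key below is a (Name, Version, ChangeType) tuple, for
        # example (hypothetically) ("drive", "v3", ChangeType.ADDED), and each
        # group contains the changed keys for that API version and change type.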

        change_type_groups = dataframe[
            ["Name", "Version", "ChangeType", "Key", "Count"]
        ].groupby(["Name", "Version", "ChangeType"])

        lastApi = ""
        lastVersion = ""
        lastType = ChangeType.UNKNOWN

        f = None
        for name, group in change_type_groups:
            currentApi = name[0]
            currentVersion = name[1]
            currentType = name[2]

            # We need to handle file opening and closing when processing an API
            # which is different from the previous one
            if lastApi != currentApi:
                # If we are processing a new api, close the file used for
                # processing the previous API
                if f is not None:
                    f.writelines(verbose_changes)
                    f.close()
                    f = None
                # Clear the array of strings with information from the previous
                # api and reset the last version
                verbose_changes = []
                lastVersion = ""
                # Create a file which contains verbose changes for the current
                # API being processed
                filename = "{0}.verbose".format(currentApi)
                f = open(pathlib.Path(directory / filename), "a")
                lastApi = currentApi

                # Create a filter with only the rows for the current API
                current_api_filter = summary_df["Name"] == currentApi

                # Get the string in the `Summary` column for the current api and
                # append it to `verbose_changes`. The `Summary` column contains
                # the conventional commit message. Use pandas.Series.iloc[0] to
                # retrieve only the first element, since all the values in the
                # summary column are the same for a given API.
                verbose_changes.append(summary_df[current_api_filter].Summary.iloc[0])

            # If the version has changed, we need to append a new heading
            # in the verbose summary which contains the api and version.
            if lastVersion != currentVersion:
                # Append a header string with the API and version
                verbose_changes.append(
                    "\n\n#### {0}:{1}\n\n".format(currentApi, currentVersion)
                )

                lastVersion = currentVersion
                lastType = ChangeType.UNKNOWN

            # Whenever the change type is different, we need to create a new
            # heading for the group of keys with the same change type.
            if currentType != lastType:
                if currentType == ChangeType.DELETED:
                    verbose_changes.append("\nThe following keys were deleted:\n")
                elif currentType == ChangeType.ADDED:
                    verbose_changes.append("\nThe following keys were added:\n")
                else:
                    verbose_changes.append("\nThe following keys were changed:\n")

                lastType = currentType

            # Append the keys, and corresponding count, in the same change
            # type group.
            verbose_changes.extend(
                [
                    "- {0} (Total Keys: {1})\n".format(row["Key"], row["Count"])
                    for index, row in group[["Key", "Count"]].iterrows()
                ]
            )

        # Make sure to close the last file and write the changes.
        if f is not None:
            f.writelines(verbose_changes)
            f.close()
            f = None

    def detect_discovery_changes(self):
        """Writes a summary of the changes to the discovery artifacts to disk
        at the path specified in `temp_dir`.

        args: None
        """
        result = pd.DataFrame()
        # Process files in parallel to improve performance
        with Pool(processes=MULTIPROCESSING_NUM_AGENTS) as pool:
            result = result.append(
                pool.map(
                    self._get_discovery_differences,
                    self._file_list,
                    MULTIPROCESSING_NUM_PER_BATCH,
                )
            )

        if len(result):
            # Sort the resulting dataframe by `Name`, `Version`, `ChangeType`
            # and `Key`
            sort_columns = ["Name", "Version", "ChangeType", "Key"]
            result.sort_values(by=sort_columns, ascending=True, inplace=True)

            # Create a folder which will be used by the `createcommits.sh` and
            # `buildprbody.py` scripts.
            pathlib.Path(self._temp_dir).mkdir(exist_ok=True)

            # Create a summary which contains a conventional commit message
            # for each API and write it to disk.
            summary_df = self._get_summary_and_write_to_disk(result, self._temp_dir)

            # Create verbose change information for each API which contains
            # a list of changes by key and write it to disk.
            self._write_verbose_changes_to_disk(result, self._temp_dir, summary_df)
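
# A minimal usage sketch (an assumption, not part of the original tooling):
# the `temp` directory name and the single-entry file list below are
# illustrative only, and both artifact directories defined at the top of this
# module must already exist for the constructor checks to pass.
if __name__ == "__main__":
    # Ensure the temporary directory exists, since the constructor raises
    # DirectoryDoesNotExist otherwise.
    temp_dir = pathlib.Path("temp")
    temp_dir.mkdir(exist_ok=True)

    change_summary = ChangeSummary(
        new_artifacts_dir=BRANCH_ARTIFACTS_DIR,
        current_artifacts_dir=MAIN_ARTIFACTS_DIR,
        temp_dir=temp_dir,
        file_list=["drive.v3.json"],  # hypothetical discovery document name
    )
    change_summary.detect_discovery_changes()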