# Contains all of the classes that are shared across both the Canary Wrapper and the Persistent Canary Wrapper scripts
# If a class can/is reused, then it should be in this file.

# Needs to be installed prior to running
import boto3
import psutil
# Part of standard packages in Python 3.4+
import time
import os
import json
import subprocess
import zipfile
import datetime

# ================================================================================

# Class that holds metric data and has a few utility functions for getting that data in a format we can use for Cloudwatch
class DataSnapshot_Metric():
    def __init__(self, metric_name, metric_function, metric_dimensions=[],
                 metric_unit="None", metric_alarm_threshold=None, metric_alarm_severity=6,
                 git_hash="", git_repo_name="", reports_to_skip=0, is_percent=False):
        self.metric_name = metric_name
        self.metric_function = metric_function
        self.metric_dimensions = metric_dimensions
        self.metric_unit = metric_unit
        self.metric_alarm_threshold = metric_alarm_threshold
        self.metric_alarm_name = self.metric_name + "-" + git_repo_name + "-" + git_hash
        self.metric_alarm_description = 'Alarm for metric "' + self.metric_name + '" - git hash: ' + git_hash
        self.metric_value = None
        self.reports_to_skip = reports_to_skip
        self.metric_alarm_severity = metric_alarm_severity
        self.is_percent = is_percent

    # Gets the latest metric value from the metric_function callback
    def get_metric_value(self, psutil_process : psutil.Process):
        if not self.metric_function is None:
            self.metric_value = self.metric_function(psutil_process)
        return self.metric_value

    # Returns the data needed to send to Cloudwatch when posting metrics
    def get_metric_cloudwatch_dictionary(self):
        if (self.reports_to_skip > 0):
            self.reports_to_skip -= 1
            return None # skips sending to Cloudwatch

        if (self.metric_value == None):
            return None # skips sending to Cloudwatch

        return {
            "MetricName": self.metric_name,
            "Dimensions": self.metric_dimensions,
            "Value": self.metric_value,
            "Unit": self.metric_unit
        }

class DataSnapshot_Dashboard_Widget():
    def __init__(self, widget_name, metric_namespace, metric_dimension, cloudwatch_region="us-east-1", widget_period=60) -> None:
        self.metric_list = []
        self.region = cloudwatch_region
        self.widget_name = widget_name
        self.metric_namespace = metric_namespace
        self.metric_dimension = metric_dimension
        self.widget_period = widget_period

    def add_metric_to_widget(self, new_metric_name):
        try:
            self.metric_list.append(new_metric_name)
        except Exception as e:
            print ("[DataSnapshot_Dashboard] ERROR - could not add metric to dashboard widget due to exception!")
            print ("[DataSnapshot_Dashboard] Exception: " + str(e))

    def remove_metric_from_widget(self, existing_metric_name):
        try:
            self.metric_list.remove(existing_metric_name)
        except Exception as e:
            print ("[DataSnapshot_Dashboard] ERROR - could not remove metric from dashboard widget due to exception!")
            print ("[DataSnapshot_Dashboard] Exception: " + str(e))

    def get_widget_dictionary(self):
        metric_list_json = []
        for metric_name in self.metric_list:
            metric_list_json.append([self.metric_namespace, metric_name, self.metric_dimension, metric_name])

        return {
            "type":"metric",
            "properties" : {
                "metrics" : metric_list_json,
                "region": self.region,
                "title": self.widget_name,
                "period": self.widget_period,
            },
            "width": 14,
            "height": 10
        }

# ================================================================================

# Class that keeps track of the metrics registered, sets up Cloudwatch and S3, and sends periodic reports
# Is the backbone of the reporting operation
class DataSnapshot():
    def __init__(self,
                 git_hash=None,
                 git_repo_name=None,
                 git_hash_as_namespace=False,
                 git_fixed_namespace_text="mqtt5_canary",
                 datetime_string=None,
                 output_log_filepath=None,
                 output_to_console=True,
                 cloudwatch_region="us-east-1",
                 cloudwatch_make_dashboard=False,
                 cloudwatch_teardown_alarms_on_complete=True,
                 cloudwatch_teardown_dashboard_on_complete=True,
                 s3_bucket_name="canary-wrapper-bucket",
                 s3_bucket_upload_on_complete=True,
                 lambda_name="CanarySendEmailLambda",
                 metric_frequency=None):

        # Setting initial values
        # ==================
        self.first_metric_call = True
        self.metrics = []
        self.metrics_numbers = []
        self.metric_report_number = 0
        self.metric_report_non_zero_count = 4

        # Needed so we can initialize Cloudwatch alarms, etc, outside of the init function
        # but before we start sending data.
        # This boolean tracks whether we have done the post-initialization prior to sending the first report.
        self.perform_final_initialization = True

        # Watched by the thread creating the snapshot. Will cause the thread(s) to abort and return an error.
        self.abort_due_to_internal_error = False
        self.abort_due_to_internal_error_reason = ""
        self.abort_due_to_internal_error_due_to_credentials = False

        self.git_hash = None
        self.git_repo_name = None
        self.git_hash_as_namespace = git_hash_as_namespace
        self.git_fixed_namespace_text = git_fixed_namespace_text
        self.git_metric_namespace = None

        self.cloudwatch_region = cloudwatch_region
        self.cloudwatch_client = None
        self.cloudwatch_make_dashboard = cloudwatch_make_dashboard
        self.cloudwatch_teardown_alarms_on_complete = cloudwatch_teardown_alarms_on_complete
        self.cloudwatch_teardown_dashboard_on_complete = cloudwatch_teardown_dashboard_on_complete
        self.cloudwatch_dashboard_name = ""
        self.cloudwatch_dashboard_widgets = []

        self.s3_bucket_name = s3_bucket_name
        self.s3_client = None
        self.s3_bucket_upload_on_complete = s3_bucket_upload_on_complete

        self.output_to_file_filepath = output_log_filepath
        self.output_to_file = False
        self.output_file = None
        self.output_to_console = output_to_console

        self.lambda_client = None
        self.lambda_name = lambda_name

        self.datetime_string = datetime_string
        self.metric_frequency = metric_frequency
        # ==================

        # Check for valid credentials
        # ==================
        try:
            tmp_sts_client = boto3.client('sts')
            tmp_sts_client.get_caller_identity()
        except Exception as e:
            print ("[DataSnapshot] ERROR - AWS credentials are NOT valid!")
            self.abort_due_to_internal_error = True
            self.abort_due_to_internal_error_reason = "AWS credentials are NOT valid!"
            self.abort_due_to_internal_error_due_to_credentials = True
            return
        # ==================

        # Git related stuff
        # ==================
        if (git_hash == None or git_repo_name == None):
            print("[DataSnapshot] ERROR - a Git hash and repository name are REQUIRED for the canary wrapper to run!")
            self.abort_due_to_internal_error = True
            self.abort_due_to_internal_error_reason = "No Git hash and repository passed!"
            return

        self.git_hash = git_hash
        self.git_repo_name = git_repo_name

        if (self.git_hash_as_namespace == False):
            self.git_metric_namespace = self.git_fixed_namespace_text
        else:
            if (self.datetime_string == None):
                git_namespace_prepend_text = self.git_repo_name + "-" + self.git_hash
            else:
                git_namespace_prepend_text = self.git_repo_name + "/" + self.datetime_string + "-" + self.git_hash
            self.git_metric_namespace = git_namespace_prepend_text
        # ==================

        # Cloudwatch related stuff
        # ==================
        try:
            self.cloudwatch_client = boto3.client('cloudwatch', self.cloudwatch_region)
            self.cloudwatch_dashboard_name = self.git_metric_namespace
        except Exception as e:
            self.print_message("[DataSnapshot] ERROR - could not make Cloudwatch client due to exception!")
            self.print_message("[DataSnapshot] Exception: " + str(e))
            self.cloudwatch_client = None
            self.abort_due_to_internal_error = True
            self.abort_due_to_internal_error_reason = "Could not make Cloudwatch client!"
            return
        # ==================

        # S3 related stuff
        # ==================
        try:
            self.s3_client = boto3.client("s3")
        except Exception as e:
            self.print_message("[DataSnapshot] ERROR - could not make S3 client due to exception!")
            self.print_message("[DataSnapshot] Exception: " + str(e))
            self.s3_client = None
            self.abort_due_to_internal_error = True
            self.abort_due_to_internal_error_reason = "Could not make S3 client!"
            return
        # ==================

        # Lambda related stuff
        # ==================
        try:
            self.lambda_client = boto3.client("lambda", self.cloudwatch_region)
        except Exception as e:
            self.print_message("[DataSnapshot] ERROR - could not make Lambda client due to exception!")
            self.print_message("[DataSnapshot] Exception: " + str(e))
            self.lambda_client = None
            self.abort_due_to_internal_error = True
            self.abort_due_to_internal_error_reason = "Could not make Lambda client!"
            return
        # ==================

        # File output (logs) related stuff
        # ==================
        if (not output_log_filepath is None):
            self.output_to_file = True
            self.output_file = open(self.output_to_file_filepath, "w")
        else:
            self.output_to_file = False
            self.output_file = None
        # ==================

        self.print_message("[DataSnapshot] Data snapshot created!")

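    # Illustrative sketch (comment only, so importing this module stays side-effect free):
    # a wrapper script would typically construct the snapshot roughly like this. All of the
    # values below are hypothetical.
    #
    #   data_snapshot = DataSnapshot(
    #       git_hash="0123abc",
    #       git_repo_name="example-repo",
    #       datetime_string="2023-01-01T00-00-00",
    #       output_log_filepath="canary_output_log.txt",
    #       cloudwatch_region="us-east-1",
    #       s3_bucket_name="canary-wrapper-bucket",
    #       metric_frequency=30)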
    # Cleans the class - closing any files, removing alarms, and sending data to S3.
    # Should be called at the end when you are totally finished with the metrics
    def cleanup(self, error_occurred=False):
        if (self.s3_bucket_upload_on_complete == True):
            self.export_result_to_s3_bucket(copy_output_log=True, log_is_error=error_occurred)

        self._cleanup_cloudwatch_alarms()
        if (self.cloudwatch_make_dashboard == True):
            self._cleanup_cloudwatch_dashboard()

        self.print_message("[DataSnapshot] Data snapshot cleaned!")

        if (self.output_file is not None):
            self.output_file.close()
            self.output_file = None

    # Utility function for printing messages
    def print_message(self, message):
        if self.output_to_file == True:
            self.output_file.write(message + "\n")
        if self.output_to_console == True:
            print(message, flush=True)

    # Utility function - adds the metric alarms to Cloudwatch. This is run right before the first
    # collection of metrics so that metrics can be registered before Cloudwatch is initialized
    def _init_cloudwatch_pre_first_run(self):
        for metric in self.metrics:
            if (not metric.metric_alarm_threshold is None):
                self._add_cloudwatch_metric_alarm(metric)

        if (self.cloudwatch_make_dashboard == True):
            self._init_cloudwatch_pre_first_run_dashboard()

    # Utility function - adds the Cloudwatch Dashboard for the currently running data snapshot
    def _init_cloudwatch_pre_first_run_dashboard(self):
        try:
            # Remove the old dashboard if it exists before adding a new one
            self._cleanup_cloudwatch_dashboard()

            new_dashboard_widgets_array = []
            for widget in self.cloudwatch_dashboard_widgets:
                new_dashboard_widgets_array.append(widget.get_widget_dictionary())

            new_dashboard_body = {
                "start": "-PT1H",
                "widgets": new_dashboard_widgets_array,
            }
            new_dashboard_body_json = json.dumps(new_dashboard_body)

            self.cloudwatch_client.put_dashboard(
                DashboardName=self.cloudwatch_dashboard_name,
                DashboardBody=new_dashboard_body_json)
            self.print_message("[DataSnapshot] Added Cloudwatch dashboard successfully")
        except Exception as e:
            self.print_message("[DataSnapshot] ERROR - Cloudwatch client could not make dashboard due to exception!")
            self.print_message("[DataSnapshot] Exception: " + str(e))
            self.abort_due_to_internal_error = True
            self.abort_due_to_internal_error_reason = "Cloudwatch client could not make dashboard due to exception"
            return

    # Utility function - adds each individual metric alarm.
    def _add_cloudwatch_metric_alarm(self, metric):
        if self.cloudwatch_client is None:
            self.print_message("[DataSnapshot] ERROR - Cloudwatch client not setup. Cannot register alarm")
            return

        try:
            self.cloudwatch_client.put_metric_alarm(
                AlarmName=metric.metric_alarm_name,
                AlarmDescription=metric.metric_alarm_description,
                MetricName=metric.metric_name,
                Namespace=self.git_metric_namespace,
                Statistic="Maximum",
                Dimensions=metric.metric_dimensions,
                Period=60,  # How long (in seconds) is an evaluation period?
                EvaluationPeriods=120,  # How many periods does it need to be invalid for?
                DatapointsToAlarm=1,  # How many data points need to be invalid?
                Threshold=metric.metric_alarm_threshold,
                ComparisonOperator="GreaterThanOrEqualToThreshold",
            )
        except Exception as e:
            self.print_message("[DataSnapshot] ERROR - could not register alarm for metric due to exception: " + metric.metric_name)
            self.print_message("[DataSnapshot] Exception: " + str(e))

    # Utility function - removes all the Cloudwatch alarms for the metrics
    def _cleanup_cloudwatch_alarms(self):
        if (self.cloudwatch_teardown_alarms_on_complete == True):
            try:
                for metric in self.metrics:
                    if (not metric.metric_alarm_threshold is None):
                        self.cloudwatch_client.delete_alarms(AlarmNames=[metric.metric_alarm_name])
            except Exception as e:
                self.print_message("[DataSnapshot] ERROR - could not delete alarms due to exception!")
                self.print_message("[DataSnapshot] Exception: " + str(e))

    # Utility function - removes all Cloudwatch dashboards created
    def _cleanup_cloudwatch_dashboard(self):
        if (self.cloudwatch_teardown_dashboard_on_complete == True):
            try:
                self.cloudwatch_client.delete_dashboards(DashboardNames=[self.cloudwatch_dashboard_name])
                self.print_message("[DataSnapshot] Cloudwatch Dashboards deleted successfully!")
            except Exception as e:
                self.print_message("[DataSnapshot] ERROR - dashboard cleaning function failed due to exception!")
                self.print_message("[DataSnapshot] Exception: " + str(e))
                self.abort_due_to_internal_error = True
                self.abort_due_to_internal_error_reason = "Cloudwatch dashboard cleaning function failed due to exception"
                return

    # Returns the results of the metric alarms. Returns a list of lists, each with the following structure:
    # [Boolean (False = the alarm is in the ALARM state), String (name of the alarm in the ALARM state), int (severity of the alarm)]
    # Currently this function only returns failed alarms, so if the returned list is empty then none of the
    # registered metrics' alarms reached the ALARM state in Cloudwatch
    def get_cloudwatch_alarm_results(self):
        return self._check_cloudwatch_alarm_states()

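    # Illustrative sketch (comment only): each entry returned by get_cloudwatch_alarm_results
    # is [in_alarm_boolean, alarm_name, severity] and only failed alarms are included, so
    # callers can simply iterate the list. Variable names are hypothetical.
    #
    #   triggered_alarms = data_snapshot.get_cloudwatch_alarm_results()
    #   for alarm in triggered_alarms:
    #       print("Alarm " + alarm[1] + " (severity " + str(alarm[2]) + ") is in the ALARM state")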
    # Utility function - collects the metric alarm results and returns them in a list.
    def _check_cloudwatch_alarm_states(self):
        return_result_list = []

        tmp = None
        for metric in self.metrics:
            tmp = self._check_cloudwatch_alarm_state_metric(metric)
            if (tmp[1] != None):
                # Do not cut a ticket for the "Alive_Alarm" that we use to check if the Canary is running
                if (("Alive_Alarm" in tmp[1]) == False):
                    if (tmp[0] != True):
                        return_result_list.append(tmp)

        return return_result_list

    # Utility function - checks each individual alarm and returns a list with the following format:
    # [Boolean (False if the alarm is in the ALARM state, otherwise True), String (name of the alarm), Int (severity of the alarm)]
    def _check_cloudwatch_alarm_state_metric(self, metric):
        alarms_response = self.cloudwatch_client.describe_alarms_for_metric(
            MetricName=metric.metric_name,
            Namespace=self.git_metric_namespace,
            Dimensions=metric.metric_dimensions)

        return_result = [True, None, metric.metric_alarm_severity]

        for metric_alarm_dict in alarms_response["MetricAlarms"]:
            if metric_alarm_dict["StateValue"] == "ALARM":
                return_result[0] = False
                return_result[1] = metric_alarm_dict["AlarmName"]
                break

        return return_result

    # Exports a file with the same name as the commit Git hash to an S3 bucket in a folder with the Git repo name.
    # By default, this file will only contain the Git hash.
    # If copy_output_log is true, then the output log will be copied into this file, which may be useful for debugging.
    def export_result_to_s3_bucket(self, copy_output_log=False, log_is_error=False):
        if (self.s3_client is None):
            self.print_message("[DataSnapshot] ERROR - No S3 client initialized! Cannot send log to S3")
            self.abort_due_to_internal_error = True
            self.abort_due_to_internal_error_reason = "S3 client not initialized and therefore cannot send log to S3"
            return

        s3_file = open(self.git_hash + ".log", "w")
        s3_file.write(self.git_hash)

        # Might be useful for debugging?
        if (copy_output_log == True and self.output_to_file == True):
            # Are we still writing? If so, then we need to close the file first so everything is written to it
            is_output_file_open_previously = False
            if (self.output_file != None):
                self.output_file.close()
                is_output_file_open_previously = True
            self.output_file = open(self.output_to_file_filepath, "r")

            s3_file.write("\n\nOUTPUT LOG\n")
            s3_file.write("==========================================================================================\n")
            output_file_lines = self.output_file.readlines()
            for line in output_file_lines:
                s3_file.write(line)

            self.output_file.close()

            # If we were writing to the output previously, then we need to reopen it in append mode so we can continue writing to it
            if (is_output_file_open_previously == True):
                self.output_file = open(self.output_to_file_filepath, "a")

        s3_file.close()

        # Upload to S3
        try:
            if (log_is_error == False):
                if (self.datetime_string == None):
                    self.s3_client.upload_file(self.git_hash + ".log", self.s3_bucket_name, self.git_repo_name + "/" + self.git_hash + ".log")
                else:
                    self.s3_client.upload_file(self.git_hash + ".log", self.s3_bucket_name, self.git_repo_name + "/" + self.datetime_string + "/" + self.git_hash + ".log")
            else:
                if (self.datetime_string == None):
                    self.s3_client.upload_file(self.git_hash + ".log", self.s3_bucket_name, self.git_repo_name + "/Failed_Logs/" + self.git_hash + ".log")
                else:
                    self.s3_client.upload_file(self.git_hash + ".log", self.s3_bucket_name, self.git_repo_name + "/Failed_Logs/" + self.datetime_string + "/" + self.git_hash + ".log")
            self.print_message("[DataSnapshot] Uploaded to S3!")
        except Exception as e:
            self.print_message("[DataSnapshot] ERROR - could not upload to S3 due to exception!")
            self.print_message("[DataSnapshot] Exception: " + str(e))
            self.abort_due_to_internal_error = True
            self.abort_due_to_internal_error_reason = "S3 client had exception and therefore could not upload log!"
            os.remove(self.git_hash + ".log")
            return

        # Delete the file when finished
        os.remove(self.git_hash + ".log")

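    # For reference, the uploads above produce S3 keys shaped like the following, shown here
    # with hypothetical values (repo "example-repo", hash "0123abc", datetime "2023-01-01T00-00-00"):
    #
    #   example-repo/0123abc.log                                   (success, no datetime string)
    #   example-repo/2023-01-01T00-00-00/0123abc.log               (success, with datetime string)
    #   example-repo/Failed_Logs/0123abc.log                       (failure, no datetime string)
    #   example-repo/Failed_Logs/2023-01-01T00-00-00/0123abc.log   (failure, with datetime string)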
    # Sends an email via a special lambda. The payload has to contain a message and a subject
    # * (REQUIRED) message is the message you want to send in the body of the email
    # * (REQUIRED) subject is the subject that the email will be sent with
    def lambda_send_email(self, message, subject):

        payload = {"Message":message, "Subject":subject}
        payload_string = json.dumps(payload)

        try:
            self.lambda_client.invoke(
                FunctionName=self.lambda_name,
                InvocationType="Event",
                ClientContext="MQTT Wrapper Script",
                Payload=payload_string
            )
        except Exception as e:
            self.print_message("[DataSnapshot] ERROR - could not send email via Lambda due to exception!")
            self.print_message("[DataSnapshot] Exception: " + str(e))
            self.abort_due_to_internal_error = True
            self.abort_due_to_internal_error_reason = "Lambda email function had an exception!"
            return

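    # Illustrative sketch (comment only): the Lambda is invoked asynchronously with a JSON
    # payload containing just the two fields above. The message and subject values are hypothetical.
    #
    #   data_snapshot.lambda_send_email(
    #       message="The Canary crashed with error code 1",
    #       subject="Canary: example-repo:0123abc - Crashed")
    #
    # which serializes to: {"Message": "The Canary crashed with error code 1", "Subject": "Canary: example-repo:0123abc - Crashed"}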
    # Registers a metric to be polled by the Snapshot.
    # * (REQUIRED) new_metric_name is the name of the metric. Cloudwatch will use this name
    # * (REQUIRED) new_metric_function is expected to be a pointer to a Python function and will not work if you pass a value/object
    # * (OPTIONAL) new_metric_unit is the metric unit. There is a list of possible metric unit types in the Boto3 documentation for Cloudwatch
    # * (OPTIONAL) new_metric_alarm_threshold is the value that the metric has to exceed in order to be registered as an alarm
    # * (OPTIONAL) new_metric_reports_to_skip is the number of reports for which this metric will return nothing, even though its value is still collected.
    #       * Useful for CPU calculations that require deltas
    # * (OPTIONAL) new_metric_alarm_severity is the severity of the ticket if this alarm is triggered. A severity of 6+ means no ticket.
    # * (OPTIONAL) is_percent is whether or not to display the metric as a percent when printing it (default=False)
    def register_metric(self, new_metric_name, new_metric_function, new_metric_unit="None",
                        new_metric_alarm_threshold=None, new_metric_reports_to_skip=0, new_metric_alarm_severity=6, is_percent=False):

        new_metric_dimensions = []

        if (self.git_hash_as_namespace == False):
            git_namespace_prepend_text = self.git_repo_name + "-" + self.git_hash
            new_metric_dimensions.append(
                {"Name": git_namespace_prepend_text, "Value": new_metric_name})
        else:
            new_metric_dimensions.append(
                {"Name": "System_Metrics", "Value": new_metric_name})

        new_metric = DataSnapshot_Metric(
            metric_name=new_metric_name,
            metric_function=new_metric_function,
            metric_dimensions=new_metric_dimensions,
            metric_unit=new_metric_unit,
            metric_alarm_threshold=new_metric_alarm_threshold,
            metric_alarm_severity=new_metric_alarm_severity,
            git_hash=self.git_hash,
            git_repo_name=self.git_repo_name,
            reports_to_skip=new_metric_reports_to_skip,
            is_percent=is_percent
        )
        self.metrics.append(new_metric)
        # Append an empty list so we can track its metric values over time
        self.metrics_numbers.append([])

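    # Illustrative sketch (comment only): registering a CPU metric. psutil's cpu_percent
    # needs one earlier call to establish a delta, which is what new_metric_reports_to_skip=1
    # is for. The name, threshold, and severity below are hypothetical.
    #
    #   def get_cpu_usage(psutil_process : psutil.Process):
    #       return psutil_process.cpu_percent(interval=None)
    #
    #   data_snapshot.register_metric(
    #       new_metric_name="cpu_usage",
    #       new_metric_function=get_cpu_usage,
    #       new_metric_unit="Percent",
    #       new_metric_alarm_threshold=70,
    #       new_metric_reports_to_skip=1,
    #       new_metric_alarm_severity=5,
    #       is_percent=True)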
    def register_dashboard_widget(self, new_widget_name, metrics_to_add=[], new_widget_period=60):

        # We need to know what metric dimension to get the metric(s) from
        metric_dimension_string = ""
        if (self.git_hash_as_namespace == False):
            metric_dimension_string = self.git_repo_name + "-" + self.git_hash
        else:
            metric_dimension_string = "System_Metrics"

        widget = self._find_cloudwatch_widget(name=new_widget_name)
        if (widget == None):
            widget = DataSnapshot_Dashboard_Widget(
                widget_name=new_widget_name, metric_namespace=self.git_metric_namespace,
                metric_dimension=metric_dimension_string,
                cloudwatch_region=self.cloudwatch_region,
                widget_period=new_widget_period)
            self.cloudwatch_dashboard_widgets.append(widget)

        for metric in metrics_to_add:
            self.register_metric_to_dashboard_widget(widget_name=new_widget_name, metric_name=metric)

    def register_metric_to_dashboard_widget(self, widget_name, metric_name, widget=None):
        if widget is None:
            widget = self._find_cloudwatch_widget(name=widget_name)
            if widget is None:
                print ("[DataSnapshot] ERROR - could not find widget with name: " + widget_name, flush=True)
                return

        # Adjust metric name so it has the git hash, repo, etc
        metric_name_formatted = metric_name

        widget.add_metric_to_widget(new_metric_name=metric_name_formatted)
        return

    def remove_metric_from_dashboard_widget(self, widget_name, metric_name, widget=None):
        if widget is None:
            widget = self._find_cloudwatch_widget(name=widget_name)
            if widget is None:
                print ("[DataSnapshot] ERROR - could not find widget with name: " + widget_name, flush=True)
                return

        widget.remove_metric_from_widget(existing_metric_name=metric_name)
        return

    def _find_cloudwatch_widget(self, name):
        result = None
        for widget in self.cloudwatch_dashboard_widgets:
            if widget.widget_name == name:
                return widget
        return result

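    # Illustrative sketch (comment only): grouping already-registered metrics into a dashboard
    # widget. The widget and metric names are hypothetical and must match the names passed to
    # register_metric for the dashboard data to line up in Cloudwatch.
    #
    #   data_snapshot.register_dashboard_widget(
    #       new_widget_name="Process CPU Usage",
    #       metrics_to_add=["cpu_usage"],
    #       new_widget_period=60)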
    # Prints the metrics to the console
    def export_metrics_console(self):
        datetime_now = datetime.datetime.now()
        datetime_string = datetime_now.strftime("%d-%m-%Y/%H:%M:%S")

        self.print_message("\n[DataSnapshot] Metric report: " + str(self.metric_report_number) + " (" + datetime_string + ")")
        for metric in self.metrics:
            if (metric.is_percent == True):
                self.print_message("    " + metric.metric_name +
                                   " - value: " + str(metric.metric_value) + "%")
            else:
                self.print_message("    " + metric.metric_name +
                                   " - value: " + str(metric.metric_value))
        self.print_message("")

    # Sends all registered metrics to Cloudwatch.
    # Does NOT need to be called in a loop. Call post_metrics in a loop to send all the metrics as expected.
    # This is just the Cloudwatch part of that loop.
    def export_metrics_cloudwatch(self):
        if (self.cloudwatch_client == None):
            self.print_message("[DataSnapshot] Error - cannot export Cloudwatch metrics! Cloudwatch was not initialized.")
            self.abort_due_to_internal_error = True
            self.abort_due_to_internal_error_reason = "Could not export Cloudwatch metrics due to no Cloudwatch client initialized!"
            return

        self.print_message("[DataSnapshot] Preparing to send to Cloudwatch...")
        metrics_data = []
        metric_data_tmp = None
        for metric in self.metrics:
            metric_data_tmp = metric.get_metric_cloudwatch_dictionary()
            if (not metric_data_tmp is None):
                metrics_data.append(metric_data_tmp)

        if (len(metrics_data) == 0):
            self.print_message("[DataSnapshot] INFO - no metric data to send. Skipping...")
            return

        try:
            self.cloudwatch_client.put_metric_data(
                Namespace=self.git_metric_namespace,
                MetricData=metrics_data)
            self.print_message("[DataSnapshot] Metrics sent to Cloudwatch.")
        except Exception as e:
            self.print_message("[DataSnapshot] Error - something went wrong posting Cloudwatch metrics!")
            self.print_message("[DataSnapshot] Exception: " + str(e))
            self.abort_due_to_internal_error = True
            self.abort_due_to_internal_error_reason = "Could not export Cloudwatch metrics due to exception in Cloudwatch client!"
            return

    # Call this at a set interval to post the metrics to Cloudwatch, etc.
    # This is the function you want to call repeatedly after you have everything setup.
    def post_metrics(self, psutil_process : psutil.Process):
        if (self.perform_final_initialization == True):
            self.perform_final_initialization = False
            self._init_cloudwatch_pre_first_run()

        # Update the metric values internally
        for i in range(0, len(self.metrics)):
            metric_value = self.metrics[i].get_metric_value(psutil_process)
            self.metrics_numbers[i].insert(0, metric_value)

            # Only keep the last metric_report_non_zero_count results
            if (len(self.metrics_numbers[i]) > self.metric_report_non_zero_count):
                amount_to_delete = len(self.metrics_numbers[i]) - self.metric_report_non_zero_count
                del self.metrics_numbers[i][-amount_to_delete:]
            # If we have metric_report_non_zero_count amount of metrics, make sure there is at least one
            # non-zero. If they are all zero, then print a log so we can easily find it
            if (len(self.metrics_numbers[i]) == self.metric_report_non_zero_count):
                non_zero_found = False
                for j in range(0, len(self.metrics_numbers[i])):
                    if (self.metrics_numbers[i][j] != 0.0 and self.metrics_numbers[i][j] != None):
                        non_zero_found = True
                        break
                if (non_zero_found == False):
                    self.print_message("\n[DataSnapshot] METRIC ZERO ERROR!")
                    self.print_message(f"[DataSnapshot] Metric index {i} has been zero for last {self.metric_report_non_zero_count} reports!")
                    self.print_message("\n")

        self.metric_report_number += 1

        self.export_metrics_console()
        self.export_metrics_cloudwatch()

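    # Illustrative sketch (comment only): the wrapper scripts call post_metrics on a fixed
    # interval against the psutil handle of the canary process. The variable names and the
    # 30 second interval are hypothetical.
    #
    #   canary_process = psutil.Process(application_pid)
    #   while canary_is_running:
    #       data_snapshot.post_metrics(canary_process)
    #       if (data_snapshot.abort_due_to_internal_error == True):
    #           break
    #       time.sleep(30)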
    def output_diagnosis_information(self, dependencies_list):

        # Print general diagnosis information
        self.print_message("\n========== Canary Wrapper diagnosis information ==========")
        self.print_message("\nRunning Canary for repository: " + self.git_repo_name)
        self.print_message("\t Commit hash: " + self.git_hash)

        if not dependencies_list == "":
            self.print_message("\nDependencies:")
            dependencies_list = dependencies_list.split(";")
            dependencies_list_found_hash = False
            for i in range(0, len(dependencies_list)):
                # There's probably a better way to do this...
                if (dependencies_list_found_hash == True):
                    dependencies_list_found_hash = False
                    continue
                self.print_message("* " + dependencies_list[i])
                if (i+1 < len(dependencies_list)):
                    self.print_message("\t Commit hash: " + dependencies_list[i+1])
                    dependencies_list_found_hash = True
                else:
                    self.print_message("\t Commit hash: Unknown")

        if (self.metric_frequency != None):
            self.print_message("\nMetric Snapshot Frequency: " + str(self.metric_frequency) + " seconds")
        self.print_message("\nMetrics:")
        for metric in self.metrics:
            self.print_message("* " + metric.metric_name)
            if metric.metric_alarm_threshold is not None:
                self.print_message("\t Alarm Threshold: " + str(metric.metric_alarm_threshold))
                self.print_message("\t Alarm Severity: " + str(metric.metric_alarm_severity))
            else:
                self.print_message("\t No alarm set for metric.")

        self.print_message("\n")
        self.print_message("==========================================================")
        self.print_message("\n")

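    # Illustrative sketch (comment only): dependencies_list is a single semicolon-separated
    # string that alternates dependency name and commit hash. Hypothetical example:
    #
    #   data_snapshot.output_diagnosis_information(
    #       "example-dependency-one;0123abc;example-dependency-two;4567def")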
# ================================================================================

class SnapshotMonitor():
    def __init__(self, wrapper_data_snapshot, wrapper_metrics_wait_time) -> None:

        self.data_snapshot = wrapper_data_snapshot
        self.had_internal_error = False
        self.error_due_to_credentials = False
        self.internal_error_reason = ""
        self.error_due_to_alarm = False

        self.can_cut_ticket = False
        self.has_cut_ticket = False

        # A list of all the alarms triggered in the last check, cached for later
        # NOTE - this is only the alarm names! Not the severity. This just makes it easier to process
        self.cloudwatch_current_alarms_triggered = []

        # Check for errors
        if (self.data_snapshot.abort_due_to_internal_error == True):
            self.had_internal_error = True
            self.internal_error_reason = "Could not initialize DataSnapshot. Likely credentials are not setup!"
            if (self.data_snapshot.abort_due_to_internal_error_due_to_credentials == True):
                self.error_due_to_credentials = True
            self.data_snapshot.cleanup()
            return

        # How long to wait before posting a metric
        self.metric_post_timer = 0
        self.metric_post_timer_time = wrapper_metrics_wait_time


    def register_metric(self, new_metric_name, new_metric_function, new_metric_unit="None", new_metric_alarm_threshold=None,
                        new_metric_reports_to_skip=0, new_metric_alarm_severity=6):

        try:
            self.data_snapshot.register_metric(
                new_metric_name=new_metric_name,
                new_metric_function=new_metric_function,
                new_metric_unit=new_metric_unit,
                new_metric_alarm_threshold=new_metric_alarm_threshold,
                new_metric_reports_to_skip=new_metric_reports_to_skip,
                new_metric_alarm_severity=new_metric_alarm_severity)
        except Exception as e:
            self.print_message("[SnapshotMonitor] ERROR - could not register metric in data snapshot due to exception!")
            self.print_message("[SnapshotMonitor] Exception: " + str(e))
            self.had_internal_error = True
            self.internal_error_reason = "Could not register metric in data snapshot due to exception"
            return

    def register_dashboard_widget(self, new_widget_name, metrics_to_add=[], widget_period=60):
        self.data_snapshot.register_dashboard_widget(new_widget_name=new_widget_name, metrics_to_add=metrics_to_add, new_widget_period=widget_period)

    def output_diagnosis_information(self, dependencies=""):
        self.data_snapshot.output_diagnosis_information(dependencies_list=dependencies)

    def check_alarms_for_new_alarms(self, triggered_alarms):

        if len(triggered_alarms) > 0:
            self.data_snapshot.print_message(
                "WARNING - One or more alarms are in state of ALARM")

        old_alarms_still_active = []
        new_alarms = []
        new_alarms_highest_severity = 6
        new_alarm_found = True
        new_alarm_ticket_description = "Canary has metrics in ALARM state!\n\nMetrics in alarm:\n"

        for triggered_alarm in triggered_alarms:
            new_alarm_found = True

            # Is this a new alarm?
            for old_alarm_name in self.cloudwatch_current_alarms_triggered:
                if (old_alarm_name == triggered_alarm[1]):
                    new_alarm_found = False
                    old_alarms_still_active.append(triggered_alarm[1])

                    new_alarm_ticket_description += "* (STILL IN ALARM) " + triggered_alarm[1] + "\n"
                    new_alarm_ticket_description += "\tSeverity: " + str(triggered_alarm[2])
                    new_alarm_ticket_description += "\n"
                    break

            # If it is a new alarm, then add it to our list so we can cut a new ticket
            if (new_alarm_found == True):
                self.data_snapshot.print_message(' (NEW) Alarm with name "' + triggered_alarm[1] + '" is in the ALARM state!')
                new_alarms.append(triggered_alarm[1])
                if (triggered_alarm[2] < new_alarms_highest_severity):
                    new_alarms_highest_severity = triggered_alarm[2]
                new_alarm_ticket_description += "* " + triggered_alarm[1] + "\n"
                new_alarm_ticket_description += "\tSeverity: " + str(triggered_alarm[2])
                new_alarm_ticket_description += "\n"


        if len(new_alarms) > 0:
            if (self.can_cut_ticket == True):
                cut_ticket_using_cloudwatch(
                    git_repo_name=self.data_snapshot.git_repo_name,
                    git_hash=self.data_snapshot.git_hash,
                    git_hash_as_namespace=False,
                    git_fixed_namespace_text=self.data_snapshot.git_fixed_namespace_text,
                    cloudwatch_region="us-east-1",
                    ticket_description="New metric(s) went into alarm for the Canary! Metrics in alarm: " + str(new_alarms),
                    ticket_reason="New metric(s) went into alarm",
                    ticket_allow_duplicates=True,
                    ticket_category="AWS",
                    ticket_item="IoT SDK for CPP",
                    ticket_group="AWS IoT Device SDK",
                    ticket_type="SDKs and Tools",
                    ticket_severity=4)
                self.has_cut_ticket = True

            # Cache the new alarms and the old alarms
            self.cloudwatch_current_alarms_triggered = old_alarms_still_active + new_alarms

        else:
            self.cloudwatch_current_alarms_triggered.clear()


    def monitor_loop_function(self, psutil_process : psutil.Process, time_passed=30):
        # Check for internal errors
        if (self.data_snapshot.abort_due_to_internal_error == True):
            self.had_internal_error = True
            self.internal_error_reason = "Data Snapshot internal error: " + self.data_snapshot.abort_due_to_internal_error_reason
            return

        try:
            # Poll the metric alarms
            if (self.had_internal_error == False):
                # Get a report of all the alarms that might have been set to an alarm state
                triggered_alarms = self.data_snapshot.get_cloudwatch_alarm_results()
                self.check_alarms_for_new_alarms(triggered_alarms)
        except Exception as e:
            self.print_message("[SnapshotMonitor] ERROR - exception occurred checking metric alarms!")
            self.print_message("[SnapshotMonitor] (Likely session credentials expired)")
            self.had_internal_error = True
            self.internal_error_reason = "Exception occurred checking metric alarms! Likely session credentials expired"
            return

        if (self.metric_post_timer <= 0):
            if (self.had_internal_error == False):
                try:
                    self.data_snapshot.post_metrics(psutil_process)
                except Exception as e:
                    self.print_message("[SnapshotMonitor] ERROR - exception occurred posting metrics!")
                    self.print_message("[SnapshotMonitor] (Likely session credentials expired)")

                    print (e, flush=True)

                    self.had_internal_error = True
                    self.internal_error_reason = "Exception occurred posting metrics! Likely session credentials expired"
                    return

            # Reset the timer
            self.metric_post_timer += self.metric_post_timer_time

        # Count down until the next metric post
        self.metric_post_timer -= time_passed


    def send_email(self, email_body, email_subject_text_append=None):
        if (email_subject_text_append != None):
            self.data_snapshot.lambda_send_email(email_body, "Canary: " + self.data_snapshot.git_repo_name + ":" + self.data_snapshot.git_hash + " - " + email_subject_text_append)
        else:
            self.data_snapshot.lambda_send_email(email_body, "Canary: " + self.data_snapshot.git_repo_name + ":" + self.data_snapshot.git_hash)


    def stop_monitoring(self):
        # Stub - just added for consistency
        pass


    def start_monitoring(self):
        # Stub - just added for consistency
        pass


    def restart_monitoring(self):
        # Stub - just added for consistency
        pass


    def cleanup_monitor(self, error_occurred=False):
        self.data_snapshot.cleanup(error_occurred=error_occurred)

    def print_message(self, message):
        if (self.data_snapshot != None):
            self.data_snapshot.print_message(message)
        else:
            print(message, flush=True)

# ================================================================================

class ApplicationMonitor():
    def __init__(self, wrapper_application_path, wrapper_application_arguments, wrapper_application_restart_on_finish=True, data_snapshot=None) -> None:
        self.application_process = None
        self.application_process_psutil = None
        self.error_has_occurred = False
        self.error_due_to_credentials = False
        self.error_reason = ""
        self.error_code = 0
        self.wrapper_application_path = wrapper_application_path
        self.wrapper_application_arguments = wrapper_application_arguments
        self.wrapper_application_restart_on_finish = wrapper_application_restart_on_finish
        self.data_snapshot=data_snapshot

        self.stdout_file_path = "Canary_Stdout_File.txt"

    def start_monitoring(self):
        self.print_message("[ApplicationMonitor] Starting to monitor application...")

        if (self.application_process == None):
            try:
                canary_command = self.wrapper_application_path + " " + self.wrapper_application_arguments
                self.application_process = subprocess.Popen(canary_command + " | tee " + self.stdout_file_path, shell=True)
                self.application_process_psutil = psutil.Process(self.application_process.pid)
                self.print_message ("[ApplicationMonitor] Application started...")
            except Exception as e:
                self.print_message ("[ApplicationMonitor] ERROR - Could not launch Canary/Application due to exception!")
                self.print_message ("[ApplicationMonitor] Exception: " + str(e))
                self.error_has_occurred = True
                self.error_reason = "Could not launch Canary/Application due to exception"
                self.error_code = 1
                return
        else:
            self.print_message("[ApplicationMonitor] ERROR - Monitor already has an application process! Cannot monitor two applications with one monitor class!")

    def restart_monitoring(self):
        self.print_message ("[ApplicationMonitor] Restarting monitor application...")

        if (self.application_process != None):
            try:
                self.stop_monitoring()
                self.start_monitoring()
                self.print_message("\n[ApplicationMonitor] Restarted monitor application!")
                self.print_message("================================================================================")
            except Exception as e:
                self.print_message("[ApplicationMonitor] ERROR - Could not restart Canary/Application due to exception!")
                self.print_message("[ApplicationMonitor] Exception: " + str(e))
                self.error_has_occurred = True
                self.error_reason = "Could not restart Canary/Application due to exception"
                self.error_code = 1
                return
        else:
            self.print_message("[ApplicationMonitor] ERROR - Application process restart called but process is/was not running!")
            self.error_has_occurred = True
            self.error_reason = "Could not restart Canary/Application due to application process not being started initially"
            self.error_code = 1
            return


    def stop_monitoring(self):
        self.print_message ("[ApplicationMonitor] Stopping monitor application...")
        if (not self.application_process == None):
            self.application_process.terminate()
            self.application_process.wait()
            self.print_message ("[ApplicationMonitor] Stopped monitor application!")
            self.application_process = None
            self.print_stdout()
        else:
            self.print_message ("[ApplicationMonitor] ERROR - cannot stop monitor application because no process is found!")

    def print_stdout(self):
        # Print the STDOUT file
        if (os.path.isfile(self.stdout_file_path)):
            self.print_message("Just finished Application STDOUT: ")
            with open(self.stdout_file_path, "r") as stdout_file:
                self.print_message(stdout_file.read())
            os.remove(self.stdout_file_path)

    def monitor_loop_function(self, time_passed=30):
        if (self.application_process != None):

            application_process_return_code = None
            try:
                application_process_return_code = self.application_process.poll()
            except Exception as e:
                self.print_message("[ApplicationMonitor] ERROR - exception occurred while trying to poll application status!")
                self.print_message("[ApplicationMonitor] Exception: " + str(e))
                self.error_has_occurred = True
                self.error_reason = "Exception when polling application status"
                self.error_code = 1
                return

            # If it is not None, then the application finished
            if (application_process_return_code != None):

                self.print_message("[ApplicationMonitor] Monitor application has stopped! Processing result...")

                if (application_process_return_code != 0):
                    self.print_message("[ApplicationMonitor] ERROR - Something Crashed in Canary/Application!")
                    self.print_message("[ApplicationMonitor] Error code: " + str(application_process_return_code))

                    self.error_has_occurred = True
                    self.error_reason = "Canary application crashed!"
                    self.error_code = application_process_return_code
                else:
                    # Should we restart?
                    if (self.wrapper_application_restart_on_finish == True):
                        self.print_message("[ApplicationMonitor] NOTE - Canary finished running and is restarting...")
                        self.restart_monitoring()
                    else:
                        self.print_message("[ApplicationMonitor] Monitor application has stopped and monitor is not supposed to restart... Finishing...")
                        self.error_has_occurred = True
                        self.error_reason = "Canary Application Finished"
                        self.error_code = 0
            else:
                self.print_message("[ApplicationMonitor] Monitor application is still running...")

    def cleanup_monitor(self, error_occurred=False):
        pass

    def print_message(self, message):
        if (self.data_snapshot != None):
            self.data_snapshot.print_message(message)
        else:
            print(message, flush=True)

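
# Illustrative sketch (defined here but never called): roughly how a wrapper script drives
# the application and snapshot monitors together. The 30 second interval and the cleanup
# order are hypothetical - the real wrappers wrap this loop with argument parsing, email,
# and ticket reporting.
def _example_wrapper_monitor_loop(snapshot_monitor, application_monitor, execution_sleep_time=30):
    application_monitor.start_monitoring()
    while True:
        # Let both monitors perform their periodic work
        snapshot_monitor.monitor_loop_function(
            psutil_process=application_monitor.application_process_psutil, time_passed=execution_sleep_time)
        application_monitor.monitor_loop_function(time_passed=execution_sleep_time)

        # Stop as soon as either monitor reports a problem or the application finished
        if (snapshot_monitor.had_internal_error == True or application_monitor.error_has_occurred == True):
            break
        time.sleep(execution_sleep_time)

    snapshot_monitor.cleanup_monitor(error_occurred=application_monitor.error_has_occurred)
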
# ================================================================================

class S3Monitor():

    def __init__(self, s3_bucket_name, s3_file_name, s3_file_name_in_zip, canary_local_application_path, data_snapshot) -> None:
        self.s3_client = None
        self.s3_current_object_version_id = None
        self.s3_current_object_last_modified = None
        self.s3_bucket_name = s3_bucket_name
        self.s3_file_name = s3_file_name
        self.s3_file_name_only_path, self.s3_file_name_only_extension = os.path.splitext(s3_file_name)
        self.data_snapshot = data_snapshot

        self.canary_local_application_path = canary_local_application_path

        self.s3_file_name_in_zip = s3_file_name_in_zip
        self.s3_file_name_in_zip_only_path = None
        self.s3_file_name_in_zip_only_extension = None
        if (self.s3_file_name_in_zip != None):
            self.s3_file_name_in_zip_only_path, self.s3_file_name_in_zip_only_extension = os.path.splitext(s3_file_name_in_zip)

        self.s3_file_needs_replacing = False

        self.had_internal_error = False
        self.error_due_to_credentials = False
        self.internal_error_reason = ""

        # Check for valid credentials
        # ==================
        try:
            tmp_sts_client = boto3.client('sts')
            tmp_sts_client.get_caller_identity()
        except Exception as e:
            self.print_message("[S3Monitor] ERROR - (S3 Check) AWS credentials are NOT valid!")
            self.had_internal_error = True
            self.error_due_to_credentials = True
            self.internal_error_reason = "AWS credentials are NOT valid!"
            return
        # ==================

        try:
            self.s3_client = boto3.client("s3")
        except Exception as e:
            self.print_message("[S3Monitor] ERROR - (S3 Check) Could not make S3 client")
            self.had_internal_error = True
            self.internal_error_reason = "Could not make S3 client for S3 Monitor"
            return


    def check_for_file_change(self):
        try:
            version_check_response = self.s3_client.list_object_versions(
                Bucket=self.s3_bucket_name,
                Prefix=self.s3_file_name_only_path)
            if "Versions" in version_check_response:
                for version in version_check_response["Versions"]:
                    if (version["IsLatest"] == True):
                        if (version["VersionId"] != self.s3_current_object_version_id or
                            version["LastModified"] != self.s3_current_object_last_modified):

                            self.print_message("[S3Monitor] Found new version of Canary/Application in S3!")
                            self.print_message("[S3Monitor] Changing running Canary/Application to new one...")

                            # Will be checked by thread to trigger replacing the file
                            self.s3_file_needs_replacing = True

                            self.s3_current_object_version_id = version["VersionId"]
                            self.s3_current_object_last_modified = version["LastModified"]
                            return

        except Exception as e:
            self.print_message("[S3Monitor] ERROR - Could not check for new version of file in S3 due to exception!")
            self.print_message("[S3Monitor] Exception: " + str(e))
            self.had_internal_error = True
            self.internal_error_reason = "Could not check for S3 file due to exception in S3 client"


    def replace_current_file_for_new_file(self):
        try:
            self.print_message("[S3Monitor] Making directory...")
            if not os.path.exists("tmp"):
                os.makedirs("tmp")
        except Exception as e:
            self.print_message ("[S3Monitor] ERROR - could not make tmp directory to place S3 file into!")
            self.had_internal_error = True
            self.internal_error_reason = "Could not make TMP folder for S3 file download"
            return

        # Download the file
        new_file_path = "tmp/new_file" + self.s3_file_name_only_extension
        try:
            self.print_message("[S3Monitor] Downloading file...")
            s3_resource = boto3.resource("s3")
            s3_resource.meta.client.download_file(self.s3_bucket_name, self.s3_file_name, new_file_path)
        except Exception as e:
            self.print_message("[S3Monitor] ERROR - could not download latest S3 file into TMP folder!")
            self.had_internal_error = True
            self.internal_error_reason = "Could not download latest S3 file into TMP folder"
            return

        # Is it a zip file?
        if (self.s3_file_name_in_zip != None):
            self.print_message("[S3Monitor] New file is zip file. Unzipping...")
            # Unzip it!
            with zipfile.ZipFile(new_file_path, 'r') as zip_file:
                zip_file.extractall("tmp/new_file_zip")
                new_file_path = "tmp/new_file_zip/" + self.s3_file_name_in_zip_only_path + self.s3_file_name_in_zip_only_extension

        try:
            # Is there a file already present there?
            if os.path.exists(self.canary_local_application_path) == True:
                os.remove(self.canary_local_application_path)

            self.print_message("[S3Monitor] Moving file...")
            os.replace(new_file_path, self.canary_local_application_path)
            self.print_message("[S3Monitor] Getting execution rights...")
            os.system("chmod u+x " + self.canary_local_application_path)

        except Exception as e:
            self.print_message("[S3Monitor] ERROR - could not move file into local application path due to exception!")
            self.print_message("[S3Monitor] Exception: " + str(e))
            self.had_internal_error = True
            self.internal_error_reason = "Could not move file into local application path"
            return

        self.print_message("[S3Monitor] New file downloaded and moved into correct location!")
        self.s3_file_needs_replacing = False


    def stop_monitoring(self):
        # Stub - just added for consistency
        pass


    def start_monitoring(self):
        # Stub - just added for consistency
        pass


    def restart_monitoring(self):
        # Stub - just added for consistency
        pass


    def cleanup_monitor(self):
        # Stub - just added for consistency
        pass

    def monitor_loop_function(self, time_passed=30):
        self.check_for_file_change()

    def print_message(self, message):
        if (self.data_snapshot != None):
            self.data_snapshot.print_message(message)
        else:
            print(message, flush=True)

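
# Illustrative sketch (defined here but never called): S3Monitor only flips
# s3_file_needs_replacing when it sees a new object version - the wrapper is expected to
# stop the running application, swap the file, and start it again. The names below are
# hypothetical.
def _example_s3_monitor_swap(s3_monitor, application_monitor):
    s3_monitor.monitor_loop_function(time_passed=30)
    if (s3_monitor.s3_file_needs_replacing == True):
        application_monitor.stop_monitoring()
        s3_monitor.replace_current_file_for_new_file()
        application_monitor.start_monitoring()
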
# ================================================================================


# Cuts a ticket to SIM using a temporary Cloudwatch metric that is quickly created, triggered, and destroyed.
# Can be called in any thread - creates its own Cloudwatch client and any data it needs is passed in.
#
# See (https://w.amazon.com/bin/view/CloudWatchAlarms/Internal/CloudWatchAlarmsSIMTicketing) for more details
# on how the alarm is sent using Cloudwatch.
def cut_ticket_using_cloudwatch(
        ticket_description="Description here!",
        ticket_reason="Reason here!",
        ticket_severity=5,
        ticket_category="AWS",
        ticket_type="SDKs and Tools",
        ticket_item="IoT SDK for CPP",
        ticket_group="AWS IoT Device SDK",
        ticket_allow_duplicates=False,
        git_repo_name="REPO NAME",
        git_hash="HASH",
        git_hash_as_namespace=False,
        git_fixed_namespace_text="mqtt5_canary",
        cloudwatch_region="us-east-1"):

    git_metric_namespace = ""
    if (git_hash_as_namespace == False):
        git_metric_namespace = git_fixed_namespace_text
    else:
        git_namespace_prepend_text = git_repo_name + "-" + git_hash
        git_metric_namespace = git_namespace_prepend_text

    try:
        cloudwatch_client = boto3.client('cloudwatch', cloudwatch_region)
        ticket_alarm_name = git_repo_name + "-" + git_hash + "-AUTO-TICKET"
    except Exception as e:
        print ("ERROR - could not create Cloudwatch client to make ticket metric alarm due to exception!")
        print ("Exception: " + str(e), flush=True)
        return

    new_metric_dimensions = []
    if (git_hash_as_namespace == False):
        git_namespace_prepend_text = git_repo_name + "-" + git_hash
        new_metric_dimensions.append(
            {"Name": git_namespace_prepend_text, "Value": ticket_alarm_name})
    else:
        new_metric_dimensions.append(
            {"Name": "System_Metrics", "Value": ticket_alarm_name})

    ticket_arn = f"arn:aws:cloudwatch::cwa-internal:ticket:{ticket_severity}:{ticket_category}:{ticket_type}:{ticket_item}:{ticket_group}:"
    if (ticket_allow_duplicates == True):
        # use "DO-NOT-DEDUPE" so we can run the same commit again and it will cut another ticket.
        ticket_arn += "DO-NOT-DEDUPE"
    # In the ticket ARN, all spaces need to be replaced with +
    ticket_arn = ticket_arn.replace(" ", "+")

    ticket_alarm_description = f"AUTO CUT CANARY WRAPPER TICKET\n\nREASON: {ticket_reason}\n\nDESCRIPTION: {ticket_description}\n\n"

    # Register a metric alarm so it can auto-cut a ticket for us
    try:
        cloudwatch_client.put_metric_alarm(
            AlarmName=ticket_alarm_name,
            AlarmDescription=ticket_alarm_description,
            MetricName=ticket_alarm_name,
            Namespace=git_metric_namespace,
            Statistic="Maximum",
            Dimensions=new_metric_dimensions,
            Period=60,  # How long (in seconds) is an evaluation period?
            EvaluationPeriods=1,  # How many periods does it need to be invalid for?
            DatapointsToAlarm=1,  # How many data points need to be invalid?
            Threshold=1,
            ComparisonOperator="GreaterThanOrEqualToThreshold",
            # The data above does not really matter - it just needs to be valid input data.
            # This is the part that tells Cloudwatch to cut the ticket
            AlarmActions=[ticket_arn]
        )
    except Exception as e:
        print ("ERROR - could not create ticket metric alarm due to exception!")
        print ("Exception: " + str(e), flush=True)
        return

    # Trigger the alarm so it cuts the ticket
    try:
        cloudwatch_client.set_alarm_state(
            AlarmName=ticket_alarm_name,
            StateValue="ALARM",
            StateReason="AUTO TICKET CUT")
    except Exception as e:
        print ("ERROR - could not cut ticket due to exception!")
        print ("Exception: " + str(e), flush=True)
        return

    print("Waiting for ticket metric to trigger...", flush=True)
    # Wait a little bit (2 seconds)...
    time.sleep(2)

    # Remove the metric
    print("Removing ticket metric...", flush=True)
    cloudwatch_client.delete_alarms(AlarmNames=[ticket_alarm_name])

    print ("Finished cutting ticket via Cloudwatch!", flush=True)
    return
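
# Illustrative sketch (comment only): a manual invocation of the helper above with
# hypothetical repository details. Note that severity 6+ is treated as "do not cut a
# ticket" by cut_ticket_using_cloudwatch_from_args below.
#
#   cut_ticket_using_cloudwatch(
#       ticket_description="Canary crashed with error code 1",
#       ticket_reason="Canary application crashed",
#       ticket_severity=5,
#       git_repo_name="example-repo",
#       git_hash="0123abc",
#       ticket_allow_duplicates=False)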

# A helper function that fills in the majority of the ticket information from the parsed argparse arguments.
def cut_ticket_using_cloudwatch_from_args(
        ticket_description="",
        ticket_reason="",
        ticket_severity=6,
        arguments=None):

    # Do not cut a ticket for a severity of 6+
    if (ticket_severity >= 6):
        return

    cut_ticket_using_cloudwatch(
        ticket_description=ticket_description,
        ticket_reason=ticket_reason,
        ticket_severity=ticket_severity,
        ticket_category=arguments.ticket_category,
        ticket_type=arguments.ticket_type,
        ticket_item=arguments.ticket_item,
        ticket_group=arguments.ticket_group,
        ticket_allow_duplicates=False,
        git_repo_name=arguments.git_repo_name,
        git_hash=arguments.git_hash,
        git_hash_as_namespace=arguments.git_hash_as_namespace)