# Contains all of the classes that are shared across both the Canary Wrapper and the Persistent Canary Wrapper scripts
# If a class can be (or is) reused, then it should live in this file.

# Needs to be installed prior to running
import boto3
import psutil
# Part of standard packages in Python 3.4+
import time
import os
import json
import subprocess
import zipfile
import datetime

# ================================================================================

# Class that holds metric data and has a few utility functions for getting that data in a format we can use for Cloudwatch
class DataSnapshot_Metric():
    def __init__(self, metric_name, metric_function, metric_dimensions=[],
                metric_unit="None", metric_alarm_threshold=None, metric_alarm_severity=6,
                git_hash="", git_repo_name="", reports_to_skip=0, is_percent=False):
        self.metric_name = metric_name
        self.metric_function = metric_function
        self.metric_dimensions = metric_dimensions
        self.metric_unit = metric_unit
        self.metric_alarm_threshold = metric_alarm_threshold
        self.metric_alarm_name = self.metric_name + "-" + git_repo_name + "-" + git_hash
        self.metric_alarm_description = 'Alarm for metric "' + self.metric_name + '" - git hash: ' + git_hash
        self.metric_value = None
        self.reports_to_skip = reports_to_skip
        self.metric_alarm_severity = metric_alarm_severity
        self.is_percent = is_percent

    # Gets the latest metric value from the metric_function callback
    def get_metric_value(self, psutil_process : psutil.Process):
        if not self.metric_function is None:
            self.metric_value = self.metric_function(psutil_process)
        return self.metric_value

    # Returns the data needed to send to Cloudwatch when posting metrics
    def get_metric_cloudwatch_dictionary(self):
        if (self.reports_to_skip > 0):
            self.reports_to_skip -= 1
            return None # skips sending to Cloudwatch

        if (self.metric_value == None):
            return None # skips sending to Cloudwatch

        return {
            "MetricName": self.metric_name,
            "Dimensions": self.metric_dimensions,
            "Value": self.metric_value,
            "Unit": self.metric_unit
        }

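# Illustrative sketch (not executed): how a metric callback and the resulting Cloudwatch
# datum fit together. The callback, unit, and threshold below are hypothetical examples,
# not values used elsewhere in this file.
#
#   def get_memory_usage(psutil_process):
#       return psutil_process.memory_info().rss / 1024.0 / 1024.0  # resident memory in MB
#
#   metric = DataSnapshot_Metric(
#       metric_name="total_memory_usage_value",
#       metric_function=get_memory_usage,
#       metric_unit="Megabytes",
#       metric_alarm_threshold=1024,
#       git_hash="<hash>", git_repo_name="<repo>")
#   metric.get_metric_value(psutil.Process())
#   datum = metric.get_metric_cloudwatch_dictionary()
#   # datum is either None (report skipped) or a dict ready for put_metric_data:
#   # {"MetricName": ..., "Dimensions": [...], "Value": ..., "Unit": "Megabytes"}
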
class DataSnapshot_Dashboard_Widget():
    def __init__(self, widget_name, metric_namespace, metric_dimension, cloudwatch_region="us-east-1", widget_period=60) -> None:
        self.metric_list = []
        self.region = cloudwatch_region
        self.widget_name = widget_name
        self.metric_namespace = metric_namespace
        self.metric_dimension = metric_dimension
        self.widget_period = widget_period

    def add_metric_to_widget(self, new_metric_name):
        try:
            self.metric_list.append(new_metric_name)
        except Exception as e:
            print ("[DataSnapshot_Dashboard] ERROR - could not add metric to dashboard widget due to exception!")
            print ("[DataSnapshot_Dashboard] Exception: " + str(e))

    def remove_metric_from_widget(self, existing_metric_name):
        try:
            self.metric_list.remove(existing_metric_name)
        except Exception as e:
            print ("[DataSnapshot_Dashboard] ERROR - could not remove metric from dashboard widget due to exception!")
            print ("[DataSnapshot_Dashboard] Exception: " + str(e))

    def get_widget_dictionary(self):
        metric_list_json = []
        for metric_name in self.metric_list:
            metric_list_json.append([self.metric_namespace, metric_name, self.metric_dimension, metric_name])

        return {
            "type":"metric",
            "properties" : {
                "metrics" : metric_list_json,
                "region": self.region,
                "title": self.widget_name,
                "period": self.widget_period,
            },
            "width": 14,
            "height": 10
        }

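# Illustrative sketch (not executed): each entry in the widget's "metrics" array uses
# Cloudwatch's [namespace, metric name, dimension name, dimension value] layout, so a
# widget built from hypothetical values would serialize roughly as:
#
#   widget = DataSnapshot_Dashboard_Widget(
#       widget_name="Process CPU", metric_namespace="mqtt5_canary",
#       metric_dimension="<repo>-<hash>")
#   widget.add_metric_to_widget("total_cpu_usage_value")
#   widget.get_widget_dictionary()
#   # -> {"type": "metric",
#   #     "properties": {"metrics": [["mqtt5_canary", "total_cpu_usage_value",
#   #                                 "<repo>-<hash>", "total_cpu_usage_value"]],
#   #                    "region": "us-east-1", "title": "Process CPU", "period": 60},
#   #     "width": 14, "height": 10}
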
# ================================================================================

# Class that keeps track of the metrics registered, sets up Cloudwatch and S3, and sends periodic reports
# Is the backbone of the reporting operation
class DataSnapshot():
    def __init__(self,
                 git_hash=None,
                 git_repo_name=None,
                 git_hash_as_namespace=False,
                 git_fixed_namespace_text="mqtt5_canary",
                 datetime_string=None,
                 output_log_filepath=None,
                 output_to_console=True,
                 cloudwatch_region="us-east-1",
                 cloudwatch_make_dashboard=False,
                 cloudwatch_teardown_alarms_on_complete=True,
                 cloudwatch_teardown_dashboard_on_complete=True,
                 s3_bucket_name="canary-wrapper-bucket",
                 s3_bucket_upload_on_complete=True,
                 lambda_name="CanarySendEmailLambda",
                 metric_frequency=None):

        # Setting initial values
        # ==================
        self.first_metric_call = True
        self.metrics = []
        self.metrics_numbers = []
        self.metric_report_number = 0
        self.metric_report_non_zero_count = 4

        # Needed so we can initialize Cloudwatch alarms, etc, outside of the init function
        # but before we start sending data.
        # This boolean tracks whether we have done the post-initialization prior to sending the first report.
        self.perform_final_initialization = True

        # Watched by the thread creating the snapshot. Will cause the thread(s) to abort and return an error.
        self.abort_due_to_internal_error = False
        self.abort_due_to_internal_error_reason = ""
        self.abort_due_to_internal_error_due_to_credentials = False

        self.git_hash = None
        self.git_repo_name = None
        self.git_hash_as_namespace = git_hash_as_namespace
        self.git_fixed_namespace_text = git_fixed_namespace_text
        self.git_metric_namespace = None

        self.cloudwatch_region = cloudwatch_region
        self.cloudwatch_client = None
        self.cloudwatch_make_dashboard = cloudwatch_make_dashboard
        self.cloudwatch_teardown_alarms_on_complete = cloudwatch_teardown_alarms_on_complete
        self.cloudwatch_teardown_dashboard_on_complete = cloudwatch_teardown_dashboard_on_complete
        self.cloudwatch_dashboard_name = ""
        self.cloudwatch_dashboard_widgets = []

        self.s3_bucket_name = s3_bucket_name
        self.s3_client = None
        self.s3_bucket_upload_on_complete = s3_bucket_upload_on_complete

        self.output_to_file_filepath = output_log_filepath
        self.output_to_file = False
        self.output_file = None
        self.output_to_console = output_to_console

        self.lambda_client = None
        self.lambda_name = lambda_name

        self.datetime_string = datetime_string
        self.metric_frequency = metric_frequency
        # ==================

        # Check for valid credentials
        # ==================
        try:
            tmp_sts_client = boto3.client('sts')
            tmp_sts_client.get_caller_identity()
        except Exception as e:
            print ("[DataSnapshot] ERROR - AWS credentials are NOT valid!")
            self.abort_due_to_internal_error = True
            self.abort_due_to_internal_error_reason = "AWS credentials are NOT valid!"
            self.abort_due_to_internal_error_due_to_credentials = True
            return
        # ==================

        # Git related stuff
        # ==================
        if (git_hash == None or git_repo_name == None):
            print("[DataSnapshot] ERROR - a Git hash and repository name are REQUIRED for the canary wrapper to run!")
            self.abort_due_to_internal_error = True
            self.abort_due_to_internal_error_reason = "No Git hash and repository passed!"
            return

        self.git_hash = git_hash
        self.git_repo_name = git_repo_name

        if (self.git_hash_as_namespace == False):
            self.git_metric_namespace = self.git_fixed_namespace_text
        else:
            if (self.datetime_string == None):
                git_namespace_prepend_text = self.git_repo_name + "-" + self.git_hash
            else:
                git_namespace_prepend_text = self.git_repo_name + "/" + self.datetime_string + "-" + self.git_hash
            self.git_metric_namespace = git_namespace_prepend_text
        # ==================

        # Cloudwatch related stuff
        # ==================
        try:
            self.cloudwatch_client = boto3.client('cloudwatch', self.cloudwatch_region)
            self.cloudwatch_dashboard_name = self.git_metric_namespace
        except Exception as e:
            self.print_message("[DataSnapshot] ERROR - could not make Cloudwatch client due to exception!")
            self.print_message("[DataSnapshot] Exception: " + str(e))
            self.cloudwatch_client = None
            self.abort_due_to_internal_error = True
            self.abort_due_to_internal_error_reason = "Could not make Cloudwatch client!"
            return
        # ==================

        # S3 related stuff
        # ==================
        try:
            self.s3_client = boto3.client("s3")
        except Exception as e:
            self.print_message("[DataSnapshot] ERROR - could not make S3 client due to exception!")
            self.print_message("[DataSnapshot] Exception: " + str(e))
            self.s3_client = None
            self.abort_due_to_internal_error = True
            self.abort_due_to_internal_error_reason = "Could not make S3 client!"
            return
        # ==================

        # Lambda related stuff
        # ==================
        try:
            self.lambda_client = boto3.client("lambda", self.cloudwatch_region)
        except Exception as e:
            self.print_message("[DataSnapshot] ERROR - could not make Lambda client due to exception!")
            self.print_message("[DataSnapshot] Exception: " + str(e))
            self.lambda_client = None
            self.abort_due_to_internal_error = True
            self.abort_due_to_internal_error_reason = "Could not make Lambda client!"
            return
        # ==================

        # File output (logs) related stuff
        # ==================
        if (not output_log_filepath is None):
            self.output_to_file = True
            self.output_file = open(self.output_to_file_filepath, "w")
        else:
            self.output_to_file = False
            self.output_file = None
        # ==================

        self.print_message("[DataSnapshot] Data snapshot created!")

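    # Illustrative sketch (not executed): a minimal DataSnapshot setup similar to what the
    # wrapper scripts create. The hash, repository name, and bucket below are hypothetical
    # placeholders.
    #
    #   data_snapshot = DataSnapshot(
    #       git_hash="<commit hash>",
    #       git_repo_name="<repo>",
    #       output_log_filepath="canary_wrapper.log",
    #       cloudwatch_region="us-east-1",
    #       s3_bucket_name="canary-wrapper-bucket",
    #       metric_frequency=30)
    #   if data_snapshot.abort_due_to_internal_error:
    #       print(data_snapshot.abort_due_to_internal_error_reason)
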
    # Cleans the class - closing any files, removing alarms, and sending data to S3.
    # Should be called at the end when you are totally finished shadowing metrics
    def cleanup(self, error_occurred=False):
        if (self.s3_bucket_upload_on_complete == True):
            self.export_result_to_s3_bucket(copy_output_log=True, log_is_error=error_occurred)

        self._cleanup_cloudwatch_alarms()
        if (self.cloudwatch_make_dashboard == True):
            self._cleanup_cloudwatch_dashboard()

        self.print_message("[DataSnapshot] Data snapshot cleaned!")

        if (self.output_file is not None):
            self.output_file.close()
            self.output_file = None

    # Utility function for printing messages
    def print_message(self, message):
        if self.output_to_file == True:
            self.output_file.write(message + "\n")
        if self.output_to_console == True:
            print(message, flush=True)

    # Utility function - adds the metric alarms to Cloudwatch. We do run this right before the first
    # collection of metrics so we can register metrics before we initialize Cloudwatch
    def _init_cloudwatch_pre_first_run(self):
        for metric in self.metrics:
            if (not metric.metric_alarm_threshold is None):
                self._add_cloudwatch_metric_alarm(metric)

        if (self.cloudwatch_make_dashboard == True):
            self._init_cloudwatch_pre_first_run_dashboard()

    # Utility function - adds the Cloudwatch Dashboard for the currently running data snapshot
    def _init_cloudwatch_pre_first_run_dashboard(self):
        try:
            # Remove the old dashboard if it exists before adding a new one
            self._cleanup_cloudwatch_dashboard()

            new_dashboard_widgets_array = []
            for widget in self.cloudwatch_dashboard_widgets:
                new_dashboard_widgets_array.append(widget.get_widget_dictionary())

            new_dashboard_body = {
                "start": "-PT1H",
                "widgets": new_dashboard_widgets_array,
            }
            new_dashboard_body_json = json.dumps(new_dashboard_body)

            self.cloudwatch_client.put_dashboard(
                DashboardName=self.cloudwatch_dashboard_name,
                DashboardBody=new_dashboard_body_json)
            self.print_message("[DataSnapshot] Added Cloudwatch dashboard successfully")
        except Exception as e:
            self.print_message("[DataSnapshot] ERROR - Cloudwatch client could not make dashboard due to exception!")
            self.print_message("[DataSnapshot] Exception: " + str(e))
            self.abort_due_to_internal_error = True
            self.abort_due_to_internal_error_reason = "Cloudwatch client could not make dashboard due to exception"
            return

    # Utility function - The function that adds each individual metric alarm.
    def _add_cloudwatch_metric_alarm(self, metric):
        if self.cloudwatch_client is None:
            self.print_message("[DataSnapshot] ERROR - Cloudwatch client not setup. Cannot register alarm")
            return

        try:
            self.cloudwatch_client.put_metric_alarm(
                AlarmName=metric.metric_alarm_name,
                AlarmDescription=metric.metric_alarm_description,
                MetricName=metric.metric_name,
                Namespace=self.git_metric_namespace,
                Statistic="Maximum",
                Dimensions=metric.metric_dimensions,
                Period=60,  # How long (in seconds) is an evaluation period?
                EvaluationPeriods=120,  # How many periods does it need to be invalid for?
                DatapointsToAlarm=1,  # How many data points need to be invalid?
                Threshold=metric.metric_alarm_threshold,
                ComparisonOperator="GreaterThanOrEqualToThreshold",
            )
        except Exception as e:
            self.print_message("[DataSnapshot] ERROR - could not register alarm for metric due to exception: " + metric.metric_name)
            self.print_message("[DataSnapshot] Exception: " + str(e))

    # Utility function - removes all the Cloudwatch alarms for the metrics
    def _cleanup_cloudwatch_alarms(self):
        if (self.cloudwatch_teardown_alarms_on_complete == True):
            try:
                for metric in self.metrics:
                    if (not metric.metric_alarm_threshold is None):
                        self.cloudwatch_client.delete_alarms(AlarmNames=[metric.metric_alarm_name])
            except Exception as e:
                self.print_message("[DataSnapshot] ERROR - could not delete alarms due to exception!")
                self.print_message("[DataSnapshot] Exception: " + str(e))

    # Utility function - removes all Cloudwatch dashboards created
    def _cleanup_cloudwatch_dashboard(self):
        if (self.cloudwatch_teardown_dashboard_on_complete == True):
            try:
                self.cloudwatch_client.delete_dashboards(DashboardNames=[self.cloudwatch_dashboard_name])
                self.print_message("[DataSnapshot] Cloudwatch Dashboards deleted successfully!")
            except Exception as e:
                self.print_message("[DataSnapshot] ERROR - dashboard cleaning function failed due to exception!")
                self.print_message("[DataSnapshot] Exception: " + str(e))
                self.abort_due_to_internal_error = True
                self.abort_due_to_internal_error_reason = "Cloudwatch dashboard cleaning function failed due to exception"
                return

    # Returns the results of the metric alarms as a list of entries with the following structure:
    # [Boolean (False = the alarm is in the ALARM state), String (name of the alarm in the ALARM state), int (severity of the alarm)]
    # Currently this function only returns the alarms that failed, so an empty list means none of the
    # registered metrics' alarms reached the ALARM state in Cloudwatch.
    def get_cloudwatch_alarm_results(self):
        return self._check_cloudwatch_alarm_states()

    # Utility function - collects the metric alarm results and returns them in a list.
    def _check_cloudwatch_alarm_states(self):
        return_result_list = []

        tmp = None
        for metric in self.metrics:
            tmp = self._check_cloudwatch_alarm_state_metric(metric)
            if (tmp[1] != None):
                # Do not cut a ticket for the "Alive_Alarm" that we use to check if the Canary is running
                if ("Alive_Alarm" not in tmp[1]):
                    if (tmp[0] != True):
                        return_result_list.append(tmp)

        return return_result_list

    # Utility function - checks each individual alarm and returns a tuple with the following format:
    # [Boolean (False if the alarm is in the ALARM state, otherwise it is true), String (name of the alarm), Int (severity of alarm)]
    def _check_cloudwatch_alarm_state_metric(self, metric):
        alarms_response = self.cloudwatch_client.describe_alarms_for_metric(
            MetricName=metric.metric_name,
            Namespace=self.git_metric_namespace,
            Dimensions=metric.metric_dimensions)

        return_result = [True, None, metric.metric_alarm_severity]

        for metric_alarm_dict in alarms_response["MetricAlarms"]:
            if metric_alarm_dict["StateValue"] == "ALARM":
                return_result[0] = False
                return_result[1] = metric_alarm_dict["AlarmName"]
                break

        return return_result

    # Exports a file with the same name as the commit Git hash to an S3 bucket in a folder with the Git repo name.
    # By default, this file will only contain the Git hash.
    # If copy_output_log is true, then the output log will be copied into this file, which may be useful for debugging.
    def export_result_to_s3_bucket(self, copy_output_log=False, log_is_error=False):
        if (self.s3_client is None):
            self.print_message("[DataSnapshot] ERROR - No S3 client initialized! Cannot send log to S3")
            self.abort_due_to_internal_error = True
            self.abort_due_to_internal_error_reason = "S3 client not initialized and therefore cannot send log to S3"
            return

        s3_file = open(self.git_hash + ".log", "w")
        s3_file.write(self.git_hash)

        # Might be useful for debugging?
        if (copy_output_log == True and self.output_to_file == True):
            # Are we still writing? If so, then we need to close the file first so everything is written to it
            is_output_file_open_previously = False
            if (self.output_file != None):
                self.output_file.close()
                is_output_file_open_previously = True
            self.output_file = open(self.output_to_file_filepath, "r")

            s3_file.write("\n\nOUTPUT LOG\n")
            s3_file.write("==========================================================================================\n")
            output_file_lines = self.output_file.readlines()
            for line in output_file_lines:
                s3_file.write(line)

            self.output_file.close()

            # If we were writing to the output previously, then we need to re-open it in append mode so we can continue to write to it
            if (is_output_file_open_previously == True):
                self.output_file = open(self.output_to_file_filepath, "a")

        s3_file.close()

        # Upload to S3
        try:
            if (log_is_error == False):
                if (self.datetime_string == None):
                    self.s3_client.upload_file(self.git_hash + ".log", self.s3_bucket_name, self.git_repo_name + "/" + self.git_hash + ".log")
                else:
                    self.s3_client.upload_file(self.git_hash + ".log", self.s3_bucket_name, self.git_repo_name + "/" + self.datetime_string + "/" + self.git_hash + ".log")
            else:
                if (self.datetime_string == None):
                    self.s3_client.upload_file(self.git_hash + ".log", self.s3_bucket_name, self.git_repo_name + "/Failed_Logs/" + self.git_hash + ".log")
                else:
                    self.s3_client.upload_file(self.git_hash + ".log", self.s3_bucket_name, self.git_repo_name + "/Failed_Logs/" + self.datetime_string + "/" + self.git_hash + ".log")
            self.print_message("[DataSnapshot] Uploaded to S3!")
        except Exception as e:
            self.print_message("[DataSnapshot] ERROR - could not upload to S3 due to exception!")
            self.print_message("[DataSnapshot] Exception: " + str(e))
            self.abort_due_to_internal_error = True
            self.abort_due_to_internal_error_reason = "S3 client had exception and therefore could not upload log!"
            os.remove(self.git_hash + ".log")
            return

        # Delete the file when finished
        os.remove(self.git_hash + ".log")

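    # Illustrative sketch of the S3 key layout produced above, using the hypothetical values
    # git_repo_name="<repo>", git_hash="<hash>", and datetime_string="<datetime>":
    #   <repo>/<hash>.log                          (success, no datetime string)
    #   <repo>/<datetime>/<hash>.log               (success, with datetime string)
    #   <repo>/Failed_Logs/<hash>.log              (failure, no datetime string)
    #   <repo>/Failed_Logs/<datetime>/<hash>.log   (failure, with datetime string)
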
    # Sends an email via a special lambda. The payload has to contain a message and a subject
    # * (REQUIRED) message is the message you want to send in the body of the email
    # * (REQUIRED) subject is the subject that the email will be sent with
    def lambda_send_email(self, message, subject):

        payload = {"Message":message, "Subject":subject}
        payload_string = json.dumps(payload)

        try:
            self.lambda_client.invoke(
                FunctionName=self.lambda_name,
                InvocationType="Event",
                ClientContext="MQTT Wrapper Script",
                Payload=payload_string
            )
        except Exception as e:
            self.print_message("[DataSnapshot] ERROR - could not send email via Lambda due to exception!")
            self.print_message("[DataSnapshot] Exception: " + str(e))
            self.abort_due_to_internal_error = True
            self.abort_due_to_internal_error_reason = "Lambda email function had an exception!"
            return

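    # Illustrative sketch (not executed): the Lambda named above ("CanarySendEmailLambda" by
    # default) is invoked asynchronously with a JSON payload of {"Message": ..., "Subject": ...}.
    # The text below is a hypothetical example.
    #
    #   data_snapshot.lambda_send_email(
    #       message="Canary stopped due to metrics in ALARM",
    #       subject="Canary: <repo>:<hash> - alarm")
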
    # Registers a metric to be polled by the Snapshot.
    # * (REQUIRED) new_metric_name is the name of the metric. Cloudwatch will use this name
    # * (REQUIRED) new_metric_function is expected to be a pointer to a Python function and will not work if you pass a value/object
    # * (OPTIONAL) new_metric_unit is the metric unit. There is a list of possible metric unit types in the Boto3 documentation for Cloudwatch
    # * (OPTIONAL) new_metric_alarm_threshold is the value that the metric has to exceed in order to be registered as an alarm
    # * (OPTIONAL) new_metric_reports_to_skip is the number of reports for which this metric will report nothing, although its value is still collected
    #     * Useful for CPU calculations that require deltas
    # * (OPTIONAL) new_metric_alarm_severity is the severity of the ticket if this alarm is triggered. A severity of 6+ means no ticket.
    # * (OPTIONAL) is_percent determines whether the metric is displayed as a percentage when printed (default=False)
    def register_metric(self, new_metric_name, new_metric_function, new_metric_unit="None",
                        new_metric_alarm_threshold=None, new_metric_reports_to_skip=0, new_metric_alarm_severity=6, is_percent=False):

        new_metric_dimensions = []

        if (self.git_hash_as_namespace == False):
            git_namespace_prepend_text = self.git_repo_name + "-" + self.git_hash
            new_metric_dimensions.append(
                {"Name": git_namespace_prepend_text, "Value": new_metric_name})
        else:
            new_metric_dimensions.append(
                {"Name": "System_Metrics", "Value": new_metric_name})

        new_metric = DataSnapshot_Metric(
            metric_name=new_metric_name,
            metric_function=new_metric_function,
            metric_dimensions=new_metric_dimensions,
            metric_unit=new_metric_unit,
            metric_alarm_threshold=new_metric_alarm_threshold,
            metric_alarm_severity=new_metric_alarm_severity,
            git_hash=self.git_hash,
            git_repo_name=self.git_repo_name,
            reports_to_skip=new_metric_reports_to_skip,
            is_percent=is_percent
        )
        self.metrics.append(new_metric)
        # append an empty list so we can track its values over time
        self.metrics_numbers.append([])

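    # Illustrative sketch (not executed): registering a metric and a dashboard widget that
    # graphs it. The callback, threshold, and widget name are hypothetical examples.
    #
    #   def get_metric_total_cpu_usage(psutil_process):
    #       return psutil_process.cpu_percent(interval=None)
    #
    #   data_snapshot.register_metric(
    #       new_metric_name="total_cpu_usage_value",
    #       new_metric_function=get_metric_total_cpu_usage,
    #       new_metric_unit="Percent",
    #       new_metric_alarm_threshold=70,
    #       new_metric_reports_to_skip=1,  # the first cpu_percent(interval=None) call returns 0.0
    #       new_metric_alarm_severity=5,
    #       is_percent=True)
    #   data_snapshot.register_dashboard_widget("Process CPU", metrics_to_add=["total_cpu_usage_value"])
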
    def register_dashboard_widget(self, new_widget_name, metrics_to_add=[], new_widget_period=60):

        # We need to know what metric dimension to get the metric(s) from
        metric_dimension_string = ""
        if (self.git_hash_as_namespace == False):
            metric_dimension_string = self.git_repo_name + "-" + self.git_hash
        else:
            metric_dimension_string = "System_Metrics"

        widget = self._find_cloudwatch_widget(name=new_widget_name)
        if (widget == None):
            widget = DataSnapshot_Dashboard_Widget(
                widget_name=new_widget_name, metric_namespace=self.git_metric_namespace,
                metric_dimension=metric_dimension_string,
                cloudwatch_region=self.cloudwatch_region,
                widget_period=new_widget_period)
            self.cloudwatch_dashboard_widgets.append(widget)

        for metric in metrics_to_add:
            self.register_metric_to_dashboard_widget(widget_name=new_widget_name, metric_name=metric)

    def register_metric_to_dashboard_widget(self, widget_name, metric_name, widget=None):
        if widget is None:
            widget = self._find_cloudwatch_widget(name=widget_name)
            if widget is None:
                print ("[DataSnapshot] ERROR - could not find widget with name: " + widget_name, flush=True)
                return

        # Adjust metric name so it has the git hash, repo, etc
        metric_name_formatted = metric_name

        widget.add_metric_to_widget(new_metric_name=metric_name_formatted)
        return

    def remove_metric_from_dashboard_widget(self, widget_name, metric_name, widget=None):
        if widget is None:
            widget = self._find_cloudwatch_widget(name=widget_name)
            if widget is None:
                print ("[DataSnapshot] ERROR - could not find widget with name: " + widget_name, flush=True)
                return
        widget.remove_metric_from_widget(existing_metric_name=metric_name)
        return

    def _find_cloudwatch_widget(self, name):
        result = None
        for widget in self.cloudwatch_dashboard_widgets:
            if widget.widget_name == name:
                return widget
        return result

    # Prints the metrics to the console
    def export_metrics_console(self):
        datetime_now = datetime.datetime.now()
        datetime_string = datetime_now.strftime("%d-%m-%Y/%H:%M:%S")

        self.print_message("\n[DataSnapshot] Metric report: " + str(self.metric_report_number) + " (" + datetime_string + ")")
        for metric in self.metrics:
            if (metric.is_percent == True):
                self.print_message("    " + metric.metric_name +
                                " - value: " + str(metric.metric_value) + "%")
            else:
                self.print_message("    " + metric.metric_name +
                                " - value: " + str(metric.metric_value))
        self.print_message("")

    # Sends all registered metrics to Cloudwatch.
    # Does NOT need to be called in a loop. Call post_metrics in a loop to send all the metrics as expected.
    # This is just the Cloudwatch part of that loop.
    def export_metrics_cloudwatch(self):
        if (self.cloudwatch_client == None):
            self.print_message("[DataSnapshot] Error - cannot export Cloudwatch metrics! Cloudwatch was not initialized.")
            self.abort_due_to_internal_error = True
            self.abort_due_to_internal_error_reason = "Could not export Cloudwatch metrics due to no Cloudwatch client initialized!"
            return

        self.print_message("[DataSnapshot] Preparing to send to Cloudwatch...")
        metrics_data = []
        metric_data_tmp = None
        for metric in self.metrics:
            metric_data_tmp = metric.get_metric_cloudwatch_dictionary()
            if (not metric_data_tmp is None):
                metrics_data.append(metric_data_tmp)

        if (len(metrics_data) == 0):
            self.print_message("[DataSnapshot] INFO - no metric data to send. Skipping...")
            return

        try:
            self.cloudwatch_client.put_metric_data(
                Namespace=self.git_metric_namespace,
                MetricData=metrics_data)
            self.print_message("[DataSnapshot] Metrics sent to Cloudwatch.")
        except Exception as e:
            self.print_message("[DataSnapshot] Error - something went wrong posting Cloudwatch metrics!")
            self.print_message("[DataSnapshot] Exception: " + str(e))
            self.abort_due_to_internal_error = True
            self.abort_due_to_internal_error_reason = "Could not export Cloudwatch metrics due to exception in Cloudwatch client!"
            return

    # Call this at a set interval to post the metrics to Cloudwatch, etc.
    # This is the function you want to call repeatedly after you have everything setup.
    def post_metrics(self, psutil_process : psutil.Process):
        if (self.perform_final_initialization == True):
            self.perform_final_initialization = False
            self._init_cloudwatch_pre_first_run()

        # Update the metric values internally
        for i in range(0, len(self.metrics)):
            metric_value = self.metrics[i].get_metric_value(psutil_process)
            self.metrics_numbers[i].insert(0, metric_value)

            # Only keep the last metric_report_non_zero_count results
            if (len(self.metrics_numbers[i]) > self.metric_report_non_zero_count):
                amount_to_delete = len(self.metrics_numbers[i]) - self.metric_report_non_zero_count
                del self.metrics_numbers[i][-amount_to_delete:]
            # If we have metric_report_non_zero_count amount of metrics, make sure there is at least one
            # non-zero. If it is all zero, then print a log so we can easily find it
            if (len(self.metrics_numbers[i]) == self.metric_report_non_zero_count):
                non_zero_found = False
                for j in range(0, len(self.metrics_numbers[i])):
                    if (self.metrics_numbers[i][j] != 0.0 and self.metrics_numbers[i][j] != None):
                        non_zero_found = True
                        break
                if (non_zero_found == False):
                    self.print_message("\n[DataSnapshot] METRIC ZERO ERROR!")
                    self.print_message(f"[DataSnapshot] Metric index {i} has been zero for last {self.metric_report_non_zero_count} reports!")
                    self.print_message("\n")

        self.metric_report_number += 1

        self.export_metrics_console()
        self.export_metrics_cloudwatch()

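    # Illustrative sketch (not executed): post_metrics is meant to be driven on a fixed
    # interval after all metrics are registered. The process handle and the 30 second
    # cadence below are hypothetical examples (the wrappers pass in their own values).
    #
    #   canary_process = psutil.Process(application_pid)
    #   while not data_snapshot.abort_due_to_internal_error:
    #       data_snapshot.post_metrics(canary_process)
    #       time.sleep(30)
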
    def output_diagnosis_information(self, dependencies_list):

        # Print general diagnosis information
        self.print_message("\n========== Canary Wrapper diagnosis information ==========")
        self.print_message("\nRunning Canary for repository: " + self.git_repo_name)
        self.print_message("\t Commit hash: " + self.git_hash)

        if not dependencies_list == "":
            self.print_message("\nDependencies:")
            dependencies_list = dependencies_list.split(";")
            dependencies_list_found_hash = False
            for i in range(0, len(dependencies_list)):
                # There's probably a better way to do this...
                if (dependencies_list_found_hash == True):
                    dependencies_list_found_hash = False
                    continue
                self.print_message("* " + dependencies_list[i])
                if (i+1 < len(dependencies_list)):
                    self.print_message("\t Commit hash: " + dependencies_list[i+1])
                    dependencies_list_found_hash = True
                else:
                    self.print_message("\t Commit hash: Unknown")

        if (self.metric_frequency != None):
            self.print_message("\nMetric Snapshot Frequency: " + str(self.metric_frequency) + " seconds")
        self.print_message("\nMetrics:")
        for metric in self.metrics:
            self.print_message("* " + metric.metric_name)
            if metric.metric_alarm_threshold is not None:
                self.print_message("\t Alarm Threshold: " + str(metric.metric_alarm_threshold))
                self.print_message("\t Alarm Severity: " + str(metric.metric_alarm_severity))
            else:
                self.print_message("\t No alarm set for metric.")

        self.print_message("\n")
        self.print_message("==========================================================")
        self.print_message("\n")

# ================================================================================

class SnapshotMonitor():
    def __init__(self, wrapper_data_snapshot, wrapper_metrics_wait_time) -> None:

        self.data_snapshot = wrapper_data_snapshot
        self.had_internal_error = False
        self.error_due_to_credentials = False
        self.internal_error_reason = ""
        self.error_due_to_alarm = False

        self.can_cut_ticket = False
        self.has_cut_ticket = False

        # A list of all the alarms triggered in the last check, cached for later
        # NOTE - this is only the alarm names! Not the severity. This just makes it easier to process
        self.cloudwatch_current_alarms_triggered = []

        # Check for errors
        if (self.data_snapshot.abort_due_to_internal_error == True):
            self.had_internal_error = True
            self.internal_error_reason = "Could not initialize DataSnapshot. Likely credentials are not setup!"
            if (self.data_snapshot.abort_due_to_internal_error_due_to_credentials == True):
                self.error_due_to_credentials = True
            self.data_snapshot.cleanup()
            return

        # How long to wait before posting a metric
        self.metric_post_timer = 0
        self.metric_post_timer_time = wrapper_metrics_wait_time


    def register_metric(self, new_metric_name, new_metric_function, new_metric_unit="None", new_metric_alarm_threshold=None,
                        new_metric_reports_to_skip=0, new_metric_alarm_severity=6):

        try:
            self.data_snapshot.register_metric(
                new_metric_name=new_metric_name,
                new_metric_function=new_metric_function,
                new_metric_unit=new_metric_unit,
                new_metric_alarm_threshold=new_metric_alarm_threshold,
                new_metric_reports_to_skip=new_metric_reports_to_skip,
                new_metric_alarm_severity=new_metric_alarm_severity)
        except Exception as e:
            self.print_message("[SnapshotMonitor] ERROR - could not register metric in data snapshot due to exception!")
            self.print_message("[SnapshotMonitor] Exception: " + str(e))
            self.had_internal_error = True
            self.internal_error_reason = "Could not register metric in data snapshot due to exception"
            return

    def register_dashboard_widget(self, new_widget_name, metrics_to_add=[], widget_period=60):
        self.data_snapshot.register_dashboard_widget(new_widget_name=new_widget_name, metrics_to_add=metrics_to_add, new_widget_period=widget_period)

    def output_diagnosis_information(self, dependencies=""):
        self.data_snapshot.output_diagnosis_information(dependencies_list=dependencies)

    def check_alarms_for_new_alarms(self, triggered_alarms):

        if len(triggered_alarms) > 0:
            self.data_snapshot.print_message(
                "WARNING - One or more alarms are in state of ALARM")

            old_alarms_still_active = []
            new_alarms = []
            new_alarms_highest_severity = 6
            new_alarm_found = True
            new_alarm_ticket_description = "Canary has metrics in ALARM state!\n\nMetrics in alarm:\n"

            for triggered_alarm in triggered_alarms:
                new_alarm_found = True

                # Is this a new alarm?
                for old_alarm_name in self.cloudwatch_current_alarms_triggered:
                    if (old_alarm_name == triggered_alarm[1]):
                        new_alarm_found = False
                        old_alarms_still_active.append(triggered_alarm[1])

                        new_alarm_ticket_description += "* (STILL IN ALARM) " + triggered_alarm[1] + "\n"
                        new_alarm_ticket_description += "\tSeverity: " + str(triggered_alarm[2])
                        new_alarm_ticket_description += "\n"
                        break

                # If it is a new alarm, then add it to our list so we can cut a new ticket
                if (new_alarm_found == True):
                    self.data_snapshot.print_message('    (NEW) Alarm with name "' + triggered_alarm[1] + '" is in the ALARM state!')
                    new_alarms.append(triggered_alarm[1])
                    if (triggered_alarm[2] < new_alarms_highest_severity):
                        new_alarms_highest_severity = triggered_alarm[2]
                    new_alarm_ticket_description += "* " + triggered_alarm[1] + "\n"
                    new_alarm_ticket_description += "\tSeverity: " + str(triggered_alarm[2])
                    new_alarm_ticket_description += "\n"


            if len(new_alarms) > 0:
                if (self.can_cut_ticket == True):
                    cut_ticket_using_cloudwatch(
                        git_repo_name=self.data_snapshot.git_repo_name,
                        git_hash=self.data_snapshot.git_hash,
                        git_hash_as_namespace=False,
                        git_fixed_namespace_text=self.data_snapshot.git_fixed_namespace_text,
                        cloudwatch_region="us-east-1",
                        ticket_description="New metric(s) went into alarm for the Canary! Metrics in alarm: " + str(new_alarms),
                        ticket_reason="New metric(s) went into alarm",
                        ticket_allow_duplicates=True,
                        ticket_category="AWS",
                        ticket_item="IoT SDK for CPP",
                        ticket_group="AWS IoT Device SDK",
                        ticket_type="SDKs and Tools",
                        ticket_severity=4)
                    self.has_cut_ticket = True

            # Cache the new alarms and the old alarms
            self.cloudwatch_current_alarms_triggered = old_alarms_still_active + new_alarms

        else:
            self.cloudwatch_current_alarms_triggered.clear()


    def monitor_loop_function(self, psutil_process : psutil.Process, time_passed=30):
        # Check for internal errors
        if (self.data_snapshot.abort_due_to_internal_error == True):
            self.had_internal_error = True
            self.internal_error_reason = "Data Snapshot internal error: " + self.data_snapshot.abort_due_to_internal_error_reason
            return

        try:
            # Poll the metric alarms
            if (self.had_internal_error == False):
                # Get a report of all the alarms that might have been set to an alarm state
                triggered_alarms = self.data_snapshot.get_cloudwatch_alarm_results()
                self.check_alarms_for_new_alarms(triggered_alarms)
        except Exception as e:
            self.print_message("[SnapshotMonitor] ERROR - exception occurred checking metric alarms!")
            self.print_message("[SnapshotMonitor] (Likely session credentials expired)")
            self.had_internal_error = True
            self.internal_error_reason = "Exception occurred checking metric alarms! Likely session credentials expired"
            return

        if (self.metric_post_timer <= 0):
            if (self.had_internal_error == False):
                try:
                    self.data_snapshot.post_metrics(psutil_process)
                except Exception as e:
                    self.print_message("[SnapshotMonitor] ERROR - exception occurred posting metrics!")
                    self.print_message("[SnapshotMonitor] (Likely session credentials expired)")

                    print (e, flush=True)

                    self.had_internal_error = True
                    self.internal_error_reason = "Exception occurred posting metrics! Likely session credentials expired"
                    return

            # reset the timer
            self.metric_post_timer += self.metric_post_timer_time

        # Count down until the next metric post
        self.metric_post_timer -= time_passed


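    # Illustrative sketch (not executed): the wrapper scripts typically tick the monitor on a
    # fixed cadence and stop once an internal error is reported. The wait times and the
    # psutil process handle below are hypothetical examples.
    #
    #   snapshot_monitor = SnapshotMonitor(wrapper_data_snapshot=data_snapshot,
    #                                      wrapper_metrics_wait_time=600)
    #   while not snapshot_monitor.had_internal_error:
    #       snapshot_monitor.monitor_loop_function(psutil_process=canary_process, time_passed=30)
    #       time.sleep(30)
    #   snapshot_monitor.cleanup_monitor(error_occurred=snapshot_monitor.had_internal_error)
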
    def send_email(self, email_body, email_subject_text_append=None):
        if (email_subject_text_append != None):
            self.data_snapshot.lambda_send_email(email_body, "Canary: " + self.data_snapshot.git_repo_name + ":" + self.data_snapshot.git_hash + " - " + email_subject_text_append)
        else:
            self.data_snapshot.lambda_send_email(email_body, "Canary: " + self.data_snapshot.git_repo_name + ":" + self.data_snapshot.git_hash)


    def stop_monitoring(self):
        # Stub - just added for consistency
        pass


    def start_monitoring(self):
        # Stub - just added for consistency
        pass


    def restart_monitoring(self):
        # Stub - just added for consistency
        pass


    def cleanup_monitor(self, error_occurred=False):
        self.data_snapshot.cleanup(error_occurred=error_occurred)

    def print_message(self, message):
        if (self.data_snapshot != None):
            self.data_snapshot.print_message(message)
        else:
            print(message, flush=True)

# ================================================================================

class ApplicationMonitor():
    def __init__(self, wrapper_application_path, wrapper_application_arguments, wrapper_application_restart_on_finish=True, data_snapshot=None) -> None:
        self.application_process = None
        self.application_process_psutil = None
        self.error_has_occurred = False
        self.error_due_to_credentials = False
        self.error_reason = ""
        self.error_code = 0
        self.wrapper_application_path = wrapper_application_path
        self.wrapper_application_arguments = wrapper_application_arguments
        self.wrapper_application_restart_on_finish = wrapper_application_restart_on_finish
        self.data_snapshot=data_snapshot

        self.stdout_file_path = "Canary_Stdout_File.txt"

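    # Illustrative sketch (not executed): launching the canary binary under the monitor.
    # The application path and arguments are hypothetical placeholders.
    #
    #   application_monitor = ApplicationMonitor(
    #       wrapper_application_path="./canary-application",
    #       wrapper_application_arguments="--duration 3600",
    #       wrapper_application_restart_on_finish=True,
    #       data_snapshot=data_snapshot)
    #   application_monitor.start_monitoring()
    #   # ...then call application_monitor.monitor_loop_function(time_passed=30) on a timer
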
    def start_monitoring(self):
        self.print_message("[ApplicationMonitor] Starting to monitor application...")

        if (self.application_process == None):
            try:
                canary_command = self.wrapper_application_path + " " + self.wrapper_application_arguments
                self.application_process = subprocess.Popen(canary_command + " | tee " + self.stdout_file_path, shell=True)
                self.application_process_psutil = psutil.Process(self.application_process.pid)
                self.print_message ("[ApplicationMonitor] Application started...")
            except Exception as e:
                self.print_message ("[ApplicationMonitor] ERROR - Could not launch Canary/Application due to exception!")
                self.print_message ("[ApplicationMonitor] Exception: " + str(e))
                self.error_has_occurred = True
                self.error_reason = "Could not launch Canary/Application due to exception"
                self.error_code = 1
                return
        else:
            self.print_message("[ApplicationMonitor] ERROR - Monitor already has an application process! Cannot monitor two applications with one monitor class!")

    def restart_monitoring(self):
        self.print_message ("[ApplicationMonitor] Restarting monitor application...")

        if (self.application_process != None):
            try:
                self.stop_monitoring()
                self.start_monitoring()
                self.print_message("\n[ApplicationMonitor] Restarted monitor application!")
                self.print_message("================================================================================")
            except Exception as e:
                self.print_message("[ApplicationMonitor] ERROR - Could not restart Canary/Application due to exception!")
                self.print_message("[ApplicationMonitor] Exception: " + str(e))
                self.error_has_occurred = True
                self.error_reason = "Could not restart Canary/Application due to exception"
                self.error_code = 1
                return
        else:
            self.print_message("[ApplicationMonitor] ERROR - Application process restart called but process is/was not running!")
            self.error_has_occurred = True
            self.error_reason = "Could not restart Canary/Application due to application process not being started initially"
            self.error_code = 1
            return


    def stop_monitoring(self):
        self.print_message ("[ApplicationMonitor] Stopping monitor application...")
        if (not self.application_process == None):
            self.application_process.terminate()
            self.application_process.wait()
            self.print_message ("[ApplicationMonitor] Stopped monitor application!")
            self.application_process = None
            self.print_stdout()
        else:
            self.print_message ("[ApplicationMonitor] ERROR - cannot stop monitor application because no process is found!")

    def print_stdout(self):
        # Print the STDOUT file
        if (os.path.isfile(self.stdout_file_path)):
            self.print_message("Just finished Application STDOUT: ")
            with open(self.stdout_file_path, "r") as stdout_file:
                self.print_message(stdout_file.read())
            os.remove(self.stdout_file_path)

    def monitor_loop_function(self, time_passed=30):
        if (self.application_process != None):

            application_process_return_code = None
            try:
                application_process_return_code = self.application_process.poll()
            except Exception as e:
                self.print_message("[ApplicationMonitor] ERROR - exception occurred while trying to poll application status!")
                self.print_message("[ApplicationMonitor] Exception: " + str(e))
                self.error_has_occurred = True
                self.error_reason = "Exception when polling application status"
                self.error_code = 1
                return

            # If it is not none, then the application finished
            if (application_process_return_code != None):

                self.print_message("[ApplicationMonitor] Monitor application has stopped! Processing result...")

                if (application_process_return_code != 0):
                    self.print_message("[ApplicationMonitor] ERROR - Something Crashed in Canary/Application!")
                    self.print_message("[ApplicationMonitor] Error code: " + str(application_process_return_code))

                    self.error_has_occurred = True
                    self.error_reason = "Canary application crashed!"
                    self.error_code = application_process_return_code
                else:
                    # Should we restart?
                    if (self.wrapper_application_restart_on_finish == True):
                        self.print_message("[ApplicationMonitor] NOTE - Canary finished running and is restarting...")
                        self.restart_monitoring()
                    else:
                        self.print_message("[ApplicationMonitor] Monitor application has stopped and monitor is not supposed to restart... Finishing...")
                        self.error_has_occurred = True
                        self.error_reason = "Canary Application Finished"
                        self.error_code = 0
            else:
                self.print_message("[ApplicationMonitor] Monitor application is still running...")

    def cleanup_monitor(self, error_occurred=False):
        pass

    def print_message(self, message):
        if (self.data_snapshot != None):
            self.data_snapshot.print_message(message)
        else:
            print(message, flush=True)

# ================================================================================

class S3Monitor():

    def __init__(self, s3_bucket_name, s3_file_name, s3_file_name_in_zip, canary_local_application_path, data_snapshot) -> None:
        self.s3_client = None
        self.s3_current_object_version_id = None
        self.s3_current_object_last_modified = None
        self.s3_bucket_name = s3_bucket_name
        self.s3_file_name = s3_file_name
        self.s3_file_name_only_path, self.s3_file_name_only_extension = os.path.splitext(s3_file_name)
        self.data_snapshot = data_snapshot

        self.canary_local_application_path = canary_local_application_path

        self.s3_file_name_in_zip = s3_file_name_in_zip
        self.s3_file_name_in_zip_only_path = None
        self.s3_file_name_in_zip_only_extension = None
        if (self.s3_file_name_in_zip != None):
            self.s3_file_name_in_zip_only_path, self.s3_file_name_in_zip_only_extension = os.path.splitext(s3_file_name_in_zip)

        self.s3_file_needs_replacing = False

        self.had_internal_error = False
        self.error_due_to_credentials = False
        self.internal_error_reason = ""

        # Check for valid credentials
        # ==================
        try:
            tmp_sts_client = boto3.client('sts')
            tmp_sts_client.get_caller_identity()
        except Exception as e:
            self.print_message("[S3Monitor] ERROR - (S3 Check) AWS credentials are NOT valid!")
            self.had_internal_error = True
            self.error_due_to_credentials = True
            self.internal_error_reason = "AWS credentials are NOT valid!"
            return
        # ==================

        try:
            self.s3_client = boto3.client("s3")
        except Exception as e:
            self.print_message("[S3Monitor] ERROR - (S3 Check) Could not make S3 client")
            self.had_internal_error = True
            self.internal_error_reason = "Could not make S3 client for S3 Monitor"
            return


1057    def check_for_file_change(self):
1058        try:
1059            version_check_response = self.s3_client.list_object_versions(
1060                Bucket=self.s3_bucket_name,
1061                Prefix=self.s3_file_name_only_path)
1062            if "Versions" in version_check_response:
1063                for version in version_check_response["Versions"]:
1064                    if (version["IsLatest"] == True):
1065                        if (version["VersionId"] != self.s3_current_object_version_id or
1066                            version["LastModified"] != self.s3_current_object_last_modified):
1067
1068                            self.print_message("[S3Monitor] Found new version of Canary/Application in S3!")
1069                            self.print_message("[S3Monitor] Changing running Canary/Application to new one...")
1070
1071                            # Will be checked by thread to trigger replacing the file
1072                            self.s3_file_needs_replacing = True
1073
1074                            self.s3_current_object_version_id = version["VersionId"]
1075                            self.s3_current_object_last_modified = version["LastModified"]
1076                            return
1077
1078        except Exception as e:
1079            self.print_message("[S3Monitor] ERROR - Could not check for new version of file in S3 due to exception!")
1080            self.print_message("[S3Monitor] Exception: " + str(e))
1081            self.had_internal_error = True
1082            self.internal_error_reason = "Could not check for S3 file due to exception in S3 client"
1083
1084
1085    def replace_current_file_for_new_file(self):
1086        try:
1087            self.print_message("[S3Monitor] Making directory...")
1088            if not os.path.exists("tmp"):
1089                os.makedirs("tmp")
1090        except Exception as e:
1091            self.print_message ("[S3Monitor] ERROR - could not make tmp directory to place S3 file into!")
1092            self.had_internal_error = True
1093            self.internal_error_reason = "Could not make TMP folder for S3 file download"
1094            return
1095
1096        # Download the file
1097        new_file_path = "tmp/new_file" + self.s3_file_name_only_extension
1098        try:
1099            self.print_message("[S3Monitor] Downloading file...")
1100            s3_resource = boto3.resource("s3")
1101            s3_resource.meta.client.download_file(self.s3_bucket_name, self.s3_file_name, new_file_path)
1102        except Exception as e:
1103            self.print_message("[S3Monitor] ERROR - could not download latest S3 file into TMP folder!")
1104            self.had_internal_error = True
1105            self.internal_error_reason = "Could not download latest S3 file into TMP folder"
1106            return
1107
1108        # Is it a zip file?
1109        if (self.s3_file_name_in_zip != None):
1110            self.print_message("[S3Monitor] New file is a zip file. Unzipping...")
1111            # Unzip it!
1112            with zipfile.ZipFile(new_file_path, 'r') as zip_file:
1113                zip_file.extractall("tmp/new_file_zip")
1114                new_file_path = "tmp/new_file_zip/" + self.s3_file_name_in_zip_only_path + self.s3_file_name_in_zip_only_extension
1115
1116        try:
1117            # is there a file already present there?
1118            if os.path.exists(self.canary_local_application_path) == True:
1119                os.remove(self.canary_local_application_path)
1120
1121            self.print_message("[S3Monitor] Moving file...")
1122            os.replace(new_file_path, self.canary_local_application_path)
1123            self.print_message("[S3Monitor] Getting execution rights...")
1124            os.system("chmod u+x " + self.canary_local_application_path)
1125
1126        except Exception as e:
1127            self.print_message("[S3Monitor] ERROR - could not move file into local application path due to exception!")
1128            self.print_message("[S3Monitor] Exception: " + str(e))
1129            self.had_internal_error = True
1130            self.internal_error_reason = "Could not move file into local application path"
1131            return
1132
1133        self.print_message("[S3Monitor] New file downloaded and moved into correct location!")
1134        self.s3_file_needs_replacing = False
1135
1136
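    # Hedged alternative to the os.system("chmod u+x ...") call above: the same effect can be achieved
    # without spawning a shell via os.chmod (this would also require "import stat" at the top of the file):
    #   os.chmod(self.canary_local_application_path,
    #            os.stat(self.canary_local_application_path).st_mode | stat.S_IXUSR)
    # Left as a comment so the existing behavior is unchanged.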
1137    def stop_monitoring(self):
1138        # Stub - included so S3Monitor exposes the same interface as the other monitor classes
1139        pass
1140
1141
1142    def start_monitoring(self):
1143        # Stub - included so S3Monitor exposes the same interface as the other monitor classes
1144        pass
1145
1146
1147    def restart_monitoring(self):
1148        # Stub - included so S3Monitor exposes the same interface as the other monitor classes
1149        pass
1150
1151
1152    def cleanup_monitor(self):
1153        # Stub - included so S3Monitor exposes the same interface as the other monitor classes
1154        pass
1155
1156    def monitor_loop_function(self, time_passed=30):
1157        self.check_for_file_change()
1158
1159    def print_message(self, message):
1160        if (self.data_snapshot != None):
1161            self.data_snapshot.print_message(message)
1162        else:
1163            print(message, flush=True)
1164
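# Illustrative usage sketch for S3Monitor (not executed here). The method and attribute names are the
# ones defined above; the "monitor" variable and the 30 second cadence are hypothetical, as the real
# polling loop lives in the wrapper scripts that import this file.
#
#   while True:
#       monitor.monitor_loop_function(time_passed=30)    # polls S3 via check_for_file_change()
#       if monitor.s3_file_needs_replacing:
#           monitor.replace_current_file_for_new_file()  # download the new build and swap it in
#       if monitor.had_internal_error:
#           break                                        # details are in monitor.internal_error_reason
#       time.sleep(30)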
1165# ================================================================================
1166
1167
1168# Cuts a ticket to SIM using a temporary Cloudwatch metric alarm that is quickly created, triggered, and destroyed.
1169# Can be called in any thread - creates its own Cloudwatch client and any data it needs is passed in.
1170#
1171# See (https://w.amazon.com/bin/view/CloudWatchAlarms/Internal/CloudWatchAlarmsSIMTicketing) for more details
1172# on how the alarm is sent using Cloudwatch.
1173def cut_ticket_using_cloudwatch(
1174    ticket_description="Description here!",
1175    ticket_reason="Reason here!",
1176    ticket_severity=5,
1177    ticket_category="AWS",
1178    ticket_type="SDKs and Tools",
1179    ticket_item="IoT SDK for CPP",
1180    ticket_group="AWS IoT Device SDK",
1181    ticket_allow_duplicates=False,
1182    git_repo_name="REPO NAME",
1183    git_hash="HASH",
1184    git_hash_as_namespace=False,
1185    git_fixed_namespace_text="mqtt5_canary",
1186    cloudwatch_region="us-east-1"):
1187
1188    git_metric_namespace = ""
1189    if (git_hash_as_namespace == False):
1190        git_metric_namespace = git_fixed_namespace_text
1191    else:
1192        git_namespace_prepend_text = git_repo_name + "-" + git_hash
1193        git_metric_namespace = git_namespace_prepend_text
1194
1195    try:
1196        cloudwatch_client = boto3.client('cloudwatch', cloudwatch_region)
1197        ticket_alarm_name = git_repo_name + "-" + git_hash + "-AUTO-TICKET"
1198    except Exception as e:
1199        print ("ERROR - could not create Cloudwatch client to make ticket metric alarm due to exception!")
1200        print ("Exception: " + str(e), flush=True)
1201        return
1202
1203    new_metric_dimensions = []
1204    if (git_hash_as_namespace == False):
1205        git_namespace_prepend_text = git_repo_name + "-" + git_hash
1206        new_metric_dimensions.append(
1207            {"Name": git_namespace_prepend_text, "Value": ticket_alarm_name})
1208    else:
1209        new_metric_dimensions.append(
1210            {"Name": "System_Metrics", "Value": ticket_alarm_name})
1211
1212    ticket_arn = f"arn:aws:cloudwatch::cwa-internal:ticket:{ticket_severity}:{ticket_category}:{ticket_type}:{ticket_item}:{ticket_group}:"
1213    if (ticket_allow_duplicates == True):
1214        # Use "DO-NOT-DEDUPE" so the same commit can be run again and still cut another ticket.
1215        ticket_arn += "DO-NOT-DEDUPE"
1216    # In the ticket ARN, all spaces need to be replaced with +
1217    ticket_arn = ticket_arn.replace(" ", "+")
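    # For example, with the default arguments above (and duplicates not allowed) this produces:
    #   arn:aws:cloudwatch::cwa-internal:ticket:5:AWS:SDKs+and+Tools:IoT+SDK+for+CPP:AWS+IoT+Device+SDK: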
1218
1219    ticket_alarm_description = f"AUTO CUT CANARY WRAPPER TICKET\n\nREASON: {ticket_reason}\n\nDESCRIPTION: {ticket_description}\n\n"
1220
1221    # Register a metric alarm so it can auto-cut a ticket for us
1222    try:
1223        cloudwatch_client.put_metric_alarm(
1224            AlarmName=ticket_alarm_name,
1225            AlarmDescription=ticket_alarm_description,
1226            MetricName=ticket_alarm_name,
1227            Namespace=git_metric_namespace,
1228            Statistic="Maximum",
1229            Dimensions=new_metric_dimensions,
1230            Period=60,  # Length (in seconds) of each evaluation period
1231            EvaluationPeriods=1,  # How many consecutive periods must breach the threshold?
1232            DatapointsToAlarm=1,  # How many data points within those periods must breach?
1233            Threshold=1,
1234            ComparisonOperator="GreaterThanOrEqualToThreshold",
1235            # The data above does not really matter - it just needs to be valid input data.
1236            # This is the part that tells Cloudwatch to cut the ticket
1237            AlarmActions=[ticket_arn]
1238        )
1239    except Exception as e:
1240        print ("ERROR - could not create ticket metric alarm due to exception!")
1241        print ("Exception: " + str(e), flush=True)
1242        return
1243
1244    # Trigger the alarm so it cuts the ticket
1245    try:
1246        cloudwatch_client.set_alarm_state(
1247            AlarmName=ticket_alarm_name,
1248            StateValue="ALARM",
1249            StateReason="AUTO TICKET CUT")
1250    except Exception as e:
1251        print ("ERROR - could not cut ticket due to exception!")
1252        print ("Exception: " + str(e), flush=True)
1253        return
1254
1255    print("Waiting for ticket metric to trigger...", flush=True)
1256    # Wait a little bit (2 seconds)...
1257    time.sleep(2)
1258
1259    # Remove the metric
1260    print("Removing ticket metric...", flush=True)
1261    cloudwatch_client.delete_alarms(AlarmNames=[ticket_alarm_name])
1262
1263    print ("Finished cutting ticket via Cloudwatch!", flush=True)
1264    return
1265
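# Hedged usage sketch for cut_ticket_using_cloudwatch (not executed here, since a real call creates and
# triggers a Cloudwatch alarm). Every keyword shown is a parameter of the function above; the concrete
# values are placeholders.
#
#   cut_ticket_using_cloudwatch(
#       ticket_description="Canary application crashed during soak test",
#       ticket_reason="Canary process exited unexpectedly",
#       ticket_severity=5,
#       git_repo_name="aws-crt-java",
#       git_hash="3c7ae9d",
#       git_hash_as_namespace=False,
#       cloudwatch_region="us-east-1")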
1266# A helper function that pulls the majority of the ticket information from the parsed argparse arguments.
1267def cut_ticket_using_cloudwatch_from_args(
1268    ticket_description="",
1269    ticket_reason="",
1270    ticket_severity=6,
1271    arguments=None):
1272
1273    # Do not cut a ticket for a severity of 6+
1274    if (ticket_severity >= 6):
1275        return
1276
1277    cut_ticket_using_cloudwatch(
1278        ticket_description=ticket_description,
1279        ticket_reason=ticket_reason,
1280        ticket_severity=ticket_severity,
1281        ticket_category=arguments.ticket_category,
1282        ticket_type=arguments.ticket_type,
1283        ticket_item=arguments.ticket_item,
1284        ticket_group=arguments.ticket_group,
1285        ticket_allow_duplicates=False,
1286        git_repo_name=arguments.git_repo_name,
1287        git_hash=arguments.git_hash,
1288        git_hash_as_namespace=arguments.git_hash_as_namespace)
1289
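# Hedged sketch of the argparse setup that cut_ticket_using_cloudwatch_from_args expects. The attribute
# names (ticket_category, ticket_type, ticket_item, ticket_group, git_repo_name, git_hash,
# git_hash_as_namespace) come from the call above; the flag strings and defaults are assumptions, since
# the real parser lives in the wrapper scripts that import this file.
#
#   import argparse
#   parser = argparse.ArgumentParser()
#   parser.add_argument("--ticket_category", default="AWS")
#   parser.add_argument("--ticket_type", default="SDKs and Tools")
#   parser.add_argument("--ticket_item", default="IoT SDK for CPP")
#   parser.add_argument("--ticket_group", default="AWS IoT Device SDK")
#   parser.add_argument("--git_repo_name", required=True)
#   parser.add_argument("--git_hash", required=True)
#   parser.add_argument("--git_hash_as_namespace", action="store_true")
#   arguments = parser.parse_args()
#   cut_ticket_using_cloudwatch_from_args(
#       ticket_description="Canary failure description here",
#       ticket_reason="Canary failure reason here",
#       ticket_severity=5,
#       arguments=arguments)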