# Python wrapper script for collecting Canary metrics, setting up alarms, reporting metrics to Cloudwatch,
# checking the alarms to ensure everything is correct at the end of the run, and checking for new
# builds in S3, downloading them, and launching them if they exist (24/7 operation)
#
# Will only stop running if the Canary application itself has an issue - in which case the Canary application will
# need to be fixed and then the wrapper script restarted

# Needs to be installed prior to running
# Part of standard packages in Python 3.4+
import argparse
import time
# Dependencies in project folder
from CanaryWrapper_Classes import *
from CanaryWrapper_MetricFunctions import *

# TODO - Using subprocess may not work on Windows for starting/stopping the application thread.
# Canary will likely be running on Linux, so it's probably okay, but need to confirm/check at some point....
# ================================================================================
# Code for command line argument parsing

command_parser = argparse.ArgumentParser("CanaryWrapper_24_7")
command_parser.add_argument("--canary_executable", type=str, required=True,
    help="The path to the canary executable")
command_parser.add_argument("--canary_arguments", type=str, default="",
    help="The arguments to pass/launch the canary executable with")
command_parser.add_argument("--s3_bucket_name", type=str, default="canary-wrapper-folder",
    help="(OPTIONAL, default=canary-wrapper-folder) The name of the S3 bucket where success logs will be stored")
command_parser.add_argument("--s3_bucket_application", type=str, required=True,
    help="The S3 URL to monitor for changes MINUS the bucket name")
command_parser.add_argument("--s3_bucket_application_in_zip", type=str, required=False, default="",
    # NOTE: the nested quotes in the original help string ("(OPTIONAL, default="")...") were a syntax error
    help="(OPTIONAL, default='') The file path in the zip folder where the application is stored. "
         "Will be ignored if set to empty string")
command_parser.add_argument("--lambda_name", type=str, default="iot-send-email-lambda",
    help="(OPTIONAL, default='iot-send-email-lambda') The name of the Lambda used to send emails")
command_parser_arguments = command_parser.parse_args()

# ================================================================================
# Global variables that both threads use to communicate.
# NOTE - These should likely be replaced with futures or similar for better thread safety.
# However, these variables are only either read or written to from a single thread, no
# thread should read and write to these variables.

# The local file path (and extension) of the Canary application that the wrapper will manage
# (This will also be the filename and directory used when a new file is detected in S3)
# [THIS IS READ ONLY]
canary_local_application_path = command_parser_arguments.canary_executable
if canary_local_application_path == "":
    print("ERROR - required canary_executable is empty!")
    exit(1)  # cannot run without a canary executable
# This is the arguments passed to the local file path when starting
# [THIS IS READ ONLY]
canary_local_application_arguments = command_parser_arguments.canary_arguments
# The "Git Hash" to use for metrics and dimensions
# [THIS IS READ ONLY]
canary_local_git_hash_stub = "Canary"
# The "Git Repo" name to use for metrics and dimensions.
# Is hard-coded since this is a 24/7 canary that should only run for MQTT
# [THIS IS READ ONLY]
canary_local_git_repo_stub = "MQTT5_24_7"
# The Fixed Namespace name for the Canary
# [THIS IS READ ONLY]
canary_local_git_fixed_namespace = "MQTT5_24_7_Canary"
# The S3 bucket name to monitor for the application
# [THIS IS READ ONLY]
canary_s3_bucket_name = command_parser_arguments.s3_bucket_name
if canary_s3_bucket_name == "":
    canary_s3_bucket_name = "canary-wrapper-folder"
# The file in the S3 bucket to monitor (The application filepath and file. Example: "canary/canary_application.exe")
# [THIS IS READ ONLY]
canary_s3_bucket_application_path = command_parser_arguments.s3_bucket_application
if canary_s3_bucket_application_path == "":
    print("ERROR - required s3_bucket_application is empty!")
    exit(1)  # cannot run without a s3_bucket_application to monitor
# The location of the file in the S3 zip, if the S3 file being monitored is a zip
# (THIS IS READ ONLY)
canary_s3_bucket_application_path_zip = command_parser_arguments.s3_bucket_application_in_zip
if canary_s3_bucket_application_path_zip == "":
    canary_s3_bucket_application_path_zip = None
# The name of the email lambda. If an empty string is set, it defaults to 'iot-send-email-lambda'
if command_parser_arguments.lambda_name == "":
    command_parser_arguments.lambda_name = "iot-send-email-lambda"
# The region the canary is running in
# (THIS IS READ ONLY)
canary_region_stub = "us-east-1"

# How long (in seconds) to wait before gathering metrics and pushing them to Cloudwatch
canary_metrics_wait_time = 600  # 10 minutes
# How long (in seconds) to run the Application thread loop.
# Should be shorter or equal to the Canary Metrics time
canary_application_loop_wait_time = 300  # 5 minutes

# For testing - set both to 30 seconds
# canary_metrics_wait_time = 30
# canary_application_loop_wait_time = 30

# ================================================================================

# Make the snapshot class
data_snapshot = DataSnapshot(
    git_hash=canary_local_git_hash_stub,
    git_repo_name=canary_local_git_repo_stub,
    git_hash_as_namespace=False,
    datetime_string=None,
    git_fixed_namespace_text=canary_local_git_fixed_namespace,
    output_log_filepath="output.txt",
    output_to_console=True,
    cloudwatch_region=canary_region_stub,
    cloudwatch_make_dashboard=True,
    cloudwatch_teardown_alarms_on_complete=True,
    cloudwatch_teardown_dashboard_on_complete=False,
    s3_bucket_name=canary_s3_bucket_name,
    s3_bucket_upload_on_complete=True,
    lambda_name=command_parser_arguments.lambda_name,
    metric_frequency=canary_metrics_wait_time)

# Make sure nothing failed
if data_snapshot.abort_due_to_internal_error:
    print("INFO - Stopping application due to error caused by credentials")
    print("Please fix your credentials and then restart this application again")
    exit(0)

# Register metrics
data_snapshot.register_metric(
    new_metric_name="total_cpu_usage",
    new_metric_function=get_metric_total_cpu_usage,
    new_metric_unit="Percent",
    new_metric_alarm_threshold=70,
    new_metric_reports_to_skip=1,
    new_metric_alarm_severity=5,
    is_percent=True)
data_snapshot.register_metric(
    new_metric_name="total_memory_usage_value",
    new_metric_function=get_metric_total_memory_usage_value,
    new_metric_unit="Bytes")
data_snapshot.register_metric(
    new_metric_name="total_memory_usage_percent",
    new_metric_function=get_metric_total_memory_usage_percent,
    new_metric_unit="Percent",
    new_metric_alarm_threshold=70,
    new_metric_reports_to_skip=0,
    new_metric_alarm_severity=5,
    is_percent=True)

data_snapshot.register_dashboard_widget("Process CPU Usage - Percentage", ["total_cpu_usage"], 60)
data_snapshot.register_dashboard_widget("Process Memory Usage - Percentage", ["total_memory_usage_percent"], 60)

# Print diagnosis information
data_snapshot.output_diagnosis_information("24/7 Canary cannot show dependencies!")

# Make the S3 class
s3_monitor = S3Monitor(
    s3_bucket_name=canary_s3_bucket_name,
    s3_file_name=canary_s3_bucket_application_path,
    s3_file_name_in_zip=canary_s3_bucket_application_path_zip,
    canary_local_application_path=canary_local_application_path,
    data_snapshot=data_snapshot)

if s3_monitor.had_internal_error:
    print("INFO - Stopping application due to error caused by credentials")
    print("Please fix your credentials and then restart this application again")
    exit(0)

# Make the snapshot (metrics) monitor
snapshot_monitor = SnapshotMonitor(
    wrapper_data_snapshot=data_snapshot,
    wrapper_metrics_wait_time=canary_metrics_wait_time)

# Make sure nothing failed
if snapshot_monitor.had_internal_error:
    print("INFO - Stopping application due to error caused by credentials")
    print("Please fix your credentials and then restart this application again")
    exit(0)

# Make the application monitor
application_monitor = ApplicationMonitor(
    wrapper_application_path=canary_local_application_path,
    wrapper_application_arguments=canary_local_application_arguments,
    wrapper_application_restart_on_finish=True,
    data_snapshot=data_snapshot)

# Make sure nothing failed
if application_monitor.error_has_occurred:
    print("INFO - Stopping application due to error caused by credentials")
    print("Please fix your credentials and then restart this application again")
    exit(0)

# For tracking if we stopped due to a metric alarm
# NOTE(review): this flag is never written or read below - kept for backward compatibility,
# but it looks like dead state that could be removed once confirmed unused elsewhere.
stopped_due_to_metric_alarm = False


def execution_loop():
    """Run the main 24/7 monitoring loop until a monitor reports an error.

    Each iteration: polls S3 for a new build (swapping the running application
    if one is found), ticks the snapshot (metrics) and application monitors,
    and sleeps for canary_application_loop_wait_time seconds. Returns (breaks)
    only when the S3 monitor, snapshot monitor, or application monitor reports
    an internal error.
    """
    while True:
        s3_monitor.monitor_loop_function(time_passed=canary_application_loop_wait_time)

        # Is there an error?
        if s3_monitor.had_internal_error:
            print("[Debug] S3 monitor had an internal error!")
            break

        # Is there a new file?
        if s3_monitor.s3_file_needs_replacing:
            # Stop the application
            print("[Debug] Stopping application monitor...")
            application_monitor.stop_monitoring()
            print("[Debug] Getting S3 file...")
            s3_monitor.replace_current_file_for_new_file()
            # Start the application
            print("[Debug] Starting application monitor...")
            application_monitor.start_monitoring()
            # Allow the snapshot monitor to cut a ticket
            snapshot_monitor.can_cut_ticket = True

        snapshot_monitor.monitor_loop_function(
            time_passed=canary_application_loop_wait_time, psutil_process=application_monitor.application_process_psutil)
        application_monitor.monitor_loop_function(
            time_passed=canary_application_loop_wait_time)

        # Did a metric go into alarm?
        if snapshot_monitor.has_cut_ticket:
            # Do not allow it to cut anymore tickets until it gets a new build
            snapshot_monitor.can_cut_ticket = False

        # If an error has occurred or otherwise this thread needs to stop, then break the loop
        if application_monitor.error_has_occurred or snapshot_monitor.had_internal_error:
            if application_monitor.error_has_occurred:
                print("[Debug] Application monitor error occurred!")
            else:
                print("[Debug] Snapshot monitor internal error occurred!")
            break

        time.sleep(canary_application_loop_wait_time)


def application_thread():
    """Start all monitors, run the execution loop, and handle shutdown.

    On startup, sends a "Started" email. When execution_loop() returns, figures
    out which monitor stopped the run, cuts a CloudWatch ticket where appropriate,
    cleans up all monitors, and sends a "Finished"/"Had an error" email with a
    link to the uploaded log file. Always exits the process with code -1 so an
    external supervisor treats any stop as abnormal.
    """
    # Start the application going
    snapshot_monitor.start_monitoring()
    application_monitor.start_monitoring()
    # Allow the snapshot monitor to cut tickets
    snapshot_monitor.can_cut_ticket = True

    start_email_body = "MQTT5 24/7 Canary Wrapper has started. This will run and continue to test new MQTT5 application builds as"
    start_email_body += " they pass CodeBuild and are uploaded to S3."
    snapshot_monitor.send_email(email_body=start_email_body, email_subject_text_append="Started")

    # Start the execution loop
    execution_loop()

    # Make sure everything is stopped
    snapshot_monitor.stop_monitoring()
    application_monitor.stop_monitoring()

    # Track whether this counts as an error (and therefore we should cleanup accordingly) or not
    wrapper_error_occurred = False

    send_finished_email = True
    finished_email_body = "MQTT5 24/7 Canary Wrapper has stopped."
    finished_email_body += "\n\n"

    try:
        # Find out why we stopped
        # S3 Monitor
        if s3_monitor.had_internal_error:
            if s3_monitor.error_due_to_credentials == False:
                print("ERROR - S3 monitor stopped due to internal error!")
                # BUGFIX: the ticket description previously named the snapshot monitor
                # (copy-paste from the branch below) even though this is the S3 monitor path.
                cut_ticket_using_cloudwatch(
                    git_repo_name=canary_local_git_repo_stub,
                    git_hash=canary_local_git_hash_stub,
                    git_hash_as_namespace=False,
                    git_fixed_namespace_text=canary_local_git_fixed_namespace,
                    cloudwatch_region=canary_region_stub,
                    ticket_description="S3 monitor stopped due to internal error! Reason info: " + s3_monitor.internal_error_reason,
                    ticket_reason="S3 monitor stopped due to internal error",
                    ticket_allow_duplicates=True,
                    ticket_category="AWS",
                    ticket_type="SDKs and Tools",
                    ticket_item="IoT SDK for CPP",
                    ticket_group="AWS IoT Device SDK",
                    ticket_severity=4)
                finished_email_body += "Failure due to S3 monitor stopping due to an internal error."
                finished_email_body += " Reason given for error: " + s3_monitor.internal_error_reason
                wrapper_error_occurred = True
        # Snapshot Monitor
        elif snapshot_monitor.had_internal_error:
            if snapshot_monitor.has_cut_ticket:
                # We do not need to cut a ticket here - it's cut by the snapshot monitor!
                print("ERROR - Snapshot monitor stopped due to metric in alarm!")
                finished_email_body += "Failure due to required metrics being in alarm! A new ticket should have been cut!"
                finished_email_body += "\nMetrics in Alarm: " + str(snapshot_monitor.cloudwatch_current_alarms_triggered)
                finished_email_body += "\nNOTE - this shouldn't occur in the 24/7 Canary! If it does, then the wrapper needs adjusting."
                wrapper_error_occurred = True
            else:
                print("ERROR - Snapshot monitor stopped due to internal error!")
                cut_ticket_using_cloudwatch(
                    git_repo_name=canary_local_git_repo_stub,
                    git_hash=canary_local_git_hash_stub,
                    git_hash_as_namespace=False,
                    git_fixed_namespace_text=canary_local_git_fixed_namespace,
                    cloudwatch_region=canary_region_stub,
                    ticket_description="Snapshot monitor stopped due to internal error! Reason info: " + snapshot_monitor.internal_error_reason,
                    ticket_reason="Snapshot monitor stopped due to internal error",
                    ticket_allow_duplicates=True,
                    ticket_category="AWS",
                    ticket_type="SDKs and Tools",
                    ticket_item="IoT SDK for CPP",
                    ticket_group="AWS IoT Device SDK",
                    ticket_severity=4)
                wrapper_error_occurred = True
                finished_email_body += "Failure due to Snapshot monitor stopping due to an internal error."
                finished_email_body += " Reason given for error: " + snapshot_monitor.internal_error_reason
        # Application Monitor
        elif application_monitor.error_has_occurred:
            if application_monitor.error_due_to_credentials:
                print("INFO - Stopping application due to error caused by credentials")
                print("Please fix your credentials and then restart this application again")
                wrapper_error_occurred = True
                send_finished_email = False
            else:
                # Is the error something in the canary failed?
                if application_monitor.error_code != 0:
                    cut_ticket_using_cloudwatch(
                        git_repo_name=canary_local_git_repo_stub,
                        git_hash=canary_local_git_hash_stub,
                        git_hash_as_namespace=False,
                        git_fixed_namespace_text=canary_local_git_fixed_namespace,
                        cloudwatch_region=canary_region_stub,
                        ticket_description="The 24/7 Canary exited with a non-zero exit code! This likely means something in the canary failed.",
                        ticket_reason="The 24/7 Canary exited with a non-zero exit code",
                        ticket_allow_duplicates=True,
                        ticket_category="AWS",
                        ticket_type="SDKs and Tools",
                        ticket_item="IoT SDK for CPP",
                        ticket_group="AWS IoT Device SDK",
                        ticket_severity=3)
                    wrapper_error_occurred = True
                    finished_email_body += "Failure due to MQTT5 application exiting with a non-zero exit code!"
                    finished_email_body += " This means something in the Canary application itself failed"
                else:
                    cut_ticket_using_cloudwatch(
                        git_repo_name=canary_local_git_repo_stub,
                        git_hash=canary_local_git_hash_stub,
                        git_hash_as_namespace=False,
                        git_fixed_namespace_text=canary_local_git_fixed_namespace,
                        cloudwatch_region=canary_region_stub,
                        ticket_description="The 24/7 Canary exited with a zero exit code but did not restart!",
                        ticket_reason="The 24/7 Canary exited with a zero exit code but did not restart",
                        ticket_allow_duplicates=True,
                        ticket_category="AWS",
                        ticket_type="SDKs and Tools",
                        ticket_item="IoT SDK for CPP",
                        ticket_group="AWS IoT Device SDK",
                        ticket_severity=3)
                    wrapper_error_occurred = True
                    finished_email_body += "Failure due to MQTT5 application stopping and not automatically restarting!"
                    finished_email_body += " This shouldn't occur and means something is wrong with the Canary wrapper!"
        # Other
        else:
            print("ERROR - 24/7 Canary stopped due to unknown reason!")
            cut_ticket_using_cloudwatch(
                git_repo_name=canary_local_git_repo_stub,
                git_hash=canary_local_git_hash_stub,
                git_hash_as_namespace=False,
                git_fixed_namespace_text=canary_local_git_fixed_namespace,
                cloudwatch_region=canary_region_stub,
                ticket_description="The 24/7 Canary stopped for an unknown reason!",
                ticket_reason="The 24/7 Canary stopped for unknown reason",
                ticket_allow_duplicates=True,
                ticket_category="AWS",
                ticket_type="SDKs and Tools",
                ticket_item="IoT SDK for CPP",
                ticket_group="AWS IoT Device SDK",
                ticket_severity=3)
            wrapper_error_occurred = True
            finished_email_body += "Failure due to unknown reason! This shouldn't happen and means something has gone wrong!"
    except Exception as e:
        print("ERROR: Could not (possibly) cut ticket due to exception!")
        print("Exception: " + str(e), flush=True)

    # Clean everything up and stop
    snapshot_monitor.cleanup_monitor(error_occurred=wrapper_error_occurred)
    application_monitor.cleanup_monitor(error_occurred=wrapper_error_occurred)
    print("24/7 Canary finished!")

    finished_email_body += "\n\nYou can find the log file for this run at the following S3 location: "
    finished_email_body += "https://s3.console.aws.amazon.com/s3/object/"
    # Use the sanitized bucket name (empty-string fallback applied above) so the
    # link matches the bucket the log was actually uploaded to.
    finished_email_body += canary_s3_bucket_name
    finished_email_body += "?region=" + canary_region_stub
    finished_email_body += "&prefix=" + canary_local_git_repo_stub + "/"
    if wrapper_error_occurred:
        finished_email_body += "Failed_Logs/"
    finished_email_body += canary_local_git_hash_stub + ".log"
    # Send the finish email
    if send_finished_email:
        if wrapper_error_occurred:
            snapshot_monitor.send_email(email_body=finished_email_body, email_subject_text_append="Had an error")
        else:
            snapshot_monitor.send_email(email_body=finished_email_body, email_subject_text_append="Finished")

    # Deliberately non-zero: any stop of the 24/7 canary is abnormal, so signal
    # failure to whatever supervises this process.
    exit(-1)


# Start the application!
application_thread()