1// Copyright 2020 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5package main 6 7import ( 8 "context" 9 "encoding/json" 10 "errors" 11 "flag" 12 "fmt" 13 "net/http" 14 "strconv" 15 "time" 16 17 "cloud.google.com/go/storage" 18 "google.golang.org/api/option" 19 20 "go.skia.org/infra/go/auth" 21 "go.skia.org/infra/go/gcs" 22 "go.skia.org/infra/go/gcs/gcsclient" 23 "go.skia.org/infra/go/httputils" 24 "go.skia.org/infra/go/skerr" 25 "go.skia.org/infra/go/sklog" 26 "go.skia.org/infra/promk/go/pushgateway" 27 "go.skia.org/infra/task_driver/go/lib/auth_steps" 28 "go.skia.org/infra/task_driver/go/lib/checkout" 29 "go.skia.org/infra/task_driver/go/td" 30) 31 32const ( 33 g3CanaryBucketName = "g3-compile-tasks" 34 35 InfraFailureErrorMsg = "Your run failed due to unknown infrastructure failures. Ask the Infra Gardener to investigate (or file a bug at go/autoroll-bug)." 36 MissingApprovalErrorMsg = "To run the G3 tryjob, changes must be either owned and authored by Googlers or approved (Code-Review+1) by Googlers." 37 MergeConflictErrorMsg = "G3 tryjob failed because the change is causing a merge conflict when applying it to the Skia hash in G3." 38 39 PatchingInformation = "Tip: If needed, could try patching in the CL into a local G3 client with \"g4 patch\" and then hacking on it." 40 41 // Metric constants for pushgateway. 42 jobName = "g3-canary" 43 metricName = "g3_canary_infra_failure" 44 metricValue_NoInfraFailure = "0" 45 metricValue_InfraFailure = "1" 46) 47 48type CanaryStatusType string 49 50const ( 51 ExceptionStatus CanaryStatusType = "exception" 52 MissingApprovalStatus CanaryStatusType = "missing_approval" 53 MergeConflictStatus CanaryStatusType = "merge_conflict" 54 FailureStatus CanaryStatusType = "failure" 55 SuccessStatus CanaryStatusType = "success" 56) 57 58type G3CanaryTask struct { 59 Issue int `json:"issue"` 60 Patchset int `json:"patchset"` 61 Status CanaryStatusType `json:"status"` 62 Result string `json:"result"` 63 Error string `json:"error"` 64 CL int `json:"cl"` 65} 66 67func main() { 68 var ( 69 projectId = flag.String("project_id", "", "ID of the Google Cloud project.") 70 taskId = flag.String("task_id", "", "ID of this task.") 71 taskName = flag.String("task_name", "", "Name of the task.") 72 output = flag.String("o", "", "If provided, dump a JSON blob of step data to the given file. Prints to stdout if '-' is given.") 73 local = flag.Bool("local", true, "True if running locally (as opposed to on the bots)") 74 75 checkoutFlags = checkout.SetupFlags(nil) 76 ) 77 ctx := td.StartRun(projectId, taskId, taskName, output, local) 78 defer td.EndRun(ctx) 79 80 rs, err := checkout.GetRepoState(checkoutFlags) 81 if err != nil { 82 td.Fatal(ctx, skerr.Wrap(err)) 83 } 84 if rs.Issue == "" || rs.Patchset == "" { 85 td.Fatalf(ctx, "This task driver should be run only as a try bot") 86 } 87 88 // Create token source with scope for GCS access. 89 ts, err := auth_steps.Init(ctx, *local, auth.ScopeUserinfoEmail, auth.ScopeFullControl) 90 if err != nil { 91 td.Fatal(ctx, skerr.Wrap(err)) 92 } 93 client := httputils.DefaultClientConfig().WithTokenSource(ts).Client() 94 store, err := storage.NewClient(ctx, option.WithHTTPClient(client)) 95 if err != nil { 96 td.Fatalf(ctx, "Failed to create storage service client: %s", err) 97 } 98 gcsClient := gcsclient.New(store, g3CanaryBucketName) 99 100 taskFileName := fmt.Sprintf("%s-%s.json", rs.Issue, rs.Patchset) 101 taskStoragePath := fmt.Sprintf("gs://%s/%s", g3CanaryBucketName, taskFileName) 102 103 err = td.Do(ctx, td.Props("Trigger new task if not already running"), func(ctx context.Context) error { 104 if _, err := gcsClient.GetFileContents(ctx, taskFileName); err != nil { 105 if err == storage.ErrObjectNotExist { 106 // The task is not already running. Create a new file to trigger a new run. 107 if err := triggerCanaryRoll(ctx, rs.Issue, rs.Patchset, taskFileName, taskStoragePath, gcsClient); err != nil { 108 td.Fatal(ctx, fmt.Errorf("Could not trigger canary roll for %s/%s: %s", rs.Issue, rs.Patchset, err)) 109 } 110 } else { 111 return fmt.Errorf("Could not read %s: %s", taskStoragePath, err) 112 } 113 } else { 114 fmt.Printf("G3 canary task for %s/%s already exists\n", rs.Issue, rs.Patchset) 115 } 116 return nil 117 }) 118 if err != nil { 119 td.Fatal(ctx, skerr.Wrap(err)) 120 } 121 122 defer func() { 123 // Cleanup the storage file after the task finishes. 124 if err := gcsClient.DeleteFile(ctx, taskFileName); err != nil { 125 sklog.Errorf("Could not delete %s: %s", taskStoragePath, err) 126 } 127 }() 128 129 // Add documentation link for canary rolls. 130 td.StepText(ctx, "Canary roll doc", "https://goto.google.com/autoroller-canary-bots") 131 132 // Wait for the canary roll to finish. 133 if err := waitForCanaryRoll(ctx, taskFileName, taskStoragePath, client, gcsClient); err != nil { 134 td.Fatal(ctx, skerr.Wrap(err)) 135 } 136} 137 138func triggerCanaryRoll(ctx context.Context, issue, patchset, taskFileName, taskStoragePath string, gcsClient gcs.GCSClient) error { 139 ctx = td.StartStep(ctx, td.Props("Trigger canary roll")) 140 defer td.EndStep(ctx) 141 142 i, err := strconv.Atoi(issue) 143 if err != nil { 144 return fmt.Errorf("Could not convert %s to int: %s", issue, err) 145 } 146 p, err := strconv.Atoi(patchset) 147 if err != nil { 148 return fmt.Errorf("Could not convert %s to int: %s", patchset, err) 149 } 150 newTask := G3CanaryTask{ 151 Issue: i, 152 Patchset: p, 153 } 154 taskJson, err := json.Marshal(newTask) 155 if err != nil { 156 return fmt.Errorf("Could not encode task to JSON: %s", err) 157 } 158 if err := gcsClient.SetFileContents(ctx, taskFileName, gcs.FILE_WRITE_OPTS_TEXT, taskJson); err != nil { 159 return fmt.Errorf("Could not write task to %s: %s", taskStoragePath, err) 160 } 161 fmt.Printf("G3 canary task for %s/%s has been successfully added to %s\n", issue, patchset, taskStoragePath) 162 return nil 163} 164 165func waitForCanaryRoll(parentCtx context.Context, taskFileName, taskStoragePath string, httpClient *http.Client, gcsClient gcs.GCSClient) error { 166 ctx := td.StartStep(parentCtx, td.Props("Wait for canary roll")) 167 defer td.EndStep(ctx) 168 169 // For updating g3_canary_infra_failure metric after run completes. 170 pg := pushgateway.New(httpClient, jobName, pushgateway.DefaultPushgatewayURL) 171 172 // For writing to the step's log stream. 173 stdout := td.NewLogStream(ctx, "stdout", td.SeverityInfo) 174 // Lets add the roll link only once to step data. 175 addedRollLinkStepData := false 176 for { 177 // Read task status from storage. 178 contents, err := gcsClient.GetFileContents(ctx, taskFileName) 179 if err != nil { 180 return td.FailStep(ctx, fmt.Errorf("Could not read contents of %s: %s", taskStoragePath, err)) 181 } 182 var task G3CanaryTask 183 if err := json.Unmarshal(contents, &task); err != nil { 184 return td.FailStep(ctx, fmt.Errorf("Could not unmarshal %s: %s", taskStoragePath, err)) 185 } 186 187 var rollStatus string 188 if task.CL == 0 { 189 rollStatus = "Waiting for Canary roll to start" 190 } else { 191 clLink := fmt.Sprintf("http://cl/%d", task.CL) 192 if !addedRollLinkStepData { 193 // Add the roll link to both the current step and it's parent. 194 td.StepText(ctx, "Canary roll CL", clLink) 195 td.StepText(parentCtx, "Canary roll CL", clLink) 196 addedRollLinkStepData = true 197 } 198 rollStatus = fmt.Sprintf("Canary roll [ %s ] has status %s", clLink, task.Result) 199 } 200 if _, err := stdout.Write([]byte(rollStatus)); err != nil { 201 return td.FailStep(ctx, fmt.Errorf("Could not write to stdout: %s", err)) 202 } 203 204 switch task.Status { 205 case "": 206 // Still waiting for the task. 207 time.Sleep(30 * time.Second) 208 continue 209 case ExceptionStatus: 210 if task.Error != "" { 211 sklog.Errorf("Run failed with: %s", task.Error) 212 } 213 _ = pg.Push(ctx, metricName, metricValue_InfraFailure) 214 // Use a general purpose error message. 215 return td.FailStep(ctx, errors.New(InfraFailureErrorMsg)) 216 case MissingApprovalStatus: 217 _ = pg.Push(ctx, metricName, metricValue_NoInfraFailure) 218 return td.FailStep(ctx, errors.New(MissingApprovalErrorMsg)) 219 case MergeConflictStatus: 220 _ = pg.Push(ctx, metricName, metricValue_NoInfraFailure) 221 return td.FailStep(ctx, errors.New(MergeConflictErrorMsg)) 222 case FailureStatus: 223 _ = pg.Push(ctx, metricName, metricValue_NoInfraFailure) 224 return td.FailStep(ctx, fmt.Errorf("Run failed G3 TAP.\n%s", PatchingInformation)) 225 case SuccessStatus: 226 // Run passed G3 TAP. 227 _ = pg.Push(ctx, metricName, metricValue_NoInfraFailure) 228 return nil 229 } 230 } 231} 232