xref: /aosp_15_r20/external/skia/infra/bots/task_drivers/g3_canary/g3_canary.go (revision c8dee2aa9b3f27cf6c858bd81872bdeb2c07ed17)
1// Copyright 2020 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5package main
6
7import (
8	"context"
9	"encoding/json"
10	"errors"
11	"flag"
12	"fmt"
13	"net/http"
14	"strconv"
15	"time"
16
17	"cloud.google.com/go/storage"
18	"google.golang.org/api/option"
19
20	"go.skia.org/infra/go/auth"
21	"go.skia.org/infra/go/gcs"
22	"go.skia.org/infra/go/gcs/gcsclient"
23	"go.skia.org/infra/go/httputils"
24	"go.skia.org/infra/go/skerr"
25	"go.skia.org/infra/go/sklog"
26	"go.skia.org/infra/promk/go/pushgateway"
27	"go.skia.org/infra/task_driver/go/lib/auth_steps"
28	"go.skia.org/infra/task_driver/go/lib/checkout"
29	"go.skia.org/infra/task_driver/go/td"
30)
31
const (
	// g3CanaryBucketName is the GCS bucket that holds one JSON task file per
	// issue/patchset. The driver creates, polls and finally deletes that file.
	g3CanaryBucketName = "g3-compile-tasks"

	// User-facing error messages reported when the canary task ends with the
	// exception, missing_approval and merge_conflict statuses respectively.
	InfraFailureErrorMsg    = "Your run failed due to unknown infrastructure failures. Ask the Infra Gardener to investigate (or file a bug at go/autoroll-bug)."
	MissingApprovalErrorMsg = "To run the G3 tryjob, changes must be either owned and authored by Googlers or approved (Code-Review+1) by Googlers."
	MergeConflictErrorMsg   = "G3 tryjob failed because the change is causing a merge conflict when applying it to the Skia hash in G3."

	// PatchingInformation is appended to the error reported when the task ends
	// with the failure status.
	PatchingInformation = "Tip: If needed, could try patching in the CL into a local G3 client with \"g4 patch\" and then hacking on it."

	// Metric constants for pushgateway. The metric is pushed once per finished
	// run: metricValue_InfraFailure for the exception status, and
	// metricValue_NoInfraFailure for every other terminal status.
	jobName                    = "g3-canary"
	metricName                 = "g3_canary_infra_failure"
	metricValue_NoInfraFailure = "0"
	metricValue_InfraFailure   = "1"
)
47
// CanaryStatusType is the value of the "status" field in a canary task file.
// The empty string is treated by this driver as "task has not finished yet".
type CanaryStatusType string

// Terminal statuses a canary task can report. Only SuccessStatus lets the
// driver succeed; ExceptionStatus is additionally counted as an infra failure
// in the pushgateway metric.
const (
	ExceptionStatus       CanaryStatusType = "exception"
	MissingApprovalStatus CanaryStatusType = "missing_approval"
	MergeConflictStatus   CanaryStatusType = "merge_conflict"
	FailureStatus         CanaryStatusType = "failure"
	SuccessStatus         CanaryStatusType = "success"
)
57
58type G3CanaryTask struct {
59	Issue    int              `json:"issue"`
60	Patchset int              `json:"patchset"`
61	Status   CanaryStatusType `json:"status"`
62	Result   string           `json:"result"`
63	Error    string           `json:"error"`
64	CL       int              `json:"cl"`
65}
66
67func main() {
68	var (
69		projectId = flag.String("project_id", "", "ID of the Google Cloud project.")
70		taskId    = flag.String("task_id", "", "ID of this task.")
71		taskName  = flag.String("task_name", "", "Name of the task.")
72		output    = flag.String("o", "", "If provided, dump a JSON blob of step data to the given file. Prints to stdout if '-' is given.")
73		local     = flag.Bool("local", true, "True if running locally (as opposed to on the bots)")
74
75		checkoutFlags = checkout.SetupFlags(nil)
76	)
77	ctx := td.StartRun(projectId, taskId, taskName, output, local)
78	defer td.EndRun(ctx)
79
80	rs, err := checkout.GetRepoState(checkoutFlags)
81	if err != nil {
82		td.Fatal(ctx, skerr.Wrap(err))
83	}
84	if rs.Issue == "" || rs.Patchset == "" {
85		td.Fatalf(ctx, "This task driver should be run only as a try bot")
86	}
87
88	// Create token source with scope for GCS access.
89	ts, err := auth_steps.Init(ctx, *local, auth.ScopeUserinfoEmail, auth.ScopeFullControl)
90	if err != nil {
91		td.Fatal(ctx, skerr.Wrap(err))
92	}
93	client := httputils.DefaultClientConfig().WithTokenSource(ts).Client()
94	store, err := storage.NewClient(ctx, option.WithHTTPClient(client))
95	if err != nil {
96		td.Fatalf(ctx, "Failed to create storage service client: %s", err)
97	}
98	gcsClient := gcsclient.New(store, g3CanaryBucketName)
99
100	taskFileName := fmt.Sprintf("%s-%s.json", rs.Issue, rs.Patchset)
101	taskStoragePath := fmt.Sprintf("gs://%s/%s", g3CanaryBucketName, taskFileName)
102
103	err = td.Do(ctx, td.Props("Trigger new task if not already running"), func(ctx context.Context) error {
104		if _, err := gcsClient.GetFileContents(ctx, taskFileName); err != nil {
105			if err == storage.ErrObjectNotExist {
106				// The task is not already running. Create a new file to trigger a new run.
107				if err := triggerCanaryRoll(ctx, rs.Issue, rs.Patchset, taskFileName, taskStoragePath, gcsClient); err != nil {
108					td.Fatal(ctx, fmt.Errorf("Could not trigger canary roll for %s/%s: %s", rs.Issue, rs.Patchset, err))
109				}
110			} else {
111				return fmt.Errorf("Could not read %s: %s", taskStoragePath, err)
112			}
113		} else {
114			fmt.Printf("G3 canary task for %s/%s already exists\n", rs.Issue, rs.Patchset)
115		}
116		return nil
117	})
118	if err != nil {
119		td.Fatal(ctx, skerr.Wrap(err))
120	}
121
122	defer func() {
123		// Cleanup the storage file after the task finishes.
124		if err := gcsClient.DeleteFile(ctx, taskFileName); err != nil {
125			sklog.Errorf("Could not delete %s: %s", taskStoragePath, err)
126		}
127	}()
128
129	// Add documentation link for canary rolls.
130	td.StepText(ctx, "Canary roll doc", "https://goto.google.com/autoroller-canary-bots")
131
132	// Wait for the canary roll to finish.
133	if err := waitForCanaryRoll(ctx, taskFileName, taskStoragePath, client, gcsClient); err != nil {
134		td.Fatal(ctx, skerr.Wrap(err))
135	}
136}
137
138func triggerCanaryRoll(ctx context.Context, issue, patchset, taskFileName, taskStoragePath string, gcsClient gcs.GCSClient) error {
139	ctx = td.StartStep(ctx, td.Props("Trigger canary roll"))
140	defer td.EndStep(ctx)
141
142	i, err := strconv.Atoi(issue)
143	if err != nil {
144		return fmt.Errorf("Could not convert %s to int: %s", issue, err)
145	}
146	p, err := strconv.Atoi(patchset)
147	if err != nil {
148		return fmt.Errorf("Could not convert %s to int: %s", patchset, err)
149	}
150	newTask := G3CanaryTask{
151		Issue:    i,
152		Patchset: p,
153	}
154	taskJson, err := json.Marshal(newTask)
155	if err != nil {
156		return fmt.Errorf("Could not encode task to JSON: %s", err)
157	}
158	if err := gcsClient.SetFileContents(ctx, taskFileName, gcs.FILE_WRITE_OPTS_TEXT, taskJson); err != nil {
159		return fmt.Errorf("Could not write task to %s: %s", taskStoragePath, err)
160	}
161	fmt.Printf("G3 canary task for %s/%s has been successfully added to %s\n", issue, patchset, taskStoragePath)
162	return nil
163}
164
165func waitForCanaryRoll(parentCtx context.Context, taskFileName, taskStoragePath string, httpClient *http.Client, gcsClient gcs.GCSClient) error {
166	ctx := td.StartStep(parentCtx, td.Props("Wait for canary roll"))
167	defer td.EndStep(ctx)
168
169	// For updating g3_canary_infra_failure metric after run completes.
170	pg := pushgateway.New(httpClient, jobName, pushgateway.DefaultPushgatewayURL)
171
172	// For writing to the step's log stream.
173	stdout := td.NewLogStream(ctx, "stdout", td.SeverityInfo)
174	// Lets add the roll link only once to step data.
175	addedRollLinkStepData := false
176	for {
177		// Read task status from storage.
178		contents, err := gcsClient.GetFileContents(ctx, taskFileName)
179		if err != nil {
180			return td.FailStep(ctx, fmt.Errorf("Could not read contents of %s: %s", taskStoragePath, err))
181		}
182		var task G3CanaryTask
183		if err := json.Unmarshal(contents, &task); err != nil {
184			return td.FailStep(ctx, fmt.Errorf("Could not unmarshal %s: %s", taskStoragePath, err))
185		}
186
187		var rollStatus string
188		if task.CL == 0 {
189			rollStatus = "Waiting for Canary roll to start"
190		} else {
191			clLink := fmt.Sprintf("http://cl/%d", task.CL)
192			if !addedRollLinkStepData {
193				// Add the roll link to both the current step and it's parent.
194				td.StepText(ctx, "Canary roll CL", clLink)
195				td.StepText(parentCtx, "Canary roll CL", clLink)
196				addedRollLinkStepData = true
197			}
198			rollStatus = fmt.Sprintf("Canary roll [ %s ] has status %s", clLink, task.Result)
199		}
200		if _, err := stdout.Write([]byte(rollStatus)); err != nil {
201			return td.FailStep(ctx, fmt.Errorf("Could not write to stdout: %s", err))
202		}
203
204		switch task.Status {
205		case "":
206			// Still waiting for the task.
207			time.Sleep(30 * time.Second)
208			continue
209		case ExceptionStatus:
210			if task.Error != "" {
211				sklog.Errorf("Run failed with: %s", task.Error)
212			}
213			_ = pg.Push(ctx, metricName, metricValue_InfraFailure)
214			// Use a general purpose error message.
215			return td.FailStep(ctx, errors.New(InfraFailureErrorMsg))
216		case MissingApprovalStatus:
217			_ = pg.Push(ctx, metricName, metricValue_NoInfraFailure)
218			return td.FailStep(ctx, errors.New(MissingApprovalErrorMsg))
219		case MergeConflictStatus:
220			_ = pg.Push(ctx, metricName, metricValue_NoInfraFailure)
221			return td.FailStep(ctx, errors.New(MergeConflictErrorMsg))
222		case FailureStatus:
223			_ = pg.Push(ctx, metricName, metricValue_NoInfraFailure)
224			return td.FailStep(ctx, fmt.Errorf("Run failed G3 TAP.\n%s", PatchingInformation))
225		case SuccessStatus:
226			// Run passed G3 TAP.
227			_ = pg.Push(ctx, metricName, metricValue_NoInfraFailure)
228			return nil
229		}
230	}
231}
232