1// Copyright 2024 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package telemetry
6
7import (
8	"fmt"
9	"log"
10	"os"
11	"os/exec"
12	"path/filepath"
13	"sync"
14	"time"
15
16	"golang.org/x/sync/errgroup"
17	"golang.org/x/telemetry/counter"
18	"golang.org/x/telemetry/internal/crashmonitor"
19	"golang.org/x/telemetry/internal/telemetry"
20	"golang.org/x/telemetry/internal/upload"
21)
22
// Config holds the options that govern what [Start] does.
type Config struct {
	// ReportCrashes turns on crash reporting when true.
	//
	// Crash reporting relies on the [debug.SetCrashOutput] mechanism, which
	// is a process-wide resource, so the application must not make other
	// calls to that function. The option is non-functional unless the
	// program is built with go1.23+.
	ReportCrashes bool

	// Upload makes this program periodically upload approved counters from
	// the local telemetry database to telemetry.go.dev.
	//
	// Nothing happens unless the user has consented to data collection, for
	// example by running cmd/gotelemetry or by affirming the gopls dialog.
	//
	// (Only gopls is expected to use this feature. In the longer term, the
	// go command may become the sole program responsible for uploading.)
	Upload bool

	// TelemetryDir, when set, names an alternate telemetry directory to
	// write data to. When unset, the default directory is used.
	//
	// It is intended for isolating testing environments.
	TelemetryDir string

	// UploadStartTime, when set, replaces the time used as the upload start
	// time, that is, the time against which the upload logic decides whether
	// counter file data should be uploaded: only counter files that expired
	// before the start time are considered for upload.
	//
	// Setting it makes it possible to simulate a future upload that collects
	// recently modified counters.
	UploadStartTime time.Time

	// UploadURL, when set, replaces the URL that receives uploaded reports.
	// When unset, the URL defaults to https://telemetry.go.dev/upload.
	UploadURL string
}
63
64// Start initializes telemetry using the specified configuration.
65//
66// Start opens the local telemetry database so that counter increment
67// operations are durably recorded in the local file system.
68//
69// If [Config.Upload] is set, and the user has opted in to telemetry
70// uploading, this process may attempt to upload approved counters
71// to telemetry.go.dev.
72//
73// If [Config.ReportCrashes] is set, any fatal crash will be
74// recorded by incrementing a counter named for the stack of the
75// first running goroutine in the traceback.
76//
77// If either of these flags is set, Start re-executes the current
78// executable as a child process, in a special mode in which it
79// acts as a telemetry sidecar for the parent process (the application).
80// In that mode, the call to Start will never return, so Start must
81// be called immediately within main, even before such things as
82// inspecting the command line. The application should avoid expensive
83// steps or external side effects in init functions, as they will
84// be executed twice (parent and child).
85//
86// Start returns a StartResult, which may be awaited via [StartResult.Wait] to
87// wait for all work done by Start to complete.
88func Start(config Config) *StartResult {
89	switch v := os.Getenv(telemetryChildVar); v {
90	case "":
91		// The subprocess started by parent has GO_TELEMETRY_CHILD=1.
92		return parent(config)
93	case "1":
94		child(config) // child will exit the process when it's done.
95	case "2":
96		// Do nothing: this was executed directly or indirectly by a child.
97	default:
98		log.Fatalf("unexpected value for %q: %q", telemetryChildVar, v)
99	}
100
101	return &StartResult{}
102}
103
104// MaybeChild executes the telemetry child logic if the calling program is
105// the telemetry child process, and does nothing otherwise. It is meant to be
106// called as the first thing in a program that uses telemetry.Start but cannot
107// call telemetry.Start immediately when it starts.
108func MaybeChild(config Config) {
109	if v := os.Getenv(telemetryChildVar); v == "1" {
110		child(config) // child will exit the process when it's done.
111	}
112	// other values of the telemetryChildVar environment variable
113	// will be handled by telemetry.Start.
114}
115
// StartResult is the handle returned by a call to [Start]. Use
// [StartResult.Wait] to block until all work done on behalf of that call has
// completed.
type StartResult struct {
	wg sync.WaitGroup // the work that Wait blocks on
}

// Wait blocks until all work initiated by [Start] has completed. Calling it
// on a nil *StartResult returns immediately.
func (res *StartResult) Wait() {
	if res != nil {
		res.wg.Wait()
	}
}
130
// daemonize is a hook applied to the telemetry child's command before the
// command is started. The default implementation does nothing.
// NOTE(review): presumably replaced elsewhere on platforms that need extra
// process attributes for the child — confirm.
var daemonize = func(*exec.Cmd) {}
132
const (
	// telemetryChildVar names the environment variable that marks telemetry
	// child processes. The value "1" identifies the telemetry child itself.
	// The value "2" identifies a child of the child, for which no further
	// forking should occur.
	telemetryChildVar = "GO_TELEMETRY_CHILD"

	// telemetryUploadVar names the environment variable through which the
	// parent asks for an upload: the value "1" means that the parent has
	// acquired the upload token and the child should attempt an upload.
	telemetryUploadVar = "GO_TELEMETRY_CHILD_UPLOAD"
)
143
144func parent(config Config) *StartResult {
145	if config.TelemetryDir != "" {
146		telemetry.Default = telemetry.NewDir(config.TelemetryDir)
147	}
148	result := new(StartResult)
149
150	mode, _ := telemetry.Default.Mode()
151	if mode == "off" {
152		// Telemetry is turned off. Crash reporting doesn't work without telemetry
153		// at least set to "local". The upload process runs in both "on" and "local" modes.
154		// In local mode the upload process builds local reports but does not do the upload.
155		return result
156	}
157
158	counter.Open()
159
160	if _, err := os.Stat(telemetry.Default.LocalDir()); err != nil {
161		// There was a problem statting LocalDir, which is needed for both
162		// crash monitoring and counter uploading. Most likely, there was an
163		// error creating telemetry.LocalDir in the counter.Open call above.
164		// Don't start the child.
165		return result
166	}
167
168	childShouldUpload := config.Upload && acquireUploadToken()
169	reportCrashes := config.ReportCrashes && crashmonitor.Supported()
170
171	if reportCrashes || childShouldUpload {
172		startChild(reportCrashes, childShouldUpload, result)
173	}
174
175	return result
176}
177
178func startChild(reportCrashes, upload bool, result *StartResult) {
179	// This process is the application (parent).
180	// Fork+exec the telemetry child.
181	exe, err := os.Executable()
182	if err != nil {
183		// There was an error getting os.Executable. It's possible
184		// for this to happen on AIX if os.Args[0] is not an absolute
185		// path and we can't find os.Args[0] in PATH.
186		log.Printf("failed to start telemetry sidecar: os.Executable: %v", err)
187		return
188	}
189	cmd := exec.Command(exe, "** telemetry **") // this unused arg is just for ps(1)
190	daemonize(cmd)
191	cmd.Env = append(os.Environ(), telemetryChildVar+"=1")
192	if upload {
193		cmd.Env = append(cmd.Env, telemetryUploadVar+"=1")
194	}
195	cmd.Dir = telemetry.Default.LocalDir()
196
197	// The child process must write to a log file, not
198	// the stderr file it inherited from the parent, as
199	// the child may outlive the parent but should not prolong
200	// the life of any pipes created (by the grandparent)
201	// to gather the output of the parent.
202	//
203	// By default, we discard the child process's stderr,
204	// but in line with the uploader, log to a file in debug
205	// only if that directory was created by the user.
206	fd, err := os.Stat(telemetry.Default.DebugDir())
207	if err != nil {
208		if !os.IsNotExist(err) {
209			log.Printf("failed to stat debug directory: %v", err)
210			return
211		}
212	} else if fd.IsDir() {
213		// local/debug exists and is a directory. Set stderr to a log file path
214		// in local/debug.
215		childLogPath := filepath.Join(telemetry.Default.DebugDir(), "sidecar.log")
216		childLog, err := os.OpenFile(childLogPath, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0600)
217		if err != nil {
218			log.Printf("opening sidecar log file for child: %v", err)
219			return
220		}
221		defer childLog.Close()
222		cmd.Stderr = childLog
223	}
224
225	var crashOutputFile *os.File
226	if reportCrashes {
227		pipe, err := cmd.StdinPipe()
228		if err != nil {
229			log.Printf("StdinPipe: %v", err)
230			return
231		}
232
233		crashOutputFile = pipe.(*os.File) // (this conversion is safe)
234	}
235
236	if err := cmd.Start(); err != nil {
237		// The child couldn't be started. Log the failure.
238		log.Printf("can't start telemetry child process: %v", err)
239		return
240	}
241	if reportCrashes {
242		crashmonitor.Parent(crashOutputFile)
243	}
244	result.wg.Add(1)
245	go func() {
246		cmd.Wait() // Release resources if cmd happens not to outlive this process.
247		result.wg.Done()
248	}()
249}
250
251func child(config Config) {
252	log.SetPrefix(fmt.Sprintf("telemetry-sidecar (pid %v): ", os.Getpid()))
253
254	if config.TelemetryDir != "" {
255		telemetry.Default = telemetry.NewDir(config.TelemetryDir)
256	}
257
258	// golang/go#67211: be sure to set telemetryChildVar before running the
259	// child, because the child itself invokes the go command to download the
260	// upload config. If the telemetryChildVar variable is still set to "1",
261	// that delegated go command may think that it is itself a telemetry
262	// child.
263	//
264	// On the other hand, if telemetryChildVar were simply unset, then the
265	// delegated go commands would fork themselves recursively. Short-circuit
266	// this recursion.
267	os.Setenv(telemetryChildVar, "2")
268	upload := os.Getenv(telemetryUploadVar) == "1"
269
270	reportCrashes := config.ReportCrashes && crashmonitor.Supported()
271	uploadStartTime := config.UploadStartTime
272	uploadURL := config.UploadURL
273
274	// Start crashmonitoring and uploading depending on what's requested
275	// and wait for the longer running child to complete before exiting:
276	// if we collected a crash before the upload finished, wait for the
277	// upload to finish before exiting
278	var g errgroup.Group
279
280	if reportCrashes {
281		g.Go(func() error {
282			crashmonitor.Child()
283			return nil
284		})
285	}
286	if upload {
287		g.Go(func() error {
288			uploaderChild(uploadStartTime, uploadURL)
289			return nil
290		})
291	}
292	g.Wait()
293
294	os.Exit(0)
295}
296
297func uploaderChild(asof time.Time, uploadURL string) {
298	if err := upload.Run(upload.RunConfig{
299		UploadURL: uploadURL,
300		LogWriter: os.Stderr,
301		StartTime: asof,
302	}); err != nil {
303		log.Printf("upload failed: %v", err)
304	}
305}
306
307// acquireUploadToken acquires a token permitting the caller to upload.
308// To limit the frequency of uploads, only one token is issue per
309// machine per time period.
310// The boolean indicates whether the token was acquired.
311func acquireUploadToken() bool {
312	if telemetry.Default.LocalDir() == "" {
313		// The telemetry dir wasn't initialized properly, probably because
314		// os.UserConfigDir did not complete successfully. In that case
315		// there are no counters to upload, so we should just do nothing.
316		return false
317	}
318	tokenfile := filepath.Join(telemetry.Default.LocalDir(), "upload.token")
319	const period = 24 * time.Hour
320
321	// A process acquires a token by successfully creating a
322	// well-known file. If the file already exists and has an
323	// mtime age less then than the period, the process does
324	// not acquire the token. If the file is older than the
325	// period, the process is allowed to remove the file and
326	// try to re-create it.
327	fi, err := os.Stat(tokenfile)
328	if err == nil {
329		if time.Since(fi.ModTime()) < period {
330			return false
331		}
332		// There's a possible race here where two processes check the
333		// token file and see that it's older than the period, then the
334		// first one removes it and creates another, and then a second one
335		// removes the newly created file and creates yet another
336		// file. Then both processes would act as though they had the token.
337		// This is very rare, but it's also okay because we're only grabbing
338		// the token to do rate limiting, not for correctness.
339		_ = os.Remove(tokenfile)
340	} else if !os.IsNotExist(err) {
341		log.Printf("error acquiring upload taken: statting token file: %v", err)
342		return false
343	}
344
345	f, err := os.OpenFile(tokenfile, os.O_CREATE|os.O_EXCL, 0666)
346	if err != nil {
347		if os.IsExist(err) {
348			return false
349		}
350		log.Printf("error acquiring upload token: creating token file: %v", err)
351		return false
352	}
353	_ = f.Close()
354	return true
355}
356