// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.tpu.v1;

// google.api.http bindings.
import "google/api/annotations.proto";
// default_host, oauth_scopes and method_signature options.
import "google/api/client.proto";
// google.api.field_behavior annotations.
import "google/api/field_behavior.proto";
// google.api.resource and google.api.resource_reference annotations.
import "google/api/resource.proto";
// google.longrunning.Operation and the operation_info option.
import "google/longrunning/operations.proto";
// google.protobuf.Timestamp.
import "google/protobuf/timestamp.proto";

// Language-specific code generation options.
option go_package = "cloud.google.com/go/tpu/apiv1/tpupb;tpupb";
option java_multiple_files = true;
option java_outer_classname = "CloudTpuProto";
option java_package = "com.google.cloud.tpu.v1";

// Manages TPU nodes and lists the TensorFlow versions and accelerator types
// that nodes can be configured with.
//
// TPU API v1
service Tpu {
  option (google.api.default_host) = "tpu.googleapis.com";
  option (google.api.oauth_scopes) =
      "https://www.googleapis.com/auth/cloud-platform";

  // Lists the nodes under a parent location.
  rpc ListNodes(ListNodesRequest) returns (ListNodesResponse) {
    option (google.api.http) = {
      get: "/v1/{parent=projects/*/locations/*}/nodes"
    };
    option (google.api.method_signature) = "parent";
  }

  // Gets the details of a node.
  rpc GetNode(GetNodeRequest) returns (Node) {
    option (google.api.http) = {
      get: "/v1/{name=projects/*/locations/*/nodes/*}"
    };
    option (google.api.method_signature) = "name";
  }

  // Creates a node.
  //
  // Returns a long-running operation whose response is a
  // [Node][google.cloud.tpu.v1.Node] and whose metadata is an
  // [OperationMetadata][google.cloud.tpu.v1.OperationMetadata].
  rpc CreateNode(CreateNodeRequest) returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v1/{parent=projects/*/locations/*}/nodes"
      body: "node"
    };
    option (google.api.method_signature) = "parent,node,node_id";
    option (google.longrunning.operation_info) = {
      response_type: "Node"
      metadata_type: "OperationMetadata"
    };
  }

  // Deletes a node.
  //
  // Returns a long-running operation whose response is a
  // [Node][google.cloud.tpu.v1.Node] and whose metadata is an
  // [OperationMetadata][google.cloud.tpu.v1.OperationMetadata].
  rpc DeleteNode(DeleteNodeRequest) returns (google.longrunning.Operation) {
    option (google.api.http) = {
      delete: "/v1/{name=projects/*/locations/*/nodes/*}"
    };
    option (google.api.method_signature) = "name";
    option (google.longrunning.operation_info) = {
      response_type: "Node"
      metadata_type: "OperationMetadata"
    };
  }

  // Reimages a node's OS.
  //
  // Returns a long-running operation whose response is a
  // [Node][google.cloud.tpu.v1.Node] and whose metadata is an
  // [OperationMetadata][google.cloud.tpu.v1.OperationMetadata].
  rpc ReimageNode(ReimageNodeRequest) returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v1/{name=projects/*/locations/*/nodes/*}:reimage"
      body: "*"
    };
    option (google.longrunning.operation_info) = {
      response_type: "Node"
      metadata_type: "OperationMetadata"
    };
  }

  // Stops a node. This operation is only available with single TPU nodes.
  //
  // Returns a long-running operation whose response is a
  // [Node][google.cloud.tpu.v1.Node] and whose metadata is an
  // [OperationMetadata][google.cloud.tpu.v1.OperationMetadata].
  rpc StopNode(StopNodeRequest) returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v1/{name=projects/*/locations/*/nodes/*}:stop"
      body: "*"
    };
    option (google.longrunning.operation_info) = {
      response_type: "Node"
      metadata_type: "OperationMetadata"
    };
  }

  // Starts a node.
  //
  // Returns a long-running operation whose response is a
  // [Node][google.cloud.tpu.v1.Node] and whose metadata is an
  // [OperationMetadata][google.cloud.tpu.v1.OperationMetadata].
  rpc StartNode(StartNodeRequest) returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v1/{name=projects/*/locations/*/nodes/*}:start"
      body: "*"
    };
    option (google.longrunning.operation_info) = {
      response_type: "Node"
      metadata_type: "OperationMetadata"
    };
  }

  // Lists the TensorFlow versions supported by this API under a parent
  // location.
  rpc ListTensorFlowVersions(ListTensorFlowVersionsRequest)
      returns (ListTensorFlowVersionsResponse) {
    option (google.api.http) = {
      get: "/v1/{parent=projects/*/locations/*}/tensorflowVersions"
    };
    option (google.api.method_signature) = "parent";
  }

  // Gets a single TensorFlow version.
  rpc GetTensorFlowVersion(GetTensorFlowVersionRequest)
      returns (TensorFlowVersion) {
    option (google.api.http) = {
      get: "/v1/{name=projects/*/locations/*/tensorflowVersions/*}"
    };
    option (google.api.method_signature) = "name";
  }

  // Lists the accelerator types supported by this API under a parent
  // location.
  rpc ListAcceleratorTypes(ListAcceleratorTypesRequest)
      returns (ListAcceleratorTypesResponse) {
    option (google.api.http) = {
      get: "/v1/{parent=projects/*/locations/*}/acceleratorTypes"
    };
    option (google.api.method_signature) = "parent";
  }

  // Gets a single accelerator type.
  rpc GetAcceleratorType(GetAcceleratorTypeRequest) returns (AcceleratorType) {
    option (google.api.http) = {
      get: "/v1/{name=projects/*/locations/*/acceleratorTypes/*}"
    };
    option (google.api.method_signature) = "name";
  }
}

// The scheduling options for a node. Used as the type of
// [Node.scheduling_config][google.cloud.tpu.v1.Node.scheduling_config].
message SchedulingConfig {
  // Whether the node is preemptible.
  bool preemptible = 1;

  // Whether the node is created under a reservation.
  bool reserved = 2;
}

// A network endpoint (IP address and port) over which a TPU worker can be
// reached.
message NetworkEndpoint {
  // The IP address of this network endpoint.
  string ip_address = 1;

  // The port of this network endpoint.
  int32 port = 2;
}

// A TPU instance.
message Node {
  option (google.api.resource) = {
    type: "tpu.googleapis.com/Node"
    pattern: "projects/{project}/locations/{location}/nodes/{node}"
  };

  // Represents the different states of a TPU node during its lifecycle.
  //
  // NOTE(review): number 7 is not assigned to any value in this file. If it
  // belonged to a removed state, consider declaring `reserved 7;` so that it
  // cannot be reused.
  enum State {
    // TPU node state is not known/set.
    STATE_UNSPECIFIED = 0;

    // TPU node is being created.
    CREATING = 1;

    // TPU node has been created.
    READY = 2;

    // TPU node is restarting.
    RESTARTING = 3;

    // TPU node is undergoing reimaging.
    REIMAGING = 4;

    // TPU node is being deleted.
    DELETING = 5;

    // TPU node is being repaired and may be unusable. Details can be
    // found in the `health_description` field.
    REPAIRING = 6;

    // TPU node is stopped.
    STOPPED = 8;

    // TPU node is currently stopping.
    STOPPING = 9;

    // TPU node is currently starting.
    STARTING = 10;

    // TPU node has been preempted. Only applies to preemptible TPU nodes.
    PREEMPTED = 11;

    // TPU node has been terminated due to maintenance or has reached the end of
    // its life cycle (for preemptible nodes).
    TERMINATED = 12;

    // TPU node is currently hiding.
    HIDING = 13;

    // TPU node has been hidden.
    HIDDEN = 14;

    // TPU node is currently unhiding.
    UNHIDING = 15;
  }

  // The status of a TPU node as reported by Health Monitor.
  enum Health {
    // Health status is unknown: not initialized or failed to retrieve.
    HEALTH_UNSPECIFIED = 0;

    // The resource is healthy.
    HEALTHY = 1;

    // The resource is unhealthy.
    DEPRECATED_UNHEALTHY = 2;

    // The resource is unresponsive.
    TIMEOUT = 3;

    // The in-guest ML stack is unhealthy.
    UNHEALTHY_TENSORFLOW = 4;

    // The node is being rescheduled because of maintenance or a priority
    // boost, and will resume running once rescheduled.
    UNHEALTHY_MAINTENANCE = 5;
  }

  // TPU API version.
  enum ApiVersion {
    // API version is unknown.
    API_VERSION_UNSPECIFIED = 0;

    // TPU API V1Alpha1 version.
    V1_ALPHA1 = 1;

    // TPU API V1 version.
    V1 = 2;

    // TPU API V2Alpha1 version.
    V2_ALPHA1 = 3;
  }

  // Output only. Immutable. The resource name of the TPU node, in the form
  // `projects/{project}/locations/{location}/nodes/{node}`.
  string name = 1 [
    (google.api.field_behavior) = IMMUTABLE,
    (google.api.field_behavior) = OUTPUT_ONLY
  ];

  // The user-supplied description of the TPU. Maximum of 512 characters.
  string description = 3;

  // Required. The type of hardware accelerators associated with this node.
  string accelerator_type = 5 [(google.api.field_behavior) = REQUIRED];

  // Output only. Deprecated: use `network_endpoints` instead.
  // The network address for the TPU node as visible to Compute Engine
  // instances.
  string ip_address = 8 [deprecated = true];

  // Output only. Deprecated: use `network_endpoints` instead.
  // The network port for the TPU node as visible to Compute Engine instances.
  // Note that this field is a string, unlike
  // [NetworkEndpoint.port][google.cloud.tpu.v1.NetworkEndpoint.port].
  string port = 14 [deprecated = true];

  // Output only. The current state of the TPU node.
  State state = 9 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. If this field is populated, it contains a description of why
  // the TPU node is unhealthy.
  string health_description = 10 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Required. The version of TensorFlow running in the node.
  string tensorflow_version = 11 [(google.api.field_behavior) = REQUIRED];

  // The name of the network to peer the TPU node to. It must be a
  // preexisting Compute Engine network inside of the project on which this API
  // has been activated. If none is provided, "default" will be used.
  string network = 12;

  // The CIDR block that the TPU node will use when selecting an IP address.
  // This CIDR block must be a /29 block; the Compute Engine networks API
  // forbids a smaller block, and using a larger block would be wasteful (a
  // node can only consume one IP address). Errors will occur if the CIDR block
  // has already been used for a currently existing TPU node, the CIDR block
  // conflicts with any subnetworks in the user's provided network, or the
  // provided network is peered with another network that is using that CIDR
  // block.
  string cidr_block = 13;

  // Output only. The service account used to run the TensorFlow services
  // within the node. To share resources, including Google Cloud Storage data,
  // with the TensorFlow job running in the node, this account must have
  // permissions to that data.
  string service_account = 15 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The time when the node was created.
  google.protobuf.Timestamp create_time = 16
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // The scheduling options for this node.
  SchedulingConfig scheduling_config = 17;

  // Output only. The network endpoints where TPU workers can be accessed and
  // sent work. It is recommended that TensorFlow clients of the node reach out
  // to the first entry (index 0) in this list first.
  repeated NetworkEndpoint network_endpoints = 21
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // The health status of the TPU node.
  Health health = 22;

  // Resource labels to represent user-provided metadata.
  map<string, string> labels = 24;

  // Whether the VPC peering for the node is set up through the Service
  // Networking API. The VPC peering should be set up before provisioning the
  // node. If this field is set, the `cidr_block` field should not be
  // specified. If the network that you want to peer the TPU node to is a
  // Shared VPC network, the node must be created with this field enabled.
  bool use_service_networking = 27;

  // Output only. The API version that created this node.
  ApiVersion api_version = 38 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The symptoms that have occurred on the TPU node.
  repeated Symptom symptoms = 39 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Request for [ListNodes][google.cloud.tpu.v1.Tpu.ListNodes].
message ListNodesRequest {
  // Required. The parent resource name, in the form
  // `projects/{project}/locations/{location}`.
  string parent = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = { child_type: "tpu.googleapis.com/Node" }
  ];

  // The maximum number of items to return.
  int32 page_size = 2;

  // The `next_page_token` value returned from a previous List request, if
  // any.
  string page_token = 3;
}

// Response for [ListNodes][google.cloud.tpu.v1.Tpu.ListNodes].
message ListNodesResponse {
  // The listed nodes.
  repeated Node nodes = 1;

  // The token for the next page, or empty if there is none.
  string next_page_token = 2;

  // Locations that could not be reached.
  repeated string unreachable = 3;
}

// Request for [GetNode][google.cloud.tpu.v1.Tpu.GetNode].
message GetNodeRequest {
  // Required. The resource name of the node to get, in the form
  // `projects/{project}/locations/{location}/nodes/{node}`.
  string name = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = { type: "tpu.googleapis.com/Node" }
  ];
}

// Request for [CreateNode][google.cloud.tpu.v1.Tpu.CreateNode].
message CreateNodeRequest {
  // Required. The parent resource name, in the form
  // `projects/{project}/locations/{location}`.
  string parent = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = { child_type: "tpu.googleapis.com/Node" }
  ];

  // The unqualified resource name.
  string node_id = 2;

  // Required. The node to create. It is sent as the HTTP request body.
  Node node = 3 [(google.api.field_behavior) = REQUIRED];
}

// Request for [DeleteNode][google.cloud.tpu.v1.Tpu.DeleteNode].
message DeleteNodeRequest {
  // Required. The resource name of the node to delete, in the form
  // `projects/{project}/locations/{location}/nodes/{node}`.
  string name = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = { type: "tpu.googleapis.com/Node" }
  ];
}

// Request for [ReimageNode][google.cloud.tpu.v1.Tpu.ReimageNode].
message ReimageNodeRequest {
  // The resource name of the node to reimage, in the form
  // `projects/{project}/locations/{location}/nodes/{node}`.
  string name = 1 [
    (google.api.resource_reference) = { type: "tpu.googleapis.com/Node" }
  ];

  // The TensorFlow version to reimage the node with.
  string tensorflow_version = 2;
}

// Request for [StopNode][google.cloud.tpu.v1.Tpu.StopNode].
message StopNodeRequest {
  // The resource name of the node to stop, in the form
  // `projects/{project}/locations/{location}/nodes/{node}`.
  string name = 1 [
    (google.api.resource_reference) = { type: "tpu.googleapis.com/Node" }
  ];
}

// Request for [StartNode][google.cloud.tpu.v1.Tpu.StartNode].
message StartNodeRequest {
  // The resource name of the node to start, in the form
  // `projects/{project}/locations/{location}/nodes/{node}`.
  string name = 1 [
    (google.api.resource_reference) = { type: "tpu.googleapis.com/Node" }
  ];
}

// A TensorFlow version that a Node can be configured with.
message TensorFlowVersion {
  // NOTE(review): the pattern below spells the collection segment
  // `tensorFlowVersions`, while the HTTP bindings of ListTensorFlowVersions
  // and GetTensorFlowVersion use `tensorflowVersions`. Confirm which spelling
  // the service uses before relying on names built from this pattern.
  option (google.api.resource) = {
    type: "tpu.googleapis.com/TensorFlowVersion"
    pattern: "projects/{project}/locations/{location}/tensorFlowVersions/{tensor_flow_version}"
  };

  // The resource name.
  string name = 1;

  // The TensorFlow version.
  string version = 2;
}

// Request for
// [GetTensorFlowVersion][google.cloud.tpu.v1.Tpu.GetTensorFlowVersion].
message GetTensorFlowVersionRequest {
  // Required. The resource name of the TensorFlow version to get.
  string name = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "tpu.googleapis.com/TensorFlowVersion"
    }
  ];
}

// Request for
// [ListTensorFlowVersions][google.cloud.tpu.v1.Tpu.ListTensorFlowVersions].
message ListTensorFlowVersionsRequest {
  // Required. The parent resource name, in the form
  // `projects/{project}/locations/{location}`.
  //
  // The parent is the location that owns the TensorFlow versions, not a
  // TensorFlow version itself, so the reference is declared with `child_type`.
  string parent = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      child_type: "tpu.googleapis.com/TensorFlowVersion"
    }
  ];

  // The maximum number of items to return.
  int32 page_size = 2;

  // The `next_page_token` value returned from a previous List request, if
  // any.
  string page_token = 3;

  // List filter.
  string filter = 5;

  // Sort results.
  string order_by = 6;
}

// Response for
// [ListTensorFlowVersions][google.cloud.tpu.v1.Tpu.ListTensorFlowVersions].
message ListTensorFlowVersionsResponse {
  // The listed TensorFlow versions.
  repeated TensorFlowVersion tensorflow_versions = 1;

  // The token for the next page, or empty if there is none.
  string next_page_token = 2;

  // Locations that could not be reached.
  repeated string unreachable = 3;
}

// An accelerator type that a Node can be configured with.
message AcceleratorType {
  option (google.api.resource) = {
    type: "tpu.googleapis.com/AcceleratorType"
    pattern: "projects/{project}/locations/{location}/acceleratorTypes/{accelerator_type}"
  };

  // The resource name, in the form
  // `projects/{project}/locations/{location}/acceleratorTypes/{accelerator_type}`.
  string name = 1;

  // The accelerator type.
  string type = 2;
}

// Request for [GetAcceleratorType][google.cloud.tpu.v1.Tpu.GetAcceleratorType].
message GetAcceleratorTypeRequest {
  // Required. The resource name of the accelerator type to get, in the form
  // `projects/{project}/locations/{location}/acceleratorTypes/{accelerator_type}`.
  string name = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "tpu.googleapis.com/AcceleratorType"
    }
  ];
}

// Request for
// [ListAcceleratorTypes][google.cloud.tpu.v1.Tpu.ListAcceleratorTypes].
message ListAcceleratorTypesRequest {
  // Required. The parent resource name, in the form
  // `projects/{project}/locations/{location}`.
  //
  // The parent is the location that owns the accelerator types, not an
  // accelerator type itself, so the reference is declared with `child_type`.
  string parent = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      child_type: "tpu.googleapis.com/AcceleratorType"
    }
  ];

  // The maximum number of items to return.
  int32 page_size = 2;

  // The `next_page_token` value returned from a previous List request, if
  // any.
  string page_token = 3;

  // List filter.
  string filter = 5;

  // Sort results.
  string order_by = 6;
}

// Response for
// [ListAcceleratorTypes][google.cloud.tpu.v1.Tpu.ListAcceleratorTypes].
message ListAcceleratorTypesResponse {
  // The listed accelerator types.
  repeated AcceleratorType accelerator_types = 1;

  // The token for the next page, or empty if there is none.
  string next_page_token = 2;

  // Locations that could not be reached.
  repeated string unreachable = 3;
}

// Metadata describing an [Operation][google.longrunning.Operation]. This is
// the `metadata_type` declared by the long-running methods of the
// [Tpu][google.cloud.tpu.v1.Tpu] service.
message OperationMetadata {
  // The time the operation was created.
  google.protobuf.Timestamp create_time = 1;

  // The time the operation finished running.
  google.protobuf.Timestamp end_time = 2;

  // Target of the operation. The example originally documented here,
  // projects/project-1/connectivityTests/test-1, is not a resource of this
  // API.
  // NOTE(review): presumably the target is a TPU resource name such as
  // projects/{project}/locations/{location}/nodes/{node} - confirm.
  string target = 3;

  // Name of the verb executed by the operation.
  string verb = 4;

  // Human-readable status of the operation, if any.
  string status_detail = 5;

  // Specifies if cancellation was requested for the operation.
  bool cancel_requested = 6;

  // API version.
  string api_version = 7;
}

// A Symptom instance. Symptoms are reported on a node through
// [Node.symptoms][google.cloud.tpu.v1.Node.symptoms].
message Symptom {
  // The different types of symptoms that a TPU can exhibit.
  enum SymptomType {
    // Unspecified symptom.
    SYMPTOM_TYPE_UNSPECIFIED = 0;

    // TPU VM memory is low.
    LOW_MEMORY = 1;

    // TPU runtime is out of memory.
    OUT_OF_MEMORY = 2;

    // TPU runtime execution has timed out.
    EXECUTE_TIMED_OUT = 3;

    // TPU runtime fails to construct a mesh that recognizes each TPU device's
    // neighbors.
    MESH_BUILD_FAIL = 4;

    // TPU HBM is out of memory.
    HBM_OUT_OF_MEMORY = 5;

    // Abusive behaviors have been identified on the current project.
    PROJECT_ABUSE = 6;
  }

  // Timestamp when the symptom was created.
  google.protobuf.Timestamp create_time = 1;

  // Type of the symptom.
  SymptomType symptom_type = 2;

  // Detailed information about the current symptom.
  string details = 3;

  // A string used to uniquely distinguish a worker within a TPU node.
  string worker_id = 4;
}
