// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.tpu.v1;

import "google/api/annotations.proto";
import "google/api/client.proto";
import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/longrunning/operations.proto";
import "google/protobuf/timestamp.proto";

option go_package = "cloud.google.com/go/tpu/apiv1/tpupb;tpupb";
option java_multiple_files = true;
option java_outer_classname = "CloudTpuProto";
option java_package = "com.google.cloud.tpu.v1";

// Manages TPU nodes and other resources.
//
// TPU API v1
service Tpu {
  option (google.api.default_host) = "tpu.googleapis.com";
  option (google.api.oauth_scopes) =
      "https://www.googleapis.com/auth/cloud-platform";

  // Lists nodes.
  rpc ListNodes(ListNodesRequest) returns (ListNodesResponse) {
    option (google.api.http) = {
      get: "/v1/{parent=projects/*/locations/*}/nodes"
    };
    option (google.api.method_signature) = "parent";
  }

  // Gets the details of a node.
  rpc GetNode(GetNodeRequest) returns (Node) {
    option (google.api.http) = {
      get: "/v1/{name=projects/*/locations/*/nodes/*}"
    };
    option (google.api.method_signature) = "name";
  }

  // Creates a node.
  //
  // Returns a long-running operation whose response is a
  // [Node][google.cloud.tpu.v1.Node].
  rpc CreateNode(CreateNodeRequest) returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v1/{parent=projects/*/locations/*}/nodes"
      body: "node"
    };
    option (google.api.method_signature) = "parent,node,node_id";
    option (google.longrunning.operation_info) = {
      response_type: "Node"
      metadata_type: "OperationMetadata"
    };
  }

  // Deletes a node.
  rpc DeleteNode(DeleteNodeRequest) returns (google.longrunning.Operation) {
    option (google.api.http) = {
      delete: "/v1/{name=projects/*/locations/*/nodes/*}"
    };
    option (google.api.method_signature) = "name";
    option (google.longrunning.operation_info) = {
      response_type: "Node"
      metadata_type: "OperationMetadata"
    };
  }

  // Reimages a node's OS.
  rpc ReimageNode(ReimageNodeRequest) returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v1/{name=projects/*/locations/*/nodes/*}:reimage"
      body: "*"
    };
    option (google.longrunning.operation_info) = {
      response_type: "Node"
      metadata_type: "OperationMetadata"
    };
  }

  // Stops a node. This operation is only available with single TPU nodes.
  rpc StopNode(StopNodeRequest) returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v1/{name=projects/*/locations/*/nodes/*}:stop"
      body: "*"
    };
    option (google.longrunning.operation_info) = {
      response_type: "Node"
      metadata_type: "OperationMetadata"
    };
  }

  // Starts a node.
  rpc StartNode(StartNodeRequest) returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v1/{name=projects/*/locations/*/nodes/*}:start"
      body: "*"
    };
    option (google.longrunning.operation_info) = {
      response_type: "Node"
      metadata_type: "OperationMetadata"
    };
  }

  // Lists TensorFlow versions supported by this API.
  rpc ListTensorFlowVersions(ListTensorFlowVersionsRequest)
      returns (ListTensorFlowVersionsResponse) {
    option (google.api.http) = {
      get: "/v1/{parent=projects/*/locations/*}/tensorflowVersions"
    };
    option (google.api.method_signature) = "parent";
  }

  // Gets a TensorFlow version.
  rpc GetTensorFlowVersion(GetTensorFlowVersionRequest)
      returns (TensorFlowVersion) {
    option (google.api.http) = {
      get: "/v1/{name=projects/*/locations/*/tensorflowVersions/*}"
    };
    option (google.api.method_signature) = "name";
  }

  // Lists accelerator types supported by this API.
  rpc ListAcceleratorTypes(ListAcceleratorTypesRequest)
      returns (ListAcceleratorTypesResponse) {
    option (google.api.http) = {
      get: "/v1/{parent=projects/*/locations/*}/acceleratorTypes"
    };
    option (google.api.method_signature) = "parent";
  }

  // Gets an accelerator type.
  rpc GetAcceleratorType(GetAcceleratorTypeRequest) returns (AcceleratorType) {
    option (google.api.http) = {
      get: "/v1/{name=projects/*/locations/*/acceleratorTypes/*}"
    };
    option (google.api.method_signature) = "name";
  }
}

// Sets the scheduling options for this node.
message SchedulingConfig {
  // Defines whether the node is preemptible.
  bool preemptible = 1;

  // Whether the node is created under a reservation.
  bool reserved = 2;
}

// A network endpoint over which a TPU worker can be reached.
message NetworkEndpoint {
  // The IP address of this network endpoint.
  string ip_address = 1;

  // The port of this network endpoint.
  int32 port = 2;
}

// A TPU instance.
message Node {
  option (google.api.resource) = {
    type: "tpu.googleapis.com/Node"
    pattern: "projects/{project}/locations/{location}/nodes/{node}"
  };

  // Represents the different states of a TPU node during its lifecycle.
  enum State {
    // TPU node state is not known/set.
    STATE_UNSPECIFIED = 0;

    // TPU node is being created.
    CREATING = 1;

    // TPU node has been created.
    READY = 2;

    // TPU node is restarting.
    RESTARTING = 3;

    // TPU node is undergoing reimaging.
    REIMAGING = 4;

    // TPU node is being deleted.
    DELETING = 5;

    // TPU node is being repaired and may be unusable. Details can be
    // found in the `health_description` field.
    REPAIRING = 6;

    // TPU node is stopped.
    STOPPED = 8;

    // TPU node is currently stopping.
    STOPPING = 9;

    // TPU node is currently starting.
    STARTING = 10;

    // TPU node has been preempted. Only applies to Preemptible TPU Nodes.
    PREEMPTED = 11;

    // TPU node has been terminated due to maintenance or has reached the end of
    // its life cycle (for preemptible nodes).
    TERMINATED = 12;

    // TPU node is currently hiding.
    HIDING = 13;

    // TPU node has been hidden.
    HIDDEN = 14;

    // TPU node is currently unhiding.
    UNHIDING = 15;
  }

  // Health defines the status of a TPU node as reported by
  // Health Monitor.
  enum Health {
    // Health status is unknown: not initialized or failed to retrieve.
    HEALTH_UNSPECIFIED = 0;

    // The resource is healthy.
    HEALTHY = 1;

    // The resource is unhealthy.
    DEPRECATED_UNHEALTHY = 2;

    // The resource is unresponsive.
    TIMEOUT = 3;

    // The in-guest ML stack is unhealthy.
    UNHEALTHY_TENSORFLOW = 4;

    // The node is under maintenance/priority boost caused rescheduling and
    // will resume running once rescheduled.
    UNHEALTHY_MAINTENANCE = 5;
  }

  // TPU API Version.
  enum ApiVersion {
    // API version is unknown.
    API_VERSION_UNSPECIFIED = 0;

    // TPU API V1Alpha1 version.
    V1_ALPHA1 = 1;

    // TPU API V1 version.
    V1 = 2;

    // TPU API V2Alpha1 version.
    V2_ALPHA1 = 3;
  }

  // Output only. Immutable. The name of the TPU.
  string name = 1 [
    (google.api.field_behavior) = IMMUTABLE,
    (google.api.field_behavior) = OUTPUT_ONLY
  ];

  // The user-supplied description of the TPU. Maximum of 512 characters.
  string description = 3;

  // Required. The type of hardware accelerators associated with this node.
  string accelerator_type = 5 [(google.api.field_behavior) = REQUIRED];

  // Output only. DEPRECATED! Use network_endpoints instead.
  // The network address for the TPU Node as visible to Compute Engine
  // instances.
  string ip_address = 8 [deprecated = true];

  // Output only. DEPRECATED! Use network_endpoints instead.
  // The network port for the TPU Node as visible to Compute Engine instances.
  string port = 14 [deprecated = true];

  // Output only. The current state for the TPU Node.
  State state = 9 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. If this field is populated, it contains a description of why
  // the TPU Node is unhealthy.
  string health_description = 10 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Required. The version of TensorFlow running in the Node.
  string tensorflow_version = 11 [(google.api.field_behavior) = REQUIRED];

  // The name of the network to peer the TPU node to. It must be a
  // preexisting Compute Engine network inside of the project on which this API
  // has been activated. If none is provided, "default" will be used.
  string network = 12;

  // The CIDR block that the TPU node will use when selecting an IP address.
  // This CIDR block must be a /29 block; the Compute Engine networks API
  // forbids a smaller block, and using a larger block would be wasteful (a
  // node can only consume one IP address). Errors will occur if the CIDR block
  // has already been used for a currently existing TPU node, the CIDR block
  // conflicts with any subnetworks in the user's provided network, or the
  // provided network is peered with another network that is using that CIDR
  // block.
  string cidr_block = 13;

  // Output only. The service account used to run the TensorFlow services
  // within the node. To share resources, including Google Cloud Storage data,
  // with the TensorFlow job running in the Node, this account must have
  // permissions to that data.
  string service_account = 15 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The time when the node was created.
  google.protobuf.Timestamp create_time = 16
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // The scheduling options for this node.
  SchedulingConfig scheduling_config = 17;

  // Output only. The network endpoints where TPU workers can be accessed and
  // sent work. It is recommended that TensorFlow clients of the node reach out
  // to the 0th entry in this list first.
  repeated NetworkEndpoint network_endpoints = 21
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // The health status of the TPU node.
  Health health = 22;

  // Resource labels to represent user-provided metadata.
  map<string, string> labels = 24;

  // Whether the VPC peering for the node is set up through Service Networking
  // API. The VPC Peering should be set up before provisioning the node.
  // If this field is set, cidr_block field should not be specified. If the
  // network that you want to peer the TPU Node to is a Shared VPC network,
  // the node must be created with this field enabled.
  bool use_service_networking = 27;

  // Output only. The API version that created this Node.
  ApiVersion api_version = 38 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The Symptoms that have occurred to the TPU Node.
  repeated Symptom symptoms = 39 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Request for [ListNodes][google.cloud.tpu.v1.Tpu.ListNodes].
message ListNodesRequest {
  // Required. The parent resource name.
  string parent = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = { child_type: "tpu.googleapis.com/Node" }
  ];

  // The maximum number of items to return.
  int32 page_size = 2;

  // The next_page_token value returned from a previous List request, if any.
  string page_token = 3;
}

// Response for [ListNodes][google.cloud.tpu.v1.Tpu.ListNodes].
message ListNodesResponse {
  // The listed nodes.
  repeated Node nodes = 1;

  // The next page token or empty if none.
  string next_page_token = 2;

  // Locations that could not be reached.
  repeated string unreachable = 3;
}

// Request for [GetNode][google.cloud.tpu.v1.Tpu.GetNode].
message GetNodeRequest {
  // Required. The resource name.
  string name = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = { type: "tpu.googleapis.com/Node" }
  ];
}

// Request for [CreateNode][google.cloud.tpu.v1.Tpu.CreateNode].
message CreateNodeRequest {
  // Required. The parent resource name.
  string parent = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = { child_type: "tpu.googleapis.com/Node" }
  ];

  // The unqualified resource name.
  string node_id = 2;

  // Required. The node.
  Node node = 3 [(google.api.field_behavior) = REQUIRED];
}

// Request for [DeleteNode][google.cloud.tpu.v1.Tpu.DeleteNode].
message DeleteNodeRequest {
  // Required. The resource name.
  string name = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = { type: "tpu.googleapis.com/Node" }
  ];
}

// Request for [ReimageNode][google.cloud.tpu.v1.Tpu.ReimageNode].
message ReimageNodeRequest {
  // The resource name.
  string name = 1;

  // The version for reimage to create.
  string tensorflow_version = 2;
}

// Request for [StopNode][google.cloud.tpu.v1.Tpu.StopNode].
message StopNodeRequest {
  // The resource name.
  string name = 1;
}

// Request for [StartNode][google.cloud.tpu.v1.Tpu.StartNode].
message StartNodeRequest {
  // The resource name.
  string name = 1;
}

// A TensorFlow version that a Node can be configured with.
message TensorFlowVersion {
  // NOTE(review): the pattern below spells the collection
  // "tensorFlowVersions", while the HTTP bindings on the Tpu service use
  // "tensorflowVersions". Confirm which casing the server returns before
  // changing either one.
  option (google.api.resource) = {
    type: "tpu.googleapis.com/TensorFlowVersion"
    pattern: "projects/{project}/locations/{location}/tensorFlowVersions/{tensor_flow_version}"
  };

  // The resource name.
  string name = 1;

  // The TensorFlow version.
  string version = 2;
}

// Request for
// [GetTensorFlowVersion][google.cloud.tpu.v1.Tpu.GetTensorFlowVersion].
message GetTensorFlowVersionRequest {
  // Required. The resource name.
  string name = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "tpu.googleapis.com/TensorFlowVersion"
    }
  ];
}

// Request for
// [ListTensorFlowVersions][google.cloud.tpu.v1.Tpu.ListTensorFlowVersions].
message ListTensorFlowVersionsRequest {
  // Required. The parent resource name.
  //
  // The reference uses `child_type` because this field names the collection
  // that owns TensorFlowVersion resources, not a TensorFlowVersion itself.
  string parent = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      child_type: "tpu.googleapis.com/TensorFlowVersion"
    }
  ];

  // The maximum number of items to return.
  int32 page_size = 2;

  // The next_page_token value returned from a previous List request, if any.
  string page_token = 3;

  // List filter.
  string filter = 5;

  // Sort results.
  string order_by = 6;
}

// Response for
// [ListTensorFlowVersions][google.cloud.tpu.v1.Tpu.ListTensorFlowVersions].
message ListTensorFlowVersionsResponse {
  // The listed TensorFlow versions.
  repeated TensorFlowVersion tensorflow_versions = 1;

  // The next page token or empty if none.
  string next_page_token = 2;

  // Locations that could not be reached.
  repeated string unreachable = 3;
}

// An accelerator type that a Node can be configured with.
message AcceleratorType {
  option (google.api.resource) = {
    type: "tpu.googleapis.com/AcceleratorType"
    pattern: "projects/{project}/locations/{location}/acceleratorTypes/{accelerator_type}"
  };

  // The resource name.
  string name = 1;

  // The accelerator type.
  string type = 2;
}

// Request for [GetAcceleratorType][google.cloud.tpu.v1.Tpu.GetAcceleratorType].
message GetAcceleratorTypeRequest {
  // Required. The resource name.
  string name = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "tpu.googleapis.com/AcceleratorType"
    }
  ];
}

// Request for
// [ListAcceleratorTypes][google.cloud.tpu.v1.Tpu.ListAcceleratorTypes].
message ListAcceleratorTypesRequest {
  // Required. The parent resource name.
  //
  // The reference uses `child_type` because this field names the collection
  // that owns AcceleratorType resources, not an AcceleratorType itself.
  string parent = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      child_type: "tpu.googleapis.com/AcceleratorType"
    }
  ];

  // The maximum number of items to return.
  int32 page_size = 2;

  // The next_page_token value returned from a previous List request, if any.
  string page_token = 3;

  // List filter.
  string filter = 5;

  // Sort results.
  string order_by = 6;
}

// Response for
// [ListAcceleratorTypes][google.cloud.tpu.v1.Tpu.ListAcceleratorTypes].
message ListAcceleratorTypesResponse {
  // The listed accelerator types.
  repeated AcceleratorType accelerator_types = 1;

  // The next page token or empty if none.
  string next_page_token = 2;

  // Locations that could not be reached.
  repeated string unreachable = 3;
}

// Metadata describing an [Operation][google.longrunning.Operation].
message OperationMetadata {
  // The time the operation was created.
  google.protobuf.Timestamp create_time = 1;

  // The time the operation finished running.
  google.protobuf.Timestamp end_time = 2;

  // Target of the operation - for example
  // projects/project-1/connectivityTests/test-1
  string target = 3;

  // Name of the verb executed by the operation.
  string verb = 4;

  // Human-readable status of the operation, if any.
  string status_detail = 5;

  // Specifies if cancellation was requested for the operation.
  bool cancel_requested = 6;

  // API version.
  string api_version = 7;
}

// A Symptom instance.
message Symptom {
  // SymptomType represents the different types of Symptoms that a TPU can
  // have.
  enum SymptomType {
    // Unspecified symptom.
    SYMPTOM_TYPE_UNSPECIFIED = 0;

    // TPU VM memory is low.
    LOW_MEMORY = 1;

    // TPU runtime is out of memory.
    OUT_OF_MEMORY = 2;

    // TPU runtime execution has timed out.
    EXECUTE_TIMED_OUT = 3;

    // TPU runtime fails to construct a mesh that recognizes each TPU device's
    // neighbors.
    MESH_BUILD_FAIL = 4;

    // TPU HBM is out of memory.
    HBM_OUT_OF_MEMORY = 5;

    // Abusive behaviors have been identified on the current project.
    PROJECT_ABUSE = 6;
  }

  // Timestamp when the Symptom is created.
  google.protobuf.Timestamp create_time = 1;

  // Type of the Symptom.
  SymptomType symptom_type = 2;

  // Detailed information of the current Symptom.
  string details = 3;

  // A string used to uniquely distinguish a worker within a TPU node.
  string worker_id = 4;
}