// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.dataproc.v1;

import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/protobuf/duration.proto";
import "google/protobuf/timestamp.proto";

option go_package = "cloud.google.com/go/dataproc/v2/apiv1/dataprocpb;dataprocpb";
option java_multiple_files = true;
option java_outer_classname = "SharedProto";
option java_package = "com.google.cloud.dataproc.v1";
option (google.api.resource_definition) = {
  type: "container.googleapis.com/Cluster"
  pattern: "projects/{project}/locations/{location}/clusters/{cluster}"
};
option (google.api.resource_definition) = {
  type: "metastore.googleapis.com/Service"
  pattern: "projects/{project}/locations/{location}/services/{service}"
};

// Runtime configuration for a workload.
message RuntimeConfig {
  // Optional. Version of the batch runtime.
  string version = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Custom container image for the job runtime environment.
  // If not specified, a default container image will be used.
  string container_image = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. A mapping of property names to values, which are used to
  // configure workload execution.
  map<string, string> properties = 3 [(google.api.field_behavior) = OPTIONAL];
}
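
// Illustrative sketch (not part of the API definition): a RuntimeConfig
// could be written in textproto form as below. The runtime version and the
// Spark property names are assumed values, shown only to indicate how the
// `properties` map is populated:
//
//   version: "2.1"
//   properties { key: "spark.executor.cores" value: "4" }
//   properties { key: "spark.executor.memory" value: "4g" }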

// Environment configuration for a workload.
message EnvironmentConfig {
  // Optional. Execution configuration for a workload.
  ExecutionConfig execution_config = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Peripherals configuration that workload has access to.
  PeripheralsConfig peripherals_config = 2
      [(google.api.field_behavior) = OPTIONAL];
}

// Execution configuration for a workload.
message ExecutionConfig {
  // Optional. Service account that is used to execute the workload.
  string service_account = 2 [(google.api.field_behavior) = OPTIONAL];

  // Network configuration for workload execution.
  oneof network {
    // Optional. Network URI to connect workload to.
    string network_uri = 4 [(google.api.field_behavior) = OPTIONAL];

    // Optional. Subnetwork URI to connect workload to.
    string subnetwork_uri = 5 [(google.api.field_behavior) = OPTIONAL];
  }

  // Optional. Tags used for network traffic control.
  repeated string network_tags = 6 [(google.api.field_behavior) = OPTIONAL];

  // Optional. The Cloud KMS key to use for encryption.
  string kms_key = 7 [(google.api.field_behavior) = OPTIONAL];

  // Optional. The duration after which the workload will be terminated.
  // When the workload passes this ttl, it will be unconditionally killed
  // without waiting for ongoing work to finish.
  // Minimum value is 10 minutes; maximum value is 14 days (see JSON
  // representation of
  // [Duration](https://developers.google.com/protocol-buffers/docs/proto3#json)).
  // If both ttl and idle_ttl are specified, the conditions are treated as
  // an OR: the workload will be terminated when it has been idle for idle_ttl
  // or when the ttl has passed, whichever comes first.
  // If ttl is not specified for a session, it defaults to 24 hours.
  google.protobuf.Duration ttl = 9 [(google.api.field_behavior) = OPTIONAL];

  // Optional. A Cloud Storage bucket used to stage workload dependencies,
  // config files, and store workload output and other ephemeral data, such as
  // Spark history files. If you do not specify a staging bucket, Cloud
  // Dataproc will determine a Cloud Storage location according to the region
  // where your workload is running, and then create and manage project-level,
  // per-location staging and temporary buckets.
  // **This field requires a Cloud Storage bucket name, not a `gs://...` URI
  // to a Cloud Storage bucket.**
  string staging_bucket = 10 [(google.api.field_behavior) = OPTIONAL];
}
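
// Illustrative sketch (not part of the API definition): an ExecutionConfig
// could be written in textproto form as below. The service account,
// subnetwork, and bucket names are assumed values; the 4-hour ttl is written
// as a Duration in seconds, and staging_bucket is a bucket name rather than
// a `gs://` URI:
//
//   service_account: "workload-sa@my-project.iam.gserviceaccount.com"
//   subnetwork_uri: "projects/my-project/regions/us-central1/subnetworks/default"
//   ttl { seconds: 14400 }
//   staging_bucket: "my-staging-bucket"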

// Spark History Server configuration for the workload.
message SparkHistoryServerConfig {
  // Optional. Resource name of an existing Dataproc Cluster to act as a Spark
  // History Server for the workload.
  //
  // Example:
  //
  // * `projects/[project_id]/regions/[region]/clusters/[cluster_name]`
  string dataproc_cluster = 1 [(google.api.field_behavior) = OPTIONAL];
}

// Auxiliary services configuration for a workload.
message PeripheralsConfig {
  // Optional. Resource name of an existing Dataproc Metastore service.
  //
  // Example:
  //
  // * `projects/[project_id]/locations/[region]/services/[service_id]`
  string metastore_service = 1 [
    (google.api.field_behavior) = OPTIONAL,
    (google.api.resource_reference) = {
      type: "metastore.googleapis.com/Service"
    }
  ];

  // Optional. The Spark History Server configuration for the workload.
  SparkHistoryServerConfig spark_history_server_config = 2
      [(google.api.field_behavior) = OPTIONAL];
}

// Runtime information about workload execution.
message RuntimeInfo {
  // Output only. Map of remote access endpoints (such as web interfaces and
  // APIs) to their URIs.
  map<string, string> endpoints = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. A URI pointing to the location of the stdout and stderr of
  // the workload.
  string output_uri = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. A URI pointing to the location of the diagnostics tarball.
  string diagnostic_output_uri = 3 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Approximate workload resource usage, calculated after the
  // workload finishes (see [Dataproc Serverless
  // pricing](https://cloud.google.com/dataproc-serverless/pricing)).
  UsageMetrics approximate_usage = 6
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Snapshot of current workload resource usage.
  UsageSnapshot current_usage = 7 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Usage metrics represent approximate total resources consumed by a workload.
message UsageMetrics {
  // Optional. DCU (Dataproc Compute Units) usage in (`milliDCU` x `seconds`)
  // (see [Dataproc Serverless
  // pricing](https://cloud.google.com/dataproc-serverless/pricing)).
  int64 milli_dcu_seconds = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Shuffle storage usage in (`GB` x `seconds`) (see
  // [Dataproc Serverless
  // pricing](https://cloud.google.com/dataproc-serverless/pricing)).
  int64 shuffle_storage_gb_seconds = 2 [(google.api.field_behavior) = OPTIONAL];
}

// The usage snapshot represents the resources consumed by a workload at a
// specified time.
message UsageSnapshot {
  // Optional. Milli (one-thousandth) Dataproc Compute Units (DCUs) (see
  // [Dataproc Serverless
  // pricing](https://cloud.google.com/dataproc-serverless/pricing)).
  int64 milli_dcu = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Shuffle storage in gigabytes (GB) (see [Dataproc Serverless
  // pricing](https://cloud.google.com/dataproc-serverless/pricing)).
  int64 shuffle_storage_gb = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. The timestamp of the usage snapshot.
  google.protobuf.Timestamp snapshot_time = 3
      [(google.api.field_behavior) = OPTIONAL];
}

// The cluster's GKE config.
message GkeClusterConfig {
  // Optional. A target GKE cluster to deploy to. It must be in the same
  // project and region as the Dataproc cluster (the GKE cluster can be zonal
  // or regional). Format:
  // 'projects/{project}/locations/{location}/clusters/{cluster_id}'
  string gke_cluster_target = 2 [
    (google.api.field_behavior) = OPTIONAL,
    (google.api.resource_reference) = {
      type: "container.googleapis.com/Cluster"
    }
  ];

  // Optional. GKE node pools where workloads will be scheduled. At least one
  // node pool must be assigned the `DEFAULT`
  // [GkeNodePoolTarget.Role][google.cloud.dataproc.v1.GkeNodePoolTarget.Role].
  // If a `GkeNodePoolTarget` is not specified, Dataproc constructs a `DEFAULT`
  // `GkeNodePoolTarget`. Each role can be given to only one
  // `GkeNodePoolTarget`. All node pools must have the same location settings.
  repeated GkeNodePoolTarget node_pool_target = 3
      [(google.api.field_behavior) = OPTIONAL];
}

// The configuration for running the Dataproc cluster on Kubernetes.
message KubernetesClusterConfig {
  // Optional. A namespace within the Kubernetes cluster to deploy into. If
  // this namespace does not exist, it is created. If it exists, Dataproc
  // verifies that another Dataproc VirtualCluster is not installed into it.
  // If not specified, the name of the Dataproc Cluster is used.
  string kubernetes_namespace = 1 [(google.api.field_behavior) = OPTIONAL];

  oneof config {
    // Required. The configuration for running the Dataproc cluster on GKE.
    GkeClusterConfig gke_cluster_config = 2
        [(google.api.field_behavior) = REQUIRED];
  }

  // Optional. The software configuration for this Dataproc cluster running on
  // Kubernetes.
  KubernetesSoftwareConfig kubernetes_software_config = 3
      [(google.api.field_behavior) = OPTIONAL];
}

// The software configuration for this Dataproc cluster running on Kubernetes.
message KubernetesSoftwareConfig {
  // The components that should be installed in this Dataproc cluster. The key
  // must be a string from the KubernetesComponent enumeration. The value is
  // the version of the software to be installed.
  // At least one entry must be specified.
  map<string, string> component_version = 1;

  // The properties to set on daemon config files.
  //
  // Property keys are specified in `prefix:property` format, for example
  // `spark:spark.kubernetes.container.image`. The following are supported
  // prefixes and their mappings:
  //
  // * spark: `spark-defaults.conf`
  //
  // For more information, see [Cluster
  // properties](https://cloud.google.com/dataproc/docs/concepts/cluster-properties).
  map<string, string> properties = 2;
}
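
// Illustrative sketch (not part of the API definition): a
// KubernetesSoftwareConfig could be written in textproto form as below. The
// component key, version string, and container image path are assumed values;
// the property key uses the `prefix:property` format described above:
//
//   component_version { key: "SPARK" value: "3.1-dataproc-7" }
//   properties {
//     key: "spark:spark.kubernetes.container.image"
//     value: "us-docker.pkg.dev/my-project/my-repo/my-spark-image:latest"
//   }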

// GKE node pools that Dataproc workloads run on.
message GkeNodePoolTarget {
  // `Role` specifies the tasks that will run on the node pool. Roles can be
  // specific to workloads. Exactly one
  // [GkeNodePoolTarget][google.cloud.dataproc.v1.GkeNodePoolTarget] within the
  // virtual cluster must have the `DEFAULT` role, which is used to run all
  // workloads that are not associated with a node pool.
  enum Role {
    // Role is unspecified.
    ROLE_UNSPECIFIED = 0;

    // At least one node pool must have the `DEFAULT` role.
    // Work assigned to a role that is not associated with a node pool
    // is assigned to the node pool with the `DEFAULT` role. For example,
    // work assigned to the `CONTROLLER` role will be assigned to the node pool
    // with the `DEFAULT` role if no node pool has the `CONTROLLER` role.
    DEFAULT = 1;

    // Run work associated with the Dataproc control plane (for example,
    // controllers and webhooks). Very low resource requirements.
    CONTROLLER = 2;

    // Run work associated with a Spark driver of a job.
    SPARK_DRIVER = 3;

    // Run work associated with a Spark executor of a job.
    SPARK_EXECUTOR = 4;
  }

  // Required. The target GKE node pool.
  // Format:
  // 'projects/{project}/locations/{location}/clusters/{cluster}/nodePools/{node_pool}'
  string node_pool = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. The roles associated with the GKE node pool.
  repeated Role roles = 2 [(google.api.field_behavior) = REQUIRED];

  // Input only. The configuration for the GKE node pool.
  //
  // If specified, Dataproc attempts to create a node pool with the
  // specified shape. If one with the same name already exists, it is
  // verified against all specified fields. If a field differs, the
  // virtual cluster creation will fail.
  //
  // If omitted, any node pool with the specified name is used. If a
  // node pool with the specified name does not exist, Dataproc creates a
  // node pool with default values.
  //
  // This is an input only field. It will not be returned by the API.
  GkeNodePoolConfig node_pool_config = 3
      [(google.api.field_behavior) = INPUT_ONLY];
}
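
// Illustrative sketch (not part of the API definition): a GkeNodePoolTarget
// could be written in textproto form as below, where the project, location,
// cluster, and node pool names are assumed values:
//
//   node_pool: "projects/my-project/locations/us-central1/clusters/my-gke-cluster/nodePools/dp-default"
//   roles: DEFAULT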

// The configuration of a GKE node pool used by a [Dataproc-on-GKE
// cluster](https://cloud.google.com/dataproc/docs/concepts/jobs/dataproc-gke#create-a-dataproc-on-gke-cluster).
message GkeNodePoolConfig {
  // Parameters that describe cluster nodes.
  message GkeNodeConfig {
    // Optional. The name of a Compute Engine [machine
    // type](https://cloud.google.com/compute/docs/machine-types).
    string machine_type = 1 [(google.api.field_behavior) = OPTIONAL];

    // Optional. The number of local SSD disks to attach to the node, which is
    // limited by the maximum number of disks allowable per zone (see [Adding
    // Local SSDs](https://cloud.google.com/compute/docs/disks/local-ssd)).
    int32 local_ssd_count = 7 [(google.api.field_behavior) = OPTIONAL];

    // Optional. Whether the nodes are created as legacy [preemptible VM
    // instances](https://cloud.google.com/compute/docs/instances/preemptible).
    // Also see
    // [Spot][google.cloud.dataproc.v1.GkeNodePoolConfig.GkeNodeConfig.spot]
    // VMs, preemptible VM instances without a maximum lifetime. Legacy and
    // Spot preemptible nodes cannot be used in a node pool with the
    // `CONTROLLER`
    // [role](/dataproc/docs/reference/rest/v1/projects.regions.clusters#role)
    // or in the DEFAULT node pool if the CONTROLLER role is not assigned (the
    // DEFAULT node pool will assume the CONTROLLER role).
    bool preemptible = 10 [(google.api.field_behavior) = OPTIONAL];

    // Optional. A list of [hardware
    // accelerators](https://cloud.google.com/compute/docs/gpus) to attach to
    // each node.
    repeated GkeNodePoolAcceleratorConfig accelerators = 11
        [(google.api.field_behavior) = OPTIONAL];

    // Optional. [Minimum CPU
    // platform](https://cloud.google.com/compute/docs/instances/specify-min-cpu-platform)
    // to be used by this instance. The instance may be scheduled on the
    // specified or a newer CPU platform. Specify the friendly names of CPU
    // platforms, such as "Intel Haswell" or "Intel Sandy Bridge".
    string min_cpu_platform = 13 [(google.api.field_behavior) = OPTIONAL];

    // Optional. The [Customer Managed Encryption Key
    // (CMEK)](https://cloud.google.com/kubernetes-engine/docs/how-to/using-cmek)
    // used to encrypt the boot disk attached to each node in the node pool.
    // Specify the key using the following format:
    // <code>projects/<var>KEY_PROJECT_ID</var>/locations/<var>LOCATION</var>/keyRings/<var>RING_NAME</var>/cryptoKeys/<var>KEY_NAME</var></code>.
    string boot_disk_kms_key = 23 [(google.api.field_behavior) = OPTIONAL];

    // Optional. Whether the nodes are created as [Spot VM
    // instances](https://cloud.google.com/compute/docs/instances/spot).
    // Spot VMs are the latest update to legacy
    // [preemptible
    // VMs][google.cloud.dataproc.v1.GkeNodePoolConfig.GkeNodeConfig.preemptible].
    // Spot VMs do not have a maximum lifetime. Legacy and Spot preemptible
    // nodes cannot be used in a node pool with the `CONTROLLER`
    // [role](/dataproc/docs/reference/rest/v1/projects.regions.clusters#role)
    // or in the DEFAULT node pool if the CONTROLLER role is not assigned (the
    // DEFAULT node pool will assume the CONTROLLER role).
    bool spot = 32 [(google.api.field_behavior) = OPTIONAL];
  }

  // A GkeNodePoolAcceleratorConfig represents a Hardware Accelerator request
  // for a node pool.
  message GkeNodePoolAcceleratorConfig {
    // The number of accelerator cards exposed to an instance.
    int64 accelerator_count = 1;

    // The accelerator type resource name (see GPUs on Compute Engine).
    string accelerator_type = 2;

    // Size of partitions to create on the GPU. Valid values are described in
    // the NVIDIA [mig user
    // guide](https://docs.nvidia.com/datacenter/tesla/mig-user-guide/#partitioning).
    string gpu_partition_size = 3;
  }

  // GkeNodePoolAutoscalingConfig contains information the cluster autoscaler
  // needs to adjust the size of the node pool to the current cluster usage.
  message GkeNodePoolAutoscalingConfig {
    // The minimum number of nodes in the node pool. Must be >= 0 and <=
    // max_node_count.
    int32 min_node_count = 2;

    // The maximum number of nodes in the node pool. Must be >= min_node_count,
    // and must be > 0.
    // **Note:** Quota must be sufficient to scale up the cluster.
    int32 max_node_count = 3;
  }

  // Optional. The node pool configuration.
  GkeNodeConfig config = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. The list of Compute Engine
  // [zones](https://cloud.google.com/compute/docs/zones#available) where
  // node pool nodes associated with a Dataproc on GKE virtual cluster
  // will be located.
  //
  // **Note:** All node pools associated with a virtual cluster
  // must be located in the same region as the virtual cluster, and they must
  // be located in the same zone within that region.
  //
  // If a location is not specified during node pool creation, Dataproc on GKE
  // will choose the zone.
  repeated string locations = 13 [(google.api.field_behavior) = OPTIONAL];

  // Optional. The autoscaler configuration for this node pool. The autoscaler
  // is enabled only when a valid configuration is present.
  GkeNodePoolAutoscalingConfig autoscaling = 4
      [(google.api.field_behavior) = OPTIONAL];
}
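
// Illustrative sketch (not part of the API definition): a GkeNodePoolConfig
// could be written in textproto form as below, where the machine type, zone,
// and node counts are assumed values:
//
//   config { machine_type: "n1-standard-4" spot: true }
//   locations: "us-central1-a"
//   autoscaling { min_node_count: 0 max_node_count: 10 }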

// Cluster components that can be activated.
enum Component {
  // Unspecified component. Specifying this will cause Cluster creation to
  // fail.
  COMPONENT_UNSPECIFIED = 0;

  // The Anaconda python distribution. The Anaconda component is not supported
  // in the Dataproc
  // <a
  // href="/dataproc/docs/concepts/versioning/dataproc-release-2.0">2.0
  // image</a>. The 2.0 image is pre-installed with Miniconda.
  ANACONDA = 5;

  // Docker
  DOCKER = 13;

  // The Druid query engine. (alpha)
  DRUID = 9;

  // Flink
  FLINK = 14;

  // HBase. (beta)
  HBASE = 11;

  // The Hive Web HCatalog (the REST service for accessing HCatalog).
  HIVE_WEBHCAT = 3;

  // Hudi.
  HUDI = 18;

  // The Jupyter Notebook.
  JUPYTER = 1;

  // The Presto query engine.
  PRESTO = 6;

  // The Trino query engine.
  TRINO = 17;

  // The Ranger service.
  RANGER = 12;

  // The Solr service.
  SOLR = 10;

  // The Zeppelin notebook.
  ZEPPELIN = 4;

  // The Zookeeper service.
  ZOOKEEPER = 8;
}

// Actions in response to failure of a resource associated with a cluster.
enum FailureAction {
  // When FailureAction is unspecified, failure action defaults to NO_ACTION.
  FAILURE_ACTION_UNSPECIFIED = 0;

  // Take no action on failure to create a cluster resource. NO_ACTION is the
  // default.
  NO_ACTION = 1;

  // Delete the failed cluster resource.
  DELETE = 2;
}