// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.aiplatform.v1;

import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/cloud/aiplatform/v1/encryption_spec.proto";
import "google/cloud/aiplatform/v1/io.proto";
import "google/cloud/aiplatform/v1/model.proto";
import "google/cloud/aiplatform/v1/pipeline_state.proto";
import "google/protobuf/struct.proto";
import "google/protobuf/timestamp.proto";
import "google/rpc/status.proto";

option csharp_namespace = "Google.Cloud.AIPlatform.V1";
option go_package = "cloud.google.com/go/aiplatform/apiv1/aiplatformpb;aiplatformpb";
option java_multiple_files = true;
option java_outer_classname = "TrainingPipelineProto";
option java_package = "com.google.cloud.aiplatform.v1";
option php_namespace = "Google\\Cloud\\AIPlatform\\V1";
option ruby_package = "Google::Cloud::AIPlatform::V1";

// The TrainingPipeline orchestrates tasks associated with training a Model. It
// always executes the training task, and may optionally also
// export data from Vertex AI's Dataset which becomes the training input,
// [upload][google.cloud.aiplatform.v1.ModelService.UploadModel] the Model to
// Vertex AI, and evaluate the Model.
message TrainingPipeline {
  option (google.api.resource) = {
    type: "aiplatform.googleapis.com/TrainingPipeline"
    pattern: "projects/{project}/locations/{location}/trainingPipelines/{training_pipeline}"
  };

  // Output only. Resource name of the TrainingPipeline.
  string name = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Required. The user-defined name of this TrainingPipeline.
  string display_name = 2 [(google.api.field_behavior) = REQUIRED];

  // Specifies Vertex AI owned input data that may be used for training the
  // Model. The TrainingPipeline's
  // [training_task_definition][google.cloud.aiplatform.v1.TrainingPipeline.training_task_definition]
  // should make clear whether this config is used and if there are any special
  // requirements on how it should be filled. If nothing about this config is
  // mentioned in the
  // [training_task_definition][google.cloud.aiplatform.v1.TrainingPipeline.training_task_definition],
  // then it should be assumed that the TrainingPipeline does not depend on
  // this configuration.
  InputDataConfig input_data_config = 3;

  // Required. A Google Cloud Storage path to the YAML file that defines the
  // training task, which is responsible for producing the model artifact and
  // may also include additional auxiliary work. The definition files that can
  // be used here are found in
  // gs://google-cloud-aiplatform/schema/trainingjob/definition/.
  // Note: The URI given on output will be immutable and probably different,
  // including the URI scheme, from the one given on input. The output URI will
  // point to a location where the user has read access only.
  string training_task_definition = 4 [(google.api.field_behavior) = REQUIRED];

  // Required. The training task's parameter(s), as specified in the
  // [training_task_definition][google.cloud.aiplatform.v1.TrainingPipeline.training_task_definition]'s
  // `inputs`.
  google.protobuf.Value training_task_inputs = 5
      [(google.api.field_behavior) = REQUIRED];

  // Output only. The metadata information as specified in the
  // [training_task_definition][google.cloud.aiplatform.v1.TrainingPipeline.training_task_definition]'s
  // `metadata`. This metadata is auxiliary runtime and final information
  // about the training task. While the pipeline is running, this information
  // is populated only on a best-effort basis. Only present if the pipeline's
  // [training_task_definition][google.cloud.aiplatform.v1.TrainingPipeline.training_task_definition]
  // contains a `metadata` object.
  google.protobuf.Value training_task_metadata = 6
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Describes the Model that may be uploaded (via
  // [ModelService.UploadModel][google.cloud.aiplatform.v1.ModelService.UploadModel])
  // by this TrainingPipeline. The TrainingPipeline's
  // [training_task_definition][google.cloud.aiplatform.v1.TrainingPipeline.training_task_definition]
  // should make clear whether this Model description should be populated, and
  // if there are any special requirements regarding how it should be filled.
  // If nothing is mentioned in the
  // [training_task_definition][google.cloud.aiplatform.v1.TrainingPipeline.training_task_definition],
  // then it should be assumed that this field should not be filled and the
  // training task either uploads the Model without needing this information,
  // or does not support uploading a Model as part of the pipeline. When the
  // pipeline's state becomes `PIPELINE_STATE_SUCCEEDED` and the trained Model
  // has been uploaded into Vertex AI, then the model_to_upload's resource
  // [name][google.cloud.aiplatform.v1.Model.name] is populated. The Model is
  // always uploaded into the Project and Location in which this pipeline runs.
  Model model_to_upload = 7;

  // Optional. The ID to use for the uploaded Model, which will become the
  // final component of the model resource name.
  //
  // This value may be up to 63 characters, and valid characters are
  // `[a-z0-9_-]`. The first character cannot be a number or hyphen.
  string model_id = 22 [(google.api.field_behavior) = OPTIONAL];

  // Optional. When this field is specified, `model_to_upload` will not be
  // uploaded as a new model; instead, it will become a new version of this
  // `parent_model`.
  string parent_model = 21 [(google.api.field_behavior) = OPTIONAL];

  // Output only. The detailed state of the pipeline.
  PipelineState state = 9 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Only populated when the pipeline's state is
  // `PIPELINE_STATE_FAILED` or `PIPELINE_STATE_CANCELLED`.
  google.rpc.Status error = 10 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Time when the TrainingPipeline was created.
  google.protobuf.Timestamp create_time = 11
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Time when the TrainingPipeline for the first time entered the
  // `PIPELINE_STATE_RUNNING` state.
  google.protobuf.Timestamp start_time = 12
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Time when the TrainingPipeline entered any of the following
  // states: `PIPELINE_STATE_SUCCEEDED`, `PIPELINE_STATE_FAILED`,
  // `PIPELINE_STATE_CANCELLED`.
  google.protobuf.Timestamp end_time = 13
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Time when the TrainingPipeline was most recently updated.
  google.protobuf.Timestamp update_time = 14
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // The labels with user-defined metadata to organize TrainingPipelines.
  //
  // Label keys and values can be no longer than 64 characters
  // (Unicode codepoints), and can only contain lowercase letters, numeric
  // characters, underscores and dashes. International characters are allowed.
  //
  // See https://goo.gl/xmQnxf for more information and examples of labels.
  map<string, string> labels = 15;

  // Customer-managed encryption key spec for a TrainingPipeline. If set, this
  // TrainingPipeline will be secured by this key.
  //
  // Note: The Model trained by this TrainingPipeline is also secured by this
  // key if
  // [model_to_upload][google.cloud.aiplatform.v1.TrainingPipeline.model_to_upload]
  // does not have its encryption spec set separately.
  EncryptionSpec encryption_spec = 18;
}

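// Illustrative sketch (not part of the API surface): a minimal
// TrainingPipeline, shown as the JSON body a client might send to
// PipelineService.CreateTrainingPipeline. The schema URI, dataset ID, and
// display names are placeholders, and `trainingTaskInputs` is elided because
// its shape is defined entirely by the chosen training task definition.
//
//   {
//     "displayName": "my-training-pipeline",
//     "trainingTaskDefinition":
//         "gs://google-cloud-aiplatform/schema/trainingjob/definition/<task>.yaml",
//     "trainingTaskInputs": { ... },
//     "inputDataConfig": {
//       "datasetId": "1234567890",
//       "fractionSplit": {
//         "trainingFraction": 0.8,
//         "validationFraction": 0.1,
//         "testFraction": 0.1
//       }
//     },
//     "modelToUpload": { "displayName": "my-model" }
//   }
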
// Specifies Vertex AI owned input data to be used for training, and
// possibly evaluating, the Model.
message InputDataConfig {
  // The instructions for how the input data should be split between the
  // training, validation and test sets.
  // If no split type is provided, the
  // [fraction_split][google.cloud.aiplatform.v1.InputDataConfig.fraction_split]
  // is used by default.
  oneof split {
    // Split based on fractions defining the size of each set.
    FractionSplit fraction_split = 2;

    // Split based on the provided filters for each set.
    FilterSplit filter_split = 3;

    // Supported only for tabular Datasets.
    //
    // Split based on a predefined key.
    PredefinedSplit predefined_split = 4;

    // Supported only for tabular Datasets.
    //
    // Split based on the timestamp of the input data pieces.
    TimestampSplit timestamp_split = 5;

    // Supported only for tabular Datasets.
    //
    // Split based on the distribution of the specified column.
    StratifiedSplit stratified_split = 12;
  }

  // Only applicable to Custom and Hyperparameter Tuning TrainingPipelines.
  //
  // The destination where the training data is to be written.
  //
  // Supported destination file formats:
  // * For non-tabular data: "jsonl".
  // * For tabular data: "csv" and "bigquery".
  //
  // The following Vertex AI environment variables are passed to containers
  // or Python modules of the training task when this field is set:
  //
  // * AIP_DATA_FORMAT : Exported data format.
  // * AIP_TRAINING_DATA_URI : Sharded exported training data URIs.
  // * AIP_VALIDATION_DATA_URI : Sharded exported validation data URIs.
  // * AIP_TEST_DATA_URI : Sharded exported test data URIs.
  oneof destination {
    // The Cloud Storage location where the training data is to be
    // written to. In the given directory a new directory is created with
    // name:
    // `dataset-<dataset-id>-<annotation-type>-<timestamp-of-training-call>`,
    // where the timestamp is in YYYY-MM-DDThh:mm:ss.sssZ ISO-8601 format.
    // All training input data is written into that directory.
    //
    // The Vertex AI environment variables representing Cloud Storage
    // data URIs are represented in the Cloud Storage wildcard
    // format to support sharded data, e.g. "gs://.../training-*.jsonl".
    //
    // * AIP_DATA_FORMAT = "jsonl" for non-tabular data, "csv" for tabular data
    // * AIP_TRAINING_DATA_URI =
    //   "gcs_destination/dataset-<dataset-id>-<annotation-type>-<time>/training-*.${AIP_DATA_FORMAT}"
    //
    // * AIP_VALIDATION_DATA_URI =
    //   "gcs_destination/dataset-<dataset-id>-<annotation-type>-<time>/validation-*.${AIP_DATA_FORMAT}"
    //
    // * AIP_TEST_DATA_URI =
    //   "gcs_destination/dataset-<dataset-id>-<annotation-type>-<time>/test-*.${AIP_DATA_FORMAT}"
    GcsDestination gcs_destination = 8;

    // Only applicable to custom training with a tabular Dataset with a
    // BigQuery source.
    //
    // The BigQuery project location where the training data is to be written
    // to. In the given project a new dataset is created with name
    // `dataset_<dataset-id>_<annotation-type>_<timestamp-of-training-call>`,
    // where the timestamp is in YYYY_MM_DDThh_mm_ss_sssZ format. All training
    // input data is written into that dataset. In the dataset three
    // tables are created, `training`, `validation` and `test`.
    //
    // * AIP_DATA_FORMAT = "bigquery".
    // * AIP_TRAINING_DATA_URI =
    //   "bigquery_destination.dataset_<dataset-id>_<annotation-type>_<time>.training"
    //
    // * AIP_VALIDATION_DATA_URI =
    //   "bigquery_destination.dataset_<dataset-id>_<annotation-type>_<time>.validation"
    //
    // * AIP_TEST_DATA_URI =
    //   "bigquery_destination.dataset_<dataset-id>_<annotation-type>_<time>.test"
    BigQueryDestination bigquery_destination = 10;
  }

  // Required. The ID of the Dataset in the same Project and Location whose
  // data will be used to train the Model. The Dataset must use a schema
  // compatible with the Model being trained, and what is compatible should be
  // described in the used TrainingPipeline's [training_task_definition]
  // [google.cloud.aiplatform.v1.TrainingPipeline.training_task_definition].
  // For tabular Datasets, all of their data is exported for training, to pick
  // and choose from.
  string dataset_id = 1 [(google.api.field_behavior) = REQUIRED];

  // Applicable only to Datasets that have DataItems and Annotations.
  //
  // A filter on Annotations of the Dataset. Only Annotations that both
  // match this filter and belong to DataItems not ignored by the split method
  // are used in, respectively, the training, validation or test role,
  // depending on the role of the DataItem they are on (for auto-assigned
  // DataItems, that role is decided by Vertex AI). A filter with the same
  // syntax as the one used in
  // [ListAnnotations][google.cloud.aiplatform.v1.DatasetService.ListAnnotations]
  // may be used, but note that here it filters across all Annotations of the
  // Dataset, and not just within a single DataItem.
  string annotations_filter = 6;

  // Applicable only to custom training with Datasets that have DataItems and
  // Annotations.
  //
  // Cloud Storage URI that points to a YAML file describing the annotation
  // schema. The schema is defined as an OpenAPI 3.0.2 [Schema
  // Object](https://github.com/OAI/OpenAPI-Specification/blob/main/versions/3.0.2.md#schemaObject).
  // The schema files that can be used here are found in
  // gs://google-cloud-aiplatform/schema/dataset/annotation/ . Note that the
  // chosen schema must be consistent with
  // [metadata][google.cloud.aiplatform.v1.Dataset.metadata_schema_uri] of the
  // Dataset specified by
  // [dataset_id][google.cloud.aiplatform.v1.InputDataConfig.dataset_id].
  //
  // Only Annotations that both match this schema and belong to DataItems not
  // ignored by the split method are used in, respectively, the training,
  // validation or test role, depending on the role of the DataItem they are
  // on.
  //
  // When used in conjunction with
  // [annotations_filter][google.cloud.aiplatform.v1.InputDataConfig.annotations_filter],
  // the Annotations used for training are filtered by both
  // [annotations_filter][google.cloud.aiplatform.v1.InputDataConfig.annotations_filter]
  // and
  // [annotation_schema_uri][google.cloud.aiplatform.v1.InputDataConfig.annotation_schema_uri].
  string annotation_schema_uri = 9;

  // Only applicable to Datasets that have SavedQueries.
  //
  // The ID of a SavedQuery (annotation set) under the Dataset specified by
  // [dataset_id][google.cloud.aiplatform.v1.InputDataConfig.dataset_id] used
  // for filtering Annotations for training.
  //
  // Only Annotations that are associated with this SavedQuery are used for
  // training. When used in conjunction with
  // [annotations_filter][google.cloud.aiplatform.v1.InputDataConfig.annotations_filter],
  // the Annotations used for training are filtered by both
  // [saved_query_id][google.cloud.aiplatform.v1.InputDataConfig.saved_query_id]
  // and
  // [annotations_filter][google.cloud.aiplatform.v1.InputDataConfig.annotations_filter].
  //
  // Only one of
  // [saved_query_id][google.cloud.aiplatform.v1.InputDataConfig.saved_query_id]
  // and
  // [annotation_schema_uri][google.cloud.aiplatform.v1.InputDataConfig.annotation_schema_uri]
  // should be specified, as both of them represent the same thing: the problem
  // type.
  string saved_query_id = 7;

  // Whether to persist the ML use assignment to data item system labels.
  bool persist_ml_use_assignment = 11;
}

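// Illustrative sketch (not part of the API surface): an InputDataConfig in
// text proto form for a custom training run that exports JSONL shards to
// Cloud Storage. The dataset ID and bucket path are placeholders;
// `output_uri_prefix` is the field defined by GcsDestination in io.proto.
//
//   dataset_id: "1234567890"
//   fraction_split {
//     training_fraction: 0.8
//     validation_fraction: 0.1
//     test_fraction: 0.1
//   }
//   gcs_destination {
//     output_uri_prefix: "gs://my-bucket/training-exports/"
//   }
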
// Assigns the input data to training, validation, and test sets as per the
// given fractions. Any of `training_fraction`, `validation_fraction` and
// `test_fraction` may optionally be provided; together they must sum to at
// most 1. If the provided fractions sum to less than 1, the remainder is
// assigned to sets as decided by Vertex AI. If none of the fractions are set,
// by default roughly 80% of data is used for training, 10% for validation,
// and 10% for test.
message FractionSplit {
  // The fraction of the input data that is to be used to train the Model.
  double training_fraction = 1;

  // The fraction of the input data that is to be used to validate the Model.
  double validation_fraction = 2;

  // The fraction of the input data that is to be used to evaluate the Model.
  double test_fraction = 3;
}

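// Illustrative sketch: a FractionSplit in text proto form whose fractions sum
// to less than 1. Per the message description above, the remaining data is
// assigned to sets as Vertex AI decides.
//
//   fraction_split {
//     training_fraction: 0.7
//     validation_fraction: 0.15
//     # test_fraction is left unset; 0.7 + 0.15 = 0.85, so 15% remains.
//   }
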
// Assigns input data to training, validation, and test sets based on the given
// filters; data pieces not matched by any filter are ignored. Currently only
// supported for Datasets containing DataItems.
// If a filter in this message is meant to match nothing, it can be set as
// '-' (the minus sign).
//
// Supported only for unstructured Datasets.
message FilterSplit {
  // Required. A filter on DataItems of the Dataset. DataItems that match
  // this filter are used to train the Model. A filter with the same syntax
  // as the one used in
  // [DatasetService.ListDataItems][google.cloud.aiplatform.v1.DatasetService.ListDataItems]
  // may be used. If a single DataItem is matched by more than one of the
  // FilterSplit filters, then it is assigned to the first set that applies to
  // it in the training, validation, test order.
  string training_filter = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. A filter on DataItems of the Dataset. DataItems that match
  // this filter are used to validate the Model. A filter with the same syntax
  // as the one used in
  // [DatasetService.ListDataItems][google.cloud.aiplatform.v1.DatasetService.ListDataItems]
  // may be used. If a single DataItem is matched by more than one of the
  // FilterSplit filters, then it is assigned to the first set that applies to
  // it in the training, validation, test order.
  string validation_filter = 2 [(google.api.field_behavior) = REQUIRED];

  // Required. A filter on DataItems of the Dataset. DataItems that match
  // this filter are used to test the Model. A filter with the same syntax
  // as the one used in
  // [DatasetService.ListDataItems][google.cloud.aiplatform.v1.DatasetService.ListDataItems]
  // may be used. If a single DataItem is matched by more than one of the
  // FilterSplit filters, then it is assigned to the first set that applies to
  // it in the training, validation, test order.
  string test_filter = 3 [(google.api.field_behavior) = REQUIRED];
}

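// Illustrative sketch: a FilterSplit in text proto form that routes DataItems
// by a user-managed label and deliberately leaves the test set empty by using
// "-" (match nothing), as described above. The label key and filter
// expressions are placeholders only; the accepted syntax is whatever
// DatasetService.ListDataItems accepts.
//
//   filter_split {
//     training_filter: "labels.my-split=train"
//     validation_filter: "labels.my-split=validation"
//     test_filter: "-"
//   }
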
// Assigns input data to training, validation, and test sets based on the
// value of a provided key.
//
// Supported only for tabular Datasets.
message PredefinedSplit {
  // Required. The key is the name of one of the Dataset's data columns.
  // The value of the key (either the label's value or the value in the column)
  // must be one of {`training`, `validation`, `test`}, and it defines to which
  // set the given piece of data is assigned. If for a piece of data the key
  // is not present or has an invalid value, that piece is ignored by the
  // pipeline.
  string key = 1 [(google.api.field_behavior) = REQUIRED];
}

// Assigns input data to training, validation, and test sets based on
// provided timestamps. The youngest data pieces are assigned to the training
// set, the next to the validation set, and the oldest to the test set.
//
// Supported only for tabular Datasets.
message TimestampSplit {
  // The fraction of the input data that is to be used to train the Model.
  double training_fraction = 1;

  // The fraction of the input data that is to be used to validate the Model.
  double validation_fraction = 2;

  // The fraction of the input data that is to be used to evaluate the Model.
  double test_fraction = 3;

  // Required. The key is the name of one of the Dataset's data columns.
  // The values of the key (the values in the column) must be in RFC 3339
  // `date-time` format, where `time-offset` = `"Z"`
  // (e.g. 1985-04-12T23:20:50.52Z). If for a piece of data the key is not
  // present or has an invalid value, that piece is ignored by the pipeline.
  string key = 4 [(google.api.field_behavior) = REQUIRED];
}

// Assigns input data to the training, validation, and test sets so that the
// distribution of values found in the categorical column (as specified by the
// `key` field) is mirrored within each split. The fraction values determine
// the relative sizes of the splits.
//
// For example, if the specified column has three values, with 50% of the rows
// having the value "A", 25% the value "B", and 25% the value "C", and the
// split fractions are specified as 80/10/10, then the training set will
// constitute 80% of the input data, with about 50% of the training set rows
// having the value "A" for the specified column, about 25% having the value
// "B", and about 25% having the value "C".
//
// Only the top 500 occurring values are used; any values not in the top
// 500 values are randomly assigned to a split. If fewer than three rows
// contain a specific value, those rows are randomly assigned.
//
// Supported only for tabular Datasets.
message StratifiedSplit {
  // The fraction of the input data that is to be used to train the Model.
  double training_fraction = 1;

  // The fraction of the input data that is to be used to validate the Model.
  double validation_fraction = 2;

  // The fraction of the input data that is to be used to evaluate the Model.
  double test_fraction = 3;

  // Required. The key is the name of one of the Dataset's data columns.
  // The key provided must be for a categorical column.
  string key = 4 [(google.api.field_behavior) = REQUIRED];
}

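// Illustrative sketch: a StratifiedSplit in text proto form over a
// hypothetical categorical column named "product_category", using the
// 80/10/10 fractions from the example above so that each split roughly
// mirrors the column's value distribution.
//
//   stratified_split {
//     training_fraction: 0.8
//     validation_fraction: 0.1
//     test_fraction: 0.1
//     key: "product_category"
//   }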