// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.aiplatform.v1;

import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/cloud/aiplatform/v1/encryption_spec.proto";
import "google/cloud/aiplatform/v1/io.proto";
import "google/cloud/aiplatform/v1/model.proto";
import "google/cloud/aiplatform/v1/pipeline_state.proto";
import "google/protobuf/struct.proto";
import "google/protobuf/timestamp.proto";
import "google/rpc/status.proto";

option csharp_namespace = "Google.Cloud.AIPlatform.V1";
option go_package = "cloud.google.com/go/aiplatform/apiv1/aiplatformpb;aiplatformpb";
option java_multiple_files = true;
option java_outer_classname = "TrainingPipelineProto";
option java_package = "com.google.cloud.aiplatform.v1";
option php_namespace = "Google\\Cloud\\AIPlatform\\V1";
option ruby_package = "Google::Cloud::AIPlatform::V1";

// The TrainingPipeline orchestrates tasks associated with training a Model. It
// always executes the training task, and optionally may also
// export data from Vertex AI's Dataset which becomes the training input,
// [upload][google.cloud.aiplatform.v1.ModelService.UploadModel] the Model to
// Vertex AI, and evaluate the Model.
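//
// A minimal sketch of a TrainingPipeline in protobuf text format. The values
// below (display name, schema URI, dataset ID, and split fractions) are
// illustrative assumptions, not defaults or required values:
//
//     display_name: "my-training-pipeline"
//     training_task_definition: "gs://google-cloud-aiplatform/schema/trainingjob/definition/custom_task_1.0.0.yaml"
//     input_data_config {
//       dataset_id: "1234567890"
//       fraction_split { training_fraction: 0.8 validation_fraction: 0.1 test_fraction: 0.1 }
//     }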
message TrainingPipeline {
  option (google.api.resource) = {
    type: "aiplatform.googleapis.com/TrainingPipeline"
    pattern: "projects/{project}/locations/{location}/trainingPipelines/{training_pipeline}"
  };

  // Output only. Resource name of the TrainingPipeline.
  string name = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Required. The user-defined name of this TrainingPipeline.
  string display_name = 2 [(google.api.field_behavior) = REQUIRED];

  // Specifies Vertex AI owned input data that may be used for training the
  // Model. The TrainingPipeline's
  // [training_task_definition][google.cloud.aiplatform.v1.TrainingPipeline.training_task_definition]
  // should make clear whether this config is used and if there are any special
  // requirements on how it should be filled. If nothing about this config is
  // mentioned in the
  // [training_task_definition][google.cloud.aiplatform.v1.TrainingPipeline.training_task_definition],
  // then it should be assumed that the TrainingPipeline does not depend on this
  // configuration.
  InputDataConfig input_data_config = 3;

  // Required. A Google Cloud Storage path to the YAML file that defines the
  // training task, which is responsible for producing the model artifact and
  // may also include additional auxiliary work. The definition files that can
  // be used here are found in
  // gs://google-cloud-aiplatform/schema/trainingjob/definition/.
  // Note: The URI given on output is immutable and will probably differ from
  // the one given on input, including in the URI scheme. The output URI points
  // to a location where the user has only read access.
  string training_task_definition = 4 [(google.api.field_behavior) = REQUIRED];

  // Required. The training task's parameter(s), as specified in the
  // [training_task_definition][google.cloud.aiplatform.v1.TrainingPipeline.training_task_definition]'s
  // `inputs`.
  google.protobuf.Value training_task_inputs = 5
      [(google.api.field_behavior) = REQUIRED];

  // Output only. The metadata information as specified in the
  // [training_task_definition][google.cloud.aiplatform.v1.TrainingPipeline.training_task_definition]'s
  // `metadata`. This metadata is auxiliary runtime and final information
  // about the training task. While the pipeline is running, this information
  // is populated only on a best-effort basis. Only present if the
  // pipeline's
  // [training_task_definition][google.cloud.aiplatform.v1.TrainingPipeline.training_task_definition]
  // contains a `metadata` object.
  google.protobuf.Value training_task_metadata = 6
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Describes the Model that may be uploaded (via
  // [ModelService.UploadModel][google.cloud.aiplatform.v1.ModelService.UploadModel])
  // by this TrainingPipeline. The TrainingPipeline's
  // [training_task_definition][google.cloud.aiplatform.v1.TrainingPipeline.training_task_definition]
  // should make clear whether this Model description should be populated, and
  // if there are any special requirements regarding how it should be filled.
  // If nothing is mentioned in the
  // [training_task_definition][google.cloud.aiplatform.v1.TrainingPipeline.training_task_definition],
  // then it should be assumed that this field should not be filled and that
  // the training task either uploads the Model without needing this
  // information, or does not support uploading a Model as part of the
  // pipeline. When the pipeline's state becomes `PIPELINE_STATE_SUCCEEDED` and
  // the trained Model has been uploaded into Vertex AI, then the
  // model_to_upload's resource [name][google.cloud.aiplatform.v1.Model.name] is
  // populated. The Model is always uploaded into the Project and Location in
  // which this pipeline runs.
  Model model_to_upload = 7;

  // Optional. The ID to use for the uploaded Model, which will become the final
  // component of the model resource name.
  //
  // This value may be up to 63 characters, and valid characters are
  // `[a-z0-9_-]`. The first character cannot be a number or hyphen.
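  //
  // For example (illustrative IDs, not reserved values): `my-model_001` would
  // be a valid ID, while `1st-model` would not, because it starts with a
  // number.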
  string model_id = 22 [(google.api.field_behavior) = OPTIONAL];

  // Optional. When this field is specified, the `model_to_upload` will not be
  // uploaded as a new model; instead, it will become a new version of this
  // `parent_model`.
  string parent_model = 21 [(google.api.field_behavior) = OPTIONAL];

  // Output only. The detailed state of the pipeline.
  PipelineState state = 9 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Only populated when the pipeline's state is
  // `PIPELINE_STATE_FAILED` or `PIPELINE_STATE_CANCELLED`.
  google.rpc.Status error = 10 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Time when the TrainingPipeline was created.
  google.protobuf.Timestamp create_time = 11
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Time when the TrainingPipeline for the first time entered the
  // `PIPELINE_STATE_RUNNING` state.
  google.protobuf.Timestamp start_time = 12
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Time when the TrainingPipeline entered any of the following
  // states: `PIPELINE_STATE_SUCCEEDED`, `PIPELINE_STATE_FAILED`,
  // `PIPELINE_STATE_CANCELLED`.
  google.protobuf.Timestamp end_time = 13
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Time when the TrainingPipeline was most recently updated.
  google.protobuf.Timestamp update_time = 14
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // The labels with user-defined metadata to organize TrainingPipelines.
  //
  // Label keys and values can be no longer than 64 characters
  // (Unicode codepoints), can only contain lowercase letters, numeric
  // characters, underscores and dashes. International characters are allowed.
  //
  // See https://goo.gl/xmQnxf for more information and examples of labels.
  map<string, string> labels = 15;

  // Customer-managed encryption key spec for a TrainingPipeline. If set, this
  // TrainingPipeline will be secured by this key.
  //
  // Note: The Model trained by this TrainingPipeline is also secured by this
  // key if
  // [model_to_upload][google.cloud.aiplatform.v1.TrainingPipeline.encryption_spec]
  // is not set separately.
  EncryptionSpec encryption_spec = 18;
}

// Specifies Vertex AI owned input data to be used for training, and
// possibly evaluating, the Model.
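//
// A sketch of one possible configuration for custom training on a tabular
// Dataset, in protobuf text format. The dataset ID, column name, and bucket
// path are illustrative assumptions:
//
//     dataset_id: "1234567890"
//     predefined_split { key: "data_split" }
//     gcs_destination { output_uri_prefix: "gs://my-bucket/training-output/" }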
message InputDataConfig {
  // The instructions for how the input data should be split between the
  // training, validation, and test sets.
  // If no split type is provided, the
  // [fraction_split][google.cloud.aiplatform.v1.InputDataConfig.fraction_split]
  // is used by default.
  oneof split {
    // Split based on fractions defining the size of each set.
    FractionSplit fraction_split = 2;

    // Split based on the provided filters for each set.
    FilterSplit filter_split = 3;

    // Supported only for tabular Datasets.
    //
    // Split based on a predefined key.
    PredefinedSplit predefined_split = 4;

    // Supported only for tabular Datasets.
    //
    // Split based on the timestamp of the input data pieces.
    TimestampSplit timestamp_split = 5;

    // Supported only for tabular Datasets.
    //
    // Split based on the distribution of the specified column.
    StratifiedSplit stratified_split = 12;
  }

  // Only applicable to Custom and Hyperparameter Tuning TrainingPipelines.
  //
  // The destination to which the training data is to be written.
  //
  // Supported destination file formats:
  //   * For non-tabular data: "jsonl".
  //   * For tabular data: "csv" and "bigquery".
  //
  // The following Vertex AI environment variables are passed to containers
  // or Python modules of the training task when this field is set:
  //
  // * AIP_DATA_FORMAT : Exported data format.
  // * AIP_TRAINING_DATA_URI : Sharded exported training data URIs.
  // * AIP_VALIDATION_DATA_URI : Sharded exported validation data URIs.
  // * AIP_TEST_DATA_URI : Sharded exported test data URIs.
  oneof destination {
    // The Cloud Storage location to which the training data is to be
    // written. In the given directory a new directory is created with
    // name:
    // `dataset-<dataset-id>-<annotation-type>-<timestamp-of-training-call>`
    // where timestamp is in YYYY-MM-DDThh:mm:ss.sssZ ISO-8601 format.
    // All training input data is written into that directory.
    //
    // The Vertex AI environment variables representing Cloud Storage
    // data URIs are represented in the Cloud Storage wildcard
    // format to support sharded data, e.g.: "gs://.../training-*.jsonl"
    //
    // * AIP_DATA_FORMAT = "jsonl" for non-tabular data, "csv" for tabular data
    // * AIP_TRAINING_DATA_URI =
    // "gcs_destination/dataset-<dataset-id>-<annotation-type>-<time>/training-*.${AIP_DATA_FORMAT}"
    //
    // * AIP_VALIDATION_DATA_URI =
    // "gcs_destination/dataset-<dataset-id>-<annotation-type>-<time>/validation-*.${AIP_DATA_FORMAT}"
    //
    // * AIP_TEST_DATA_URI =
    // "gcs_destination/dataset-<dataset-id>-<annotation-type>-<time>/test-*.${AIP_DATA_FORMAT}"
    GcsDestination gcs_destination = 8;

    // Only applicable to custom training with a tabular Dataset that has a
    // BigQuery source.
    //
    // The BigQuery project location to which the training data is to be
    // written. In the given project a new dataset is created with name
    // `dataset_<dataset-id>_<annotation-type>_<timestamp-of-training-call>`
    // where timestamp is in YYYY_MM_DDThh_mm_ss_sssZ format. All training
    // input data is written into that dataset. In the dataset, three
    // tables are created: `training`, `validation`, and `test`.
    //
    // * AIP_DATA_FORMAT = "bigquery".
    // * AIP_TRAINING_DATA_URI =
    // "bigquery_destination.dataset_<dataset-id>_<annotation-type>_<time>.training"
    //
    // * AIP_VALIDATION_DATA_URI =
    // "bigquery_destination.dataset_<dataset-id>_<annotation-type>_<time>.validation"
    //
    // * AIP_TEST_DATA_URI =
    // "bigquery_destination.dataset_<dataset-id>_<annotation-type>_<time>.test"
    BigQueryDestination bigquery_destination = 10;
  }

  // Required. The ID of the Dataset in the same Project and Location whose
  // data will be used to train the Model. The Dataset must use a schema
  // compatible with the Model being trained; what is compatible should be
  // described in the TrainingPipeline's [training_task_definition]
  // [google.cloud.aiplatform.v1.TrainingPipeline.training_task_definition].
  // For tabular Datasets, all of their data is exported for training, to pick
  // and choose from.
  string dataset_id = 1 [(google.api.field_behavior) = REQUIRED];

  // Applicable only to Datasets that have DataItems and Annotations.
  //
  // A filter on Annotations of the Dataset. Only Annotations that both
  // match this filter and belong to DataItems not ignored by the split method
  // are used in the training, validation, or test role, depending on
  // the role of the DataItem they are on (for auto-assigned DataItems that
  // role is decided by Vertex AI). A filter with the same syntax as the one
  // used in
  // [ListAnnotations][google.cloud.aiplatform.v1.DatasetService.ListAnnotations]
  // may be used, but note that here it filters across all Annotations of the
  // Dataset, and not just within a single DataItem.
  string annotations_filter = 6;

  // Applicable only to custom training with Datasets that have DataItems and
  // Annotations.
  //
  // Cloud Storage URI that points to a YAML file describing the annotation
  // schema. The schema is defined as an OpenAPI 3.0.2 [Schema
  // Object](https://github.com/OAI/OpenAPI-Specification/blob/main/versions/3.0.2.md#schemaObject).
  // The schema files that can be used here are found in
  // gs://google-cloud-aiplatform/schema/dataset/annotation/. Note that the
  // chosen schema must be consistent with
  // [metadata][google.cloud.aiplatform.v1.Dataset.metadata_schema_uri] of the
  // Dataset specified by
  // [dataset_id][google.cloud.aiplatform.v1.InputDataConfig.dataset_id].
  //
  // Only Annotations that both match this schema and belong to DataItems not
  // ignored by the split method are used in the training, validation, or
  // test role, depending on the role of the DataItem they are on.
  //
  // When used in conjunction with
  // [annotations_filter][google.cloud.aiplatform.v1.InputDataConfig.annotations_filter],
  // the Annotations used for training are filtered by both
  // [annotations_filter][google.cloud.aiplatform.v1.InputDataConfig.annotations_filter]
  // and
  // [annotation_schema_uri][google.cloud.aiplatform.v1.InputDataConfig.annotation_schema_uri].
  string annotation_schema_uri = 9;

  // Only applicable to Datasets that have SavedQueries.
  //
  // The ID of a SavedQuery (annotation set) under the Dataset specified by
  // [dataset_id][google.cloud.aiplatform.v1.InputDataConfig.dataset_id] used
  // for filtering Annotations for training.
  //
  // Only Annotations that are associated with this SavedQuery are used for
  // training. When used in conjunction with
  // [annotations_filter][google.cloud.aiplatform.v1.InputDataConfig.annotations_filter],
  // the Annotations used for training are filtered by both
  // [saved_query_id][google.cloud.aiplatform.v1.InputDataConfig.saved_query_id]
  // and
  // [annotations_filter][google.cloud.aiplatform.v1.InputDataConfig.annotations_filter].
  //
  // Only one of
  // [saved_query_id][google.cloud.aiplatform.v1.InputDataConfig.saved_query_id]
  // and
  // [annotation_schema_uri][google.cloud.aiplatform.v1.InputDataConfig.annotation_schema_uri]
  // should be specified, as both of them represent the same thing: the
  // problem type.
  string saved_query_id = 7;

  // Whether to persist the ML use assignment to data item system labels.
  bool persist_ml_use_assignment = 11;
}

// Assigns the input data to training, validation, and test sets as per the
// given fractions. Any of `training_fraction`, `validation_fraction`, and
// `test_fraction` may optionally be provided; together they must sum to at
// most 1. If the provided fractions sum to less than 1, the remainder is
// assigned to sets as decided by Vertex AI. If none of the fractions are set,
// by default roughly 80% of data is used for training, 10% for validation,
// and 10% for test.
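//
// For example (an illustrative configuration, not a default): setting only
// `training_fraction: 0.8` and `test_fraction: 0.1` leaves the remaining 0.1
// of the data to be assigned to sets as decided by Vertex AI.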
message FractionSplit {
  // The fraction of the input data that is to be used to train the Model.
  double training_fraction = 1;

  // The fraction of the input data that is to be used to validate the Model.
  double validation_fraction = 2;

  // The fraction of the input data that is to be used to evaluate the Model.
  double test_fraction = 3;
}

// Assigns input data to training, validation, and test sets based on the given
// filters; data pieces not matched by any filter are ignored. Currently only
// supported for Datasets containing DataItems.
// If any filter in this message should match nothing, it can be set to '-'
// (the minus sign).
//
// Supported only for unstructured Datasets.
//
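// For example (hypothetical filter expressions; the accepted filter syntax is
// the one used by
// [DatasetService.ListDataItems][google.cloud.aiplatform.v1.DatasetService.ListDataItems]):
//
//     training_filter: "labels.ml_use=training"
//     validation_filter: "labels.ml_use=validation"
//     test_filter: "-"
//
// where setting `test_filter` to '-' assigns nothing to the test set.
//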
message FilterSplit {
  // Required. A filter on DataItems of the Dataset. DataItems that match
  // this filter are used to train the Model. A filter with the same syntax
  // as the one used in
  // [DatasetService.ListDataItems][google.cloud.aiplatform.v1.DatasetService.ListDataItems]
  // may be used. If a single DataItem is matched by more than one of the
  // FilterSplit filters, then it is assigned to the first set that applies to
  // it in the training, validation, test order.
  string training_filter = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. A filter on DataItems of the Dataset. DataItems that match
  // this filter are used to validate the Model. A filter with the same syntax
  // as the one used in
  // [DatasetService.ListDataItems][google.cloud.aiplatform.v1.DatasetService.ListDataItems]
  // may be used. If a single DataItem is matched by more than one of the
  // FilterSplit filters, then it is assigned to the first set that applies to
  // it in the training, validation, test order.
  string validation_filter = 2 [(google.api.field_behavior) = REQUIRED];

  // Required. A filter on DataItems of the Dataset. DataItems that match
  // this filter are used to test the Model. A filter with the same syntax
  // as the one used in
  // [DatasetService.ListDataItems][google.cloud.aiplatform.v1.DatasetService.ListDataItems]
  // may be used. If a single DataItem is matched by more than one of the
  // FilterSplit filters, then it is assigned to the first set that applies to
  // it in the training, validation, test order.
  string test_filter = 3 [(google.api.field_behavior) = REQUIRED];
}

// Assigns input data to training, validation, and test sets based on the
// value of a provided key.
//
// Supported only for tabular Datasets.
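//
// For example, a hypothetical column named `data_split` could hold the value
// `training`, `validation`, or `test` for each row; rows whose value is
// missing or not one of these three are ignored by the pipeline.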
message PredefinedSplit {
  // Required. The key is a name of one of the Dataset's data columns.
  // The value of the key (either the label's value or value in the column)
  // must be one of {`training`, `validation`, `test`}, and it defines to which
  // set the given piece of data is assigned. If for a piece of data the key
  // is not present or has an invalid value, that piece is ignored by the
  // pipeline.
  string key = 1 [(google.api.field_behavior) = REQUIRED];
}

// Assigns input data to training, validation, and test sets based on the
// provided timestamps. The oldest data pieces are assigned to the training
// set, the next to the validation set, and the latest to the test set.
//
// Supported only for tabular Datasets.
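//
// For example (illustrative values): with `training_fraction: 0.8`,
// `validation_fraction: 0.1`, and `test_fraction: 0.1`, the earliest 80% of
// rows, ordered by the timestamp in the `key` column, go to training, the
// next 10% to validation, and the most recent 10% to test.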
message TimestampSplit {
  // The fraction of the input data that is to be used to train the Model.
  double training_fraction = 1;

  // The fraction of the input data that is to be used to validate the Model.
  double validation_fraction = 2;

  // The fraction of the input data that is to be used to evaluate the Model.
  double test_fraction = 3;

  // Required. The key is a name of one of the Dataset's data columns.
  // The values of the key (the values in the column) must be in RFC 3339
  // `date-time` format, where `time-offset` = `"Z"`
  // (e.g. 1985-04-12T23:20:50.52Z). If for a piece of data the key is not
  // present or has an invalid value, that piece is ignored by the pipeline.
  string key = 4 [(google.api.field_behavior) = REQUIRED];
}

// Assigns input data to the training, validation, and test sets so that the
// distribution of values found in the categorical column (as specified by the
// `key` field) is mirrored within each split. The fraction values determine
// the relative sizes of the splits.
//
// For example, if the specified column has three values, with 50% of the rows
// having value "A", 25% value "B", and 25% value "C", and the split fractions
// are specified as 80/10/10, then the training set will constitute 80% of the
// training data, with about 50% of the training set rows having the value "A"
// for the specified column, about 25% having the value "B", and about 25%
// having the value "C".
//
// Only the top 500 occurring values are used; any values not in the top
// 500 values are randomly assigned to a split. If fewer than three rows
// contain a specific value, those rows are randomly assigned.
//
// Supported only for tabular Datasets.
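//
// Working through the example above with a hypothetical 1,000-row Dataset:
// the 80/10/10 fractions yield roughly 800 training, 100 validation, and 100
// test rows, and the training set would contain about 400 "A" rows, 200 "B"
// rows, and 200 "C" rows, mirroring the 50/25/25 distribution of the column.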
message StratifiedSplit {
  // The fraction of the input data that is to be used to train the Model.
  double training_fraction = 1;

  // The fraction of the input data that is to be used to validate the Model.
  double validation_fraction = 2;

  // The fraction of the input data that is to be used to evaluate the Model.
  double test_fraction = 3;

  // Required. The key is a name of one of the Dataset's data columns.
  // The key provided must be for a categorical column.
  string key = 4 [(google.api.field_behavior) = REQUIRED];
}