// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.automl.v1beta1;

import "google/cloud/automl/v1beta1/classification.proto";
import "google/cloud/automl/v1beta1/column_spec.proto";
import "google/cloud/automl/v1beta1/data_items.proto";
import "google/cloud/automl/v1beta1/data_stats.proto";
import "google/cloud/automl/v1beta1/ranges.proto";
import "google/cloud/automl/v1beta1/regression.proto";
import "google/cloud/automl/v1beta1/temporal.proto";
import "google/protobuf/struct.proto";
import "google/protobuf/timestamp.proto";

option go_package = "cloud.google.com/go/automl/apiv1beta1/automlpb;automlpb";
option java_multiple_files = true;
option java_package = "com.google.cloud.automl.v1beta1";
option php_namespace = "Google\\Cloud\\AutoMl\\V1beta1";
option ruby_package = "Google::Cloud::AutoML::V1beta1";

// Metadata for a dataset used for AutoML Tables.
message TablesDatasetMetadata {
  // Output only. The table_spec_id of the primary table of this dataset.
  string primary_table_spec_id = 1;

  // column_spec_id of the primary table's column that should be used as the
  // training & prediction target.
  // This column must be non-nullable and have one of the following data types
  // (otherwise model creation will error):
  //
  // * CATEGORY
  //
  // * FLOAT64
  //
  // If the type is CATEGORY, only up to 100 unique values may exist in that
  // column across all rows.
  //
  // NOTE: Updates of this field will instantly affect any other users
  // concurrently working with the dataset.
  string target_column_spec_id = 2;

  // column_spec_id of the primary table's column that should be used as the
  // weight column, i.e. the higher the value the more important the row will be
  // during model training.
  // Required type: FLOAT64.
  // Allowed values: 0 to 10000, inclusive on both ends; 0 means the row is
  //                 ignored for training.
  // If not set, all rows are assumed to have equal weight of 1.
  // NOTE: Updates of this field will instantly affect any other users
  // concurrently working with the dataset.
  string weight_column_spec_id = 3;

  // column_spec_id of the primary table column which specifies a possible ML
  // use of the row, i.e. the column will be used to split the rows into TRAIN,
  // VALIDATE and TEST sets.
  // Required type: STRING.
  // This column, if set, must either have all of `TRAIN`, `VALIDATE`, `TEST`
  // among its values, or only have `TEST`, `UNASSIGNED` values. In the latter
  // case the rows with `UNASSIGNED` value will be assigned by AutoML. Note
  // that if a given ml use distribution makes it impossible to create a "good"
  // model, that call will error describing the issue.
  // If both this column_spec_id and primary table's time_column_spec_id are not
  // set, then all rows are treated as `UNASSIGNED`.
  // NOTE: Updates of this field will instantly affect any other users
  // concurrently working with the dataset.
  string ml_use_column_spec_id = 4;

  // Output only. Correlations between
  //
  // [TablesDatasetMetadata.target_column_spec_id][google.cloud.automl.v1beta1.TablesDatasetMetadata.target_column_spec_id],
  // and other columns of the
  //
  // [TablesDatasetMetadata.primary_table_spec_id][google.cloud.automl.v1beta1.TablesDatasetMetadata.primary_table_spec_id].
  // Only set if the target column is set. Mapping from other column spec id to
  // its CorrelationStats with the target column.
  // This field may be stale, see the stats_update_time field for the
  // timestamp at which these stats were last updated.
  map<string, CorrelationStats> target_column_correlations = 6;

  // Output only. The most recent timestamp when target_column_correlations
  // field and all descendant ColumnSpec.data_stats and
  // ColumnSpec.top_correlated_columns fields were last (re-)generated. Any
  // changes that happened to the dataset afterwards are not reflected in these
  // fields' values. The regeneration happens in the background on a best effort
  // basis.
  google.protobuf.Timestamp stats_update_time = 7;

  // NOTE(review): field number 5 is not declared in this message. If it
  // belonged to a removed field it should be marked `reserved` - confirm.
}

// Model metadata specific to AutoML Tables.
message TablesModelMetadata {
  // Additional optimization objective configuration. Required for
  // `MAXIMIZE_PRECISION_AT_RECALL` and `MAXIMIZE_RECALL_AT_PRECISION`,
  // otherwise unused.
  oneof additional_optimization_objective_config {
    // Required when optimization_objective is "MAXIMIZE_PRECISION_AT_RECALL".
    // Must be between 0 and 1, inclusive.
    float optimization_objective_recall_value = 17;

    // Required when optimization_objective is "MAXIMIZE_RECALL_AT_PRECISION".
    // Must be between 0 and 1, inclusive.
    float optimization_objective_precision_value = 18;
  }

  // Column spec of the dataset's primary table's column the model is
  // predicting. Snapshotted when model creation started.
  //
  // Only 3 fields are used:
  //
  // * name - May be set on CreateModel, if it's not then the ColumnSpec
  //   corresponding to the current target_column_spec_id of the dataset
  //   the model is trained from is used.
  //   If neither is set, CreateModel will error.
  //
  // * display_name - Output only.
  //
  // * data_type - Output only.
  ColumnSpec target_column_spec = 2;

  // Column specs of the dataset's primary table's columns, on which
  // the model is trained and which are used as the input for predictions.
  // The
  //
  // [target_column][google.cloud.automl.v1beta1.TablesModelMetadata.target_column_spec]
  // as well as, according to dataset's state upon model creation,
  //
  // [weight_column][google.cloud.automl.v1beta1.TablesDatasetMetadata.weight_column_spec_id],
  // and
  //
  // [ml_use_column][google.cloud.automl.v1beta1.TablesDatasetMetadata.ml_use_column_spec_id]
  // must never be included here.
  //
  // Only 3 fields are used:
  //
  // * name - May be set on CreateModel, if set only the columns specified are
  //   used, otherwise all primary table's columns (except the ones listed
  //   above) are used for the training and prediction input.
  //
  // * display_name - Output only.
  //
  // * data_type - Output only.
  repeated ColumnSpec input_feature_column_specs = 3;

  // Objective function the model is optimizing towards. The training process
  // creates a model that maximizes/minimizes the value of the objective
  // function over the validation set.
  //
  // The supported optimization objectives depend on the prediction type.
  // If the field is not set, a default objective function is used.
  //
  // CLASSIFICATION_BINARY:
  //   "MAXIMIZE_AU_ROC" (default) - Maximize the area under the receiver
  //                                 operating characteristic (ROC) curve.
  //   "MINIMIZE_LOG_LOSS" - Minimize log loss.
  //   "MAXIMIZE_AU_PRC" - Maximize the area under the precision-recall curve.
  //   "MAXIMIZE_PRECISION_AT_RECALL" - Maximize precision for a specified
  //                                    recall value.
  //   "MAXIMIZE_RECALL_AT_PRECISION" - Maximize recall for a specified
  //                                    precision value.
  //
  // CLASSIFICATION_MULTI_CLASS:
  //   "MINIMIZE_LOG_LOSS" (default) - Minimize log loss.
  //
  // REGRESSION:
  //   "MINIMIZE_RMSE" (default) - Minimize root-mean-squared error (RMSE).
  //   "MINIMIZE_MAE" - Minimize mean-absolute error (MAE).
  //   "MINIMIZE_RMSLE" - Minimize root-mean-squared log error (RMSLE).
  string optimization_objective = 4;

  // Output only. Auxiliary information for each of the
  // input_feature_column_specs with respect to this particular model.
  repeated TablesModelColumnInfo tables_model_column_info = 5;

  // Required. The train budget of creating this model, expressed in milli node
  // hours, i.e. 1,000 value in this field means 1 node hour.
  //
  // The training cost of the model will not exceed this budget. The backend
  // attempts to bring the final cost close to the budget, though it may end up
  // being (even) noticeably smaller - at the backend's discretion. This
  // especially may happen when further model training ceases to provide any
  // improvements.
  //
  // If the budget is set to a value known to be insufficient to train a
  // model for the given dataset, the training won't be attempted and
  // will error.
  //
  // The train budget must be between 1,000 and 72,000 milli node hours,
  // inclusive.
  int64 train_budget_milli_node_hours = 6;

  // Output only. The actual training cost of the model, expressed in milli
  // node hours, i.e. 1,000 value in this field means 1 node hour. Guaranteed
  // to not exceed the train budget.
  int64 train_cost_milli_node_hours = 7;

  // Use the entire training budget. This disables the early stopping feature.
  // By default, the early stopping feature is enabled, which means that AutoML
  // Tables might stop training before the entire training budget has been used.
  bool disable_early_stopping = 12;

  // NOTE(review): field numbers 1, 8-11 and 13-16 are not declared in this
  // message. If any belonged to removed fields they should be marked
  // `reserved` - confirm.
}

// Contains annotation details specific to Tables.
message TablesAnnotation {
  // Output only. A confidence estimate between 0.0 and 1.0, inclusive. A higher
  // value means greater confidence in the returned value.
  // For
  //
  // [target_column_spec][google.cloud.automl.v1beta1.TablesModelMetadata.target_column_spec]
  // of FLOAT64 data type the score is not populated.
  float score = 1;

  // Output only. Only populated when
  //
  // [target_column_spec][google.cloud.automl.v1beta1.TablesModelMetadata.target_column_spec]
  // has FLOAT64 data type. An interval that has a 95% chance of containing the
  // exactly correct target value.
  DoubleRange prediction_interval = 4;

  // The predicted value of the row's
  //
  // [target_column][google.cloud.automl.v1beta1.TablesModelMetadata.target_column_spec].
  // The value depends on the column's DataType:
  //
  // * CATEGORY - the predicted (with the above confidence `score`) CATEGORY
  //   value.
  //
  // * FLOAT64 - the predicted (with above `prediction_interval`) FLOAT64 value.
  google.protobuf.Value value = 2;

  // Output only. Auxiliary information for each of the model's
  //
  // [input_feature_column_specs][google.cloud.automl.v1beta1.TablesModelMetadata.input_feature_column_specs]
  // with respect to this particular prediction.
  // If no other fields than
  //
  // [column_spec_name][google.cloud.automl.v1beta1.TablesModelColumnInfo.column_spec_name]
  // and
  //
  // [column_display_name][google.cloud.automl.v1beta1.TablesModelColumnInfo.column_display_name]
  // would be populated, then this whole field is not.
  repeated TablesModelColumnInfo tables_model_column_info = 3;

  // Output only. Stores the prediction score for the baseline example, which
  // is defined as the example with all values set to their baseline values.
  // This is used as part of the Sampled Shapley explanation of the model's
  // prediction. This field is populated only when feature importance is
  // requested. For regression models, this holds the baseline prediction for
  // the baseline example. For classification models, this holds the baseline
  // prediction for the baseline example for the argmax class.
  float baseline_score = 5;
}

// Information specific to a given column and Tables Model, in the context
// of the Model and the predictions created by it.
message TablesModelColumnInfo {
  // Output only. The name of the ColumnSpec describing the column. Not
  // populated when this proto is outputted to BigQuery.
  string column_spec_name = 1;

  // Output only. The display name of the column (same as the display_name of
  // its ColumnSpec).
  string column_display_name = 2;

  // Output only. When given as part of a Model (always populated):
  // Measurement of how much the correctness of the model's predictions on the
  // TEST data depends on values in this column. A value between 0 and 1,
  // higher means higher influence. These values are normalized - for all input
  // feature columns of a given model they add to 1.
  //
  // When given back by Predict (populated iff
  // [feature_importance][google.cloud.automl.v1beta1.PredictRequest.params]
  // param is set) or Batch Predict (populated iff
  // [feature_importance][google.cloud.automl.v1beta1.PredictRequest.params]
  // param is set):
  // Measurement of how impactful for the prediction returned for the given row
  // the value in this column was. Specifically, the feature importance
  // specifies the marginal contribution that the feature made to the prediction
  // score compared to the baseline score. These values are computed using the
  // Sampled Shapley method.
  float feature_importance = 3;
}
