// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.automl.v1beta1;

import "google/cloud/automl/v1beta1/classification.proto";
import "google/cloud/automl/v1beta1/column_spec.proto";
import "google/cloud/automl/v1beta1/data_items.proto";
import "google/cloud/automl/v1beta1/data_stats.proto";
import "google/cloud/automl/v1beta1/ranges.proto";
import "google/cloud/automl/v1beta1/regression.proto";
import "google/cloud/automl/v1beta1/temporal.proto";
import "google/protobuf/struct.proto";
import "google/protobuf/timestamp.proto";

option go_package = "cloud.google.com/go/automl/apiv1beta1/automlpb;automlpb";
option java_multiple_files = true;
option java_package = "com.google.cloud.automl.v1beta1";
option php_namespace = "Google\\Cloud\\AutoMl\\V1beta1";
option ruby_package = "Google::Cloud::AutoML::V1beta1";

// Metadata for a dataset used for AutoML Tables.
message TablesDatasetMetadata {
  // Output only. The table_spec_id of the primary table of this dataset.
  string primary_table_spec_id = 1;

  // column_spec_id of the primary table's column that should be used as the
  // training & prediction target.
  // This column must be non-nullable and have one of the following data types
  // (otherwise model creation will error):
  //
  // * CATEGORY
  //
  // * FLOAT64
  //
  // If the type is CATEGORY, only up to
  // 100 unique values may exist in that column across all rows.
  //
  // NOTE: Updates of this field will instantly affect any other users
  // concurrently working with the dataset.
  string target_column_spec_id = 2;

  // column_spec_id of the primary table's column that should be used as the
  // weight column, i.e. the higher the value the more important the row will be
  // during model training.
  // Required type: FLOAT64.
  // Allowed values: 0 to 10000, inclusive on both ends; 0 means the row is
  //                 ignored for training.
  // If not set all rows are assumed to have equal weight of 1.
  // NOTE: Updates of this field will instantly affect any other users
  // concurrently working with the dataset.
  string weight_column_spec_id = 3;

  // column_spec_id of the primary table column which specifies a possible ML
  // use of the row, i.e. the column will be used to split the rows into TRAIN,
  // VALIDATE and TEST sets.
  // Required type: STRING.
  // This column, if set, must either have all of `TRAIN`, `VALIDATE`, `TEST`
  // among its values, or only have `TEST`, `UNASSIGNED` values. In the latter
  // case the rows with `UNASSIGNED` value will be assigned by AutoML. Note
  // that if a given ml use distribution makes it impossible to create a "good"
  // model, that call will error describing the issue.
  // If both this column_spec_id and primary table's time_column_spec_id are not
  // set, then all rows are treated as `UNASSIGNED`.
  // NOTE: Updates of this field will instantly affect any other users
  // concurrently working with the dataset.
  string ml_use_column_spec_id = 4;

  // Output only. Correlations between
  //
  // [TablesDatasetMetadata.target_column_spec_id][google.cloud.automl.v1beta1.TablesDatasetMetadata.target_column_spec_id],
  // and other columns of the
  //
  // [TablesDatasetMetadata.primary_table][google.cloud.automl.v1beta1.TablesDatasetMetadata.primary_table_spec_id].
  // Only set if the target column is set. Mapping from other column spec id to
  // its CorrelationStats with the target column.
  // This field may be stale, see the stats_update_time field
  // for the timestamp at which these stats were last updated.
  map<string, CorrelationStats> target_column_correlations = 6;

  // Output only. The most recent timestamp when target_column_correlations
  // field and all descendant ColumnSpec.data_stats and
  // ColumnSpec.top_correlated_columns fields were last (re-)generated. Any
  // changes that happened to the dataset afterwards are not reflected in these
  // fields' values. The regeneration happens in the background on a best effort
  // basis.
  google.protobuf.Timestamp stats_update_time = 7;
}

// Model metadata specific to AutoML Tables.
message TablesModelMetadata {
  // Additional optimization objective configuration. Required for
  // `MAXIMIZE_PRECISION_AT_RECALL` and `MAXIMIZE_RECALL_AT_PRECISION`,
  // otherwise unused.
  oneof additional_optimization_objective_config {
    // Required when optimization_objective is "MAXIMIZE_PRECISION_AT_RECALL".
    // Must be between 0 and 1, inclusive.
    float optimization_objective_recall_value = 17;

    // Required when optimization_objective is "MAXIMIZE_RECALL_AT_PRECISION".
    // Must be between 0 and 1, inclusive.
    float optimization_objective_precision_value = 18;
  }

  // Column spec of the dataset's primary table's column the model is
  // predicting. Snapshotted when model creation started.
  //
  // Only 3 fields are used:
  //
  // * name - May be set on CreateModel, if it's not then the ColumnSpec
  //          corresponding to the current target_column_spec_id of the dataset
  //          the model is trained from is used.
  //          If neither is set, CreateModel will error.
  //
  // * display_name - Output only.
  //
  // * data_type - Output only.
  ColumnSpec target_column_spec = 2;

  // Column specs of the dataset's primary table's columns, on which
  // the model is trained and which are used as the input for predictions.
  // The
  //
  // [target_column][google.cloud.automl.v1beta1.TablesModelMetadata.target_column_spec]
  // as well as, according to dataset's state upon model creation,
  //
  // [weight_column][google.cloud.automl.v1beta1.TablesDatasetMetadata.weight_column_spec_id],
  // and
  //
  // [ml_use_column][google.cloud.automl.v1beta1.TablesDatasetMetadata.ml_use_column_spec_id]
  // must never be included here.
  //
  // Only 3 fields are used:
  //
  // * name - May be set on CreateModel, if set only the columns specified are
  //          used, otherwise all primary table's columns (except the ones listed
  //          above) are used for the training and prediction input.
  //
  // * display_name - Output only.
  //
  // * data_type - Output only.
  repeated ColumnSpec input_feature_column_specs = 3;

  // Objective function the model is optimizing towards. The training process
  // creates a model that maximizes/minimizes the value of the objective
  // function over the validation set.
  //
  // The supported optimization objectives depend on the prediction type.
  // If the field is not set, a default objective function is used.
  //
  // CLASSIFICATION_BINARY:
  //   "MAXIMIZE_AU_ROC" (default) - Maximize the area under the receiver
  //                                 operating characteristic (ROC) curve.
  //   "MINIMIZE_LOG_LOSS" - Minimize log loss.
  //   "MAXIMIZE_AU_PRC" - Maximize the area under the precision-recall curve.
  //   "MAXIMIZE_PRECISION_AT_RECALL" - Maximize precision for a specified
  //                                    recall value.
  //   "MAXIMIZE_RECALL_AT_PRECISION" - Maximize recall for a specified
  //                                    precision value.
  //
  // CLASSIFICATION_MULTI_CLASS:
  //   "MINIMIZE_LOG_LOSS" (default) - Minimize log loss.
  //
  // REGRESSION:
  //   "MINIMIZE_RMSE" (default) - Minimize root-mean-squared error (RMSE).
  //   "MINIMIZE_MAE" - Minimize mean-absolute error (MAE).
  //   "MINIMIZE_RMSLE" - Minimize root-mean-squared log error (RMSLE).
  string optimization_objective = 4;

  // Output only. Auxiliary information for each of the
  // input_feature_column_specs with respect to this particular model.
  repeated TablesModelColumnInfo tables_model_column_info = 5;

  // Required. The train budget of creating this model, expressed in milli node
  // hours i.e. 1,000 value in this field means 1 node hour.
  //
  // The training cost of the model will not exceed this budget. The final cost
  // will be attempted to be close to the budget, though may end up being (even)
  // noticeably smaller - at the backend's discretion. This especially may
  // happen when further model training ceases to provide any improvements.
  //
  // If the budget is set to a value known to be insufficient to train a
  // model for the given dataset, the training won't be attempted and
  // will error.
  //
  // The train budget must be between 1,000 and 72,000 milli node hours,
  // inclusive.
  int64 train_budget_milli_node_hours = 6;

  // Output only. The actual training cost of the model, expressed in milli
  // node hours, i.e. 1,000 value in this field means 1 node hour. Guaranteed
  // to not exceed the train budget.
  int64 train_cost_milli_node_hours = 7;

  // Use the entire training budget. This disables the early stopping feature.
  // By default, the early stopping feature is enabled, which means that AutoML
  // Tables might stop training before the entire training budget has been used.
  bool disable_early_stopping = 12;
}

// Contains annotation details specific to Tables.
message TablesAnnotation {
  // Output only. A confidence estimate between 0.0 and 1.0, inclusive. A higher
  // value means greater confidence in the returned value.
  // For
  //
  // [target_column_spec][google.cloud.automl.v1beta1.TablesModelMetadata.target_column_spec]
  // of FLOAT64 data type the score is not populated.
  float score = 1;

  // Output only. Only populated when
  //
  // [target_column_spec][google.cloud.automl.v1beta1.TablesModelMetadata.target_column_spec]
  // has FLOAT64 data type. An interval in which the exactly correct target
  // value has 95% chance to be in.
  DoubleRange prediction_interval = 4;

  // The predicted value of the row's
  //
  // [target_column][google.cloud.automl.v1beta1.TablesModelMetadata.target_column_spec].
  // The value depends on the column's DataType:
  //
  // * CATEGORY - the predicted (with the above confidence `score`) CATEGORY
  //   value.
  //
  // * FLOAT64 - the predicted (with above `prediction_interval`) FLOAT64 value.
  google.protobuf.Value value = 2;

  // Output only. Auxiliary information for each of the model's
  //
  // [input_feature_column_specs][google.cloud.automl.v1beta1.TablesModelMetadata.input_feature_column_specs]
  // with respect to this particular prediction.
  // If no other fields than
  //
  // [column_spec_name][google.cloud.automl.v1beta1.TablesModelColumnInfo.column_spec_name]
  // and
  //
  // [column_display_name][google.cloud.automl.v1beta1.TablesModelColumnInfo.column_display_name]
  // would be populated, then this whole field is not.
  repeated TablesModelColumnInfo tables_model_column_info = 3;

  // Output only. Stores the prediction score for the baseline example, which
  // is defined as the example with all values set to their baseline values.
  // This is used as part of the Sampled Shapley explanation of the model's
  // prediction. This field is populated only when feature importance is
  // requested. For regression models, this holds the baseline prediction for
  // the baseline example. For classification models, this holds the baseline
  // prediction for the baseline example for the argmax class.
  float baseline_score = 5;
}

// Information specific to a given column and Tables Model, in the context
// of the Model and the predictions created by it.
message TablesModelColumnInfo {
  // Output only. The name of the ColumnSpec describing the column. Not
  // populated when this proto is outputted to BigQuery.
  string column_spec_name = 1;

  // Output only. The display name of the column (same as the display_name of
  // its ColumnSpec).
  string column_display_name = 2;

  // Output only. When given as part of a Model (always populated):
  // Measurement of how much model predictions correctness on the TEST data
  // depend on values in this column. A value between 0 and 1, higher means
  // higher influence. These values are normalized - for all input feature
  // columns of a given model they add to 1.
  //
  // When given back by Predict (populated iff
  // [feature_importance
  // param][google.cloud.automl.v1beta1.PredictRequest.params] is set) or Batch
  // Predict (populated iff
  // [feature_importance][google.cloud.automl.v1beta1.PredictRequest.params]
  // param is set):
  // Measurement of how impactful for the prediction returned for the given row
  // the value in this column was. Specifically, the feature importance
  // specifies the marginal contribution that the feature made to the prediction
  // score compared to the baseline score. These values are computed using the
  // Sampled Shapley method.
  float feature_importance = 3;
}