# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""This API defines the FeatureColumn abstraction.

FeatureColumns provide a high level abstraction for ingesting and representing
features. FeatureColumns are also the primary way of encoding features for
canned `tf.estimator.Estimator`s.

When using FeatureColumns with `Estimators`, the type of feature column you
should choose depends on (1) the feature type and (2) the model type.

1. Feature type:

  * Continuous features can be represented by `numeric_column`.
  * Categorical features can be represented by any `categorical_column_with_*`
    column:
    - `categorical_column_with_vocabulary_list`
    - `categorical_column_with_vocabulary_file`
    - `categorical_column_with_hash_bucket`
    - `categorical_column_with_identity`
    - `weighted_categorical_column`

2. Model type:

  * Deep neural network models (`DNNClassifier`, `DNNRegressor`).

    Continuous features can be directly fed into deep neural network models.

      age_column = numeric_column("age")

    To feed sparse features into DNN models, wrap the column with
    `embedding_column` or `indicator_column`. `indicator_column` is recommended
    for features with only a few possible values. For features with many
    possible values, to reduce the size of your model, `embedding_column` is
    recommended.

      embedded_dept_column = embedding_column(
          categorical_column_with_vocabulary_list(
              "department", ["math", "philosophy", ...]), dimension=10)

  * Wide (aka linear) models (`LinearClassifier`, `LinearRegressor`).

    Sparse features can be fed directly into linear models. They behave like an
    indicator column but with an efficient implementation.

      dept_column = categorical_column_with_vocabulary_list("department",
          ["math", "philosophy", "english"])

    It is recommended that continuous features be bucketized before being
    fed into linear models.

      bucketized_age_column = bucketized_column(
          source_column=age_column,
          boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])

    Sparse features can be crossed (also known as conjuncted or combined) in
    order to form non-linearities, and then fed into linear models.

      cross_dept_age_column = crossed_column(
          columns=["department", bucketized_age_column],
          hash_bucket_size=1000)

Example of building canned `Estimator`s using FeatureColumns:

  ```python
  # Define features and transformations
  deep_feature_columns = [age_column, embedded_dept_column]
  wide_feature_columns = [dept_column, bucketized_age_column,
                          cross_dept_age_column]

  # Build deep model
  estimator = DNNClassifier(
      feature_columns=deep_feature_columns,
      hidden_units=[500, 250, 50])
  estimator.train(...)

  # Or build a wide model
  estimator = LinearClassifier(
      feature_columns=wide_feature_columns)
  estimator.train(...)

  # Or build a wide and deep model!
  estimator = DNNLinearCombinedClassifier(
      linear_feature_columns=wide_feature_columns,
      dnn_feature_columns=deep_feature_columns,
      dnn_hidden_units=[500, 250, 50])
  estimator.train(...)
  ```


FeatureColumns can also be transformed into a generic input layer for
custom models using `input_layer`.

Example of building a model using FeatureColumns; this can be used in a
`model_fn` which is given to the `tf.estimator.Estimator`:

  ```python
  # Building model via layers

  deep_feature_columns = [age_column, embedded_dept_column]
  columns_to_tensor = parse_feature_columns_from_examples(
      serialized=my_data,
      feature_columns=deep_feature_columns)
  first_layer = input_layer(
      features=columns_to_tensor,
      feature_columns=deep_feature_columns)
  second_layer = fully_connected(first_layer, ...)
  ```

NOTE: Functions prefixed with "_" indicate experimental or private parts of
the API subject to change, and should not be relied upon!

NOTE: The new feature columns are being developed in feature_column_v2.py and
largely duplicate the code here. Please make sure to update the logic in both
places.
"""

import abc
import collections
import math

import numpy as np
import six

from tensorflow.python.eager import context
from tensorflow.python.feature_column import utils as fc_utils
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
from tensorflow.python.framework import tensor_shape
from tensorflow.python.layers import base
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import check_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import embedding_ops
from tensorflow.python.ops import init_ops
from tensorflow.python.ops import lookup_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn_ops
from tensorflow.python.ops import parsing_ops
from tensorflow.python.ops import resource_variable_ops
from tensorflow.python.ops import sparse_ops
from tensorflow.python.ops import string_ops
from tensorflow.python.ops import template
from tensorflow.python.ops import variable_scope
from tensorflow.python.ops import variables
from tensorflow.python.platform import gfile
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.training import checkpoint_utils
from tensorflow.python.util import nest
from tensorflow.python.util.compat import collections_abc
from tensorflow.python.util.tf_export import tf_export


def _internal_input_layer(features,
                          feature_columns,
                          weight_collections=None,
                          trainable=True,
                          cols_to_vars=None,
                          scope=None,
                          cols_to_output_tensors=None,
                          from_template=False):
  """See input_layer. `scope` is a name or variable scope to use."""

  feature_columns = _normalize_feature_columns(feature_columns)
  for column in feature_columns:
    if not isinstance(column, _DenseColumn):
      raise ValueError(
          'Items of feature_columns must be a _DenseColumn. '
          'You can wrap a categorical column with an '
          'embedding_column or indicator_column. Given: {}'.format(column))
  weight_collections = list(weight_collections or [])
  if ops.GraphKeys.GLOBAL_VARIABLES not in weight_collections:
    weight_collections.append(ops.GraphKeys.GLOBAL_VARIABLES)
  if ops.GraphKeys.MODEL_VARIABLES not in weight_collections:
    weight_collections.append(ops.GraphKeys.MODEL_VARIABLES)

  def _get_logits():  # pylint: disable=missing-docstring
    builder = _LazyBuilder(features)
    output_tensors = []
    ordered_columns = []
    for column in sorted(feature_columns, key=lambda x: x.name):
      ordered_columns.append(column)
      with variable_scope.variable_scope(
          None, default_name=column._var_scope_name):  # pylint: disable=protected-access
        tensor = column._get_dense_tensor(  # pylint: disable=protected-access
            builder,
            weight_collections=weight_collections,
            trainable=trainable)
        num_elements = column._variable_shape.num_elements()  # pylint: disable=protected-access
        batch_size = array_ops.shape(tensor)[0]
        output_tensor = array_ops.reshape(
            tensor, shape=(batch_size, num_elements))
        output_tensors.append(output_tensor)
        if cols_to_vars is not None:
          # Retrieve any variables created (some _DenseColumn's don't create
          # variables, in which case an empty list is returned).
          cols_to_vars[column] = ops.get_collection(
              ops.GraphKeys.GLOBAL_VARIABLES,
              scope=variable_scope.get_variable_scope().name)
        if cols_to_output_tensors is not None:
          cols_to_output_tensors[column] = output_tensor
    _verify_static_batch_size_equality(output_tensors, ordered_columns)
    return array_ops.concat(output_tensors, 1)

  # If we're constructing from the `make_template`, that by default adds a
  # variable scope with the name of the layer. In that case, we don't want to
  # add another `variable_scope` as that would break checkpoints.
  if from_template:
    return _get_logits()
  else:
    with variable_scope.variable_scope(
        scope, default_name='input_layer', values=features.values()):
      return _get_logits()


@tf_export(v1=['feature_column.input_layer'])
def input_layer(features,
                feature_columns,
                weight_collections=None,
                trainable=True,
                cols_to_vars=None,
                cols_to_output_tensors=None):
  """Returns a dense `Tensor` as input layer based on given `feature_columns`.

  Generally a single example in training data is described with
  FeatureColumns. At the first layer of the model, this column-oriented data
  should be converted to a single `Tensor`.

  Example:

  ```python
  price = numeric_column('price')
  keywords_embedded = embedding_column(
      categorical_column_with_hash_bucket("keywords", 10K), dimension=16)
  columns = [price, keywords_embedded, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  dense_tensor = input_layer(features, columns)
  for units in [128, 64, 32]:
    dense_tensor = tf.compat.v1.layers.dense(dense_tensor, units, tf.nn.relu)
  prediction = tf.compat.v1.layers.dense(dense_tensor, 1)
  ```

  Args:
    features: A mapping from key to tensors. `_FeatureColumn`s look up via
      these keys. For example `numeric_column('price')` will look at 'price'
      key in this dict. Values can be a `SparseTensor` or a `Tensor` depending
      on the corresponding `_FeatureColumn`.
    feature_columns: An iterable containing the FeatureColumns to use as
      inputs to your model.
      All items should be instances of classes derived from `_DenseColumn`
      such as `numeric_column`, `embedding_column`, `bucketized_column`,
      `indicator_column`. If you have categorical features, you can wrap
      them with an `embedding_column` or `indicator_column`.
    weight_collections: A list of collection names to which the Variable will
      be added. Note that variables will also be added to collections
      `tf.GraphKeys.GLOBAL_VARIABLES` and `ops.GraphKeys.MODEL_VARIABLES`.
    trainable: If `True` also add the variable to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
    cols_to_vars: If not `None`, must be a dictionary that will be filled
      with a mapping from `_FeatureColumn` to list of `Variable`s. For
      example, after the call, we might have cols_to_vars =
      {_EmbeddingColumn(
        categorical_column=_HashedCategoricalColumn(
          key='sparse_feature', hash_bucket_size=5, dtype=tf.string),
        dimension=10): [<tf.Variable 'some_variable:0' shape=(5, 10),
                        <tf.Variable 'some_variable:1' shape=(5, 10)]}
      If a column creates no variables, its value will be an empty list.
    cols_to_output_tensors: If not `None`, must be a dictionary that will be
      filled with a mapping from `_FeatureColumn` to the associated
      output `Tensor`s.

  Returns:
    A `Tensor` which represents input layer of a model. Its shape
    is (batch_size, first_layer_dimension) and its dtype is `float32`.
    first_layer_dimension is determined based on given `feature_columns`.

  Raises:
    ValueError: if an item in `feature_columns` is not a `_DenseColumn`.
  """
  return _internal_input_layer(
      features,
      feature_columns,
      weight_collections=weight_collections,
      trainable=trainable,
      cols_to_vars=cols_to_vars,
      cols_to_output_tensors=cols_to_output_tensors)
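

# A minimal usage sketch of `input_layer` (hedged: the feature name, values,
# and shapes below are hypothetical, for illustration only):
#
#   price = numeric_column('price')                    # public wrapper
#   features = {'price': tf.constant([[1.0], [5.0]])}  # batch of 2
#   net = input_layer(features, [price])               # shape (2, 1), float32
#
# Each _DenseColumn contributes a (batch_size, num_elements) block; the blocks
# are concatenated along axis 1 in lexicographic column-name order.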


# TODO(akshayka): InputLayer should be a subclass of Layer, and it
# should implement the logic in input_layer using Layer's build-and-call
# paradigm; input_layer should create an instance of InputLayer and
# return the result of invoking its apply method, just as functional layers do.
class InputLayer(object):
  """An object-oriented version of `input_layer` that reuses variables."""

  def __init__(self,
               feature_columns,
               weight_collections=None,
               trainable=True,
               cols_to_vars=None,
               name='feature_column_input_layer',
               create_scope_now=True):
    """See `input_layer`."""

    self._feature_columns = feature_columns
    self._weight_collections = weight_collections
    self._trainable = trainable
    self._cols_to_vars = cols_to_vars
    self._name = name
    self._input_layer_template = template.make_template(
        self._name, _internal_input_layer, create_scope_now_=create_scope_now)
    self._scope = self._input_layer_template.variable_scope

  def __call__(self, features):
    return self._input_layer_template(
        features=features,
        feature_columns=self._feature_columns,
        weight_collections=self._weight_collections,
        trainable=self._trainable,
        cols_to_vars=None,
        from_template=True)

  @property
  def name(self):
    return self._name

  @property
  def non_trainable_variables(self):
    return self._input_layer_template.non_trainable_variables

  @property
  def non_trainable_weights(self):
    return self._input_layer_template.non_trainable_weights

  @property
  def trainable_variables(self):
    return self._input_layer_template.trainable_variables

  @property
  def trainable_weights(self):
    return self._input_layer_template.trainable_weights

  @property
  def variables(self):
    return self._input_layer_template.variables

  @property
  def weights(self):
    return self._input_layer_template.weights


@tf_export(v1=['feature_column.linear_model'])
def linear_model(features,
                 feature_columns,
                 units=1,
                 sparse_combiner='sum',
                 weight_collections=None,
                 trainable=True,
                 cols_to_vars=None):
  """Returns a linear prediction `Tensor` based on given `feature_columns`.

  This function generates a weighted sum based on output dimension `units`.
  Weighted sum refers to logits in classification problems. It refers to the
  prediction itself for linear regression problems.

  Note on supported columns: `linear_model` treats categorical columns as
  `indicator_column`s. To be specific, assume the input as `SparseTensor`
  looks like:

  ```python
  shape = [2, 2]
  {
      [0, 0]: "a"
      [1, 0]: "b"
      [1, 1]: "c"
  }
  ```
  `linear_model` assigns weights for the presence of "a", "b", "c" implicitly,
  just like `indicator_column`, while `input_layer` explicitly requires
  wrapping each of the categorical columns with an `embedding_column` or an
  `indicator_column`.

  Example of usage:

  ```python
  price = numeric_column('price')
  price_buckets = bucketized_column(price, boundaries=[0., 10., 100., 1000.])
  keywords = categorical_column_with_hash_bucket("keywords", 10K)
  keywords_price = crossed_column('keywords', price_buckets, ...)
  columns = [price_buckets, keywords, keywords_price ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  prediction = linear_model(features, columns)
  ```

  The `sparse_combiner` argument works as follows.
  For example, for two features represented as the categorical columns:

  ```python
  # Feature 1

  shape = [2, 2]
  {
      [0, 0]: "a"
      [0, 1]: "b"
      [1, 0]: "c"
  }

  # Feature 2

  shape = [2, 3]
  {
      [0, 0]: "d"
      [1, 0]: "e"
      [1, 1]: "f"
      [1, 2]: "f"
  }
  ```

  with `sparse_combiner` as "mean", the linear model outputs are:

  ```
  y_0 = 1.0 / 2.0 * ( w_a + w_b ) + w_d + b
  y_1 = w_c + 1.0 / 3.0 * ( w_e + 2.0 * w_f ) + b
  ```

  where `y_i` is the output, `b` is the bias, and `w_x` is the weight
  assigned to the presence of `x` in the input features.

  Args:
    features: A mapping from key to tensors. `_FeatureColumn`s look up via
      these keys. For example `numeric_column('price')` will look at 'price'
      key in this dict. Values are `Tensor` or `SparseTensor` depending on
      the corresponding `_FeatureColumn`.
    feature_columns: An iterable containing the FeatureColumns to use as
      inputs to your model. All items should be instances of classes derived
      from `_FeatureColumn`s.
    units: An integer, dimensionality of the output space. Default value is 1.
    sparse_combiner: A string specifying how to reduce if a categorical column
      is multivalent. Except `numeric_column`, almost all columns passed to
      `linear_model` are considered as categorical columns. It combines each
      categorical column independently. Currently "mean", "sqrtn" and "sum"
      are supported, with "sum" the default for linear models. "sqrtn" often
      achieves good accuracy, in particular with bag-of-words columns.
        * "sum": do not normalize features in the column
        * "mean": do l1 normalization on features in the column
        * "sqrtn": do l2 normalization on features in the column
    weight_collections: A list of collection names to which the Variable will
      be added. Note that variables will also be added to collections
      `tf.GraphKeys.GLOBAL_VARIABLES` and `ops.GraphKeys.MODEL_VARIABLES`.
    trainable: If `True` also add the variable to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
    cols_to_vars: If not `None`, must be a dictionary that will be filled
      with a mapping from `_FeatureColumn` to associated list of `Variable`s.
      For example, after the call, we might have cols_to_vars = {
        _NumericColumn(
          key='numeric_feature1', shape=(1,)):
            [<tf.Variable 'linear_model/price2/weights:0' shape=(1, 1)>],
        'bias': [<tf.Variable 'linear_model/bias_weights:0' shape=(1,)>],
        _NumericColumn(
          key='numeric_feature2', shape=(2,)):
            [<tf.Variable 'linear_model/price1/weights:0' shape=(2, 1)>]}
      If a column creates no variables, its value will be an empty list. Note
      that cols_to_vars will also contain a string key 'bias' that maps to a
      list of Variables.

  Returns:
    A `Tensor` which represents predictions/logits of a linear model. Its
    shape is (batch_size, units) and its dtype is `float32`.

  Raises:
    ValueError: if an item in `feature_columns` is neither a `_DenseColumn`
      nor `_CategoricalColumn`.
486 """ 487 with variable_scope.variable_scope(None, 'linear_model') as vs: 488 model_name = _strip_leading_slashes(vs.name) 489 linear_model_layer = _LinearModel( 490 feature_columns=feature_columns, 491 units=units, 492 sparse_combiner=sparse_combiner, 493 weight_collections=weight_collections, 494 trainable=trainable, 495 name=model_name) 496 retval = linear_model_layer(features) # pylint: disable=not-callable 497 if cols_to_vars is not None: 498 cols_to_vars.update(linear_model_layer.cols_to_vars()) 499 return retval 500 501 502def _add_to_collections(var, weight_collections): 503 """Adds a var to the list of weight_collections provided. 504 505 Handles the case for partitioned and non-partitioned variables. 506 507 Args: 508 var: A variable or Partitioned Variable. 509 weight_collections: List of collections to add variable to. 510 """ 511 for weight_collection in weight_collections: 512 # The layer self.add_variable call already adds it to GLOBAL_VARIABLES. 513 if weight_collection == ops.GraphKeys.GLOBAL_VARIABLES: 514 continue 515 # TODO(rohanj): Explore adding a _get_variable_list method on `Variable` 516 # so that we don't have to do this check. 517 if isinstance(var, variables.PartitionedVariable): 518 for constituent_var in list(var): 519 ops.add_to_collection(weight_collection, constituent_var) 520 else: 521 ops.add_to_collection(weight_collection, var) 522 523 524class _FCLinearWrapper(base.Layer): 525 """Wraps a _FeatureColumn in a layer for use in a linear model. 526 527 See `linear_model` above. 528 """ 529 530 def __init__(self, 531 feature_column, 532 units=1, 533 sparse_combiner='sum', 534 weight_collections=None, 535 trainable=True, 536 name=None, 537 **kwargs): 538 super(_FCLinearWrapper, self).__init__( 539 trainable=trainable, name=name, **kwargs) 540 self._feature_column = feature_column 541 self._units = units 542 self._sparse_combiner = sparse_combiner 543 self._weight_collections = weight_collections 544 545 def build(self, _): 546 if isinstance(self._feature_column, _CategoricalColumn): 547 weight = self.add_variable( 548 name='weights', 549 shape=(self._feature_column._num_buckets, self._units), # pylint: disable=protected-access 550 initializer=init_ops.zeros_initializer(), 551 trainable=self.trainable) 552 else: 553 num_elements = self._feature_column._variable_shape.num_elements() # pylint: disable=protected-access 554 weight = self.add_variable( 555 name='weights', 556 shape=[num_elements, self._units], 557 initializer=init_ops.zeros_initializer(), 558 trainable=self.trainable) 559 _add_to_collections(weight, self._weight_collections) 560 self._weight_var = weight 561 self.built = True 562 563 def call(self, builder): 564 weighted_sum = _create_weighted_sum( 565 column=self._feature_column, 566 builder=builder, 567 units=self._units, 568 sparse_combiner=self._sparse_combiner, 569 weight_collections=self._weight_collections, 570 trainable=self.trainable, 571 weight_var=self._weight_var) 572 return weighted_sum 573 574 575class _BiasLayer(base.Layer): 576 """A layer for the bias term. 
577 """ 578 579 def __init__(self, 580 units=1, 581 trainable=True, 582 weight_collections=None, 583 name=None, 584 **kwargs): 585 super(_BiasLayer, self).__init__(trainable=trainable, name=name, **kwargs) 586 self._units = units 587 self._weight_collections = weight_collections 588 589 def build(self, _): 590 self._bias_variable = self.add_variable( 591 'bias_weights', 592 shape=[self._units], 593 initializer=init_ops.zeros_initializer(), 594 trainable=self.trainable) 595 _add_to_collections(self._bias_variable, self._weight_collections) 596 self.built = True 597 598 def call(self, _): 599 return self._bias_variable 600 601 602def _get_expanded_variable_list(variable): 603 if (isinstance(variable, variables.Variable) or 604 resource_variable_ops.is_resource_variable(variable)): 605 return [variable] # Single variable case. 606 else: # Must be a PartitionedVariable, so convert into a list. 607 return list(variable) 608 609 610def _strip_leading_slashes(name): 611 return name.rsplit('/', 1)[-1] 612 613 614class _LinearModel(base.Layer): 615 """Creates a linear model using feature columns. 616 617 See `linear_model` for details. 618 """ 619 620 def __init__(self, 621 feature_columns, 622 units=1, 623 sparse_combiner='sum', 624 weight_collections=None, 625 trainable=True, 626 name=None, 627 **kwargs): 628 super(_LinearModel, self).__init__(name=name, **kwargs) 629 # We force the keras_style to be True here, as a workaround to not being 630 # able to inherit keras.layers.Layer as base class. Setting this will let 631 # us skip all the legacy behavior for base.Layer. 632 # Also note that we use Layer as base class, instead of Model, since there 633 # isn't any Model specific behavior gets used, eg compile/fit. 634 self._keras_style = True 635 self._feature_columns = _normalize_feature_columns( 636 feature_columns) 637 self._weight_collections = list(weight_collections or []) 638 if ops.GraphKeys.GLOBAL_VARIABLES not in self._weight_collections: 639 self._weight_collections.append(ops.GraphKeys.GLOBAL_VARIABLES) 640 if ops.GraphKeys.MODEL_VARIABLES not in self._weight_collections: 641 self._weight_collections.append(ops.GraphKeys.MODEL_VARIABLES) 642 643 column_layers = {} 644 for column in sorted(self._feature_columns, key=lambda x: x.name): 645 with variable_scope.variable_scope( 646 None, default_name=column._var_scope_name) as vs: # pylint: disable=protected-access 647 # Having the fully expressed variable scope name ends up doubly 648 # expressing the outer scope (scope with which this method was called) 649 # in the name of the variable that would get created. 650 column_name = _strip_leading_slashes(vs.name) 651 column_layer = _FCLinearWrapper(column, units, sparse_combiner, 652 self._weight_collections, trainable, 653 column_name, **kwargs) 654 column_layers[column_name] = column_layer 655 self._column_layers = self._add_layers(column_layers) 656 self._bias_layer = _BiasLayer( 657 units=units, 658 trainable=trainable, 659 weight_collections=self._weight_collections, 660 name='bias_layer', 661 **kwargs) 662 self._cols_to_vars = {} 663 664 def cols_to_vars(self): 665 """Returns a dict mapping _FeatureColumns to variables. 666 667 See `linear_model` for more information. 668 This is not populated till `call` is called i.e. layer is built. 
669 """ 670 return self._cols_to_vars 671 672 def call(self, features): 673 with variable_scope.variable_scope(self.name): 674 for column in self._feature_columns: 675 if not isinstance(column, (_DenseColumn, _CategoricalColumn)): 676 raise ValueError( 677 'Items of feature_columns must be either a ' 678 '_DenseColumn or _CategoricalColumn. Given: {}'.format(column)) 679 weighted_sums = [] 680 ordered_columns = [] 681 builder = _LazyBuilder(features) 682 for layer in sorted(self._column_layers.values(), key=lambda x: x.name): 683 column = layer._feature_column # pylint: disable=protected-access 684 ordered_columns.append(column) 685 weighted_sum = layer(builder) 686 weighted_sums.append(weighted_sum) 687 self._cols_to_vars[column] = ops.get_collection( 688 ops.GraphKeys.GLOBAL_VARIABLES, scope=layer.scope_name) 689 690 _verify_static_batch_size_equality(weighted_sums, ordered_columns) 691 predictions_no_bias = math_ops.add_n( 692 weighted_sums, name='weighted_sum_no_bias') 693 predictions = nn_ops.bias_add( 694 predictions_no_bias, 695 self._bias_layer( # pylint: disable=not-callable 696 builder, 697 scope=variable_scope.get_variable_scope()), # pylint: disable=not-callable 698 name='weighted_sum') 699 bias = self._bias_layer.variables[0] 700 self._cols_to_vars['bias'] = _get_expanded_variable_list(bias) 701 return predictions 702 703 def _add_layers(self, layers): 704 # "Magic" required for keras.Model classes to track all the variables in 705 # a list of layers.Layer objects. 706 # TODO(ashankar): Figure out API so user code doesn't have to do this. 707 for name, layer in layers.items(): 708 setattr(self, 'layer-%s' % name, layer) 709 return layers 710 711 712def _transform_features(features, feature_columns): 713 """Returns transformed features based on features columns passed in. 714 715 Please note that most probably you would not need to use this function. Please 716 check `input_layer` and `linear_model` to see whether they will 717 satisfy your use case or not. 718 719 Example: 720 721 ```python 722 # Define features and transformations 723 crosses_a_x_b = crossed_column( 724 columns=["sparse_feature_a", "sparse_feature_b"], hash_bucket_size=10000) 725 price_buckets = bucketized_column( 726 source_column=numeric_column("price"), boundaries=[...]) 727 728 columns = [crosses_a_x_b, price_buckets] 729 features = tf.io.parse_example(..., features=make_parse_example_spec(columns)) 730 transformed = transform_features(features=features, feature_columns=columns) 731 732 assertCountEqual(columns, transformed.keys()) 733 ``` 734 735 Args: 736 features: A mapping from key to tensors. `_FeatureColumn`s look up via these 737 keys. For example `numeric_column('price')` will look at 'price' key in 738 this dict. Values can be a `SparseTensor` or a `Tensor` depends on 739 corresponding `_FeatureColumn`. 740 feature_columns: An iterable containing all the `_FeatureColumn`s. 741 742 Returns: 743 A `dict` mapping `_FeatureColumn` to `Tensor` and `SparseTensor` values. 
744 """ 745 feature_columns = _normalize_feature_columns(feature_columns) 746 outputs = {} 747 with ops.name_scope( 748 None, default_name='transform_features', values=features.values()): 749 builder = _LazyBuilder(features) 750 for column in sorted(feature_columns, key=lambda x: x.name): 751 with ops.name_scope(None, default_name=column.name): 752 outputs[column] = builder.get(column) 753 return outputs 754 755 756@tf_export(v1=['feature_column.make_parse_example_spec']) 757def make_parse_example_spec(feature_columns): 758 """Creates parsing spec dictionary from input feature_columns. 759 760 The returned dictionary can be used as arg 'features' in 761 `tf.io.parse_example`. 762 763 Typical usage example: 764 765 ```python 766 # Define features and transformations 767 feature_a = categorical_column_with_vocabulary_file(...) 768 feature_b = numeric_column(...) 769 feature_c_bucketized = bucketized_column(numeric_column("feature_c"), ...) 770 feature_a_x_feature_c = crossed_column( 771 columns=["feature_a", feature_c_bucketized], ...) 772 773 feature_columns = set( 774 [feature_b, feature_c_bucketized, feature_a_x_feature_c]) 775 features = tf.io.parse_example( 776 serialized=serialized_examples, 777 features=make_parse_example_spec(feature_columns)) 778 ``` 779 780 For the above example, make_parse_example_spec would return the dict: 781 782 ```python 783 { 784 "feature_a": parsing_ops.VarLenFeature(tf.string), 785 "feature_b": parsing_ops.FixedLenFeature([1], dtype=tf.float32), 786 "feature_c": parsing_ops.FixedLenFeature([1], dtype=tf.float32) 787 } 788 ``` 789 790 Args: 791 feature_columns: An iterable containing all feature columns. All items 792 should be instances of classes derived from `_FeatureColumn`. 793 794 Returns: 795 A dict mapping each feature key to a `FixedLenFeature` or `VarLenFeature` 796 value. 797 798 Raises: 799 ValueError: If any of the given `feature_columns` is not a `_FeatureColumn` 800 instance. 801 """ 802 result = {} 803 for column in feature_columns: 804 if not isinstance(column, _FeatureColumn): 805 raise ValueError( 806 'All feature_columns must be _FeatureColumn instances. ' 807 'Given: {}'.format(column)) 808 config = column._parse_example_spec # pylint: disable=protected-access 809 for key, value in six.iteritems(config): 810 if key in result and value != result[key]: 811 raise ValueError( 812 'feature_columns contain different parse_spec for key ' 813 '{}. Given {} and {}'.format(key, value, result[key])) 814 result.update(config) 815 return result 816 817 818def _embedding_column(categorical_column, 819 dimension, 820 combiner='mean', 821 initializer=None, 822 ckpt_to_load_from=None, 823 tensor_name_in_ckpt=None, 824 max_norm=None, 825 trainable=True, 826 use_safe_embedding_lookup=True): 827 """`_DenseColumn` that converts from sparse, categorical input. 828 829 Use this when your inputs are sparse, but you want to convert them to a dense 830 representation (e.g., to feed to a DNN). 831 832 Inputs must be a `_CategoricalColumn` created by any of the 833 `categorical_column_*` function. Here is an example of using 834 `embedding_column` with `DNNClassifier`: 835 836 ```python 837 video_id = categorical_column_with_identity( 838 key='video_id', num_buckets=1000000, default_value=0) 839 columns = [embedding_column(video_id, 9),...] 840 841 estimator = tf.estimator.DNNClassifier(feature_columns=columns, ...) 842 843 label_column = ... 
  def input_fn():
    features = tf.io.parse_example(
        ..., features=make_parse_example_spec(columns + [label_column]))
    labels = features.pop(label_column.name)
    return features, labels

  estimator.train(input_fn=input_fn, steps=100)
  ```

  Here is an example using `embedding_column` with model_fn:

  ```python
  def model_fn(features, ...):
    video_id = categorical_column_with_identity(
        key='video_id', num_buckets=1000000, default_value=0)
    columns = [embedding_column(video_id, 9),...]
    dense_tensor = input_layer(features, columns)
    # Form DNN layers, calculate loss, and return EstimatorSpec.
    ...
  ```

  Args:
    categorical_column: A `_CategoricalColumn` created by a
      `categorical_column_with_*` function. This column produces the sparse
      IDs that are inputs to the embedding lookup.
    dimension: An integer specifying dimension of the embedding, must be > 0.
    combiner: A string specifying how to reduce if there are multiple entries
      in a single row. Currently 'mean', 'sqrtn' and 'sum' are supported,
      with 'mean' the default. 'sqrtn' often achieves good accuracy, in
      particular with bag-of-words columns. Each of these can be thought of
      as an example-level normalization on the column. For more information,
      see `tf.embedding_lookup_sparse`.
    initializer: A variable initializer function to be used in embedding
      variable initialization. If not specified, defaults to
      `tf.compat.v1.truncated_normal_initializer` with mean `0.0` and
      standard deviation `1/sqrt(dimension)`.
    ckpt_to_load_from: String representing checkpoint name/pattern from which
      to restore column weights. Required if `tensor_name_in_ckpt` is not
      `None`.
    tensor_name_in_ckpt: Name of the `Tensor` in `ckpt_to_load_from` from
      which to restore the column weights. Required if `ckpt_to_load_from`
      is not `None`.
    max_norm: If not `None`, embedding values are l2-normalized to this value.
    trainable: Whether or not the embedding is trainable. Default is True.
    use_safe_embedding_lookup: If true, uses safe_embedding_lookup_sparse
      instead of embedding_lookup_sparse. safe_embedding_lookup_sparse
      ensures there are no empty rows and all weights and ids are positive
      at the expense of extra compute cost. This only applies to rank 2
      (NxM) shaped input tensors. Defaults to true; consider turning it off
      if the above checks are not needed. Note that having empty rows will
      not trigger any error, though the output result might be 0 or omitted.

  Returns:
    `_DenseColumn` that converts from sparse input.

  Raises:
    ValueError: if `dimension` not > 0.
    ValueError: if exactly one of `ckpt_to_load_from` and
      `tensor_name_in_ckpt` is specified.
    ValueError: if `initializer` is specified and is not callable.
    RuntimeError: If eager execution is enabled.
  """
  if (dimension is None) or (dimension < 1):
    raise ValueError('Invalid dimension {}.'.format(dimension))
  if (ckpt_to_load_from is None) != (tensor_name_in_ckpt is None):
    raise ValueError('Must specify both `ckpt_to_load_from` and '
                     '`tensor_name_in_ckpt` or none of them.')

  if (initializer is not None) and (not callable(initializer)):
    raise ValueError('initializer must be callable if specified. '
                     'Embedding of column_name: {}'.format(
                         categorical_column.name))
  if initializer is None:
    initializer = init_ops.truncated_normal_initializer(
        mean=0.0, stddev=1 / math.sqrt(dimension))

  embedding_shape = categorical_column._num_buckets, dimension  # pylint: disable=protected-access

  def _creator(weight_collections, scope):
    embedding_column_layer = _EmbeddingColumnLayer(
        embedding_shape=embedding_shape,
        initializer=initializer,
        weight_collections=weight_collections,
        trainable=trainable,
        name='embedding_column_layer')
    return embedding_column_layer(None, scope=scope)  # pylint: disable=not-callable

  return _EmbeddingColumn(
      categorical_column=categorical_column,
      dimension=dimension,
      combiner=combiner,
      layer_creator=_creator,
      ckpt_to_load_from=ckpt_to_load_from,
      tensor_name_in_ckpt=tensor_name_in_ckpt,
      max_norm=max_norm,
      trainable=trainable,
      use_safe_embedding_lookup=use_safe_embedding_lookup)


def _numeric_column(key,
                    shape=(1,),
                    default_value=None,
                    dtype=dtypes.float32,
                    normalizer_fn=None):
  """Represents real valued or numerical features.

  Example:

  ```python
  price = numeric_column('price')
  columns = [price, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  dense_tensor = input_layer(features, columns)

  # or
  bucketized_price = bucketized_column(price, boundaries=[...])
  columns = [bucketized_price, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)
  ```

  Args:
    key: A unique string identifying the input feature. It is used as the
      column name and the dictionary key for feature parsing configs, feature
      `Tensor` objects, and feature columns.
    shape: An iterable of integers specifying the shape of the `Tensor`. An
      integer can be given, which means a single dimension `Tensor` with the
      given width. The `Tensor` representing the column will have the shape
      of [batch_size] + `shape`.
    default_value: A single value compatible with `dtype` or an iterable of
      values compatible with `dtype` which the column takes on during
      `tf.Example` parsing if data is missing. A default value of `None` will
      cause `tf.io.parse_example` to fail if an example does not contain
      this column. If a single value is provided, the same value will be
      applied as the default value for every item. If an iterable of values
      is provided, the shape of the `default_value` should be equal to the
      given `shape`.
    dtype: defines the type of values. Default value is `tf.float32`. Must be
      a non-quantized, real integer or floating point type.
    normalizer_fn: If not `None`, a function that can be used to normalize
      the value of the tensor after `default_value` is applied for parsing.
      Normalizer function takes the input `Tensor` as its argument, and
      returns the output `Tensor` (e.g. lambda x: (x - 3.0) / 4.2). Please
      note that even though the most common use case of this function is
      normalization, it can be used for any kind of TensorFlow
      transformation.

  Returns:
    A `_NumericColumn`.

  Raises:
    TypeError: if any dimension in shape is not an int
    ValueError: if any dimension in shape is not a positive integer
    TypeError: if `default_value` is an iterable but not compatible with
      `shape`
    TypeError: if `default_value` is not compatible with `dtype`.
    ValueError: if `dtype` is not convertible to `tf.float32`.
  """
  shape = _check_shape(shape, key)
  if not (dtype.is_integer or dtype.is_floating):
    raise ValueError('dtype must be convertible to float. '
                     'dtype: {}, key: {}'.format(dtype, key))
  default_value = fc_utils.check_default_value(
      shape, default_value, dtype, key)

  if normalizer_fn is not None and not callable(normalizer_fn):
    raise TypeError(
        'normalizer_fn must be a callable. Given: {}'.format(normalizer_fn))

  fc_utils.assert_key_is_string(key)
  return _NumericColumn(
      key,
      shape=shape,
      default_value=default_value,
      dtype=dtype,
      normalizer_fn=normalizer_fn)


def _bucketized_column(source_column, boundaries):
  """Represents discretized dense input.

  Buckets include the left boundary, and exclude the right boundary. Namely,
  `boundaries=[0., 1., 2.]` generates buckets `(-inf, 0.)`, `[0., 1.)`,
  `[1., 2.)`, and `[2., +inf)`.

  For example, if the inputs are

  ```python
  boundaries = [0, 10, 100]
  input tensor = [[-5, 10000]
                  [150,   10]
                  [5,    100]]
  ```

  then the output will be

  ```python
  output = [[0, 3]
            [3, 2]
            [1, 3]]
  ```

  Example:

  ```python
  price = numeric_column('price')
  bucketized_price = bucketized_column(price, boundaries=[...])
  columns = [bucketized_price, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)

  # or
  columns = [bucketized_price, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  dense_tensor = input_layer(features, columns)
  ```

  A `bucketized_column` can also be crossed with another categorical column
  using `crossed_column`:

  ```python
  price = numeric_column('price')
  # bucketized_column converts numerical feature to a categorical one.
  bucketized_price = bucketized_column(price, boundaries=[...])
  # 'keywords' is a string feature.
  price_x_keywords = crossed_column([bucketized_price, 'keywords'], 50K)
  columns = [price_x_keywords, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)
  ```

  Args:
    source_column: A one-dimensional dense column which is generated with
      `numeric_column`.
    boundaries: A sorted list or tuple of floats specifying the boundaries.

  Returns:
    A `_BucketizedColumn`.

  Raises:
    ValueError: If `source_column` is not a numeric column, or if it is not
      one-dimensional.
    ValueError: If `boundaries` is not a sorted list or tuple.
  """
  if not isinstance(source_column, _NumericColumn):
    raise ValueError(
        'source_column must be a column generated with numeric_column(). '
        'Given: {}'.format(source_column))
  if len(source_column.shape) > 1:
    raise ValueError(
        'source_column must be one-dimensional column. '
        'Given: {}'.format(source_column))
  if (not boundaries or
      not (isinstance(boundaries, list) or isinstance(boundaries, tuple))):
    raise ValueError('boundaries must be a sorted list.')
  for i in range(len(boundaries) - 1):
    if boundaries[i] >= boundaries[i + 1]:
      raise ValueError('boundaries must be a sorted list.')
  return _BucketizedColumn(source_column, tuple(boundaries))
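

# A quick worked example of the bucketization semantics above (values are
# hypothetical and follow the docstring's left-inclusive boundaries):
#
#   boundaries = [0, 10, 100]  # buckets: (-inf, 0) [0, 10) [10, 100) [100, +inf)
#   value -5  -> bucket 0      # below the first boundary
#   value 10  -> bucket 2      # boundary values fall into the right bucket
#   value 150 -> bucket 3      # at or past the last boundary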


def _categorical_column_with_hash_bucket(key,
                                         hash_bucket_size,
                                         dtype=dtypes.string):
  """Represents a sparse feature where ids are set by hashing.

  Use this when your sparse features are in string or integer format, and you
  want to distribute your inputs into a finite number of buckets by hashing:
  output_id = Hash(input_feature_string) % bucket_size for string type input.
  For int type input, the value is converted to its string representation
  first and then hashed by the same formula.

  For input dictionary `features`, `features[key]` is either `Tensor` or
  `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for
  int and `''` for string, which will be dropped by this feature column.

  Example:

  ```python
  keywords = categorical_column_with_hash_bucket("keywords", 10K)
  columns = [keywords, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)

  # or
  keywords_embedded = embedding_column(keywords, 16)
  columns = [keywords_embedded, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  dense_tensor = input_layer(features, columns)
  ```

  Args:
    key: A unique string identifying the input feature. It is used as the
      column name and the dictionary key for feature parsing configs, feature
      `Tensor` objects, and feature columns.
    hash_bucket_size: An int > 1. The number of buckets.
    dtype: The type of features. Only string and integer types are supported.

  Returns:
    A `_HashedCategoricalColumn`.

  Raises:
    ValueError: `hash_bucket_size` is not greater than 1.
    ValueError: `dtype` is neither string nor integer.
  """
  if hash_bucket_size is None:
    raise ValueError('hash_bucket_size must be set. ' 'key: {}'.format(key))

  if hash_bucket_size < 1:
    raise ValueError('hash_bucket_size must be at least 1. '
                     'hash_bucket_size: {}, key: {}'.format(
                         hash_bucket_size, key))

  fc_utils.assert_key_is_string(key)
  fc_utils.assert_string_or_int(dtype, prefix='column_name: {}'.format(key))

  return _HashedCategoricalColumn(key, hash_bucket_size, dtype)


def _categorical_column_with_vocabulary_file(key,
                                             vocabulary_file,
                                             vocabulary_size=None,
                                             num_oov_buckets=0,
                                             default_value=None,
                                             dtype=dtypes.string):
  """A `_CategoricalColumn` with a vocabulary file.

  Use this when your inputs are in string or integer format, and you have a
  vocabulary file that maps each value to an integer ID. By default,
  out-of-vocabulary values are ignored. Use either (but not both) of
  `num_oov_buckets` and `default_value` to specify how to include
  out-of-vocabulary values.

  For input dictionary `features`, `features[key]` is either `Tensor` or
  `SparseTensor`.
  If `Tensor`, missing values can be represented by `-1` for int and `''` for
  string, which will be dropped by this feature column.

  Example with `num_oov_buckets`:
  File '/us/states.txt' contains 50 lines, each with a 2-character U.S. state
  abbreviation. All inputs with values in that file are assigned an ID 0-49,
  corresponding to its line number. All other values are hashed and assigned
  an ID 50-54.

  ```python
  states = categorical_column_with_vocabulary_file(
      key='states', vocabulary_file='/us/states.txt', vocabulary_size=50,
      num_oov_buckets=5)
  columns = [states, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)
  ```

  Example with `default_value`:
  File '/us/states.txt' contains 51 lines - the first line is 'XX', and the
  other 50 each have a 2-character U.S. state abbreviation. Both a literal
  'XX' in input, and other values missing from the file, will be assigned
  ID 0. All others are assigned the corresponding line number 1-50.

  ```python
  states = categorical_column_with_vocabulary_file(
      key='states', vocabulary_file='/us/states.txt', vocabulary_size=51,
      default_value=0)
  columns = [states, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction, _, _ = linear_model(features, columns)
  ```

  And to make an embedding with either:

  ```python
  columns = [embedding_column(states, 3),...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  dense_tensor = input_layer(features, columns)
  ```

  Args:
    key: A unique string identifying the input feature. It is used as the
      column name and the dictionary key for feature parsing configs, feature
      `Tensor` objects, and feature columns.
    vocabulary_file: The vocabulary file name.
    vocabulary_size: Number of the elements in the vocabulary. This must be
      no greater than the length of `vocabulary_file`; if it is less, later
      values are ignored. If None, it is set to the length of
      `vocabulary_file`.
    num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
      buckets. All out-of-vocabulary inputs will be assigned IDs in the range
      `[vocabulary_size, vocabulary_size+num_oov_buckets)` based on a hash of
      the input value. A positive `num_oov_buckets` cannot be specified with
      `default_value`.
    default_value: The integer ID value to return for out-of-vocabulary
      feature values, defaults to `-1`. This cannot be specified with a
      positive `num_oov_buckets`.
    dtype: The type of features. Only string and integer types are supported.

  Returns:
    A `_CategoricalColumn` with a vocabulary file.

  Raises:
    ValueError: `vocabulary_file` is missing or cannot be opened.
    ValueError: `vocabulary_size` is missing or < 1.
    ValueError: `num_oov_buckets` is a negative integer.
    ValueError: `num_oov_buckets` and `default_value` are both specified.
    ValueError: `dtype` is neither string nor integer.
1242 """ 1243 if not vocabulary_file: 1244 raise ValueError('Missing vocabulary_file in {}.'.format(key)) 1245 1246 if vocabulary_size is None: 1247 if not gfile.Exists(vocabulary_file): 1248 raise ValueError('vocabulary_file in {} does not exist.'.format(key)) 1249 1250 with gfile.GFile(vocabulary_file) as f: 1251 vocabulary_size = sum(1 for _ in f) 1252 logging.info( 1253 'vocabulary_size = %d in %s is inferred from the number of elements ' 1254 'in the vocabulary_file %s.', vocabulary_size, key, vocabulary_file) 1255 1256 # `vocabulary_size` isn't required for lookup, but it is for `_num_buckets`. 1257 if vocabulary_size < 1: 1258 raise ValueError('Invalid vocabulary_size in {}.'.format(key)) 1259 if num_oov_buckets: 1260 if default_value is not None: 1261 raise ValueError( 1262 'Can\'t specify both num_oov_buckets and default_value in {}.'.format( 1263 key)) 1264 if num_oov_buckets < 0: 1265 raise ValueError('Invalid num_oov_buckets {} in {}.'.format( 1266 num_oov_buckets, key)) 1267 fc_utils.assert_string_or_int(dtype, prefix='column_name: {}'.format(key)) 1268 fc_utils.assert_key_is_string(key) 1269 return _VocabularyFileCategoricalColumn( 1270 key=key, 1271 vocabulary_file=vocabulary_file, 1272 vocabulary_size=vocabulary_size, 1273 num_oov_buckets=0 if num_oov_buckets is None else num_oov_buckets, 1274 default_value=-1 if default_value is None else default_value, 1275 dtype=dtype) 1276 1277 1278def _categorical_column_with_vocabulary_list(key, 1279 vocabulary_list, 1280 dtype=None, 1281 default_value=-1, 1282 num_oov_buckets=0): 1283 """A `_CategoricalColumn` with in-memory vocabulary. 1284 1285 Use this when your inputs are in string or integer format, and you have an 1286 in-memory vocabulary mapping each value to an integer ID. By default, 1287 out-of-vocabulary values are ignored. Use either (but not both) of 1288 `num_oov_buckets` and `default_value` to specify how to include 1289 out-of-vocabulary values. 1290 1291 For input dictionary `features`, `features[key]` is either `Tensor` or 1292 `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for int 1293 and `''` for string, which will be dropped by this feature column. 1294 1295 Example with `num_oov_buckets`: 1296 In the following example, each input in `vocabulary_list` is assigned an ID 1297 0-3 corresponding to its index (e.g., input 'B' produces output 2). All other 1298 inputs are hashed and assigned an ID 4-5. 1299 1300 ```python 1301 colors = categorical_column_with_vocabulary_list( 1302 key='colors', vocabulary_list=('R', 'G', 'B', 'Y'), 1303 num_oov_buckets=2) 1304 columns = [colors, ...] 1305 features = tf.io.parse_example(..., features=make_parse_example_spec(columns)) 1306 linear_prediction, _, _ = linear_model(features, columns) 1307 ``` 1308 1309 Example with `default_value`: 1310 In the following example, each input in `vocabulary_list` is assigned an ID 1311 0-4 corresponding to its index (e.g., input 'B' produces output 3). All other 1312 inputs are assigned `default_value` 0. 1313 1314 1315 ```python 1316 colors = categorical_column_with_vocabulary_list( 1317 key='colors', vocabulary_list=('X', 'R', 'G', 'B', 'Y'), default_value=0) 1318 columns = [colors, ...] 1319 features = tf.io.parse_example(..., features=make_parse_example_spec(columns)) 1320 linear_prediction, _, _ = linear_model(features, columns) 1321 ``` 1322 1323 And to make an embedding with either: 1324 1325 ```python 1326 columns = [embedding_column(colors, 3),...] 


def _categorical_column_with_identity(key, num_buckets, default_value=None):
  """A `_CategoricalColumn` that returns identity values.

  Use this when your inputs are integers in the range `[0, num_buckets)`, and
  you want to use the input value itself as the categorical ID. Values
  outside this range will result in `default_value` if specified, otherwise
  it will fail.

  Typically, this is used for contiguous ranges of integer indexes, but it
  doesn't have to be. This might be inefficient, however, if many IDs are
  unused. Consider `categorical_column_with_hash_bucket` in that case.

  For input dictionary `features`, `features[key]` is either `Tensor` or
  `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for
  int and `''` for string, which will be dropped by this feature column.

  In the following examples, each input in the range `[0, 1000000)` is
  assigned the same value. All other inputs are assigned `default_value` 0.
  Note that a literal 0 in inputs will result in the same default ID.

  Linear model:

  ```python
  video_id = categorical_column_with_identity(
      key='video_id', num_buckets=1000000, default_value=0)
  columns = [video_id, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction, _, _ = linear_model(features, columns)
  ```

  Embedding for a DNN model:

  ```python
  columns = [embedding_column(video_id, 9),...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  dense_tensor = input_layer(features, columns)
  ```

  Args:
    key: A unique string identifying the input feature. It is used as the
      column name and the dictionary key for feature parsing configs, feature
      `Tensor` objects, and feature columns.
    num_buckets: Range of inputs and outputs is `[0, num_buckets)`.
    default_value: If set, values outside of range `[0, num_buckets)` will
      be replaced with this value. If not set, values >= num_buckets will
      cause a failure while values < 0 will be dropped.

  Returns:
    A `_CategoricalColumn` that returns identity values.

  Raises:
    ValueError: if `num_buckets` is less than one.
    ValueError: if `default_value` is not in range `[0, num_buckets)`.
  """
  if num_buckets < 1:
    raise ValueError(
        'num_buckets {} < 1, column_name {}'.format(num_buckets, key))
  if (default_value is not None) and (
      (default_value < 0) or (default_value >= num_buckets)):
    raise ValueError(
        'default_value {} not in range [0, {}), column_name {}'.format(
            default_value, num_buckets, key))
  fc_utils.assert_key_is_string(key)
  return _IdentityCategoricalColumn(
      key=key, num_buckets=num_buckets, default_value=default_value)


def _indicator_column(categorical_column):
  """Represents multi-hot representation of given categorical column.

  - For DNN model, `indicator_column` can be used to wrap any
    `categorical_column_*` (e.g., to feed to DNN). Consider using
    `embedding_column` if the number of buckets/unique values is large.

  - For Wide (aka linear) model, `indicator_column` is the internal
    representation for categorical column when passing categorical columns
    directly (as any element in feature_columns) to `linear_model`. See
    `linear_model` for details.

  ```python
  name = indicator_column(categorical_column_with_vocabulary_list(
      'name', ['bob', 'george', 'wanda']))
  columns = [name, ...]
1474 features = tf.io.parse_example(..., features=make_parse_example_spec(columns)) 1475 dense_tensor = input_layer(features, columns) 1476 1477 dense_tensor == [[1, 0, 0]] # If "name" bytes_list is ["bob"] 1478 dense_tensor == [[1, 0, 1]] # If "name" bytes_list is ["bob", "wanda"] 1479 dense_tensor == [[2, 0, 0]] # If "name" bytes_list is ["bob", "bob"] 1480 ``` 1481 1482 Args: 1483 categorical_column: A `_CategoricalColumn` which is created by 1484 `categorical_column_with_*` or `crossed_column` functions. 1485 1486 Returns: 1487 An `_IndicatorColumn`. 1488 """ 1489 return _IndicatorColumn(categorical_column) 1490 1491 1492def _weighted_categorical_column(categorical_column, 1493 weight_feature_key, 1494 dtype=dtypes.float32): 1495 """Applies weight values to a `_CategoricalColumn`. 1496 1497 Use this when each of your sparse inputs has both an ID and a value. For 1498 example, if you're representing text documents as a collection of word 1499 frequencies, you can provide 2 parallel sparse input features ('terms' and 1500 'frequencies' below). 1501 1502 Example: 1503 1504 Input `tf.Example` objects: 1505 1506 ```proto 1507 [ 1508 features { 1509 feature { 1510 key: "terms" 1511 value {bytes_list {value: "very" value: "model"}} 1512 } 1513 feature { 1514 key: "frequencies" 1515 value {float_list {value: 0.3 value: 0.1}} 1516 } 1517 }, 1518 features { 1519 feature { 1520 key: "terms" 1521 value {bytes_list {value: "when" value: "course" value: "human"}} 1522 } 1523 feature { 1524 key: "frequencies" 1525 value {float_list {value: 0.4 value: 0.1 value: 0.2}} 1526 } 1527 } 1528 ] 1529 ``` 1530 1531 ```python 1532 categorical_column = categorical_column_with_hash_bucket( 1533 column_name='terms', hash_bucket_size=1000) 1534 weighted_column = weighted_categorical_column( 1535 categorical_column=categorical_column, weight_feature_key='frequencies') 1536 columns = [weighted_column, ...] 1537 features = tf.io.parse_example(..., features=make_parse_example_spec(columns)) 1538 linear_prediction, _, _ = linear_model(features, columns) 1539 ``` 1540 1541 This assumes the input dictionary contains a `SparseTensor` for key 1542 'terms', and a `SparseTensor` for key 'frequencies'. These 2 tensors must have 1543 the same indices and dense shape. 1544 1545 Args: 1546 categorical_column: A `_CategoricalColumn` created by 1547 `categorical_column_with_*` functions. 1548 weight_feature_key: String key for weight values. 1549 dtype: Type of weights, such as `tf.float32`. Only float and integer weights 1550 are supported. 1551 1552 Returns: 1553 A `_CategoricalColumn` composed of two sparse features: one represents id, 1554 the other represents weight (value) of the id feature in that example. 1555 1556 Raises: 1557 ValueError: if `dtype` is not convertible to float. 1558 """ 1559 if (dtype is None) or not (dtype.is_integer or dtype.is_floating): 1560 raise ValueError('dtype {} is not convertible to float.'.format(dtype)) 1561 return _WeightedCategoricalColumn( 1562 categorical_column=categorical_column, 1563 weight_feature_key=weight_feature_key, 1564 dtype=dtype) 1565 1566 1567def _crossed_column(keys, hash_bucket_size, hash_key=None): 1568 """Returns a column for performing crosses of categorical features. 1569 1570 Crossed features will be hashed according to `hash_bucket_size`. 
Conceptually,
1571 the transformation can be thought of as:
1572 Hash(cartesian product of features) % `hash_bucket_size`
1573
1574 For example, if the input features are:
1575
1576 * SparseTensor referred to by the first key:
1577
1578 ```python
1579 shape = [2, 2]
1580 {
1581 [0, 0]: "a"
1582 [1, 0]: "b"
1583 [1, 1]: "c"
1584 }
1585 ```
1586
1587 * SparseTensor referred to by the second key:
1588
1589 ```python
1590 shape = [2, 1]
1591 {
1592 [0, 0]: "d"
1593 [1, 0]: "e"
1594 }
1595 ```
1596
1597 then the crossed feature will look like:
1598
1599 ```python
1600 shape = [2, 2]
1601 {
1602 [0, 0]: Hash64("d", Hash64("a")) % hash_bucket_size
1603 [1, 0]: Hash64("e", Hash64("b")) % hash_bucket_size
1604 [1, 1]: Hash64("e", Hash64("c")) % hash_bucket_size
1605 }
1606 ```
1607
1608 Here is an example of creating a linear model with crosses of string features:
1609
1610 ```python
1611 keywords_x_doc_terms = crossed_column(['keywords', 'doc_terms'], 50000)
1612 columns = [keywords_x_doc_terms, ...]
1613 features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1614 linear_prediction = linear_model(features, columns)
1615 ```
1616
1617 You could also use a vocabulary lookup before crossing:
1618
1619 ```python
1620 keywords = categorical_column_with_vocabulary_file(
1621 'keywords', '/path/to/vocabulary/file', vocabulary_size=1000)
1622 keywords_x_doc_terms = crossed_column([keywords, 'doc_terms'], 50000)
1623 columns = [keywords_x_doc_terms, ...]
1624 features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1625 linear_prediction = linear_model(features, columns)
1626 ```
1627
1628 If an input feature is of numeric type, you can use
1629 `categorical_column_with_identity` or `bucketized_column`, as in the example:
1630
1631 ```python
1632 # vertical_id is an integer categorical feature.
1633 vertical_id = categorical_column_with_identity('vertical_id', 10000)
1634 price = numeric_column('price')
1635 # bucketized_column converts a numerical feature to a categorical one.
1636 bucketized_price = bucketized_column(price, boundaries=[...])
1637 vertical_id_x_price = crossed_column([vertical_id, bucketized_price], 50000)
1638 columns = [vertical_id_x_price, ...]
1639 features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1640 linear_prediction = linear_model(features, columns)
1641 ```
1642
1643 To use a crossed column in a DNN model, you need to wrap it in an embedding
1644 column, as in this example:
1645
1646 ```python
1647 vertical_id_x_price = crossed_column([vertical_id, bucketized_price], 50000)
1648 vertical_id_x_price_embedded = embedding_column(vertical_id_x_price, 10)
1649 dense_tensor = input_layer(features, [vertical_id_x_price_embedded, ...])
1650 ```
1651
1652 Args:
1653 keys: An iterable identifying the features to be crossed. Each element can
1654 be either:
1655 * string: Will use the corresponding feature, which must be of string type.
1656 * `_CategoricalColumn`: Will use the transformed tensor produced by this
1657 column. Does not support hashed categorical columns.
1658 hash_bucket_size: An int > 1. The number of buckets.
1659 hash_key: Specify the hash_key that will be used by the `FingerprintCat64`
1660 function to combine the fingerprints of the crossed features in SparseCrossOp (optional).
1661
1662 Returns:
1663 A `_CrossedColumn`.
1664
1665 Raises:
1666 ValueError: If `len(keys) < 2`.
1667 ValueError: If any of the keys is neither a string nor a `_CategoricalColumn`.
1668 ValueError: If any of the keys is a `_HashedCategoricalColumn`.
1669 ValueError: If `hash_bucket_size < 1`.
1670 """
1671 if not hash_bucket_size or hash_bucket_size < 1:
1672 raise ValueError('hash_bucket_size must be at least 1. '
1673 'hash_bucket_size: {}'.format(hash_bucket_size))
1674 if not keys or len(keys) < 2:
1675 raise ValueError(
1676 'keys must be a list with length > 1. Given: {}'.format(keys))
1677 for key in keys:
1678 if (not isinstance(key, six.string_types) and
1679 not isinstance(key, _CategoricalColumn)):
1680 raise ValueError(
1681 'Unsupported key type. All keys must be either a string, or a '
1682 'categorical column other than _HashedCategoricalColumn. '
1683 'Given: {}'.format(key))
1684 if isinstance(key, _HashedCategoricalColumn):
1685 raise ValueError(
1686 'categorical_column_with_hash_bucket is not supported for crossing. '
1687 'Hashing before crossing will increase the probability of collision. '
1688 'Instead, use the feature name as a string. Given: {}'.format(key))
1689 return _CrossedColumn(
1690 keys=tuple(keys), hash_bucket_size=hash_bucket_size,
1691 hash_key=hash_key)
1692
1693
1694# TODO(rohanj): Clearly define semantics of this layer.
1695class _EmbeddingColumnLayer(base.Layer):
1696 """A layer that stores all the state required for an embedding column."""
1697
1698 def __init__(self,
1699 embedding_shape,
1700 initializer,
1701 weight_collections=None,
1702 trainable=True,
1703 name=None,
1704 **kwargs):
1705 """Constructor.
1706
1707 Args:
1708 embedding_shape: Shape of the embedding variable used for lookup.
1709 initializer: A variable initializer function to be used in embedding
1710 variable initialization.
1711 weight_collections: A list of collection names to which the Variable will
1712 be added. Note that variables will also be added to the collections
1713 `tf.GraphKeys.GLOBAL_VARIABLES` and `ops.GraphKeys.MODEL_VARIABLES`.
1714 trainable: If `True`, also add the variable to the graph collection
1715 `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
1716 name: Name of the layer.
1717 **kwargs: Additional keyword arguments.
1718 """
1719 super(_EmbeddingColumnLayer, self).__init__(
1720 trainable=trainable, name=name, **kwargs)
1721 self._embedding_shape = embedding_shape
1722 self._initializer = initializer
1723 self._weight_collections = weight_collections
1724
1725 def set_weight_collections(self, weight_collections):
1726 """Sets the weight collections for the layer.
1727
1728 Args:
1729 weight_collections: A list of collection names to which the Variable will
1730 be added.
1731 """
1732 self._weight_collections = weight_collections
1733
1734 def build(self, _):
1735 self._embedding_weight_var = self.add_variable(
1736 name='embedding_weights',
1737 shape=self._embedding_shape,
1738 dtype=dtypes.float32,
1739 initializer=self._initializer,
1740 trainable=self.trainable)
1741 if self._weight_collections and not context.executing_eagerly():
1742 _add_to_collections(self._embedding_weight_var, self._weight_collections)
1743 self.built = True
1744
1745 def call(self, _):
1746 return self._embedding_weight_var
1747
1748
1749@six.add_metaclass(abc.ABCMeta)
1750class _FeatureColumn(object):
1751 """Represents a feature column abstraction.
1752
1753 WARNING: Do not subclass this layer unless you know what you are doing:
1754 the API is subject to future changes.
1755
1756 To distinguish the concept of a feature family and a specific binary feature
1757 within a family, we refer to a feature family like "country" as a feature
1758 column.
Following is an example feature in a `tf.Example` format: 1759 {key: "country", value: [ "US" ]} 1760 In this example the value of feature is "US" and "country" refers to the 1761 column of the feature. 1762 1763 This class is an abstract class. User should not create instances of this. 1764 """ 1765 1766 @abc.abstractproperty 1767 def name(self): 1768 """Returns string. Used for naming and for name_scope.""" 1769 pass 1770 1771 def __lt__(self, other): 1772 """Allows feature columns to be sorted in Python 3 as they are in Python 2. 1773 1774 Feature columns need to occasionally be sortable, for example when used as 1775 keys in a features dictionary passed to a layer. 1776 1777 In CPython, `__lt__` must be defined for all objects in the 1778 sequence being sorted. If any objects do not have an `__lt__` compatible 1779 with feature column objects (such as strings), then CPython will fall back 1780 to using the `__gt__` method below. 1781 https://docs.python.org/3/library/stdtypes.html#list.sort 1782 1783 Args: 1784 other: The other object to compare to. 1785 1786 Returns: 1787 True if the string representation of this object is lexicographically less 1788 than the string representation of `other`. For FeatureColumn objects, 1789 this looks like "<__main__.FeatureColumn object at 0xa>". 1790 """ 1791 return str(self) < str(other) 1792 1793 def __gt__(self, other): 1794 """Allows feature columns to be sorted in Python 3 as they are in Python 2. 1795 1796 Feature columns need to occasionally be sortable, for example when used as 1797 keys in a features dictionary passed to a layer. 1798 1799 `__gt__` is called when the "other" object being compared during the sort 1800 does not have `__lt__` defined. 1801 Example: 1802 ``` 1803 # __lt__ only class 1804 class A(): 1805 def __lt__(self, other): return str(self) < str(other) 1806 1807 a = A() 1808 a < "b" # True 1809 "0" < a # Error 1810 1811 # __lt__ and __gt__ class 1812 class B(): 1813 def __lt__(self, other): return str(self) < str(other) 1814 def __gt__(self, other): return str(self) > str(other) 1815 1816 b = B() 1817 b < "c" # True 1818 "0" < b # True 1819 ``` 1820 1821 1822 Args: 1823 other: The other object to compare to. 1824 1825 Returns: 1826 True if the string representation of this object is lexicographically 1827 greater than the string representation of `other`. For FeatureColumn 1828 objects, this looks like "<__main__.FeatureColumn object at 0xa>". 1829 """ 1830 return str(self) > str(other) 1831 1832 @property 1833 def _var_scope_name(self): 1834 """Returns string. Used for variable_scope. Defaults to self.name.""" 1835 return self.name 1836 1837 @abc.abstractmethod 1838 def _transform_feature(self, inputs): 1839 """Returns intermediate representation (usually a `Tensor`). 1840 1841 Uses `inputs` to create an intermediate representation (usually a `Tensor`) 1842 that other feature columns can use. 1843 1844 Example usage of `inputs`: 1845 Let's say a Feature column depends on raw feature ('raw') and another 1846 `_FeatureColumn` (input_fc). To access corresponding `Tensor`s, inputs will 1847 be used as follows: 1848 1849 ```python 1850 raw_tensor = inputs.get('raw') 1851 fc_tensor = inputs.get(input_fc) 1852 ``` 1853 1854 Args: 1855 inputs: A `_LazyBuilder` object to access inputs. 1856 1857 Returns: 1858 Transformed feature `Tensor`. 1859 """ 1860 pass 1861 1862 @abc.abstractproperty 1863 def _parse_example_spec(self): 1864 """Returns a `tf.Example` parsing spec as dict. 
1865
1866 It is used to generate the parsing spec for `tf.io.parse_example`. The
1867 returned spec is a dict from keys ('string') to `VarLenFeature`,
1868 `FixedLenFeature`, and other supported objects. Please check the
1869 documentation of `tf.io.parse_example` for all supported spec objects.
1870
1871 Let's say a feature column depends on a raw feature ('raw') and another
1872 `_FeatureColumn` (input_fc). One possible implementation of
1873 _parse_example_spec is as follows:
1874
1875 ```python
1876 spec = {'raw': tf.io.FixedLenFeature(...)}
1877 spec.update(input_fc._parse_example_spec)
1878 return spec
1879 ```
1880 """
1881 pass
1882
1883 def _reset_config(self):
1884 """Resets the configuration in the column.
1885
1886 Some feature columns, e.g. embedding or shared embedding columns, might
1887 carry state that needs to be reset from time to time. Use this method
1888 in that scenario.
1889 """
1890
1891
1892class _DenseColumn(_FeatureColumn):
1893 """Represents a column which can be represented as a dense `Tensor`.
1894
1895 WARNING: Do not subclass this layer unless you know what you are doing:
1896 the API is subject to future changes.
1897
1898 Some examples of this type are: numeric_column, embedding_column,
1899 indicator_column.
1900 """
1901
1902 @abc.abstractproperty
1903 def _variable_shape(self):
1904 """`TensorShape` of `_get_dense_tensor`, without the batch dimension."""
1905 pass
1906
1907 @abc.abstractmethod
1908 def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
1909 """Returns a `Tensor`.
1910
1911 The output of this function will be used by model-builder functions. For
1912 example, the pseudo-code of `input_layer` looks like:
1913
1914 ```python
1915 def input_layer(features, feature_columns, ...):
1916 outputs = [fc._get_dense_tensor(...) for fc in feature_columns]
1917 return tf.concat(outputs, axis=1)
1918 ```
1919
1920 Args:
1921 inputs: A `_LazyBuilder` object to access inputs.
1922 weight_collections: List of graph collections to which Variables (if any
1923 are created) are added.
1924 trainable: If `True`, also add variables to the graph collection
1925 `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
1926
1927 Returns:
1928 `Tensor` of shape `[batch_size] + _variable_shape`.
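
 As an illustrative sketch (not a normative example; `numeric_column` is
 just one `_DenseColumn`): a numeric column with `shape=(3,)` has
 `_variable_shape` `TensorShape([3])`, so its dense tensor has shape
 `[batch_size, 3]`:

 ```python
 age = numeric_column('age', shape=(3,))
 # age._get_dense_tensor(builder) -> Tensor of shape [batch_size, 3]
 ```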
1929 """
1930 pass
1931
1932
1933def _create_weighted_sum(column,
1934 builder,
1935 units,
1936 sparse_combiner,
1937 weight_collections,
1938 trainable,
1939 weight_var=None):
1940 """Creates a weighted sum for a dense/categorical column for linear_model."""
1941 if isinstance(column, _CategoricalColumn):
1942 return _create_categorical_column_weighted_sum(
1943 column=column,
1944 builder=builder,
1945 units=units,
1946 sparse_combiner=sparse_combiner,
1947 weight_collections=weight_collections,
1948 trainable=trainable,
1949 weight_var=weight_var)
1950 else:
1951 return _create_dense_column_weighted_sum(
1952 column=column,
1953 builder=builder,
1954 units=units,
1955 weight_collections=weight_collections,
1956 trainable=trainable,
1957 weight_var=weight_var)
1958
1959
1960def _create_dense_column_weighted_sum(column,
1961 builder,
1962 units,
1963 weight_collections,
1964 trainable,
1965 weight_var=None):
1966 """Creates a weighted sum of a dense column for linear_model."""
1967 tensor = column._get_dense_tensor( # pylint: disable=protected-access
1968 builder,
1969 weight_collections=weight_collections,
1970 trainable=trainable)
1971 num_elements = column._variable_shape.num_elements() # pylint: disable=protected-access
1972 batch_size = array_ops.shape(tensor)[0]
1973 tensor = array_ops.reshape(tensor, shape=(batch_size, num_elements))
1974 if weight_var is not None:
1975 weight = weight_var
1976 else:
1977 weight = variable_scope.get_variable(
1978 name='weights',
1979 shape=[num_elements, units],
1980 initializer=init_ops.zeros_initializer(),
1981 trainable=trainable,
1982 collections=weight_collections)
1983 return math_ops.matmul(tensor, weight, name='weighted_sum')
1984
1985
1986class _CategoricalColumn(_FeatureColumn):
1987 """Represents a categorical feature.
1988
1989 WARNING: Do not subclass this layer unless you know what you are doing:
1990 the API is subject to future changes.
1991
1992 A categorical feature is typically handled with a `tf.sparse.SparseTensor`
1993 of IDs.
1994 """
1995
1996 IdWeightPair = collections.namedtuple( # pylint: disable=invalid-name
1997 'IdWeightPair', ['id_tensor', 'weight_tensor'])
1998
1999 @abc.abstractproperty
2000 def _num_buckets(self):
2001 """Returns number of buckets in this sparse feature."""
2002 pass
2003
2004 @abc.abstractmethod
2005 def _get_sparse_tensors(self,
2006 inputs,
2007 weight_collections=None,
2008 trainable=None):
2009 """Returns an IdWeightPair.
2010
2011 `IdWeightPair` is a pair of `SparseTensor`s which represents ids and
2012 weights.
2013
2014 `IdWeightPair.id_tensor` is typically a `batch_size` x `num_buckets`
2015 `SparseTensor` of `int64`. `IdWeightPair.weight_tensor` is either a
2016 `SparseTensor` of `float` or `None` to indicate all weights should be
2017 taken to be 1. If specified, `weight_tensor` must have exactly the same
2018 shape and indices as `id_tensor`. The expected `SparseTensor` is the same
2019 as the parsing output of a `VarLenFeature`, which is a ragged matrix.
2020
2021 Args:
2022 inputs: A `_LazyBuilder` as a cache to get input tensors required to
2023 create the `IdWeightPair`.
2024 weight_collections: List of graph collections to which variables (if any
2025 are created) are added.
2026 trainable: If `True`, also add variables to the graph collection
2027 `GraphKeys.TRAINABLE_VARIABLES` (see `tf.compat.v1.get_variable`).
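
 A minimal sketch of a typical implementation, for a column whose
 transformation is already cached by `inputs` and which carries no
 weights (this mirrors the concrete columns later in this file):

 ```python
 def _get_sparse_tensors(self, inputs, weight_collections=None,
                         trainable=None):
   del weight_collections, trainable  # No variables are created here.
   return _CategoricalColumn.IdWeightPair(inputs.get(self), None)
 ```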
2028 """
2029 pass
2030
2031
2032def _create_categorical_column_weighted_sum(column,
2033 builder,
2034 units,
2035 sparse_combiner,
2036 weight_collections,
2037 trainable,
2038 weight_var=None):
2039 # pylint: disable=g-doc-return-or-yield,g-doc-args
2040 """Creates a weighted sum of a categorical column for linear_model.
2041
2042 Note to maintainers: as an implementation detail, the weighted sum is
2043 implemented via `embedding_lookup_sparse` for efficiency. Mathematically,
2044 the two are the same.
2045
2046 To be specific: conceptually, a categorical column can be treated as a
2047 multi-hot vector. Say:
2048
2049 ```python
2050 x = [0 0 1] # categorical column input
2051 w = [a b c] # weights
2052 ```
2053 The weighted sum is `c` in this case, which is the same as `w[2]`.
2054
2055 Another example is
2056
2057 ```python
2058 x = [0 1 1] # categorical column input
2059 w = [a b c] # weights
2060 ```
2061 The weighted sum is `b + c` in this case, which is the same as `w[1] + w[2]`.
2062
2063 In both cases, we can implement the weighted sum via embedding_lookup with
2064 sparse_combiner = "sum".
2065 """
2066
2067 sparse_tensors = column._get_sparse_tensors( # pylint: disable=protected-access
2068 builder,
2069 weight_collections=weight_collections,
2070 trainable=trainable)
2071 id_tensor = sparse_ops.sparse_reshape(sparse_tensors.id_tensor, [
2072 array_ops.shape(sparse_tensors.id_tensor)[0], -1
2073 ])
2074 weight_tensor = sparse_tensors.weight_tensor
2075 if weight_tensor is not None:
2076 weight_tensor = sparse_ops.sparse_reshape(
2077 weight_tensor, [array_ops.shape(weight_tensor)[0], -1])
2078
2079 if weight_var is not None:
2080 weight = weight_var
2081 else:
2082 weight = variable_scope.get_variable(
2083 name='weights',
2084 shape=(column._num_buckets, units), # pylint: disable=protected-access
2085 initializer=init_ops.zeros_initializer(),
2086 trainable=trainable,
2087 collections=weight_collections)
2088 return embedding_ops.safe_embedding_lookup_sparse(
2089 weight,
2090 id_tensor,
2091 sparse_weights=weight_tensor,
2092 combiner=sparse_combiner,
2093 name='weighted_sum')
2094
2095
2096class _SequenceDenseColumn(_FeatureColumn):
2097 """Represents dense sequence data."""
2098
2099 TensorSequenceLengthPair = collections.namedtuple( # pylint: disable=invalid-name
2100 'TensorSequenceLengthPair', ['dense_tensor', 'sequence_length'])
2101
2102 @abc.abstractmethod
2103 def _get_sequence_dense_tensor(
2104 self, inputs, weight_collections=None, trainable=None):
2105 """Returns a `TensorSequenceLengthPair`."""
2106 pass
2107
2108
2109class _LazyBuilder(object):
2110 """Handles caching of transformations while building the model.
2111
2112 `_FeatureColumn` specifies how to digest an input column to the network. Some
2113 feature columns require data transformations. This class caches those
2114 transformations.
2115
2116 Some features may be used in more than one place. For example, one can use a
2117 bucketized feature by itself and also in a cross with it. In that case we
2118 should create only one bucketization op instead of creating ops for each
2119 feature column separately. To handle re-use of transformed columns,
2120 `_LazyBuilder` caches all previously transformed columns.
2121
2122 Example:
2123 We're trying to use the following `_FeatureColumn`s:
2124
2125 ```python
2126 bucketized_age = fc.bucketized_column(fc.numeric_column("age"), ...)
2127 keywords = fc.categorical_column_with_hash_bucket("keywords", ...)
2128 age_X_keywords = fc.crossed_column([bucketized_age, "keywords"])
2129 ...
= linear_model(features,
2130 [bucketized_age, keywords, age_X_keywords])
2131 ```
2132
2133 If we transform each column independently, then we'll get duplication of
2134 bucketization (one for the cross, one for the bucketized column itself).
2135 The `_LazyBuilder` eliminates this duplication.
2136 """
2137
2138 def __init__(self, features):
2139 """Creates a `_LazyBuilder`.
2140
2141 Args:
2142 features: A mapping from feature column to objects that are `Tensor` or
2143 `SparseTensor`, or can be converted to same via
2144 `sparse_tensor.convert_to_tensor_or_sparse_tensor`. A `string` key
2145 signifies a base feature (not transformed). A `_FeatureColumn` key
2146 means that this `Tensor` is the output of an existing `_FeatureColumn`
2147 which can be reused.
2148 """
2149 self._features = features.copy()
2150 self._feature_tensors = {}
2151
2152 def get(self, key):
2153 """Returns a `Tensor` for the given key.
2154
2155 A `str` key is used to access a base feature (not transformed). When a
2156 `_FeatureColumn` is passed, the transformed feature is returned if it
2157 already exists, otherwise the given `_FeatureColumn` is asked to provide its
2158 transformed output, which is then cached.
2159
2160 Args:
2161 key: a `str` or a `_FeatureColumn`.
2162
2163 Returns:
2164 The transformed `Tensor` corresponding to the `key`.
2165
2166 Raises:
2167 ValueError: if key is not found or a transformed `Tensor` cannot be
2168 computed.
2169 """
2170 if key in self._feature_tensors:
2171 # FeatureColumn is already transformed or converted.
2172 return self._feature_tensors[key]
2173
2174 if key in self._features:
2175 feature_tensor = self._get_raw_feature_as_tensor(key)
2176 self._feature_tensors[key] = feature_tensor
2177 return feature_tensor
2178
2179 if isinstance(key, six.string_types):
2180 raise ValueError('Feature {} is not in features dictionary.'.format(key))
2181
2182 if not isinstance(key, _FeatureColumn):
2183 raise TypeError('"key" must be either a "str" or "_FeatureColumn". '
2184 'Provided: {}'.format(key))
2185
2186 column = key
2187 logging.debug('Transforming feature_column %s.', column)
2188 transformed = column._transform_feature(self) # pylint: disable=protected-access
2189 if transformed is None:
2190 raise ValueError('Column {} is not supported.'.format(column.name))
2191 self._feature_tensors[column] = transformed
2192 return transformed
2193
2194 def _get_raw_feature_as_tensor(self, key):
2195 """Gets the raw feature (keyed by `key`) as a `tensor`.
2196
2197 The raw feature is converted to a (sparse) tensor and its rank may be expanded.
2198
2199 For both `Tensor` and `SparseTensor`, the rank will be expanded (to 2) if
2200 the rank is 1. Dynamic rank is supported as well. A rank-0 raw feature is
2201 not supported and will raise an error.
2202
2203 Args:
2204 key: A `str` key to access the raw feature.
2205
2206 Returns:
2207 A `Tensor` or `SparseTensor`.
2208
2209 Raises:
2210 ValueError: if the raw feature has rank 0.
2211 """
2212 raw_feature = self._features[key]
2213 feature_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(
2214 raw_feature)
2215
2216 def expand_dims(input_tensor):
2217 # input_tensor must have rank 1.
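 # A SparseTensor is reshaped to [batch_size, 1], while a dense Tensor
 # gets a new inner axis appended; either way the result has rank 2.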
2218 if isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
2219 return sparse_ops.sparse_reshape(
2220 input_tensor, [array_ops.shape(input_tensor)[0], 1])
2221 else:
2222 return array_ops.expand_dims(input_tensor, -1)
2223
2224 rank = feature_tensor.get_shape().ndims
2225 if rank is not None:
2226 if rank == 0:
2227 raise ValueError(
2228 'Feature (key: {}) cannot have rank 0. Given: {}'.format(
2229 key, feature_tensor))
2230 return feature_tensor if rank != 1 else expand_dims(feature_tensor)
2231
2232 # Handle dynamic rank.
2233 with ops.control_dependencies([
2234 check_ops.assert_positive(
2235 array_ops.rank(feature_tensor),
2236 message='Feature (key: {}) cannot have rank 0. Given: {}'.format(
2237 key, feature_tensor))]):
2238 return control_flow_ops.cond(
2239 math_ops.equal(1, array_ops.rank(feature_tensor)),
2240 lambda: expand_dims(feature_tensor),
2241 lambda: feature_tensor)
2242
2243
2244# TODO(ptucker): Move to third_party/tensorflow/python/ops/sparse_ops.py
2245def _shape_offsets(shape):
2246 """Returns the moving offset for each dimension given shape."""
2247 offsets = []
2248 for dim in reversed(shape):
2249 if offsets:
2250 offsets.append(dim * offsets[-1])
2251 else:
2252 offsets.append(dim)
2253 offsets.reverse()
2254 return offsets
2255
2256
2257# TODO(ptucker): Move to third_party/tensorflow/python/ops/sparse_ops.py
2258def _to_sparse_input_and_drop_ignore_values(input_tensor, ignore_value=None):
2259 """Converts a `Tensor` to a `SparseTensor`, dropping ignore_value cells.
2260
2261 If `input_tensor` is already a `SparseTensor`, just return it.
2262
2263 Args:
2264 input_tensor: A string or integer `Tensor`.
2265 ignore_value: Entries in `input_tensor` equal to this value will be
2266 absent from the resulting `SparseTensor`. If `None`, the default value of
2267 `input_tensor`'s dtype will be used ('' for `str`, -1 for `int`).
2268
2269 Returns:
2270 A `SparseTensor` with the same shape as `input_tensor`.
2271
2272 Raises:
2273 ValueError: when `input_tensor`'s rank is `None`.
2274 """
2275 input_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(
2276 input_tensor)
2277 if isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
2278 return input_tensor
2279 with ops.name_scope(None, 'to_sparse_input', (input_tensor, ignore_value,)):
2280 if ignore_value is None:
2281 if input_tensor.dtype == dtypes.string:
2282 # Strings are special-cased: TF converts them to numpy objects by default.
2283 ignore_value = ''
2284 elif input_tensor.dtype.is_integer:
2285 ignore_value = -1 # -1 has a special meaning of missing feature
2286 else:
2287 # NOTE: `as_numpy_dtype` is a property, so with the parentheses this is
2288 # constructing a new numpy object of the given type, which yields the
2289 # default value for that type.
2290 ignore_value = input_tensor.dtype.as_numpy_dtype()
2291 ignore_value = math_ops.cast(
2292 ignore_value, input_tensor.dtype, name='ignore_value')
2293 indices = array_ops.where(
2294 math_ops.not_equal(input_tensor, ignore_value), name='indices')
2295 return sparse_tensor_lib.SparseTensor(
2296 indices=indices,
2297 values=array_ops.gather_nd(input_tensor, indices, name='values'),
2298 dense_shape=array_ops.shape(
2299 input_tensor, out_type=dtypes.int64, name='dense_shape'))
2300
2301
2302def _normalize_feature_columns(feature_columns):
2303 """Normalizes the `feature_columns` input.
2304
2305 This method converts the `feature_columns` to a list as best as it can.
In 2306 addition, verifies the type and other parts of feature_columns, required by 2307 downstream library. 2308 2309 Args: 2310 feature_columns: The raw feature columns, usually passed by users. 2311 2312 Returns: 2313 The normalized feature column list. 2314 2315 Raises: 2316 ValueError: for any invalid inputs, such as empty, duplicated names, etc. 2317 """ 2318 if isinstance(feature_columns, _FeatureColumn): 2319 feature_columns = [feature_columns] 2320 2321 if isinstance(feature_columns, collections_abc.Iterator): 2322 feature_columns = list(feature_columns) 2323 2324 if isinstance(feature_columns, dict): 2325 raise ValueError('Expected feature_columns to be iterable, found dict.') 2326 2327 for column in feature_columns: 2328 if not isinstance(column, _FeatureColumn): 2329 raise ValueError('Items of feature_columns must be a _FeatureColumn. ' 2330 'Given (type {}): {}.'.format(type(column), column)) 2331 if not feature_columns: 2332 raise ValueError('feature_columns must not be empty.') 2333 name_to_column = {} 2334 for column in feature_columns: 2335 if column.name in name_to_column: 2336 raise ValueError('Duplicate feature column name found for columns: {} ' 2337 'and {}. This usually means that these columns refer to ' 2338 'same base feature. Either one must be discarded or a ' 2339 'duplicated but renamed item must be inserted in ' 2340 'features dict.'.format(column, 2341 name_to_column[column.name])) 2342 name_to_column[column.name] = column 2343 2344 return feature_columns 2345 2346 2347class _NumericColumn(_DenseColumn, 2348 collections.namedtuple('_NumericColumn', [ 2349 'key', 'shape', 'default_value', 'dtype', 2350 'normalizer_fn' 2351 ])): 2352 """see `numeric_column`.""" 2353 2354 @property 2355 def name(self): 2356 return self.key 2357 2358 @property 2359 def _parse_example_spec(self): 2360 return { 2361 self.key: 2362 parsing_ops.FixedLenFeature(self.shape, self.dtype, 2363 self.default_value) 2364 } 2365 2366 def _transform_feature(self, inputs): 2367 input_tensor = inputs.get(self.key) 2368 if isinstance(input_tensor, sparse_tensor_lib.SparseTensor): 2369 raise ValueError( 2370 'The corresponding Tensor of numerical column must be a Tensor. ' 2371 'SparseTensor is not supported. key: {}'.format(self.key)) 2372 if self.normalizer_fn is not None: 2373 input_tensor = self.normalizer_fn(input_tensor) 2374 return math_ops.cast(input_tensor, dtypes.float32) 2375 2376 @property 2377 def _variable_shape(self): 2378 return tensor_shape.TensorShape(self.shape) 2379 2380 def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None): 2381 """Returns dense `Tensor` representing numeric feature. 2382 2383 Args: 2384 inputs: A `_LazyBuilder` object to access inputs. 2385 weight_collections: Unused `weight_collections` since no variables are 2386 created in this function. 2387 trainable: Unused `trainable` bool since no variables are created in 2388 this function. 2389 2390 Returns: 2391 Dense `Tensor` created within `_transform_feature`. 2392 """ 2393 # Do nothing with weight_collections and trainable since no variables are 2394 # created in this function. 2395 del weight_collections 2396 del trainable 2397 # Feature has been already transformed. Return the intermediate 2398 # representation created by _transform_feature. 
2399 return inputs.get(self) 2400 2401 2402class _BucketizedColumn(_DenseColumn, _CategoricalColumn, 2403 collections.namedtuple('_BucketizedColumn', [ 2404 'source_column', 'boundaries'])): 2405 """See `bucketized_column`.""" 2406 2407 @property 2408 def name(self): 2409 return '{}_bucketized'.format(self.source_column.name) 2410 2411 @property 2412 def _parse_example_spec(self): 2413 return self.source_column._parse_example_spec # pylint: disable=protected-access 2414 2415 def _transform_feature(self, inputs): 2416 source_tensor = inputs.get(self.source_column) 2417 return math_ops._bucketize( # pylint: disable=protected-access 2418 source_tensor, 2419 boundaries=self.boundaries) 2420 2421 @property 2422 def _variable_shape(self): 2423 return tensor_shape.TensorShape( 2424 tuple(self.source_column.shape) + (len(self.boundaries) + 1,)) 2425 2426 def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None): 2427 del weight_collections 2428 del trainable 2429 input_tensor = inputs.get(self) 2430 return array_ops.one_hot( 2431 indices=math_ops.cast(input_tensor, dtypes.int64), 2432 depth=len(self.boundaries) + 1, 2433 on_value=1., 2434 off_value=0.) 2435 2436 @property 2437 def _num_buckets(self): 2438 # By construction, source_column is always one-dimensional. 2439 return (len(self.boundaries) + 1) * self.source_column.shape[0] 2440 2441 def _get_sparse_tensors(self, inputs, weight_collections=None, 2442 trainable=None): 2443 """Converts dense inputs to SparseTensor so downstream code can use it.""" 2444 input_tensor = inputs.get(self) 2445 batch_size = array_ops.shape(input_tensor)[0] 2446 # By construction, source_column is always one-dimensional. 2447 source_dimension = self.source_column.shape[0] 2448 2449 i1 = array_ops.reshape( 2450 array_ops.tile( 2451 array_ops.expand_dims(math_ops.range(0, batch_size), 1), 2452 [1, source_dimension]), 2453 (-1,)) 2454 i2 = array_ops.tile(math_ops.range(0, source_dimension), [batch_size]) 2455 # Flatten the bucket indices and unique them across dimensions 2456 # E.g. 
2nd dimension indices will range from k to 2*k-1 with k buckets 2457 bucket_indices = ( 2458 array_ops.reshape(input_tensor, (-1,)) + 2459 (len(self.boundaries) + 1) * i2) 2460 2461 indices = math_ops.cast( 2462 array_ops.transpose(array_ops.stack((i1, i2))), dtypes.int64) 2463 dense_shape = math_ops.cast( 2464 array_ops.stack([batch_size, source_dimension]), dtypes.int64) 2465 sparse_tensor = sparse_tensor_lib.SparseTensor( 2466 indices=indices, 2467 values=bucket_indices, 2468 dense_shape=dense_shape) 2469 return _CategoricalColumn.IdWeightPair(sparse_tensor, None) 2470 2471 2472class _EmbeddingColumn( 2473 _DenseColumn, _SequenceDenseColumn, 2474 collections.namedtuple( 2475 '_EmbeddingColumn', 2476 ('categorical_column', 'dimension', 'combiner', 'layer_creator', 2477 'ckpt_to_load_from', 'tensor_name_in_ckpt', 'max_norm', 'trainable', 2478 'use_safe_embedding_lookup'))): 2479 """See `embedding_column`.""" 2480 2481 def __new__(cls, 2482 categorical_column, 2483 dimension, 2484 combiner, 2485 layer_creator, 2486 ckpt_to_load_from, 2487 tensor_name_in_ckpt, 2488 max_norm, 2489 trainable, 2490 use_safe_embedding_lookup=True): 2491 return super(_EmbeddingColumn, cls).__new__( 2492 cls, 2493 categorical_column=categorical_column, 2494 dimension=dimension, 2495 combiner=combiner, 2496 layer_creator=layer_creator, 2497 ckpt_to_load_from=ckpt_to_load_from, 2498 tensor_name_in_ckpt=tensor_name_in_ckpt, 2499 max_norm=max_norm, 2500 trainable=trainable, 2501 use_safe_embedding_lookup=use_safe_embedding_lookup) 2502 2503 @property 2504 def name(self): 2505 if not hasattr(self, '_name'): 2506 self._name = '{}_embedding'.format(self.categorical_column.name) 2507 return self._name 2508 2509 @property 2510 def _parse_example_spec(self): 2511 return self.categorical_column._parse_example_spec # pylint: disable=protected-access 2512 2513 def _transform_feature(self, inputs): 2514 return inputs.get(self.categorical_column) 2515 2516 @property 2517 def _variable_shape(self): 2518 if not hasattr(self, '_shape'): 2519 self._shape = tensor_shape.TensorShape([self.dimension]) 2520 return self._shape 2521 2522 def _get_dense_tensor_internal(self, 2523 inputs, 2524 weight_collections=None, 2525 trainable=None): 2526 """Private method that follows the signature of _get_dense_tensor.""" 2527 # Get sparse IDs and weights. 2528 sparse_tensors = self.categorical_column._get_sparse_tensors( # pylint: disable=protected-access 2529 inputs, weight_collections=weight_collections, trainable=trainable) 2530 sparse_ids = sparse_tensors.id_tensor 2531 sparse_weights = sparse_tensors.weight_tensor 2532 2533 embedding_weights = self.layer_creator( 2534 weight_collections=weight_collections, 2535 scope=variable_scope.get_variable_scope()) 2536 2537 if self.ckpt_to_load_from is not None: 2538 to_restore = embedding_weights 2539 if isinstance(to_restore, variables.PartitionedVariable): 2540 to_restore = to_restore._get_variable_list() # pylint: disable=protected-access 2541 checkpoint_utils.init_from_checkpoint(self.ckpt_to_load_from, { 2542 self.tensor_name_in_ckpt: to_restore 2543 }) 2544 2545 sparse_id_rank = tensor_shape.dimension_value( 2546 sparse_ids.dense_shape.get_shape()[0]) 2547 embedding_lookup_sparse = embedding_ops.safe_embedding_lookup_sparse 2548 if (not self.use_safe_embedding_lookup and sparse_id_rank is not None and 2549 sparse_id_rank <= 2): 2550 embedding_lookup_sparse = embedding_ops.embedding_lookup_sparse_v2 2551 # Return embedding lookup result. 
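 # The lookup reduces the per-id embeddings with `combiner`, yielding a
 # dense Tensor of shape [batch_size, dimension]; with the safe lookup,
 # rows that contain no ids come back as all-zero vectors.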
2552 return embedding_lookup_sparse( 2553 embedding_weights, 2554 sparse_ids, 2555 sparse_weights, 2556 combiner=self.combiner, 2557 name='%s_weights' % self.name, 2558 max_norm=self.max_norm) 2559 2560 def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None): 2561 if isinstance(self.categorical_column, _SequenceCategoricalColumn): 2562 raise ValueError( 2563 'In embedding_column: {}. ' 2564 'categorical_column must not be of type _SequenceCategoricalColumn. ' 2565 'Suggested fix A: If you wish to use input_layer, use a ' 2566 'non-sequence categorical_column_with_*. ' 2567 'Suggested fix B: If you wish to create sequence input, use ' 2568 'sequence_input_layer instead of input_layer. ' 2569 'Given (type {}): {}'.format( 2570 self.name, type(self.categorical_column), 2571 self.categorical_column)) 2572 return self._get_dense_tensor_internal( 2573 inputs=inputs, 2574 weight_collections=weight_collections, 2575 trainable=trainable) 2576 2577 def _get_sequence_dense_tensor( 2578 self, inputs, weight_collections=None, trainable=None): 2579 if not isinstance(self.categorical_column, _SequenceCategoricalColumn): 2580 raise ValueError( 2581 'In embedding_column: {}. ' 2582 'categorical_column must be of type _SequenceCategoricalColumn ' 2583 'to use sequence_input_layer. ' 2584 'Suggested fix: Use one of sequence_categorical_column_with_*. ' 2585 'Given (type {}): {}'.format( 2586 self.name, type(self.categorical_column), 2587 self.categorical_column)) 2588 dense_tensor = self._get_dense_tensor_internal( # pylint: disable=protected-access 2589 inputs=inputs, 2590 weight_collections=weight_collections, 2591 trainable=trainable) 2592 2593 sparse_tensors = self.categorical_column._get_sparse_tensors(inputs) # pylint: disable=protected-access 2594 sequence_length = fc_utils.sequence_length_from_sparse_tensor( 2595 sparse_tensors.id_tensor) 2596 return _SequenceDenseColumn.TensorSequenceLengthPair( 2597 dense_tensor=dense_tensor, sequence_length=sequence_length) 2598 2599 2600def _get_graph_for_variable(var): 2601 if isinstance(var, variables.PartitionedVariable): 2602 return list(var)[0].graph 2603 else: 2604 return var.graph 2605 2606 2607class _SharedEmbeddingColumn( 2608 _DenseColumn, _SequenceDenseColumn, 2609 collections.namedtuple( 2610 '_SharedEmbeddingColumn', 2611 ('categorical_column', 'dimension', 'combiner', 'initializer', 2612 'shared_embedding_collection_name', 'ckpt_to_load_from', 2613 'tensor_name_in_ckpt', 'max_norm', 'trainable', 2614 'use_safe_embedding_lookup'))): 2615 """See `embedding_column`.""" 2616 2617 @property 2618 def name(self): 2619 if not hasattr(self, '_name'): 2620 self._name = '{}_shared_embedding'.format(self.categorical_column.name) 2621 return self._name 2622 2623 @property 2624 def _var_scope_name(self): 2625 return self.shared_embedding_collection_name 2626 2627 @property 2628 def _parse_example_spec(self): 2629 return self.categorical_column._parse_example_spec # pylint: disable=protected-access 2630 2631 def _transform_feature(self, inputs): 2632 return inputs.get(self.categorical_column) 2633 2634 @property 2635 def _variable_shape(self): 2636 if not hasattr(self, '_shape'): 2637 self._shape = tensor_shape.TensorShape([self.dimension]) 2638 return self._shape 2639 2640 def _get_dense_tensor_internal(self, 2641 inputs, 2642 weight_collections=None, 2643 trainable=None): 2644 """Private method that follows the signature of _get_dense_tensor.""" 2645 # This method is called from a variable_scope with name _var_scope_name, 2646 # which is 
shared among all shared embeddings. Open a name_scope here, so 2647 # that the ops for different columns have distinct names. 2648 with ops.name_scope(None, default_name=self.name): 2649 # Get sparse IDs and weights. 2650 sparse_tensors = self.categorical_column._get_sparse_tensors( # pylint: disable=protected-access 2651 inputs, weight_collections=weight_collections, trainable=trainable) 2652 sparse_ids = sparse_tensors.id_tensor 2653 sparse_weights = sparse_tensors.weight_tensor 2654 2655 embedding_shape = (self.categorical_column._num_buckets, self.dimension) # pylint: disable=protected-access 2656 shared_embedding_collection = ops.get_collection( 2657 self.shared_embedding_collection_name) 2658 if shared_embedding_collection: 2659 if len(shared_embedding_collection) > 1: 2660 raise ValueError( 2661 'Collection {} can only contain one variable. ' 2662 'Suggested fix A: Choose a unique name for this collection. ' 2663 'Suggested fix B: Do not add any variables to this collection. ' 2664 'The feature_column library already adds a variable under the ' 2665 'hood.'.format(shared_embedding_collection)) 2666 embedding_weights = shared_embedding_collection[0] 2667 if embedding_weights.get_shape() != embedding_shape: 2668 raise ValueError( 2669 'Shared embedding collection {} contains variable {} of ' 2670 'unexpected shape {}. Expected shape is {}. ' 2671 'Suggested fix A: Choose a unique name for this collection. ' 2672 'Suggested fix B: Do not add any variables to this collection. ' 2673 'The feature_column library already adds a variable under the ' 2674 'hood.'.format(self.shared_embedding_collection_name, 2675 embedding_weights.name, 2676 embedding_weights.get_shape(), embedding_shape)) 2677 else: 2678 embedding_weights = variable_scope.get_variable( 2679 name='embedding_weights', 2680 shape=embedding_shape, 2681 dtype=dtypes.float32, 2682 initializer=self.initializer, 2683 trainable=self.trainable and trainable, 2684 collections=weight_collections) 2685 ops.add_to_collection(self.shared_embedding_collection_name, 2686 embedding_weights) 2687 if self.ckpt_to_load_from is not None: 2688 to_restore = embedding_weights 2689 if isinstance(to_restore, variables.PartitionedVariable): 2690 to_restore = to_restore._get_variable_list() # pylint: disable=protected-access 2691 checkpoint_utils.init_from_checkpoint(self.ckpt_to_load_from, { 2692 self.tensor_name_in_ckpt: to_restore 2693 }) 2694 2695 sparse_id_rank = tensor_shape.dimension_value( 2696 sparse_ids.dense_shape.get_shape()[0]) 2697 embedding_lookup_sparse = embedding_ops.safe_embedding_lookup_sparse 2698 if (not self.use_safe_embedding_lookup and sparse_id_rank is not None and 2699 sparse_id_rank <= 2): 2700 embedding_lookup_sparse = embedding_ops.embedding_lookup_sparse_v2 2701 # Return embedding lookup result. 2702 return embedding_lookup_sparse( 2703 embedding_weights, 2704 sparse_ids, 2705 sparse_weights, 2706 combiner=self.combiner, 2707 name='%s_weights' % self.name, 2708 max_norm=self.max_norm) 2709 2710 def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None): 2711 if isinstance(self.categorical_column, _SequenceCategoricalColumn): 2712 raise ValueError( 2713 'In embedding_column: {}. ' 2714 'categorical_column must not be of type _SequenceCategoricalColumn. ' 2715 'Suggested fix A: If you wish to use input_layer, use a ' 2716 'non-sequence categorical_column_with_*. ' 2717 'Suggested fix B: If you wish to create sequence input, use ' 2718 'sequence_input_layer instead of input_layer. 
' 2719 'Given (type {}): {}'.format(self.name, type(self.categorical_column), 2720 self.categorical_column)) 2721 return self._get_dense_tensor_internal( 2722 inputs=inputs, 2723 weight_collections=weight_collections, 2724 trainable=trainable) 2725 2726 def _get_sequence_dense_tensor(self, 2727 inputs, 2728 weight_collections=None, 2729 trainable=None): 2730 if not isinstance(self.categorical_column, _SequenceCategoricalColumn): 2731 raise ValueError( 2732 'In embedding_column: {}. ' 2733 'categorical_column must be of type _SequenceCategoricalColumn ' 2734 'to use sequence_input_layer. ' 2735 'Suggested fix: Use one of sequence_categorical_column_with_*. ' 2736 'Given (type {}): {}'.format(self.name, type(self.categorical_column), 2737 self.categorical_column)) 2738 dense_tensor = self._get_dense_tensor_internal( # pylint: disable=protected-access 2739 inputs=inputs, 2740 weight_collections=weight_collections, 2741 trainable=trainable) 2742 sparse_tensors = self.categorical_column._get_sparse_tensors(inputs) # pylint: disable=protected-access 2743 sequence_length = fc_utils.sequence_length_from_sparse_tensor( 2744 sparse_tensors.id_tensor) 2745 return _SequenceDenseColumn.TensorSequenceLengthPair( 2746 dense_tensor=dense_tensor, sequence_length=sequence_length) 2747 2748 2749def _check_shape(shape, key): 2750 """Returns shape if it's valid, raises error otherwise.""" 2751 assert shape is not None 2752 if not nest.is_nested(shape): 2753 shape = [shape] 2754 shape = tuple(shape) 2755 for dimension in shape: 2756 if not isinstance(dimension, six.integer_types): 2757 raise TypeError('shape dimensions must be integer. ' 2758 'shape: {}, key: {}'.format(shape, key)) 2759 if dimension < 1: 2760 raise ValueError('shape dimensions must be greater than 0. ' 2761 'shape: {}, key: {}'.format(shape, key)) 2762 return shape 2763 2764 2765class _HashedCategoricalColumn( 2766 _CategoricalColumn, 2767 collections.namedtuple('_HashedCategoricalColumn', 2768 ['key', 'hash_bucket_size', 'dtype'])): 2769 """see `categorical_column_with_hash_bucket`.""" 2770 2771 @property 2772 def name(self): 2773 return self.key 2774 2775 @property 2776 def _parse_example_spec(self): 2777 return {self.key: parsing_ops.VarLenFeature(self.dtype)} 2778 2779 def _transform_feature(self, inputs): 2780 input_tensor = _to_sparse_input_and_drop_ignore_values(inputs.get(self.key)) 2781 if not isinstance(input_tensor, sparse_tensor_lib.SparseTensor): 2782 raise ValueError('SparseColumn input must be a SparseTensor.') 2783 2784 fc_utils.assert_string_or_int( 2785 input_tensor.dtype, 2786 prefix='column_name: {} input_tensor'.format(self.key)) 2787 2788 if self.dtype.is_integer != input_tensor.dtype.is_integer: 2789 raise ValueError( 2790 'Column dtype and SparseTensors dtype must be compatible. 
' 2791 'key: {}, column dtype: {}, tensor dtype: {}'.format( 2792 self.key, self.dtype, input_tensor.dtype)) 2793 2794 if self.dtype == dtypes.string: 2795 sparse_values = input_tensor.values 2796 else: 2797 sparse_values = string_ops.as_string(input_tensor.values) 2798 2799 sparse_id_values = string_ops.string_to_hash_bucket_fast( 2800 sparse_values, self.hash_bucket_size, name='lookup') 2801 return sparse_tensor_lib.SparseTensor( 2802 input_tensor.indices, sparse_id_values, input_tensor.dense_shape) 2803 2804 @property 2805 def _num_buckets(self): 2806 """Returns number of buckets in this sparse feature.""" 2807 return self.hash_bucket_size 2808 2809 def _get_sparse_tensors(self, inputs, weight_collections=None, 2810 trainable=None): 2811 return _CategoricalColumn.IdWeightPair(inputs.get(self), None) 2812 2813 2814class _VocabularyFileCategoricalColumn( 2815 _CategoricalColumn, 2816 collections.namedtuple('_VocabularyFileCategoricalColumn', ( 2817 'key', 'vocabulary_file', 'vocabulary_size', 'num_oov_buckets', 'dtype', 2818 'default_value' 2819 ))): 2820 """See `categorical_column_with_vocabulary_file`.""" 2821 2822 @property 2823 def name(self): 2824 return self.key 2825 2826 @property 2827 def _parse_example_spec(self): 2828 return {self.key: parsing_ops.VarLenFeature(self.dtype)} 2829 2830 def _transform_feature(self, inputs): 2831 input_tensor = _to_sparse_input_and_drop_ignore_values(inputs.get(self.key)) 2832 2833 if self.dtype.is_integer != input_tensor.dtype.is_integer: 2834 raise ValueError( 2835 'Column dtype and SparseTensors dtype must be compatible. ' 2836 'key: {}, column dtype: {}, tensor dtype: {}'.format( 2837 self.key, self.dtype, input_tensor.dtype)) 2838 2839 fc_utils.assert_string_or_int( 2840 input_tensor.dtype, 2841 prefix='column_name: {} input_tensor'.format(self.key)) 2842 2843 key_dtype = self.dtype 2844 if input_tensor.dtype.is_integer: 2845 # `index_table_from_file` requires 64-bit integer keys. 2846 key_dtype = dtypes.int64 2847 input_tensor = math_ops.cast(input_tensor, dtypes.int64) 2848 2849 return lookup_ops.index_table_from_file( 2850 vocabulary_file=self.vocabulary_file, 2851 num_oov_buckets=self.num_oov_buckets, 2852 vocab_size=self.vocabulary_size, 2853 default_value=self.default_value, 2854 key_dtype=key_dtype, 2855 name='{}_lookup'.format(self.key)).lookup(input_tensor) 2856 2857 @property 2858 def _num_buckets(self): 2859 """Returns number of buckets in this sparse feature.""" 2860 return self.vocabulary_size + self.num_oov_buckets 2861 2862 def _get_sparse_tensors( 2863 self, inputs, weight_collections=None, trainable=None): 2864 return _CategoricalColumn.IdWeightPair(inputs.get(self), None) 2865 2866 2867class _VocabularyListCategoricalColumn( 2868 _CategoricalColumn, 2869 collections.namedtuple('_VocabularyListCategoricalColumn', ( 2870 'key', 'vocabulary_list', 'dtype', 'default_value', 'num_oov_buckets' 2871 ))): 2872 """See `categorical_column_with_vocabulary_list`.""" 2873 2874 @property 2875 def name(self): 2876 return self.key 2877 2878 @property 2879 def _parse_example_spec(self): 2880 return {self.key: parsing_ops.VarLenFeature(self.dtype)} 2881 2882 def _transform_feature(self, inputs): 2883 input_tensor = _to_sparse_input_and_drop_ignore_values(inputs.get(self.key)) 2884 2885 if self.dtype.is_integer != input_tensor.dtype.is_integer: 2886 raise ValueError( 2887 'Column dtype and SparseTensors dtype must be compatible. 
' 2888 'key: {}, column dtype: {}, tensor dtype: {}'.format( 2889 self.key, self.dtype, input_tensor.dtype)) 2890 2891 fc_utils.assert_string_or_int( 2892 input_tensor.dtype, 2893 prefix='column_name: {} input_tensor'.format(self.key)) 2894 2895 key_dtype = self.dtype 2896 if input_tensor.dtype.is_integer: 2897 # `index_table_from_tensor` requires 64-bit integer keys. 2898 key_dtype = dtypes.int64 2899 input_tensor = math_ops.cast(input_tensor, dtypes.int64) 2900 2901 return lookup_ops.index_table_from_tensor( 2902 vocabulary_list=tuple(self.vocabulary_list), 2903 default_value=self.default_value, 2904 num_oov_buckets=self.num_oov_buckets, 2905 dtype=key_dtype, 2906 name='{}_lookup'.format(self.key)).lookup(input_tensor) 2907 2908 @property 2909 def _num_buckets(self): 2910 """Returns number of buckets in this sparse feature.""" 2911 return len(self.vocabulary_list) + self.num_oov_buckets 2912 2913 def _get_sparse_tensors( 2914 self, inputs, weight_collections=None, trainable=None): 2915 return _CategoricalColumn.IdWeightPair(inputs.get(self), None) 2916 2917 2918class _IdentityCategoricalColumn( 2919 _CategoricalColumn, 2920 collections.namedtuple('_IdentityCategoricalColumn', ( 2921 'key', 'num_buckets', 'default_value' 2922 ))): 2923 2924 """See `categorical_column_with_identity`.""" 2925 2926 @property 2927 def name(self): 2928 return self.key 2929 2930 @property 2931 def _parse_example_spec(self): 2932 return {self.key: parsing_ops.VarLenFeature(dtypes.int64)} 2933 2934 def _transform_feature(self, inputs): 2935 input_tensor = _to_sparse_input_and_drop_ignore_values(inputs.get(self.key)) 2936 2937 if not input_tensor.dtype.is_integer: 2938 raise ValueError( 2939 'Invalid input, not integer. key: {} dtype: {}'.format( 2940 self.key, input_tensor.dtype)) 2941 values = input_tensor.values 2942 if input_tensor.values.dtype != dtypes.int64: 2943 values = math_ops.cast(values, dtypes.int64, name='values') 2944 if self.default_value is not None: 2945 num_buckets = math_ops.cast( 2946 self.num_buckets, dtypes.int64, name='num_buckets') 2947 zero = math_ops.cast(0, dtypes.int64, name='zero') 2948 # Assign default for out-of-range values. 
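 # Element-wise select: positions whose id is out of range take the
 # filled default_value, all other positions keep their original id.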
2949 values = array_ops.where( 2950 math_ops.logical_or( 2951 values < zero, values >= num_buckets, name='out_of_range'), 2952 array_ops.fill( 2953 dims=array_ops.shape(values), 2954 value=math_ops.cast(self.default_value, dtypes.int64), 2955 name='default_values'), values) 2956 return sparse_tensor_lib.SparseTensor( 2957 indices=input_tensor.indices, 2958 values=values, 2959 dense_shape=input_tensor.dense_shape) 2960 2961 @property 2962 def _num_buckets(self): 2963 """Returns number of buckets in this sparse feature.""" 2964 return self.num_buckets 2965 2966 def _get_sparse_tensors( 2967 self, inputs, weight_collections=None, trainable=None): 2968 return _CategoricalColumn.IdWeightPair(inputs.get(self), None) 2969 2970 2971class _WeightedCategoricalColumn( 2972 _CategoricalColumn, 2973 collections.namedtuple('_WeightedCategoricalColumn', ( 2974 'categorical_column', 'weight_feature_key', 'dtype' 2975 ))): 2976 """See `weighted_categorical_column`.""" 2977 2978 @property 2979 def name(self): 2980 return '{}_weighted_by_{}'.format( 2981 self.categorical_column.name, self.weight_feature_key) 2982 2983 @property 2984 def _parse_example_spec(self): 2985 config = self.categorical_column._parse_example_spec # pylint: disable=protected-access 2986 if self.weight_feature_key in config: 2987 raise ValueError('Parse config {} already exists for {}.'.format( 2988 config[self.weight_feature_key], self.weight_feature_key)) 2989 config[self.weight_feature_key] = parsing_ops.VarLenFeature(self.dtype) 2990 return config 2991 2992 @property 2993 def _num_buckets(self): 2994 return self.categorical_column._num_buckets # pylint: disable=protected-access 2995 2996 def _transform_feature(self, inputs): 2997 weight_tensor = inputs.get(self.weight_feature_key) 2998 if weight_tensor is None: 2999 raise ValueError('Missing weights {}.'.format(self.weight_feature_key)) 3000 weight_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor( 3001 weight_tensor) 3002 if self.dtype != weight_tensor.dtype.base_dtype: 3003 raise ValueError('Bad dtype, expected {}, but got {}.'.format( 3004 self.dtype, weight_tensor.dtype)) 3005 if not isinstance(weight_tensor, sparse_tensor_lib.SparseTensor): 3006 # The weight tensor can be a regular Tensor. In this case, sparsify it. 
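 # Zero-valued weights are treated as missing entries and dropped during
 # the conversion (ignore_value=0.0).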
3007 weight_tensor = _to_sparse_input_and_drop_ignore_values( 3008 weight_tensor, ignore_value=0.0) 3009 if not weight_tensor.dtype.is_floating: 3010 weight_tensor = math_ops.cast(weight_tensor, dtypes.float32) 3011 return (inputs.get(self.categorical_column), weight_tensor) 3012 3013 def _get_sparse_tensors( 3014 self, inputs, weight_collections=None, trainable=None): 3015 del weight_collections 3016 del trainable 3017 tensors = inputs.get(self) 3018 return _CategoricalColumn.IdWeightPair(tensors[0], tensors[1]) 3019 3020 3021class _CrossedColumn( 3022 _CategoricalColumn, 3023 collections.namedtuple('_CrossedColumn', 3024 ['keys', 'hash_bucket_size', 'hash_key'])): 3025 """See `crossed_column`.""" 3026 3027 @property 3028 def name(self): 3029 feature_names = [] 3030 for key in _collect_leaf_level_keys(self): 3031 if isinstance(key, _FeatureColumn): 3032 feature_names.append(key.name) 3033 else: # key must be a string 3034 feature_names.append(key) 3035 return '_X_'.join(sorted(feature_names)) 3036 3037 @property 3038 def _parse_example_spec(self): 3039 config = {} 3040 for key in self.keys: 3041 if isinstance(key, _FeatureColumn): 3042 config.update(key._parse_example_spec) # pylint: disable=protected-access 3043 else: # key must be a string 3044 config.update({key: parsing_ops.VarLenFeature(dtypes.string)}) 3045 return config 3046 3047 def _transform_feature(self, inputs): 3048 feature_tensors = [] 3049 for key in _collect_leaf_level_keys(self): 3050 if isinstance(key, six.string_types): 3051 feature_tensors.append(inputs.get(key)) 3052 elif isinstance(key, _CategoricalColumn): 3053 ids_and_weights = key._get_sparse_tensors(inputs) # pylint: disable=protected-access 3054 if ids_and_weights.weight_tensor is not None: 3055 raise ValueError( 3056 'crossed_column does not support weight_tensor, but the given ' 3057 'column populates weight_tensor. ' 3058 'Given column: {}'.format(key.name)) 3059 feature_tensors.append(ids_and_weights.id_tensor) 3060 else: 3061 raise ValueError('Unsupported column type. Given: {}'.format(key)) 3062 return sparse_ops.sparse_cross_hashed( 3063 inputs=feature_tensors, 3064 num_buckets=self.hash_bucket_size, 3065 hash_key=self.hash_key) 3066 3067 @property 3068 def _num_buckets(self): 3069 """Returns number of buckets in this sparse feature.""" 3070 return self.hash_bucket_size 3071 3072 def _get_sparse_tensors(self, inputs, weight_collections=None, 3073 trainable=None): 3074 return _CategoricalColumn.IdWeightPair(inputs.get(self), None) 3075 3076 3077def _collect_leaf_level_keys(cross): 3078 """Collects base keys by expanding all nested crosses. 3079 3080 Args: 3081 cross: A `_CrossedColumn`. 3082 3083 Returns: 3084 A list of strings or `_CategoricalColumn` instances. 3085 """ 3086 leaf_level_keys = [] 3087 for k in cross.keys: 3088 if isinstance(k, _CrossedColumn): 3089 leaf_level_keys.extend(_collect_leaf_level_keys(k)) 3090 else: 3091 leaf_level_keys.append(k) 3092 return leaf_level_keys 3093 3094 3095class _IndicatorColumn(_DenseColumn, _SequenceDenseColumn, 3096 collections.namedtuple('_IndicatorColumn', 3097 ['categorical_column'])): 3098 """Represents a one-hot column for use in deep networks. 3099 3100 Args: 3101 categorical_column: A `_CategoricalColumn` which is created by 3102 `categorical_column_with_*` function. 3103 """ 3104 3105 @property 3106 def name(self): 3107 return '{}_indicator'.format(self.categorical_column.name) 3108 3109 def _transform_feature(self, inputs): 3110 """Returns dense `Tensor` representing feature. 
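
 For multivalent inputs the per-example one-hot vectors are summed, so the
 result is a multi-hot `Tensor` of shape `[batch_size, num_buckets]`; for a
 weighted categorical column the weights are used in place of ones.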
class _IndicatorColumn(_DenseColumn, _SequenceDenseColumn,
                       collections.namedtuple('_IndicatorColumn',
                                              ['categorical_column'])):
  """Represents a one-hot column for use in deep networks.

  Args:
    categorical_column: A `_CategoricalColumn` created by one of the
      `categorical_column_with_*` functions.
  """

  @property
  def name(self):
    return '{}_indicator'.format(self.categorical_column.name)

  def _transform_feature(self, inputs):
    """Returns dense `Tensor` representing feature.

    Args:
      inputs: A `_LazyBuilder` object to access inputs.

    Returns:
      Transformed feature `Tensor`.

    Raises:
      ValueError: if input rank is not known at graph building time.
    """
    id_weight_pair = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
    id_tensor = id_weight_pair.id_tensor
    weight_tensor = id_weight_pair.weight_tensor

    # If the underlying column is weighted, return the input as a dense tensor.
    if weight_tensor is not None:
      weighted_column = sparse_ops.sparse_merge(
          sp_ids=id_tensor,
          sp_values=weight_tensor,
          vocab_size=int(self._variable_shape[-1]))
      # Remove (?, -1) index.
      weighted_column = sparse_ops.sparse_slice(weighted_column, [0, 0],
                                                weighted_column.dense_shape)
      # Use scatter_nd to merge duplicated indices if they exist,
      # instead of sparse_tensor_to_dense.
      return array_ops.scatter_nd(weighted_column.indices,
                                  weighted_column.values,
                                  weighted_column.dense_shape)

    dense_id_tensor = sparse_ops.sparse_tensor_to_dense(
        id_tensor, default_value=-1)

    # One hot must be float for tf.concat reasons since all other inputs to
    # input_layer are float32.
    one_hot_id_tensor = array_ops.one_hot(
        dense_id_tensor,
        depth=self._variable_shape[-1],
        on_value=1.0,
        off_value=0.0)

    # Reduce to get a multi-hot per example.
    return math_ops.reduce_sum(one_hot_id_tensor, axis=[-2])

  @property
  def _parse_example_spec(self):
    return self.categorical_column._parse_example_spec  # pylint: disable=protected-access

  @property
  def _variable_shape(self):
    """Returns a `TensorShape` representing the shape of the dense `Tensor`."""
    return tensor_shape.TensorShape([1, self.categorical_column._num_buckets])  # pylint: disable=protected-access

  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
    """Returns dense `Tensor` representing feature.

    Args:
      inputs: A `_LazyBuilder` object to access inputs.
      weight_collections: Unused `weight_collections` since no variables are
        created in this function.
      trainable: Unused `trainable` bool since no variables are created in
        this function.

    Returns:
      Dense `Tensor` created within `_transform_feature`.

    Raises:
      ValueError: If `categorical_column` is a `_SequenceCategoricalColumn`.
    """
    # Do nothing with weight_collections and trainable since no variables are
    # created in this function.
    del weight_collections
    del trainable
    if isinstance(self.categorical_column, _SequenceCategoricalColumn):
      raise ValueError(
          'In indicator_column: {}. '
          'categorical_column must not be of type _SequenceCategoricalColumn. '
          'Suggested fix A: If you wish to use input_layer, use a '
          'non-sequence categorical_column_with_*. '
          'Suggested fix B: If you wish to create sequence input, use '
          'sequence_input_layer instead of input_layer. '
          'Given (type {}): {}'.format(
              self.name, type(self.categorical_column),
              self.categorical_column))
    # The feature has already been transformed. Return the intermediate
    # representation created by _transform_feature.
    return inputs.get(self)

  def _get_sequence_dense_tensor(
      self, inputs, weight_collections=None, trainable=None):
    # Do nothing with weight_collections and trainable since no variables are
    # created in this function.
    del weight_collections
    del trainable
    if not isinstance(self.categorical_column, _SequenceCategoricalColumn):
      raise ValueError(
          'In indicator_column: {}. '
          'categorical_column must be of type _SequenceCategoricalColumn '
          'to use sequence_input_layer. '
          'Suggested fix: Use one of the sequence_categorical_column_with_* '
          'functions. '
          'Given (type {}): {}'.format(
              self.name, type(self.categorical_column),
              self.categorical_column))
    # The feature has already been transformed. Return the intermediate
    # representation created by _transform_feature.
    dense_tensor = inputs.get(self)
    sparse_tensors = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
    sequence_length = fc_utils.sequence_length_from_sparse_tensor(
        sparse_tensors.id_tensor)
    return _SequenceDenseColumn.TensorSequenceLengthPair(
        dense_tensor=dense_tensor, sequence_length=sequence_length)


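# Example: the transformation above yields a multi-hot encoding. A minimal
# sketch (the 'colors' feature key is hypothetical):
#
#   colors = categorical_column_with_vocabulary_list(
#       'colors', vocabulary_list=['red', 'green', 'blue'])
#   colors_indicator = indicator_column(colors)
#
# An example whose 'colors' value is ['red', 'blue', 'red'] becomes
# [2.0, 0.0, 1.0]: each id is one-hot encoded, and the one-hot rows are summed
# over axis -2. If the wrapped column carries weights, the weights are
# scattered into the dense output instead of the counts.

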
def _verify_static_batch_size_equality(tensors, columns):
  """Validates that the first dim (batch size) of all tensors is equal or None.

  Args:
    tensors: list of tensors to check.
    columns: list of feature columns matching tensors. Will be used for error
      messaging.

  Raises:
    ValueError: if two of the tensors have incompatible static batch sizes.
  """
  # expected_batch_size is a tf.compat.v1.Dimension object.
  expected_batch_size = None
  for i in range(0, len(tensors)):
    if tensors[i].shape.dims[0].value is not None:
      if expected_batch_size is None:
        batch_size_column_index = i
        expected_batch_size = tensors[i].shape.dims[0]
      elif not expected_batch_size.is_compatible_with(tensors[i].shape.dims[0]):
        raise ValueError(
            'Batch size (first dimension) of each feature must be the same. '
            'Batch size of columns ({}, {}): ({}, {})'.format(
                columns[batch_size_column_index].name, columns[i].name,
                expected_batch_size, tensors[i].shape.dims[0]))


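# Example: for a batch of 32 examples, feature tensors of shapes [32, 10] and
# [16, 1] fail this check, while [32, 10] and [None, 1] pass, because tensors
# whose static batch size is unknown (None) are skipped entirely.

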
class _SequenceCategoricalColumn(
    _CategoricalColumn,
    collections.namedtuple(
        '_SequenceCategoricalColumn', ['categorical_column'])):
  """Represents sequences of categorical data."""

  @property
  def name(self):
    return self.categorical_column.name

  @property
  def _parse_example_spec(self):
    return self.categorical_column._parse_example_spec  # pylint: disable=protected-access

  def _transform_feature(self, inputs):
    return self.categorical_column._transform_feature(inputs)  # pylint: disable=protected-access

  @property
  def _num_buckets(self):
    return self.categorical_column._num_buckets  # pylint: disable=protected-access

  def _get_sparse_tensors(self, inputs, weight_collections=None,
                          trainable=None):
    sparse_tensors = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
    id_tensor = sparse_tensors.id_tensor
    weight_tensor = sparse_tensors.weight_tensor

    # Expands the third dimension, if necessary, so that embeddings are not
    # combined during embedding lookup. If the tensor is already 3D, leave it
    # as-is.
    shape = array_ops.shape(id_tensor)
    # Compute the third dimension explicitly instead of setting it to -1, as
    # that doesn't work for dynamically shaped tensors with 0-length at
    # runtime. This happens for empty sequences.
    target_shape = [shape[0], shape[1], math_ops.reduce_prod(shape[2:])]
    id_tensor = sparse_ops.sparse_reshape(id_tensor, target_shape)
    if weight_tensor is not None:
      weight_tensor = sparse_ops.sparse_reshape(weight_tensor, target_shape)

    return _CategoricalColumn.IdWeightPair(id_tensor, weight_tensor)
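

# Example: the reshape in `_get_sparse_tensors` gives the ids a third
# dimension so each sequence step keeps its own embedding. A minimal sketch
# (the 'watched_video_id' key is hypothetical):
#
#   watched = sequence_categorical_column_with_identity(
#       'watched_video_id', num_buckets=1000)
#
# Ids parsed with dense_shape [batch_size, max_sequence_length] are reshaped
# to [batch_size, max_sequence_length, 1] (reduce_prod over an empty slice is
# 1), so wrapping `watched` in an embedding or indicator column and feeding it
# to sequence_input_layer yields one vector per step, not one per example.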