xref: /aosp_15_r20/external/tensorflow/tensorflow/python/feature_column/feature_column.py (revision b6fb3261f9314811a0f4371741dbb8839866f948)
1# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7#     http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14# ==============================================================================
15"""This API defines FeatureColumn abstraction.
16
17FeatureColumns provide a high level abstraction for ingesting and representing
18features. FeatureColumns are also the primary way of encoding features for
19canned `tf.estimator.Estimator`s.
20
21When using FeatureColumns with `Estimators`, the type of feature column you
22should choose depends on (1) the feature type and (2) the model type.
23
241. Feature type:
25
26  * Continuous features can be represented by `numeric_column`.
27  * Categorical features can be represented by any `categorical_column_with_*`
28  column:
29    - `categorical_column_with_vocabulary_list`
30    - `categorical_column_with_vocabulary_file`
31    - `categorical_column_with_hash_bucket`
32    - `categorical_column_with_identity`
33    - `weighted_categorical_column`
34
352. Model type:
36
37  * Deep neural network models (`DNNClassifier`, `DNNRegressor`).
38
39    Continuous features can be directly fed into deep neural network models.
40
41      age_column = numeric_column("age")
42
43    To feed sparse features into DNN models, wrap the column with
44    `embedding_column` or `indicator_column`. `indicator_column` is recommended
45    for features with only a few possible values. For features with many
46    possible values, to reduce the size of your model, `embedding_column` is
47    recommended.
48
49      embedded_dept_column = embedding_column(
50          categorical_column_with_vocabulary_list(
51              "department", ["math", "philosophy", ...]), dimension=10)
52
53  * Wide (aka linear) models (`LinearClassifier`, `LinearRegressor`).
54
55    Sparse features can be fed directly into linear models. They behave like an
56    indicator column but with an efficient implementation.
57
58      dept_column = categorical_column_with_vocabulary_list("department",
59          ["math", "philosophy", "english"])
60
61    It is recommended that continuous features be bucketized before being
62    fed into linear models.
63
64      bucketized_age_column = bucketized_column(
65          source_column=age_column,
66          boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
67
68    Sparse features can be crossed (also known as conjuncted or combined) in
69    order to form non-linearities, and then fed into linear models.
70
71      cross_dept_age_column = crossed_column(
72          columns=["department", bucketized_age_column],
73          hash_bucket_size=1000)
74
75Example of building canned `Estimator`s using FeatureColumns:
76
77  ```python
78  # Define features and transformations
79  deep_feature_columns = [age_column, embedded_dept_column]
80  wide_feature_columns = [dept_column, bucketized_age_column,
81      cross_dept_age_column]
82
83  # Build deep model
84  estimator = DNNClassifier(
85      feature_columns=deep_feature_columns,
86      hidden_units=[500, 250, 50])
87  estimator.train(...)
88
89  # Or build a wide model
90  estimator = LinearClassifier(
91      feature_columns=wide_feature_columns)
92  estimator.train(...)
93
94  # Or build a wide and deep model!
95  estimator = DNNLinearCombinedClassifier(
96      linear_feature_columns=wide_feature_columns,
97      dnn_feature_columns=deep_feature_columns,
98      dnn_hidden_units=[500, 250, 50])
99  estimator.train(...)
100  ```
101
102
103FeatureColumns can also be transformed into a generic input layer for
104custom models using `input_layer`.
105
106Example of building model using FeatureColumns, this can be used in a
107`model_fn` which is given to the {tf.estimator.Estimator}:
108
109  ```python
110  # Building model via layers
111
112  deep_feature_columns = [age_column, embedded_dept_column]
113  columns_to_tensor = parse_feature_columns_from_examples(
114      serialized=my_data,
115      feature_columns=deep_feature_columns)
116  first_layer = input_layer(
117      features=columns_to_tensor,
118      feature_columns=deep_feature_columns)
119  second_layer = fully_connected(first_layer, ...)
120  ```
121
122NOTE: Functions prefixed with "_" indicate experimental or private parts of
123the API subject to change, and should not be relied upon!
124
125NOTE: The new feature columns are being developed in feature_column_v2.py and
126are a somewhat duplicate of the code here. Please make sure to update logic
127in both places.
128"""
129
130import abc
131import collections
132import math
133
134import numpy as np
135import six
136
137from tensorflow.python.eager import context
138from tensorflow.python.feature_column import utils as fc_utils
139from tensorflow.python.framework import dtypes
140from tensorflow.python.framework import ops
141from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
142from tensorflow.python.framework import tensor_shape
143from tensorflow.python.layers import base
144from tensorflow.python.ops import array_ops
145from tensorflow.python.ops import check_ops
146from tensorflow.python.ops import control_flow_ops
147from tensorflow.python.ops import embedding_ops
148from tensorflow.python.ops import init_ops
149from tensorflow.python.ops import lookup_ops
150from tensorflow.python.ops import math_ops
151from tensorflow.python.ops import nn_ops
152from tensorflow.python.ops import parsing_ops
153from tensorflow.python.ops import resource_variable_ops
154from tensorflow.python.ops import sparse_ops
155from tensorflow.python.ops import string_ops
156from tensorflow.python.ops import template
157from tensorflow.python.ops import variable_scope
158from tensorflow.python.ops import variables
159from tensorflow.python.platform import gfile
160from tensorflow.python.platform import tf_logging as logging
161from tensorflow.python.training import checkpoint_utils
162from tensorflow.python.util import nest
163from tensorflow.python.util.compat import collections_abc
164from tensorflow.python.util.tf_export import tf_export
165
166
167def _internal_input_layer(features,
168                          feature_columns,
169                          weight_collections=None,
170                          trainable=True,
171                          cols_to_vars=None,
172                          scope=None,
173                          cols_to_output_tensors=None,
174                          from_template=False):
175  """See input_layer. `scope` is a name or variable scope to use."""
176
177  feature_columns = _normalize_feature_columns(feature_columns)
178  for column in feature_columns:
179    if not isinstance(column, _DenseColumn):
180      raise ValueError(
181          'Items of feature_columns must be a _DenseColumn. '
182          'You can wrap a categorical column with an '
183          'embedding_column or indicator_column. Given: {}'.format(column))
184  weight_collections = list(weight_collections or [])
185  if ops.GraphKeys.GLOBAL_VARIABLES not in weight_collections:
186    weight_collections.append(ops.GraphKeys.GLOBAL_VARIABLES)
187  if ops.GraphKeys.MODEL_VARIABLES not in weight_collections:
188    weight_collections.append(ops.GraphKeys.MODEL_VARIABLES)
189
190  def _get_logits():  # pylint: disable=missing-docstring
191    builder = _LazyBuilder(features)
192    output_tensors = []
193    ordered_columns = []
194    for column in sorted(feature_columns, key=lambda x: x.name):
195      ordered_columns.append(column)
196      with variable_scope.variable_scope(
197          None, default_name=column._var_scope_name):  # pylint: disable=protected-access
198        tensor = column._get_dense_tensor(  # pylint: disable=protected-access
199            builder,
200            weight_collections=weight_collections,
201            trainable=trainable)
202        num_elements = column._variable_shape.num_elements()  # pylint: disable=protected-access
203        batch_size = array_ops.shape(tensor)[0]
204        output_tensor = array_ops.reshape(
205            tensor, shape=(batch_size, num_elements))
206        output_tensors.append(output_tensor)
207        if cols_to_vars is not None:
208          # Retrieve any variables created (some _DenseColumn's don't create
209          # variables, in which case an empty list is returned).
210          cols_to_vars[column] = ops.get_collection(
211              ops.GraphKeys.GLOBAL_VARIABLES,
212              scope=variable_scope.get_variable_scope().name)
213        if cols_to_output_tensors is not None:
214          cols_to_output_tensors[column] = output_tensor
215    _verify_static_batch_size_equality(output_tensors, ordered_columns)
216    return array_ops.concat(output_tensors, 1)
217
218  # If we're constructing from the `make_template`, that by default adds a
219  # variable scope with the name of the layer. In that case, we dont want to
220  # add another `variable_scope` as that would break checkpoints.
221  if from_template:
222    return _get_logits()
223  else:
224    with variable_scope.variable_scope(
225        scope, default_name='input_layer', values=features.values()):
226      return _get_logits()
227
228
229@tf_export(v1=['feature_column.input_layer'])
230def input_layer(features,
231                feature_columns,
232                weight_collections=None,
233                trainable=True,
234                cols_to_vars=None,
235                cols_to_output_tensors=None):
236  """Returns a dense `Tensor` as input layer based on given `feature_columns`.
237
238  Generally a single example in training data is described with FeatureColumns.
239  At the first layer of the model, this column oriented data should be converted
240  to a single `Tensor`.
241
242  Example:
243
244  ```python
245  price = numeric_column('price')
246  keywords_embedded = embedding_column(
247      categorical_column_with_hash_bucket("keywords", 10K), dimensions=16)
248  columns = [price, keywords_embedded, ...]
249  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
250  dense_tensor = input_layer(features, columns)
251  for units in [128, 64, 32]:
252    dense_tensor = tf.compat.v1.layers.dense(dense_tensor, units, tf.nn.relu)
253  prediction = tf.compat.v1.layers.dense(dense_tensor, 1)
254  ```
255
256  Args:
257    features: A mapping from key to tensors. `_FeatureColumn`s look up via these
258      keys. For example `numeric_column('price')` will look at 'price' key in
259      this dict. Values can be a `SparseTensor` or a `Tensor` depends on
260      corresponding `_FeatureColumn`.
261    feature_columns: An iterable containing the FeatureColumns to use as inputs
262      to your model. All items should be instances of classes derived from
263      `_DenseColumn` such as `numeric_column`, `embedding_column`,
264      `bucketized_column`, `indicator_column`. If you have categorical features,
265      you can wrap them with an `embedding_column` or `indicator_column`.
266    weight_collections: A list of collection names to which the Variable will be
267      added. Note that variables will also be added to collections
268      `tf.GraphKeys.GLOBAL_VARIABLES` and `ops.GraphKeys.MODEL_VARIABLES`.
269    trainable: If `True` also add the variable to the graph collection
270      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
271    cols_to_vars: If not `None`, must be a dictionary that will be filled with a
272      mapping from `_FeatureColumn` to list of `Variable`s.  For example, after
273      the call, we might have cols_to_vars =
274      {_EmbeddingColumn(
275        categorical_column=_HashedCategoricalColumn(
276          key='sparse_feature', hash_bucket_size=5, dtype=tf.string),
277        dimension=10): [<tf.Variable 'some_variable:0' shape=(5, 10),
278                        <tf.Variable 'some_variable:1' shape=(5, 10)]}
279      If a column creates no variables, its value will be an empty list.
280    cols_to_output_tensors: If not `None`, must be a dictionary that will be
281      filled with a mapping from '_FeatureColumn' to the associated
282      output `Tensor`s.
283
284  Returns:
285    A `Tensor` which represents input layer of a model. Its shape
286    is (batch_size, first_layer_dimension) and its dtype is `float32`.
287    first_layer_dimension is determined based on given `feature_columns`.
288
289  Raises:
290    ValueError: if an item in `feature_columns` is not a `_DenseColumn`.
291  """
292  return _internal_input_layer(
293      features,
294      feature_columns,
295      weight_collections=weight_collections,
296      trainable=trainable,
297      cols_to_vars=cols_to_vars,
298      cols_to_output_tensors=cols_to_output_tensors)
299
300
301# TODO(akshayka): InputLayer should be a subclass of Layer, and it
302# should implement the logic in input_layer using Layer's build-and-call
303# paradigm; input_layer should create an instance of InputLayer and
304# return the result of invoking its apply method, just as functional layers do.
305class InputLayer(object):
306  """An object-oriented version of `input_layer` that reuses variables."""
307
308  def __init__(self,
309               feature_columns,
310               weight_collections=None,
311               trainable=True,
312               cols_to_vars=None,
313               name='feature_column_input_layer',
314               create_scope_now=True):
315    """See `input_layer`."""
316
317    self._feature_columns = feature_columns
318    self._weight_collections = weight_collections
319    self._trainable = trainable
320    self._cols_to_vars = cols_to_vars
321    self._name = name
322    self._input_layer_template = template.make_template(
323        self._name, _internal_input_layer, create_scope_now_=create_scope_now)
324    self._scope = self._input_layer_template.variable_scope
325
326  def __call__(self, features):
327    return self._input_layer_template(
328        features=features,
329        feature_columns=self._feature_columns,
330        weight_collections=self._weight_collections,
331        trainable=self._trainable,
332        cols_to_vars=None,
333        from_template=True)
334
335  @property
336  def name(self):
337    return self._name
338
339  @property
340  def non_trainable_variables(self):
341    return self._input_layer_template.non_trainable_variables
342
343  @property
344  def non_trainable_weights(self):
345    return self._input_layer_template.non_trainable_weights
346
347  @property
348  def trainable_variables(self):
349    return self._input_layer_template.trainable_variables
350
351  @property
352  def trainable_weights(self):
353    return self._input_layer_template.trainable_weights
354
355  @property
356  def variables(self):
357    return self._input_layer_template.variables
358
359  @property
360  def weights(self):
361    return self._input_layer_template.weights
362
363
364@tf_export(v1=['feature_column.linear_model'])
365def linear_model(features,
366                 feature_columns,
367                 units=1,
368                 sparse_combiner='sum',
369                 weight_collections=None,
370                 trainable=True,
371                 cols_to_vars=None):
372  """Returns a linear prediction `Tensor` based on given `feature_columns`.
373
374  This function generates a weighted sum based on output dimension `units`.
375  Weighted sum refers to logits in classification problems. It refers to the
376  prediction itself for linear regression problems.
377
378  Note on supported columns: `linear_model` treats categorical columns as
379  `indicator_column`s. To be specific, assume the input as `SparseTensor` looks
380  like:
381
382  ```python
383    shape = [2, 2]
384    {
385        [0, 0]: "a"
386        [1, 0]: "b"
387        [1, 1]: "c"
388    }
389  ```
390  `linear_model` assigns weights for the presence of "a", "b", "c' implicitly,
391  just like `indicator_column`, while `input_layer` explicitly requires wrapping
392  each of categorical columns with an `embedding_column` or an
393  `indicator_column`.
394
395  Example of usage:
396
397  ```python
398  price = numeric_column('price')
399  price_buckets = bucketized_column(price, boundaries=[0., 10., 100., 1000.])
400  keywords = categorical_column_with_hash_bucket("keywords", 10K)
401  keywords_price = crossed_column('keywords', price_buckets, ...)
402  columns = [price_buckets, keywords, keywords_price ...]
403  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
404  prediction = linear_model(features, columns)
405  ```
406
407  The `sparse_combiner` argument works as follows
408  For example, for two features represented as the categorical columns:
409
410  ```python
411    # Feature 1
412
413    shape = [2, 2]
414    {
415        [0, 0]: "a"
416        [0, 1]: "b"
417        [1, 0]: "c"
418    }
419
420    # Feature 2
421
422    shape = [2, 3]
423    {
424        [0, 0]: "d"
425        [1, 0]: "e"
426        [1, 1]: "f"
427        [1, 2]: "f"
428    }
429  ```
430
431  with `sparse_combiner` as "mean", the linear model outputs consequently
432  are:
433
434  ```
435    y_0 = 1.0 / 2.0 * ( w_a + w_b ) + w_d + b
436    y_1 = w_c + 1.0 / 3.0 * ( w_e + 2.0 * w_f ) + b
437  ```
438
439  where `y_i` is the output, `b` is the bias, and `w_x` is the weight
440  assigned to the presence of `x` in the input features.
441
442  Args:
443    features: A mapping from key to tensors. `_FeatureColumn`s look up via these
444      keys. For example `numeric_column('price')` will look at 'price' key in
445      this dict. Values are `Tensor` or `SparseTensor` depending on
446      corresponding `_FeatureColumn`.
447    feature_columns: An iterable containing the FeatureColumns to use as inputs
448      to your model. All items should be instances of classes derived from
449      `_FeatureColumn`s.
450    units: An integer, dimensionality of the output space. Default value is 1.
451    sparse_combiner: A string specifying how to reduce if a categorical column
452      is multivalent. Except `numeric_column`, almost all columns passed to
453      `linear_model` are considered as categorical columns.  It combines each
454      categorical column independently. Currently "mean", "sqrtn" and "sum" are
455      supported, with "sum" the default for linear model. "sqrtn" often achieves
456      good accuracy, in particular with bag-of-words columns.
457        * "sum": do not normalize features in the column
458        * "mean": do l1 normalization on features in the column
459        * "sqrtn": do l2 normalization on features in the column
460    weight_collections: A list of collection names to which the Variable will be
461      added. Note that, variables will also be added to collections
462      `tf.GraphKeys.GLOBAL_VARIABLES` and `ops.GraphKeys.MODEL_VARIABLES`.
463    trainable: If `True` also add the variable to the graph collection
464      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
465    cols_to_vars: If not `None`, must be a dictionary that will be filled with a
466      mapping from `_FeatureColumn` to associated list of `Variable`s.  For
467      example, after the call, we might have cols_to_vars = {
468        _NumericColumn(
469          key='numeric_feature1', shape=(1,):
470        [<tf.Variable 'linear_model/price2/weights:0' shape=(1, 1)>],
471        'bias': [<tf.Variable 'linear_model/bias_weights:0' shape=(1,)>],
472        _NumericColumn(
473          key='numeric_feature2', shape=(2,)):
474        [<tf.Variable 'linear_model/price1/weights:0' shape=(2, 1)>]}
475      If a column creates no variables, its value will be an empty list. Note
476      that cols_to_vars will also contain a string key 'bias' that maps to a
477      list of Variables.
478
479  Returns:
480    A `Tensor` which represents predictions/logits of a linear model. Its shape
481    is (batch_size, units) and its dtype is `float32`.
482
483  Raises:
484    ValueError: if an item in `feature_columns` is neither a `_DenseColumn`
485      nor `_CategoricalColumn`.
486  """
487  with variable_scope.variable_scope(None, 'linear_model') as vs:
488    model_name = _strip_leading_slashes(vs.name)
489  linear_model_layer = _LinearModel(
490      feature_columns=feature_columns,
491      units=units,
492      sparse_combiner=sparse_combiner,
493      weight_collections=weight_collections,
494      trainable=trainable,
495      name=model_name)
496  retval = linear_model_layer(features)  # pylint: disable=not-callable
497  if cols_to_vars is not None:
498    cols_to_vars.update(linear_model_layer.cols_to_vars())
499  return retval
500
501
502def _add_to_collections(var, weight_collections):
503  """Adds a var to the list of weight_collections provided.
504
505  Handles the case for partitioned and non-partitioned variables.
506
507  Args:
508    var: A variable or Partitioned Variable.
509    weight_collections: List of collections to add variable to.
510  """
511  for weight_collection in weight_collections:
512    # The layer self.add_variable call already adds it to GLOBAL_VARIABLES.
513    if weight_collection == ops.GraphKeys.GLOBAL_VARIABLES:
514      continue
515    # TODO(rohanj): Explore adding a _get_variable_list method on `Variable`
516    # so that we don't have to do this check.
517    if isinstance(var, variables.PartitionedVariable):
518      for constituent_var in list(var):
519        ops.add_to_collection(weight_collection, constituent_var)
520    else:
521      ops.add_to_collection(weight_collection, var)
522
523
524class _FCLinearWrapper(base.Layer):
525  """Wraps a _FeatureColumn in a layer for use in a linear model.
526
527  See `linear_model` above.
528  """
529
530  def __init__(self,
531               feature_column,
532               units=1,
533               sparse_combiner='sum',
534               weight_collections=None,
535               trainable=True,
536               name=None,
537               **kwargs):
538    super(_FCLinearWrapper, self).__init__(
539        trainable=trainable, name=name, **kwargs)
540    self._feature_column = feature_column
541    self._units = units
542    self._sparse_combiner = sparse_combiner
543    self._weight_collections = weight_collections
544
545  def build(self, _):
546    if isinstance(self._feature_column, _CategoricalColumn):
547      weight = self.add_variable(
548          name='weights',
549          shape=(self._feature_column._num_buckets, self._units),  # pylint: disable=protected-access
550          initializer=init_ops.zeros_initializer(),
551          trainable=self.trainable)
552    else:
553      num_elements = self._feature_column._variable_shape.num_elements()  # pylint: disable=protected-access
554      weight = self.add_variable(
555          name='weights',
556          shape=[num_elements, self._units],
557          initializer=init_ops.zeros_initializer(),
558          trainable=self.trainable)
559    _add_to_collections(weight, self._weight_collections)
560    self._weight_var = weight
561    self.built = True
562
563  def call(self, builder):
564    weighted_sum = _create_weighted_sum(
565        column=self._feature_column,
566        builder=builder,
567        units=self._units,
568        sparse_combiner=self._sparse_combiner,
569        weight_collections=self._weight_collections,
570        trainable=self.trainable,
571        weight_var=self._weight_var)
572    return weighted_sum
573
574
575class _BiasLayer(base.Layer):
576  """A layer for the bias term.
577  """
578
579  def __init__(self,
580               units=1,
581               trainable=True,
582               weight_collections=None,
583               name=None,
584               **kwargs):
585    super(_BiasLayer, self).__init__(trainable=trainable, name=name, **kwargs)
586    self._units = units
587    self._weight_collections = weight_collections
588
589  def build(self, _):
590    self._bias_variable = self.add_variable(
591        'bias_weights',
592        shape=[self._units],
593        initializer=init_ops.zeros_initializer(),
594        trainable=self.trainable)
595    _add_to_collections(self._bias_variable, self._weight_collections)
596    self.built = True
597
598  def call(self, _):
599    return self._bias_variable
600
601
602def _get_expanded_variable_list(variable):
603  if (isinstance(variable, variables.Variable) or
604      resource_variable_ops.is_resource_variable(variable)):
605    return [variable]  # Single variable case.
606  else:  # Must be a PartitionedVariable, so convert into a list.
607    return list(variable)
608
609
610def _strip_leading_slashes(name):
611  return name.rsplit('/', 1)[-1]
612
613
614class _LinearModel(base.Layer):
615  """Creates a linear model using feature columns.
616
617  See `linear_model` for details.
618  """
619
620  def __init__(self,
621               feature_columns,
622               units=1,
623               sparse_combiner='sum',
624               weight_collections=None,
625               trainable=True,
626               name=None,
627               **kwargs):
628    super(_LinearModel, self).__init__(name=name, **kwargs)
629    # We force the keras_style to be True here, as a workaround to not being
630    # able to inherit keras.layers.Layer as base class. Setting this will let
631    # us skip all the legacy behavior for base.Layer.
632    # Also note that we use Layer as base class, instead of Model, since there
633    # isn't any Model specific behavior gets used, eg compile/fit.
634    self._keras_style = True
635    self._feature_columns = _normalize_feature_columns(
636        feature_columns)
637    self._weight_collections = list(weight_collections or [])
638    if ops.GraphKeys.GLOBAL_VARIABLES not in self._weight_collections:
639      self._weight_collections.append(ops.GraphKeys.GLOBAL_VARIABLES)
640    if ops.GraphKeys.MODEL_VARIABLES not in self._weight_collections:
641      self._weight_collections.append(ops.GraphKeys.MODEL_VARIABLES)
642
643    column_layers = {}
644    for column in sorted(self._feature_columns, key=lambda x: x.name):
645      with variable_scope.variable_scope(
646          None, default_name=column._var_scope_name) as vs:  # pylint: disable=protected-access
647        # Having the fully expressed variable scope name ends up doubly
648        # expressing the outer scope (scope with which this method was called)
649        # in the name of the variable that would get created.
650        column_name = _strip_leading_slashes(vs.name)
651      column_layer = _FCLinearWrapper(column, units, sparse_combiner,
652                                      self._weight_collections, trainable,
653                                      column_name, **kwargs)
654      column_layers[column_name] = column_layer
655    self._column_layers = self._add_layers(column_layers)
656    self._bias_layer = _BiasLayer(
657        units=units,
658        trainable=trainable,
659        weight_collections=self._weight_collections,
660        name='bias_layer',
661        **kwargs)
662    self._cols_to_vars = {}
663
664  def cols_to_vars(self):
665    """Returns a dict mapping _FeatureColumns to variables.
666
667    See `linear_model` for more information.
668    This is not populated till `call` is called i.e. layer is built.
669    """
670    return self._cols_to_vars
671
672  def call(self, features):
673    with variable_scope.variable_scope(self.name):
674      for column in self._feature_columns:
675        if not isinstance(column, (_DenseColumn, _CategoricalColumn)):
676          raise ValueError(
677              'Items of feature_columns must be either a '
678              '_DenseColumn or _CategoricalColumn. Given: {}'.format(column))
679      weighted_sums = []
680      ordered_columns = []
681      builder = _LazyBuilder(features)
682      for layer in sorted(self._column_layers.values(), key=lambda x: x.name):
683        column = layer._feature_column  # pylint: disable=protected-access
684        ordered_columns.append(column)
685        weighted_sum = layer(builder)
686        weighted_sums.append(weighted_sum)
687        self._cols_to_vars[column] = ops.get_collection(
688            ops.GraphKeys.GLOBAL_VARIABLES, scope=layer.scope_name)
689
690      _verify_static_batch_size_equality(weighted_sums, ordered_columns)
691      predictions_no_bias = math_ops.add_n(
692          weighted_sums, name='weighted_sum_no_bias')
693      predictions = nn_ops.bias_add(
694          predictions_no_bias,
695          self._bias_layer(  # pylint: disable=not-callable
696              builder,
697              scope=variable_scope.get_variable_scope()),  # pylint: disable=not-callable
698          name='weighted_sum')
699      bias = self._bias_layer.variables[0]
700      self._cols_to_vars['bias'] = _get_expanded_variable_list(bias)
701    return predictions
702
703  def _add_layers(self, layers):
704    # "Magic" required for keras.Model classes to track all the variables in
705    # a list of layers.Layer objects.
706    # TODO(ashankar): Figure out API so user code doesn't have to do this.
707    for name, layer in layers.items():
708      setattr(self, 'layer-%s' % name, layer)
709    return layers
710
711
712def _transform_features(features, feature_columns):
713  """Returns transformed features based on features columns passed in.
714
715  Please note that most probably you would not need to use this function. Please
716  check `input_layer` and `linear_model` to see whether they will
717  satisfy your use case or not.
718
719  Example:
720
721  ```python
722  # Define features and transformations
723  crosses_a_x_b = crossed_column(
724      columns=["sparse_feature_a", "sparse_feature_b"], hash_bucket_size=10000)
725  price_buckets = bucketized_column(
726      source_column=numeric_column("price"), boundaries=[...])
727
728  columns = [crosses_a_x_b, price_buckets]
729  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
730  transformed = transform_features(features=features, feature_columns=columns)
731
732  assertCountEqual(columns, transformed.keys())
733  ```
734
735  Args:
736    features: A mapping from key to tensors. `_FeatureColumn`s look up via these
737      keys. For example `numeric_column('price')` will look at 'price' key in
738      this dict. Values can be a `SparseTensor` or a `Tensor` depends on
739      corresponding `_FeatureColumn`.
740    feature_columns: An iterable containing all the `_FeatureColumn`s.
741
742  Returns:
743    A `dict` mapping `_FeatureColumn` to `Tensor` and `SparseTensor` values.
744  """
745  feature_columns = _normalize_feature_columns(feature_columns)
746  outputs = {}
747  with ops.name_scope(
748      None, default_name='transform_features', values=features.values()):
749    builder = _LazyBuilder(features)
750    for column in sorted(feature_columns, key=lambda x: x.name):
751      with ops.name_scope(None, default_name=column.name):
752        outputs[column] = builder.get(column)
753  return outputs
754
755
756@tf_export(v1=['feature_column.make_parse_example_spec'])
757def make_parse_example_spec(feature_columns):
758  """Creates parsing spec dictionary from input feature_columns.
759
760  The returned dictionary can be used as arg 'features' in
761  `tf.io.parse_example`.
762
763  Typical usage example:
764
765  ```python
766  # Define features and transformations
767  feature_a = categorical_column_with_vocabulary_file(...)
768  feature_b = numeric_column(...)
769  feature_c_bucketized = bucketized_column(numeric_column("feature_c"), ...)
770  feature_a_x_feature_c = crossed_column(
771      columns=["feature_a", feature_c_bucketized], ...)
772
773  feature_columns = set(
774      [feature_b, feature_c_bucketized, feature_a_x_feature_c])
775  features = tf.io.parse_example(
776      serialized=serialized_examples,
777      features=make_parse_example_spec(feature_columns))
778  ```
779
780  For the above example, make_parse_example_spec would return the dict:
781
782  ```python
783  {
784      "feature_a": parsing_ops.VarLenFeature(tf.string),
785      "feature_b": parsing_ops.FixedLenFeature([1], dtype=tf.float32),
786      "feature_c": parsing_ops.FixedLenFeature([1], dtype=tf.float32)
787  }
788  ```
789
790  Args:
791    feature_columns: An iterable containing all feature columns. All items
792      should be instances of classes derived from `_FeatureColumn`.
793
794  Returns:
795    A dict mapping each feature key to a `FixedLenFeature` or `VarLenFeature`
796    value.
797
798  Raises:
799    ValueError: If any of the given `feature_columns` is not a `_FeatureColumn`
800      instance.
801  """
802  result = {}
803  for column in feature_columns:
804    if not isinstance(column, _FeatureColumn):
805      raise ValueError(
806          'All feature_columns must be _FeatureColumn instances. '
807          'Given: {}'.format(column))
808    config = column._parse_example_spec  # pylint: disable=protected-access
809    for key, value in six.iteritems(config):
810      if key in result and value != result[key]:
811        raise ValueError(
812            'feature_columns contain different parse_spec for key '
813            '{}. Given {} and {}'.format(key, value, result[key]))
814    result.update(config)
815  return result
816
817
818def _embedding_column(categorical_column,
819                      dimension,
820                      combiner='mean',
821                      initializer=None,
822                      ckpt_to_load_from=None,
823                      tensor_name_in_ckpt=None,
824                      max_norm=None,
825                      trainable=True,
826                      use_safe_embedding_lookup=True):
827  """`_DenseColumn` that converts from sparse, categorical input.
828
829  Use this when your inputs are sparse, but you want to convert them to a dense
830  representation (e.g., to feed to a DNN).
831
832  Inputs must be a `_CategoricalColumn` created by any of the
833  `categorical_column_*` function. Here is an example of using
834  `embedding_column` with `DNNClassifier`:
835
836  ```python
837  video_id = categorical_column_with_identity(
838      key='video_id', num_buckets=1000000, default_value=0)
839  columns = [embedding_column(video_id, 9),...]
840
841  estimator = tf.estimator.DNNClassifier(feature_columns=columns, ...)
842
843  label_column = ...
844  def input_fn():
845    features = tf.io.parse_example(
846        ..., features=make_parse_example_spec(columns + [label_column]))
847    labels = features.pop(label_column.name)
848    return features, labels
849
850  estimator.train(input_fn=input_fn, steps=100)
851  ```
852
853  Here is an example using `embedding_column` with model_fn:
854
855  ```python
856  def model_fn(features, ...):
857    video_id = categorical_column_with_identity(
858        key='video_id', num_buckets=1000000, default_value=0)
859    columns = [embedding_column(video_id, 9),...]
860    dense_tensor = input_layer(features, columns)
861    # Form DNN layers, calculate loss, and return EstimatorSpec.
862    ...
863  ```
864
865  Args:
866    categorical_column: A `_CategoricalColumn` created by a
867      `categorical_column_with_*` function. This column produces the sparse IDs
868      that are inputs to the embedding lookup.
869    dimension: An integer specifying dimension of the embedding, must be > 0.
870    combiner: A string specifying how to reduce if there are multiple entries
871      in a single row. Currently 'mean', 'sqrtn' and 'sum' are supported, with
872      'mean' the default. 'sqrtn' often achieves good accuracy, in particular
873      with bag-of-words columns. Each of this can be thought as example level
874      normalizations on the column. For more information, see
875      `tf.embedding_lookup_sparse`.
876    initializer: A variable initializer function to be used in embedding
877      variable initialization. If not specified, defaults to
878      `tf.compat.v1.truncated_normal_initializer` with mean `0.0` and
879      standard deviation `1/sqrt(dimension)`.
880    ckpt_to_load_from: String representing checkpoint name/pattern from which to
881      restore column weights. Required if `tensor_name_in_ckpt` is not `None`.
882    tensor_name_in_ckpt: Name of the `Tensor` in `ckpt_to_load_from` from
883      which to restore the column weights. Required if `ckpt_to_load_from` is
884      not `None`.
885    max_norm: If not `None`, embedding values are l2-normalized to this value.
886    trainable: Whether or not the embedding is trainable. Default is True.
887    use_safe_embedding_lookup: If true, uses safe_embedding_lookup_sparse
888      instead of embedding_lookup_sparse. safe_embedding_lookup_sparse ensures
889      there are no empty rows and all weights and ids are positive at the
890      expense of extra compute cost. This only applies to rank 2 (NxM) shaped
891      input tensors. Defaults to true, consider turning off if the above checks
892      are not needed. Note that having empty rows will not trigger any error
893      though the output result might be 0 or omitted.
894
895  Returns:
896    `_DenseColumn` that converts from sparse input.
897
898  Raises:
899    ValueError: if `dimension` not > 0.
900    ValueError: if exactly one of `ckpt_to_load_from` and `tensor_name_in_ckpt`
901      is specified.
902    ValueError: if `initializer` is specified and is not callable.
903    RuntimeError: If eager execution is enabled.
904  """
905  if (dimension is None) or (dimension < 1):
906    raise ValueError('Invalid dimension {}.'.format(dimension))
907  if (ckpt_to_load_from is None) != (tensor_name_in_ckpt is None):
908    raise ValueError('Must specify both `ckpt_to_load_from` and '
909                     '`tensor_name_in_ckpt` or none of them.')
910
911  if (initializer is not None) and (not callable(initializer)):
912    raise ValueError('initializer must be callable if specified. '
913                     'Embedding of column_name: {}'.format(
914                         categorical_column.name))
915  if initializer is None:
916    initializer = init_ops.truncated_normal_initializer(
917        mean=0.0, stddev=1 / math.sqrt(dimension))
918
919  embedding_shape = categorical_column._num_buckets, dimension  # pylint: disable=protected-access
920
921  def _creator(weight_collections, scope):
922    embedding_column_layer = _EmbeddingColumnLayer(
923        embedding_shape=embedding_shape,
924        initializer=initializer,
925        weight_collections=weight_collections,
926        trainable=trainable,
927        name='embedding_column_layer')
928    return embedding_column_layer(None, scope=scope)  # pylint: disable=not-callable
929
930  return _EmbeddingColumn(
931      categorical_column=categorical_column,
932      dimension=dimension,
933      combiner=combiner,
934      layer_creator=_creator,
935      ckpt_to_load_from=ckpt_to_load_from,
936      tensor_name_in_ckpt=tensor_name_in_ckpt,
937      max_norm=max_norm,
938      trainable=trainable,
939      use_safe_embedding_lookup=use_safe_embedding_lookup)
940
941
942def _numeric_column(key,
943                    shape=(1,),
944                    default_value=None,
945                    dtype=dtypes.float32,
946                    normalizer_fn=None):
947  """Represents real valued or numerical features.
948
949  Example:
950
951  ```python
952  price = numeric_column('price')
953  columns = [price, ...]
954  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
955  dense_tensor = input_layer(features, columns)
956
957  # or
958  bucketized_price = bucketized_column(price, boundaries=[...])
959  columns = [bucketized_price, ...]
960  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
961  linear_prediction = linear_model(features, columns)
962  ```
963
964  Args:
965    key: A unique string identifying the input feature. It is used as the
966      column name and the dictionary key for feature parsing configs, feature
967      `Tensor` objects, and feature columns.
968    shape: An iterable of integers specifies the shape of the `Tensor`. An
969      integer can be given which means a single dimension `Tensor` with given
970      width. The `Tensor` representing the column will have the shape of
971      [batch_size] + `shape`.
972    default_value: A single value compatible with `dtype` or an iterable of
973      values compatible with `dtype` which the column takes on during
974      `tf.Example` parsing if data is missing. A default value of `None` will
975      cause `tf.io.parse_example` to fail if an example does not contain this
976      column. If a single value is provided, the same value will be applied as
977      the default value for every item. If an iterable of values is provided,
978      the shape of the `default_value` should be equal to the given `shape`.
979    dtype: defines the type of values. Default value is `tf.float32`. Must be a
980      non-quantized, real integer or floating point type.
981    normalizer_fn: If not `None`, a function that can be used to normalize the
982      value of the tensor after `default_value` is applied for parsing.
983      Normalizer function takes the input `Tensor` as its argument, and returns
984      the output `Tensor`. (e.g. lambda x: (x - 3.0) / 4.2). Please note that
985      even though the most common use case of this function is normalization, it
986      can be used for any kind of Tensorflow transformations.
987
988  Returns:
989    A `_NumericColumn`.
990
991  Raises:
992    TypeError: if any dimension in shape is not an int
993    ValueError: if any dimension in shape is not a positive integer
994    TypeError: if `default_value` is an iterable but not compatible with `shape`
995    TypeError: if `default_value` is not compatible with `dtype`.
996    ValueError: if `dtype` is not convertible to `tf.float32`.
997  """
998  shape = _check_shape(shape, key)
999  if not (dtype.is_integer or dtype.is_floating):
1000    raise ValueError('dtype must be convertible to float. '
1001                     'dtype: {}, key: {}'.format(dtype, key))
1002  default_value = fc_utils.check_default_value(
1003      shape, default_value, dtype, key)
1004
1005  if normalizer_fn is not None and not callable(normalizer_fn):
1006    raise TypeError(
1007        'normalizer_fn must be a callable. Given: {}'.format(normalizer_fn))
1008
1009  fc_utils.assert_key_is_string(key)
1010  return _NumericColumn(
1011      key,
1012      shape=shape,
1013      default_value=default_value,
1014      dtype=dtype,
1015      normalizer_fn=normalizer_fn)
1016
1017
1018def _bucketized_column(source_column, boundaries):
1019  """Represents discretized dense input.
1020
1021  Buckets include the left boundary, and exclude the right boundary. Namely,
1022  `boundaries=[0., 1., 2.]` generates buckets `(-inf, 0.)`, `[0., 1.)`,
1023  `[1., 2.)`, and `[2., +inf)`.
1024
1025  For example, if the inputs are
1026
1027  ```python
1028  boundaries = [0, 10, 100]
1029  input tensor = [[-5, 10000]
1030                  [150,   10]
1031                  [5,    100]]
1032  ```
1033
1034  then the output will be
1035
1036  ```python
1037  output = [[0, 3]
1038            [3, 2]
1039            [1, 3]]
1040  ```
1041
1042  Example:
1043
1044  ```python
1045  price = numeric_column('price')
1046  bucketized_price = bucketized_column(price, boundaries=[...])
1047  columns = [bucketized_price, ...]
1048  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1049  linear_prediction = linear_model(features, columns)
1050
1051  # or
1052  columns = [bucketized_price, ...]
1053  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1054  dense_tensor = input_layer(features, columns)
1055  ```
1056
1057  A `bucketized_column` can also be crossed with another categorical column
1058  using `crossed_column`:
1059
1060  ```python
1061  price = numeric_column('price')
1062  # bucketized_column converts numerical feature to a categorical one.
1063  bucketized_price = bucketized_column(price, boundaries=[...])
1064  # 'keywords' is a string feature.
1065  price_x_keywords = crossed_column([bucketized_price, 'keywords'], 50K)
1066  columns = [price_x_keywords, ...]
1067  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1068  linear_prediction = linear_model(features, columns)
1069  ```
1070
1071  Args:
1072    source_column: A one-dimensional dense column which is generated with
1073      `numeric_column`.
1074    boundaries: A sorted list or tuple of floats specifying the boundaries.
1075
1076  Returns:
1077    A `_BucketizedColumn`.
1078
1079  Raises:
1080    ValueError: If `source_column` is not a numeric column, or if it is not
1081      one-dimensional.
1082    ValueError: If `boundaries` is not a sorted list or tuple.
1083  """
1084  if not isinstance(source_column, _NumericColumn):
1085    raise ValueError(
1086        'source_column must be a column generated with numeric_column(). '
1087        'Given: {}'.format(source_column))
1088  if len(source_column.shape) > 1:
1089    raise ValueError(
1090        'source_column must be one-dimensional column. '
1091        'Given: {}'.format(source_column))
1092  if (not boundaries or
1093      not (isinstance(boundaries, list) or isinstance(boundaries, tuple))):
1094    raise ValueError('boundaries must be a sorted list.')
1095  for i in range(len(boundaries) - 1):
1096    if boundaries[i] >= boundaries[i + 1]:
1097      raise ValueError('boundaries must be a sorted list.')
1098  return _BucketizedColumn(source_column, tuple(boundaries))
1099
1100
1101def _categorical_column_with_hash_bucket(key,
1102                                         hash_bucket_size,
1103                                         dtype=dtypes.string):
1104  """Represents sparse feature where ids are set by hashing.
1105
1106  Use this when your sparse features are in string or integer format, and you
1107  want to distribute your inputs into a finite number of buckets by hashing.
1108  output_id = Hash(input_feature_string) % bucket_size for string type input.
1109  For int type input, the value is converted to its string representation first
1110  and then hashed by the same formula.
1111
1112  For input dictionary `features`, `features[key]` is either `Tensor` or
1113  `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for int
1114  and `''` for string, which will be dropped by this feature column.
1115
1116  Example:
1117
1118  ```python
1119  keywords = categorical_column_with_hash_bucket("keywords", 10K)
1120  columns = [keywords, ...]
1121  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1122  linear_prediction = linear_model(features, columns)
1123
1124  # or
1125  keywords_embedded = embedding_column(keywords, 16)
1126  columns = [keywords_embedded, ...]
1127  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1128  dense_tensor = input_layer(features, columns)
1129  ```
1130
1131  Args:
1132    key: A unique string identifying the input feature. It is used as the
1133      column name and the dictionary key for feature parsing configs, feature
1134      `Tensor` objects, and feature columns.
1135    hash_bucket_size: An int > 1. The number of buckets.
1136    dtype: The type of features. Only string and integer types are supported.
1137
1138  Returns:
1139    A `_HashedCategoricalColumn`.
1140
1141  Raises:
1142    ValueError: `hash_bucket_size` is not greater than 1.
1143    ValueError: `dtype` is neither string nor integer.
1144  """
1145  if hash_bucket_size is None:
1146    raise ValueError('hash_bucket_size must be set. ' 'key: {}'.format(key))
1147
1148  if hash_bucket_size < 1:
1149    raise ValueError('hash_bucket_size must be at least 1. '
1150                     'hash_bucket_size: {}, key: {}'.format(
1151                         hash_bucket_size, key))
1152
1153  fc_utils.assert_key_is_string(key)
1154  fc_utils.assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
1155
1156  return _HashedCategoricalColumn(key, hash_bucket_size, dtype)
1157
1158
1159def _categorical_column_with_vocabulary_file(key,
1160                                             vocabulary_file,
1161                                             vocabulary_size=None,
1162                                             num_oov_buckets=0,
1163                                             default_value=None,
1164                                             dtype=dtypes.string):
1165  """A `_CategoricalColumn` with a vocabulary file.
1166
1167  Use this when your inputs are in string or integer format, and you have a
1168  vocabulary file that maps each value to an integer ID. By default,
1169  out-of-vocabulary values are ignored. Use either (but not both) of
1170  `num_oov_buckets` and `default_value` to specify how to include
1171  out-of-vocabulary values.
1172
1173  For input dictionary `features`, `features[key]` is either `Tensor` or
1174  `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for int
1175  and `''` for string, which will be dropped by this feature column.
1176
1177  Example with `num_oov_buckets`:
1178  File '/us/states.txt' contains 50 lines, each with a 2-character U.S. state
1179  abbreviation. All inputs with values in that file are assigned an ID 0-49,
1180  corresponding to its line number. All other values are hashed and assigned an
1181  ID 50-54.
1182
1183  ```python
1184  states = categorical_column_with_vocabulary_file(
1185      key='states', vocabulary_file='/us/states.txt', vocabulary_size=50,
1186      num_oov_buckets=5)
1187  columns = [states, ...]
1188  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1189  linear_prediction = linear_model(features, columns)
1190  ```
1191
1192  Example with `default_value`:
1193  File '/us/states.txt' contains 51 lines - the first line is 'XX', and the
1194  other 50 each have a 2-character U.S. state abbreviation. Both a literal 'XX'
1195  in input, and other values missing from the file, will be assigned ID 0. All
1196  others are assigned the corresponding line number 1-50.
1197
1198  ```python
1199  states = categorical_column_with_vocabulary_file(
1200      key='states', vocabulary_file='/us/states.txt', vocabulary_size=51,
1201      default_value=0)
1202  columns = [states, ...]
1203  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1204  linear_prediction, _, _ = linear_model(features, columns)
1205  ```
1206
1207  And to make an embedding with either:
1208
1209  ```python
1210  columns = [embedding_column(states, 3),...]
1211  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1212  dense_tensor = input_layer(features, columns)
1213  ```
1214
1215  Args:
1216    key: A unique string identifying the input feature. It is used as the
1217      column name and the dictionary key for feature parsing configs, feature
1218      `Tensor` objects, and feature columns.
1219    vocabulary_file: The vocabulary file name.
1220    vocabulary_size: Number of the elements in the vocabulary. This must be no
1221      greater than length of `vocabulary_file`, if less than length, later
1222      values are ignored. If None, it is set to the length of `vocabulary_file`.
1223    num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
1224      buckets. All out-of-vocabulary inputs will be assigned IDs in the range
1225      `[vocabulary_size, vocabulary_size+num_oov_buckets)` based on a hash of
1226      the input value. A positive `num_oov_buckets` can not be specified with
1227      `default_value`.
1228    default_value: The integer ID value to return for out-of-vocabulary feature
1229      values, defaults to `-1`. This can not be specified with a positive
1230      `num_oov_buckets`.
1231    dtype: The type of features. Only string and integer types are supported.
1232
1233  Returns:
1234    A `_CategoricalColumn` with a vocabulary file.
1235
1236  Raises:
1237    ValueError: `vocabulary_file` is missing or cannot be opened.
1238    ValueError: `vocabulary_size` is missing or < 1.
1239    ValueError: `num_oov_buckets` is a negative integer.
1240    ValueError: `num_oov_buckets` and `default_value` are both specified.
1241    ValueError: `dtype` is neither string nor integer.
1242  """
1243  if not vocabulary_file:
1244    raise ValueError('Missing vocabulary_file in {}.'.format(key))
1245
1246  if vocabulary_size is None:
1247    if not gfile.Exists(vocabulary_file):
1248      raise ValueError('vocabulary_file in {} does not exist.'.format(key))
1249
1250    with gfile.GFile(vocabulary_file) as f:
1251      vocabulary_size = sum(1 for _ in f)
1252    logging.info(
1253        'vocabulary_size = %d in %s is inferred from the number of elements '
1254        'in the vocabulary_file %s.', vocabulary_size, key, vocabulary_file)
1255
1256  # `vocabulary_size` isn't required for lookup, but it is for `_num_buckets`.
1257  if vocabulary_size < 1:
1258    raise ValueError('Invalid vocabulary_size in {}.'.format(key))
1259  if num_oov_buckets:
1260    if default_value is not None:
1261      raise ValueError(
1262          'Can\'t specify both num_oov_buckets and default_value in {}.'.format(
1263              key))
1264    if num_oov_buckets < 0:
1265      raise ValueError('Invalid num_oov_buckets {} in {}.'.format(
1266          num_oov_buckets, key))
1267  fc_utils.assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
1268  fc_utils.assert_key_is_string(key)
1269  return _VocabularyFileCategoricalColumn(
1270      key=key,
1271      vocabulary_file=vocabulary_file,
1272      vocabulary_size=vocabulary_size,
1273      num_oov_buckets=0 if num_oov_buckets is None else num_oov_buckets,
1274      default_value=-1 if default_value is None else default_value,
1275      dtype=dtype)
1276
1277
1278def _categorical_column_with_vocabulary_list(key,
1279                                             vocabulary_list,
1280                                             dtype=None,
1281                                             default_value=-1,
1282                                             num_oov_buckets=0):
1283  """A `_CategoricalColumn` with in-memory vocabulary.
1284
1285  Use this when your inputs are in string or integer format, and you have an
1286  in-memory vocabulary mapping each value to an integer ID. By default,
1287  out-of-vocabulary values are ignored. Use either (but not both) of
1288  `num_oov_buckets` and `default_value` to specify how to include
1289  out-of-vocabulary values.
1290
1291  For input dictionary `features`, `features[key]` is either `Tensor` or
1292  `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for int
1293  and `''` for string, which will be dropped by this feature column.
1294
1295  Example with `num_oov_buckets`:
1296  In the following example, each input in `vocabulary_list` is assigned an ID
1297  0-3 corresponding to its index (e.g., input 'B' produces output 2). All other
1298  inputs are hashed and assigned an ID 4-5.
1299
1300  ```python
1301  colors = categorical_column_with_vocabulary_list(
1302      key='colors', vocabulary_list=('R', 'G', 'B', 'Y'),
1303      num_oov_buckets=2)
1304  columns = [colors, ...]
1305  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1306  linear_prediction, _, _ = linear_model(features, columns)
1307  ```
1308
1309  Example with `default_value`:
1310  In the following example, each input in `vocabulary_list` is assigned an ID
1311  0-4 corresponding to its index (e.g., input 'B' produces output 3). All other
1312  inputs are assigned `default_value` 0.
1313
1314
1315  ```python
1316  colors = categorical_column_with_vocabulary_list(
1317      key='colors', vocabulary_list=('X', 'R', 'G', 'B', 'Y'), default_value=0)
1318  columns = [colors, ...]
1319  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1320  linear_prediction, _, _ = linear_model(features, columns)
1321  ```
1322
1323  And to make an embedding with either:
1324
1325  ```python
1326  columns = [embedding_column(colors, 3),...]
1327  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1328  dense_tensor = input_layer(features, columns)
1329  ```
1330
1331  Args:
1332    key: A unique string identifying the input feature. It is used as the
1333      column name and the dictionary key for feature parsing configs, feature
1334      `Tensor` objects, and feature columns.
1335    vocabulary_list: An ordered iterable defining the vocabulary. Each feature
1336      is mapped to the index of its value (if present) in `vocabulary_list`.
1337      Must be castable to `dtype`.
1338    dtype: The type of features. Only string and integer types are supported.
1339      If `None`, it will be inferred from `vocabulary_list`.
    default_value: The integer ID value to return for out-of-vocabulary feature
      values, defaults to `-1`. This cannot be specified with a positive
      `num_oov_buckets`.
1343    num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
1344      buckets. All out-of-vocabulary inputs will be assigned IDs in the range
1345      `[len(vocabulary_list), len(vocabulary_list)+num_oov_buckets)` based on a
      hash of the input value. A positive `num_oov_buckets` cannot be specified
      with `default_value`.
1348
1349  Returns:
1350    A `_CategoricalColumn` with in-memory vocabulary.
1351
1352  Raises:
1353    ValueError: if `vocabulary_list` is empty, or contains duplicate keys.
    ValueError: if `num_oov_buckets` is a negative integer.
    ValueError: if `num_oov_buckets` and `default_value` are both specified.
1356    ValueError: if `dtype` is not integer or string.
1357  """
1358  if (vocabulary_list is None) or (len(vocabulary_list) < 1):
1359    raise ValueError(
1360        'vocabulary_list {} must be non-empty, column_name: {}'.format(
1361            vocabulary_list, key))
1362  if len(set(vocabulary_list)) != len(vocabulary_list):
1363    raise ValueError(
1364        'Duplicate keys in vocabulary_list {}, column_name: {}'.format(
1365            vocabulary_list, key))
1366  vocabulary_dtype = dtypes.as_dtype(np.array(vocabulary_list).dtype)
1367  if num_oov_buckets:
1368    if default_value != -1:
1369      raise ValueError(
1370          'Can\'t specify both num_oov_buckets and default_value in {}.'.format(
1371              key))
1372    if num_oov_buckets < 0:
1373      raise ValueError('Invalid num_oov_buckets {} in {}.'.format(
1374          num_oov_buckets, key))
1375  fc_utils.assert_string_or_int(
1376      vocabulary_dtype, prefix='column_name: {} vocabulary'.format(key))
1377  if dtype is None:
1378    dtype = vocabulary_dtype
1379  elif dtype.is_integer != vocabulary_dtype.is_integer:
1380    raise ValueError(
1381        'dtype {} and vocabulary dtype {} do not match, column_name: {}'.format(
1382            dtype, vocabulary_dtype, key))
1383  fc_utils.assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
1384  fc_utils.assert_key_is_string(key)
1385
1386  return _VocabularyListCategoricalColumn(
1387      key=key, vocabulary_list=tuple(vocabulary_list), dtype=dtype,
1388      default_value=default_value, num_oov_buckets=num_oov_buckets)
1389
1390
1391def _categorical_column_with_identity(key, num_buckets, default_value=None):
1392  """A `_CategoricalColumn` that returns identity values.
1393
1394  Use this when your inputs are integers in the range `[0, num_buckets)`, and
1395  you want to use the input value itself as the categorical ID. Values outside
1396  this range will result in `default_value` if specified, otherwise it will
1397  fail.
1398
1399  Typically, this is used for contiguous ranges of integer indexes, but
  it doesn't have to be. This might be inefficient, however, if many of the
  IDs are unused. Consider `categorical_column_with_hash_bucket` in that case.
1402
  For input dictionary `features`, `features[key]` is either `Tensor` or
  `SparseTensor`. Since inputs must be integers here, missing values can be
  represented by `-1`, and such entries will be dropped by this feature column.
1406
  In the following examples, each input in the range `[0, 1000000)` is assigned
  its own value as the categorical ID. All other inputs are assigned
  `default_value` 0. Note that a literal 0 in inputs will result in the same
  default ID.
1410
1411  Linear model:
1412
1413  ```python
1414  video_id = categorical_column_with_identity(
1415      key='video_id', num_buckets=1000000, default_value=0)
1416  columns = [video_id, ...]
1417  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1418  linear_prediction, _, _ = linear_model(features, columns)
1419  ```
1420
1421  Embedding for a DNN model:
1422
1423  ```python
1424  columns = [embedding_column(video_id, 9),...]
1425  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1426  dense_tensor = input_layer(features, columns)
1427  ```
1428
1429  Args:
1430    key: A unique string identifying the input feature. It is used as the
1431      column name and the dictionary key for feature parsing configs, feature
1432      `Tensor` objects, and feature columns.
1433    num_buckets: Range of inputs and outputs is `[0, num_buckets)`.
1434    default_value: If set, values outside of range `[0, num_buckets)` will
1435      be replaced with this value. If not set, values >= num_buckets will
1436      cause a failure while values < 0 will be dropped.
1437
1438  Returns:
1439    A `_CategoricalColumn` that returns identity values.
1440
1441  Raises:
1442    ValueError: if `num_buckets` is less than one.
1443    ValueError: if `default_value` is not in range `[0, num_buckets)`.
1444  """
1445  if num_buckets < 1:
1446    raise ValueError(
1447        'num_buckets {} < 1, column_name {}'.format(num_buckets, key))
1448  if (default_value is not None) and (
1449      (default_value < 0) or (default_value >= num_buckets)):
1450    raise ValueError(
1451        'default_value {} not in range [0, {}), column_name {}'.format(
1452            default_value, num_buckets, key))
1453  fc_utils.assert_key_is_string(key)
1454  return _IdentityCategoricalColumn(
1455      key=key, num_buckets=num_buckets, default_value=default_value)
1456
1457
1458def _indicator_column(categorical_column):
1459  """Represents multi-hot representation of given categorical column.
1460
1461  - For DNN model, `indicator_column` can be used to wrap any
1462    `categorical_column_*` (e.g., to feed to DNN). Consider to Use
1463    `embedding_column` if the number of buckets/unique(values) are large.
1464
  - For a wide (aka linear) model, `indicator_column` is the internal
    representation used when a categorical column is passed directly (as any
    element in feature_columns) to `linear_model`. See `linear_model` for
    details.
1469
1470  ```python
1471  name = indicator_column(categorical_column_with_vocabulary_list(
      'name', ['bob', 'george', 'wanda']))
1473  columns = [name, ...]
1474  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1475  dense_tensor = input_layer(features, columns)
1476
1477  dense_tensor == [[1, 0, 0]]  # If "name" bytes_list is ["bob"]
1478  dense_tensor == [[1, 0, 1]]  # If "name" bytes_list is ["bob", "wanda"]
1479  dense_tensor == [[2, 0, 0]]  # If "name" bytes_list is ["bob", "bob"]
1480  ```
1481
1482  Args:
1483    categorical_column: A `_CategoricalColumn` which is created by
1484      `categorical_column_with_*` or `crossed_column` functions.
1485
1486  Returns:
1487    An `_IndicatorColumn`.
1488  """
1489  return _IndicatorColumn(categorical_column)
1490
1491
1492def _weighted_categorical_column(categorical_column,
1493                                 weight_feature_key,
1494                                 dtype=dtypes.float32):
1495  """Applies weight values to a `_CategoricalColumn`.
1496
1497  Use this when each of your sparse inputs has both an ID and a value. For
1498  example, if you're representing text documents as a collection of word
1499  frequencies, you can provide 2 parallel sparse input features ('terms' and
1500  'frequencies' below).
1501
1502  Example:
1503
1504  Input `tf.Example` objects:
1505
1506  ```proto
1507  [
1508    features {
1509      feature {
1510        key: "terms"
1511        value {bytes_list {value: "very" value: "model"}}
1512      }
1513      feature {
1514        key: "frequencies"
1515        value {float_list {value: 0.3 value: 0.1}}
1516      }
1517    },
1518    features {
1519      feature {
1520        key: "terms"
1521        value {bytes_list {value: "when" value: "course" value: "human"}}
1522      }
1523      feature {
1524        key: "frequencies"
1525        value {float_list {value: 0.4 value: 0.1 value: 0.2}}
1526      }
1527    }
1528  ]
1529  ```
1530
1531  ```python
1532  categorical_column = categorical_column_with_hash_bucket(
1533      column_name='terms', hash_bucket_size=1000)
1534  weighted_column = weighted_categorical_column(
1535      categorical_column=categorical_column, weight_feature_key='frequencies')
1536  columns = [weighted_column, ...]
1537  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1538  linear_prediction, _, _ = linear_model(features, columns)
1539  ```
1540
1541  This assumes the input dictionary contains a `SparseTensor` for key
1542  'terms', and a `SparseTensor` for key 'frequencies'. These 2 tensors must have
1543  the same indices and dense shape.
1544
1545  Args:
1546    categorical_column: A `_CategoricalColumn` created by
1547      `categorical_column_with_*` functions.
1548    weight_feature_key: String key for weight values.
1549    dtype: Type of weights, such as `tf.float32`. Only float and integer weights
1550      are supported.
1551
1552  Returns:
    A `_CategoricalColumn` composed of two sparse features: one represents the
    IDs, the other represents the weight (value) of each ID in that example.
1555
1556  Raises:
1557    ValueError: if `dtype` is not convertible to float.
1558  """
1559  if (dtype is None) or not (dtype.is_integer or dtype.is_floating):
1560    raise ValueError('dtype {} is not convertible to float.'.format(dtype))
1561  return _WeightedCategoricalColumn(
1562      categorical_column=categorical_column,
1563      weight_feature_key=weight_feature_key,
1564      dtype=dtype)
1565
1566
1567def _crossed_column(keys, hash_bucket_size, hash_key=None):
1568  """Returns a column for performing crosses of categorical features.
1569
1570  Crossed features will be hashed according to `hash_bucket_size`. Conceptually,
1571  the transformation can be thought of as:
    Hash(Cartesian product of features) % `hash_bucket_size`
1573
1574  For example, if the input features are:
1575
1576  * SparseTensor referred by first key:
1577
1578    ```python
1579    shape = [2, 2]
1580    {
1581        [0, 0]: "a"
1582        [1, 0]: "b"
1583        [1, 1]: "c"
1584    }
1585    ```
1586
1587  * SparseTensor referred by second key:
1588
1589    ```python
1590    shape = [2, 1]
1591    {
1592        [0, 0]: "d"
1593        [1, 0]: "e"
1594    }
1595    ```
1596
  then the crossed feature will look like:

  ```python
  shape = [2, 2]
1601  {
1602      [0, 0]: Hash64("d", Hash64("a")) % hash_bucket_size
1603      [1, 0]: Hash64("e", Hash64("b")) % hash_bucket_size
1604      [1, 1]: Hash64("e", Hash64("c")) % hash_bucket_size
1605  }
1606  ```
1607
  Here is an example of creating a linear model with crosses of string features:
1609
1610  ```python
  keywords_x_doc_terms = crossed_column(['keywords', 'doc_terms'], 50000)
1612  columns = [keywords_x_doc_terms, ...]
1613  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1614  linear_prediction = linear_model(features, columns)
1615  ```
1616
1617  You could also use vocabulary lookup before crossing:
1618
1619  ```python
1620  keywords = categorical_column_with_vocabulary_file(
      'keywords', '/path/to/vocabulary/file', vocabulary_size=1000)
  keywords_x_doc_terms = crossed_column([keywords, 'doc_terms'], 50000)
1623  columns = [keywords_x_doc_terms, ...]
1624  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1625  linear_prediction = linear_model(features, columns)
1626  ```
1627
  If an input feature is of numeric type, you can use
  `categorical_column_with_identity` or `bucketized_column`, as in the example:
1630
1631  ```python
1632  # vertical_id is an integer categorical feature.
  vertical_id = categorical_column_with_identity('vertical_id', 10000)
1634  price = numeric_column('price')
1635  # bucketized_column converts numerical feature to a categorical one.
1636  bucketized_price = bucketized_column(price, boundaries=[...])
  vertical_id_x_price = crossed_column([vertical_id, bucketized_price], 50000)
1638  columns = [vertical_id_x_price, ...]
1639  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1640  linear_prediction = linear_model(features, columns)
1641  ```
1642
  To use a crossed column in a DNN model, you need to wrap it in an embedding
  column, as in this example:
1645
1646  ```python
  vertical_id_x_price = crossed_column([vertical_id, bucketized_price], 50000)
1648  vertical_id_x_price_embedded = embedding_column(vertical_id_x_price, 10)
1649  dense_tensor = input_layer(features, [vertical_id_x_price_embedded, ...])
1650  ```
1651
1652  Args:
1653    keys: An iterable identifying the features to be crossed. Each element can
1654      be either:
1655      * string: Will use the corresponding feature which must be of string type.
1656      * `_CategoricalColumn`: Will use the transformed tensor produced by this
1657        column. Does not support hashed categorical column.
    hash_bucket_size: An int >= 1. The number of buckets.
    hash_key: Specify the hash_key that will be used by the `FingerprintCat64`
      function to combine the crosses' fingerprints in SparseCrossOp
      (optional).
1661
1662  Returns:
1663    A `_CrossedColumn`.
1664
1665  Raises:
1666    ValueError: If `len(keys) < 2`.
1667    ValueError: If any of the keys is neither a string nor `_CategoricalColumn`.
1668    ValueError: If any of the keys is `_HashedCategoricalColumn`.
1669    ValueError: If `hash_bucket_size < 1`.
1670  """
1671  if not hash_bucket_size or hash_bucket_size < 1:
    raise ValueError('hash_bucket_size must be at least 1. '
1673                     'hash_bucket_size: {}'.format(hash_bucket_size))
1674  if not keys or len(keys) < 2:
1675    raise ValueError(
1676        'keys must be a list with length > 1. Given: {}'.format(keys))
1677  for key in keys:
1678    if (not isinstance(key, six.string_types) and
1679        not isinstance(key, _CategoricalColumn)):
1680      raise ValueError(
1681          'Unsupported key type. All keys must be either string, or '
1682          'categorical column except _HashedCategoricalColumn. '
1683          'Given: {}'.format(key))
1684    if isinstance(key, _HashedCategoricalColumn):
1685      raise ValueError(
1686          'categorical_column_with_hash_bucket is not supported for crossing. '
1687          'Hashing before crossing will increase probability of collision. '
1688          'Instead, use the feature name as a string. Given: {}'.format(key))
1689  return _CrossedColumn(
1690      keys=tuple(keys), hash_bucket_size=hash_bucket_size,
1691      hash_key=hash_key)
1692
1693
1694# TODO(rohanj): Clearly define semantics of this layer.
1695class _EmbeddingColumnLayer(base.Layer):
1696  """A layer that stores all the state required for a embedding column."""
1697
1698  def __init__(self,
1699               embedding_shape,
1700               initializer,
1701               weight_collections=None,
1702               trainable=True,
1703               name=None,
1704               **kwargs):
1705    """Constructor.
1706
1707    Args:
1708      embedding_shape: Shape of the embedding variable used for lookup.
1709      initializer: A variable initializer function to be used in embedding
1710        variable initialization.
1711      weight_collections: A list of collection names to which the Variable will
1712        be added. Note that, variables will also be added to collections
1713        `tf.GraphKeys.GLOBAL_VARIABLES` and `ops.GraphKeys.MODEL_VARIABLES`.
1714      trainable: If `True` also add the variable to the graph collection
1715        `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
      name: Name of the layer.
      **kwargs: Keyword named properties.
1718    """
1719    super(_EmbeddingColumnLayer, self).__init__(
1720        trainable=trainable, name=name, **kwargs)
1721    self._embedding_shape = embedding_shape
1722    self._initializer = initializer
1723    self._weight_collections = weight_collections
1724
1725  def set_weight_collections(self, weight_collections):
1726    """Sets the weight collections for the layer.
1727
1728    Args:
1729      weight_collections: A list of collection names to which the Variable will
1730        be added.
1731    """
1732    self._weight_collections = weight_collections
1733
1734  def build(self, _):
1735    self._embedding_weight_var = self.add_variable(
1736        name='embedding_weights',
1737        shape=self._embedding_shape,
1738        dtype=dtypes.float32,
1739        initializer=self._initializer,
1740        trainable=self.trainable)
1741    if self._weight_collections and not context.executing_eagerly():
1742      _add_to_collections(self._embedding_weight_var, self._weight_collections)
1743    self.built = True
1744
1745  def call(self, _):
1746    return self._embedding_weight_var
1747
1748
1749@six.add_metaclass(abc.ABCMeta)
1750class _FeatureColumn(object):
1751  """Represents a feature column abstraction.
1752
1753  WARNING: Do not subclass this layer unless you know what you are doing:
1754  the API is subject to future changes.
1755
  To distinguish between the concept of a feature family and a specific binary
  feature within a family, we refer to a feature family like "country" as a
  feature column. The following is an example feature in `tf.Example` format:
    {key: "country",  value: [ "US" ]}
  In this example the value of the feature is "US" and "country" refers to the
  column of the feature.

  This class is abstract. Users should not create instances of it.
1764  """
1765
1766  @abc.abstractproperty
1767  def name(self):
1768    """Returns string. Used for naming and for name_scope."""
1769    pass
1770
1771  def __lt__(self, other):
1772    """Allows feature columns to be sorted in Python 3 as they are in Python 2.
1773
1774    Feature columns need to occasionally be sortable, for example when used as
1775    keys in a features dictionary passed to a layer.
1776
1777    In CPython, `__lt__` must be defined for all objects in the
1778    sequence being sorted. If any objects do not have an `__lt__` compatible
1779    with feature column objects (such as strings), then CPython will fall back
1780    to using the `__gt__` method below.
1781    https://docs.python.org/3/library/stdtypes.html#list.sort
1782
1783    Args:
1784      other: The other object to compare to.
1785
1786    Returns:
1787      True if the string representation of this object is lexicographically less
1788      than the string representation of `other`. For FeatureColumn objects,
1789      this looks like "<__main__.FeatureColumn object at 0xa>".
1790    """
1791    return str(self) < str(other)
1792
1793  def __gt__(self, other):
1794    """Allows feature columns to be sorted in Python 3 as they are in Python 2.
1795
1796    Feature columns need to occasionally be sortable, for example when used as
1797    keys in a features dictionary passed to a layer.
1798
1799    `__gt__` is called when the "other" object being compared during the sort
1800    does not have `__lt__` defined.
1801    Example:
1802    ```
1803    # __lt__ only class
1804    class A():
1805      def __lt__(self, other): return str(self) < str(other)
1806
1807    a = A()
1808    a < "b" # True
1809    "0" < a # Error
1810
1811    # __lt__ and __gt__ class
1812    class B():
1813      def __lt__(self, other): return str(self) < str(other)
1814      def __gt__(self, other): return str(self) > str(other)
1815
1816    b = B()
1817    b < "c" # True
1818    "0" < b # True
1819    ```
1820
1821
1822    Args:
1823      other: The other object to compare to.
1824
1825    Returns:
1826      True if the string representation of this object is lexicographically
1827      greater than the string representation of `other`. For FeatureColumn
1828      objects, this looks like "<__main__.FeatureColumn object at 0xa>".
1829    """
1830    return str(self) > str(other)
1831
1832  @property
1833  def _var_scope_name(self):
1834    """Returns string. Used for variable_scope. Defaults to self.name."""
1835    return self.name
1836
1837  @abc.abstractmethod
1838  def _transform_feature(self, inputs):
1839    """Returns intermediate representation (usually a `Tensor`).
1840
1841    Uses `inputs` to create an intermediate representation (usually a `Tensor`)
1842    that other feature columns can use.
1843
1844    Example usage of `inputs`:
1845    Let's say a Feature column depends on raw feature ('raw') and another
    `_FeatureColumn` (input_fc). To access the corresponding `Tensor`s,
    `inputs` will be used as follows:
1848
1849    ```python
1850    raw_tensor = inputs.get('raw')
1851    fc_tensor = inputs.get(input_fc)
1852    ```
1853
1854    Args:
1855      inputs: A `_LazyBuilder` object to access inputs.
1856
1857    Returns:
1858      Transformed feature `Tensor`.
1859    """
1860    pass
1861
1862  @abc.abstractproperty
1863  def _parse_example_spec(self):
1864    """Returns a `tf.Example` parsing spec as dict.
1865
    It is used to generate the parsing spec for `tf.io.parse_example`. The
    returned spec is a dict from keys ('string') to `VarLenFeature`,
    `FixedLenFeature`, and other supported objects. Please check the
    documentation of `tf.io.parse_example` for all supported spec objects.
1870
1871    Let's say a Feature column depends on raw feature ('raw') and another
1872    `_FeatureColumn` (input_fc). One possible implementation of
1873    _parse_example_spec is as follows:
1874
1875    ```python
1876    spec = {'raw': tf.io.FixedLenFeature(...)}
1877    spec.update(input_fc._parse_example_spec)
1878    return spec
1879    ```
1880    """
1881    pass
1882
1883  def _reset_config(self):
1884    """Resets the configuration in the column.
1885
    Some feature columns, e.g. embedding or shared embedding columns, might
    have state that occasionally needs to be reset. Use this method in that
    scenario.
1889    """
1890
1891
1892class _DenseColumn(_FeatureColumn):
1893  """Represents a column which can be represented as `Tensor`.
1894
1895  WARNING: Do not subclass this layer unless you know what you are doing:
1896  the API is subject to future changes.
1897
1898  Some examples of this type are: numeric_column, embedding_column,
1899  indicator_column.
1900  """
1901
1902  @abc.abstractproperty
1903  def _variable_shape(self):
1904    """`TensorShape` of `_get_dense_tensor`, without batch dimension."""
1905    pass
1906
1907  @abc.abstractmethod
1908  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
1909    """Returns a `Tensor`.
1910
    The output of this function will be used by model-builder functions. For
    example, the pseudo code of `input_layer` will be like:
1913
1914    ```python
1915    def input_layer(features, feature_columns, ...):
1916      outputs = [fc._get_dense_tensor(...) for fc in feature_columns]
1917      return tf.concat(outputs)
1918    ```
1919
1920    Args:
1921      inputs: A `_LazyBuilder` object to access inputs.
      weight_collections: List of graph collections to which Variables (if
        any are created) are added.
1924      trainable: If `True` also add variables to the graph collection
1925        `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
1926
1927    Returns:
1928      `Tensor` of shape [batch_size] + `_variable_shape`.
1929    """
1930    pass
1931
1932
1933def _create_weighted_sum(column,
1934                         builder,
1935                         units,
1936                         sparse_combiner,
1937                         weight_collections,
1938                         trainable,
1939                         weight_var=None):
1940  """Creates a weighted sum for a dense/categorical column for linear_model."""
1941  if isinstance(column, _CategoricalColumn):
1942    return _create_categorical_column_weighted_sum(
1943        column=column,
1944        builder=builder,
1945        units=units,
1946        sparse_combiner=sparse_combiner,
1947        weight_collections=weight_collections,
1948        trainable=trainable,
1949        weight_var=weight_var)
1950  else:
1951    return _create_dense_column_weighted_sum(
1952        column=column,
1953        builder=builder,
1954        units=units,
1955        weight_collections=weight_collections,
1956        trainable=trainable,
1957        weight_var=weight_var)
1958
1959
1960def _create_dense_column_weighted_sum(column,
1961                                      builder,
1962                                      units,
1963                                      weight_collections,
1964                                      trainable,
1965                                      weight_var=None):
1966  """Create a weighted sum of a dense column for linear_model."""
1967  tensor = column._get_dense_tensor(  # pylint: disable=protected-access
1968      builder,
1969      weight_collections=weight_collections,
1970      trainable=trainable)
1971  num_elements = column._variable_shape.num_elements()  # pylint: disable=protected-access
1972  batch_size = array_ops.shape(tensor)[0]
1973  tensor = array_ops.reshape(tensor, shape=(batch_size, num_elements))
1974  if weight_var is not None:
1975    weight = weight_var
1976  else:
1977    weight = variable_scope.get_variable(
1978        name='weights',
1979        shape=[num_elements, units],
1980        initializer=init_ops.zeros_initializer(),
1981        trainable=trainable,
1982        collections=weight_collections)
1983  return math_ops.matmul(tensor, weight, name='weighted_sum')
1984
1985
1986class _CategoricalColumn(_FeatureColumn):
1987  """Represents a categorical feature.
1988
1989  WARNING: Do not subclass this layer unless you know what you are doing:
1990  the API is subject to future changes.
1991
  A categorical feature is typically handled with a `tf.sparse.SparseTensor`
  of IDs.
1994  """
1995
1996  IdWeightPair = collections.namedtuple(  # pylint: disable=invalid-name
1997      'IdWeightPair', ['id_tensor', 'weight_tensor'])
1998
1999  @abc.abstractproperty
2000  def _num_buckets(self):
2001    """Returns number of buckets in this sparse feature."""
2002    pass
2003
2004  @abc.abstractmethod
2005  def _get_sparse_tensors(self,
2006                          inputs,
2007                          weight_collections=None,
2008                          trainable=None):
2009    """Returns an IdWeightPair.
2010
    `IdWeightPair` is a pair of `SparseTensor`s which represent ids and
    weights.
2013
2014    `IdWeightPair.id_tensor` is typically a `batch_size` x `num_buckets`
2015    `SparseTensor` of `int64`. `IdWeightPair.weight_tensor` is either a
    `SparseTensor` of `float` or `None` to indicate all weights should be
    taken to be 1. If specified, `weight_tensor` must have exactly the same
    shape and indices as `id_tensor`. The expected `SparseTensor` is the same
    as the parsing output of a `VarLenFeature`, which is a ragged matrix.
2020
2021    Args:
2022      inputs: A `LazyBuilder` as a cache to get input tensors required to
2023        create `IdWeightPair`.
      weight_collections: List of graph collections to which variables (if
        any are created) are added.
2026      trainable: If `True` also add variables to the graph collection
2027        `GraphKeys.TRAINABLE_VARIABLES` (see `tf.compat.v1.get_variable`).
2028    """
2029    pass
2030
2031
2032def _create_categorical_column_weighted_sum(column,
2033                                            builder,
2034                                            units,
2035                                            sparse_combiner,
2036                                            weight_collections,
2037                                            trainable,
2038                                            weight_var=None):
2039  # pylint: disable=g-doc-return-or-yield,g-doc-args
2040  """Create a weighted sum of a categorical column for linear_model.
2041
  Note to maintainers: as an implementation detail, the weighted sum is
  implemented via embedding_lookup_sparse for efficiency. Mathematically,
  the two are the same.

  To be specific: conceptually, a categorical column can be treated as a
  multi-hot vector. Say:
2048
2049  ```python
2050    x = [0 0 1]  # categorical column input
2051    w = [a b c]  # weights
2052  ```
  The weighted sum is `c` in this case, which is the same as `w[2]`.
2054
2055  Another example is
2056
2057  ```python
2058    x = [0 1 1]  # categorical column input
2059    w = [a b c]  # weights
2060  ```
  The weighted sum is `b + c` in this case, which is the same as `w[1] + w[2]`.
2062
  In both cases, we can implement the weighted sum via embedding_lookup_sparse
  with sparse_combiner = "sum".
2065  """
2066
2067  sparse_tensors = column._get_sparse_tensors(  # pylint: disable=protected-access
2068      builder,
2069      weight_collections=weight_collections,
2070      trainable=trainable)
2071  id_tensor = sparse_ops.sparse_reshape(sparse_tensors.id_tensor, [
2072      array_ops.shape(sparse_tensors.id_tensor)[0], -1
2073  ])
2074  weight_tensor = sparse_tensors.weight_tensor
2075  if weight_tensor is not None:
2076    weight_tensor = sparse_ops.sparse_reshape(
2077        weight_tensor, [array_ops.shape(weight_tensor)[0], -1])
2078
2079  if weight_var is not None:
2080    weight = weight_var
2081  else:
2082    weight = variable_scope.get_variable(
2083        name='weights',
2084        shape=(column._num_buckets, units),  # pylint: disable=protected-access
2085        initializer=init_ops.zeros_initializer(),
2086        trainable=trainable,
2087        collections=weight_collections)
2088  return embedding_ops.safe_embedding_lookup_sparse(
2089      weight,
2090      id_tensor,
2091      sparse_weights=weight_tensor,
2092      combiner=sparse_combiner,
2093      name='weighted_sum')
2094
2095
2096class _SequenceDenseColumn(_FeatureColumn):
2097  """Represents dense sequence data."""
2098
2099  TensorSequenceLengthPair = collections.namedtuple(  # pylint: disable=invalid-name
2100      'TensorSequenceLengthPair', ['dense_tensor', 'sequence_length'])
2101
2102  @abc.abstractmethod
2103  def _get_sequence_dense_tensor(
2104      self, inputs, weight_collections=None, trainable=None):
2105    """Returns a `TensorSequenceLengthPair`."""
2106    pass
2107
2108
2109class _LazyBuilder(object):
2110  """Handles caching of transformations while building the model.
2111
2112  `_FeatureColumn` specifies how to digest an input column to the network. Some
2113  feature columns require data transformations. This class caches those
2114  transformations.
2115
  Some features may be used in more than one place. For example, one can use
  a bucketized feature by itself and also in a cross with it. In that case we
  should create only one bucketization op instead of creating ops for each
  feature column separately. To handle re-use of transformed columns,
  `_LazyBuilder` caches all previously transformed columns.
2121
2122  Example:
2123  We're trying to use the following `_FeatureColumn`s:
2124
2125  ```python
2126  bucketized_age = fc.bucketized_column(fc.numeric_column("age"), ...)
  keywords = fc.categorical_column_with_hash_bucket("keywords", ...)
  age_X_keywords = fc.crossed_column([bucketized_age, "keywords"], ...)
  ... = linear_model(features,
                     [bucketized_age, keywords, age_X_keywords])
2131  ```
2132
  If we transform each column independently, then we'll get duplication of
  bucketization (one op for the cross, one for the bucketized column itself).
2135  The `_LazyBuilder` eliminates this duplication.
2136  """
2137
2138  def __init__(self, features):
2139    """Creates a `_LazyBuilder`.
2140
2141    Args:
      features: A mapping from feature name or `_FeatureColumn` to objects
        that are `Tensor` or `SparseTensor`, or can be converted to same via
2144        `sparse_tensor.convert_to_tensor_or_sparse_tensor`. A `string` key
2145        signifies a base feature (not-transformed). A `_FeatureColumn` key
2146        means that this `Tensor` is the output of an existing `_FeatureColumn`
2147        which can be reused.
2148    """
2149    self._features = features.copy()
2150    self._feature_tensors = {}
2151
2152  def get(self, key):
2153    """Returns a `Tensor` for the given key.
2154
2155    A `str` key is used to access a base feature (not-transformed). When a
2156    `_FeatureColumn` is passed, the transformed feature is returned if it
2157    already exists, otherwise the given `_FeatureColumn` is asked to provide its
2158    transformed output, which is then cached.
2159
2160    Args:
2161      key: a `str` or a `_FeatureColumn`.
2162
2163    Returns:
2164      The transformed `Tensor` corresponding to the `key`.
2165
2166    Raises:
2167      ValueError: if key is not found or a transformed `Tensor` cannot be
2168        computed.
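
    A minimal sketch (the tensor and column names are illustrative):

    ```python
    builder = _LazyBuilder({'age': age_tensor})
    age = builder.get('age')               # Raw feature lookup.
    buckets = builder.get(bucketized_age)  # Transforms and caches.
    again = builder.get(bucketized_age)    # Returns the cached Tensor.
    ```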
2169    """
2170    if key in self._feature_tensors:
2171      # FeatureColumn is already transformed or converted.
2172      return self._feature_tensors[key]
2173
2174    if key in self._features:
2175      feature_tensor = self._get_raw_feature_as_tensor(key)
2176      self._feature_tensors[key] = feature_tensor
2177      return feature_tensor
2178
2179    if isinstance(key, six.string_types):
2180      raise ValueError('Feature {} is not in features dictionary.'.format(key))
2181
2182    if not isinstance(key, _FeatureColumn):
2183      raise TypeError('"key" must be either a "str" or "_FeatureColumn". '
2184                      'Provided: {}'.format(key))
2185
2186    column = key
2187    logging.debug('Transforming feature_column %s.', column)
2188    transformed = column._transform_feature(self)  # pylint: disable=protected-access
2189    if transformed is None:
2190      raise ValueError('Column {} is not supported.'.format(column.name))
2191    self._feature_tensors[column] = transformed
2192    return transformed
2193
2194  def _get_raw_feature_as_tensor(self, key):
2195    """Gets the raw_feature (keyed by `key`) as `tensor`.
2196
    The raw feature is converted to a (sparse) tensor and, if necessary,
    expanded by one dimension.

    For both `Tensor` and `SparseTensor`, the rank will be expanded (to 2) if
    the rank is 1. Dynamic rank is also supported. A rank-0 raw feature raises
    an error, as it is not supported.
2202
2203    Args:
2204      key: A `str` key to access the raw feature.
2205
2206    Returns:
2207      A `Tensor` or `SparseTensor`.
2208
2209    Raises:
2210      ValueError: if the raw feature has rank 0.
2211    """
2212    raw_feature = self._features[key]
2213    feature_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(
2214        raw_feature)
2215
2216    def expand_dims(input_tensor):
2217      # Input_tensor must have rank 1.
2218      if isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
2219        return sparse_ops.sparse_reshape(
2220            input_tensor, [array_ops.shape(input_tensor)[0], 1])
2221      else:
2222        return array_ops.expand_dims(input_tensor, -1)
2223
2224    rank = feature_tensor.get_shape().ndims
2225    if rank is not None:
2226      if rank == 0:
2227        raise ValueError(
2228            'Feature (key: {}) cannot have rank 0. Given: {}'.format(
2229                key, feature_tensor))
2230      return feature_tensor if rank != 1 else expand_dims(feature_tensor)
2231
2232    # Handle dynamic rank.
2233    with ops.control_dependencies([
2234        check_ops.assert_positive(
2235            array_ops.rank(feature_tensor),
2236            message='Feature (key: {}) cannot have rank 0. Given: {}'.format(
2237                key, feature_tensor))]):
2238      return control_flow_ops.cond(
2239          math_ops.equal(1, array_ops.rank(feature_tensor)),
2240          lambda: expand_dims(feature_tensor),
2241          lambda: feature_tensor)
2242
2243
2244# TODO(ptucker): Move to third_party/tensorflow/python/ops/sparse_ops.py
2245def _shape_offsets(shape):
2246  """Returns moving offset for each dimension given shape."""
2247  offsets = []
2248  for dim in reversed(shape):
2249    if offsets:
2250      offsets.append(dim * offsets[-1])
2251    else:
2252      offsets.append(dim)
2253  offsets.reverse()
2254  return offsets
2255
2256
2257# TODO(ptucker): Move to third_party/tensorflow/python/ops/sparse_ops.py
2258def _to_sparse_input_and_drop_ignore_values(input_tensor, ignore_value=None):
2259  """Converts a `Tensor` to a `SparseTensor`, dropping ignore_value cells.
2260
2261  If `input_tensor` is already a `SparseTensor`, just return it.
2262
2263  Args:
2264    input_tensor: A string or integer `Tensor`.
    ignore_value: Entries in `input_tensor` equal to this value will be
      absent from the resulting `SparseTensor`. If `None`, the default value
      of `input_tensor`'s dtype will be used ('' for `str`, -1 for `int`).
2268
2269  Returns:
2270    A `SparseTensor` with the same shape as `input_tensor`.
2271
2272  Raises:
2273    ValueError: when `input_tensor`'s rank is `None`.
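
  For example, a minimal sketch of the dense-to-sparse conversion (values are
  illustrative):

  ```python
  dense = tf.constant([[1, -1], [-1, 3]], dtype=tf.int64)
  sp = _to_sparse_input_and_drop_ignore_values(dense)
  # sp.indices == [[0, 0], [1, 1]], sp.values == [1, 3],
  # sp.dense_shape == [2, 2]; the -1 cells are treated as missing.
  ```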
2274  """
2275  input_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(
2276      input_tensor)
2277  if isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
2278    return input_tensor
2279  with ops.name_scope(None, 'to_sparse_input', (input_tensor, ignore_value,)):
2280    if ignore_value is None:
2281      if input_tensor.dtype == dtypes.string:
        # Special case: TF strings are converted to numpy objects by default.
2283        ignore_value = ''
2284      elif input_tensor.dtype.is_integer:
2285        ignore_value = -1  # -1 has a special meaning of missing feature
2286      else:
2287        # NOTE: `as_numpy_dtype` is a property, so with the parentheses this is
2288        # constructing a new numpy object of the given type, which yields the
2289        # default value for that type.
2290        ignore_value = input_tensor.dtype.as_numpy_dtype()
2291    ignore_value = math_ops.cast(
2292        ignore_value, input_tensor.dtype, name='ignore_value')
2293    indices = array_ops.where(
2294        math_ops.not_equal(input_tensor, ignore_value), name='indices')
2295    return sparse_tensor_lib.SparseTensor(
2296        indices=indices,
2297        values=array_ops.gather_nd(input_tensor, indices, name='values'),
2298        dense_shape=array_ops.shape(
2299            input_tensor, out_type=dtypes.int64, name='dense_shape'))
2300
2301
2302def _normalize_feature_columns(feature_columns):
2303  """Normalizes the `feature_columns` input.
2304
  This method converts `feature_columns` to a list as best it can. In
  addition, it verifies the type and other properties of `feature_columns`
  required by downstream libraries.
2308
2309  Args:
2310    feature_columns: The raw feature columns, usually passed by users.
2311
2312  Returns:
2313    The normalized feature column list.
2314
2315  Raises:
2316    ValueError: for any invalid inputs, such as empty, duplicated names, etc.
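
  For example, a minimal sketch (column construction as in the examples
  elsewhere in this module):

  ```python
  # A single column is wrapped into a list:
  cols = _normalize_feature_columns(numeric_column('age'))
  # An iterator is materialized into a list:
  cols = _normalize_feature_columns(iter([numeric_column('age')]))
  ```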
2317  """
2318  if isinstance(feature_columns, _FeatureColumn):
2319    feature_columns = [feature_columns]
2320
2321  if isinstance(feature_columns, collections_abc.Iterator):
2322    feature_columns = list(feature_columns)
2323
2324  if isinstance(feature_columns, dict):
2325    raise ValueError('Expected feature_columns to be iterable, found dict.')
2326
2327  for column in feature_columns:
2328    if not isinstance(column, _FeatureColumn):
2329      raise ValueError('Items of feature_columns must be a _FeatureColumn. '
2330                       'Given (type {}): {}.'.format(type(column), column))
2331  if not feature_columns:
2332    raise ValueError('feature_columns must not be empty.')
2333  name_to_column = {}
2334  for column in feature_columns:
2335    if column.name in name_to_column:
2336      raise ValueError('Duplicate feature column name found for columns: {} '
                       'and {}. This usually means that these columns refer to '
                       'the same base feature. Either one must be discarded or '
                       'a duplicated but renamed item must be inserted in '
2340                       'features dict.'.format(column,
2341                                               name_to_column[column.name]))
2342    name_to_column[column.name] = column
2343
2344  return feature_columns
2345
2346
2347class _NumericColumn(_DenseColumn,
2348                     collections.namedtuple('_NumericColumn', [
2349                         'key', 'shape', 'default_value', 'dtype',
2350                         'normalizer_fn'
2351                     ])):
2352  """see `numeric_column`."""
2353
2354  @property
2355  def name(self):
2356    return self.key
2357
2358  @property
2359  def _parse_example_spec(self):
2360    return {
2361        self.key:
2362            parsing_ops.FixedLenFeature(self.shape, self.dtype,
2363                                        self.default_value)
2364    }
2365
2366  def _transform_feature(self, inputs):
2367    input_tensor = inputs.get(self.key)
2368    if isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
      raise ValueError(
          'The corresponding Tensor of a numeric column must be a dense '
          'Tensor. SparseTensor is not supported. key: {}'.format(self.key))
2372    if self.normalizer_fn is not None:
2373      input_tensor = self.normalizer_fn(input_tensor)
2374    return math_ops.cast(input_tensor, dtypes.float32)
2375
2376  @property
2377  def _variable_shape(self):
2378    return tensor_shape.TensorShape(self.shape)
2379
2380  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
2381    """Returns dense `Tensor` representing numeric feature.
2382
2383    Args:
2384      inputs: A `_LazyBuilder` object to access inputs.
2385      weight_collections: Unused `weight_collections` since no variables are
2386        created in this function.
2387      trainable: Unused `trainable` bool since no variables are created in
2388        this function.
2389
2390    Returns:
2391      Dense `Tensor` created within `_transform_feature`.
2392    """
2393    # Do nothing with weight_collections and trainable since no variables are
2394    # created in this function.
2395    del weight_collections
2396    del trainable
2397    # Feature has been already transformed. Return the intermediate
2398    # representation created by _transform_feature.
2399    return inputs.get(self)
2400
2401
2402class _BucketizedColumn(_DenseColumn, _CategoricalColumn,
2403                        collections.namedtuple('_BucketizedColumn', [
2404                            'source_column', 'boundaries'])):
2405  """See `bucketized_column`."""
2406
2407  @property
2408  def name(self):
2409    return '{}_bucketized'.format(self.source_column.name)
2410
2411  @property
2412  def _parse_example_spec(self):
2413    return self.source_column._parse_example_spec  # pylint: disable=protected-access
2414
2415  def _transform_feature(self, inputs):
2416    source_tensor = inputs.get(self.source_column)
2417    return math_ops._bucketize(  # pylint: disable=protected-access
2418        source_tensor,
2419        boundaries=self.boundaries)
2420
2421  @property
2422  def _variable_shape(self):
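    # E.g. a source column of shape (2,) with 4 boundaries yields a one-hot
    # output of shape (2, 5), since 4 boundaries define 5 buckets.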
2423    return tensor_shape.TensorShape(
2424        tuple(self.source_column.shape) + (len(self.boundaries) + 1,))
2425
2426  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
2427    del weight_collections
2428    del trainable
2429    input_tensor = inputs.get(self)
2430    return array_ops.one_hot(
2431        indices=math_ops.cast(input_tensor, dtypes.int64),
2432        depth=len(self.boundaries) + 1,
2433        on_value=1.,
2434        off_value=0.)
2435
2436  @property
2437  def _num_buckets(self):
2438    # By construction, source_column is always one-dimensional.
2439    return (len(self.boundaries) + 1) * self.source_column.shape[0]
2440
2441  def _get_sparse_tensors(self, inputs, weight_collections=None,
2442                          trainable=None):
2443    """Converts dense inputs to SparseTensor so downstream code can use it."""
2444    input_tensor = inputs.get(self)
2445    batch_size = array_ops.shape(input_tensor)[0]
2446    # By construction, source_column is always one-dimensional.
2447    source_dimension = self.source_column.shape[0]
2448
2449    i1 = array_ops.reshape(
2450        array_ops.tile(
2451            array_ops.expand_dims(math_ops.range(0, batch_size), 1),
2452            [1, source_dimension]),
2453        (-1,))
2454    i2 = array_ops.tile(math_ops.range(0, source_dimension), [batch_size])
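    # E.g. with batch_size=2 and source_dimension=3:
    # i1 == [0, 0, 0, 1, 1, 1] and i2 == [0, 1, 2, 0, 1, 2].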
2455    # Flatten the bucket indices and unique them across dimensions
2456    # E.g. 2nd dimension indices will range from k to 2*k-1 with k buckets
2457    bucket_indices = (
2458        array_ops.reshape(input_tensor, (-1,)) +
2459        (len(self.boundaries) + 1) * i2)
2460
2461    indices = math_ops.cast(
2462        array_ops.transpose(array_ops.stack((i1, i2))), dtypes.int64)
2463    dense_shape = math_ops.cast(
2464        array_ops.stack([batch_size, source_dimension]), dtypes.int64)
2465    sparse_tensor = sparse_tensor_lib.SparseTensor(
2466        indices=indices,
2467        values=bucket_indices,
2468        dense_shape=dense_shape)
2469    return _CategoricalColumn.IdWeightPair(sparse_tensor, None)
2470
2471
2472class _EmbeddingColumn(
2473    _DenseColumn, _SequenceDenseColumn,
2474    collections.namedtuple(
2475        '_EmbeddingColumn',
2476        ('categorical_column', 'dimension', 'combiner', 'layer_creator',
2477         'ckpt_to_load_from', 'tensor_name_in_ckpt', 'max_norm', 'trainable',
2478         'use_safe_embedding_lookup'))):
2479  """See `embedding_column`."""
2480
2481  def __new__(cls,
2482              categorical_column,
2483              dimension,
2484              combiner,
2485              layer_creator,
2486              ckpt_to_load_from,
2487              tensor_name_in_ckpt,
2488              max_norm,
2489              trainable,
2490              use_safe_embedding_lookup=True):
2491    return super(_EmbeddingColumn, cls).__new__(
2492        cls,
2493        categorical_column=categorical_column,
2494        dimension=dimension,
2495        combiner=combiner,
2496        layer_creator=layer_creator,
2497        ckpt_to_load_from=ckpt_to_load_from,
2498        tensor_name_in_ckpt=tensor_name_in_ckpt,
2499        max_norm=max_norm,
2500        trainable=trainable,
2501        use_safe_embedding_lookup=use_safe_embedding_lookup)
2502
2503  @property
2504  def name(self):
2505    if not hasattr(self, '_name'):
2506      self._name = '{}_embedding'.format(self.categorical_column.name)
2507    return self._name
2508
2509  @property
2510  def _parse_example_spec(self):
2511    return self.categorical_column._parse_example_spec  # pylint: disable=protected-access
2512
2513  def _transform_feature(self, inputs):
2514    return inputs.get(self.categorical_column)
2515
2516  @property
2517  def _variable_shape(self):
2518    if not hasattr(self, '_shape'):
2519      self._shape = tensor_shape.TensorShape([self.dimension])
2520    return self._shape
2521
2522  def _get_dense_tensor_internal(self,
2523                                 inputs,
2524                                 weight_collections=None,
2525                                 trainable=None):
2526    """Private method that follows the signature of _get_dense_tensor."""
2527    # Get sparse IDs and weights.
2528    sparse_tensors = self.categorical_column._get_sparse_tensors(  # pylint: disable=protected-access
2529        inputs, weight_collections=weight_collections, trainable=trainable)
2530    sparse_ids = sparse_tensors.id_tensor
2531    sparse_weights = sparse_tensors.weight_tensor
2532
2533    embedding_weights = self.layer_creator(
2534        weight_collections=weight_collections,
2535        scope=variable_scope.get_variable_scope())
2536
2537    if self.ckpt_to_load_from is not None:
2538      to_restore = embedding_weights
2539      if isinstance(to_restore, variables.PartitionedVariable):
2540        to_restore = to_restore._get_variable_list()  # pylint: disable=protected-access
2541      checkpoint_utils.init_from_checkpoint(self.ckpt_to_load_from, {
2542          self.tensor_name_in_ckpt: to_restore
2543      })
2544
2545    sparse_id_rank = tensor_shape.dimension_value(
2546        sparse_ids.dense_shape.get_shape()[0])
2547    embedding_lookup_sparse = embedding_ops.safe_embedding_lookup_sparse
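    # safe_embedding_lookup_sparse prunes invalid IDs and fills empty rows.
    # When callers opt out of the safe lookup and the sparse IDs are statically
    # known to have rank <= 2, the plain lookup below is used instead.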
2548    if (not self.use_safe_embedding_lookup and sparse_id_rank is not None and
2549        sparse_id_rank <= 2):
2550      embedding_lookup_sparse = embedding_ops.embedding_lookup_sparse_v2
2551    # Return embedding lookup result.
2552    return embedding_lookup_sparse(
2553        embedding_weights,
2554        sparse_ids,
2555        sparse_weights,
2556        combiner=self.combiner,
2557        name='%s_weights' % self.name,
2558        max_norm=self.max_norm)
2559
2560  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
2561    if isinstance(self.categorical_column, _SequenceCategoricalColumn):
2562      raise ValueError(
2563          'In embedding_column: {}. '
2564          'categorical_column must not be of type _SequenceCategoricalColumn. '
2565          'Suggested fix A: If you wish to use input_layer, use a '
2566          'non-sequence categorical_column_with_*. '
2567          'Suggested fix B: If you wish to create sequence input, use '
2568          'sequence_input_layer instead of input_layer. '
2569          'Given (type {}): {}'.format(
2570              self.name, type(self.categorical_column),
2571              self.categorical_column))
2572    return self._get_dense_tensor_internal(
2573        inputs=inputs,
2574        weight_collections=weight_collections,
2575        trainable=trainable)
2576
2577  def _get_sequence_dense_tensor(
2578      self, inputs, weight_collections=None, trainable=None):
2579    if not isinstance(self.categorical_column, _SequenceCategoricalColumn):
2580      raise ValueError(
2581          'In embedding_column: {}. '
2582          'categorical_column must be of type _SequenceCategoricalColumn '
2583          'to use sequence_input_layer. '
2584          'Suggested fix: Use one of sequence_categorical_column_with_*. '
2585          'Given (type {}): {}'.format(
2586              self.name, type(self.categorical_column),
2587              self.categorical_column))
2588    dense_tensor = self._get_dense_tensor_internal(  # pylint: disable=protected-access
2589        inputs=inputs,
2590        weight_collections=weight_collections,
2591        trainable=trainable)
2592
2593    sparse_tensors = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
2594    sequence_length = fc_utils.sequence_length_from_sparse_tensor(
2595        sparse_tensors.id_tensor)
2596    return _SequenceDenseColumn.TensorSequenceLengthPair(
2597        dense_tensor=dense_tensor, sequence_length=sequence_length)
2598
2599
2600def _get_graph_for_variable(var):
2601  if isinstance(var, variables.PartitionedVariable):
2602    return list(var)[0].graph
2603  else:
2604    return var.graph
2605
2606
2607class _SharedEmbeddingColumn(
2608    _DenseColumn, _SequenceDenseColumn,
2609    collections.namedtuple(
2610        '_SharedEmbeddingColumn',
2611        ('categorical_column', 'dimension', 'combiner', 'initializer',
2612         'shared_embedding_collection_name', 'ckpt_to_load_from',
2613         'tensor_name_in_ckpt', 'max_norm', 'trainable',
2614         'use_safe_embedding_lookup'))):
2615  """See `embedding_column`."""
2616
2617  @property
2618  def name(self):
2619    if not hasattr(self, '_name'):
2620      self._name = '{}_shared_embedding'.format(self.categorical_column.name)
2621    return self._name
2622
2623  @property
2624  def _var_scope_name(self):
2625    return self.shared_embedding_collection_name
2626
2627  @property
2628  def _parse_example_spec(self):
2629    return self.categorical_column._parse_example_spec  # pylint: disable=protected-access
2630
2631  def _transform_feature(self, inputs):
2632    return inputs.get(self.categorical_column)
2633
2634  @property
2635  def _variable_shape(self):
2636    if not hasattr(self, '_shape'):
2637      self._shape = tensor_shape.TensorShape([self.dimension])
2638    return self._shape
2639
2640  def _get_dense_tensor_internal(self,
2641                                 inputs,
2642                                 weight_collections=None,
2643                                 trainable=None):
2644    """Private method that follows the signature of _get_dense_tensor."""
2645    # This method is called from a variable_scope with name _var_scope_name,
2646    # which is shared among all shared embeddings. Open a name_scope here, so
2647    # that the ops for different columns have distinct names.
2648    with ops.name_scope(None, default_name=self.name):
2649      # Get sparse IDs and weights.
2650      sparse_tensors = self.categorical_column._get_sparse_tensors(  # pylint: disable=protected-access
2651          inputs, weight_collections=weight_collections, trainable=trainable)
2652      sparse_ids = sparse_tensors.id_tensor
2653      sparse_weights = sparse_tensors.weight_tensor
2654
2655      embedding_shape = (self.categorical_column._num_buckets, self.dimension)  # pylint: disable=protected-access
2656      shared_embedding_collection = ops.get_collection(
2657          self.shared_embedding_collection_name)
2658      if shared_embedding_collection:
2659        if len(shared_embedding_collection) > 1:
2660          raise ValueError(
2661              'Collection {} can only contain one variable. '
2662              'Suggested fix A: Choose a unique name for this collection. '
2663              'Suggested fix B: Do not add any variables to this collection. '
2664              'The feature_column library already adds a variable under the '
2665              'hood.'.format(shared_embedding_collection))
2666        embedding_weights = shared_embedding_collection[0]
2667        if embedding_weights.get_shape() != embedding_shape:
2668          raise ValueError(
2669              'Shared embedding collection {} contains variable {} of '
2670              'unexpected shape {}. Expected shape is {}. '
2671              'Suggested fix A: Choose a unique name for this collection. '
2672              'Suggested fix B: Do not add any variables to this collection. '
2673              'The feature_column library already adds a variable under the '
2674              'hood.'.format(self.shared_embedding_collection_name,
2675                             embedding_weights.name,
2676                             embedding_weights.get_shape(), embedding_shape))
2677      else:
2678        embedding_weights = variable_scope.get_variable(
2679            name='embedding_weights',
2680            shape=embedding_shape,
2681            dtype=dtypes.float32,
2682            initializer=self.initializer,
2683            trainable=self.trainable and trainable,
2684            collections=weight_collections)
2685        ops.add_to_collection(self.shared_embedding_collection_name,
2686                              embedding_weights)
2687      if self.ckpt_to_load_from is not None:
2688        to_restore = embedding_weights
2689        if isinstance(to_restore, variables.PartitionedVariable):
2690          to_restore = to_restore._get_variable_list()  # pylint: disable=protected-access
2691        checkpoint_utils.init_from_checkpoint(self.ckpt_to_load_from, {
2692            self.tensor_name_in_ckpt: to_restore
2693        })
2694
2695      sparse_id_rank = tensor_shape.dimension_value(
2696          sparse_ids.dense_shape.get_shape()[0])
2697      embedding_lookup_sparse = embedding_ops.safe_embedding_lookup_sparse
2698      if (not self.use_safe_embedding_lookup and sparse_id_rank is not None and
2699          sparse_id_rank <= 2):
2700        embedding_lookup_sparse = embedding_ops.embedding_lookup_sparse_v2
2701      # Return embedding lookup result.
2702      return embedding_lookup_sparse(
2703          embedding_weights,
2704          sparse_ids,
2705          sparse_weights,
2706          combiner=self.combiner,
2707          name='%s_weights' % self.name,
2708          max_norm=self.max_norm)
2709
2710  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
2711    if isinstance(self.categorical_column, _SequenceCategoricalColumn):
2712      raise ValueError(
2713          'In embedding_column: {}. '
2714          'categorical_column must not be of type _SequenceCategoricalColumn. '
2715          'Suggested fix A: If you wish to use input_layer, use a '
2716          'non-sequence categorical_column_with_*. '
2717          'Suggested fix B: If you wish to create sequence input, use '
2718          'sequence_input_layer instead of input_layer. '
2719          'Given (type {}): {}'.format(self.name, type(self.categorical_column),
2720                                       self.categorical_column))
2721    return self._get_dense_tensor_internal(
2722        inputs=inputs,
2723        weight_collections=weight_collections,
2724        trainable=trainable)
2725
2726  def _get_sequence_dense_tensor(self,
2727                                 inputs,
2728                                 weight_collections=None,
2729                                 trainable=None):
2730    if not isinstance(self.categorical_column, _SequenceCategoricalColumn):
2731      raise ValueError(
2732          'In embedding_column: {}. '
2733          'categorical_column must be of type _SequenceCategoricalColumn '
2734          'to use sequence_input_layer. '
2735          'Suggested fix: Use one of sequence_categorical_column_with_*. '
2736          'Given (type {}): {}'.format(self.name, type(self.categorical_column),
2737                                       self.categorical_column))
2738    dense_tensor = self._get_dense_tensor_internal(  # pylint: disable=protected-access
2739        inputs=inputs,
2740        weight_collections=weight_collections,
2741        trainable=trainable)
2742    sparse_tensors = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
2743    sequence_length = fc_utils.sequence_length_from_sparse_tensor(
2744        sparse_tensors.id_tensor)
2745    return _SequenceDenseColumn.TensorSequenceLengthPair(
2746        dense_tensor=dense_tensor, sequence_length=sequence_length)
2747
2748
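# An illustrative, self-contained sketch (not part of this module) of the
# lookup that `_get_dense_tensor_internal` performs, written against the
# public TF2 API; the helper name `_example_sparse_embedding_lookup` is
# hypothetical.
def _example_sparse_embedding_lookup():
  import tensorflow as tf  # assumed available; stands in for the internal ops
  # 10 hash buckets, embedding dimension 4.
  embeddings = tf.ones([10, 4])
  ids = tf.sparse.SparseTensor(
      indices=[[0, 0], [1, 0], [1, 1]],
      values=tf.constant([3, 5, 5], dtype=tf.int64),
      dense_shape=[2, 2])
  # safe_embedding_lookup_sparse additionally handles empty rows and prunes
  # invalid ids; plain embedding_lookup_sparse is the cheaper path chosen
  # above when the id tensor is statically known to have rank <= 2.
  return tf.nn.safe_embedding_lookup_sparse(
      embeddings, ids, None, combiner='mean')

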
2749def _check_shape(shape, key):
2750  """Returns shape if it's valid, raises error otherwise."""
2751  assert shape is not None
2752  if not nest.is_nested(shape):
2753    shape = [shape]
2754  shape = tuple(shape)
2755  for dimension in shape:
2756    if not isinstance(dimension, six.integer_types):
2757      raise TypeError('shape dimensions must be integers. '
2758                      'shape: {}, key: {}'.format(shape, key))
2759    if dimension < 1:
2760      raise ValueError('shape dimensions must be greater than 0. '
2761                       'shape: {}, key: {}'.format(shape, key))
2762  return shape
2763
2764
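# A minimal usage sketch of `_check_shape` above: a bare integer is
# normalized to a 1-tuple, and non-integer or non-positive dimensions are
# rejected. The helper name `_example_check_shape_usage` is hypothetical.
def _example_check_shape_usage():
  assert _check_shape(10, key='age') == (10,)
  assert _check_shape([2, 3], key='pixels') == (2, 3)
  try:
    _check_shape([0], key='bad')  # dimensions must be >= 1
  except ValueError:
    pass

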
2765class _HashedCategoricalColumn(
2766    _CategoricalColumn,
2767    collections.namedtuple('_HashedCategoricalColumn',
2768                           ['key', 'hash_bucket_size', 'dtype'])):
2769  """see `categorical_column_with_hash_bucket`."""
2770
2771  @property
2772  def name(self):
2773    return self.key
2774
2775  @property
2776  def _parse_example_spec(self):
2777    return {self.key: parsing_ops.VarLenFeature(self.dtype)}
2778
2779  def _transform_feature(self, inputs):
2780    input_tensor = _to_sparse_input_and_drop_ignore_values(inputs.get(self.key))
2781    if not isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
2782      raise ValueError('SparseColumn input must be a SparseTensor.')
2783
2784    fc_utils.assert_string_or_int(
2785        input_tensor.dtype,
2786        prefix='column_name: {} input_tensor'.format(self.key))
2787
2788    if self.dtype.is_integer != input_tensor.dtype.is_integer:
2789      raise ValueError(
2790          'Column dtype and SparseTensors dtype must be compatible. '
2791          'key: {}, column dtype: {}, tensor dtype: {}'.format(
2792              self.key, self.dtype, input_tensor.dtype))
2793
2794    if self.dtype == dtypes.string:
2795      sparse_values = input_tensor.values
2796    else:
2797      sparse_values = string_ops.as_string(input_tensor.values)
2798
2799    sparse_id_values = string_ops.string_to_hash_bucket_fast(
2800        sparse_values, self.hash_bucket_size, name='lookup')
2801    return sparse_tensor_lib.SparseTensor(
2802        input_tensor.indices, sparse_id_values, input_tensor.dense_shape)
2803
2804  @property
2805  def _num_buckets(self):
2806    """Returns number of buckets in this sparse feature."""
2807    return self.hash_bucket_size
2808
2809  def _get_sparse_tensors(self, inputs, weight_collections=None,
2810                          trainable=None):
2811    return _CategoricalColumn.IdWeightPair(inputs.get(self), None)
2812
2813
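# An illustrative sketch (not part of this module) of the hashing performed
# by `_HashedCategoricalColumn._transform_feature`, using the public TF2 API;
# the helper name `_example_hash_bucket` is hypothetical.
def _example_hash_bucket():
  import tensorflow as tf
  values = tf.constant(['math', 'philosophy', 'english'])
  # Integer inputs are first converted with as_string above; string values
  # are fingerprinted directly into [0, hash_bucket_size).
  return tf.strings.to_hash_bucket_fast(values, num_buckets=100)

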
2814class _VocabularyFileCategoricalColumn(
2815    _CategoricalColumn,
2816    collections.namedtuple('_VocabularyFileCategoricalColumn', (
2817        'key', 'vocabulary_file', 'vocabulary_size', 'num_oov_buckets', 'dtype',
2818        'default_value'
2819    ))):
2820  """See `categorical_column_with_vocabulary_file`."""
2821
2822  @property
2823  def name(self):
2824    return self.key
2825
2826  @property
2827  def _parse_example_spec(self):
2828    return {self.key: parsing_ops.VarLenFeature(self.dtype)}
2829
2830  def _transform_feature(self, inputs):
2831    input_tensor = _to_sparse_input_and_drop_ignore_values(inputs.get(self.key))
2832
2833    if self.dtype.is_integer != input_tensor.dtype.is_integer:
2834      raise ValueError(
2835          'Column dtype and SparseTensors dtype must be compatible. '
2836          'key: {}, column dtype: {}, tensor dtype: {}'.format(
2837              self.key, self.dtype, input_tensor.dtype))
2838
2839    fc_utils.assert_string_or_int(
2840        input_tensor.dtype,
2841        prefix='column_name: {} input_tensor'.format(self.key))
2842
2843    key_dtype = self.dtype
2844    if input_tensor.dtype.is_integer:
2845      # `index_table_from_file` requires 64-bit integer keys.
2846      key_dtype = dtypes.int64
2847      input_tensor = math_ops.cast(input_tensor, dtypes.int64)
2848
2849    return lookup_ops.index_table_from_file(
2850        vocabulary_file=self.vocabulary_file,
2851        num_oov_buckets=self.num_oov_buckets,
2852        vocab_size=self.vocabulary_size,
2853        default_value=self.default_value,
2854        key_dtype=key_dtype,
2855        name='{}_lookup'.format(self.key)).lookup(input_tensor)
2856
2857  @property
2858  def _num_buckets(self):
2859    """Returns number of buckets in this sparse feature."""
2860    return self.vocabulary_size + self.num_oov_buckets
2861
2862  def _get_sparse_tensors(
2863      self, inputs, weight_collections=None, trainable=None):
2864    return _CategoricalColumn.IdWeightPair(inputs.get(self), None)
2865
2866
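# An illustrative sketch (not part of this module) of the vocabulary lookup
# above, emulated with an in-memory table from the public TF2 API instead of
# a vocabulary file; the helper name `_example_vocab_lookup` is hypothetical.
def _example_vocab_lookup():
  import tensorflow as tf
  init = tf.lookup.KeyValueTensorInitializer(
      keys=tf.constant(['math', 'philosophy', 'english']),
      values=tf.range(3, dtype=tf.int64))
  # With num_oov_buckets=2, out-of-vocabulary keys hash into ids 3..4, so
  # _num_buckets == vocabulary_size + num_oov_buckets == 5.
  table = tf.lookup.StaticVocabularyTable(init, num_oov_buckets=2)
  return table.lookup(tf.constant(['math', 'biology']))  # e.g. [0, 3]

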
2867class _VocabularyListCategoricalColumn(
2868    _CategoricalColumn,
2869    collections.namedtuple('_VocabularyListCategoricalColumn', (
2870        'key', 'vocabulary_list', 'dtype', 'default_value', 'num_oov_buckets'
2871    ))):
2872  """See `categorical_column_with_vocabulary_list`."""
2873
2874  @property
2875  def name(self):
2876    return self.key
2877
2878  @property
2879  def _parse_example_spec(self):
2880    return {self.key: parsing_ops.VarLenFeature(self.dtype)}
2881
2882  def _transform_feature(self, inputs):
2883    input_tensor = _to_sparse_input_and_drop_ignore_values(inputs.get(self.key))
2884
2885    if self.dtype.is_integer != input_tensor.dtype.is_integer:
2886      raise ValueError(
2887          'Column dtype and SparseTensors dtype must be compatible. '
2888          'key: {}, column dtype: {}, tensor dtype: {}'.format(
2889              self.key, self.dtype, input_tensor.dtype))
2890
2891    fc_utils.assert_string_or_int(
2892        input_tensor.dtype,
2893        prefix='column_name: {} input_tensor'.format(self.key))
2894
2895    key_dtype = self.dtype
2896    if input_tensor.dtype.is_integer:
2897      # `index_table_from_tensor` requires 64-bit integer keys.
2898      key_dtype = dtypes.int64
2899      input_tensor = math_ops.cast(input_tensor, dtypes.int64)
2900
2901    return lookup_ops.index_table_from_tensor(
2902        vocabulary_list=tuple(self.vocabulary_list),
2903        default_value=self.default_value,
2904        num_oov_buckets=self.num_oov_buckets,
2905        dtype=key_dtype,
2906        name='{}_lookup'.format(self.key)).lookup(input_tensor)
2907
2908  @property
2909  def _num_buckets(self):
2910    """Returns number of buckets in this sparse feature."""
2911    return len(self.vocabulary_list) + self.num_oov_buckets
2912
2913  def _get_sparse_tensors(
2914      self, inputs, weight_collections=None, trainable=None):
2915    return _CategoricalColumn.IdWeightPair(inputs.get(self), None)
2916
2917
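# An illustrative sketch (not part of this module) of the default_value
# behavior above: with num_oov_buckets=0, unknown keys map to default_value.
# The helper name `_example_vocab_list_default` is hypothetical.
def _example_vocab_list_default():
  import tensorflow as tf
  init = tf.lookup.KeyValueTensorInitializer(
      keys=tf.constant(['math', 'philosophy']),
      values=tf.constant([0, 1], dtype=tf.int64))
  table = tf.lookup.StaticHashTable(init, default_value=-1)
  return table.lookup(tf.constant(['math', 'biology']))  # [0, -1]

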
2918class _IdentityCategoricalColumn(
2919    _CategoricalColumn,
2920    collections.namedtuple('_IdentityCategoricalColumn', (
2921        'key', 'num_buckets', 'default_value'
2922    ))):
2924  """See `categorical_column_with_identity`."""
2925
2926  @property
2927  def name(self):
2928    return self.key
2929
2930  @property
2931  def _parse_example_spec(self):
2932    return {self.key: parsing_ops.VarLenFeature(dtypes.int64)}
2933
2934  def _transform_feature(self, inputs):
2935    input_tensor = _to_sparse_input_and_drop_ignore_values(inputs.get(self.key))
2936
2937    if not input_tensor.dtype.is_integer:
2938      raise ValueError(
2939          'Invalid input, not integer. key: {} dtype: {}'.format(
2940              self.key, input_tensor.dtype))
2941    values = input_tensor.values
2942    if input_tensor.values.dtype != dtypes.int64:
2943      values = math_ops.cast(values, dtypes.int64, name='values')
2944    if self.default_value is not None:
2945      num_buckets = math_ops.cast(
2946          self.num_buckets, dtypes.int64, name='num_buckets')
2947      zero = math_ops.cast(0, dtypes.int64, name='zero')
2948      # Assign default for out-of-range values.
2949      values = array_ops.where(
2950          math_ops.logical_or(
2951              values < zero, values >= num_buckets, name='out_of_range'),
2952          array_ops.fill(
2953              dims=array_ops.shape(values),
2954              value=math_ops.cast(self.default_value, dtypes.int64),
2955              name='default_values'), values)
2956    return sparse_tensor_lib.SparseTensor(
2957        indices=input_tensor.indices,
2958        values=values,
2959        dense_shape=input_tensor.dense_shape)
2960
2961  @property
2962  def _num_buckets(self):
2963    """Returns number of buckets in this sparse feature."""
2964    return self.num_buckets
2965
2966  def _get_sparse_tensors(
2967      self, inputs, weight_collections=None, trainable=None):
2968    return _CategoricalColumn.IdWeightPair(inputs.get(self), None)
2969
2970
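# An illustrative sketch (not part of this module) of the out-of-range
# handling in `_IdentityCategoricalColumn._transform_feature`, using the
# public TF2 API; the helper name `_example_identity_default` is hypothetical.
def _example_identity_default():
  import tensorflow as tf
  values = tf.constant([0, 3, 7], dtype=tf.int64)
  num_buckets = tf.constant(4, dtype=tf.int64)
  default = tf.constant(0, dtype=tf.int64)
  # Values outside [0, num_buckets) are replaced with the default value.
  return tf.where((values < 0) | (values >= num_buckets),
                  tf.fill(tf.shape(values), default), values)  # [0, 3, 0]

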
2971class _WeightedCategoricalColumn(
2972    _CategoricalColumn,
2973    collections.namedtuple('_WeightedCategoricalColumn', (
2974        'categorical_column', 'weight_feature_key', 'dtype'
2975    ))):
2976  """See `weighted_categorical_column`."""
2977
2978  @property
2979  def name(self):
2980    return '{}_weighted_by_{}'.format(
2981        self.categorical_column.name, self.weight_feature_key)
2982
2983  @property
2984  def _parse_example_spec(self):
2985    config = self.categorical_column._parse_example_spec  # pylint: disable=protected-access
2986    if self.weight_feature_key in config:
2987      raise ValueError('Parse config {} already exists for {}.'.format(
2988          config[self.weight_feature_key], self.weight_feature_key))
2989    config[self.weight_feature_key] = parsing_ops.VarLenFeature(self.dtype)
2990    return config
2991
2992  @property
2993  def _num_buckets(self):
2994    return self.categorical_column._num_buckets  # pylint: disable=protected-access
2995
2996  def _transform_feature(self, inputs):
2997    weight_tensor = inputs.get(self.weight_feature_key)
2998    if weight_tensor is None:
2999      raise ValueError('Missing weights {}.'.format(self.weight_feature_key))
3000    weight_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(
3001        weight_tensor)
3002    if self.dtype != weight_tensor.dtype.base_dtype:
3003      raise ValueError('Bad dtype, expected {}, but got {}.'.format(
3004          self.dtype, weight_tensor.dtype))
3005    if not isinstance(weight_tensor, sparse_tensor_lib.SparseTensor):
3006      # The weight tensor can be a regular Tensor. In this case, sparsify it.
3007      weight_tensor = _to_sparse_input_and_drop_ignore_values(
3008          weight_tensor, ignore_value=0.0)
3009    if not weight_tensor.dtype.is_floating:
3010      weight_tensor = math_ops.cast(weight_tensor, dtypes.float32)
3011    return (inputs.get(self.categorical_column), weight_tensor)
3012
3013  def _get_sparse_tensors(
3014      self, inputs, weight_collections=None, trainable=None):
3015    del weight_collections
3016    del trainable
3017    tensors = inputs.get(self)
3018    return _CategoricalColumn.IdWeightPair(tensors[0], tensors[1])
3019
3020
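# An illustrative sketch (not part of this module) of the dense-to-sparse
# weight conversion above: exact zeros are dropped, mirroring
# ignore_value=0.0. The helper name `_example_sparsify_weights` is
# hypothetical.
def _example_sparsify_weights():
  import tensorflow as tf
  dense_weights = tf.constant([[0.5, 0.0], [0.0, 2.0]])
  return tf.sparse.from_dense(dense_weights)  # keeps only 0.5 and 2.0

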
3021class _CrossedColumn(
3022    _CategoricalColumn,
3023    collections.namedtuple('_CrossedColumn',
3024                           ['keys', 'hash_bucket_size', 'hash_key'])):
3025  """See `crossed_column`."""
3026
3027  @property
3028  def name(self):
3029    feature_names = []
3030    for key in _collect_leaf_level_keys(self):
3031      if isinstance(key, _FeatureColumn):
3032        feature_names.append(key.name)
3033      else:  # key must be a string
3034        feature_names.append(key)
3035    return '_X_'.join(sorted(feature_names))
3036
3037  @property
3038  def _parse_example_spec(self):
3039    config = {}
3040    for key in self.keys:
3041      if isinstance(key, _FeatureColumn):
3042        config.update(key._parse_example_spec)  # pylint: disable=protected-access
3043      else:  # key must be a string
3044        config.update({key: parsing_ops.VarLenFeature(dtypes.string)})
3045    return config
3046
3047  def _transform_feature(self, inputs):
3048    feature_tensors = []
3049    for key in _collect_leaf_level_keys(self):
3050      if isinstance(key, six.string_types):
3051        feature_tensors.append(inputs.get(key))
3052      elif isinstance(key, _CategoricalColumn):
3053        ids_and_weights = key._get_sparse_tensors(inputs)  # pylint: disable=protected-access
3054        if ids_and_weights.weight_tensor is not None:
3055          raise ValueError(
3056              'crossed_column does not support weight_tensor, but the given '
3057              'column populates weight_tensor. '
3058              'Given column: {}'.format(key.name))
3059        feature_tensors.append(ids_and_weights.id_tensor)
3060      else:
3061        raise ValueError('Unsupported column type. Given: {}'.format(key))
3062    return sparse_ops.sparse_cross_hashed(
3063        inputs=feature_tensors,
3064        num_buckets=self.hash_bucket_size,
3065        hash_key=self.hash_key)
3066
3067  @property
3068  def _num_buckets(self):
3069    """Returns number of buckets in this sparse feature."""
3070    return self.hash_bucket_size
3071
3072  def _get_sparse_tensors(self, inputs, weight_collections=None,
3073                          trainable=None):
3074    return _CategoricalColumn.IdWeightPair(inputs.get(self), None)
3075
3076
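# An illustrative sketch (not part of this module) of the hashed cross above,
# using the public TF2 API; the helper name `_example_cross_hashed` is
# hypothetical.
def _example_cross_hashed():
  import tensorflow as tf
  dept = tf.sparse.from_dense(tf.constant([['math'], ['english']]))
  age_bucket = tf.sparse.from_dense(tf.constant([['18-25'], ['25-30']]))
  # Each (dept, age_bucket) combination is fingerprinted into one of
  # num_buckets ids, just like _CrossedColumn._transform_feature.
  return tf.sparse.cross_hashed([dept, age_bucket], num_buckets=1000)

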
3077def _collect_leaf_level_keys(cross):
3078  """Collects base keys by expanding all nested crosses.
3079
3080  Args:
3081    cross: A `_CrossedColumn`.
3082
3083  Returns:
3084    A list of strings or `_CategoricalColumn` instances.
3085  """
3086  leaf_level_keys = []
3087  for k in cross.keys:
3088    if isinstance(k, _CrossedColumn):
3089      leaf_level_keys.extend(_collect_leaf_level_keys(k))
3090    else:
3091      leaf_level_keys.append(k)
3092  return leaf_level_keys
3093
3094
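# A minimal pure-Python analogue of `_collect_leaf_level_keys`, showing how
# nested crosses flatten to their base keys; the helper name
# `_example_flatten_keys` is hypothetical.
def _example_flatten_keys():
  def flatten(keys):
    out = []
    for k in keys:
      if isinstance(k, tuple):  # stands in for a nested _CrossedColumn
        out.extend(flatten(k))
      else:
        out.append(k)
    return out
  return flatten(('a', ('b', ('c', 'd'))))  # ['a', 'b', 'c', 'd']

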
3095class _IndicatorColumn(_DenseColumn, _SequenceDenseColumn,
3096                       collections.namedtuple('_IndicatorColumn',
3097                                              ['categorical_column'])):
3098  """Represents a one-hot column for use in deep networks.
3099
3100  Args:
3101    categorical_column: A `_CategoricalColumn` which is created by
3102      `categorical_column_with_*` function.
3103  """
3104
3105  @property
3106  def name(self):
3107    return '{}_indicator'.format(self.categorical_column.name)
3108
3109  def _transform_feature(self, inputs):
3110    """Returns dense `Tensor` representing feature.
3111
3112    Args:
3113      inputs: A `_LazyBuilder` object to access inputs.
3114
3115    Returns:
3116      Transformed feature `Tensor`.
3117
3118    Raises:
3119      ValueError: if input rank is not known at graph building time.
3120    """
3121    id_weight_pair = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
3122    id_tensor = id_weight_pair.id_tensor
3123    weight_tensor = id_weight_pair.weight_tensor
3124
3125    # If the underlying column is weighted, return the input as a dense tensor.
3126    if weight_tensor is not None:
3127      weighted_column = sparse_ops.sparse_merge(
3128          sp_ids=id_tensor,
3129          sp_values=weight_tensor,
3130          vocab_size=int(self._variable_shape[-1]))
3131      # Remove the (?, -1) index produced for missing (-1) ids.
3132      weighted_column = sparse_ops.sparse_slice(weighted_column, [0, 0],
3133                                                weighted_column.dense_shape)
3134      # Use scatter_nd, instead of sparse_tensor_to_dense, to merge
3135      # duplicate indices if any exist.
3136      return array_ops.scatter_nd(weighted_column.indices,
3137                                  weighted_column.values,
3138                                  weighted_column.dense_shape)
3139
3140    dense_id_tensor = sparse_ops.sparse_tensor_to_dense(
3141        id_tensor, default_value=-1)
3142
3143    # The one-hot tensor must be float, since all other inputs to
3144    # input_layer are float32 and are concatenated with it.
3145    one_hot_id_tensor = array_ops.one_hot(
3146        dense_id_tensor,
3147        depth=self._variable_shape[-1],
3148        on_value=1.0,
3149        off_value=0.0)
3150
3151    # Reduce to get a multi-hot per example.
3152    return math_ops.reduce_sum(one_hot_id_tensor, axis=[-2])
3153
3154  @property
3155  def _parse_example_spec(self):
3156    return self.categorical_column._parse_example_spec  # pylint: disable=protected-access
3157
3158  @property
3159  def _variable_shape(self):
3160    """Returns a `TensorShape` representing the shape of the dense `Tensor`."""
3161    return tensor_shape.TensorShape([1, self.categorical_column._num_buckets])  # pylint: disable=protected-access
3162
3163  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
3164    """Returns dense `Tensor` representing feature.
3165
3166    Args:
3167      inputs: A `_LazyBuilder` object to access inputs.
3168      weight_collections: Unused `weight_collections` since no variables are
3169        created in this function.
3170      trainable: Unused `trainable` bool since no variables are created in
3171        this function.
3172
3173    Returns:
3174      Dense `Tensor` created within `_transform_feature`.
3175
3176    Raises:
3177      ValueError: If `categorical_column` is a `_SequenceCategoricalColumn`.
3178    """
3179    # Do nothing with weight_collections and trainable since no variables are
3180    # created in this function.
3181    del weight_collections
3182    del trainable
3183    if isinstance(self.categorical_column, _SequenceCategoricalColumn):
3184      raise ValueError(
3185          'In indicator_column: {}. '
3186          'categorical_column must not be of type _SequenceCategoricalColumn. '
3187          'Suggested fix A: If you wish to use input_layer, use a '
3188          'non-sequence categorical_column_with_*. '
3189          'Suggested fix B: If you wish to create sequence input, use '
3190          'sequence_input_layer instead of input_layer. '
3191          'Given (type {}): {}'.format(
3192              self.name, type(self.categorical_column),
3193              self.categorical_column))
3194    # Feature has been already transformed. Return the intermediate
3195    # representation created by _transform_feature.
3196    return inputs.get(self)
3197
3198  def _get_sequence_dense_tensor(
3199      self, inputs, weight_collections=None, trainable=None):
3200    # Do nothing with weight_collections and trainable since no variables are
3201    # created in this function.
3202    del weight_collections
3203    del trainable
3204    if not isinstance(self.categorical_column, _SequenceCategoricalColumn):
3205      raise ValueError(
3206          'In indicator_column: {}. '
3207          'categorical_column must be of type _SequenceCategoricalColumn '
3208          'to use sequence_input_layer. '
3209          'Suggested fix: Use one of sequence_categorical_column_with_*. '
3210          'Given (type {}): {}'.format(
3211              self.name, type(self.categorical_column),
3212              self.categorical_column))
3213    # Feature has been already transformed. Return the intermediate
3214    # representation created by _transform_feature.
3215    dense_tensor = inputs.get(self)
3216    sparse_tensors = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
3217    sequence_length = fc_utils.sequence_length_from_sparse_tensor(
3218        sparse_tensors.id_tensor)
3219    return _SequenceDenseColumn.TensorSequenceLengthPair(
3220        dense_tensor=dense_tensor, sequence_length=sequence_length)
3221
3222
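# An illustrative sketch (not part of this module) of the multi-hot
# computation in `_IndicatorColumn._transform_feature` for the unweighted
# case, using the public TF2 API; the helper name `_example_multi_hot` is
# hypothetical.
def _example_multi_hot():
  import tensorflow as tf
  ids = tf.sparse.SparseTensor(
      indices=[[0, 0], [0, 1], [1, 0]], values=[1, 3, 2], dense_shape=[2, 2])
  # Missing positions become -1, which one_hot turns into an all-zero row.
  dense_ids = tf.sparse.to_dense(ids, default_value=-1)
  one_hot = tf.one_hot(dense_ids, depth=4, on_value=1.0, off_value=0.0)
  # Summing over the per-id axis yields one multi-hot vector per example:
  # [[0, 1, 0, 1], [0, 0, 1, 0]].
  return tf.reduce_sum(one_hot, axis=-2)

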
3223def _verify_static_batch_size_equality(tensors, columns):
3224  """Validates that the first dim (batch size) of all tensors are equal or None.
3225
3226  Args:
3227    tensors: list of tensors to check.
3228    columns: list of feature columns matching tensors. Will be used for error
3229      messaging.
3230
3231  Raises:
3232    ValueError: if the tensors have mismatched static batch sizes.
3233  """
3234  # batch_size is a tf.compat.v1.Dimension object.
3235  expected_batch_size = None
3236  for i in range(len(tensors)):
3237    if tensors[i].shape.dims[0].value is not None:
3238      if expected_batch_size is None:
3239        batch_size_column_index = i
3240        expected_batch_size = tensors[i].shape.dims[0]
3241      elif not expected_batch_size.is_compatible_with(tensors[i].shape.dims[0]):
3242        raise ValueError(
3243            'Batch size (first dimension) of each feature must be the '
3244            'same. Batch size of columns ({}, {}): ({}, {})'.format(
3245                columns[batch_size_column_index].name, columns[i].name,
3246                expected_batch_size, tensors[i].shape.dims[0]))
3247
3248
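# A minimal sketch of the Dimension compatibility rule the function above
# relies on: an unknown (None) batch size is compatible with anything, while
# two different static sizes are not. The helper name
# `_example_batch_size_compat` is hypothetical.
def _example_batch_size_compat():
  import tensorflow as tf
  known = tf.TensorShape([32]).dims[0]
  other = tf.TensorShape([64]).dims[0]
  unknown = tf.TensorShape([None]).dims[0]
  # Returns (True, False): None matches anything; 32 vs 64 would raise above.
  return known.is_compatible_with(unknown), known.is_compatible_with(other)

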
3249class _SequenceCategoricalColumn(
3250    _CategoricalColumn,
3251    collections.namedtuple(
3252        '_SequenceCategoricalColumn', ['categorical_column'])):
3253  """Represents sequences of categorical data."""
3254
3255  @property
3256  def name(self):
3257    return self.categorical_column.name
3258
3259  @property
3260  def _parse_example_spec(self):
3261    return self.categorical_column._parse_example_spec  # pylint: disable=protected-access
3262
3263  def _transform_feature(self, inputs):
3264    return self.categorical_column._transform_feature(inputs)  # pylint: disable=protected-access
3265
3266  @property
3267  def _num_buckets(self):
3268    return self.categorical_column._num_buckets  # pylint: disable=protected-access
3269
3270  def _get_sparse_tensors(self, inputs, weight_collections=None,
3271                          trainable=None):
3272    sparse_tensors = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
3273    id_tensor = sparse_tensors.id_tensor
3274    weight_tensor = sparse_tensors.weight_tensor
3275
3276    # Expands the third dimension, if necessary, so that embeddings are
3277    # not combined during the embedding lookup. If the tensor is already
3278    # 3D, it is left as-is.
3279    shape = array_ops.shape(id_tensor)
3280    # Compute the third dimension explicitly instead of setting it to -1, as
3281    # -1 does not work for dynamically shaped tensors that have a 0-length
3282    # dimension at runtime, which happens for empty sequences.
3283    target_shape = [shape[0], shape[1], math_ops.reduce_prod(shape[2:])]
3284    id_tensor = sparse_ops.sparse_reshape(id_tensor, target_shape)
3285    if weight_tensor is not None:
3286      weight_tensor = sparse_ops.sparse_reshape(weight_tensor, target_shape)
3287
3288    return _CategoricalColumn.IdWeightPair(id_tensor, weight_tensor)
3289
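
# An illustrative sketch (not part of this module) of the reshape above,
# using the public TF2 API; the helper name `_example_sequence_reshape` is
# hypothetical.
def _example_sequence_reshape():
  import tensorflow as tf
  ids = tf.sparse.SparseTensor(
      indices=[[0, 0], [0, 1], [1, 0]], values=[7, 8, 9], dense_shape=[2, 2])
  shape = ids.dense_shape
  # The last dimension is computed explicitly (rather than -1) so that the
  # reshape also works when a dimension is 0-length at runtime.
  target = [shape[0], shape[1], tf.reduce_prod(shape[2:])]
  return tf.sparse.reshape(ids, target)  # dense_shape becomes [2, 2, 1]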