xref: /aosp_15_r20/external/tensorflow/tensorflow/python/training/ftrl.py (revision b6fb3261f9314811a0f4371741dbb8839866f948)
1# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7#     http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14# ==============================================================================
15"""Ftrl-proximal for TensorFlow."""
16from tensorflow.python.framework import dtypes
17from tensorflow.python.framework import ops
18from tensorflow.python.ops import array_ops
19from tensorflow.python.ops import math_ops
20from tensorflow.python.training import optimizer
21from tensorflow.python.training import training_ops
22from tensorflow.python.util.tf_export import tf_export
23
24
@tf_export(v1=["train.FtrlOptimizer"])
class FtrlOptimizer(optimizer.Optimizer):
  """Optimizer that implements the FTRL algorithm.

  This version has support for both online L2 (McMahan et al., 2013) and
  shrinkage-type L2, which is the addition of an L2 penalty
  to the loss function.

  Two slot variables are kept per trained variable: "accum" (initialized to
  `initial_accumulator_value`) and "linear" (initialized to zero). When
  `l2_shrinkage_regularization_strength` is greater than zero the `*_ftrl_v2`
  training ops are used; otherwise the plain `*_ftrl` ops are used.

  References:
    Ad-click prediction:
      [McMahan et al., 2013](https://dl.acm.org/citation.cfm?id=2488200)
      ([pdf](https://dl.acm.org/ft_gateway.cfm?id=2488200&ftid=1388399&dwn=1&CFID=32233078&CFTOKEN=d60fe57a294c056a-CB75C374-F915-E7A6-1573FBBC7BF7D526))
  """

  def __init__(self,
               learning_rate,
               learning_rate_power=-0.5,
               initial_accumulator_value=0.1,
               l1_regularization_strength=0.0,
               l2_regularization_strength=0.0,
               use_locking=False,
               name="Ftrl",
               accum_name=None,
               linear_name=None,
               l2_shrinkage_regularization_strength=0.0,
               beta=None):
    r"""Construct a new FTRL optimizer.

    Args:
      learning_rate: A float value or a constant float `Tensor`.
      learning_rate_power: A float value, must be less than or equal to zero.
        Controls how the learning rate decreases during training. Use zero for
        a fixed learning rate. See section 3.1 in (McMahan et al., 2013).
      initial_accumulator_value: The starting value for accumulators.
        Only zero or positive values are allowed.
      l1_regularization_strength: A float value, must be greater than or
        equal to zero.
      l2_regularization_strength: A float value, must be greater than or
        equal to zero.
      use_locking: If `True` use locks for update operations.
      name: Optional name prefix for the operations created when applying
        gradients.  Defaults to "Ftrl".
      accum_name: The suffix for the variable that keeps the gradient squared
        accumulator.  If not present, defaults to name.
      linear_name: The suffix for the variable that keeps the linear gradient
        accumulator.  If not present, defaults to name + "_1".
      l2_shrinkage_regularization_strength: A float value, must be greater than
        or equal to zero. This differs from L2 above in that the L2 above is a
        stabilization penalty, whereas this L2 shrinkage is a magnitude penalty.
        The FTRL formulation can be written as:
        w_{t+1} = argmin_w(\hat{g}_{1:t}w + L1*||w||_1 + L2*||w||_2^2), where
        \hat{g} = g + (2*L2_shrinkage*w), and g is the gradient of the loss
        function w.r.t. the weights w.
        Specifically, in the absence of L1 regularization, it is equivalent to
        the following update rule:
        w_{t+1} = w_t - lr_t / (beta + 2*L2*lr_t) * g_t -
                  2*L2_shrinkage*lr_t / (beta + 2*L2*lr_t) * w_t
        where lr_t is the learning rate at t.
        When input is sparse shrinkage will only happen on the active weights.
      beta: A float value; corresponds to the beta parameter in the paper.
        `None` (the default) is treated as 0.0.

    Raises:
      ValueError: If one of the arguments is invalid.

    References:
      Ad-click prediction:
        [McMahan et al., 2013](https://dl.acm.org/citation.cfm?id=2488200)
        ([pdf](https://dl.acm.org/ft_gateway.cfm?id=2488200&ftid=1388399&dwn=1&CFID=32233078&CFTOKEN=d60fe57a294c056a-CB75C374-F915-E7A6-1573FBBC7BF7D526))
    """
    super(FtrlOptimizer, self).__init__(use_locking, name)

    # Reject out-of-range hyperparameters up front, before any state is stored.
    if initial_accumulator_value < 0.0:
      raise ValueError(
          "initial_accumulator_value %f needs to be positive or zero" %
          initial_accumulator_value)
    if learning_rate_power > 0.0:
      raise ValueError("learning_rate_power %f needs to be negative or zero" %
                       learning_rate_power)
    if l1_regularization_strength < 0.0:
      raise ValueError(
          "l1_regularization_strength %f needs to be positive or zero" %
          l1_regularization_strength)
    if l2_regularization_strength < 0.0:
      raise ValueError(
          "l2_regularization_strength %f needs to be positive or zero" %
          l2_regularization_strength)
    if l2_shrinkage_regularization_strength < 0.0:
      raise ValueError(
          "l2_shrinkage_regularization_strength %f needs to be positive"
          " or zero" % l2_shrinkage_regularization_strength)

    # Raw hyperparameter values as supplied by the caller.
    self._learning_rate = learning_rate
    self._learning_rate_power = learning_rate_power
    self._initial_accumulator_value = initial_accumulator_value
    self._l1_regularization_strength = l1_regularization_strength
    self._l2_regularization_strength = l2_regularization_strength
    self._beta = (0.0 if beta is None else beta)
    self._l2_shrinkage_regularization_strength = (
        l2_shrinkage_regularization_strength)

    # Tensor versions of the hyperparameters; populated by `_prepare()`.
    self._learning_rate_tensor = None
    self._learning_rate_power_tensor = None
    self._l1_regularization_strength_tensor = None
    self._adjusted_l2_regularization_strength_tensor = None
    self._l2_shrinkage_regularization_strength_tensor = None
    self._beta_tensor = None

    self._accum_name = accum_name
    self._linear_name = linear_name

  def _create_slots(self, var_list):
    """Creates the "accum" and "linear" slots for every variable in `var_list`.

    "accum" is filled with `initial_accumulator_value`; "linear" is
    zero-initialized. Slot op names fall back to the optimizer name when
    `accum_name` / `linear_name` were not given.

    Args:
      var_list: Iterable of variables that need slots.
    """
    def _accum_initializer(shape, dtype=dtypes.float32, partition_info=None):
      del partition_info  # Part of the initializer signature; unused here.
      return array_ops.ones(
          shape=shape, dtype=dtype) * self._initial_accumulator_value

    for v in var_list:
      self._get_or_make_slot_with_initializer(
          v, _accum_initializer, v.shape, v.dtype, "accum",
          self._accum_name or self._name)
      self._zeros_slot(v, "linear", self._linear_name or self._name)

  def _prepare(self):
    """Converts the stored hyperparameters into tensors for the apply ops."""
    self._learning_rate_tensor = ops.convert_to_tensor(
        self._learning_rate, name="learning_rate")
    self._l1_regularization_strength_tensor = ops.convert_to_tensor(
        self._l1_regularization_strength, name="l1_regularization_strength")
    # L2 regularization strength with beta added in so that the underlying
    # TensorFlow ops do not need to include that parameter. The learning rate
    # is floored at 1e-36 so the division never sees a zero denominator.
    self._adjusted_l2_regularization_strength_tensor = ops.convert_to_tensor(
        self._l2_regularization_strength + self._beta /
        (2. * math_ops.maximum(self._learning_rate, 1e-36)),
        name="adjusted_l2_regularization_strength")
    # NOTE(review): presumably a narrowing hint for static analysis; confirm
    # before removing.
    assert self._adjusted_l2_regularization_strength_tensor is not None
    self._beta_tensor = ops.convert_to_tensor(self._beta, name="beta")
    self._l2_shrinkage_regularization_strength_tensor = ops.convert_to_tensor(
        self._l2_shrinkage_regularization_strength,
        name="l2_shrinkage_regularization_strength")
    self._learning_rate_power_tensor = ops.convert_to_tensor(
        self._learning_rate_power, name="learning_rate_power")

  def _cast_hyperparameters(self, dtype, with_shrinkage):
    """Returns the hyperparameter tensors cast to `dtype`, in op-argument order.

    The order is: learning rate, L1 strength, adjusted L2 strength,
    (L2 shrinkage strength, only when `with_shrinkage` is true), learning rate
    power. This matches the positional layout the `*_ftrl` / `*_ftrl_v2`
    training ops are called with in this class.

    Args:
      dtype: Target dtype for every hyperparameter tensor.
      with_shrinkage: Whether to include the L2 shrinkage strength, as needed
        by the `*_ftrl_v2` ops.

    Returns:
      A list of cast tensors to splat into the training op call.
    """
    hyperparameters = [
        math_ops.cast(self._learning_rate_tensor, dtype),
        math_ops.cast(self._l1_regularization_strength_tensor, dtype),
        math_ops.cast(self._adjusted_l2_regularization_strength_tensor, dtype),
    ]
    if with_shrinkage:
      hyperparameters.append(
          math_ops.cast(self._l2_shrinkage_regularization_strength_tensor,
                        dtype))
    hyperparameters.append(
        math_ops.cast(self._learning_rate_power_tensor, dtype))
    return hyperparameters

  def _apply_dense(self, grad, var):
    """Builds the FTRL update op for a dense gradient.

    Args:
      grad: Gradient for `var`.
      var: Variable to update.

    Returns:
      The result of `training_ops.apply_ftrl`, or of
      `training_ops.apply_ftrl_v2` when L2 shrinkage is enabled.
    """
    accum = self.get_slot(var, "accum")
    linear = self.get_slot(var, "linear")
    dtype = var.dtype.base_dtype
    if self._l2_shrinkage_regularization_strength <= 0.0:
      return training_ops.apply_ftrl(
          var,
          accum,
          linear,
          grad,
          *self._cast_hyperparameters(dtype, with_shrinkage=False),
          use_locking=self._use_locking)
    else:
      return training_ops.apply_ftrl_v2(
          var,
          accum,
          linear,
          grad,
          *self._cast_hyperparameters(dtype, with_shrinkage=True),
          use_locking=self._use_locking)

  def _resource_apply_dense(self, grad, var):
    """Builds the FTRL update op for a dense gradient on a resource variable.

    Args:
      grad: Gradient for `var`.
      var: Variable to update; it and its slots are passed by `.handle`.

    Returns:
      The result of `training_ops.resource_apply_ftrl`, or of
      `training_ops.resource_apply_ftrl_v2` when L2 shrinkage is enabled.
    """
    accum = self.get_slot(var, "accum")
    linear = self.get_slot(var, "linear")
    dtype = var.dtype.base_dtype
    if self._l2_shrinkage_regularization_strength <= 0.0:
      return training_ops.resource_apply_ftrl(
          var.handle,
          accum.handle,
          linear.handle,
          grad,
          *self._cast_hyperparameters(dtype, with_shrinkage=False),
          use_locking=self._use_locking)
    else:
      return training_ops.resource_apply_ftrl_v2(
          var.handle,
          accum.handle,
          linear.handle,
          grad,
          *self._cast_hyperparameters(dtype, with_shrinkage=True),
          use_locking=self._use_locking)

  def _apply_sparse(self, grad, var):
    """Builds the FTRL update op for a sparse gradient.

    Args:
      grad: Sparse gradient for `var`; its `.values` and `.indices` are
        forwarded to the training op.
      var: Variable to update.

    Returns:
      The result of `training_ops.sparse_apply_ftrl`, or of
      `training_ops.sparse_apply_ftrl_v2` when L2 shrinkage is enabled.
    """
    accum = self.get_slot(var, "accum")
    linear = self.get_slot(var, "linear")
    # Every hyperparameter, including L2 shrinkage, is cast to the variable's
    # base dtype so all arguments of the op agree, as in `_apply_dense`.
    dtype = var.dtype.base_dtype
    if self._l2_shrinkage_regularization_strength <= 0.0:
      return training_ops.sparse_apply_ftrl(
          var,
          accum,
          linear,
          grad.values,
          grad.indices,
          *self._cast_hyperparameters(dtype, with_shrinkage=False),
          use_locking=self._use_locking)
    else:
      return training_ops.sparse_apply_ftrl_v2(
          var,
          accum,
          linear,
          grad.values,
          grad.indices,
          *self._cast_hyperparameters(dtype, with_shrinkage=True),
          use_locking=self._use_locking)

  def _resource_apply_sparse(self, grad, var, indices):
    """Builds the FTRL update op for a sparse gradient on a resource variable.

    Unlike the other apply methods, hyperparameters here are cast to
    `grad.dtype` rather than to the variable's base dtype.

    Args:
      grad: Gradient values for the rows selected by `indices`.
      var: Variable to update; it and its slots are passed by `.handle`.
      indices: Indices into `var` that `grad` applies to.

    Returns:
      The result of `training_ops.resource_sparse_apply_ftrl`, or of
      `training_ops.resource_sparse_apply_ftrl_v2` when L2 shrinkage is
      enabled.
    """
    accum = self.get_slot(var, "accum")
    linear = self.get_slot(var, "linear")
    dtype = grad.dtype
    if self._l2_shrinkage_regularization_strength <= 0.0:
      return training_ops.resource_sparse_apply_ftrl(
          var.handle,
          accum.handle,
          linear.handle,
          grad,
          indices,
          *self._cast_hyperparameters(dtype, with_shrinkage=False),
          use_locking=self._use_locking)
    else:
      return training_ops.resource_sparse_apply_ftrl_v2(
          var.handle,
          accum.handle,
          linear.handle,
          grad,
          indices,
          *self._cast_hyperparameters(dtype, with_shrinkage=True),
          use_locking=self._use_locking)
292