# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Gradients for operators defined in nn_ops.py."""

import functools
import itertools
import operator

from tensorflow.python.eager import backprop
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import gen_nn_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn_ops


@ops.RegisterGradient("Conv2DBackpropInput")
def _Conv2DBackpropInputGrad(op, grad):
  """The derivatives for deconvolution.

  Args:
    op: the Deconvolution op.
    grad: the tensor representing the gradient w.r.t. the output

  Returns:
    the gradients w.r.t. the input and the filter
  """
  # We call the gen_nn_ops backprop functions instead of nn_ops backprop
  # functions for performance reasons in Eager mode. See _Conv2DGrad.
  return [
      None,
      gen_nn_ops.conv2d_backprop_filter(
          grad,
          array_ops.shape(op.inputs[1]),
          op.inputs[2],
          dilations=op.get_attr("dilations"),
          strides=op.get_attr("strides"),
          padding=op.get_attr("padding"),
          explicit_paddings=op.get_attr("explicit_paddings"),
          use_cudnn_on_gpu=op.get_attr("use_cudnn_on_gpu"),
          data_format=op.get_attr("data_format").decode()),
      gen_nn_ops.conv2d(
          grad,
          op.inputs[1],
          dilations=op.get_attr("dilations"),
          strides=op.get_attr("strides"),
          padding=op.get_attr("padding"),
          explicit_paddings=op.get_attr("explicit_paddings"),
          use_cudnn_on_gpu=op.get_attr("use_cudnn_on_gpu"),
          data_format=op.get_attr("data_format").decode())
  ]


@ops.RegisterGradient("Conv2DBackpropFilter")
def _Conv2DBackpropFilterGrad(op, grad):
  # We call the gen_nn_ops backprop functions instead of nn_ops backprop
  # functions for performance reasons in Eager mode. See _Conv2DGrad.
  return [
      gen_nn_ops.conv2d_backprop_input(
          array_ops.shape(op.inputs[0]),
          grad,
          op.inputs[2],
          dilations=op.get_attr("dilations"),
          strides=op.get_attr("strides"),
          padding=op.get_attr("padding"),
          explicit_paddings=op.get_attr("explicit_paddings"),
          use_cudnn_on_gpu=op.get_attr("use_cudnn_on_gpu"),
          data_format=op.get_attr("data_format").decode()), None,
      gen_nn_ops.conv2d(
          op.inputs[0],
          grad,
          dilations=op.get_attr("dilations"),
          strides=op.get_attr("strides"),
          padding=op.get_attr("padding"),
          explicit_paddings=op.get_attr("explicit_paddings"),
          use_cudnn_on_gpu=op.get_attr("use_cudnn_on_gpu"),
          data_format=op.get_attr("data_format").decode())
  ]


@ops.RegisterGradient("DepthwiseConv2dNativeBackpropInput")
def _DepthwiseConv2dNativeBackpropInputGrad(op, grad):
  """The derivatives for deconvolution.

  Args:
    op: the Deconvolution op.
    grad: the tensor representing the gradient w.r.t. the output

  Returns:
    the gradients w.r.t. the input and the filter
  """
  return [
      None,
      gen_nn_ops.depthwise_conv2d_native_backprop_filter(
          grad,
          array_ops.shape(op.inputs[1]),
          op.inputs[2],
          dilations=op.get_attr("dilations"),
          strides=op.get_attr("strides"),
          padding=op.get_attr("padding"),
          explicit_paddings=op.get_attr("explicit_paddings"),
          data_format=op.get_attr("data_format")),
      gen_nn_ops.depthwise_conv2d_native(
          grad,
          op.inputs[1],
          dilations=op.get_attr("dilations"),
          strides=op.get_attr("strides"),
          padding=op.get_attr("padding"),
          explicit_paddings=op.get_attr("explicit_paddings"),
          data_format=op.get_attr("data_format"))
  ]


@ops.RegisterGradient("DepthwiseConv2dNativeBackpropFilter")
def _DepthwiseConv2dNativeBackpropFilterGrad(op, grad):
  return [
      gen_nn_ops.depthwise_conv2d_native_backprop_input(
          array_ops.shape(op.inputs[0]),
          grad,
          op.inputs[2],
          dilations=op.get_attr("dilations"),
          strides=op.get_attr("strides"),
          padding=op.get_attr("padding"),
          explicit_paddings=op.get_attr("explicit_paddings"),
          data_format=op.get_attr("data_format")), None,
      gen_nn_ops.depthwise_conv2d_native(
          op.inputs[0],
          grad,
          dilations=op.get_attr("dilations"),
          strides=op.get_attr("strides"),
          padding=op.get_attr("padding"),
          explicit_paddings=op.get_attr("explicit_paddings"),
          data_format=op.get_attr("data_format"))
  ]


@ops.RegisterGradient("Conv3D")
def _Conv3DGrad(op, grad):
  data_format = op.get_attr("data_format").decode()
  return [
      nn_ops.conv3d_backprop_input_v2(
          array_ops.shape(op.inputs[0]),
          op.inputs[1],
          grad,
          dilations=op.get_attr("dilations"),
          strides=op.get_attr("strides"),
          padding=op.get_attr("padding"),
          data_format=data_format),
      nn_ops.conv3d_backprop_filter_v2(
          op.inputs[0],
          array_ops.shape(op.inputs[1]),
          grad,
          dilations=op.get_attr("dilations"),
          strides=op.get_attr("strides"),
          padding=op.get_attr("padding"),
          data_format=data_format)
  ]


@ops.RegisterGradient("Conv3DBackpropInputV2")
def _Conv3DBackpropInputGrad(op, grad):
  data_format = op.get_attr("data_format").decode()
  return [
      None,
      nn_ops.conv3d_backprop_filter_v2(
          grad,
          array_ops.shape(op.inputs[1]),
          op.inputs[2],
          dilations=op.get_attr("dilations"),
          strides=op.get_attr("strides"),
          padding=op.get_attr("padding"),
          data_format=data_format),
      nn_ops.conv3d(
          grad,
          op.inputs[1],
          dilations=op.get_attr("dilations"),
          strides=op.get_attr("strides"),
          padding=op.get_attr("padding"),
          data_format=data_format)
  ]


@ops.RegisterGradient("Conv3DBackpropFilterV2")
def _Conv3DBackpropFilterGrad(op, grad):
  data_format = op.get_attr("data_format").decode()
  return [
      nn_ops.conv3d_backprop_input_v2(
          array_ops.shape(op.inputs[0]),
          grad,
          op.inputs[2],
          dilations=op.get_attr("dilations"),
          strides=op.get_attr("strides"),
          padding=op.get_attr("padding"),
          data_format=data_format), None,
      nn_ops.conv3d(
          op.inputs[0],
          grad,
          dilations=op.get_attr("dilations"),
          strides=op.get_attr("strides"),
          padding=op.get_attr("padding"),
          data_format=data_format)
  ]


@ops.RegisterGradient("AvgPool3D")
def _AvgPool3DGrad(op, grad):
  return gen_nn_ops.avg_pool3d_grad(
      array_ops.shape(op.inputs[0]),
      grad,
      ksize=op.get_attr("ksize"),
      strides=op.get_attr("strides"),
      padding=op.get_attr("padding"),
      data_format=op.get_attr("data_format").decode())


@ops.RegisterGradient("AvgPool3DGrad")
def _AvgPool3DGradGrad(op, grad):
  return (array_ops.stop_gradient(op.inputs[0]),
          gen_nn_ops.avg_pool3d(
              grad,
              op.get_attr("ksize"),
              op.get_attr("strides"),
              op.get_attr("padding"),
              data_format=op.get_attr("data_format").decode()))


@ops.RegisterGradient("MaxPool3D")
def _MaxPool3DGrad(op, grad):
  return gen_nn_ops.max_pool3d_grad(
      op.inputs[0],
      op.outputs[0],
      grad,
      ksize=op.get_attr("ksize"),
      strides=op.get_attr("strides"),
      padding=op.get_attr("padding"),
      data_format=op.get_attr("data_format").decode())


@ops.RegisterGradient("MaxPool3DGrad")
def _MaxPool3DGradGrad(op, grad):
  return (array_ops.zeros_like(op.inputs[0]),
          array_ops.zeros_like(op.inputs[1]),
          gen_nn_ops.max_pool3d_grad_grad(
              op.inputs[0],
              op.inputs[1],
              grad,
              op.get_attr("ksize"),
              op.get_attr("strides"),
              padding=op.get_attr("padding"),
              data_format=op.get_attr("data_format").decode()))


@ops.RegisterGradient("MaxPool3DGradGrad")
def _MaxPool3DGradGradGrad(op, grad):
  return (array_ops.zeros_like(op.inputs[0]),
          array_ops.zeros_like(op.inputs[1]),
          gen_nn_ops.max_pool3d_grad(
              op.inputs[0],
              op.inputs[1],
              grad,
              op.get_attr("ksize"),
              op.get_attr("strides"),
              padding=op.get_attr("padding"),
              data_format=op.get_attr("data_format").decode()))


@ops.RegisterGradient("Softmax")
def _SoftmaxGrad(op, grad_softmax):
  """The derivative of the softmax nonlinearity.

  We assume that probs is of shape [batch_size, dim].
  The formula for dsoftmax / dx is (diag(softmax) - softmax * softmax').
  This matrix is diagonal minus a rank-one matrix, so it is easy to implement
  as follows:

    grad_x = grad_softmax * softmax - sum(grad_softmax * softmax) * softmax

  Args:
    op: the Softmax op.
    grad_softmax: the tensor representing the gradient w.r.t. the softmax
      output.

  Returns:
    gradient w.r.t. the input to the softmax
  """
  softmax = op.outputs[0]
  sum_channels = math_ops.reduce_sum(grad_softmax * softmax, -1, keepdims=True)
  return (grad_softmax - sum_channels) * softmax


@ops.RegisterGradient("LogSoftmax")
def _LogSoftmaxGrad(op, grad):
  """The gradient for log_softmax.

    log_softmax = input - log(sum(exp(input)))
    dlog_softmax/dinput = diag - softmax(input)

  Args:
    op: The log softmax op.
    grad: The tensor representing the gradient w.r.t. the output.

  Returns:
    The gradients w.r.t. the input.
  """
  softmax = math_ops.exp(op.outputs[0])
  return grad - math_ops.reduce_sum(grad, -1, keepdims=True) * softmax
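

# The closed-form softmax gradient above can be sanity-checked against
# autodiff of the forward op. The helper below is only an illustrative sketch
# (a hypothetical name, never called by this module); it returns the maximum
# absolute difference between the two, which should be ~0.
def _softmax_grad_formula_check():
  x = array_ops.reshape(
      math_ops.cast(math_ops.range(12), dtypes.float32), [3, 4]) / 10.0
  upstream = array_ops.reshape(
      math_ops.cast(math_ops.range(1, 13), dtypes.float32), [3, 4]) * 0.1
  with backprop.GradientTape() as tape:
    tape.watch(x)
    y = nn_ops.softmax(x)
    # Contracting with a fixed upstream tensor makes tape.gradient compute the
    # same vector-Jacobian product that _SoftmaxGrad receives.
    loss = math_ops.reduce_sum(y * upstream)
  autodiff = tape.gradient(loss, x)
  softmax = nn_ops.softmax(x)
  formula = (upstream - math_ops.reduce_sum(
      upstream * softmax, -1, keepdims=True)) * softmax
  return math_ops.reduce_max(math_ops.abs(autodiff - formula))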


@ops.RegisterGradient("BiasAdd")
def _BiasAddGrad(op, received_grad):
  """Return the gradients for the 2 inputs of bias_op.

  The first input of the bias op is the tensor t, and its gradient is
  just the gradient the bias op received.

  The second input of the bias op is the bias vector, which has one fewer
  dimension than "received_grad" (the batch dimension). Its gradient is the
  received gradient summed on the batch dimension, which is the first dimension.

  Args:
    op: The BiasOp for which we need to generate gradients.
    received_grad: Tensor. The gradients passed to the BiasOp.

  Returns:
    Two tensors, the first one for the "tensor" input of the BiasOp,
    the second one for the "bias" input of the BiasOp.
  """
  try:
    data_format = op.get_attr("data_format")
  except ValueError:
    data_format = None
  return (received_grad,
          gen_nn_ops.bias_add_grad(
              out_backprop=received_grad, data_format=data_format))


@ops.RegisterGradient("BiasAddGrad")
def _BiasAddGradGrad(op, received_grad):
  """Gradient for the BiasAddGrad op.

  Args:
    op: BiasAddGrad op for which we are calculating gradients.
    received_grad: The gradients passed to the BiasAddGrad op.

  Returns:
    A single gradient Tensor for the input to BiasAddGrad (which
    is the gradient of the bias term in BiasAdd)
  """

  try:
    data_format = op.get_attr("data_format")
  except ValueError:
    data_format = None

  shape = array_ops.shape(op.inputs[0])
  bias_shape = array_ops.shape(received_grad)

  if data_format == b"NCHW":
    expanded_shape = array_ops.concat([
        array_ops.ones_like(shape[:1]), bias_shape,
        array_ops.ones_like(shape[2:])
    ], 0)
    tile_mults = array_ops.concat([shape[:1], [1], shape[2:]], 0)
  else:
    expanded_shape = array_ops.concat(
        [array_ops.ones_like(shape[:-1]), bias_shape], 0)
    tile_mults = array_ops.concat([shape[:-1], [1]], 0)

  expanded_grad = array_ops.reshape(received_grad, expanded_shape)
  return array_ops.tile(expanded_grad, tile_mults)


@ops.RegisterGradient("BiasAddV1")
def _BiasAddGradV1(unused_bias_op, received_grad):
  """Return the gradients for the 2 inputs of bias_op.

  The first input of unused_bias_op is the tensor t, and its gradient is
  just the gradient the unused_bias_op received.

  The second input of unused_bias_op is the bias vector, which has one fewer
  dimension than "received_grad" (the batch dimension). Its gradient is the
  received gradient summed on the batch dimension, which is the first dimension.

  Args:
    unused_bias_op: The BiasOp for which we need to generate gradients.
    received_grad: Tensor. The gradients passed to the BiasOp.

  Returns:
    Two tensors, the first one for the "tensor" input of the BiasOp,
    the second one for the "bias" input of the BiasOp.
  """
  reduction_dim_tensor = math_ops.range(array_ops.rank(received_grad) - 1)
  return (received_grad, math_ops.reduce_sum(received_grad,
                                             reduction_dim_tensor))


@ops.RegisterGradient("Relu")
def _ReluGrad(op, grad):
  return gen_nn_ops.relu_grad(grad, op.outputs[0])


@ops.RegisterGradient("EluGrad")
def _EluGradGrad(op, grad):
  elu_x = op.inputs[1]
  return (gen_nn_ops.elu_grad(grad, elu_x),
          array_ops.where(
              elu_x < 0, grad * op.inputs[0], array_ops.zeros_like(elu_x)))


@ops.RegisterGradient("SeluGrad")
def _SeluGradGrad(op, grad):
  selu_x = op.inputs[1]
  return (gen_nn_ops.selu_grad(grad, selu_x),
          array_ops.where(
              selu_x < 0., grad * op.inputs[0], array_ops.zeros_like(selu_x)))


@ops.RegisterGradient("Relu6")
def _Relu6Grad(op, grad):
  return gen_nn_ops.relu6_grad(grad, op.outputs[0])


@ops.RegisterGradient("Relu6Grad")
def _Relu6GradGrad(op, grad):
  x = op.inputs[1]
  return (gen_nn_ops.relu6_grad(grad, x), array_ops.zeros_like(x))


@ops.RegisterGradient("LeakyRelu")
def _LeakyReluGrad(op, grad):
  x = op.inputs[0]
  alpha = op.get_attr("alpha")
  return gen_nn_ops.leaky_relu_grad(grad, x, alpha=alpha)


@ops.RegisterGradient("LeakyReluGrad")
def _LeakyReluGradGrad(op, grad):
  x = op.inputs[1]
  alpha = op.get_attr("alpha")
  return (gen_nn_ops.leaky_relu_grad(grad, x,
                                     alpha=alpha), array_ops.zeros_like(x))


@ops.RegisterGradient("Elu")
def _EluGrad(op, grad):
  return gen_nn_ops.elu_grad(grad, op.outputs[0])


@ops.RegisterGradient("Selu")
def _SeluGrad(op, grad):
  return gen_nn_ops.selu_grad(grad, op.outputs[0])


@ops.RegisterGradient("Softplus")
def _SoftplusGrad(op, grad):
  return grad * math_ops.sigmoid(op.inputs[0])


@ops.RegisterGradient("SoftplusGrad")
def _SoftplusGradGrad(op, grad):
  # Let:
  #   y = tf.nn.softplus(x)
  #   dx = gen_nn_ops.softplus_grad(dy, x) = dy / (1 + exp(-x))
  # This op computes (ddy, d2x) from op.inputs == [dy, x] and grad == ddx.
  dy, x = op.inputs
  with ops.control_dependencies([grad]):
    ddy = gen_nn_ops.softplus_grad(grad, x)
    d2x = grad * dy / (math_ops.exp(-x) + 2.0 + math_ops.exp(x))
    return (ddy, d2x)


@ops.RegisterGradient("Softsign")
def _SoftsignGrad(op, grad):
  return gen_nn_ops.softsign_grad(grad, op.inputs[0])


@ops.RegisterGradient("ReluGrad")
def _ReluGradGrad(op, grad):
  x = op.inputs[1]
  return (gen_nn_ops.relu_grad(grad, x), array_ops.zeros_like(x))


def _BroadcastMul(vec, mat):
  """Multiply after broadcasting vec to match dimensions of mat.

  Args:
    vec: A 1-D tensor of dimension [D0]
    mat: A 2-D tensor of dimension [D0, D1]

  Returns:
    A tensor of dimension [D0, D1], the result of vec * mat
  """
  # Reshape vec to [D0, 1]
  vec = array_ops.expand_dims(vec, -1)
  return vec * mat
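

# A small illustrative note on _BroadcastMul (the hypothetical helper below is
# never called by this module): it scales every row i of `mat` by `vec[i]`.
# With the values used here, vec is [0.5, 1.0] and mat is
# [[1., 2., 3.], [4., 5., 6.]], so the result is [[0.5, 1.0, 1.5], [4., 5., 6.]].
def _broadcast_mul_example():
  vec = math_ops.cast(math_ops.range(1, 3), dtypes.float32) / 2.0
  mat = array_ops.reshape(
      math_ops.cast(math_ops.range(1, 7), dtypes.float32), [2, 3])
  return _BroadcastMul(vec, mat)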


@ops.RegisterGradient("SoftmaxCrossEntropyWithLogits")
def _SoftmaxCrossEntropyWithLogitsGrad(op, grad_loss, grad_grad):
  """Gradient function for SoftmaxCrossEntropyWithLogits."""
  # grad_loss is the backprop for cost, and we multiply it with the softmax
  # gradient (which is output[1]).
  # grad_grad is the backprop for the softmax gradient.
  #
  # Second derivative is just softmax derivative w.r.t. logits.
  softmax_grad = op.outputs[1]
  grad = _BroadcastMul(grad_loss, softmax_grad)

  logits = op.inputs[0]
  if (grad_grad is not None and
      not getattr(grad_grad, "_is_zeros_tensor", False)):
    softmax = nn_ops.softmax(logits)

    grad += ((grad_grad - array_ops.squeeze(
        math_ops.matmul(
            array_ops.expand_dims(grad_grad, 1),
            array_ops.expand_dims(softmax, 2)),
        axis=1)) * softmax)

  return grad, _BroadcastMul(grad_loss, -nn_ops.log_softmax(logits))  # pylint: disable=invalid-unary-operand-type


@ops.RegisterGradient("SparseSoftmaxCrossEntropyWithLogits")
def _SparseSoftmaxCrossEntropyWithLogitsGrad(op, grad_loss, grad_grad):
  """Gradient function for SparseSoftmaxCrossEntropyWithLogits."""
  # grad_loss is the backprop for cost, and we multiply it with the softmax
  # gradient (which is output[1]).
  # grad_grad is the backprop for the softmax gradient.
  # There is no gradient for the labels.
  #
  # Second derivative is just softmax derivative w.r.t. logits.
  softmax_grad = op.outputs[1]
  grad = _BroadcastMul(grad_loss, softmax_grad)

  logits = op.inputs[0]
  if (grad_grad is not None and
      not getattr(grad_grad, "_is_zeros_tensor", False)):
    softmax = nn_ops.softmax(logits)

    grad += ((grad_grad - array_ops.squeeze(
        math_ops.matmul(
            array_ops.expand_dims(grad_grad, 1),
            array_ops.expand_dims(softmax, 2)),
        axis=1)) * softmax)

  return grad, None
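

# The softmax gradient stored in output[1] of the cross-entropy op is
# softmax(logits) - labels, so the first-order gradient above reduces to the
# familiar closed form. The helper below is only an illustrative sketch
# (a hypothetical name, never called by this module) comparing autodiff of the
# public op against that closed form; it should return ~0.
def _softmax_xent_grad_check():
  logits = array_ops.reshape(
      math_ops.cast(math_ops.range(6), dtypes.float32), [2, 3]) / 4.0
  labels = nn_ops.softmax(logits * 2.0)  # any valid probability distribution
  with backprop.GradientTape() as tape:
    tape.watch(logits)
    loss = math_ops.reduce_sum(
        nn_ops.softmax_cross_entropy_with_logits_v2(
            labels=labels, logits=logits))
  autodiff = tape.gradient(loss, logits)
  closed_form = nn_ops.softmax(logits) - labels
  return math_ops.reduce_max(math_ops.abs(autodiff - closed_form))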


@ops.RegisterGradient("Conv2D")
def _Conv2DGrad(op, grad):
  """Gradient function for Conv2D."""
  dilations = op.get_attr("dilations")
  strides = op.get_attr("strides")
  padding = op.get_attr("padding")
  explicit_paddings = op.get_attr("explicit_paddings")
  use_cudnn_on_gpu = op.get_attr("use_cudnn_on_gpu")
  data_format = op.get_attr("data_format")
  shape_0, shape_1 = array_ops.shape_n([op.inputs[0], op.inputs[1]])

  # We call the gen_nn_ops backprop functions instead of nn_ops backprop
  # functions for performance reasons in Eager mode. gen_nn_ops functions take
  # an `explicit_paddings` parameter, but nn_ops functions do not. So if we
  # were to use the nn_ops functions, we would have to convert `padding` and
  # `explicit_paddings` into a single `padding` parameter, increasing overhead
  # in Eager mode.
  return [
      gen_nn_ops.conv2d_backprop_input(
          shape_0,
          op.inputs[1],
          grad,
          dilations=dilations,
          strides=strides,
          padding=padding,
          explicit_paddings=explicit_paddings,
          use_cudnn_on_gpu=use_cudnn_on_gpu,
          data_format=data_format),
      gen_nn_ops.conv2d_backprop_filter(
          op.inputs[0],
          shape_1,
          grad,
          dilations=dilations,
          strides=strides,
          padding=padding,
          explicit_paddings=explicit_paddings,
          use_cudnn_on_gpu=use_cudnn_on_gpu,
          data_format=data_format)
  ]


@ops.RegisterGradient("DepthwiseConv2dNative")
def _DepthwiseConv2dNativeGrad(op, grad):
  return [
      gen_nn_ops.depthwise_conv2d_native_backprop_input(
          array_ops.shape(op.inputs[0]),
          op.inputs[1],
          grad,
          dilations=op.get_attr("dilations"),
          strides=op.get_attr("strides"),
          padding=op.get_attr("padding"),
          explicit_paddings=op.get_attr("explicit_paddings"),
          data_format=op.get_attr("data_format")),
      gen_nn_ops.depthwise_conv2d_native_backprop_filter(
          op.inputs[0],
          array_ops.shape(op.inputs[1]),
          grad,
          dilations=op.get_attr("dilations"),
          strides=op.get_attr("strides"),
          padding=op.get_attr("padding"),
          explicit_paddings=op.get_attr("explicit_paddings"),
          data_format=op.get_attr("data_format"))
  ]


@ops.RegisterGradient("Dilation2D")
def _Dilation2DGrad(op, grad):
  return [
      nn_ops.dilation2d_backprop_input(op.inputs[0], op.inputs[1], grad,
                                       op.get_attr("strides"),
                                       op.get_attr("rates"),
                                       op.get_attr("padding")),
      nn_ops.dilation2d_backprop_filter(op.inputs[0], op.inputs[1], grad,
                                        op.get_attr("strides"),
                                        op.get_attr("rates"),
                                        op.get_attr("padding"))
  ]


@ops.RegisterGradient("LRN")
def _LRNGrad(op, grad):
  depth_radius = op.get_attr("depth_radius")
  bias = op.get_attr("bias")
  alpha = op.get_attr("alpha")
  beta = op.get_attr("beta")
  return [
      gen_nn_ops.lrn_grad(grad, op.inputs[0], op.outputs[0], depth_radius, bias,
                          alpha, beta)
  ]


@ops.RegisterGradient("AvgPool")
def _AvgPoolGrad(op, grad):
  return gen_nn_ops.avg_pool_grad(
      array_ops.shape(op.inputs[0]),
      grad,
      op.get_attr("ksize"),
      op.get_attr("strides"),
      op.get_attr("padding"),
      data_format=op.get_attr("data_format"))


@ops.RegisterGradient("AvgPoolGrad")
def _AvgPoolGradGrad(op, grad):
  return (array_ops.stop_gradient(op.inputs[0]),
          gen_nn_ops.avg_pool(
              grad,
              op.get_attr("ksize"),
              op.get_attr("strides"),
              op.get_attr("padding"),
              data_format=op.get_attr("data_format")))


@ops.RegisterGradient("MaxPool")
def _MaxPoolGrad(op, grad):
  return gen_nn_ops.max_pool_grad(
      op.inputs[0],
      op.outputs[0],
      grad,
      op.get_attr("ksize"),
      op.get_attr("strides"),
      padding=op.get_attr("padding"),
      explicit_paddings=op.get_attr("explicit_paddings"),
      data_format=op.get_attr("data_format"))


@ops.RegisterGradient("MaxPoolV2")
def _MaxPoolGradV2(op, grad):
  ksize = op.inputs[1]
  strides = op.inputs[2]
  return gen_nn_ops.max_pool_grad_v2(
      op.inputs[0],
      op.outputs[0],
      grad,
      ksize,
      strides,
      padding=op.get_attr("padding"),
      data_format=op.get_attr("data_format")), None, None


@ops.RegisterGradient("MaxPoolWithArgmax")
def _MaxPoolGradWithArgmax(op, grad, unused_argmax_grad):
  del unused_argmax_grad
  return gen_nn_ops.max_pool_grad_with_argmax(
      op.inputs[0],
      grad,
      op.outputs[1],
      op.get_attr("ksize"),
      op.get_attr("strides"),
      padding=op.get_attr("padding"),
      include_batch_in_index=op.get_attr("include_batch_in_index"))


@ops.RegisterGradient("MaxPoolGrad")
def _MaxPoolGradGrad(op, grad):
  return (array_ops.zeros_like(op.inputs[0]),
          array_ops.zeros_like(op.inputs[1]),
          gen_nn_ops.max_pool_grad_grad(
              op.inputs[0],
              op.inputs[1],
              grad,
              op.get_attr("ksize"),
              op.get_attr("strides"),
              padding=op.get_attr("padding"),
              data_format=op.get_attr("data_format")))


@ops.RegisterGradient("MaxPoolGradV2")
def _MaxPoolGradGradV2(op, grad):
  ksize = op.inputs[3]
  strides = op.inputs[4]
  return (array_ops.zeros_like(op.inputs[0]),
          array_ops.zeros_like(op.inputs[1]),
          gen_nn_ops.max_pool_grad_grad_v2(
              op.inputs[0],
              op.inputs[1],
              grad,
              ksize,
              strides,
              padding=op.get_attr("padding"),
              data_format=op.get_attr("data_format")), None, None)


@ops.RegisterGradient("MaxPoolGradGrad")
def _MaxPoolGradGradGrad(op, grad):
  return (array_ops.zeros_like(op.inputs[0]),
          array_ops.zeros_like(op.inputs[1]),
          gen_nn_ops.max_pool_grad(
              op.inputs[0],
              op.inputs[1],
              grad,
              op.get_attr("ksize"),
              op.get_attr("strides"),
              padding=op.get_attr("padding"),
              data_format=op.get_attr("data_format")))


@ops.RegisterGradient("FractionalMaxPool")
def _FractionalMaxPoolGrad(op, grad_0, unused_grad_1, unused_grad_2):
  """Returns gradient for FractionalMaxPool.

  Since FractionalMaxPool has three outputs, there are three gradients passed
  in, one for each of the outputs. Only the first one is useful; the other two
  gradients are empty.

  Args:
    op: The FractionalMaxPoolOp.
    grad_0: Gradient with respect to op.outputs[0]
    unused_grad_1: Gradient with respect to op.outputs[1]/row_seq. It is empty.
    unused_grad_2: Gradient with respect to op.outputs[2]/col_seq. It is empty.

  Returns:
    Input backprop for FractionalMaxPool op.
  """
  return gen_nn_ops.fractional_max_pool_grad(
      op.inputs[0], op.outputs[0], grad_0, op.outputs[1], op.outputs[2],
      op.get_attr("overlapping"))


@ops.RegisterGradient("FractionalAvgPool")
def _FractionalAvgPoolGrad(op, grad_0, unused_grad_1, unused_grad_2):
  """Returns gradient for FractionalAvgPool.

  Since FractionalAvgPool has three outputs, there are three gradients passed
  in, one for each of the outputs. Only the first one is useful; the other two
  gradients are empty.

  Args:
    op: The FractionalAvgPoolOp.
    grad_0: Gradient with respect to op.outputs[0]
    unused_grad_1: Gradient with respect to op.outputs[1]/row_seq. It is empty.
    unused_grad_2: Gradient with respect to op.outputs[2]/col_seq. It is empty.

  Returns:
    Input backprop for FractionalAvgPool op.
  """
  return gen_nn_ops.fractional_avg_pool_grad(op.inputs[0].get_shape(), grad_0,
                                             op.outputs[1], op.outputs[2],
                                             op.get_attr("overlapping"))


@ops.RegisterGradient("BatchNormWithGlobalNormalization")
def _BatchNormWithGlobalNormalizationGrad(op, grad):
  """Return the gradients for the 5 inputs of BatchNormWithGlobalNormalization.

  We do not backprop anything for the mean and var intentionally as they are
  not being trained with backprop in the operation.

  Args:
    op: The BatchNormOp for which we need to generate gradients.
    grad: Tensor. The gradients passed to the BatchNormOp.

  Returns:
    dx: Backprop for input, which is (grad * (g * rsqrt(v + epsilon)))
    dm: Backprop for mean, which is
        sum_over_rest(grad * g) * (-1 / rsqrt(v + epsilon))
    dv: Backprop for variance, which is
        sum_over_rest(grad * g * (x - m)) * (-1/2) * (v + epsilon) ^ (-3/2)
    db: Backprop for beta, which is grad reduced in all except the
        last dimension.
    dg: Backprop for gamma, which is (grad * ((x - m) * rsqrt(v + epsilon)))
  """
  dx, dm, dv, db, dg = gen_nn_ops.batch_norm_with_global_normalization_grad(
      op.inputs[0], op.inputs[1], op.inputs[2], op.inputs[4], grad,
      op.get_attr("variance_epsilon"), op.get_attr("scale_after_normalization"))
  return dx, dm, dv, db, dg


def _BaseFusedBatchNormGrad(op, version, *grad):
  """Return the gradients for the 3 inputs of BatchNorm.

  Args:
    op: The BatchNormOp for which we need to compute gradients.
    version: Integer indicating which version to use of the fused batch
      norm gradient.
    *grad: An argument list for tensors of gradients wrt the outputs
      with grad[0] as grad_y.

  Returns:
    grad_x: gradient for x, which is scale * rsqrt(variance + epsilon) *
            [grad_y - mean(grad_y) - (x - mean(x)) *
            mean(grad_y * (x - mean(x))) / (variance + epsilon)]
            in training mode; grad_y * scale * rsqrt(pop_variance + epsilon)
            in freeze mode.

    grad_scale: gradient for scale, which is sum(grad_y * (x - mean(x)) *
                rsqrt(variance + epsilon)) in training mode;
                sum(grad_y * (x - pop_mean) * rsqrt(pop_variance + epsilon))
                in freeze mode.

    grad_offset: gradient for offset, which is sum(grad_y) in training mode;
                 sum(grad_y) in freeze mode.
  """
  x = op.inputs[0]
  grad_y = grad[0]
  scale = op.inputs[1]
  epsilon = op.get_attr("epsilon")
  data_format = op.get_attr("data_format")
  is_training = op.get_attr("is_training")
  if version == 2:
    grad_fun = gen_nn_ops.fused_batch_norm_grad_v3
  elif version == 1:
    grad_fun = gen_nn_ops.fused_batch_norm_grad_v2
  else:
    grad_fun = gen_nn_ops.fused_batch_norm_grad
  if is_training:
    args = {
        "y_backprop": grad_y,
        "x": x,
        "scale": scale,
        "reserve_space_1": op.outputs[3],
        "reserve_space_2": op.outputs[4],
        "epsilon": epsilon,
        "data_format": data_format,
        "is_training": is_training
    }
    if version == 2:
      args["reserve_space_3"] = op.outputs[5]
    dx, dscale, doffset, _, _ = grad_fun(**args)
  else:
    pop_mean = op.inputs[3]
    pop_var = op.inputs[4]
    if data_format == b"NCHW":
      x = array_ops.transpose(x, [0, 2, 3, 1])
      grad_y = array_ops.transpose(grad_y, [0, 2, 3, 1])
    elif data_format == b"NCDHW":
      x = array_ops.transpose(x, [0, 2, 3, 4, 1])
      grad_y = array_ops.transpose(grad_y, [0, 2, 3, 4, 1])
    target_data_format = ("NHWC" if data_format in (b"NCHW",
                                                    b"NHWC") else "NDHWC")
    args = {
        "y_backprop": grad_y,
        "x": x,
        "scale": scale,
        "reserve_space_1": pop_mean,
        "reserve_space_2": pop_var,
        "epsilon": epsilon,
        "data_format": target_data_format,
        "is_training": is_training
    }
    if version == 2:
      args["reserve_space_3"] = op.outputs[5]
    dx, dscale, doffset, _, _ = grad_fun(**args)
    if data_format == b"NCHW":
      dx = array_ops.transpose(dx, [0, 3, 1, 2])
    elif data_format == b"NCDHW":
      dx = array_ops.transpose(dx, [0, 4, 1, 2, 3])
  return dx, dscale, doffset, None, None
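

# The training-mode grad_x formula quoted in the docstring above is the
# standard batch-norm backward pass. The helper below is only an illustrative
# sketch (a hypothetical name, never called by this module): it recomputes
# batch normalization with plain ops and compares the autodiff gradient
# against the closed-form expression from the docstring.
def _batch_norm_grad_formula_check(epsilon=1e-3):
  x = array_ops.reshape(
      math_ops.cast(math_ops.range(24), dtypes.float32), [2, 2, 2, 3]) / 10.0
  grad_y = array_ops.reshape(
      math_ops.cast(math_ops.range(24, 0, -1), dtypes.float32),
      [2, 2, 2, 3]) / 100.0
  scale = math_ops.cast(math_ops.range(1, 4), dtypes.float32)  # per channel
  reduce_axis = [0, 1, 2]  # NHWC: normalize over batch and spatial dims
  with backprop.GradientTape() as tape:
    tape.watch(x)
    mean = math_ops.reduce_mean(x, reduce_axis, keepdims=True)
    var = math_ops.reduce_mean(
        math_ops.squared_difference(x, mean), reduce_axis, keepdims=True)
    y = scale * (x - mean) * math_ops.rsqrt(var + epsilon)
    # Contract with grad_y so tape.gradient returns the VJP for cotangent
    # grad_y, i.e. the same quantity the fused op's gradient produces.
    loss = math_ops.reduce_sum(y * grad_y)
  autodiff_dx = tape.gradient(loss, x)
  mean_dy = math_ops.reduce_mean(grad_y, reduce_axis, keepdims=True)
  mean_dy_xc = math_ops.reduce_mean(
      grad_y * (x - mean), reduce_axis, keepdims=True)
  formula_dx = scale * math_ops.rsqrt(var + epsilon) * (
      grad_y - mean_dy - (x - mean) * mean_dy_xc / (var + epsilon))
  return math_ops.reduce_max(math_ops.abs(autodiff_dx - formula_dx))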


@ops.RegisterGradient("FusedBatchNorm")
def _FusedBatchNormGrad(op, *grad):
  return _BaseFusedBatchNormGrad(op, 0, *grad)


@ops.RegisterGradient("FusedBatchNormV2")
def _FusedBatchNormV2Grad(op, *grad):
  return _BaseFusedBatchNormGrad(op, 1, *grad)


@ops.RegisterGradient("FusedBatchNormV3")
def _FusedBatchNormV3Grad(op, *grad):
  return _BaseFusedBatchNormGrad(op, 2, *grad)


def _BatchNormGrad(grad_y,
                   x,
                   scale,
                   pop_mean,
                   pop_var,
                   epsilon,
                   data_format,
                   is_training=True):
  """Returns the gradients for the 3 inputs of BatchNorm.

  Args:
    grad_y: A `Tensor` of 4 or 5 dimensions for gradient for y.
    x: A `Tensor` of 4 or 5 dimensions for x.
    scale: A `Tensor` of 1 dimension for scaling.
    pop_mean: A `Tensor` of 1 dimension for the population mean. Only used when
      is_training=False.
    pop_var: A `Tensor` of 1 dimension for the population variance. Only used
      when is_training=False.
    epsilon: A small float number added to the variance of x.
    data_format: The data format for input. One of b"NHWC", b"NCHW", b"NDHWC"
      or b"NCDHW".
    is_training: A bool value to indicate the operation is for training
      (default) or inference.

  Returns:
    A tuple (grad_x, grad_scale, grad_offset), where grad_x is the gradient
    for x, grad_scale the gradient for scale, and grad_offset the gradient
    for offset.
  """
  x_dtype = x.dtype.base_dtype
  if x_dtype == dtypes.float16:
    # float16 math is too imprecise, so we do the batch norm gradient
    # computations in float32.
    x = math_ops.cast(x, dtypes.float32)
    grad_y = math_ops.cast(grad_y, dtypes.float32)
  if is_training:
    if data_format == b"NHWC":
      keepdims = False
      reduce_axis = [0, 1, 2]
    elif data_format == b"NDHWC":
      keepdims = False
      reduce_axis = [0, 1, 2, 3]
    elif data_format == b"NCHW":
      keepdims = True
      reduce_axis = [0, 2, 3]
      shape = [1, array_ops.size(scale), 1, 1]
      scale = array_ops.reshape(scale, shape)
    else:
      keepdims = True
      reduce_axis = [0, 2, 3, 4]
      shape = [1, array_ops.size(scale), 1, 1, 1]
      scale = array_ops.reshape(scale, shape)
    mean_grad_y = math_ops.reduce_mean(grad_y, reduce_axis, keepdims=keepdims)
    mean_x = math_ops.reduce_mean(x, reduce_axis, keepdims=keepdims)
    var_x = math_ops.reduce_mean(
        math_ops.squared_difference(x, array_ops.stop_gradient(mean_x)),
        reduce_axis,
        keepdims=keepdims)
    grad_y_offset = grad_y - mean_grad_y
    x_offset = x - mean_x
    mean = math_ops.reduce_mean(
        grad_y * x_offset, axis=reduce_axis, keepdims=keepdims)
    grad_x = scale * math_ops.rsqrt(var_x + epsilon) * (
        grad_y_offset - math_ops.reciprocal(var_x + epsilon) * mean * x_offset)
    grad_scale = math_ops.rsqrt(var_x + epsilon) * math_ops.reduce_sum(
        grad_y * x_offset, axis=reduce_axis, keepdims=keepdims)
    if data_format == b"NCHW" or data_format == b"NCDHW":
      grad_scale = array_ops.squeeze(grad_scale)
    grad_offset = math_ops.reduce_sum(grad_y, axis=reduce_axis)
    return math_ops.cast(grad_x, x_dtype), grad_scale, grad_offset
  else:
    if data_format == b"NHWC":
      reduce_axis = [0, 1, 2]
    elif data_format == b"NDHWC":
      reduce_axis = [0, 1, 2, 3]
    elif data_format == b"NCHW":
      reduce_axis = [0, 2, 3]
      shape = [1, array_ops.size(pop_mean), 1, 1]
      pop_mean = array_ops.reshape(pop_mean, shape)
      pop_var = array_ops.reshape(pop_var, shape)
      scale = array_ops.reshape(scale, shape)
    else:
      reduce_axis = [0, 2, 3, 4]
      shape = [1, array_ops.size(pop_mean), 1, 1, 1]
      pop_mean = array_ops.reshape(pop_mean, shape)
      pop_var = array_ops.reshape(pop_var, shape)
      scale = array_ops.reshape(scale, shape)

    grad_offset = math_ops.reduce_sum(grad_y, axis=reduce_axis)
    var_rsqrt = math_ops.rsqrt(pop_var + epsilon)
    grad_scale = math_ops.reduce_sum(
        grad_y * (x - pop_mean) * var_rsqrt, axis=reduce_axis)
    grad_x = grad_y * scale * var_rsqrt
    return math_ops.cast(grad_x, x_dtype), grad_scale, grad_offset


@ops.RegisterGradient("FusedBatchNormGrad")
def _FusedBatchNormGradGrad(op, *grad):
  """Returns the gradients for the 3 inputs of FusedBatchNormGrad.

  Args:
    op: The FusedBatchNormGradOp for which we need to compute gradients.
    *grad: An argument list for tensors of gradients wrt the outputs with
      grad[0] as grad_grad_x, grad[1] as grad_grad_scale, grad[2] as
      grad_grad_offset.

  Returns:
    A tuple (grad_grad_y, grad_x, grad_scale, None, None), where grad_grad_y
    is the gradient for grad_y, grad_x the gradient for x, and grad_scale the
    gradient for scale.
  """
  data_format = op.get_attr("data_format")
  epsilon = op.get_attr("epsilon")
  is_training = op.get_attr("is_training")
  grad_y = op.inputs[0]
  x = op.inputs[1]
  scale = op.inputs[2]
  pop_mean = op.inputs[3]
  pop_var = op.inputs[4]
  grad_grad_x = grad[0]
  grad_grad_scale = grad[1]
  grad_grad_offset = grad[2]
  with backprop.GradientTape() as tape:
    tape.watch(grad_y)
    tape.watch(x)
    tape.watch(scale)
    grad_x, grad_scale, grad_offset = _BatchNormGrad(
        grad_y, x, scale, pop_mean, pop_var, epsilon, data_format, is_training)
    grad_initial = [grad_grad_x, grad_grad_scale, grad_grad_offset]
  grad_grad_y, grad_x, grad_scale = tape.gradient(
      [grad_x, grad_scale, grad_offset], [grad_y, x, scale], grad_initial)
  return grad_grad_y, grad_x, grad_scale, None, None


@ops.RegisterGradient("FusedBatchNormGradV2")
def _FusedBatchNormGradGradV2(op, *grad):
  return _FusedBatchNormGradGrad(op, *grad)


@ops.RegisterGradient("FusedBatchNormGradV3")
def _FusedBatchNormGradGradV3(op, *grad):
  grad_grad_y, grad_x, grad_scale, _, _ = _FusedBatchNormGradGrad(op, *grad)
  return grad_grad_y, grad_x, grad_scale, None, None, None


@ops.RegisterGradient("L2Loss")
def _L2LossGrad(op, grad):
  """Return the gradients for L2Loss.

  Args:
    op: The L2LossOp for which we need to generate gradients.
    grad: Tensor containing a single number.

  Returns:
    The gradient, which is (x * grad).
  """
  return op.inputs[0] * grad


@ops.RegisterGradient("TopK")
@ops.RegisterGradient("TopKV2")
def _TopKGrad(op, grad, _):
  """Return the gradients for TopK.

  Args:
    op: The TopKOp for which we need to generate gradients.
    grad: Tensor. The gradients passed to the TopKOp.

  Returns:
    A list of two tensors, the first being the gradient w.r.t. the input of
    TopK, and the second being the gradient w.r.t. the indices (all zero).
  """
  in_shape = array_ops.shape(op.inputs[0])
  ind_shape = array_ops.shape(op.outputs[1])

  # int32 is not supported on GPU, hence the up-casting.
  ind_lastdim = array_ops.gather(
      math_ops.cast(ind_shape, dtypes.int64),
      array_ops.size(ind_shape) - 1)
  # Flatten indices to 2D.
  ind_2d = array_ops.reshape(op.outputs[1], array_ops.stack([-1, ind_lastdim]))

  in_lastdim = array_ops.gather(
      math_ops.cast(in_shape, dtypes.int64),
      array_ops.size(in_shape) - 1)
  outerdim = array_ops.shape(ind_2d)[0]
  # Compute linear indices (flattened to 1D).
  ind = array_ops.reshape(
      ind_2d + math_ops.cast(
          array_ops.expand_dims(
              math_ops.range(0,
                             math_ops.cast(outerdim, dtypes.int64) * in_lastdim,
                             in_lastdim), -1), dtypes.int32), [-1])

  # Substitute grad to appropriate locations and fill the rest with zeros,
  # finally reshaping it to the original input shape.
  return [
      array_ops.reshape(
          array_ops.scatter_nd(
              array_ops.expand_dims(ind, -1), array_ops.reshape(grad, [-1]),
              [math_ops.reduce_prod(in_shape)]), in_shape),
      array_ops.zeros([], dtype=dtypes.int32)
  ]


@ops.RegisterGradient("ApproxTopK")
def _ApproxTopKGradient(op, grad, _):
  """Return the gradients for ApproxTopK.

  Args:
    op: The ApproxTopK op for which we need to generate gradients.
    grad: The gradients for backprop.

  Returns:
    Scattered gradient based on the top-k indices.
  """
  # The code below is to generate the correct index and value mapping for
  # scatter_nd to work properly.
  #
  # We use static evaluations as much as possible to reduce the runtime cost.
  # That said, we use operation.shape instead of array_ops.shape, and
  # functools.reduce(operator.mul, ...) instead of math_ops.reduce_prod.
  idx_shape = op.outputs[1].shape
  lifted_idx_shape = idx_shape + [1]
  flat_shape_len = functools.reduce(operator.mul, idx_shape)
  rank = idx_shape.rank
  reduction_dim = op.get_attr("reduction_dimension")
  if reduction_dim < 0:
    reduction_dim = rank + reduction_dim

  def GetLiftedIdx(d):
    if d == reduction_dim:
      return array_ops.reshape(op.outputs[1], lifted_idx_shape)
    iota_len = idx_shape[d]
    iota_shape = list(itertools.repeat(1, rank + 1))
    iota_shape[d] = iota_len
    iota = array_ops.reshape(math_ops.range(iota_len), iota_shape)
    return array_ops.broadcast_to(iota, lifted_idx_shape)

  lifted_idx = array_ops.concat(
      list(GetLiftedIdx(d) for d in range(rank)), axis=rank)
  flat_idx = array_ops.reshape(lifted_idx, [flat_shape_len, rank])
  flat_grad = array_ops.reshape(grad, [flat_shape_len])
  return array_ops.scatter_nd(flat_idx, flat_grad, op.inputs[0].shape)
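

# The scatter-based routing used by _TopKGrad and _ApproxTopKGradient simply
# sends each incoming gradient entry back to the input position that produced
# the corresponding top-k value. The helper below is only an illustrative
# sketch (a hypothetical name, never called by this module): for the row
# [1, 2, 3] with k=2 and upstream gradients [10, 20] (paired with the sorted
# top values [3, 2]), the returned input gradient should be [[0., 20., 10.]].
def _top_k_grad_example():
  x = array_ops.reshape(
      math_ops.cast(math_ops.range(1, 4), dtypes.float32), [1, 3])
  upstream = array_ops.reshape(
      math_ops.cast(math_ops.range(1, 3), dtypes.float32) * 10.0, [1, 2])
  with backprop.GradientTape() as tape:
    tape.watch(x)
    values, _ = nn_ops.top_k(x, k=2)
    loss = math_ops.reduce_sum(values * upstream)
  return tape.gradient(loss, x)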


@ops.RegisterGradient("NthElement")
def _NthElementGrad(op, grad):
  """Return the gradients for NthElement.

  Args:
    op: The NthElementOp for which we need to generate gradients.
    grad: Tensor. The gradients passed to the NthElementOp.

  Returns:
    A list of two tensors, the first being the gradient w.r.t. the input,
    the second being the gradient w.r.t. the N (None).
  """
  input = op.inputs[0]  # pylint: disable=redefined-builtin
  output = op.outputs[0]

  # Compute the number of elements which equal to output in each reduction
  # dimension. If there are multiple elements then the gradient will be
  # divided between them.
  indicators = math_ops.cast(
      math_ops.equal(array_ops.expand_dims(output, -1), input), grad.dtype)

  grad = array_ops.expand_dims(grad, -1)
  num_selected = array_ops.expand_dims(math_ops.reduce_sum(indicators, -1), -1)

  return [math_ops.divide(indicators, num_selected) * grad, None]


def _MeanAggregator(inputs, segments):
  """Replaces each segment with its mean along the last axis.

  Specifically, each value in the `inputs` tensor gets replaced by the mean
  value computed from the values that belong to the same segment.

  Args:
    inputs: A 2-tensor. Aggregation is done over dimension 1.
    segments: A 2-tensor, same shape as `inputs`.

  Returns:
    The result, same shape and type as `inputs`.
  """
  result = []
  for inputs_i, segments_i in zip(
      array_ops.split(inputs, inputs.shape[0]),
      array_ops.split(segments, segments.shape[0])):
    # Note that we do not use tf.math.segment_mean, as it has no TPU support.
    means_i = math_ops.unsorted_segment_mean(
        inputs_i, segments_i, num_segments=math_ops.reduce_max(segments_i) + 1)
    result.append(
        array_ops.reshape(array_ops.gather(means_i, segments_i), [-1]))
  return array_ops.stack(result, axis=0)


# We have to register the gradients for these ops so that tensorflow will know
# how to differentiate them.
@ops.RegisterGradient("IsotonicRegression")
def _IsotonicRegressionGrad(op, grad_output, grad_segments):
  """Gradient for the isotonic regression function.

  Args:
    op: The IsotonicRegression tensorflow op.
    grad_output: Tensor of incoming gradients with respect to the output.
    grad_segments: Tensor of incoming gradients with respect to the segments.

  Returns:
    A tensor, same size as `grad_output`, with the gradient with respect to
    the input.
  """
  del grad_segments  # Discrete, non-differentiable.
  segments = op.outputs[1]
  return _MeanAggregator(grad_output, segments)
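

# A small illustrative sketch (a hypothetical name, never called by this
# module) of how _MeanAggregator spreads the incoming gradient across each
# pooled segment of the isotonic regression output. For inputs
# [[1., 2., 3., 4.]] and segments [[0, 0, 1, 1]], the result is
# [[1.5, 1.5, 3.5, 3.5]].
def _mean_aggregator_example():
  inputs = array_ops.reshape(
      math_ops.cast(math_ops.range(1, 5), dtypes.float32), [1, 4])
  segments = array_ops.reshape(math_ops.range(4) // 2, [1, 4])  # [[0, 0, 1, 1]]
  return _MeanAggregator(inputs, segments)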