xref: /aosp_15_r20/external/tensorflow/tensorflow/python/ops/signal/spectral_ops.py (revision b6fb3261f9314811a0f4371741dbb8839866f948)
1# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7#     http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14# ==============================================================================
15"""Spectral operations (e.g. Short-time Fourier Transform)."""
16
17import numpy as np
18
19from tensorflow.python.framework import constant_op
20from tensorflow.python.framework import dtypes
21from tensorflow.python.framework import ops
22from tensorflow.python.framework import tensor_util
23from tensorflow.python.ops import array_ops
24from tensorflow.python.ops import math_ops
25from tensorflow.python.ops.signal import dct_ops
26from tensorflow.python.ops.signal import fft_ops
27from tensorflow.python.ops.signal import reconstruction_ops
28from tensorflow.python.ops.signal import shape_ops
29from tensorflow.python.ops.signal import window_ops
30from tensorflow.python.util import dispatch
31from tensorflow.python.util.tf_export import tf_export
32
33
@tf_export('signal.stft')
@dispatch.add_dispatch_support
def stft(signals, frame_length, frame_step, fft_length=None,
         window_fn=window_ops.hann_window,
         pad_end=False, name=None):
  """Computes the [Short-time Fourier Transform][stft] of `signals`.

  The signal is cut into overlapping frames, each frame is optionally
  multiplied by a window, and a real FFT is taken over every frame.

  Implemented with TPU/GPU-compatible ops and supports gradients.

  Args:
    signals: A `[..., samples]` `float32`/`float64` `Tensor` of real-valued
      signals.
    frame_length: An integer scalar `Tensor`. The window length in samples.
    frame_step: An integer scalar `Tensor`. The number of samples to step.
    fft_length: An integer scalar `Tensor`. The size of the FFT to apply.
      If not provided, uses the smallest power of 2 enclosing `frame_length`.
    window_fn: A callable that takes a window length and a `dtype` keyword
      argument and returns a `[window_length]` `Tensor` of samples in the
      provided datatype. If set to `None`, no windowing is used.
    pad_end: Whether to pad the end of `signals` with zeros when the provided
      frame length and step produces a frame that lies partially past its end.
    name: An optional name for the operation.

  Returns:
    A `[..., frames, fft_unique_bins]` `Tensor` of `complex64`/`complex128`
    STFT values where `fft_unique_bins` is `fft_length // 2 + 1` (the unique
    components of the FFT).

  Raises:
    ValueError: If `signals` is not at least rank 1, `frame_length` is
      not scalar, `frame_step` is not scalar, or `fft_length` is not scalar.

  [stft]: https://en.wikipedia.org/wiki/Short-time_Fourier_transform
  """
  with ops.name_scope(name, 'stft', [signals, frame_length,
                                     frame_step]):
    signals = ops.convert_to_tensor(signals, name='signals')
    signals.shape.with_rank_at_least(1)
    frame_length = ops.convert_to_tensor(frame_length, name='frame_length')
    frame_length.shape.assert_has_rank(0)
    frame_step = ops.convert_to_tensor(frame_step, name='frame_step')
    frame_step.shape.assert_has_rank(0)

    if fft_length is None:
      fft_length = _enclosing_power_of_two(frame_length)
    else:
      fft_length = ops.convert_to_tensor(fft_length, name='fft_length')
      # Validate here, exactly as `inverse_stft` does, so that a non-scalar
      # FFT size is reported against this argument instead of surfacing later
      # from the FFT op.
      fft_length.shape.assert_has_rank(0)

    framed_signals = shape_ops.frame(
        signals, frame_length, frame_step, pad_end=pad_end)

    # Optionally window the framed signals.
    if window_fn is not None:
      window = window_fn(frame_length, dtype=framed_signals.dtype)
      framed_signals *= window

    # fft_ops.rfft produces the (fft_length/2 + 1) unique components of the
    # FFT of the real windowed signals in framed_signals.
    return fft_ops.rfft(framed_signals, [fft_length])
93
94
@tf_export('signal.inverse_stft_window_fn')
@dispatch.add_dispatch_support
def inverse_stft_window_fn(frame_step,
                           forward_window_fn=window_ops.hann_window,
                           name=None):
  """Generates a window function that can be used in `inverse_stft`.

  The generated window is the forward window divided, sample by sample, by the
  sum of the squared forward window over all frames that overlap that sample.
  When the forward window alone would already give an exact inverse, the
  result is equivalent to `forward_window_fn`.

  See examples in `inverse_stft` documentation for usage.

  Args:
    frame_step: An integer scalar `Tensor`. The number of samples to step.
    forward_window_fn: window_fn used in the forward transform, `stft`.
    name: An optional name for the operation.

  Returns:
    A callable that takes a window length and a `dtype` keyword argument and
      returns a `[window_length]` `Tensor` of samples in the provided datatype.
      The returned window is suitable for reconstructing original waveform in
      inverse_stft.
  """
  def inverse_stft_window_fn_inner(frame_length, dtype):
    """Computes a window that can be used in `inverse_stft`.

    Args:
      frame_length: An integer scalar `Tensor`. The window length in samples.
      dtype: Data type of waveform passed to `stft`.

    Returns:
      A window suitable for reconstructing original waveform in `inverse_stft`.

    Raises:
      ValueError: If `frame_length` is not scalar or `frame_step` is not
        scalar.
    """
    with ops.name_scope(name, 'inverse_stft_window_fn', [forward_window_fn]):
      step = ops.convert_to_tensor(frame_step, name='frame_step')
      step.shape.assert_has_rank(0)
      frame_length = ops.convert_to_tensor(frame_length, name='frame_length')
      frame_length.shape.assert_has_rank(0)

      # Use equation 7 from Griffin + Lim.
      analysis_window = forward_window_fn(frame_length, dtype=dtype)
      energy = math_ops.square(analysis_window)

      # Ceiling division written with floor division: ceil(a / b) == -(-a // b).
      num_overlaps = -((-frame_length) // step)  # pylint: disable=invalid-unary-operand-type
      padded_length = num_overlaps * step

      # Zero-pad to a whole number of steps, fold into rows of one step each,
      # and sum the rows to get the total window energy at each step offset.
      energy = array_ops.pad(energy, [(0, padded_length - frame_length)])
      energy = array_ops.reshape(energy, [num_overlaps, step])
      energy = math_ops.reduce_sum(energy, axis=0, keepdims=True)

      # Repeat the per-offset totals so they line up with every window sample.
      energy = array_ops.tile(energy, [num_overlaps, 1])
      energy = array_ops.reshape(energy, [padded_length])

      return analysis_window / energy[:frame_length]

  return inverse_stft_window_fn_inner
153
154
@tf_export('signal.inverse_stft')
@dispatch.add_dispatch_support
def inverse_stft(stfts,
                 frame_length,
                 frame_step,
                 fft_length=None,
                 window_fn=window_ops.hann_window,
                 name=None):
  """Computes the inverse [Short-time Fourier Transform][stft] of `stfts`.

  Each STFT frame is inverted with a real inverse FFT, resized to
  `frame_length` samples, optionally windowed, and the frames are then
  overlap-added with a hop of `frame_step`.

  To reconstruct an original waveform, a complementary window function should
  be used with `inverse_stft`. Such a window function can be constructed with
  `tf.signal.inverse_stft_window_fn`.
  Example:

  ```python
  frame_length = 400
  frame_step = 160
  waveform = tf.random.normal(dtype=tf.float32, shape=[1000])
  stft = tf.signal.stft(waveform, frame_length, frame_step)
  inverse_stft = tf.signal.inverse_stft(
      stft, frame_length, frame_step,
      window_fn=tf.signal.inverse_stft_window_fn(frame_step))
  ```

  If a custom `window_fn` is used with `tf.signal.stft`, it must be passed to
  `tf.signal.inverse_stft_window_fn`:

  ```python
  frame_length = 400
  frame_step = 160
  window_fn = tf.signal.hamming_window
  waveform = tf.random.normal(dtype=tf.float32, shape=[1000])
  stft = tf.signal.stft(
      waveform, frame_length, frame_step, window_fn=window_fn)
  inverse_stft = tf.signal.inverse_stft(
      stft, frame_length, frame_step,
      window_fn=tf.signal.inverse_stft_window_fn(
         frame_step, forward_window_fn=window_fn))
  ```

  Implemented with TPU/GPU-compatible ops and supports gradients.

  Args:
    stfts: A `complex64`/`complex128` `[..., frames, fft_unique_bins]`
      `Tensor` of STFT bins representing a batch of `fft_length`-point STFTs
      where `fft_unique_bins` is `fft_length // 2 + 1`
    frame_length: An integer scalar `Tensor`. The window length in samples.
    frame_step: An integer scalar `Tensor`. The number of samples to step.
    fft_length: An integer scalar `Tensor`. The size of the FFT that produced
      `stfts`. If not provided, uses the smallest power of 2 enclosing
      `frame_length`.
    window_fn: A callable that takes a window length and a `dtype` keyword
      argument and returns a `[window_length]` `Tensor` of samples in the
      provided datatype. If set to `None`, no windowing is used.
    name: An optional name for the operation.

  Returns:
    A `[..., samples]` `Tensor` of `float32`/`float64` signals representing
    the inverse STFT for each input STFT in `stfts`.

  Raises:
    ValueError: If `stfts` is not at least rank 2, `frame_length` is not scalar,
      `frame_step` is not scalar, or `fft_length` is not scalar.

  [stft]: https://en.wikipedia.org/wiki/Short-time_Fourier_transform
  """
  with ops.name_scope(name, 'inverse_stft', [stfts]):
    stfts = ops.convert_to_tensor(stfts, name='stfts')
    stfts.shape.with_rank_at_least(2)
    frame_length = ops.convert_to_tensor(frame_length, name='frame_length')
    frame_length.shape.assert_has_rank(0)
    frame_step = ops.convert_to_tensor(frame_step, name='frame_step')
    frame_step.shape.assert_has_rank(0)
    if fft_length is not None:
      fft_length = ops.convert_to_tensor(fft_length, name='fft_length')
      fft_length.shape.assert_has_rank(0)
    else:
      fft_length = _enclosing_power_of_two(frame_length)

    frames = fft_ops.irfft(stfts, [fft_length])

    # The inverse FFT yields frames whose inner dimension may be longer or
    # shorter than frame_length, so the frames are truncated or zero-padded
    # to exactly frame_length samples below.
    static_length = tensor_util.constant_value(frame_length)
    needs_dynamic_resize = (
        static_length is None or frames.shape.ndims is None or
        frames.shape.as_list()[-1] is None)

    if needs_dynamic_resize:
      # Either frame_length or the inner dimension of the frames is unknown
      # statically: truncate first, then pad by a non-negative dynamic amount.
      frames = frames[..., :frame_length]
      frames_rank = array_ops.rank(frames)
      frames_shape = array_ops.shape(frames)
      leading_paddings = array_ops.zeros([frames_rank - 1, 2],
                                         dtype=frame_length.dtype)
      inner_padding = [
          [0, math_ops.maximum(0, frame_length - frames_shape[-1])]]
      paddings = array_ops.concat([leading_paddings, inner_padding], 0)
      frames = array_ops.pad(frames, paddings)
    else:
      # Both sizes are known statically, so resize only when they differ.
      inner_size = frames.shape.as_list()[-1]
      if inner_size > static_length:
        frames = frames[..., :static_length]
      elif inner_size < static_length:
        untouched_dims = [[0, 0]] * (frames.shape.ndims - 1)
        frames = array_ops.pad(
            frames, untouched_dims + [[0, static_length - inner_size]])

    # The resize above may not be shape-inference friendly, so put the known
    # inner dimension back on the static shape when we are able to.
    if static_length is not None and frames.shape.ndims is not None:
      leading_dims = [None] * (frames.shape.ndims - 1)
      frames.set_shape(leading_dims + [static_length])

    # Optionally window, then overlap-add the inner 2 dimensions of the
    # frames into a single [samples] dimension.
    if window_fn is not None:
      frames *= window_fn(frame_length, dtype=stfts.dtype.real_dtype)
    return reconstruction_ops.overlap_and_add(frames, frame_step)
275
276
def _enclosing_power_of_two(value):
  """Return 2**N for integer N such that 2**N >= value.

  Args:
    value: A scalar `Tensor`. When its value is known statically the result is
      computed in Python; otherwise it is computed with graph ops.

  Returns:
    A `Tensor` with the same dtype as `value` holding the smallest power of
    two that is greater than or equal to `value`.
  """
  value_static = tensor_util.constant_value(value)
  if value_static is not None:
    static_array = np.asarray(value_static)
    if (static_array.ndim == 0 and
        np.issubdtype(static_array.dtype, np.integer) and static_array >= 1):
      # Exact integer arithmetic. The float form `ceil(log(v) / log(2))` can
      # come out slightly above an integer when `v` is already a power of two,
      # which would double the result.
      power = 1 << (int(static_array) - 1).bit_length()
    else:
      # Non-integer, non-scalar or non-positive input: keep the original
      # floating-point computation so these cases behave as before.
      power = int(2**np.ceil(np.log(value_static) / np.log(2.0)))
    return constant_op.constant(power, value.dtype)
  # NOTE(review): the dynamic path works in float32, so it may be inexact for
  # large values -- confirm this is acceptable for callers.
  return math_ops.cast(
      math_ops.pow(
          2.0,
          math_ops.ceil(
              math_ops.log(math_ops.cast(value, dtypes.float32)) /
              math_ops.log(2.0))), value.dtype)
289
290
@tf_export('signal.mdct')
@dispatch.add_dispatch_support
def mdct(signals, frame_length, window_fn=window_ops.vorbis_window,
         pad_end=False, norm=None, name=None):
  """Computes the [Modified Discrete Cosine Transform][mdct] of `signals`.

  The signal is framed with a hop of `frame_length // 2`, each frame is
  windowed (or scaled by 1/sqrt(2) when `window_fn` is `None`), folded from
  `frame_length` down to `frame_length // 2` samples, and passed through a
  type 4 DCT.

  Implemented with TPU/GPU-compatible ops and supports gradients.

  Args:
    signals: A `[..., samples]` `float32`/`float64` `Tensor` of real-valued
      signals.
    frame_length: An integer scalar `Tensor`. The window length in samples
      which must be divisible by 4.
    window_fn: A callable that takes a frame_length and a `dtype` keyword
      argument and returns a `[frame_length]` `Tensor` of samples in the
      provided datatype. If set to `None`, a rectangular window with a scale of
      1/sqrt(2) is used. For perfect reconstruction of a signal from `mdct`
      followed by `inverse_mdct`, please use `tf.signal.vorbis_window`,
      `tf.signal.kaiser_bessel_derived_window` or `None`. If using another
      window function, make sure that w[n]^2 + w[n + frame_length // 2]^2 = 1
      and w[n] = w[frame_length - n - 1] for n = 0,...,frame_length // 2 - 1 to
      achieve perfect reconstruction.
    pad_end: Whether to pad the end of `signals` with zeros when the provided
      frame length and step produces a frame that lies partially past its end.
    norm: If it is None, unnormalized dct4 is used, if it is "ortho"
      orthonormal dct4 is used.
    name: An optional name for the operation.

  Returns:
    A `[..., frames, frame_length // 2]` `Tensor` of `float32`/`float64`
    MDCT values where `frames` is roughly `samples // (frame_length // 2)`
    when `pad_end=False`.

  Raises:
    ValueError: If `signals` is not at least rank 1, `frame_length` is
      not scalar, or `frame_length` is not a multiple of `4`.

  [mdct]: https://en.wikipedia.org/wiki/Modified_discrete_cosine_transform
  """
  with ops.name_scope(name, 'mdct', [signals, frame_length]):
    signals = ops.convert_to_tensor(signals, name='signals')
    signals.shape.with_rank_at_least(1)
    frame_length = ops.convert_to_tensor(frame_length, name='frame_length')
    frame_length.shape.assert_has_rank(0)

    # The hop is half a frame. Divisibility by 4 can only be verified when
    # the frame length is known statically.
    static_length = tensor_util.constant_value(frame_length)
    if static_length is None:
      hop = frame_length // 2
    else:
      if static_length % 4 != 0:
        raise ValueError('The frame length must be a multiple of 4.')
      hop = ops.convert_to_tensor(static_length // 2,
                                  dtype=frame_length.dtype)

    frames = shape_ops.frame(signals, frame_length, hop, pad_end=pad_end)

    # Apply the window, or the 1/sqrt(2) rectangular scaling when there is
    # no window function.
    if window_fn is None:
      frames *= 1.0 / np.sqrt(2)
    else:
      frames *= window_fn(frame_length, dtype=frames.dtype)

    # Fold each frame of four quarters into two halves.
    quarter_0, quarter_1, quarter_2, quarter_3 = array_ops.split(
        frames, 4, axis=-1)
    folded_first = -array_ops.reverse(quarter_2, [-1]) - quarter_3
    folded_second = quarter_0 - array_ops.reverse(quarter_1, [-1])
    folded = array_ops.concat([folded_first, folded_second], axis=-1)

    # The type 4 DCT of the folded frames yields the (frame_length // 2)
    # MDCT coefficients; `norm` selects the unnormalized or orthonormal form.
    return dct_ops.dct(folded, type=4, norm=norm)
365
366
@tf_export('signal.inverse_mdct')
@dispatch.add_dispatch_support
def inverse_mdct(mdcts,
                 window_fn=window_ops.vorbis_window,
                 norm=None,
                 name=None):
  """Computes the inverse modified DCT of `mdcts`.

  To reconstruct an original waveform, the same window function should
  be used with `mdct` and `inverse_mdct`.

  Example usage:

  >>> @tf.function
  ... def compare_round_trip():
  ...   samples = 1000
  ...   frame_length = 400
  ...   halflen = frame_length // 2
  ...   waveform = tf.random.normal(dtype=tf.float32, shape=[samples])
  ...   waveform_pad = tf.pad(waveform, [[halflen, 0],])
  ...   mdct = tf.signal.mdct(waveform_pad, frame_length, pad_end=True,
  ...                         window_fn=tf.signal.vorbis_window)
  ...   inverse_mdct = tf.signal.inverse_mdct(mdct,
  ...                                         window_fn=tf.signal.vorbis_window)
  ...   inverse_mdct = inverse_mdct[halflen: halflen + samples]
  ...   return waveform, inverse_mdct
  >>> waveform, inverse_mdct = compare_round_trip()
  >>> np.allclose(waveform.numpy(), inverse_mdct.numpy(), rtol=1e-3, atol=1e-4)
  True

  Implemented with TPU/GPU-compatible ops and supports gradients.

  Args:
    mdcts: A `float32`/`float64` `[..., frames, frame_length // 2]`
      `Tensor` of MDCT bins representing a batch of `frame_length // 2`-point
      MDCTs.
    window_fn: A callable that takes a frame_length and a `dtype` keyword
      argument and returns a `[frame_length]` `Tensor` of samples in the
      provided datatype. If set to `None`, a rectangular window with a scale of
      1/sqrt(2) is used. For perfect reconstruction of a signal from `mdct`
      followed by `inverse_mdct`, please use `tf.signal.vorbis_window`,
      `tf.signal.kaiser_bessel_derived_window` or `None`. If using another
      window function, make sure that w[n]^2 + w[n + frame_length // 2]^2 = 1
      and w[n] = w[frame_length - n - 1] for n = 0,...,frame_length // 2 - 1 to
      achieve perfect reconstruction.
    norm: If "ortho", orthonormal inverse DCT4 is performed, if it is None,
      a regular dct4 followed by scaling of `1/frame_length` is performed.
    name: An optional name for the operation.

  Returns:
    A `[..., samples]` `Tensor` of `float32`/`float64` signals representing
    the inverse MDCT for each input MDCT in `mdcts` where `samples` is
    `(frames - 1) * (frame_length // 2) + frame_length`.

  Raises:
    ValueError: If `mdcts` is not at least rank 2, or `norm` is neither `None`
      nor `'ortho'`.

  [mdct]: https://en.wikipedia.org/wiki/Modified_discrete_cosine_transform
  """
  # Reject unsupported normalizations before building any ops. Without this
  # check neither branch below would assign `result_idct4`, and the function
  # would fail with an unhelpful unbound-variable error.
  if norm is not None and norm != 'ortho':
    raise ValueError(
        "Unknown normalization. Expected None or 'ortho', got: %s" % (norm,))

  with ops.name_scope(name, 'inverse_mdct', [mdcts]):
    mdcts = ops.convert_to_tensor(mdcts, name='mdcts')
    mdcts.shape.with_rank_at_least(2)
    # The inner dimension holds frame_length // 2 coefficients per frame.
    half_len = math_ops.cast(mdcts.shape[-1], dtype=dtypes.int32)

    if norm is None:
      # Unnormalized DCT4 scaled by 0.5 / half_len, i.e. 1 / frame_length.
      half_len_float = math_ops.cast(half_len, dtype=mdcts.dtype)
      result_idct4 = (0.5 / half_len_float) * dct_ops.dct(mdcts, type=4)
    else:
      result_idct4 = dct_ops.dct(mdcts, type=4, norm='ortho')

    # Unfold the half_len DCT outputs back into full frames of 2 * half_len
    # samples by mirroring and negating the two halves.
    split_result = array_ops.split(result_idct4, 2, axis=-1)
    real_frames = array_ops.concat((split_result[1],
                                    -array_ops.reverse(split_result[1], [-1]),
                                    -array_ops.reverse(split_result[0], [-1]),
                                    -split_result[0]), axis=-1)

    # Optionally window and overlap-add the inner 2 dimensions of real_frames
    # into a single [samples] dimension.
    if window_fn is not None:
      window = window_fn(2 * half_len, dtype=mdcts.dtype)
      real_frames *= window
    else:
      real_frames *= 1.0 / np.sqrt(2)
    return reconstruction_ops.overlap_and_add(real_frames, half_len)
450