Nagram/TMessagesProj/jni/webrtc/modules/audio_coding/neteq/time_stretch.cc

/*
 *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "modules/audio_coding/neteq/time_stretch.h"

#include <algorithm>  // min, max
#include <memory>

#include "common_audio/signal_processing/include/signal_processing_library.h"
#include "modules/audio_coding/neteq/background_noise.h"
#include "modules/audio_coding/neteq/cross_correlation.h"
#include "modules/audio_coding/neteq/dsp_helper.h"
#include "rtc_base/numerics/safe_conversions.h"

namespace webrtc {

TimeStretch::ReturnCodes TimeStretch::Process(const int16_t* input,
                                              size_t input_len,
                                              bool fast_mode,
                                              AudioMultiVector* output,
                                              size_t* length_change_samples) {
  // Analyzes |input| for its strongest correlation lag, measures how well the
  // two segments on either side of the 15 ms mark match, and hands the result
  // to CheckCriteriaAndStretch(). On return, |length_change_samples| holds the
  // lag that was used when stretching succeeded, and 0 otherwise.

  // Sample offset of the 15 ms mark at the current rate (|fs_mult_| * 120).
  const size_t offset_15ms = static_cast<size_t>(fs_mult_ * 120);

  // The analysis runs on one channel only. Mono input is used in place; for
  // interleaved multi-channel input, channel |kRefChannel| is copied out into
  // a temporary buffer by picking every |num_channels_|-th sample.
  std::unique_ptr<int16_t[]> ref_channel;
  const int16_t* signal = input;
  size_t signal_len = input_len;
  if (num_channels_ != 1) {
    signal_len = input_len / num_channels_;
    ref_channel.reset(new int16_t[signal_len]);
    for (size_t n = 0; n < signal_len; ++n) {
      ref_channel[n] = input[kRefChannel + n * num_channels_];
    }
    signal = ref_channel.get();
  }

  // Remember the largest magnitude in the analysis signal; it determines the
  // energy scaling further down.
  max_input_value_ = WebRtcSpl_MaxAbsValueW16(signal, signal_len);

  // Bring the signal down to 4 kHz and auto-correlate it there.
  DspHelper::DownsampleTo4kHz(signal, signal_len, kDownsampledLen,
                              sample_rate_hz_, true /* compensate delay*/,
                              downsampled_input_);
  AutoCorrelation();

  // Locate the single strongest peak of the auto-correlation.
  static const size_t kPeaksToFind = 1;
  size_t peak_lag;
  int16_t peak_height;  // Required by PeakDetection(); not used afterwards.
  DspHelper::PeakDetection(auto_correlation_, kCorrelationLen, kPeaksToFind,
                           fs_mult_, &peak_lag, &peak_height);
  // The raw peak position must lie inside the up-scaled correlation range.
  assert(peak_lag <= (2 * kCorrelationLen - 1) * fs_mult_);

  // AutoCorrelation() starts at lag |kMinLag| rather than at lag zero, so the
  // peak position has to be shifted accordingly. |kMinLag| is expressed in
  // the 4 kHz domain whereas |peak_lag| is in the original sample rate, which
  // is why the offset is multiplied by fs_mult_ * 2.
  peak_lag += kMinLag * fs_mult_ * 2;
  // The compensated lag must still be within its expected bounds.
  assert(peak_lag >= static_cast<size_t>(20 * fs_mult_));
  assert(peak_lag <= 20 * fs_mult_ + (2 * kCorrelationLen - 1) * fs_mult_);

  // Pick a right-shift such that summing |peak_lag| squared samples cannot
  // overflow: combine the headroom of the largest squared sample with the
  // headroom of the summation length.
  const int square_headroom =
      WebRtcSpl_NormW32(max_input_value_ * max_input_value_);
  const int length_headroom =
      WebRtcSpl_NormW32(static_cast<int32_t>(peak_lag));
  const int scaling = std::max(0, 31 - square_headroom - length_headroom);

  // Two adjacent segments of |peak_lag| samples each: |prev_period| ends at
  // the 15 ms mark and |next_period| begins there.
  // NOTE(review): no length check is visible here; presumably the caller
  // guarantees |signal_len| >= 15 ms + |peak_lag| samples -- confirm.
  const int16_t* prev_period = signal + (offset_15ms - peak_lag);
  const int16_t* next_period = signal + offset_15ms;

  // Energy of each segment, followed by their cross-correlation, all computed
  // with the same |scaling|.
  const int32_t prev_energy = WebRtcSpl_DotProductWithScale(
      prev_period, prev_period, peak_lag, scaling);
  const int32_t next_energy = WebRtcSpl_DotProductWithScale(
      next_period, next_period, peak_lag, scaling);
  int32_t xcorr = WebRtcSpl_DotProductWithScale(prev_period, next_period,
                                                peak_lag, scaling);

  // Simple VAD: decide whether the segments look like active speech.
  const bool active_speech =
      SpeechDetection(prev_energy, next_energy, peak_lag, scaling);

  int16_t best_correlation;
  if (active_speech) {
    // Normalized correlation in Q14:
    //   xcorr / sqrt(prev_energy * next_energy).

    // Shifts that bring each energy down to 16 bits.
    int prev_shift = std::max(0, 16 - WebRtcSpl_NormW32(prev_energy));
    const int next_shift = std::max(0, 16 - WebRtcSpl_NormW32(next_energy));

    // The combined shift must be even so that it can be halved exactly after
    // the square root; bump the first shift by one if the sum is odd.
    prev_shift += (prev_shift + next_shift) & 1;

    const int16_t prev_energy_w16 =
        static_cast<int16_t>(prev_energy >> prev_shift);
    const int16_t next_energy_w16 =
        static_cast<int16_t>(next_energy >> next_shift);

    // Square root of the (scaled) energy product.
    const int16_t sqrt_energy_prod =
        WebRtcSpl_SqrtFloor(prev_energy_w16 * next_energy_w16);

    // Scale the numerator so that the quotient comes out in Q14.
    const int q14_shift = 14 - (prev_shift + next_shift) / 2;
    xcorr = WEBRTC_SPL_SHIFT_W32(xcorr, q14_shift);
    if (xcorr < 0) {
      xcorr = 0;  // A negative correlation is treated as no correlation.
    }
    best_correlation = WebRtcSpl_DivW32W16(xcorr, sqrt_energy_prod);
    // Cap the result at 1.0 in Q14.
    if (best_correlation > 16384) {
      best_correlation = 16384;
    }
  } else {
    // No active speech: let the subclass choose correlation and lag.
    SetParametersForPassiveSpeech(signal_len, &best_correlation, &peak_lag);
  }

  // Evaluate the stretch criteria and, if they are met, modify the signal.
  const ReturnCodes return_value =
      CheckCriteriaAndStretch(input, input_len, peak_lag, best_correlation,
                              active_speech, fast_mode, output);
  switch (return_value) {
    case kSuccess:
    case kSuccessLowEnergy:
      // Both success flavors report the lag as the length change.
      *length_change_samples = peak_lag;
      break;
    case kNoStretch:
    case kError:
      *length_change_samples = 0;
      break;
  }
  return return_value;
}

void TimeStretch::AutoCorrelation() {
  // Calculate correlation from lag kMinLag to lag kMaxLag in 4 kHz domain.
  int32_t auto_corr[kCorrelationLen];
  CrossCorrelationWithAutoShift(
      &downsampled_input_[kMaxLag], &downsampled_input_[kMaxLag - kMinLag],
      kCorrelationLen, kMaxLag - kMinLag, -1, auto_corr);

  // Normalize correlation to 14 bits and write to |auto_correlation_|.
  int32_t max_corr = WebRtcSpl_MaxAbsValueW32(auto_corr, kCorrelationLen);
  int scaling = std::max(0, 17 - WebRtcSpl_NormW32(max_corr));
  WebRtcSpl_VectorBitShiftW32ToW16(auto_correlation_, kCorrelationLen,
                                   auto_corr, scaling);
}

bool TimeStretch::SpeechDetection(int32_t vec1_energy,
                                  int32_t vec2_energy,
                                  size_t peak_index,
                                  int scaling) const {
  // Simple VAD. The signal is classified as *not* active speech when
  //   (vec1_energy + vec2_energy) / (2 * peak_index) <=
  //       8 * background_noise_energy,
  // which is evaluated in the division-free form
  //   (vec1_energy + vec2_energy) / 16 <= peak_index * background_noise_energy.
  // |lhs| and |rhs| below are the two sides of that second inequality; the
  // function returns true (active speech) when |lhs| > |rhs|.

  // Add in 64 bits so the sum cannot overflow, then saturate back to 32 bits.
  const int64_t energy_sum = static_cast<int64_t>(vec1_energy) + vec2_energy;
  int32_t lhs = rtc::saturated_cast<int32_t>(energy_sum / 16);

  // Noise energy of the reference channel, or a fixed threshold while the
  // background noise parameters have not been estimated yet.
  const int32_t noise_energy = background_noise_.initialized()
                                   ? background_noise_.Energy(kRefChannel)
                                   : 75000;

  // Scale the noise energy down to at most 16 bits before multiplying by
  // |peak_index|, and apply the same shift to |lhs| to keep the sides
  // comparable.
  const int noise_shift = std::max(0, 16 - WebRtcSpl_NormW32(noise_energy));
  lhs = lhs >> noise_shift;
  int32_t rhs =
      rtc::dchecked_cast<int32_t>(peak_index) * (noise_energy >> noise_shift);

  // The energies were computed on samples pre-shifted by |scaling|, so they
  // carry a scale factor of 2 * scaling that must be undone on |lhs|. If
  // |lhs| lacks the headroom for the full left shift, shift it as far as
  // possible and take the remainder out of |rhs| instead.
  const int energy_shift = 2 * scaling;
  const int headroom = WebRtcSpl_NormW32(lhs);
  if (headroom >= energy_shift) {
    lhs = lhs << energy_shift;
  } else {
    lhs = lhs << headroom;
    rhs = rhs >> (energy_shift - headroom);
  }
  return lhs > rhs;
}

}  // namespace webrtc
Update to 7.0.0 (2060) 2020-08-14 16:58:22 +00:00			`/*`
			`* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.`
			`*`
			`* Use of this source code is governed by a BSD-style license`
			`* that can be found in the LICENSE file in the root of the source`
			`* tree. An additional intellectual property rights grant can be found`
			`* in the file PATENTS. All contributing project authors may`
			`* be found in the AUTHORS file in the root of the source tree.`
			`*/`

			`#include "modules/audio_coding/neteq/time_stretch.h"`

			`#include <algorithm> // min, max`
			`#include <memory>`

			`#include "common_audio/signal_processing/include/signal_processing_library.h"`
			`#include "modules/audio_coding/neteq/background_noise.h"`
			`#include "modules/audio_coding/neteq/cross_correlation.h"`
			`#include "modules/audio_coding/neteq/dsp_helper.h"`
			`#include "rtc_base/numerics/safe_conversions.h"`

			`namespace webrtc {`

			`TimeStretch::ReturnCodes TimeStretch::Process(const int16_t* input,`
			`size_t input_len,`
			`bool fast_mode,`
			`AudioMultiVector* output,`
			`size_t* length_change_samples) {`
			`// Pre-calculate common multiplication with \|fs_mult_\|.`
			`size_t fs_mult_120 =`
			`static_cast<size_t>(fs_mult_ * 120); // Corresponds to 15 ms.`

			`const int16_t* signal;`
			`std::unique_ptr<int16_t[]> signal_array;`
			`size_t signal_len;`
			`if (num_channels_ == 1) {`
			`signal = input;`
			`signal_len = input_len;`
			`} else {`
			`// We want \|signal\| to be only the first channel of \|input\|, which is`
			`// interleaved. Thus, we take the first sample, skip forward \|num_channels\|`
			`// samples, and continue like that.`
			`signal_len = input_len / num_channels_;`
			`signal_array.reset(new int16_t[signal_len]);`
			`signal = signal_array.get();`
			`size_t j = kRefChannel;`
			`for (size_t i = 0; i < signal_len; ++i) {`
			`signal_array[i] = input[j];`
			`j += num_channels_;`
			`}`
			`}`

			`// Find maximum absolute value of input signal.`
			`max_input_value_ = WebRtcSpl_MaxAbsValueW16(signal, signal_len);`

			`// Downsample to 4 kHz sample rate and calculate auto-correlation.`
			`DspHelper::DownsampleTo4kHz(signal, signal_len, kDownsampledLen,`
			`sample_rate_hz_, true /* compensate delay*/,`
			`downsampled_input_);`
			`AutoCorrelation();`

			`// Find the strongest correlation peak.`
			`static const size_t kNumPeaks = 1;`
			`size_t peak_index;`
			`int16_t peak_value;`
			`DspHelper::PeakDetection(auto_correlation_, kCorrelationLen, kNumPeaks,`
			`fs_mult_, &peak_index, &peak_value);`
			`// Assert that \|peak_index\| stays within boundaries.`
			`assert(peak_index <= (2 * kCorrelationLen - 1) * fs_mult_);`

			`// Compensate peak_index for displaced starting position. The displacement`
			`// happens in AutoCorrelation(). Here, \|kMinLag\| is in the down-sampled 4 kHz`
			`// domain, while the \|peak_index\| is in the original sample rate; hence, the`
			`// multiplication by fs_mult_ * 2.`
			`peak_index += kMinLag * fs_mult_ * 2;`
			`// Assert that \|peak_index\| stays within boundaries.`
			`assert(peak_index >= static_cast<size_t>(20 * fs_mult_));`
			`assert(peak_index <= 20 * fs_mult_ + (2 * kCorrelationLen - 1) * fs_mult_);`

			`// Calculate scaling to ensure that \|peak_index\| samples can be square-summed`
			`// without overflowing.`
			`int scaling = 31 - WebRtcSpl_NormW32(max_input_value_ * max_input_value_) -`
			`WebRtcSpl_NormW32(static_cast<int32_t>(peak_index));`
			`scaling = std::max(0, scaling);`

			`// \|vec1\| starts at 15 ms minus one pitch period.`
			`const int16_t* vec1 = &signal[fs_mult_120 - peak_index];`
			`// \|vec2\| start at 15 ms.`
			`const int16_t* vec2 = &signal[fs_mult_120];`
			`// Calculate energies for \|vec1\| and \|vec2\|, assuming they both contain`
			`// \|peak_index\| samples.`
			`int32_t vec1_energy =`
			`WebRtcSpl_DotProductWithScale(vec1, vec1, peak_index, scaling);`
			`int32_t vec2_energy =`
			`WebRtcSpl_DotProductWithScale(vec2, vec2, peak_index, scaling);`

			`// Calculate cross-correlation between \|vec1\| and \|vec2\|.`
			`int32_t cross_corr =`
			`WebRtcSpl_DotProductWithScale(vec1, vec2, peak_index, scaling);`

			`// Check if the signal seems to be active speech or not (simple VAD).`
			`bool active_speech =`
			`SpeechDetection(vec1_energy, vec2_energy, peak_index, scaling);`

			`int16_t best_correlation;`
			`if (!active_speech) {`
			`SetParametersForPassiveSpeech(signal_len, &best_correlation, &peak_index);`
			`} else {`
			`// Calculate correlation:`
			`// cross_corr / sqrt(vec1_energy * vec2_energy).`

			`// Start with calculating scale values.`
			`int energy1_scale = std::max(0, 16 - WebRtcSpl_NormW32(vec1_energy));`
			`int energy2_scale = std::max(0, 16 - WebRtcSpl_NormW32(vec2_energy));`

			`// Make sure total scaling is even (to simplify scale factor after sqrt).`
			`if ((energy1_scale + energy2_scale) & 1) {`
			`// The sum is odd.`
			`energy1_scale += 1;`
			`}`

			`// Scale energies to int16_t.`
			`int16_t vec1_energy_int16 =`
			`static_cast<int16_t>(vec1_energy >> energy1_scale);`
			`int16_t vec2_energy_int16 =`
			`static_cast<int16_t>(vec2_energy >> energy2_scale);`

			`// Calculate square-root of energy product.`
			`int16_t sqrt_energy_prod =`
			`WebRtcSpl_SqrtFloor(vec1_energy_int16 * vec2_energy_int16);`

			`// Calculate cross_corr / sqrt(en1*en2) in Q14.`
			`int temp_scale = 14 - (energy1_scale + energy2_scale) / 2;`
			`cross_corr = WEBRTC_SPL_SHIFT_W32(cross_corr, temp_scale);`
			`cross_corr = std::max(0, cross_corr); // Don't use if negative.`
			`best_correlation = WebRtcSpl_DivW32W16(cross_corr, sqrt_energy_prod);`
			`// Make sure \|best_correlation\| is no larger than 1 in Q14.`
			`best_correlation = std::min(static_cast<int16_t>(16384), best_correlation);`
			`}`

			`// Check accelerate criteria and stretch the signal.`
			`ReturnCodes return_value =`
			`CheckCriteriaAndStretch(input, input_len, peak_index, best_correlation,`
			`active_speech, fast_mode, output);`
			`switch (return_value) {`
			`case kSuccess:`
			`*length_change_samples = peak_index;`
			`break;`
			`case kSuccessLowEnergy:`
			`*length_change_samples = peak_index;`
			`break;`
			`case kNoStretch:`
			`case kError:`
			`*length_change_samples = 0;`
			`break;`
			`}`
			`return return_value;`
			`}`

			`void TimeStretch::AutoCorrelation() {`
			`// Calculate correlation from lag kMinLag to lag kMaxLag in 4 kHz domain.`
			`int32_t auto_corr[kCorrelationLen];`
			`CrossCorrelationWithAutoShift(`
			`&downsampled_input_[kMaxLag], &downsampled_input_[kMaxLag - kMinLag],`
			`kCorrelationLen, kMaxLag - kMinLag, -1, auto_corr);`

			`// Normalize correlation to 14 bits and write to \|auto_correlation_\|.`
			`int32_t max_corr = WebRtcSpl_MaxAbsValueW32(auto_corr, kCorrelationLen);`
			`int scaling = std::max(0, 17 - WebRtcSpl_NormW32(max_corr));`
			`WebRtcSpl_VectorBitShiftW32ToW16(auto_correlation_, kCorrelationLen,`
			`auto_corr, scaling);`
			`}`

			`bool TimeStretch::SpeechDetection(int32_t vec1_energy,`
			`int32_t vec2_energy,`
			`size_t peak_index,`
			`int scaling) const {`
			`// Check if the signal seems to be active speech or not (simple VAD).`
			`// If (vec1_energy + vec2_energy) / (2 * peak_index) <=`
			`// 8 * background_noise_energy, then we say that the signal contains no`
			`// active speech.`
			`// Rewrite the inequality as:`
			`// (vec1_energy + vec2_energy) / 16 <= peak_index * background_noise_energy.`
			`// The two sides of the inequality will be denoted \|left_side\| and`
			`// \|right_side\|.`
			`int32_t left_side = rtc::saturated_cast<int32_t>(`
			`(static_cast<int64_t>(vec1_energy) + vec2_energy) / 16);`
			`int32_t right_side;`
			`if (background_noise_.initialized()) {`
			`right_side = background_noise_.Energy(kRefChannel);`
			`} else {`
			`// If noise parameters have not been estimated, use a fixed threshold.`
			`right_side = 75000;`
			`}`
			`int right_scale = 16 - WebRtcSpl_NormW32(right_side);`
			`right_scale = std::max(0, right_scale);`
			`left_side = left_side >> right_scale;`
			`right_side =`
			`rtc::dchecked_cast<int32_t>(peak_index) * (right_side >> right_scale);`

			`// Scale \|left_side\| properly before comparing with \|right_side\|.`
			`// (\|scaling\| is the scale factor before energy calculation, thus the scale`
			`// factor for the energy is 2 * scaling.)`
			`if (WebRtcSpl_NormW32(left_side) < 2 * scaling) {`
			`// Cannot scale only \|left_side\|, must scale \|right_side\| too.`
			`int temp_scale = WebRtcSpl_NormW32(left_side);`
			`left_side = left_side << temp_scale;`
			`right_side = right_side >> (2 * scaling - temp_scale);`
			`} else {`
			`left_side = left_side << 2 * scaling;`
			`}`
			`return left_side > right_side;`
			`}`

			`} // namespace webrtc`