import warnings

import numpy as np
import resampy
import torch
import tqdm

import torchcrepe


__all__ = ['CENTS_PER_BIN',
           'MAX_FMAX',
           'PITCH_BINS',
           'SAMPLE_RATE',
           'WINDOW_SIZE',
           'UNVOICED',
           'embed',
           'embed_from_file',
           'embed_from_file_to_file',
           'embed_from_files_to_files',
           'infer',
           'predict',
           'predict_from_file',
           'predict_from_file_to_file',
           'predict_from_files_to_files',
           'preprocess',
           'postprocess',
           'resample']


###############################################################################
# Constants
###############################################################################


CENTS_PER_BIN = 20  # cents
MAX_FMAX = 2006.  # hz
PITCH_BINS = 360
SAMPLE_RATE = 16000  # hz
WINDOW_SIZE = 1024  # samples
UNVOICED = np.nan
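
# 360 pitch bins at 20 cents per bin span 7200 cents (six octaves); each
# 1024-sample analysis window covers 64 ms of audio at the 16 kHz model rate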


###############################################################################
# Crepe pitch prediction
###############################################################################


def predict(audio,
            sample_rate,
            hop_length=None,
            fmin=50.,
            fmax=MAX_FMAX,
            model='full',
            decoder=torchcrepe.decode.viterbi,
            return_harmonicity=False,
            return_periodicity=False,
            batch_size=None,
            device='cpu',
            pad=True):
    """Performs pitch estimation

    Arguments
        audio (torch.tensor [shape=(1, time)])
            The audio signal
        sample_rate (int)
            The sampling rate in Hz
        hop_length (int)
            The hop_length in samples
        fmin (float)
            The minimum allowable frequency in Hz
        fmax (float)
            The maximum allowable frequency in Hz
        model (string)
            The model capacity. One of 'full' or 'tiny'.
        decoder (function)
            The decoder to use. See decode.py for decoders.
        return_harmonicity (bool) [DEPRECATED]
            Whether to also return the network confidence
        return_periodicity (bool)
            Whether to also return the network confidence
        batch_size (int)
            The number of frames per batch
        device (string)
            The device used to run inference
        pad (bool)
            Whether to zero-pad the audio

    Returns
        pitch (torch.tensor [shape=(1, 1 + int(time // hop_length))])
        (Optional) periodicity (torch.tensor
                                [shape=(1, 1 + int(time // hop_length))])
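
    Example (illustrative sketch; the path and frequency range are placeholders)
        audio, sample_rate = torchcrepe.load.audio('speech.wav')
        pitch, periodicity = predict(audio,
                                     sample_rate,
                                     fmin=50.,
                                     fmax=550.,
                                     return_periodicity=True)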
    """
    # Deprecate return_harmonicity
    if return_harmonicity:
        message = (
            'The torchcrepe return_harmonicity argument is deprecated and '
            'will be removed in a future release. Please use '
            'return_periodicity. Rationale: if network confidence measured '
            'harmonics, the value would be low for non-harmonic, periodic '
            'sounds (e.g., sine waves). But this is not observed.')
        warnings.warn(message, DeprecationWarning)
        return_periodicity = return_harmonicity

    results = []

    # Postprocessing breaks gradients, so just don't compute them
    with torch.no_grad():

        # Preprocess audio
        generator = preprocess(audio,
                               sample_rate,
                               hop_length,
                               batch_size,
                               device,
                               pad)
        for frames in generator:

            # Infer independent probabilities for each pitch bin
            probabilities = infer(frames, model)

            # shape=(batch, 360, time / hop_length)
            probabilities = probabilities.reshape(
                audio.size(0), -1, PITCH_BINS).transpose(1, 2)

            # Convert probabilities to F0 and periodicity
            result = postprocess(probabilities,
                                 fmin,
                                 fmax,
                                 decoder,
                                 return_harmonicity,
                                 return_periodicity)

            # Place on same device as audio to allow very long inputs
            if isinstance(result, tuple):
                result = (result[0].to(audio.device),
                          result[1].to(audio.device))
            else:
                result = result.to(audio.device)

            results.append(result)

    # Split pitch and periodicity
    if return_periodicity:
        pitch, periodicity = zip(*results)
        return torch.cat(pitch, 1), torch.cat(periodicity, 1)

    # Concatenate
    return torch.cat(results, 1)


def predict_from_file(audio_file,
                      hop_length=None,
                      fmin=50.,
                      fmax=MAX_FMAX,
                      model='full',
                      decoder=torchcrepe.decode.viterbi,
                      return_harmonicity=False,
                      return_periodicity=False,
                      batch_size=None,
                      device='cpu',
                      pad=True):
    """Performs pitch estimation from file on disk

    Arguments
        audio_file (string)
            The file to perform pitch tracking on
        hop_length (int)
            The hop_length in samples
        fmin (float)
            The minimum allowable frequency in Hz
        fmax (float)
            The maximum allowable frequency in Hz
        model (string)
            The model capacity. One of 'full' or 'tiny'.
        decoder (function)
            The decoder to use. See decode.py for decoders.
        return_harmonicity (bool) [DEPRECATED]
            Whether to also return the network confidence
        return_periodicity (bool)
            Whether to also return the network confidence
        batch_size (int)
            The number of frames per batch
        device (string)
            The device used to run inference
        pad (bool)
            Whether to zero-pad the audio

    Returns
        pitch (torch.tensor [shape=(1, 1 + int(time // hop_length))])
        (Optional) periodicity (torch.tensor
                                [shape=(1, 1 + int(time // hop_length))])
    """
    # Load audio
    audio, sample_rate = torchcrepe.load.audio(audio_file)

    # Predict
    return predict(audio,
                   sample_rate,
                   hop_length,
                   fmin,
                   fmax,
                   model,
                   decoder,
                   return_harmonicity,
                   return_periodicity,
                   batch_size,
                   device,
                   pad)


def predict_from_file_to_file(audio_file,
                              output_pitch_file,
                              output_harmonicity_file=None,
                              output_periodicity_file=None,
                              hop_length=None,
                              fmin=50.,
                              fmax=MAX_FMAX,
                              model='full',
                              decoder=torchcrepe.decode.viterbi,
                              batch_size=None,
                              device='cpu',
                              pad=True):
    """Performs pitch estimation from file on disk

    Arguments
        audio_file (string)
            The file to perform pitch tracking on
        output_pitch_file (string)
            The file to save predicted pitch
        output_harmonicity_file (string or None) [DEPRECATED]
            The file to save predicted harmonicity
        output_periodicity_file (string or None)
            The file to save predicted periodicity
        hop_length (int)
            The hop_length in samples
        fmin (float)
            The minimum allowable frequency in Hz
        fmax (float)
            The maximum allowable frequency in Hz
        model (string)
            The model capacity. One of 'full' or 'tiny'.
        decoder (function)
            The decoder to use. See decode.py for decoders.
        batch_size (int)
            The number of frames per batch
        device (string)
            The device used to run inference
        pad (bool)
            Whether to zero-pad the audio
    """
    # Deprecate output_harmonicity_file
    if output_harmonicity_file is not None:
        message = (
            'The torchcrepe output_harmonicity_file argument is deprecated and '
            'will be removed in a future release. Please use '
            'output_periodicity_file. Rationale: if network confidence measured '
            'harmonic content, the value would be low for non-harmonic, periodic '
            'sounds (e.g., sine waves). But this is not observed.')
        warnings.warn(message, DeprecationWarning)
        output_periodicity_file = output_harmonicity_file

    # Predict from file
    prediction = predict_from_file(audio_file,
                                   hop_length,
                                   fmin,
                                   fmax,
                                   model,
                                   decoder,
                                   False,
                                   output_periodicity_file is not None,
                                   batch_size,
                                   device,
                                   pad)

    # Save to disk
    if output_periodicity_file is not None:
        torch.save(prediction[0].detach(), output_pitch_file)
        torch.save(prediction[1].detach(), output_periodicity_file)
    else:
        torch.save(prediction.detach(), output_pitch_file)


def predict_from_files_to_files(audio_files,
                                output_pitch_files,
                                output_harmonicity_files=None,
                                output_periodicity_files=None,
                                hop_length=None,
                                fmin=50.,
                                fmax=MAX_FMAX,
                                model='full',
                                decoder=torchcrepe.decode.viterbi,
                                batch_size=None,
                                device='cpu',
                                pad=True):
    """Performs pitch estimation from files on disk without reloading model

    Arguments
        audio_files (list[string])
            The files to perform pitch tracking on
        output_pitch_files (list[string])
            The files to save predicted pitch
        output_harmonicity_files (list[string] or None) [DEPRECATED]
            The files to save predicted harmonicity
        output_periodicity_files (list[string] or None)
            The files to save predicted periodicity
        hop_length (int)
            The hop_length in samples
        fmin (float)
            The minimum allowable frequency in Hz
        fmax (float)
            The maximum allowable frequency in Hz
        model (string)
            The model capacity. One of 'full' or 'tiny'.
        decoder (function)
            The decoder to use. See decode.py for decoders.
        batch_size (int)
            The number of frames per batch
        device (string)
            The device used to run inference
        pad (bool)
            Whether to zero-pad the audio
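
    Example (illustrative sketch; input and output file names are placeholders)
        predict_from_files_to_files(['speech_1.wav', 'speech_2.wav'],
                                    ['pitch_1.pt', 'pitch_2.pt'])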
    """
    # Deprecate output_harmonicity_files
    if output_harmonicity_files is not None:
        message = (
            'The torchcrepe output_harmonicity_files argument is deprecated and '
            'will be removed in a future release. Please use '
            'output_periodicity_files. Rationale: if network confidence measured '
            'harmonic content, the value would be low for non-harmonic, periodic '
            'sounds (e.g., sine waves). But this is not observed.')
        warnings.warn(message, DeprecationWarning)
        output_periodicity_files = output_harmonicity_files

    if output_periodicity_files is None:
        output_periodicity_files = len(audio_files) * [None]

    # Setup iterator
    iterator = zip(audio_files, output_pitch_files, output_periodicity_files)
    iterator = tqdm.tqdm(iterator, desc='torchcrepe', dynamic_ncols=True)
    for audio_file, output_pitch_file, output_periodicity_file in iterator:

        # Predict a file
        predict_from_file_to_file(audio_file,
                                  output_pitch_file,
                                  None,
                                  output_periodicity_file,
                                  hop_length,
                                  fmin,
                                  fmax,
                                  model,
                                  decoder,
                                  batch_size,
                                  device,
                                  pad)

###############################################################################
# Crepe pitch embedding
###############################################################################


def embed(audio,
          sample_rate,
          hop_length=None,
          model='full',
          batch_size=None,
          device='cpu',
          pad=True):
    """Embeds audio to the output of CREPE's fifth maxpool layer

    Arguments
        audio (torch.tensor [shape=(1, time)])
            The audio signals
        sample_rate (int)
            The sampling rate in Hz
        hop_length (int)
            The hop_length in samples
        model (string)
            The model capacity. One of 'full' or 'tiny'.
        batch_size (int)
            The number of frames per batch
        device (string)
            The device to run inference on
        pad (bool)
            Whether to zero-pad the audio

    Returns
        embedding (torch.tensor [shape=(1,
                                        1 + int(time // hop_length), 32, -1)])
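
    Example (illustrative sketch; the audio path is a placeholder)
        audio, sample_rate = torchcrepe.load.audio('speech.wav')
        embedding = embed(audio, sample_rate, model='full')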
    """
    results = []

    # Preprocess audio
    generator = preprocess(audio,
                           sample_rate,
                           hop_length,
                           batch_size,
                           device,
                           pad)
    for frames in generator:

        # Infer pitch embeddings
        embedding = infer(frames, model, embed=True)

        # shape=(batch, time / hop_length, 32, embedding_size)
        result = embedding.reshape(audio.size(0), frames.size(0), 32, -1)

        # Place on same device as audio. This allows for large inputs.
        results.append(result.to(audio.device))

    # Concatenate
    return torch.cat(results, 1)


def embed_from_file(audio_file,
                    hop_length=None,
                    model='full',
                    batch_size=None,
                    device='cpu',
                    pad=True):
    """Embeds audio from disk to the output of CREPE's fifth maxpool layer

    Arguments
        audio_file (string)
            The wav file containing the audio to embed
        hop_length (int)
            The hop_length in samples
        model (string)
            The model capacity. One of 'full' or 'tiny'.
        batch_size (int)
            The number of frames per batch
        device (string)
            The device to run inference on
        pad (bool)
            Whether to zero-pad the audio

    Returns
        embedding (torch.tensor [shape=(1,
                                        1 + int(time // hop_length), 32, -1)])
    """
    # Load audio
    audio, sample_rate = torchcrepe.load.audio(audio_file)

    # Embed
    return embed(audio,
                 sample_rate,
                 hop_length,
                 model,
                 batch_size,
                 device,
                 pad)


def embed_from_file_to_file(audio_file,
                            output_file,
                            hop_length=None,
                            model='full',
                            batch_size=None,
                            device='cpu',
                            pad=True):
    """Embeds audio from disk and saves to disk

    Arguments
        audio_file (string)
            The wav file containing the audio to embed
        output_file (string)
            The file to save the embedding
        hop_length (int)
            The hop_length in samples
        model (string)
            The model capacity. One of 'full' or 'tiny'.
        batch_size (int)
            The number of frames per batch
        device (string)
            The device to run inference on
        pad (bool)
            Whether to zero-pad the audio
    """
    # No use computing gradients if we're just saving to file
    with torch.no_grad():

        # Embed
        embedding = embed_from_file(audio_file,
                                    hop_length,
                                    model,
                                    batch_size,
                                    device,
                                    pad)

        # Save to disk
        torch.save(embedding.detach(), output_file)


def embed_from_files_to_files(audio_files,
                              output_files,
                              hop_length=None,
                              model='full',
                              batch_size=None,
                              device='cpu',
                              pad=True):
    """Embeds audio from disk and saves to disk without reloading model

    Arguments
        audio_files (list[string])
            The wav files containing the audio to embed
        output_files (list[string])
            The files to save the embeddings
        hop_length (int)
            The hop_length in samples
        model (string)
            The model capacity. One of 'full' or 'tiny'.
        batch_size (int)
            The number of frames per batch
        device (string)
            The device to run inference on
        pad (bool)
            Whether to zero-pad the audio
    """
    # Setup iterator
    iterator = zip(audio_files, output_files)
    iterator = tqdm.tqdm(iterator, desc='torchcrepe', dynamic_ncols=True)
    for audio_file, output_file in iterator:

        # Embed a file
        embed_from_file_to_file(audio_file,
                                output_file,
                                hop_length,
                                model,
                                batch_size,
                                device,
                                pad)


###############################################################################
# Components for step-by-step prediction
###############################################################################


def infer(frames, model='full', embed=False):
    """Forward pass through the model

    Arguments
        frames (torch.tensor [shape=(time / hop_length, 1024)])
            The network input
        model (string)
            The model capacity. One of 'full' or 'tiny'.
        embed (bool)
            Whether to stop inference at the intermediate embedding layer

    Returns
        logits (torch.tensor [shape=(1 + int(time // hop_length), 360)]) OR
        embedding (torch.tensor [shape=(1 + int(time // hop_length),
                                       embedding_size)])
    """
    # Load the model if necessary
    if not hasattr(infer, 'model') or not hasattr(infer, 'capacity') or \
       (hasattr(infer, 'capacity') and infer.capacity != model):
        torchcrepe.load.model(frames.device, model)

    # Move model to correct device (no-op if devices are the same)
    infer.model = infer.model.to(frames.device)

    # Apply model
    return infer.model(frames, embed=embed)


def postprocess(probabilities,
                fmin=0.,
                fmax=MAX_FMAX,
                decoder=torchcrepe.decode.viterbi,
                return_harmonicity=False,
                return_periodicity=False):
    """Convert model output to F0 and periodicity

    Arguments
        probabilities (torch.tensor [shape=(1, 360, time / hop_length)])
            The probabilities for each pitch bin inferred by the network
        fmin (float)
            The minimum allowable frequency in Hz
        fmax (float)
            The maximum allowable frequency in Hz
        decoder (function)
            The decoder to use. See decode.py for decoders.
        return_harmonicity (bool) [DEPRECATED]
            Whether to also return the network confidence
        return_periodicity (bool)
            Whether to also return the network confidence

    Returns
        pitch (torch.tensor [shape=(1, 1 + int(time // hop_length))])
        periodicity (torch.tensor [shape=(1, 1 + int(time // hop_length))])
    """
    # Sampling is non-differentiable, so remove from graph
    probabilities = probabilities.detach()

    # Convert frequency range to pitch bin range
    minidx = torchcrepe.convert.frequency_to_bins(torch.tensor(fmin))
    maxidx = torchcrepe.convert.frequency_to_bins(torch.tensor(fmax),
                                                  torch.ceil)

    # Remove frequencies outside of allowable range
    probabilities[:, :minidx] = -float('inf')
    probabilities[:, maxidx:] = -float('inf')

    # Perform argmax or viterbi sampling
    bins, pitch = decoder(probabilities)

    # Deprecate return_harmonicity
    if return_harmonicity:
        message = (
            'The torchcrepe return_harmonicity argument is deprecated and '
            'will be removed in a future release. Please use '
            'return_periodicity. Rationale: if network confidence measured '
            'harmonics, the value would be low for non-harmonic, periodic '
            'sounds (e.g., sine waves). But this is not observed.')
        warnings.warn(message, DeprecationWarning)
        return_periodicity = return_harmonicity

    if not return_periodicity:
        return pitch

    # Compute periodicity from probabilities and decoded pitch bins
    return pitch, periodicity(probabilities, bins)


def preprocess(audio,
               sample_rate,
               hop_length=None,
               batch_size=None,
               device='cpu',
               pad=True):
    """Convert audio to model input

    Arguments
        audio (torch.tensor [shape=(1, time)])
            The audio signals
        sample_rate (int)
            The sampling rate in Hz
        hop_length (int)
            The hop_length in samples
        batch_size (int)
            The number of frames per batch
        device (string)
            The device to run inference on
        pad (bool)
            Whether to zero-pad the audio

    Returns
        frames (torch.tensor [shape=(1 + int(time // hop_length), 1024)])
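
    Example (illustrative sketch of the step-by-step pipeline)
        for frames in preprocess(audio, sample_rate):
            probabilities = infer(frames, model='full')
            probabilities = probabilities.reshape(
                audio.size(0), -1, PITCH_BINS).transpose(1, 2)
            pitch = postprocess(probabilities, fmin=50., fmax=550.)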
    """
    # Default hop length of 10 ms
    hop_length = sample_rate // 100 if hop_length is None else hop_length

    # Resample
    if sample_rate != SAMPLE_RATE:
        audio = resample(audio, sample_rate)
        hop_length = int(hop_length * SAMPLE_RATE / sample_rate)

    # Get the total number of frames and maybe zero-pad so that each frame is
    # centered on a multiple of hop_length in the original audio
    if pad:
        total_frames = 1 + int(audio.size(1) // hop_length)
        audio = torch.nn.functional.pad(
            audio,
            (WINDOW_SIZE // 2, WINDOW_SIZE // 2))
    else:
        total_frames = 1 + int((audio.size(1) - WINDOW_SIZE) // hop_length)

    # Default to running all frames in a single batch
    batch_size = total_frames if batch_size is None else batch_size

    # Generate batches
    for i in range(0, total_frames, batch_size):

        # Batch indices
        start = max(0, i * hop_length)
        end = min(audio.size(1),
                  (i + batch_size - 1) * hop_length + WINDOW_SIZE)

        # Chunk into overlapping WINDOW_SIZE-sample frames, one per hop_length
        frames = torch.nn.functional.unfold(
            audio[:, None, None, start:end],
            kernel_size=(1, WINDOW_SIZE),
            stride=(1, hop_length))

        # shape=(1 + int(time / hop_length), 1024)
        frames = frames.transpose(1, 2).reshape(-1, WINDOW_SIZE)

        # Place on device
        frames = frames.to(device)

        # Mean-center
        frames -= frames.mean(dim=1, keepdim=True)

        # Scale
        # Note: during silent frames, this produces very large values. But
        # this seems to be what the network expects.
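        # The 1e-10 floor guards against division by zero on constant frames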
        frames /= torch.max(torch.tensor(1e-10, device=frames.device),
                            frames.std(dim=1, keepdim=True))

        yield frames


###############################################################################
# Utilities
###############################################################################


def periodicity(probabilities, bins):
    """Computes the periodicity from the network output and pitch bins"""
    # shape=(batch * time / hop_length, 360)
    probs_stacked = probabilities.transpose(1, 2).reshape(-1, PITCH_BINS)

    # shape=(batch * time / hop_length, 1)
    bins_stacked = bins.reshape(-1, 1).to(torch.int64)

    # Use the network output at the decoded pitch bin as periodicity
    periodicity = probs_stacked.gather(1, bins_stacked)

    # shape=(batch, time / hop_length)
    return periodicity.reshape(probabilities.size(0), probabilities.size(2))


def resample(audio, sample_rate):
    """Resample audio"""
    # Store device for later placement
    device = audio.device

    # Convert to numpy
    audio = audio.detach().cpu().numpy().squeeze(0)

    # Resample
    # We have to use resampy if we want numbers to match Crepe
    audio = resampy.resample(audio, sample_rate, SAMPLE_RATE)

    # Convert to pytorch
    return torch.tensor(audio, device=device).unsqueeze(0)