Source code for quail.decode_speech

from __future__ import print_function
from builtins import str
from builtins import range
import os
import json
import csv
import pickle
import time
import warnings
import pandas as pd

# optional imports for speech decoding
try:
    import whisper
    HAS_WHISPER = True
except ImportError:
    HAS_WHISPER = False



[docs]
# Audio file extensions (lower-case) recognised for both single files and folders.
_AUDIO_EXTENSIONS = (".wav", ".mp3", ".m4a", ".flac")


def _find_audio_files(path):
    """Return the list of audio file paths designated by `path` (file or folder)."""
    # Case-insensitive so that 'clip.WAV' is treated the same way whether it is
    # passed directly or discovered inside a folder.
    if path.lower().endswith(_AUDIO_EXTENSIONS):
        return [path]
    if os.path.isdir(path):
        # os.listdir order is arbitrary; sort so results line up with files
        # deterministically across runs and platforms.
        return [
            os.path.join(path, name)
            for name in sorted(os.listdir(path))
            if name.lower().endswith(_AUDIO_EXTENSIONS)
        ]
    raise ValueError("Path must be an audio file or directory of audio files.")


def _words_from_result(result):
    """Flatten a Whisper result dict into a list of (WORD, START, END) tuples."""
    words = []
    for segment in result['segments']:
        if 'words' in segment:
            words.extend(
                (w['word'].strip().upper(), w['start'], w['end'])
                for w in segment['words']
            )
        else:
            # No word-level timestamps on this segment: split the segment text
            # and give every word the (approximate) segment-level start/end.
            words.extend(
                (token, segment['start'], segment['end'])
                for token in segment['text'].strip().upper().split()
            )
    return words


def decode_speech(path, model_size='base', save=False, return_raw=False, **kwargs):
    """
    Decode speech for a file or folder and return results using OpenAI Whisper.

    Parameters
    ----------
    path : str or os.PathLike
        Path to an audio file (.wav, .mp3, .m4a or .flac, matched
        case-insensitively), or a folder containing such files. Files in a
        folder are processed in sorted filename order.

    model_size : str
        Whisper model size: 'tiny', 'base', 'small', 'medium', 'large'.
        Default is 'base'.

    save : boolean
        False by default. If True, saves next to each audio file the raw
        Whisper result as a pickle (``<file>.p``) and a text file
        (``<file>.txt``): a CSV of the (WORD, START, END) rows, or the plain
        transcript (UTF-8) when `return_raw` is True.

    return_raw : boolean
        If True, returns the full Whisper result dictionary.
        If False (default), returns a list of (WORD, START, END) tuples.

    **kwargs : dict
        Additional arguments passed to the model's transcribe call
        (e.g. language). `word_timestamps` defaults to True and may be
        overridden here.

    Returns
    ----------
    results : list of tuple, dict, or list
        For a single decoded file: a list of (WORD, START, END) tuples, or the
        raw Whisper result dict if `return_raw` is True. For several files: a
        list with one such entry per file. A file whose decoding (or saving)
        raised an exception is represented by the string "Error". An empty
        list is returned when a folder contains no audio files.

    Raises
    ----------
    ImportError
        If openai-whisper is not installed.
    ValueError
        If `path` is neither an audio file nor a directory.
    """

    if not HAS_WHISPER:
        raise ImportError("openai-whisper not installed. pip install openai-whisper")

    # Validate the path *before* loading the model so a bad path fails fast
    # instead of after an expensive model load.
    path = os.fspath(path)
    files = _find_audio_files(path)
    if not files:
        warnings.warn(f"No audio files found in directory: {path}")
        return []

    # Load model
    print(f"Loading Whisper model: {model_size}...")
    model = whisper.load_model(model_size)
    print("Model loaded.")

    # Word-level timestamps are needed for the (WORD, START, END) output.
    # Merging (rather than passing the keyword explicitly) lets callers supply
    # their own `word_timestamps` without a duplicate-keyword TypeError.
    transcribe_kwargs = {'word_timestamps': True, **kwargs}

    results = []
    for i, f in enumerate(files):
        print(f"Decoding file {i + 1} of {len(files)}: {f}")
        start = time.time()

        try:
            result = model.transcribe(f, **transcribe_kwargs)
            parsed = result if return_raw else _words_from_result(result)

            if save:
                # The raw result is always pickled, regardless of return_raw.
                with open(f + ".p", "wb") as pfile:
                    pickle.dump(result, pfile)

                if return_raw:
                    # Explicit UTF-8: transcripts may contain non-ASCII text
                    # that the platform default encoding cannot represent.
                    with open(f + '.txt', 'w', encoding='utf-8') as tfile:
                        tfile.write(result['text'])
                else:
                    pd.DataFrame(parsed).to_csv(f + '.txt', header=False, index=False)
        except Exception as e:
            # Deliberate best-effort: one bad file must not abort the batch.
            # Keep a placeholder so results stay aligned with the file list.
            print(f"Error decoding {f}: {e}")
            results.append("Error")
        else:
            results.append(parsed)
            print(f"Finished in {round(time.time() - start, 2)} seconds.")

    # A single decoded file is returned unwrapped rather than as a 1-item list.
    return results[0] if len(results) == 1 else results