Source code for quail.decode_speech

from __future__ import print_function
from builtins import str
from builtins import range
import os
import json
import csv
import pickle
import time
import warnings
import pandas as pd

# optional imports for speech decoding
try:
    import whisper
    HAS_WHISPER = True
except ImportError:
    HAS_WHISPER = False


# Lower-case audio file extensions that decode_speech will pick up.
_AUDIO_EXTENSIONS = (".wav", ".mp3", ".m4a", ".flac")


def _find_audio_files(path):
    """Return the list of audio files designated by ``path`` (file or folder)."""
    # Compare case-insensitively so "clip.WAV" is treated the same whether it
    # is passed directly or found inside a directory.
    if path.lower().endswith(_AUDIO_EXTENSIONS):
        return [path]
    if os.path.isdir(path):
        # os.listdir returns entries in arbitrary order; sort so the order of
        # the returned results is reproducible.
        return [
            os.path.join(path, name)
            for name in sorted(os.listdir(path))
            if name.lower().endswith(_AUDIO_EXTENSIONS)
        ]
    raise ValueError("Path must be an audio file or directory of audio files.")


def _words_from_result(result):
    """Flatten a Whisper result dict into a list of (WORD, START, END) tuples."""
    words = []
    for segment in result['segments']:
        if 'words' in segment:
            words.extend(
                (w['word'].strip().upper(), w['start'], w['end'])
                for w in segment['words']
            )
        else:
            # No word-level timestamps for this segment: split its text and
            # give every word the segment's start/end, which is approximate.
            words.extend(
                (token, segment['start'], segment['end'])
                for token in segment['text'].strip().upper().split()
            )
    return words


def decode_speech(path, model_size='base', save=False, return_raw=False, **kwargs):
    """
    Decode speech for a file or folder and return results using OpenAI Whisper.

    Parameters
    ----------
    path : str or os.PathLike
        Path to an audio file (.wav, .mp3, .m4a or .flac, matched
        case-insensitively), or a folder containing such files. Files in a
        folder are processed in sorted filename order.

    model_size : str
        Whisper model size: 'tiny', 'base', 'small', 'medium', 'large'.
        Default is 'base'.

    save : boolean
        False by default. If true, saves next to each audio file the raw
        result object (pickle, ``<file>.p``) and a text transcript
        (``<file>.txt``): a CSV of the parsed words, or the plain transcript
        text when ``return_raw`` is True.

    return_raw : boolean
        If True, returns the full Whisper result dictionary.
        If False (default), returns a list of (WORD, START, END) tuples.

    **kwargs : dict
        Additional arguments passed to the model's ``transcribe`` call
        (e.g. language). ``word_timestamps`` defaults to True and may be
        overridden here.

    Returns
    ----------
    words : list of tuple, dict, or list of those
        For each file, a list of (WORD, START, END) tuples, or the raw result
        dictionary if ``return_raw`` is True. A file that fails to decode or
        save contributes the string "Error" instead. When exactly one file was
        processed its result is returned directly rather than wrapped in a
        list.

    Raises
    ------
    ImportError
        If openai-whisper is not installed.
    ValueError
        If ``path`` is neither a supported audio file nor a directory.
    """
    if not HAS_WHISPER:
        raise ImportError("openai-whisper not installed. pip install openai-whisper")

    # Resolve the inputs before loading the model so that a bad path fails
    # immediately instead of after an expensive model load.
    files = _find_audio_files(os.fspath(path))

    # Load model
    print(f"Loading Whisper model: {model_size}...")
    model = whisper.load_model(model_size)
    print("Model loaded.")

    # Word-level timestamps are needed for the (WORD, START, END) output.
    # Using a default (rather than a hard-coded keyword) avoids a duplicate
    # keyword TypeError when the caller supplies word_timestamps themselves.
    options = dict(kwargs)
    options.setdefault('word_timestamps', True)

    results = []
    for i, f in enumerate(files):
        print(f"Decoding file {i + 1} of {len(files)}: {f}")
        start = time.time()
        try:
            result = model.transcribe(f, **options)
            parsed = result if return_raw else _words_from_result(result)

            if save:
                # save raw pickle
                with open(f + ".p", "wb") as pfile:
                    pickle.dump(result, pfile)
                # save text
                if return_raw:
                    # Explicit UTF-8: transcripts may contain non-ASCII text
                    # that the locale default encoding cannot represent.
                    with open(f + '.txt', 'w', encoding='utf-8') as tfile:
                        tfile.write(result['text'])
                else:
                    pd.DataFrame(parsed).to_csv(f + '.txt', header=False, index=False)

            results.append(parsed)
            print(f"Finished in {round(time.time() - start, 2)} seconds.")
        except Exception as e:
            # Deliberate best-effort: one bad file must not abort the batch.
            # The "Error" placeholder keeps results aligned with the files.
            print(f"Error decoding {f}: {e}")
            results.append("Error")

    if len(results) == 1:
        return results[0]
    return results