add whisper models and fix bug

BBC-Esq · web-flow · commit 383cd04e9ad3 · 2024-08-03T18:28:56.000-04:00
diff --git a/src/gui.py b/src/gui.py
@@ -1,3 +1,6 @@
+import multiprocessing
+multiprocessing.set_start_method('spawn')
+
 import sys
 import os
 from pathlib import Path
@@ -7,11 +10,21 @@
     QApplication, QWidget, QVBoxLayout, QTabWidget,
     QStyleFactory, QMenuBar
 )
-import multiprocessing
 from initialize import main as initialize_system
 from gui_tabs import create_tabs
 from utilities import list_theme_files, make_theme_changer, load_stylesheet
 
+# Print the current working directory
+print(f"Current working directory: {os.getcwd()}")
+
+# Check if we can write to the current directory
+try:
+    with open('test_write.txt', 'w') as f:
+        f.write("Testing write permissions")
+    os.remove('test_write.txt')
+except Exception as e:
+    print(f"Cannot write to the current directory: {e}")
+
 logging.basicConfig(filename='gui_log.txt', level=logging.DEBUG, 
                     format='%(asctime)s - %(levelname)s - %(message)s')
 
@@ -83,7 +96,7 @@ def closeEvent(self, event):
 def main():
     try:
         logging.info("Starting application")
-        multiprocessing.set_start_method('spawn')
+        # The 'spawn' start method is now set once at import time, at the top of this module.
         app = QApplication(sys.argv)
         app.setStyleSheet(load_stylesheet('custom_stylesheet_steel_ocean.css'))
         ex = DocQA_GUI()
diff --git a/src/gui_tabs_tools_transcribe.py b/src/gui_tabs_tools_transcribe.py
@@ -1,23 +1,20 @@
 import threading
-from functools import partial
 from pathlib import Path
-
 import yaml
 from PySide6.QtCore import Qt
 from PySide6.QtWidgets import (
     QWidget, QHBoxLayout, QVBoxLayout, QPushButton, QFileDialog, QLabel, QComboBox, QSlider
 )
-
 from module_transcribe import WhisperTranscriber
 from utilities import my_cprint
+from constants import WHISPER_MODELS
 
 class TranscriberToolSettingsTab(QWidget):
     CONFIG_FILE = 'config.yaml'
 
     def __init__(self):
         super().__init__()
         self.selected_audio_file = None
-
         self.create_layout()
 
     def read_config(self):
@@ -26,40 +23,36 @@ def read_config(self):
 
     def create_layout(self):
         main_layout = QVBoxLayout()
-
         model_selection_hbox = QHBoxLayout()
-        model_selection_hbox.addWidget(QLabel("Whisper Model"))
+        model_selection_hbox.addWidget(QLabel("Model"))
         self.model_combo = QComboBox()
-
-        self.model_name_mapping = {f"{size} - {precision}": f"ctranslate2-4you/whisper-{size}-ct2-{precision}"
-                                   for size in ["large-v2", "medium.en", "small.en"]
-                                   for precision in ["float32", "float16"]}
-
-        self.model_combo.addItems(list(self.model_name_mapping.keys()))
-
+        
+        # Use the WHISPER_MODELS dictionary to populate the combo box
+        self.model_combo.addItems(WHISPER_MODELS.keys())
+        
         model_selection_hbox.addWidget(self.model_combo)
-
-        model_selection_hbox.addWidget(QLabel("Speed:"))
-
+        model_selection_hbox.addWidget(QLabel("Batch:"))
         self.slider_label = QLabel("8")
         self.number_slider = QSlider(Qt.Horizontal)
         self.number_slider.setMinimum(1)
         self.number_slider.setMaximum(150)
         self.number_slider.setValue(8)
         self.number_slider.valueChanged.connect(self.update_slider_label)
-
         model_selection_hbox.addWidget(self.number_slider)
         model_selection_hbox.addWidget(self.slider_label)
-
+        
+        model_selection_hbox.setStretchFactor(self.model_combo, 2)
+        model_selection_hbox.setStretchFactor(self.number_slider, 2)
+        
         main_layout.addLayout(model_selection_hbox)
 
         hbox = QHBoxLayout()
         self.select_file_button = QPushButton("Select Audio File")
-        self.select_file_button.clicked.connect(lambda: self.select_audio_file())
+        self.select_file_button.clicked.connect(self.select_audio_file)
         hbox.addWidget(self.select_file_button)
 
         self.transcribe_button = QPushButton("Transcribe")
-        self.transcribe_button.clicked.connect(lambda: self.start_transcription())
+        self.transcribe_button.clicked.connect(self.start_transcription)
         hbox.addWidget(self.transcribe_button)
 
         main_layout.addLayout(hbox)
@@ -89,17 +82,16 @@ def start_transcription(self):
         if not self.selected_audio_file:
             print("Please select an audio file.")
             return
-
-        selected_model = self.model_combo.currentText()
-        selected_model_identifier = self.model_name_mapping.get(selected_model, selected_model)
-        
-        selected_compute_type = selected_model.rsplit(' - ', 1)[-1]
         
+        selected_model_key = self.model_combo.currentText()
         selected_batch_size = int(self.slider_label.text())
-
+        
         def transcription_thread():
-            transcriber = WhisperTranscriber(model_identifier=selected_model_identifier, batch_size=selected_batch_size, compute_type=selected_compute_type)
+            transcriber = WhisperTranscriber(
+                model_key=selected_model_key, 
+                batch_size=selected_batch_size
+            )
             transcriber.start_transcription_process(self.selected_audio_file)
             my_cprint("Transcription created and ready to be input into vector database.", 'green')
-
+        
         threading.Thread(target=transcription_thread, daemon=True).start()
diff --git a/src/module_transcribe.py b/src/module_transcribe.py
@@ -10,15 +10,28 @@
 from langchain_community.docstore.document import Document
 
 import whisper_s2t
+from whisper_s2t.backends.ctranslate2.hf_utils import download_model
 from extract_metadata import extract_audio_metadata
+from constants import WHISPER_MODELS
 
 warnings.filterwarnings("ignore")
 
+current_directory = Path(__file__).parent
+CACHE_DIR = current_directory / "Models" / "whisper"
+CACHE_DIR.mkdir(parents=True, exist_ok=True)
+
 class WhisperTranscriber:
-    def __init__(self, model_identifier="ctranslate2-4you/whisper-mediuim.en-ct2-int8", batch_size=16, compute_type='int8'):
-        self.model_identifier = model_identifier
+    def __init__(self, model_key, batch_size):
+        model_info = WHISPER_MODELS[model_key]
+        self.model_identifier = model_info['repo_id']
+        self.compute_type = model_info['precision']
         self.batch_size = batch_size
-        self.compute_type = compute_type
+        self.cache_dir = str(CACHE_DIR)
+
+        script_dir = Path(__file__).parent
+        self.model_dir = script_dir / "Models" / "whisper"
+        self.model_dir.mkdir(parents=True, exist_ok=True)
+        
         self.model_kwargs = {
             'compute_type': self.compute_type,
             'asr_options': {
@@ -42,25 +55,59 @@ def __init__(self, model_identifier="ctranslate2-4you/whisper-mediuim.en-ct2-int
                 "return_no_speech_prob": True,
                 "word_aligner_model": 'tiny',
             },
-            'model_identifier': model_identifier,
+            'model_identifier': self.model_identifier,
             'backend': 'CTranslate2',
         }
 
+        if 'large-v3' in self.model_identifier:
+            self.model_kwargs['n_mels'] = 128
+
     def start_transcription_process(self, audio_file):
         self.audio_file = audio_file
         process = Process(target=self.transcribe_and_create_document)
         process.start()
         process.join()
 
+
     @torch.inference_mode()
     def transcribe_and_create_document(self):
         audio_file_str = str(self.audio_file)
         converted_audio_file = self.convert_to_wav(audio_file_str)
-        self.model_kwargs['model_identifier'] = self.model_identifier
-        model = whisper_s2t.load_model(**self.model_kwargs)
+        
+        try:
+            downloaded_path = download_model(
+                size_or_id=self.model_identifier,
+                cache_dir=str(CACHE_DIR)
+            )
+            
+            model_kwargs = self.model_kwargs.copy()
+            model_kwargs.pop('model_identifier', None)
+            model_kwargs.pop('cache_dir', None)
+            
+            model = whisper_s2t.load_model(
+                model_identifier=downloaded_path,
+                **model_kwargs
+            )
+            
+        except Exception as e:
+            print(f"Error loading model {self.model_identifier}: {e}")
+            raise
+
         transcription = self.transcribe(model, [str(converted_audio_file)])
         self.create_document_object(transcription, audio_file_str)
 
+        script_dir = Path(__file__).parent
+        converted_audio_file_name = f"{Path(audio_file_str).stem}_converted.wav"
+        converted_audio_file_full_path = script_dir / converted_audio_file_name
+
+        if converted_audio_file_full_path.exists():
+            try:
+                converted_audio_file_full_path.unlink()
+            except Exception as e:
+                print(f"Error deleting file {converted_audio_file_full_path}: {e}")
+        else:
+            print(f"File does not exist: {converted_audio_file_full_path}")
+
     def convert_to_wav(self, audio_file):
         output_file = f"{Path(audio_file).stem}_converted.wav"
         output_path = Path(__file__).parent / output_file