
Commit fbc02f6

v0.0.1
1 parent 85740da commit fbc02f6

30 files changed: +4129 −0 lines changed
Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
import subprocess
from pathlib import Path
import yaml
from PySide6.QtWidgets import QFileDialog, QDialog, QVBoxLayout, QTextEdit, QPushButton, QHBoxLayout, QMessageBox
import torch

def check_cuda_for_images(files):
    # Image processing requires a GPU; warn and abort if CUDA is unavailable.
    image_extensions = {'.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tif', '.tiff'}
    if any(Path(file).suffix.lower() in image_extensions for file in files):
        if not torch.cuda.is_available():
            QMessageBox.warning(None, "CUDA Support Required",
                                "Processing images is currently only available with GPU acceleration. Please remove any images and try again.")
            return False
    return True

def choose_documents_directory():
    allowed_extensions = {'.pdf', '.docx', '.epub', '.txt', '.enex', '.eml', '.msg', '.csv', '.xls', '.xlsx',
                          '.rtf', '.odt', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tif', '.tiff', '.html',
                          '.htm', '.md', '.doc'}
    current_dir = Path(__file__).parent.resolve()
    file_dialog = QFileDialog()
    file_dialog.setFileMode(QFileDialog.ExistingFiles)
    file_paths, _ = file_dialog.getOpenFileNames(None, "Choose Documents and Images for Database", str(current_dir))

    if file_paths:
        if not check_cuda_for_images(file_paths):
            return

        compatible_files = [file for file in file_paths if Path(file).suffix.lower() in allowed_extensions]
        incompatible_files = [Path(file).name for file in file_paths if Path(file).suffix.lower() not in allowed_extensions]

        if incompatible_files:
            dialog_text = "The following files cannot be added here due to their file extension:\n\n" + "\n".join(incompatible_files) + "\n\nHowever, if any of them are audio files you can still add them directly in the Tools Tab."
            dialog_text += "\n\nClick 'OK' to add the compatible documents only (remembering to add audio files separately) or 'Cancel' to back out completely."
            incompatible_dialog = QDialog()
            incompatible_dialog.resize(800, 600)
            incompatible_dialog.setWindowTitle("Incompatible Files Detected")
            layout = QVBoxLayout()

            text_edit = QTextEdit()
            text_edit.setReadOnly(True)
            text_edit.setText(dialog_text)
            layout.addWidget(text_edit)

            button_box = QHBoxLayout()
            ok_button = QPushButton("OK")
            cancel_button = QPushButton("Cancel")
            button_box.addWidget(ok_button)
            button_box.addWidget(cancel_button)
            layout.addLayout(button_box)

            incompatible_dialog.setLayout(layout)

            ok_button.clicked.connect(incompatible_dialog.accept)
            cancel_button.clicked.connect(incompatible_dialog.reject)

            user_choice = incompatible_dialog.exec()

            if user_choice == QDialog.Rejected:
                return

        # Symlink the selected files into Docs_for_DB rather than copying them.
        target_folder = current_dir / "Docs_for_DB"
        target_folder.mkdir(parents=True, exist_ok=True)
        for file_path in compatible_files:
            symlink_target = target_folder / Path(file_path).name
            symlink_target.unlink(missing_ok=True)
            symlink_target.symlink_to(file_path)

def load_config():
    with open("config.yaml", 'r', encoding='utf-8') as stream:
        return yaml.safe_load(stream)

def select_embedding_model_directory():
    initial_dir = Path('Models') if Path('Models').exists() else Path.home()
    chosen_directory = QFileDialog.getExistingDirectory(None, "Select Embedding Model Directory", str(initial_dir))

    if chosen_directory:
        # Persist the chosen directory back into config.yaml.
        config_file_path = Path("config.yaml")
        config_data = yaml.safe_load(config_file_path.read_text(encoding='utf-8')) if config_file_path.exists() else {}
        config_data["EMBEDDING_MODEL_NAME"] = chosen_directory
        config_file_path.write_text(yaml.dump(config_data), encoding='utf-8')
        print(f"Selected directory: {chosen_directory}")

src/config.yaml

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
Compute_Device:
  available:
  database_creation: cpu
  database_query: cpu
  gpu_brand: NVIDIA
EMBEDDING_MODEL_NAME: null
Platform_Info:
  os: windows
Supported_CTranslate2_Quantizations:
  CPU:
  GPU:
WhisperSpeech:
  model: null
bark:
  enable_cpu_offload: false
  model_precision: float16
  size: small
  speaker: v2/en_speaker_6
created_databases: {}
database:
  chunk_overlap: 250
  chunk_size: 700
  contexts: '5'
  database_to_search: ''
  document_types: ''
  search_term: ''
  similarity: 0.9
embedding-models:
  bge:
    query_instruction: 'Represent this sentence for searching relevant passages:'
  instructor:
    embed_instruction: 'Represent the document for retrieval:'
    query_instruction: 'Represent the question for retrieving supporting documents:'
  mxbai:
    query_instruction: 'Represent this sentence for searching relevant passages:'
server:
  api_key: ''
  connection_str: http://localhost:1234/v1
  model_max_tokens: -1
  model_temperature: 0.1
  prefix: '### User:'
  prompt_format_disabled: false
  suffix: '### Assistant:'
transcribe_file:
  device: cpu
  file: null
  model: whisper-small-en
  quant: float32
  timestamps: true
transcriber:
  device: cpu
  model: whisper-small.en
  quant: float32
tts:
  model: whisperspeech
vision:
  chosen_model: moondream2
  chosen_quant: float16
  chosen_size: 2b
  flash_attention2: null
  test_image: null
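
These settings are read back with plain yaml.safe_load, as the load_config() helper in the first file does. A minimal sketch of pulling a few of the values above out of the parsed config (the key names come from the file; everything else is illustrative):

import yaml
from pathlib import Path

# Minimal sketch: load src/config.yaml the same way load_config() does
# and read a few of the settings defined above.
config = yaml.safe_load(Path("config.yaml").read_text(encoding="utf-8"))

chunk_size = config["database"]["chunk_size"]        # 700 in this commit
chunk_overlap = config["database"]["chunk_overlap"]  # 250 in this commit
server_url = config["server"]["connection_str"]      # http://localhost:1234/v1

print(chunk_size, chunk_overlap, server_url)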

src/constants.py

Lines changed: 158 additions & 0 deletions
@@ -0,0 +1,158 @@
VECTOR_MODELS = {
    'BAAI': [
        {
            'name': 'bge-small-en-v1.5',
            'dimensions': 384,
            'max_sequence': 512,
            'size_mb': 134,
            'repo_id': 'BAAI/bge-small-en-v1.5',
            'cache_dir': 'BAAI--bge-small-en-v1.5',
            'type': 'vector'
        },
        {
            'name': 'bge-base-en-v1.5',
            'dimensions': 768,
            'max_sequence': 512,
            'size_mb': 438,
            'repo_id': 'BAAI/bge-base-en-v1.5',
            'cache_dir': 'BAAI--bge-base-en-v1.5',
            'type': 'vector'
        },
        {
            'name': 'bge-large-en-v1.5',
            'dimensions': 1024,
            'max_sequence': 512,
            'size_mb': 1340,
            'repo_id': 'BAAI/bge-large-en-v1.5',
            'cache_dir': 'BAAI--bge-large-en-v1.5',
            'type': 'vector'
        },
    ],
    'hkunlp': [
        {
            'name': 'instructor-base',
            'dimensions': 768,
            'max_sequence': 512,
            'size_mb': 439,
            'repo_id': 'hkunlp/instructor-base',
            'cache_dir': 'hkunlp--instructor-base',
            'type': 'vector'
        },
        {
            'name': 'instructor-large',
            'dimensions': 1024,
            'max_sequence': 512,
            'size_mb': 1340,
            'repo_id': 'hkunlp/instructor-large',
            'cache_dir': 'hkunlp--instructor-large',
            'type': 'vector'
        },
        {
            'name': 'instructor-xl',
            'dimensions': 1024,
            'max_sequence': 512,
            'size_mb': 4960,
            'repo_id': 'hkunlp/instructor-xl',
            'cache_dir': 'hkunlp--instructor-xl',
            'type': 'vector'
        },
    ],
    'sentence-transformers': [
        {
            'name': 'all-MiniLM-L12-v2',
            'dimensions': 384,
            'max_sequence': 256,
            'size_mb': 120,
            'repo_id': 'sentence-transformers/all-MiniLM-L12-v2',
            'cache_dir': 'sentence-transformers--all-MiniLM-L12-v2',
            'type': 'vector'
        },
        {
            'name': 'all-mpnet-base-v2',
            'dimensions': 768,
            'max_sequence': 384,
            'size_mb': 438,
            'repo_id': 'sentence-transformers/all-mpnet-base-v2',
            'cache_dir': 'sentence-transformers--all-mpnet-base-v2',
            'type': 'vector'
        },
    ],
    'thenlper': [
        {
            'name': 'gte-small',
            'dimensions': 384,
            'max_sequence': 512,
            'size_mb': 67,
            'repo_id': 'thenlper/gte-small',
            'cache_dir': 'thenlper--gte-small',
            'type': 'vector'
        },
        {
            'name': 'gte-base',
            'dimensions': 768,
            'max_sequence': 512,
            'size_mb': 219,
            'repo_id': 'thenlper/gte-base',
            'cache_dir': 'thenlper--gte-base',
            'type': 'vector'
        },
        {
            'name': 'gte-large',
            'dimensions': 1024,
            'max_sequence': 512,
            'size_mb': 670,
            'repo_id': 'thenlper/gte-large',
            'cache_dir': 'thenlper--gte-large',
            'type': 'vector'
        },
    ],
}


VISION_MODELS = {
    'Florence-2-base': {
        'precision': 'autoselect',
        'size': '232m',
        'repo_id': 'microsoft/Florence-2-base',
        'cache_dir': 'microsoft--Florence-2-base',
        'requires_cuda': False
    },
    'Florence-2-large': {
        'precision': 'autoselect',
        'size': '770m',
        'repo_id': 'microsoft/Florence-2-large',
        'cache_dir': 'microsoft--Florence-2-large',
        'requires_cuda': False
    },
    'Moondream2': {
        'precision': 'float16',
        'size': '2b',
        'repo_id': 'vikhyatk/moondream2',
        'cache_dir': 'vikhyatk--moondream2',
        'requires_cuda': True
    }
}


DOCUMENT_LOADERS = {
    ".pdf": "PyMuPDFLoader",
    ".docx": "Docx2txtLoader",
    ".txt": "TextLoader",
    ".enex": "EverNoteLoader",
    ".epub": "UnstructuredEPubLoader",
    ".eml": "UnstructuredEmailLoader",
    ".msg": "UnstructuredEmailLoader",
    ".csv": "CSVLoader",
    ".xls": "UnstructuredExcelLoader",
    ".xlsx": "UnstructuredExcelLoader",
    ".xlsm": "UnstructuredExcelLoader",
    ".rtf": "UnstructuredRTFLoader",
    ".odt": "UnstructuredODTLoader",
    ".md": "UnstructuredMarkdownLoader",
    ".html": "UnstructuredHTMLLoader",
}


CHUNKS_ONLY_TOOLTIP = "Only return relevant chunks without connecting to the LLM. Extremely useful to test the chunk size/overlap settings."

DOWNLOAD_EMBEDDING_MODEL_TOOLTIP = "Remember, wait until downloading is complete!"
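
DOCUMENT_LOADERS maps file extensions to LangChain loader class names. A minimal sketch of how such a mapping might be resolved at runtime; the resolution helper and the langchain_community import path are assumptions for illustration, not part of this commit:

from pathlib import Path
from langchain_community import document_loaders

from constants import DOCUMENT_LOADERS  # module name assumed from src/constants.py

# Hypothetical helper (not in this commit): look up the loader class named in
# DOCUMENT_LOADERS for a file's extension and instantiate it on that file.
def get_loader_for(file_path):
    ext = Path(file_path).suffix.lower()
    loader_name = DOCUMENT_LOADERS.get(ext)
    if loader_name is None:
        raise ValueError(f"No loader registered for extension: {ext}")
    loader_class = getattr(document_loaders, loader_name)
    return loader_class(str(file_path))

# Example usage:
# docs = get_loader_for("manual.pdf").load()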
