
Commit fbc02f6

v0.0.1
1 parent 85740da commit fbc02f6

30 files changed: +4129 −0 lines changed
Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
import subprocess
from pathlib import Path
import yaml
from PySide6.QtWidgets import QFileDialog, QDialog, QVBoxLayout, QTextEdit, QPushButton, QHBoxLayout, QMessageBox
import torch

def check_cuda_for_images(files):
    # Image processing requires a GPU; warn and abort if CUDA is unavailable.
    image_extensions = {'.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tif', '.tiff'}
    if any(Path(file).suffix.lower() in image_extensions for file in files):
        if not torch.cuda.is_available():
            QMessageBox.warning(None, "CUDA Support Required",
                                "Processing images is currently only available with GPU acceleration. Please remove any images and try again.")
            return False
    return True

def choose_documents_directory():
    allowed_extensions = {'.pdf', '.docx', '.epub', '.txt', '.enex', '.eml', '.msg', '.csv', '.xls', '.xlsx',
                          '.rtf', '.odt', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tif', '.tiff', '.html',
                          '.htm', '.md', '.doc'}
    current_dir = Path(__file__).parent.resolve()
    file_dialog = QFileDialog()
    file_dialog.setFileMode(QFileDialog.ExistingFiles)
    file_paths, _ = file_dialog.getOpenFileNames(None, "Choose Documents and Images for Database", str(current_dir))

    if file_paths:
        if not check_cuda_for_images(file_paths):
            return

        compatible_files = [file for file in file_paths if Path(file).suffix.lower() in allowed_extensions]
        incompatible_files = [Path(file).name for file in file_paths if Path(file).suffix.lower() not in allowed_extensions]

        if incompatible_files:
            dialog_text = "The following files cannot be added here due to their file extension:\n\n" + "\n".join(incompatible_files) + "\n\nHowever, if any of them are audio files you can still add them directly in the Tools Tab."
            dialog_text += "\n\nClick 'OK' to add the compatible documents only (remembering to add audio files separately) or 'Cancel' to back out completely."
            incompatible_dialog = QDialog()
            incompatible_dialog.resize(800, 600)
            incompatible_dialog.setWindowTitle("Incompatible Files Detected")
            layout = QVBoxLayout()

            text_edit = QTextEdit()
            text_edit.setReadOnly(True)
            text_edit.setText(dialog_text)
            layout.addWidget(text_edit)

            button_box = QHBoxLayout()
            ok_button = QPushButton("OK")
            cancel_button = QPushButton("Cancel")
            button_box.addWidget(ok_button)
            button_box.addWidget(cancel_button)
            layout.addLayout(button_box)

            incompatible_dialog.setLayout(layout)

            ok_button.clicked.connect(incompatible_dialog.accept)
            cancel_button.clicked.connect(incompatible_dialog.reject)

            user_choice = incompatible_dialog.exec()

            if user_choice == QDialog.Rejected:
                return

        # Symlink the selected files into Docs_for_DB rather than copying them.
        target_folder = current_dir / "Docs_for_DB"
        target_folder.mkdir(parents=True, exist_ok=True)
        for file_path in compatible_files:
            symlink_target = target_folder / Path(file_path).name
            symlink_target.unlink(missing_ok=True)
            symlink_target.symlink_to(file_path)

def load_config():
    with open("config.yaml", 'r', encoding='utf-8') as stream:
        return yaml.safe_load(stream)

def select_embedding_model_directory():
    initial_dir = Path('Models') if Path('Models').exists() else Path.home()
    chosen_directory = QFileDialog.getExistingDirectory(None, "Select Embedding Model Directory", str(initial_dir))

    if chosen_directory:
        # Persist the chosen directory back into config.yaml.
        config_file_path = Path("config.yaml")
        config_data = yaml.safe_load(config_file_path.read_text(encoding='utf-8')) if config_file_path.exists() else {}
        config_data["EMBEDDING_MODEL_NAME"] = chosen_directory
        config_file_path.write_text(yaml.dump(config_data), encoding='utf-8')
        print(f"Selected directory: {chosen_directory}")

src/config.yaml

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
Compute_Device:
  available:
  database_creation: cpu
  database_query: cpu
  gpu_brand: NVIDIA
EMBEDDING_MODEL_NAME: null
Platform_Info:
  os: windows
Supported_CTranslate2_Quantizations:
  CPU:
  GPU:
WhisperSpeech:
  model: null
bark:
  enable_cpu_offload: false
  model_precision: float16
  size: small
  speaker: v2/en_speaker_6
created_databases: {}
database:
  chunk_overlap: 250
  chunk_size: 700
  contexts: '5'
  database_to_search: ''
  document_types: ''
  search_term: ''
  similarity: 0.9
embedding-models:
  bge:
    query_instruction: 'Represent this sentence for searching relevant passages:'
  instructor:
    embed_instruction: 'Represent the document for retrieval:'
    query_instruction: 'Represent the question for retrieving supporting documents:'
  mxbai:
    query_instruction: 'Represent this sentence for searching relevant passages:'
server:
  api_key: ''
  connection_str: http://localhost:1234/v1
  model_max_tokens: -1
  model_temperature: 0.1
  prefix: '### User:'
  prompt_format_disabled: false
  suffix: '### Assistant:'
transcribe_file:
  device: cpu
  file: null
  model: whisper-small-en
  quant: float32
  timestamps: true
transcriber:
  device: cpu
  model: whisper-small.en
  quant: float32
tts:
  model: whisperspeech
vision:
  chosen_model: moondream2
  chosen_quant: float16
  chosen_size: 2b
  flash_attention2: null
  test_image: null
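
These settings are read back with plain yaml.safe_load, as the load_config() helper in the first file does. A minimal sketch of pulling a few of the values above out of the parsed config (the key names come from the file; everything else is illustrative):

import yaml
from pathlib import Path

# Minimal sketch: load src/config.yaml the same way load_config() does
# and read a few of the settings defined above.
config = yaml.safe_load(Path("config.yaml").read_text(encoding="utf-8"))

chunk_size = config["database"]["chunk_size"]        # 700 in this commit
chunk_overlap = config["database"]["chunk_overlap"]  # 250 in this commit
server_url = config["server"]["connection_str"]      # http://localhost:1234/v1

print(chunk_size, chunk_overlap, server_url)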

src/constants.py

Lines changed: 158 additions & 0 deletions
@@ -0,0 +1,158 @@
VECTOR_MODELS = {
    'BAAI': [
        {
            'name': 'bge-small-en-v1.5',
            'dimensions': 384,
            'max_sequence': 512,
            'size_mb': 134,
            'repo_id': 'BAAI/bge-small-en-v1.5',
            'cache_dir': 'BAAI--bge-small-en-v1.5',
            'type': 'vector'
        },
        {
            'name': 'bge-base-en-v1.5',
            'dimensions': 768,
            'max_sequence': 512,
            'size_mb': 438,
            'repo_id': 'BAAI/bge-base-en-v1.5',
            'cache_dir': 'BAAI--bge-base-en-v1.5',
            'type': 'vector'
        },
        {
            'name': 'bge-large-en-v1.5',
            'dimensions': 1024,
            'max_sequence': 512,
            'size_mb': 1340,
            'repo_id': 'BAAI/bge-large-en-v1.5',
            'cache_dir': 'BAAI--bge-large-en-v1.5',
            'type': 'vector'
        },
    ],
    'hkunlp': [
        {
            'name': 'instructor-base',
            'dimensions': 768,
            'max_sequence': 512,
            'size_mb': 439,
            'repo_id': 'hkunlp/instructor-base',
            'cache_dir': 'hkunlp--instructor-base',
            'type': 'vector'
        },
        {
            'name': 'instructor-large',
            'dimensions': 1024,
            'max_sequence': 512,
            'size_mb': 1340,
            'repo_id': 'hkunlp/instructor-large',
            'cache_dir': 'hkunlp--instructor-large',
            'type': 'vector'
        },
        {
            'name': 'instructor-xl',
            'dimensions': 1024,
            'max_sequence': 512,
            'size_mb': 4960,
            'repo_id': 'hkunlp/instructor-xl',
            'cache_dir': 'hkunlp--instructor-xl',
            'type': 'vector'
        },
    ],
    'sentence-transformers': [
        {
            'name': 'all-MiniLM-L12-v2',
            'dimensions': 384,
            'max_sequence': 256,
            'size_mb': 120,
            'repo_id': 'sentence-transformers/all-MiniLM-L12-v2',
            'cache_dir': 'sentence-transformers--all-MiniLM-L12-v2',
            'type': 'vector'
        },
        {
            'name': 'all-mpnet-base-v2',
            'dimensions': 768,
            'max_sequence': 384,
            'size_mb': 438,
            'repo_id': 'sentence-transformers/all-mpnet-base-v2',
            'cache_dir': 'sentence-transformers--all-mpnet-base-v2',
            'type': 'vector'
        },
    ],
    'thenlper': [
        {
            'name': 'gte-small',
            'dimensions': 384,
            'max_sequence': 512,
            'size_mb': 67,
            'repo_id': 'thenlper/gte-small',
            'cache_dir': 'thenlper--gte-small',
            'type': 'vector'
        },
        {
            'name': 'gte-base',
            'dimensions': 768,
            'max_sequence': 512,
            'size_mb': 219,
            'repo_id': 'thenlper/gte-base',
            'cache_dir': 'thenlper--gte-base',
            'type': 'vector'
        },
        {
            'name': 'gte-large',
            'dimensions': 1024,
            'max_sequence': 512,
            'size_mb': 670,
            'repo_id': 'thenlper/gte-large',
            'cache_dir': 'thenlper--gte-large',
            'type': 'vector'
        },
    ],
}


VISION_MODELS = {
    'Florence-2-base': {
        'precision': 'autoselect',
        'size': '232m',
        'repo_id': 'microsoft/Florence-2-base',
        'cache_dir': 'microsoft--Florence-2-base',
        'requires_cuda': False
    },
    'Florence-2-large': {
        'precision': 'autoselect',
        'size': '770m',
        'repo_id': 'microsoft/Florence-2-large',
        'cache_dir': 'microsoft--Florence-2-large',
        'requires_cuda': False
    },
    'Moondream2': {
        'precision': 'float16',
        'size': '2b',
        'repo_id': 'vikhyatk/moondream2',
        'cache_dir': 'vikhyatk--moondream2',
        'requires_cuda': True
    }
}


DOCUMENT_LOADERS = {
    ".pdf": "PyMuPDFLoader",
    ".docx": "Docx2txtLoader",
    ".txt": "TextLoader",
    ".enex": "EverNoteLoader",
    ".epub": "UnstructuredEPubLoader",
    ".eml": "UnstructuredEmailLoader",
    ".msg": "UnstructuredEmailLoader",
    ".csv": "CSVLoader",
    ".xls": "UnstructuredExcelLoader",
    ".xlsx": "UnstructuredExcelLoader",
    ".xlsm": "UnstructuredExcelLoader",
    ".rtf": "UnstructuredRTFLoader",
    ".odt": "UnstructuredODTLoader",
    ".md": "UnstructuredMarkdownLoader",
    ".html": "UnstructuredHTMLLoader",
}


CHUNKS_ONLY_TOOLTIP = "Only return relevant chunks without connecting to the LLM. Extremely useful to test the chunk size/overlap settings."

DOWNLOAD_EMBEDDING_MODEL_TOOLTIP = "Remember, wait until downloading is complete!"
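
DOCUMENT_LOADERS maps file extensions to LangChain loader class names. A minimal sketch of how such a mapping might be resolved at runtime; the resolution helper and the langchain_community import path are assumptions for illustration, not part of this commit:

from pathlib import Path
from langchain_community import document_loaders

from constants import DOCUMENT_LOADERS  # module name assumed from src/constants.py

# Hypothetical helper (not in this commit): look up the loader class named in
# DOCUMENT_LOADERS for a file's extension and instantiate it on that file.
def get_loader_for(file_path):
    ext = Path(file_path).suffix.lower()
    loader_name = DOCUMENT_LOADERS.get(ext)
    if loader_name is None:
        raise ValueError(f"No loader registered for extension: {ext}")
    loader_class = getattr(document_loaders, loader_name)
    return loader_class(str(file_path))

# Example usage:
# docs = get_loader_for("manual.pdf").load()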
