diff --git a/app/backend/package-lock.json b/app/backend/package-lock.json
new file mode 100644
index 0000000000..dfb18f1156
--- /dev/null
+++ b/app/backend/package-lock.json
@@ -0,0 +1,6 @@
+{
+  "name": "backend",
+  "lockfileVersion": 3,
+  "requires": true,
+  "packages": {}
+}
diff --git a/app/backend/prepdocslib/blobmanager.py b/app/backend/prepdocslib/blobmanager.py
index e02695b8a8..6548f2c71f 100644
--- a/app/backend/prepdocslib/blobmanager.py
+++ b/app/backend/prepdocslib/blobmanager.py
@@ -40,7 +40,8 @@ def sourcepage_from_file_page(cls, filename, page=0) -> str:
 
     @classmethod
     def blob_name_from_file_name(cls, filename) -> str:
-        return os.path.basename(filename)
+        blob_name = os.path.basename(filename)
+        return blob_name.replace("#", "_")
 
     @classmethod
     def add_image_citation(
@@ -184,8 +185,11 @@ async def upload_blob(self, file: File | IO, filename: str, user_oid: str) -> st
         # Ensure user directory exists but don't create a subdirectory
         user_directory_client = await self._ensure_directory(directory_path=user_oid, user_oid=user_oid)
 
+        # Sanitize filename by replacing # with _ before uploading
+        sanitized_filename = self.blob_name_from_file_name(filename)
+
         # Create file directly in user directory
-        file_client = user_directory_client.get_file_client(filename)
+        file_client = user_directory_client.get_file_client(sanitized_filename)
 
         # Handle both File and IO objects
         if isinstance(file, File):
@@ -216,9 +220,11 @@ def _get_image_directory_path(self, document_filename: str, user_oid: str, page_
         Returns:
             str: Full path to the image directory
         """
+        # Sanitize document filename by replacing # with _
+        sanitized_doc_filename = self.blob_name_from_file_name(document_filename)
         if page_num is not None:
-            return f"{user_oid}/images/{document_filename}/page_{page_num}"
-        return f"{user_oid}/images/{document_filename}"
+            return f"{user_oid}/images/{sanitized_doc_filename}/page_{page_num}"
+        return f"{user_oid}/images/{sanitized_doc_filename}"
 
     async def upload_document_image(
         self,
@@ -248,7 +254,8 @@
         await self._ensure_directory(directory_path=user_oid, user_oid=user_oid)
         image_directory_path = self._get_image_directory_path(document_filename, user_oid, image_page_num)
         image_directory_client = await self._ensure_directory(directory_path=image_directory_path, user_oid=user_oid)
-        file_client = image_directory_client.get_file_client(image_filename)
+        sanitized_image_filename = self.blob_name_from_file_name(image_filename)
+        file_client = image_directory_client.get_file_client(sanitized_image_filename)
         image_bytes = BaseBlobManager.add_image_citation(image_bytes, document_filename, image_filename, image_page_num)
         logger.info("Uploading document image '%s' to '%s'", image_filename, image_directory_path)
         await file_client.upload_data(image_bytes, overwrite=True, metadata={"UploadedBy": user_oid})
@@ -292,7 +299,9 @@
 
         try:
             user_directory_client = await self._ensure_directory(directory_path=directory_path, user_oid=user_oid)
-            file_client = user_directory_client.get_file_client(filename)
+            # Sanitize filename by replacing # with _ to match uploaded filename
+            sanitized_filename = self.blob_name_from_file_name(filename)
+            file_client = user_directory_client.get_file_client(sanitized_filename)
 
             download_response = await file_client.download_file()
             content = await download_response.readall()
@@ -456,7 +465,10 @@
         if not await container_client.exists():
             await container_client.create_container()
         image_bytes = self.add_image_citation(image_bytes, document_filename, image_filename, image_page_num)
-        blob_name = f"{self.blob_name_from_file_name(document_filename)}/page{image_page_num}/{image_filename}"
+        # Sanitize both document and image filenames by replacing # with _
+        sanitized_doc_filename = self.blob_name_from_file_name(document_filename)
+        sanitized_image_filename = self.blob_name_from_file_name(image_filename)
+        blob_name = f"{sanitized_doc_filename}/page{image_page_num}/{sanitized_image_filename}"
         logger.info("Uploading blob for document image '%s'", blob_name)
         blob_client = await container_client.upload_blob(blob_name, image_bytes, overwrite=True)
         return blob_client.url
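The `#` sanitization above matters because `#` starts the fragment component of a URL, so a blob name containing it can never round-trip through a citation link. A minimal standalone sketch (not part of the patch; the local `blob_name_from_file_name` copy just mirrors the classmethod above):

```python
import os
from urllib.parse import urlsplit

# Everything after "#" parses as a fragment, so the path that reaches the
# blob service is silently truncated:
url = "https://account.blob.core.windows.net/content/report#3.pdf"
print(urlsplit(url).path)      # /content/report   ("#3.pdf" was dropped)
print(urlsplit(url).fragment)  # 3.pdf

# The sanitizer maps such names onto fragment-free blob names:
def blob_name_from_file_name(filename: str) -> str:
    blob_name = os.path.basename(filename)
    return blob_name.replace("#", "_")

assert blob_name_from_file_name("/tmp/report#3.pdf") == "report_3.pdf"
```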
diff --git a/app/backend/prepdocslib/cloudingestionstrategy.py b/app/backend/prepdocslib/cloudingestionstrategy.py
index 324ab2910b..24ad813e97 100644
--- a/app/backend/prepdocslib/cloudingestionstrategy.py
+++ b/app/backend/prepdocslib/cloudingestionstrategy.py
@@ -1,6 +1,7 @@
 """Cloud ingestion strategy using Azure AI Search custom skills."""
 
 import logging
+import os
 from dataclasses import dataclass
 from datetime import timedelta
 
@@ -319,6 +320,28 @@ async def run(self) -> None:
         files = self.list_file_strategy.list()
         async for file in files:
             try:
+                # Check if filename contains # and rename on disk if it does
+                if hasattr(file.content, "name") and "#" in file.filename():
+                    original_path = file.content.name
+                    if os.path.exists(original_path) and os.path.isfile(original_path):
+                        # Get directory and filename
+                        directory = os.path.dirname(original_path)
+                        original_filename = os.path.basename(original_path)
+                        new_filename = original_filename.replace("#", "_")
+                        new_path = os.path.join(directory, new_filename)
+
+                        # Only rename if the new filename is different
+                        if new_path != original_path:
+                            # Close the current file handle
+                            file.content.close()
+
+                            # Rename the file on disk
+                            os.rename(original_path, new_path)
+                            logger.info("Renamed file from '%s' to '%s' (replaced # with _)", original_filename, new_filename)
+
+                            # Reopen the file with the new name
+                            file.content = open(new_path, mode="rb")
+
                 await self.blob_manager.upload_blob(file)
             finally:
                 if file:
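The rename-on-disk step above can be exercised in isolation. A small self-contained sketch (hypothetical helper name; it assumes a plain local file, unlike the strategy code, which must also close and reopen the `file.content` handle):

```python
import os
import tempfile

def rename_hash_file(path: str) -> str:
    """Replace '#' with '_' in the basename, renaming the file if needed."""
    directory, original_filename = os.path.split(path)
    new_path = os.path.join(directory, original_filename.replace("#", "_"))
    if new_path != path:
        os.rename(path, new_path)
    return new_path

with tempfile.TemporaryDirectory() as tmp:
    path = os.path.join(tmp, "invoice#42.pdf")
    open(path, "wb").close()        # create an empty sample file
    print(rename_hash_file(path))   # ends with .../invoice_42.pdf
```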
+""" +import logging +import email +from email import policy +from typing import AsyncGenerator, IO, Union, Optional +import extract_msg +from io import BytesIO +import re + +from .page import Page + +logger = logging.getLogger(__name__) + + +class EmailParser: + """Parser for EML email files.""" + + async def parse(self, content: IO) -> AsyncGenerator[Page, None]: + """Parse EML file content.""" + try: + # Read bytes from IO object + content_bytes = content.read() + # Parse email from bytes + msg = email.message_from_bytes(content_bytes, policy=policy.default) + + # Extract email metadata and body + text_parts = [] + + # Add headers + text_parts.append(f"From: {msg.get('From', 'Unknown')}") + text_parts.append(f"To: {msg.get('To', 'Unknown')}") + text_parts.append(f"Subject: {msg.get('Subject', 'No Subject')}") + text_parts.append(f"Date: {msg.get('Date', 'Unknown')}") + text_parts.append("\n" + "="*80 + "\n") + + # Extract body + if msg.is_multipart(): + for part in msg.walk(): + content_type = part.get_content_type() + content_disposition = str(part.get("Content-Disposition", "")) + + # Get text content + if content_type == "text/plain" and "attachment" not in content_disposition: + try: + body = part.get_content() + text_parts.append(body) + except Exception as e: + logger.warning(f"Could not extract text part: {e}") + + elif content_type == "text/html" and "attachment" not in content_disposition: + # Optionally extract HTML (you may want to strip HTML tags) + try: + html_body = part.get_content() + # Simple HTML tag removal (consider using BeautifulSoup for better results) + import re + text_body = re.sub('<[^<]+?>', '', html_body) + text_parts.append(text_body) + except Exception as e: + logger.warning(f"Could not extract HTML part: {e}") + + # Note attachments + elif "attachment" in content_disposition: + filename = part.get_filename() + if filename: + text_parts.append(f"\n[Attachment: {filename}]") + else: + # Single part message + try: + body = msg.get_content() + text_parts.append(body) + except Exception as e: + logger.warning(f"Could not extract message body: {e}") + + # Combine all parts + full_text = "\n".join(text_parts) + + # Return as single page + yield Page(page_num=0, offset=0, text=full_text) + + except Exception as e: + logger.error(f"Error parsing EML file: {e}") + raise ValueError(f"Failed to parse EML file: {e}") + + +class MsgParser: + """Parser for MSG (Outlook) email files.""" + + async def parse(self, content: IO) -> AsyncGenerator[Page, None]: + """Parse MSG file content.""" + try: + # Read bytes from IO object and create BytesIO + content_bytes = content.read() + msg_file = BytesIO(content_bytes) + + # Parse MSG file + msg = extract_msg.Message(msg_file) + + # Extract email metadata and body + text_parts = [] + + # Add headers + text_parts.append(f"From: {msg.sender or 'Unknown'}") + text_parts.append(f"To: {msg.to or 'Unknown'}") + text_parts.append(f"Subject: {msg.subject or 'No Subject'}") + text_parts.append(f"Date: {msg.date or 'Unknown'}") + text_parts.append("\n" + "="*80 + "\n") + + # Add body (prefer plain text over HTML) + if msg.body: + text_parts.append(msg.body) + elif msg.htmlBody: + # Simple HTML tag removal + import re + text_body = re.sub('<[^<]+?>', '', msg.htmlBody) + text_parts.append(text_body) + + # Note attachments + if msg.attachments: + text_parts.append("\n\nAttachments:") + for attachment in msg.attachments: + text_parts.append(f" - {attachment.longFilename or attachment.shortFilename}") + + # Combine all parts + full_text = 
"\n".join(text_parts) + + # Clean up + msg.close() + + # Return as single page + yield Page(page_num=0, offset=0, text=full_text) + + except Exception as e: + logger.error(f"Error parsing MSG file: {e}") + raise ValueError(f"Failed to parse MSG file: {e}") \ No newline at end of file diff --git a/app/backend/prepdocslib/integratedvectorizerstrategy.py b/app/backend/prepdocslib/integratedvectorizerstrategy.py index 42f06d4792..ac8567dedd 100644 --- a/app/backend/prepdocslib/integratedvectorizerstrategy.py +++ b/app/backend/prepdocslib/integratedvectorizerstrategy.py @@ -170,6 +170,28 @@ async def run(self): files = self.list_file_strategy.list() async for file in files: try: + # Check if filename contains # and rename on disk if it does + if hasattr(file.content, "name") and "#" in file.filename(): + original_path = file.content.name + if os.path.exists(original_path) and os.path.isfile(original_path): + # Get directory and filename + directory = os.path.dirname(original_path) + original_filename = os.path.basename(original_path) + new_filename = original_filename.replace("#", "_") + new_path = os.path.join(directory, new_filename) + + # Only rename if the new filename is different + if new_path != original_path: + # Close the current file handle + file.content.close() + + # Rename the file on disk + os.rename(original_path, new_path) + logger.info("Renamed file from '%s' to '%s' (replaced # with _)", original_filename, new_filename) + + # Reopen the file with the new name + file.content = open(new_path, mode="rb") + await self.blob_manager.upload_blob(file) finally: if file: diff --git a/app/backend/prepdocslib/servicesetup.py b/app/backend/prepdocslib/servicesetup.py index d00b023d80..cc01022565 100644 --- a/app/backend/prepdocslib/servicesetup.py +++ b/app/backend/prepdocslib/servicesetup.py @@ -23,6 +23,8 @@ from .strategy import SearchInfo from .textparser import TextParser from .textsplitter import SentenceTextSplitter, SimpleTextSplitter +from .emailparser import EmailParser +from .emailparser import MsgParser logger = logging.getLogger("scripts") @@ -290,6 +292,8 @@ def build_file_processors( ".md": FileProcessor(TextParser(), sentence_text_splitter), ".txt": FileProcessor(TextParser(), sentence_text_splitter), ".csv": FileProcessor(CsvParser(), sentence_text_splitter), + ".eml": FileProcessor(EmailParser(), sentence_text_splitter), + ".msg": FileProcessor(MsgParser(), sentence_text_splitter), } # These require either a Python package or Document Intelligence if pdf_parser is not None: diff --git a/app/backend/requirements.in b/app/backend/requirements.in index ba28ab5e36..4f95f6cfa5 100644 --- a/app/backend/requirements.in +++ b/app/backend/requirements.in @@ -31,3 +31,4 @@ python-dotenv prompty rich typing-extensions +extract-msg \ No newline at end of file diff --git a/app/backend/requirements.txt b/app/backend/requirements.txt index 560acb1d69..095cad0947 100644 --- a/app/backend/requirements.txt +++ b/app/backend/requirements.txt @@ -453,4 +453,6 @@ wsproto==1.2.0 yarl==1.17.2 # via aiohttp zipp==3.21.0 - # via importlib-metadata \ No newline at end of file + # via importlib-metadata +extract-msg==0.42.1 + # via -r requirements.in diff --git a/app/frontend/package-lock.json b/app/frontend/package-lock.json index aacaf655cf..320d3c5318 100644 --- a/app/frontend/package-lock.json +++ b/app/frontend/package-lock.json @@ -13,13 +13,16 @@ "@fluentui/react": "^8.112.5", "@fluentui/react-components": "^9.56.2", "@fluentui/react-icons": "^2.0.265", + "@kenjiuno/msgreader": 
"^1.27.0-alpha.3", "@react-spring/web": "^9.7.5", + "buffer": "^6.0.3", "dompurify": "^3.2.4", "i18next": "^24.2.0", "i18next-browser-languagedetector": "^8.0.2", "i18next-http-backend": "^3.0.1", "idb": "^8.0.0", "ndjson-readablestream": "^1.2.0", + "postal-mime": "^2.6.0", "react": "^18.3.1", "react-dom": "^18.3.1", "react-helmet-async": "^2.0.5", @@ -2509,6 +2512,25 @@ "@jridgewell/sourcemap-codec": "^1.4.14" } }, + "node_modules/@kenjiuno/decompressrtf": { + "version": "0.1.4", + "resolved": "https://registry.npmjs.org/@kenjiuno/decompressrtf/-/decompressrtf-0.1.4.tgz", + "integrity": "sha512-v9c/iFz17jRWyd2cRnrvJg4VOg/4I/VCk+bG8JnoX2gJ9sAesPzo3uTqcmlVXdpasTI8hChpBVw00pghKe3qTQ==", + "license": "BSD-2-Clause" + }, + "node_modules/@kenjiuno/msgreader": { + "version": "1.27.0", + "resolved": "https://registry.npmjs.org/@kenjiuno/msgreader/-/msgreader-1.27.0.tgz", + "integrity": "sha512-gElRDjxQGZQBVAqQYfZ4g82DqBQQubg9pg+nz6DsWG7tTx0TozNqaglGovT6tz3vqNE07u/wu88w8ymU1BIxYw==", + "license": "Apache-2.0", + "dependencies": { + "@kenjiuno/decompressrtf": "^0.1.3", + "iconv-lite": "^0.6.3" + }, + "engines": { + "node": ">= 10" + } + }, "node_modules/@microsoft/load-themed-styles": { "version": "1.10.295", "license": "MIT" @@ -3080,6 +3102,26 @@ "url": "https://github.com/sponsors/wooorm" } }, + "node_modules/base64-js": { + "version": "1.5.1", + "resolved": "https://registry.npmjs.org/base64-js/-/base64-js-1.5.1.tgz", + "integrity": "sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "license": "MIT" + }, "node_modules/browserslist": { "version": "4.25.1", "resolved": "https://registry.npmjs.org/browserslist/-/browserslist-4.25.1.tgz", @@ -3113,6 +3155,30 @@ "node": "^6 || ^7 || ^8 || ^9 || ^10 || ^11 || ^12 || >=13.7" } }, + "node_modules/buffer": { + "version": "6.0.3", + "resolved": "https://registry.npmjs.org/buffer/-/buffer-6.0.3.tgz", + "integrity": "sha512-FTiCpNxtwiZZHEZbcbTIcZjERVICn9yq/pDFkTl95/AxzD1naBctN7YO68riM/gLSDY7sdrMby8hofADYuuqOA==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "license": "MIT", + "dependencies": { + "base64-js": "^1.3.1", + "ieee754": "^1.2.1" + } + }, "node_modules/caniuse-lite": { "version": "1.0.30001727", "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001727.tgz", @@ -3761,11 +3827,43 @@ "cross-fetch": "4.0.0" } }, + "node_modules/iconv-lite": { + "version": "0.6.3", + "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz", + "integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==", + "license": "MIT", + "dependencies": { + "safer-buffer": ">= 2.1.2 < 3.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/idb": { "version": "8.0.0", "resolved": "https://registry.npmjs.org/idb/-/idb-8.0.0.tgz", "integrity": "sha512-l//qvlAKGmQO31Qn7xdzagVPPaHTxXx199MhrAFuVBTPqydcPYBWjkrbv4Y0ktB+GmWOiwHl237UUOrLmQxLvw==" }, + "node_modules/ieee754": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/ieee754/-/ieee754-1.2.1.tgz", + "integrity": 
"sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "license": "BSD-3-Clause" + }, "node_modules/inline-style-parser": { "version": "0.2.3", "resolved": "https://registry.npmjs.org/inline-style-parser/-/inline-style-parser-0.2.3.tgz", @@ -4958,6 +5056,12 @@ "url": "https://github.com/sponsors/jonschlinkert" } }, + "node_modules/postal-mime": { + "version": "2.6.1", + "resolved": "https://registry.npmjs.org/postal-mime/-/postal-mime-2.6.1.tgz", + "integrity": "sha512-YnNYvLTsWzk1Mpf1drc7XZPR1c2XInI94H5RY/hZ9vzTSYjLw9zuebdi/K0yfR3PVnedQZUn2umTyxfw9yOWBA==", + "license": "MIT-0" + }, "node_modules/postcss": { "version": "8.5.6", "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.5.6.tgz", @@ -5399,6 +5503,12 @@ "@babel/runtime": "^7.1.2" } }, + "node_modules/safer-buffer": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz", + "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==", + "license": "MIT" + }, "node_modules/scheduler": { "version": "0.20.2", "license": "MIT", diff --git a/app/frontend/package.json b/app/frontend/package.json index 23cc9f3e02..66bead7624 100644 --- a/app/frontend/package.json +++ b/app/frontend/package.json @@ -17,6 +17,7 @@ "@fluentui/react": "^8.112.5", "@fluentui/react-components": "^9.56.2", "@fluentui/react-icons": "^2.0.265", + "@kenjiuno/msgreader": "^1.27.0-alpha.3", "@react-spring/web": "^9.7.5", "dompurify": "^3.2.4", "i18next": "^24.2.0", @@ -24,6 +25,7 @@ "i18next-http-backend": "^3.0.1", "idb": "^8.0.0", "ndjson-readablestream": "^1.2.0", + "postal-mime": "^2.6.0", "react": "^18.3.1", "react-dom": "^18.3.1", "react-helmet-async": "^2.0.5", @@ -33,7 +35,8 @@ "react-syntax-highlighter": "^16.1.0", "rehype-raw": "^7.0.0", "remark-gfm": "^4.0.0", - "scheduler": "^0.20.2" + "scheduler": "^0.20.2", + "buffer": "^6.0.3" }, "devDependencies": { "@types/dom-speech-recognition": "^0.0.7", diff --git a/app/frontend/src/components/AnalysisPanel/AnalysisPanel.module.css b/app/frontend/src/components/AnalysisPanel/AnalysisPanel.module.css index fd39388d25..e5ff0462a9 100644 --- a/app/frontend/src/components/AnalysisPanel/AnalysisPanel.module.css +++ b/app/frontend/src/components/AnalysisPanel/AnalysisPanel.module.css @@ -82,6 +82,13 @@ object-fit: contain; } +.citationContainer { + width: 100%; + overflow-x: hidden; + overflow-y: auto; + box-sizing: border-box; +} + .header { color: #123bb6; position: relative; diff --git a/app/frontend/src/components/AnalysisPanel/AnalysisPanel.tsx b/app/frontend/src/components/AnalysisPanel/AnalysisPanel.tsx index bcbe227640..c45791038f 100644 --- a/app/frontend/src/components/AnalysisPanel/AnalysisPanel.tsx +++ b/app/frontend/src/components/AnalysisPanel/AnalysisPanel.tsx @@ -6,6 +6,7 @@ import { useTranslation } from "react-i18next"; import { ChatAppResponse, getHeaders } from "../../api"; import { getToken, useLogin } from "../../authConfig"; import { MarkdownViewer } from "../MarkdownViewer"; +import { EmailViewer } from "../EmailViewer"; import { SupportingContent } from "../SupportingContent"; import styles from "./AnalysisPanel.module.css"; import { AnalysisPanelTabs } from "./AnalysisPanelTabs"; @@ -44,7 +45,8 @@ export const 
diff --git a/app/frontend/src/components/AnalysisPanel/AnalysisPanel.tsx b/app/frontend/src/components/AnalysisPanel/AnalysisPanel.tsx
index bcbe227640..c45791038f 100644
--- a/app/frontend/src/components/AnalysisPanel/AnalysisPanel.tsx
+++ b/app/frontend/src/components/AnalysisPanel/AnalysisPanel.tsx
@@ -6,6 +6,7 @@ import { useTranslation } from "react-i18next";
 import { ChatAppResponse, getHeaders } from "../../api";
 import { getToken, useLogin } from "../../authConfig";
 import { MarkdownViewer } from "../MarkdownViewer";
+import { EmailViewer } from "../EmailViewer";
 import { SupportingContent } from "../SupportingContent";
 import styles from "./AnalysisPanel.module.css";
 import { AnalysisPanelTabs } from "./AnalysisPanelTabs";
@@ -44,7 +45,8 @@ export const AnalysisPanel = ({ answer, activeTab, activeCitation, citationHeigh
         if (activeCitation) {
             // Get hash from the URL as it may contain #page=N
             // which helps browser PDF renderer jump to correct page N
-            const originalHash = activeCitation.indexOf("#") ? activeCitation.split("#")[1] : "";
+            const hashIndex = activeCitation.indexOf("#");
+            const originalHash = hashIndex >= 0 ? activeCitation.split("#")[1] : "";
             const response = await fetch(activeCitation, {
                 method: "GET",
                 headers: await getHeaders(token)
@@ -73,6 +75,9 @@
                 return <img src={citation} className={styles.citationImg} alt="Citation Image" />;
             case "md":
                 return <MarkdownViewer src={citation} />;
+            case "eml":
+            case "msg":
+                return <EmailViewer src={citation} />;
             default:
                 return