Azure-Samples · kachihro · Dec 10, 2025 · Dec 10, 2025
diff --git a/app/backend/package-lock.json b/app/backend/package-lock.json
diff --git a/app/backend/prepdocslib/blobmanager.py b/app/backend/prepdocslib/blobmanager.py
@@ -40,7 +40,8 @@ def sourcepage_from_file_page(cls, filename, page=0) -> str:
 
     @classmethod
     def blob_name_from_file_name(cls, filename) -> str:
-        return os.path.basename(filename)
+        blob_name = os.path.basename(filename)
+        return blob_name.replace("#", "_")
 
     @classmethod
     def add_image_citation(
@@ -184,8 +185,11 @@ async def upload_blob(self, file: File | IO, filename: str, user_oid: str) -> st
         # Ensure user directory exists but don't create a subdirectory
         user_directory_client = await self._ensure_directory(directory_path=user_oid, user_oid=user_oid)
 
+        # Sanitize filename by replacing # with _ before uploading
+        sanitized_filename = self.blob_name_from_file_name(filename)
+
         # Create file directly in user directory
-        file_client = user_directory_client.get_file_client(filename)
+        file_client = user_directory_client.get_file_client(sanitized_filename)
 
         # Handle both File and IO objects
         if isinstance(file, File):
@@ -216,9 +220,11 @@ def _get_image_directory_path(self, document_filename: str, user_oid: str, page_
         Returns:
             str: Full path to the image directory
         """
+        # Sanitize document filename by replacing # with _
+        sanitized_doc_filename = self.blob_name_from_file_name(document_filename)
         if page_num is not None:
-            return f"{user_oid}/images/{document_filename}/page_{page_num}"
-        return f"{user_oid}/images/{document_filename}"
+            return f"{user_oid}/images/{sanitized_doc_filename}/page_{page_num}"
+        return f"{user_oid}/images/{sanitized_doc_filename}"
 
     async def upload_document_image(
         self,
@@ -248,7 +254,8 @@ async def upload_document_image(
         await self._ensure_directory(directory_path=user_oid, user_oid=user_oid)
         image_directory_path = self._get_image_directory_path(document_filename, user_oid, image_page_num)
         image_directory_client = await self._ensure_directory(directory_path=image_directory_path, user_oid=user_oid)
-        file_client = image_directory_client.get_file_client(image_filename)
+        sanitized_image_filename = self.blob_name_from_file_name(image_filename)
+        file_client = image_directory_client.get_file_client(sanitized_image_filename)
         image_bytes = BaseBlobManager.add_image_citation(image_bytes, document_filename, image_filename, image_page_num)
         logger.info("Uploading document image '%s' to '%s'", image_filename, image_directory_path)
         await file_client.upload_data(image_bytes, overwrite=True, metadata={"UploadedBy": user_oid})
@@ -292,7 +299,9 @@ async def download_blob(
 
         try:
             user_directory_client = await self._ensure_directory(directory_path=directory_path, user_oid=user_oid)
-            file_client = user_directory_client.get_file_client(filename)
+            # Sanitize filename by replacing # with _ to match uploaded filename
+            sanitized_filename = self.blob_name_from_file_name(filename)
+            file_client = user_directory_client.get_file_client(sanitized_filename)
             download_response = await file_client.download_file()
             content = await download_response.readall()
 
@@ -456,7 +465,10 @@ async def upload_document_image(
         if not await container_client.exists():
             await container_client.create_container()
         image_bytes = self.add_image_citation(image_bytes, document_filename, image_filename, image_page_num)
-        blob_name = f"{self.blob_name_from_file_name(document_filename)}/page{image_page_num}/{image_filename}"
+        # Sanitize both document and image filenames by replacing # with _
+        sanitized_doc_filename = self.blob_name_from_file_name(document_filename)
+        sanitized_image_filename = self.blob_name_from_file_name(image_filename)
+        blob_name = f"{sanitized_doc_filename}/page{image_page_num}/{sanitized_image_filename}"
         logger.info("Uploading blob for document image '%s'", blob_name)
         blob_client = await container_client.upload_blob(blob_name, image_bytes, overwrite=True)
         return blob_client.url

diff --git a/app/backend/prepdocslib/cloudingestionstrategy.py b/app/backend/prepdocslib/cloudingestionstrategy.py
@@ -1,6 +1,7 @@
 """Cloud ingestion strategy using Azure AI Search custom skills."""
 
 import logging
+import os
 from dataclasses import dataclass
 from datetime import timedelta
 
@@ -319,6 +320,28 @@ async def run(self) -> None:
         files = self.list_file_strategy.list()
         async for file in files:
             try:
+                # Check if filename contains # and rename on disk if it does
+                if hasattr(file.content, "name") and "#" in file.filename():
+                    original_path = file.content.name
+                    if os.path.exists(original_path) and os.path.isfile(original_path):
+                        # Get directory and filename
+                        directory = os.path.dirname(original_path)
+                        original_filename = os.path.basename(original_path)
+                        new_filename = original_filename.replace("#", "_")
+                        new_path = os.path.join(directory, new_filename)
+
+                        # Only rename if the new filename is different
+                        if new_path != original_path:
+                            # Close the current file handle
+                            file.content.close()
+
+                            # Rename the file on disk
+                            os.rename(original_path, new_path)
+                            logger.info("Renamed file from '%s' to '%s' (replaced # with _)", original_filename, new_filename)
+
+                            # Reopen the file with the new name
+                            file.content = open(new_path, mode="rb")
+
                 await self.blob_manager.upload_blob(file)
             finally:
                 if file:

diff --git a/app/backend/prepdocslib/emailparser.py b/app/backend/prepdocslib/emailparser.py
@@ -0,0 +1,137 @@
+"""
+Email parser for MSG and EML files.
+Compatible with the prepdocs pipeline structure.
+"""
+import logging
+import email
+from email import policy
+from typing import AsyncGenerator, IO, Union, Optional
+import extract_msg
+from io import BytesIO
+import re
+
+from .page import Page
+
+logger = logging.getLogger(__name__)
+
+
+class EmailParser:
+    """Parser for EML (MIME) email files.
+
+    The whole message is rendered as one Page containing the main headers,
+    the message body and a note for every named attachment.
+    """
+
+    # Simple HTML tag removal (consider using BeautifulSoup for better results)
+    _HTML_TAG_RE = re.compile(r"<[^<]+?>")
+
+    async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
+        """Parse EML file content and yield it as a single page.
+
+        Args:
+            content: Binary file-like object holding the raw EML data.
+
+        Yields:
+            One Page (page_num=0, offset=0) with headers, body and attachment names.
+
+        Raises:
+            ValueError: If the content cannot be read or parsed as an email.
+        """
+        try:
+            full_text = self._extract_text(content)
+        except Exception as e:
+            logger.error("Error parsing EML file: %s", e)
+            raise ValueError(f"Failed to parse EML file: {e}") from e
+
+        # Yield outside the try block so that exceptions raised by the consumer
+        # are not reported as EML parsing failures.
+        yield Page(page_num=0, offset=0, text=full_text)
+
+    def _extract_text(self, content: IO) -> str:
+        """Return the headers, body and attachment notes of the EML data as text."""
+        msg = email.message_from_bytes(content.read(), policy=policy.default)
+
+        header_lines = [
+            f"From: {msg.get('From', 'Unknown')}",
+            f"To: {msg.get('To', 'Unknown')}",
+            f"Subject: {msg.get('Subject', 'No Subject')}",
+            f"Date: {msg.get('Date', 'Unknown')}",
+            "\n" + "=" * 80 + "\n",
+        ]
+
+        plain_bodies = []
+        html_bodies = []
+        attachment_notes = []
+
+        if msg.is_multipart():
+            for part in msg.walk():
+                content_type = part.get_content_type()
+                content_disposition = str(part.get("Content-Disposition", ""))
+
+                # Attachments are only mentioned by name, never inlined.
+                if "attachment" in content_disposition:
+                    filename = part.get_filename()
+                    if filename:
+                        attachment_notes.append(f"\n[Attachment: {filename}]")
+                    continue
+
+                if content_type not in ("text/plain", "text/html"):
+                    continue
+
+                is_plain = content_type == "text/plain"
+                try:
+                    body = part.get_content()
+                except Exception as e:
+                    logger.warning("Could not extract %s part: %s", "text" if is_plain else "HTML", e)
+                    continue
+
+                if is_plain:
+                    plain_bodies.append(body)
+                else:
+                    html_bodies.append(self._HTML_TAG_RE.sub("", body))
+        else:
+            # Single part message
+            try:
+                body = msg.get_content()
+            except Exception as e:
+                logger.warning("Could not extract message body: %s", e)
+            else:
+                if not isinstance(body, str):
+                    # A binary single-part body cannot be joined into the page text.
+                    logger.warning("Skipping non-text message body of type %s", msg.get_content_type())
+                elif msg.get_content_type() == "text/html":
+                    html_bodies.append(self._HTML_TAG_RE.sub("", body))
+                else:
+                    plain_bodies.append(body)
+
+        # Prefer plain text over HTML: multipart/alternative messages carry the
+        # same body in both formats, and emitting both would duplicate it.
+        body_parts = plain_bodies or html_bodies
+
+        return "\n".join(header_lines + body_parts + attachment_notes)
+
+
+class MsgParser:
+    """Parser for MSG (Outlook) email files.
+
+    The whole message is rendered as one Page containing the main headers,
+    the message body and the list of attachment names.
+    """
+
+    # Simple HTML tag removal
+    _HTML_TAG_RE = re.compile(r"<[^<]+?>")
+
+    async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
+        """Parse MSG file content and yield it as a single page.
+
+        Args:
+            content: Binary file-like object holding the raw MSG data.
+
+        Yields:
+            One Page (page_num=0, offset=0) with headers, body and attachment names.
+
+        Raises:
+            ValueError: If the content cannot be read or parsed as an MSG file.
+        """
+        try:
+            full_text = self._extract_text(content)
+        except Exception as e:
+            logger.error("Error parsing MSG file: %s", e)
+            raise ValueError(f"Failed to parse MSG file: {e}") from e
+
+        # Yield outside the try block so that exceptions raised by the consumer
+        # are not reported as MSG parsing failures.
+        yield Page(page_num=0, offset=0, text=full_text)
+
+    def _extract_text(self, content: IO) -> str:
+        """Return the headers, body and attachment list of the MSG data as text."""
+        # Read bytes from IO object and hand them to extract_msg via BytesIO
+        msg = extract_msg.Message(BytesIO(content.read()))
+        try:
+            text_parts = [
+                f"From: {msg.sender or 'Unknown'}",
+                f"To: {msg.to or 'Unknown'}",
+                f"Subject: {msg.subject or 'No Subject'}",
+                f"Date: {msg.date or 'Unknown'}",
+                "\n" + "=" * 80 + "\n",
+            ]
+
+            # Add body (prefer plain text over HTML)
+            if msg.body:
+                text_parts.append(msg.body)
+            elif msg.htmlBody:
+                html_body = msg.htmlBody
+                # NOTE(review): extract_msg appears to return htmlBody as bytes;
+                # a str regex on bytes raises TypeError, so decode first.
+                # UTF-8 is an assumption - confirm against the message code page.
+                if isinstance(html_body, bytes):
+                    html_body = html_body.decode("utf-8", errors="replace")
+                text_parts.append(self._HTML_TAG_RE.sub("", html_body))
+
+            # Note attachments
+            if msg.attachments:
+                text_parts.append("\n\nAttachments:")
+                text_parts.extend(
+                    f"  - {attachment.longFilename or attachment.shortFilename}"
+                    for attachment in msg.attachments
+                )
+
+            return "\n".join(text_parts)
+        finally:
+            # Always release the underlying file, even when extraction fails.
+            msg.close()
diff --git a/app/backend/prepdocslib/integratedvectorizerstrategy.py b/app/backend/prepdocslib/integratedvectorizerstrategy.py
@@ -170,6 +170,28 @@ async def run(self):
             files = self.list_file_strategy.list()
             async for file in files:
                 try:
+                    # Check if filename contains # and rename on disk if it does
+                    if hasattr(file.content, "name") and "#" in file.filename():
+                        original_path = file.content.name
+                        if os.path.exists(original_path) and os.path.isfile(original_path):
+                            # Get directory and filename
+                            directory = os.path.dirname(original_path)
+                            original_filename = os.path.basename(original_path)
+                            new_filename = original_filename.replace("#", "_")
+                            new_path = os.path.join(directory, new_filename)
+
+                            # Only rename if the new filename is different
+                            if new_path != original_path:
+                                # Close the current file handle
+                                file.content.close()
+
+                                # Rename the file on disk
+                                os.rename(original_path, new_path)
+                                logger.info("Renamed file from '%s' to '%s' (replaced # with _)", original_filename, new_filename)
+
+                                # Reopen the file with the new name
+                                file.content = open(new_path, mode="rb")
+
                     await self.blob_manager.upload_blob(file)
                 finally:
                     if file:

diff --git a/app/backend/prepdocslib/servicesetup.py b/app/backend/prepdocslib/servicesetup.py
@@ -23,6 +23,8 @@
 from .strategy import SearchInfo
 from .textparser import TextParser
 from .textsplitter import SentenceTextSplitter, SimpleTextSplitter
+from .emailparser import EmailParser
+from .emailparser import MsgParser
 
 logger = logging.getLogger("scripts")
 
@@ -290,6 +292,8 @@ def build_file_processors(
         ".md": FileProcessor(TextParser(), sentence_text_splitter),
         ".txt": FileProcessor(TextParser(), sentence_text_splitter),
         ".csv": FileProcessor(CsvParser(), sentence_text_splitter),
+        ".eml": FileProcessor(EmailParser(), sentence_text_splitter),
+        ".msg": FileProcessor(MsgParser(), sentence_text_splitter),
     }
     # These require either a Python package or Document Intelligence
     if pdf_parser is not None:

diff --git a/app/backend/requirements.in b/app/backend/requirements.in
@@ -31,3 +31,4 @@ python-dotenv
 prompty
 rich
 typing-extensions
+extract-msg
diff --git a/app/backend/requirements.txt b/app/backend/requirements.txt
@@ -453,4 +453,6 @@ wsproto==1.2.0
 yarl==1.17.2
     # via aiohttp
 zipp==3.21.0
-    # via importlib-metadata
+    # via importlib-metadata
+extract-msg==0.42.1
+    # via -r requirements.in
-Original file line number
+Diff line change
@@ Expand Up / @@ -31,3 +31,4 @@ python-dotenv @@
     prompty
     rich
     typing-extensions
+    extract-msg