 from pathlib import Path
 from typing import Optional, Union

+import pandas as pd
+

 def should_skip_path(path: Union[str, Path]) -> bool:
     """
@@ -50,3 +52,110 @@ def _find_folder_path(base_path: Path, target_folder_name: str | Path) -> Option
             return Path(root) / target_folder_name

     return None
+
+
+def flexible_data_reader(
+    file_path: Union[str, Path],
+    separators: Optional[list[str]] = None,
+    required_columns: Optional[list[str]] = None,
+) -> pd.DataFrame:
+    """
+    Flexible data reader that can handle different file formats and delimiters.
+
+    Tries to intelligently read CSV, TXT, and Excel files by:
+    1. Detecting Excel formats and using pd.read_excel()
+    2. Trying pd.read_csv() with sep=None (auto-detection) using the python engine
+    3. Falling back to specific separators if provided
+
+    Args:
+        file_path: Path to the data file (csv, txt, xlsx, etc.)
+        separators: Optional list of separators to try (e.g., [",", "\\t", r"\\s+"]).
+            If None, uses pandas' auto-detection first.
+        required_columns: Optional list of column names that must be present
+
+    Returns:
+        pd.DataFrame: The loaded dataframe
+
+    Raises:
+        FileNotFoundError: If the file does not exist
+        ValueError: If the file cannot be parsed or required columns are missing
+    """
+    EXCEL_LIKE_EXTENSIONS = {".xlsx", ".xls", ".xlsm", ".xlsb", ".odf", ".ods", ".odt"}
+
+    file_path = Path(file_path)
+    if not file_path.exists():
+        raise FileNotFoundError(f"File does not exist: {file_path}")
+
+    # Handle Excel-like formats
+    if file_path.suffix.lower() in EXCEL_LIKE_EXTENSIONS:
+        try:
+            df_dict = pd.read_excel(file_path, sheet_name=None)
+            df = next(iter(df_dict.values()))
+
+            if len(df_dict) > 1:
+                import warnings
+
+                warnings.warn(
+                    f"Found {len(df_dict)} sheets in {file_path.name}, using only the first one."
+                )
+        except Exception as e:
+            raise ValueError(f"Failed to read Excel file {file_path}. Error: {e}") from e
+    else:
+        # Try pandas auto-detection first (works for most well-formed CSV/TSV files)
+        try:
+            df = pd.read_csv(
+                file_path,
+                sep=None,  # Auto-detect separator
+                encoding_errors="backslashreplace",
+                engine="python",  # sep=None delimiter sniffing requires the python engine
+            )
+
+            # Check if we got a reasonable result
+            if len(df.columns) > 1 and not df.empty:
+                # Successfully read with auto-detection
+                pass
+            else:
+                raise ValueError("Auto-detection resulted in single column or empty dataframe")
+
+        except Exception as auto_detect_error:
+            # If auto-detection fails, try specific separators
+            if separators is None:
+                separators = [",", "\t", r"\s+", ";"]
+
+            df = None
+            errors = []
+
+            for sep in separators:
+                try:
+                    df = pd.read_csv(file_path, sep=sep, encoding_errors="backslashreplace")
+
+                    # Validate we got reasonable data
+                    if len(df.columns) > 1 and not df.empty:
+                        break
+                    else:
+                        errors.append(f"sep='{sep}': single column or empty result")
+                        df = None
+
+                except Exception as e:
+                    errors.append(f"sep='{sep}': {type(e).__name__}")
+                    continue
+
+            if df is None:
+                error_summary = "; ".join(errors)
+                raise ValueError(
+                    f"Failed to parse file {file_path}. "
+                    f"Tried separators: {separators}. "
+                    f"Errors: {error_summary}. "
+                    f"Original auto-detect error: {auto_detect_error}"
+                ) from auto_detect_error
+
+    # Validate required columns if specified
+    if required_columns:
+        missing_columns = set(required_columns) - set(df.columns)
+        if missing_columns:
+            raise ValueError(
+                f"File {file_path} is missing required columns: {missing_columns}. "
+                f"Available columns: {list(df.columns)}"
+            )
+
+    return df
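
For reference, a minimal usage sketch of the new reader, outside the diff itself. The import path, file name, separators, and column names below are hypothetical placeholders for illustration; only flexible_data_reader and its parameters come from the change above.

from pathlib import Path

# Hypothetical import path; the actual module name is not shown in this diff.
from data_utils import flexible_data_reader

# Hypothetical file and required columns, for illustration only.
df = flexible_data_reader(
    Path("measurements.txt"),
    separators=["\t", ";"],                   # only consulted if auto-detection fails
    required_columns=["sample_id", "value"],  # raises ValueError if any are missing
)
print(df.shape, list(df.columns))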