Skip to content

Commit f6cd1cf

Browse files
committed
Added a more robust CSV, TXT, and Excel reader for parsing log files, based on the tabular data block in main datalab
1 parent 1395da3 commit f6cd1cf

File tree

2 files changed

+124
-53
lines changed

2 files changed

+124
-53
lines changed

src/datalab_app_plugin_insitu/apps/xrd/utils.py

Lines changed: 15 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from scipy.interpolate import interp1d
1111

1212
from datalab_app_plugin_insitu.echem_utils import process_echem_data
13-
from datalab_app_plugin_insitu.utils import _find_folder_path
13+
from datalab_app_plugin_insitu.utils import _find_folder_path, flexible_data_reader
1414

1515

1616
def process_local_xrd_data(
def load_temperature_log_file(log_file: Path) -> pd.DataFrame:
    """
    Load a temperature log file and return it as a DataFrame. This currently assumes the Temperature is recorded in Celsius.

    Supports CSV, TXT, and Excel files with automatic delimiter detection.

    Args:
        log_file (Path): Path to the temperature log file, must contain scan_number and Temp as column headers.

    Returns:
        pd.DataFrame: DataFrame containing the temperature log data.

    Raises:
        FileNotFoundError: If the log file does not exist.
        ValueError: If the file cannot be parsed or required columns are missing.
    """
    # All existence, parsing, and column checks are delegated to the shared reader.
    return flexible_data_reader(log_file, required_columns=["scan_number", "Temp"])

def load_echem_log_file(log_file: Path) -> pd.DataFrame:
    """
    Load an electrochemical log file and return it as a DataFrame.

    Supports CSV, TXT, and Excel files with automatic delimiter detection.

    Args:
        log_file (Path): Path to the electrochemical log file, must contain scan_number, start_time and end_time as column headers.

    Returns:
        pd.DataFrame: DataFrame containing the electrochemical log data.

    Raises:
        FileNotFoundError: If the log file does not exist.
        ValueError: If the file cannot be parsed or required columns are missing.
    """
    # All existence, parsing, and column checks are delegated to the shared reader.
    return flexible_data_reader(
        log_file, required_columns=["scan_number", "start_time", "end_time"]
    )

src/datalab_app_plugin_insitu/utils.py

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
from pathlib import Path
33
from typing import Optional, Union
44

5+
import pandas as pd
6+
57

68
def should_skip_path(path: Union[str, Path]) -> bool:
79
"""
@@ -50,3 +52,110 @@ def _find_folder_path(base_path: Path, target_folder_name: str | Path) -> Option
5052
return Path(root) / target_folder_name
5153

5254
return None
55+
56+
57+
def _read_excel_first_sheet(file_path: Path) -> pd.DataFrame:
    """Read the first sheet of an Excel-like file, warning if other sheets are ignored."""
    import warnings

    try:
        # sheet_name=None loads every sheet into an ordered dict of DataFrames.
        sheets = pd.read_excel(file_path, sheet_name=None)
        df = next(iter(sheets.values()))
    except Exception as e:
        raise ValueError(f"Failed to read Excel file {file_path}. Error: {e}") from e

    # Warn outside the try block so an escalated warning (warnings-as-errors)
    # is not mistakenly re-wrapped as a "Failed to read Excel file" ValueError.
    if len(sheets) > 1:
        warnings.warn(
            f"Found {len(sheets)} sheets in {file_path.name}, using only the first one."
        )
    return df


def _read_delimited(file_path: Path, separators: Optional[list[str]]) -> pd.DataFrame:
    """Read a delimited text file: auto-detect the separator, then fall back to candidates.

    A parse is accepted only if it yields more than one column and at least one
    row; a single-column result almost always means the separator was wrong.
    """
    try:
        df = pd.read_csv(
            file_path,
            sep=None,  # Let pandas' python engine sniff the delimiter
            encoding_errors="backslashreplace",
            engine="python",
        )
        if len(df.columns) > 1 and not df.empty:
            return df
        # Record the failure reason without raising through our own except block.
        auto_detect_error: Exception = ValueError(
            "Auto-detection resulted in single column or empty dataframe"
        )
    except Exception as e:
        auto_detect_error = e

    if separators is None:
        separators = [",", "\t", r"\s+", ";"]

    errors = []
    for sep in separators:
        try:
            df = pd.read_csv(file_path, sep=sep, encoding_errors="backslashreplace")
        except Exception as e:
            errors.append(f"sep='{sep}': {type(e).__name__}")
            continue

        if len(df.columns) > 1 and not df.empty:
            return df
        errors.append(f"sep='{sep}': Single column or empty result")

    raise ValueError(
        f"Failed to parse file {file_path}. "
        f"Tried separators: {separators}. "
        f"Errors: {'; '.join(errors)}. "
        f"Original auto-detect error: {auto_detect_error}"
    ) from auto_detect_error


def flexible_data_reader(
    file_path: Union[str, Path],
    separators: Optional[list[str]] = None,
    required_columns: Optional[list[str]] = None,
) -> pd.DataFrame:
    """
    Flexible data reader that can handle different file formats and delimiters.

    Tries to intelligently read CSV, TXT, and Excel files by:
    1. Detecting Excel formats and using pd.read_excel()
    2. Trying pd.read_csv() with sep=None (auto-detection) using python engine
    3. Falling back to trying specific separators if provided

    Args:
        file_path: Path to the data file (csv, txt, xlsx, etc.)
        separators: Optional list of separators to try (e.g., [",", "\t", r"\\s+"])
            If None, uses pandas' auto-detection first
        required_columns: Optional list of column names that must be present

    Returns:
        pd.DataFrame: The loaded dataframe

    Raises:
        FileNotFoundError: If the file does not exist
        ValueError: If the file cannot be parsed or required columns are missing
    """
    EXCEL_LIKE_EXTENSIONS = {".xlsx", ".xls", ".xlsm", ".xlsb", ".odf", ".ods", ".odt"}

    file_path = Path(file_path)
    if not file_path.exists():
        raise FileNotFoundError(f"File does not exist: {file_path}")

    if file_path.suffix.lower() in EXCEL_LIKE_EXTENSIONS:
        df = _read_excel_first_sheet(file_path)
    else:
        df = _read_delimited(file_path, separators)

    # Validate required columns if specified
    if required_columns:
        missing_columns = set(required_columns) - set(df.columns)
        if missing_columns:
            raise ValueError(
                f"File {file_path} is missing required columns: {missing_columns}. "
                f"Available columns: {list(df.columns)}"
            )

    return df

0 commit comments

Comments
 (0)