Skip to content

Commit 26fe635

Browse files
committed
fix: normalize path (#283)
1 parent 012d0e4 commit 26fe635

File tree

3 files changed

+14
-11
lines changed

3 files changed

+14
-11
lines changed

changelog.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
# Changelog
22

3+
## Unreleased
4+
5+
### Fixed
6+
- Fix file system detection in `edsnlp.utils.file_system.normalize_fs_path`, which was not working correctly
7+
38
## v0.11.1 (2024-04-02)
49

510
### Added

edsnlp/utils/file_system.py

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -42,24 +42,21 @@ def normalize_fs_path(
4242
filesystem: Optional[FileSystem],
4343
path: Union[str, Path],
4444
) -> Tuple[AbstractFileSystem, str]:
45-
path = str(path)
45+
has_protocol = isinstance(path, str) and "://" in path
4646

47-
if filesystem is None or (isinstance(path, str) and "://" in path):
48-
path = (
49-
os.path.abspath(path)
50-
if isinstance(path, Path) or "://" in path
51-
else f"file://{os.path.abspath(path)}"
52-
)
53-
inferred_fs, fs_path = pyarrow.fs.FileSystem.from_uri(path)
47+
# We need to detect the fs from the path
48+
if filesystem is None or has_protocol:
49+
uri: str = path if has_protocol else f"file://{os.path.abspath(path)}"
50+
inferred_fs, fs_path = pyarrow.fs.FileSystem.from_uri(uri)
5451
filesystem = filesystem or inferred_fs
5552
assert inferred_fs.type_name == filesystem.type_name, (
5653
f"Protocol {inferred_fs.type_name} in path does not match "
5754
f"filesystem {filesystem.type_name}"
5855
)
59-
path = fs_path
56+
path = fs_path # path without protocol
6057

6158
return (
6259
ArrowFSWrapper(filesystem)
6360
if isinstance(filesystem, pyarrow.fs.FileSystem)
6461
else filesystem
65-
), path
62+
), str(path)

tests/data/test_parquet.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import os
12
from pathlib import Path
23

34
import pyarrow.dataset
@@ -242,7 +243,7 @@ def test_read_to_parquet(blank_nlp, tmpdir):
242243
fs = pyarrow.fs.LocalFileSystem()
243244
doc = list(
244245
edsnlp.data.read_parquet(
245-
input_dir,
246+
input_dir.relative_to(os.getcwd()),
246247
converter="omop",
247248
span_attributes=["etat", "assertion"],
248249
doc_attributes=["context_var"],

0 commit comments

Comments
 (0)