Skip to content

Commit 89d041a

Browse files
committed
Merge branch 'master' into tables-doc
2 parents 65f31c2 + f4a5079 commit 89d041a

File tree

10 files changed

+66
-72
lines changed

10 files changed

+66
-72
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,13 +34,13 @@ Check out our interactive [demo](https://aphp.github.io/edsnlp/demo/) !
3434
You can install EDS-NLP via `pip`. We recommend pinning the library version in your projects, or using a strict package manager like [Poetry](https://python-poetry.org/).
3535

3636
```shell
37-
pip install edsnlp==0.11.1
37+
pip install edsnlp==0.11.2
3838
```
3939

4040
or if you want to use the trainable components (using pytorch)
4141

4242
```shell
43-
pip install "edsnlp[ml]==0.11.1"
43+
pip install "edsnlp[ml]==0.11.2"
4444
```
4545

4646
### A first pipeline

changelog.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,11 @@
11
# Changelog
22

3+
## v0.11.2
4+
5+
### Fixed
6+
- Fix incorrect filesystem detection in `edsnlp.utils.file_system.normalize_fs_path`
7+
- Improved performance of `edsnlp.data` methods over a filesystem (`fs` parameter)
8+
39
## v0.11.1 (2024-04-02)
410

511
### Added

docs/index.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,13 @@ Check out our interactive [demo](https://aphp.github.io/edsnlp/demo/) !
1515
You can install EDS-NLP via `pip`. We recommend pinning the library version in your projects, or using a strict package manager like [Poetry](https://python-poetry.org/).
1616

1717
```{: data-md-color-scheme="slate" }
18-
pip install edsnlp==0.11.1
18+
pip install edsnlp==0.11.2
1919
```
2020

2121
or if you want to use the trainable components (using pytorch)
2222

2323
```{: data-md-color-scheme="slate" }
24-
pip install "edsnlp[ml]==0.11.1"
24+
pip install "edsnlp[ml]==0.11.2"
2525
```
2626

2727
### A first pipeline

edsnlp/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
import edsnlp.data # noqa: F401
1515
import edsnlp.pipes
1616

17-
__version__ = "0.11.1"
17+
__version__ = "0.11.2"
1818

1919
BASE_DIR = Path(__file__).parent
2020

edsnlp/data/json.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,9 @@ def __init__(
3636
self.files = (
3737
[
3838
file
39-
for file in walk_match(self.fs, os.path.dirname(self.path), "*.json*")
39+
for file in walk_match(
40+
self.fs, os.path.dirname(self.path), ".*[.]json.*"
41+
)
4042
if keep_ipynb_checkpoints or ".ipynb_checkpoints" not in str(file)
4143
]
4244
if self.fs.isdir(self.path)
@@ -128,7 +130,7 @@ def __init__(
128130
)
129131
if path_exists:
130132
if self.fs.isdir(self.path):
131-
files = [f for f in walk_match(self.fs, self.path, "*.json*")]
133+
files = [f for f in walk_match(self.fs, self.path, ".*[.]json.*")]
132134
if files:
133135
if not overwrite:
134136
raise FileExistsError(

edsnlp/data/parquet.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -83,15 +83,15 @@ def __init__(
8383
self.fs, self.path = normalize_fs_path(filesystem, path)
8484

8585
# Check that filesystem has the same protocol as indicated by path
86-
self.fs.makedir(self.path, create_parents=True)
86+
self.fs.makedirs(self.path, exist_ok=True)
87+
8788
dataset: pyarrow.dataset.FileSystemDataset = ( # type: ignore
8889
pyarrow.dataset.dataset(
8990
self.path,
9091
format="parquet",
9192
filesystem=self.fs,
9293
)
9394
)
94-
self.fs.makedirs(self.path, exist_ok=True)
9595
if len(list(dataset.files)):
9696
if not overwrite:
9797
raise FileExistsError(

edsnlp/data/standoff.py

Lines changed: 23 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,11 @@
1919
from loguru import logger
2020

2121
from edsnlp import registry
22-
from edsnlp.core import PipelineProtocol
2322
from edsnlp.core.lazy_collection import LazyCollection
2423
from edsnlp.data.base import BaseReader, BaseWriter
2524
from edsnlp.data.converters import (
2625
FILENAME,
2726
AttributesMappingArg,
28-
SequenceStr,
2927
get_dict2doc_converter,
3028
get_doc2dict_converter,
3129
)
@@ -50,7 +48,8 @@ def __init__(self, ann_file, line):
5048

5149

5250
def parse_standoff_file(
53-
path: str,
51+
txt_path: str,
52+
ann_paths: List[str],
5453
merge_spaced_fragments: bool = True,
5554
fs: FileSystem = LOCAL_FS,
5655
) -> Dict:
@@ -74,28 +73,19 @@ def parse_standoff_file(
7473
-------
7574
Iterator[Dict]
7675
"""
77-
ann_filenames = []
78-
for filename in walk_match(
79-
fs,
80-
os.path.dirname(path),
81-
os.path.basename(path).replace(".txt", ".a*"),
82-
recursive=False,
83-
):
84-
ann_filenames.append(filename)
85-
8676
entities = {}
8777
relations = []
8878
events = {}
8979

90-
with fs.open(path, "r") as f:
80+
with fs.open(txt_path, "r") as f:
9181
text = f.read()
9282

93-
if not len(ann_filenames):
83+
if not len(ann_paths):
9484
return {
9585
"text": text,
9686
}
9787

98-
for ann_file in ann_filenames:
88+
for ann_file in ann_paths:
9989
with fs.open(ann_file, "r") as f:
10090
for line_idx, line in enumerate(f):
10191
try:
@@ -303,34 +293,33 @@ def __init__(
303293
):
304294
super().__init__()
305295
self.fs, self.path = normalize_fs_path(filesystem, path)
306-
self.files: List[str] = [
296+
files = {
307297
file
308-
for file in walk_match(self.fs, self.path, "*.txt")
298+
for file in walk_match(self.fs, self.path, ".*[.](txt|a*)")
309299
if (keep_ipynb_checkpoints or ".ipynb_checkpoints" not in str(file))
310-
and (
311-
keep_txt_only_docs
312-
or walk_match(
313-
self.fs,
314-
os.path.dirname(file),
315-
os.path.basename(file).replace(".txt", ".a*"),
316-
recursive=False,
317-
)
318-
)
300+
}
301+
ann_files = {}
302+
for f in files:
303+
name, ext = os.path.splitext(f)
304+
if ext.startswith(".a"):
305+
ann_files.setdefault(name, []).append(f)
306+
self.files = [
307+
(file, ann_files.get(file.replace(".txt", ""), []))
308+
for file in files
309+
if file.endswith(".txt")
310+
and (keep_txt_only_docs or file.replace(".txt", "") in ann_files)
319311
]
320312
assert len(self.files), f"No .txt files found in the BRAT directory {self.path}"
321-
for file in self.files:
322-
if not self.fs.exists(file):
323-
raise FileNotFoundError(f"File {file} does not exist")
324313
logger.info(f"The BRAT directory contains {len(self.files)} .txt files.")
325314

326315
def read_main(self) -> Iterable[Tuple[str, int]]:
327316
return ((f, 1) for f in self.files)
328317

329318
def read_worker(self, fragment: List[str]):
330319
tasks = []
331-
for file in fragment:
332-
anns = parse_standoff_file(str(file), fs=self.fs)
333-
anns[FILENAME] = os.path.relpath(file, self.path).rsplit(".", 1)[0]
320+
for txt_path, ann_paths in fragment:
321+
anns = parse_standoff_file(txt_path, ann_paths, fs=self.fs)
322+
anns[FILENAME] = os.path.relpath(txt_path, self.path).rsplit(".", 1)[0]
334323
anns["doc_id"] = anns[FILENAME]
335324
tasks.append(anns)
336325
return tasks
@@ -350,9 +339,8 @@ def __init__(
350339

351340
if self.fs.exists(self.path):
352341
unsafe_exts = Counter(
353-
os.path.splitext(f)[1] for f in walk_match(self.fs, self.path, "*.txt")
354-
) + Counter(
355-
os.path.splitext(f)[1] for f in walk_match(self.fs, self.path, "*.a*")
342+
os.path.splitext(f)[1]
343+
for f in walk_match(self.fs, self.path, ".*[.](txt|a.*)")
356344
)
357345
if unsafe_exts and not overwrite:
358346
raise FileExistsError(

edsnlp/utils/file_system.py

Lines changed: 21 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
1-
import fnmatch
21
import os
2+
import re
33
from pathlib import Path
44
from typing import Optional, Tuple, Union
55

6+
import fsspec.implementations.local
67
import pyarrow.fs
78
from fsspec import AbstractFileSystem
89
from fsspec import __version__ as fsspec_version
@@ -22,44 +23,40 @@ def walk_match(
2223
fs: FileSystem,
2324
root: str,
2425
file_pattern: str,
25-
recursive: bool = True,
2626
) -> list:
27-
if fsspec_version >= "2023.10.0":
28-
# Version fixes fsspec glob https://github.com/fsspec/filesystem_spec/pull/1329
29-
glob_str = os.path.join(root, "**" if recursive else "", file_pattern)
30-
return fs.glob(glob_str)
3127
return [
3228
os.path.join(dirpath, f)
33-
for dirpath, dirnames, files in fs.walk(
34-
root,
35-
maxdepth=None if recursive else 1,
36-
)
37-
for f in fnmatch.filter(files, file_pattern)
29+
for dirpath, dirnames, files in fs.walk(root)
30+
for f in files
31+
if re.match(file_pattern, f)
3832
]
3933

4034

4135
def normalize_fs_path(
4236
filesystem: Optional[FileSystem],
4337
path: Union[str, Path],
4438
) -> Tuple[AbstractFileSystem, str]:
45-
path = str(path)
39+
has_protocol = isinstance(path, str) and "://" in path
40+
filesystem = (
41+
ArrowFSWrapper(filesystem)
42+
if isinstance(filesystem, pyarrow.fs.FileSystem)
43+
else filesystem
44+
)
4645

47-
if filesystem is None or (isinstance(path, str) and "://" in path):
48-
path = (
49-
os.path.abspath(path)
50-
if isinstance(path, Path) or "://" in path
51-
else f"file://{os.path.abspath(path)}"
52-
)
53-
inferred_fs, fs_path = pyarrow.fs.FileSystem.from_uri(path)
46+
# We need to detect the fs from the path
47+
if filesystem is None or has_protocol:
48+
uri: str = path if has_protocol else f"file://{os.path.abspath(path)}"
49+
inferred_fs, fs_path = fsspec.core.url_to_fs(uri)
50+
inferred_fs: fsspec.AbstractFileSystem
5451
filesystem = filesystem or inferred_fs
55-
assert inferred_fs.type_name == filesystem.type_name, (
56-
f"Protocol {inferred_fs.type_name} in path does not match "
57-
f"filesystem {filesystem.type_name}"
52+
assert inferred_fs.protocol == filesystem.protocol, (
53+
f"Protocol {inferred_fs.protocol} in path does not match "
54+
f"filesystem {filesystem.protocol}"
5855
)
59-
path = fs_path
56+
path = fs_path # path without protocol
6057

6158
return (
6259
ArrowFSWrapper(filesystem)
6360
if isinstance(filesystem, pyarrow.fs.FileSystem)
6461
else filesystem
65-
), path
62+
), str(path)

tests/data/test_parquet.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import os
12
from pathlib import Path
23

34
import pyarrow.dataset
@@ -242,7 +243,7 @@ def test_read_to_parquet(blank_nlp, tmpdir):
242243
fs = pyarrow.fs.LocalFileSystem()
243244
doc = list(
244245
edsnlp.data.read_parquet(
245-
input_dir,
246+
input_dir.relative_to(os.getcwd()),
246247
converter="omop",
247248
span_attributes=["etat", "assertion"],
248249
doc_attributes=["context_var"],

tests/training/test_train.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -87,8 +87,8 @@ def test_ner_qualif_train(run_in_test_dir, tmp_path):
8787
scorer = GenericScorer(**kwargs["scorer"])
8888
last_scores = scorer(nlp, Reader(**kwargs["val_data"])(nlp))
8989

90-
assert last_scores["exact_ner"]["micro"]["f"] > 0.5
91-
assert last_scores["qualifier"]["micro"]["f"] > 0.5
90+
assert last_scores["exact_ner"]["micro"]["f"] > 0.4
91+
assert last_scores["qualifier"]["micro"]["f"] > 0.4
9292

9393

9494
def test_qualif_train(run_in_test_dir, tmp_path):
@@ -100,7 +100,7 @@ def test_qualif_train(run_in_test_dir, tmp_path):
100100
scorer = GenericScorer(**kwargs["scorer"])
101101
last_scores = scorer(nlp, Reader(**kwargs["val_data"])(nlp))
102102

103-
assert last_scores["qualifier"]["micro"]["f"] > 0.5
103+
assert last_scores["qualifier"]["micro"]["f"] >= 0.4
104104

105105

106106
def test_optimizer():

0 commit comments

Comments (0)