Skip to content

Commit 89d041a

Browse files
committed
Merge branch 'master' into tables-doc
2 parents 65f31c2 + f4a5079 commit 89d041a

File tree

10 files changed

+66
-72
lines changed

10 files changed

+66
-72
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,13 +34,13 @@ Check out our interactive [demo](https://aphp.github.io/edsnlp/demo/) !
3434
You can install EDS-NLP via `pip`. We recommend pinning the library version in your projects, or using a strict package manager like [Poetry](https://python-poetry.org/).
3535

3636
```shell
37-
pip install edsnlp==0.11.1
37+
pip install edsnlp==0.11.2
3838
```
3939

4040
or if you want to use the trainable components (using pytorch)
4141

4242
```shell
43-
pip install "edsnlp[ml]==0.11.1"
43+
pip install "edsnlp[ml]==0.11.2"
4444
```
4545

4646
### A first pipeline

changelog.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,11 @@
11
# Changelog
22

3+
## v0.11.2
4+
5+
### Fixed
6+
- Fix incorrect filesystem detection in `edsnlp.utils.file_system.normalize_fs_path`
7+
- Improved performance of `edsnlp.data` methods over a filesystem (`fs` parameter)
8+
39
## v0.11.1 (2024-04-02)
410

511
### Added

docs/index.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,13 @@ Check out our interactive [demo](https://aphp.github.io/edsnlp/demo/) !
1515
You can install EDS-NLP via `pip`. We recommend pinning the library version in your projects, or using a strict package manager like [Poetry](https://python-poetry.org/).
1616

1717
```{: data-md-color-scheme="slate" }
18-
pip install edsnlp==0.11.1
18+
pip install edsnlp==0.11.2
1919
```
2020

2121
or if you want to use the trainable components (using pytorch)
2222

2323
```{: data-md-color-scheme="slate" }
24-
pip install "edsnlp[ml]==0.11.1"
24+
pip install "edsnlp[ml]==0.11.2"
2525
```
2626

2727
### A first pipeline

edsnlp/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
import edsnlp.data # noqa: F401
1515
import edsnlp.pipes
1616

17-
__version__ = "0.11.1"
17+
__version__ = "0.11.2"
1818

1919
BASE_DIR = Path(__file__).parent
2020

edsnlp/data/json.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,9 @@ def __init__(
3636
self.files = (
3737
[
3838
file
39-
for file in walk_match(self.fs, os.path.dirname(self.path), "*.json*")
39+
for file in walk_match(
40+
self.fs, os.path.dirname(self.path), ".*[.]json.*"
41+
)
4042
if keep_ipynb_checkpoints or ".ipynb_checkpoints" not in str(file)
4143
]
4244
if self.fs.isdir(self.path)
@@ -128,7 +130,7 @@ def __init__(
128130
)
129131
if path_exists:
130132
if self.fs.isdir(self.path):
131-
files = [f for f in walk_match(self.fs, self.path, "*.json*")]
133+
files = [f for f in walk_match(self.fs, self.path, ".*[.]json.*")]
132134
if files:
133135
if not overwrite:
134136
raise FileExistsError(

edsnlp/data/parquet.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -83,15 +83,15 @@ def __init__(
8383
self.fs, self.path = normalize_fs_path(filesystem, path)
8484

8585
# Check that filesystem has the same protocol as indicated by path
86-
self.fs.makedir(self.path, create_parents=True)
86+
self.fs.makedirs(self.path, exist_ok=True)
87+
8788
dataset: pyarrow.dataset.FileSystemDataset = ( # type: ignore
8889
pyarrow.dataset.dataset(
8990
self.path,
9091
format="parquet",
9192
filesystem=self.fs,
9293
)
9394
)
94-
self.fs.makedirs(self.path, exist_ok=True)
9595
if len(list(dataset.files)):
9696
if not overwrite:
9797
raise FileExistsError(

edsnlp/data/standoff.py

Lines changed: 23 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,11 @@
1919
from loguru import logger
2020

2121
from edsnlp import registry
22-
from edsnlp.core import PipelineProtocol
2322
from edsnlp.core.lazy_collection import LazyCollection
2423
from edsnlp.data.base import BaseReader, BaseWriter
2524
from edsnlp.data.converters import (
2625
FILENAME,
2726
AttributesMappingArg,
28-
SequenceStr,
2927
get_dict2doc_converter,
3028
get_doc2dict_converter,
3129
)
@@ -50,7 +48,8 @@ def __init__(self, ann_file, line):
5048

5149

5250
def parse_standoff_file(
53-
path: str,
51+
txt_path: str,
52+
ann_paths: List[str],
5453
merge_spaced_fragments: bool = True,
5554
fs: FileSystem = LOCAL_FS,
5655
) -> Dict:
@@ -74,28 +73,19 @@ def parse_standoff_file(
7473
-------
7574
Iterator[Dict]
7675
"""
77-
ann_filenames = []
78-
for filename in walk_match(
79-
fs,
80-
os.path.dirname(path),
81-
os.path.basename(path).replace(".txt", ".a*"),
82-
recursive=False,
83-
):
84-
ann_filenames.append(filename)
85-
8676
entities = {}
8777
relations = []
8878
events = {}
8979

90-
with fs.open(path, "r") as f:
80+
with fs.open(txt_path, "r") as f:
9181
text = f.read()
9282

93-
if not len(ann_filenames):
83+
if not len(ann_paths):
9484
return {
9585
"text": text,
9686
}
9787

98-
for ann_file in ann_filenames:
88+
for ann_file in ann_paths:
9989
with fs.open(ann_file, "r") as f:
10090
for line_idx, line in enumerate(f):
10191
try:
@@ -303,34 +293,33 @@ def __init__(
303293
):
304294
super().__init__()
305295
self.fs, self.path = normalize_fs_path(filesystem, path)
306-
self.files: List[str] = [
296+
files = {
307297
file
308-
for file in walk_match(self.fs, self.path, "*.txt")
298+
for file in walk_match(self.fs, self.path, ".*[.](txt|a*)")
309299
if (keep_ipynb_checkpoints or ".ipynb_checkpoints" not in str(file))
310-
and (
311-
keep_txt_only_docs
312-
or walk_match(
313-
self.fs,
314-
os.path.dirname(file),
315-
os.path.basename(file).replace(".txt", ".a*"),
316-
recursive=False,
317-
)
318-
)
300+
}
301+
ann_files = {}
302+
for f in files:
303+
name, ext = os.path.splitext(f)
304+
if ext.startswith(".a"):
305+
ann_files.setdefault(name, []).append(f)
306+
self.files = [
307+
(file, ann_files.get(file.replace(".txt", ""), []))
308+
for file in files
309+
if file.endswith(".txt")
310+
and (keep_txt_only_docs or file.replace(".txt", "") in ann_files)
319311
]
320312
assert len(self.files), f"No .txt files found in the BRAT directory {self.path}"
321-
for file in self.files:
322-
if not self.fs.exists(file):
323-
raise FileNotFoundError(f"File {file} does not exist")
324313
logger.info(f"The BRAT directory contains {len(self.files)} .txt files.")
325314

326315
def read_main(self) -> Iterable[Tuple[str, int]]:
327316
return ((f, 1) for f in self.files)
328317

329318
def read_worker(self, fragment: List[str]):
330319
tasks = []
331-
for file in fragment:
332-
anns = parse_standoff_file(str(file), fs=self.fs)
333-
anns[FILENAME] = os.path.relpath(file, self.path).rsplit(".", 1)[0]
320+
for txt_path, ann_paths in fragment:
321+
anns = parse_standoff_file(txt_path, ann_paths, fs=self.fs)
322+
anns[FILENAME] = os.path.relpath(txt_path, self.path).rsplit(".", 1)[0]
334323
anns["doc_id"] = anns[FILENAME]
335324
tasks.append(anns)
336325
return tasks
@@ -350,9 +339,8 @@ def __init__(
350339

351340
if self.fs.exists(self.path):
352341
unsafe_exts = Counter(
353-
os.path.splitext(f)[1] for f in walk_match(self.fs, self.path, "*.txt")
354-
) + Counter(
355-
os.path.splitext(f)[1] for f in walk_match(self.fs, self.path, "*.a*")
342+
os.path.splitext(f)[1]
343+
for f in walk_match(self.fs, self.path, ".*[.](txt|a.*)")
356344
)
357345
if unsafe_exts and not overwrite:
358346
raise FileExistsError(

edsnlp/utils/file_system.py

Lines changed: 21 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
1-
import fnmatch
21
import os
2+
import re
33
from pathlib import Path
44
from typing import Optional, Tuple, Union
55

6+
import fsspec.implementations.local
67
import pyarrow.fs
78
from fsspec import AbstractFileSystem
89
from fsspec import __version__ as fsspec_version
@@ -22,44 +23,40 @@ def walk_match(
2223
fs: FileSystem,
2324
root: str,
2425
file_pattern: str,
25-
recursive: bool = True,
2626
) -> list:
27-
if fsspec_version >= "2023.10.0":
28-
# Version fixes fsspec glob https://github.com/fsspec/filesystem_spec/pull/1329
29-
glob_str = os.path.join(root, "**" if recursive else "", file_pattern)
30-
return fs.glob(glob_str)
3127
return [
3228
os.path.join(dirpath, f)
33-
for dirpath, dirnames, files in fs.walk(
34-
root,
35-
maxdepth=None if recursive else 1,
36-
)
37-
for f in fnmatch.filter(files, file_pattern)
29+
for dirpath, dirnames, files in fs.walk(root)
30+
for f in files
31+
if re.match(file_pattern, f)
3832
]
3933

4034

4135
def normalize_fs_path(
4236
filesystem: Optional[FileSystem],
4337
path: Union[str, Path],
4438
) -> Tuple[AbstractFileSystem, str]:
45-
path = str(path)
39+
has_protocol = isinstance(path, str) and "://" in path
40+
filesystem = (
41+
ArrowFSWrapper(filesystem)
42+
if isinstance(filesystem, pyarrow.fs.FileSystem)
43+
else filesystem
44+
)
4645

47-
if filesystem is None or (isinstance(path, str) and "://" in path):
48-
path = (
49-
os.path.abspath(path)
50-
if isinstance(path, Path) or "://" in path
51-
else f"file://{os.path.abspath(path)}"
52-
)
53-
inferred_fs, fs_path = pyarrow.fs.FileSystem.from_uri(path)
46+
# We need to detect the fs from the path
47+
if filesystem is None or has_protocol:
48+
uri: str = path if has_protocol else f"file://{os.path.abspath(path)}"
49+
inferred_fs, fs_path = fsspec.core.url_to_fs(uri)
50+
inferred_fs: fsspec.AbstractFileSystem
5451
filesystem = filesystem or inferred_fs
55-
assert inferred_fs.type_name == filesystem.type_name, (
56-
f"Protocol {inferred_fs.type_name} in path does not match "
57-
f"filesystem {filesystem.type_name}"
52+
assert inferred_fs.protocol == filesystem.protocol, (
53+
f"Protocol {inferred_fs.protocol} in path does not match "
54+
f"filesystem {filesystem.protocol}"
5855
)
59-
path = fs_path
56+
path = fs_path # path without protocol
6057

6158
return (
6259
ArrowFSWrapper(filesystem)
6360
if isinstance(filesystem, pyarrow.fs.FileSystem)
6461
else filesystem
65-
), path
62+
), str(path)

tests/data/test_parquet.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import os
12
from pathlib import Path
23

34
import pyarrow.dataset
@@ -242,7 +243,7 @@ def test_read_to_parquet(blank_nlp, tmpdir):
242243
fs = pyarrow.fs.LocalFileSystem()
243244
doc = list(
244245
edsnlp.data.read_parquet(
245-
input_dir,
246+
input_dir.relative_to(os.getcwd()),
246247
converter="omop",
247248
span_attributes=["etat", "assertion"],
248249
doc_attributes=["context_var"],

tests/training/test_train.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -87,8 +87,8 @@ def test_ner_qualif_train(run_in_test_dir, tmp_path):
8787
scorer = GenericScorer(**kwargs["scorer"])
8888
last_scores = scorer(nlp, Reader(**kwargs["val_data"])(nlp))
8989

90-
assert last_scores["exact_ner"]["micro"]["f"] > 0.5
91-
assert last_scores["qualifier"]["micro"]["f"] > 0.5
90+
assert last_scores["exact_ner"]["micro"]["f"] > 0.4
91+
assert last_scores["qualifier"]["micro"]["f"] > 0.4
9292

9393

9494
def test_qualif_train(run_in_test_dir, tmp_path):
@@ -100,7 +100,7 @@ def test_qualif_train(run_in_test_dir, tmp_path):
100100
scorer = GenericScorer(**kwargs["scorer"])
101101
last_scores = scorer(nlp, Reader(**kwargs["val_data"])(nlp))
102102

103-
assert last_scores["qualifier"]["micro"]["f"] > 0.5
103+
assert last_scores["qualifier"]["micro"]["f"] >= 0.4
104104

105105

106106
def test_optimizer():

0 commit comments

Comments (0)