 from loguru import logger
 
 from edsnlp import registry
-from edsnlp.core import PipelineProtocol
 from edsnlp.core.lazy_collection import LazyCollection
 from edsnlp.data.base import BaseReader, BaseWriter
 from edsnlp.data.converters import (
     FILENAME,
     AttributesMappingArg,
-    SequenceStr,
     get_dict2doc_converter,
     get_doc2dict_converter,
 )
@@ -50,7 +48,8 @@ def __init__(self, ann_file, line):
 
 
 def parse_standoff_file(
-    path: str,
+    txt_path: str,
+    ann_paths: List[str],
     merge_spaced_fragments: bool = True,
     fs: FileSystem = LOCAL_FS,
 ) -> Dict:
@@ -74,28 +73,19 @@ def parse_standoff_file(
     -------
     Iterator[Dict]
     """
-    ann_filenames = []
-    for filename in walk_match(
-        fs,
-        os.path.dirname(path),
-        os.path.basename(path).replace(".txt", ".a*"),
-        recursive=False,
-    ):
-        ann_filenames.append(filename)
-
     entities = {}
     relations = []
     events = {}
 
-    with fs.open(path, "r") as f:
+    with fs.open(txt_path, "r") as f:
         text = f.read()
 
-    if not len(ann_filenames):
+    if not len(ann_paths):
         return {
             "text": text,
         }
 
-    for ann_file in ann_filenames:
+    for ann_file in ann_paths:
         with fs.open(ann_file, "r") as f:
             for line_idx, line in enumerate(f):
                 try:
@@ -303,34 +293,33 @@ def __init__(
     ):
         super().__init__()
         self.fs, self.path = normalize_fs_path(filesystem, path)
-        self.files: List[str] = [
+        files = {
             file
-            for file in walk_match(self.fs, self.path, "*.txt")
+            for file in walk_match(self.fs, self.path, ".*[.](txt|a.*)")
             if (keep_ipynb_checkpoints or ".ipynb_checkpoints" not in str(file))
-            and (
-                keep_txt_only_docs
-                or walk_match(
-                    self.fs,
-                    os.path.dirname(file),
-                    os.path.basename(file).replace(".txt", ".a*"),
-                    recursive=False,
-                )
-            )
+        }
+        ann_files = {}
+        for f in files:
+            name, ext = os.path.splitext(f)
+            if ext.startswith(".a"):
+                ann_files.setdefault(name, []).append(f)
+        self.files = [
+            (file, ann_files.get(file.replace(".txt", ""), []))
+            for file in files
+            if file.endswith(".txt")
+            and (keep_txt_only_docs or file.replace(".txt", "") in ann_files)
         ]
         assert len(self.files), f"No .txt files found in the BRAT directory {self.path}"
-        for file in self.files:
-            if not self.fs.exists(file):
-                raise FileNotFoundError(f"File {file} does not exist")
         logger.info(f"The BRAT directory contains {len(self.files)} .txt files.")
 
     def read_main(self) -> Iterable[Tuple[str, int]]:
         return ((f, 1) for f in self.files)
 
     def read_worker(self, fragment: List[str]):
         tasks = []
-        for file in fragment:
-            anns = parse_standoff_file(str(file), fs=self.fs)
-            anns[FILENAME] = os.path.relpath(file, self.path).rsplit(".", 1)[0]
+        for txt_path, ann_paths in fragment:
+            anns = parse_standoff_file(txt_path, ann_paths, fs=self.fs)
+            anns[FILENAME] = os.path.relpath(txt_path, self.path).rsplit(".", 1)[0]
             anns["doc_id"] = anns[FILENAME]
             tasks.append(anns)
         return tasks
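
Note: the pairing logic introduced in the hunk above can be illustrated in isolation. This is a standalone sketch with hypothetical file names and no filesystem access; it only mirrors the grouping-by-stem step, it is not the reader itself.

import os

files = ["doc1.txt", "doc1.ann", "doc2.txt", "sub/doc3.txt", "sub/doc3.a1"]

# Group every ".a*" file (".ann", ".a1", ...) under the stem it shares with its text file
ann_files = {}
for f in files:
    name, ext = os.path.splitext(f)
    if ext.startswith(".a"):
        ann_files.setdefault(name, []).append(f)

# Each .txt file is paired with its (possibly empty) list of annotation files
pairs = [(f, ann_files.get(f.replace(".txt", ""), [])) for f in files if f.endswith(".txt")]
# pairs -> [("doc1.txt", ["doc1.ann"]), ("doc2.txt", []), ("sub/doc3.txt", ["sub/doc3.a1"])]
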
@@ -350,9 +339,8 @@ def __init__(
 
         if self.fs.exists(self.path):
             unsafe_exts = Counter(
-                os.path.splitext(f)[1] for f in walk_match(self.fs, self.path, "*.txt")
-            ) + Counter(
-                os.path.splitext(f)[1] for f in walk_match(self.fs, self.path, "*.a*")
+                os.path.splitext(f)[1]
+                for f in walk_match(self.fs, self.path, ".*[.](txt|a.*)")
             )
             if unsafe_exts and not overwrite:
                 raise FileExistsError(
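
For context, a hedged usage sketch of the reader touched by this diff: the corpus path below is hypothetical, and the call assumes edsnlp's documented read_standoff entry point with its "standoff" converter.

import edsnlp

# Lazily read a directory of BRAT .txt/.ann files into spaCy Doc objects
docs = edsnlp.data.read_standoff("path/to/brat/corpus", converter="standoff")
for doc in docs:
    print(doc.text[:40], len(doc.ents))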