Skip to content

Commit 702ec25

Browse files
percevalw authored and Thomzoy committed
test: add test for line-break induced missed matches
1 parent 78fe4aa commit 702ec25

File tree

2 files changed

+21
-9
lines changed

2 files changed

+21
-9
lines changed

edsnlp/language.py

Lines changed: 0 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -41,15 +41,6 @@ class EDSLanguage(French):
4141
Defaults = EDSDefaults
4242

4343

44-
TOKENIZER_EXCEPTIONS = [
45-
r"Dr\.",
46-
r"Pr\.",
47-
r"M\.",
48-
r"Mme\.",
49-
r"Mlle\.",
50-
r"(?i:(?:ep\.))",
51-
r"(?<![\W\d_])-\n",
52-
]
5344
TOKENIZER_EXCEPTIONS = [r"Dr\.", r"Pr\.", r"M\.", r"Mme\.", r"Mlle\.", r"(?i:(?:ep\.))"]
5445

5546

tests/matchers/test_regex.py

Lines changed: 21 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -361,3 +361,24 @@ def test_empty_get_text(blank_nlp):
361361
doc = blank_nlp("==================================")
362362
clean = get_text(doc, attr="NORM", ignore_excluded=True, ignore_space_tokens=True)
363363
assert clean == ""
364+
365+
366+
def test_ignore_space_tokens_and_newline(blank_nlp):
367+
# Fixed by
368+
# `text_parts[i - 1] += " "` snippet in
369+
# edsnlp.utils.doc_to_text.aggregate_tokens
370+
blank_nlp.add_pipe("eds.normalizer")
371+
blank_nlp.add_pipe(
372+
"eds.matcher",
373+
config=dict(
374+
regex=dict(test=r"pneumopathie a coronavirus"),
375+
attr="NORM",
376+
ignore_excluded=True,
377+
ignore_space_tokens=True,
378+
),
379+
)
380+
text = "Il a une\npneumopathie à coronavirus"
381+
doc = blank_nlp(text)
382+
clean = get_text(doc, attr="NORM", ignore_space_tokens=True, ignore_excluded=True)
383+
assert clean == "il a une pneumopathie a coronavirus"
384+
assert [e.text for e in doc.ents] == ["pneumopathie à coronavirus"]

0 commit comments

Comments
 (0)