
Commit 8f15ae3

percevalw and Jungack committed
feat: improve eds.table matcher
Co-Authored-By: Jacques Ung <jungack@un-g.com>
1 parent 80d57f8 commit 8f15ae3

File tree

5 files changed: +187 -52 lines changed

changelog.md

Lines changed: 4 additions & 0 deletions
@@ -24,6 +24,10 @@
 - Added a new `eds.ner_overlap_scorer` to evaluate matches between two lists of entities, counting true when the dice overlap is above a given threshold
 - `edsnlp.load` now accepts EDS-NLP models from the huggingface hub 🤗 !
 - New `python -m edsnlp.package` command to package a model for the huggingface hub or pypi-like registries
+- Improve table detection in `eds.tables` and support new options in `table._.to_pd_table(...)`:
+    - `header=True` to use first row as header
+    - `index=True` to use first column as index
+    - `as_spans=True` to fill cells as document spans instead of strings
 
 ### Changed
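
As a quick illustration of the workflow these options enable, here is a minimal, untested sketch mirroring the new unit test further down; it assumes a blank `eds` pipeline built with `edsnlp.blank` and a small pipe-delimited table:

```python
import edsnlp

text = (
    "|Libellé     |Unité   |Valeur |Intervalle |\n"
    "|Leucocytes  |x10*9/L |4.97   |4.09-11    |\n"
    "|Hémoglobine |g/dL    |14.8   |13.4-16.7  |\n"
)

nlp = edsnlp.blank("eds")
nlp.add_pipe("eds.normalizer")
nlp.add_pipe("eds.tables")

doc = nlp(text)
table = doc.spans["tables"][0]

# First row as header, first column as index, cells kept as plain strings
df = table._.to_pd_table(header=True, index=True)
print(df.loc["Leucocytes", "Valeur"])  # expected: "4.97"
```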

edsnlp/pipes/misc/tables/__init__.py

Lines changed: 0 additions & 1 deletion
@@ -1,2 +1 @@
-from .patterns import regex, sep
 from .tables import TablesMatcher

edsnlp/pipes/misc/tables/patterns.py

Lines changed: 2 additions & 4 deletions
@@ -1,4 +1,2 @@
-sep = r"¦|\|"
-regex = dict(
-    tables=rf"(\b.*{sep}.*\n)+",
-)
+sep = ["¦", "|"]
+regex_template = [r"(?:{sep}?(?:[^{sep}\n]*{sep})+[^{sep}\n]*{sep}?\n)+"]
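
For reference, the matcher now instantiates one concrete regular expression per (template, separator) pair, escaping each separator so that `|` is matched literally. A minimal sketch of that expansion, mirroring the `__init__` in `tables.py` below:

```python
import re

sep = ["¦", "|"]
regex_template = [r"(?:{sep}?(?:[^{sep}\n]*{sep})+[^{sep}\n]*{sep}?\n)+"]

# One pattern per (template, separator) pair, deduplicated while preserving
# order, before they are registered under the "table" label.
table_patterns = list(
    dict.fromkeys(
        template.format(sep=re.escape(s))
        for s in sep
        for template in regex_template
    )
)
print(table_patterns)
# -> two patterns: one matching runs of '¦'-delimited lines, one for '|'-delimited lines
```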

edsnlp/pipes/misc/tables/tables.py

Lines changed: 139 additions & 45 deletions
@@ -1,16 +1,18 @@
-from io import StringIO
+import re
 from typing import Dict, Optional, Union
 
 import pandas as pd
 from spacy.tokens import Doc, Span
 
 from edsnlp.core import PipelineProtocol
-from edsnlp.pipes.core.matcher.matcher import GenericMatcher
+from edsnlp.matchers.phrase import EDSPhraseMatcher
+from edsnlp.matchers.regex import RegexMatcher
+from edsnlp.pipes.base import BaseComponent
 from edsnlp.pipes.misc.tables import patterns
-from edsnlp.utils.filter import get_spans
+from edsnlp.utils.typing import AsList
 
 
-class TablesMatcher(GenericMatcher):
+class TablesMatcher(BaseComponent):
     '''
     The `eds.tables` matcher detects tables in a documents.
 
@@ -70,7 +72,11 @@ class TablesMatcher(GenericMatcher):
     # VMP ¦fL ¦11.5 + ¦7.4-10.8
 
     # Convert span to Pandas table
-    df = table._.to_pd_table()
+    df = table._.to_pd_table(
+        as_spans=False, # set True to set the table cells as spans instead of strings
+        header=False, # set True to use the first row as header
+        index=False, # set True to use the first column as index
+    )
     type(df)
     # Out: pandas.core.frame.DataFrame
     ```
@@ -96,7 +102,7 @@ class TablesMatcher(GenericMatcher):
     Parameters
     ----------
     nlp : PipelineProtocol
-        spaCy nlp pipeline to use for matching.
+        Pipeline object
     name: str
         Name of the component.
     tables_pattern : Optional[Dict[str, str]]
@@ -105,6 +111,10 @@ class TablesMatcher(GenericMatcher):
     sep_pattern : Optional[str]
         The regex pattern to identify the separator pattern.
         Used when calling `to_pd_table`.
+    col_names : Optional[bool]
+        Whether the tables_pattern matches column names
+    row_names : Optional[bool]
+        Whether the table_pattern matches row names
     attr : str
         spaCy's attribute to use:
         a string with the value "TEXT" or "NORM", or a dict with
@@ -120,41 +130,106 @@ class TablesMatcher(GenericMatcher):
     def __init__(
         self,
         nlp: PipelineProtocol,
-        name: str = "tables",
+        name: Optional[str] = "tables",
         *,
-        tables_pattern: Optional[Dict[str, str]] = None,
-        sep_pattern: Optional[str] = None,
+        tables_pattern: Optional[AsList[str]] = None,
+        sep_pattern: Optional[AsList[str]] = None,
         attr: Union[Dict[str, str], str] = "TEXT",
         ignore_excluded: bool = True,
     ):
-        if tables_pattern is None and sep_pattern is None:
-            self.tables_pattern = patterns.regex
-            self.sep = patterns.sep
-        elif tables_pattern is None or sep_pattern is None:
-            raise ValueError(
-                "Both tables_pattern and sep_pattern must be provided "
-                "for custom eds.table pipeline."
-            )
-        else:
-            self.tables_pattern = tables_pattern
-            self.sep = sep_pattern
-
-        super().__init__(
-            nlp=nlp,
-            name=name,
-            terms=None,
-            regex=self.tables_pattern,
-            attr=attr,
-            ignore_excluded=ignore_excluded,
+        super().__init__(nlp, name)
+        if tables_pattern is None:
+            tables_pattern = patterns.regex_template
+
+        if sep_pattern is None:
+            sep_pattern = patterns.sep
+
+        self.regex_matcher = RegexMatcher(attr=attr, ignore_excluded=ignore_excluded)
+        self.regex_matcher.add(
+            "table",
+            list(
+                dict.fromkeys(
+                    template.format(sep=re.escape(sep))
+                    for sep in sep_pattern
+                    for template in tables_pattern
+                )
+            ),
+        )
+
+        self.term_matcher = EDSPhraseMatcher(
+            nlp.vocab, attr=attr, ignore_excluded=ignore_excluded
+        )
+        self.term_matcher.build_patterns(
+            nlp,
+            {
+                "eol_pattern": "\n",
+                "sep_pattern": sep_pattern,
+            },
         )
 
         if not Span.has_extension("to_pd_table"):
             Span.set_extension("to_pd_table", method=self.to_pd_table)
 
-        self.set_extensions()
+    @classmethod
+    def set_extensions(cls) -> None:
+        """
+        Set extensions for the tables pipeline.
+        """
+
+        if not Span.has_extension("table"):
+            Span.set_extension("table", default=None)
+
+    def get_table(self, table):
+        """
+        Convert spans of tables to dictionaries
+        Parameters
+        ----------
+        table : Span
+
+        Returns
+        -------
+        List[Span]
+        """
+
+        # We store each row in a list and store each of hese lists
+        # in processed_table for post processing
+        # considering the self.col_names and self.row_names var
+        processed_table = []
+        delimiters = [
+            delimiter
+            for delimiter in self.term_matcher(table, as_spans=True)
+            if delimiter.start >= table.start and delimiter.end <= table.end
+        ]
+
+        last = table.start
+        row = []
+        # Parse the table to match each cell thanks to delimiters
+        for delimiter in delimiters:
+            row.append(table[last - table.start : delimiter.start - table.start])
+            last = delimiter.end
+
+            # End the actual row if there is an end of line
+            if delimiter.label_ == "eol_pattern":
+                processed_table.append(row)
+                row = []
+
+        # Remove first or last column in case the separator pattern is
+        # also used in the raw table to draw the outlines
+        max_len = max(len(row) for row in processed_table)
+        if all(row[0].start == row[0].end for row in processed_table):
+            processed_table = [row[1:] for row in processed_table]
+        if all(
+            row[-1].start == row[-1].end
+            for row in processed_table
+            if len(row) == max_len
+        ):
+            processed_table = [row[:-1] for row in processed_table]
+
+        return processed_table
 
     def __call__(self, doc: Doc) -> Doc:
-        """Find spans that contain tables
+        """
+        Find spans that contain tables
 
         Parameters
         ----------
@@ -164,21 +239,40 @@ def __call__(self, doc: Doc) -> Doc:
         -------
         Doc
         """
-        matches = self.process(doc)
-        tables = get_spans(matches, "tables")
-        # parsed = self.parse(tables=tables)
+        matches = list(self.regex_matcher(doc, as_spans=True))
+        doc.spans["tables"] = matches
+        return doc
 
-        doc.spans["tables"] = tables
+    def to_pd_table(
+        self,
+        span,
+        as_spans=False,
+        header: bool = False,
+        index: bool = False,
+    ) -> pd.DataFrame:
+        """
+        Return pandas DataFrame
 
-        return doc
+        Parameters
+        ----------
+        span : Span
+            The span containing the table
+        as_spans : bool
+            Whether to return the table cells as spans
+        header : bool
+            Whether the table has a header
+        index : bool
+            Whether the table has an index
+        """
+        table = self.get_table(span)
+        if not as_spans:
+            table = [[str(cell) for cell in data] for data in table]
 
-    def to_pd_table(self, span) -> pd.DataFrame:
-        table_str_io = StringIO(span.text)
-        parsed = pd.read_csv(
-            table_str_io,
-            sep=self.sep,
-            engine="python",
-            header=None,
-            on_bad_lines="skip",
-        )
-        return parsed
+        table = pd.DataFrame.from_records(table)
+        if header:
+            table.columns = [str(k) for k in table.iloc[0]]
+            table = table[1:]
+        if index:
+            table.index = [str(k) for k in table.iloc[:, 0]]
+            table = table.iloc[:, 1:]
+        return table
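
To make the new `header`/`index` handling concrete, here is a small standalone sketch of what the tail of `to_pd_table(header=True, index=True)` does, using plain strings instead of the spaCy spans produced by `get_table`, purely for illustration:

```python
import pandas as pd

# Rows as they might come out of get_table(), already converted to strings
rows = [
    ["Libellé", "Unité", "Valeur", "Intervalle"],
    ["Leucocytes", "x10*9/L", "4.97", "4.09-11"],
    ["Hémoglobine", "g/dL", "14.8", "13.4-16.7"],
]

table = pd.DataFrame.from_records(rows)

# header=True: promote the first row to column names
table.columns = [str(k) for k in table.iloc[0]]
table = table[1:]

# index=True: promote the first column to the row index
table.index = [str(k) for k in table.iloc[:, 0]]
table = table.iloc[:, 1:]

print(table.loc["Leucocytes", "Valeur"])  # 4.97
```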

tests/pipelines/misc/test_tables.py

Lines changed: 42 additions & 2 deletions
@@ -1,3 +1,6 @@
+import pytest
+from spacy.tokens.span import Span
+
 TEXT = """
 Le patientqsfqfdf bla bla bla
 Leucocytes ¦x10*9/L ¦4.97 ¦4.09-11
@@ -14,18 +17,55 @@
 
 2/2Pat : <NOM> <Prenom> |<date> | <ipp> |Intitulé RCP
 
+|Libellé | Unité | Valeur | Intervalle |
+|Leucocytes |x10*9/L |4.97 | 4.09-11 |
+|Hématies |x10*12/L|4.68 | 4.53-5.79 |
+|Hémoglobine |g/dL |14.8 | 13.4-16.7 |
+|Hématocrite ||44.2 | 39.2-48.6 |
+|VGM |fL | 94.4 + | 79.6-94 |
+|TCMH |pg |31.6 |
+|CCMH |g/dL
+|Plaquettes |x10*9/L |191 | 172-398 |
+|VMP |fL |11.5 + | 7.4-10.8 |
 
 """
 
 
 def test_tables(blank_nlp):
+    if blank_nlp.lang != "eds":
+        pytest.skip("Test only for eds language")
     blank_nlp.add_pipe("eds.normalizer")
     blank_nlp.add_pipe("eds.tables")
 
     doc = blank_nlp(TEXT)
 
-    assert len(doc.spans["tables"]) == 1
+    assert len(doc.spans["tables"]) == 2
 
     span = doc.spans["tables"][0]
     df = span._.to_pd_table()
-    assert df.iloc[5, 0] == "TCMH "
+    assert len(df.columns) == 4
+    assert len(df) == 9
+    assert str(df.iloc[5, 0]) == "TCMH"
+
+    span = doc.spans["tables"][1]
+    df = span._.to_pd_table(header=True, index=True, as_spans=True)
+    print(df)
+    assert df.columns.tolist() == [
+        "Unité",
+        "Valeur",
+        "Intervalle",
+    ]
+    assert df.index.tolist() == [
+        "Leucocytes",
+        "Hématies",
+        "Hémoglobine",
+        "Hématocrite",
+        "VGM",
+        "TCMH",
+        "CCMH",
+        "Plaquettes",
+        "VMP",
+    ]
+    cell = df.loc["TCMH", "Valeur"]
+    assert isinstance(cell, Span)
+    assert cell.text == "31.6"
