1- from io import StringIO
1+ import re
22from typing import Dict , Optional , Union
33
44import pandas as pd
55from spacy .tokens import Doc , Span
66
77from edsnlp .core import PipelineProtocol
8- from edsnlp .pipes .core .matcher .matcher import GenericMatcher
8+ from edsnlp .matchers .phrase import EDSPhraseMatcher
9+ from edsnlp .matchers .regex import RegexMatcher
10+ from edsnlp .pipes .base import BaseComponent
911from edsnlp .pipes .misc .tables import patterns
10- from edsnlp .utils .filter import get_spans
12+ from edsnlp .utils .typing import AsList
1113
1214
13- class TablesMatcher (GenericMatcher ):
15+ class TablesMatcher (BaseComponent ):
1416 '''
1517 The `eds.tables` matcher detects tables in a document.
1618
@@ -70,7 +72,11 @@ class TablesMatcher(GenericMatcher):
7072 # VMP ¦fL ¦11.5 + ¦7.4-10.8
7173
7274 # Convert span to Pandas table
73- df = table._.to_pd_table()
75+ df = table._.to_pd_table(
76+ as_spans=False, # set True to set the table cells as spans instead of strings
77+ header=False, # set True to use the first row as header
78+ index=False, # set True to use the first column as index
79+ )
7480 type(df)
7581 # Out: pandas.core.frame.DataFrame
7682 ```
@@ -96,7 +102,7 @@ class TablesMatcher(GenericMatcher):
96102 Parameters
97103 ----------
98104 nlp : PipelineProtocol
99- spaCy nlp pipeline to use for matching.
105+ Pipeline object
100106 name: str
101107 Name of the component.
102108 tables_pattern : Optional[Dict[str, str]]
@@ -105,6 +111,10 @@ class TablesMatcher(GenericMatcher):
105111 sep_pattern : Optional[str]
106112 The regex pattern to identify the separator pattern.
107113 Used when calling `to_pd_table`.
114+ col_names : Optional[bool]
115+ Whether the tables_pattern matches column names
116+ row_names : Optional[bool]
117+ Whether the table_pattern matches row names
108118 attr : str
109119 spaCy's attribute to use:
110120 a string with the value "TEXT" or "NORM", or a dict with
@@ -120,41 +130,106 @@ class TablesMatcher(GenericMatcher):
    def __init__(
        self,
        nlp: PipelineProtocol,
        name: Optional[str] = "tables",
        *,
        tables_pattern: Optional[AsList[str]] = None,
        sep_pattern: Optional[AsList[str]] = None,
        attr: Union[Dict[str, str], str] = "TEXT",
        ignore_excluded: bool = True,
    ):
        """
        Initialise the tables matcher.

        Parameters
        ----------
        nlp : PipelineProtocol
            Pipeline object
        name : Optional[str]
            Name of the component.
        tables_pattern : Optional[AsList[str]]
            Regex templates used to detect tables; each template is
            formatted with every separator via a ``{sep}`` placeholder.
        sep_pattern : Optional[AsList[str]]
            Column-separator strings substituted into the templates and
            later matched as cell delimiters.
        attr : Union[Dict[str, str], str]
            spaCy's attribute to use ("TEXT" or "NORM").
        ignore_excluded : bool
            Whether to skip excluded tokens.
        """
        super().__init__(nlp, name)

        # Fall back to the package-level default patterns when the caller
        # does not provide custom ones.
        if tables_pattern is None:
            tables_pattern = patterns.regex_template

        if sep_pattern is None:
            sep_pattern = patterns.sep

        # Build one regex per (separator, template) pair; dict.fromkeys
        # de-duplicates the expanded patterns while preserving order.
        self.regex_matcher = RegexMatcher(attr=attr, ignore_excluded=ignore_excluded)
        self.regex_matcher.add(
            "table",
            list(
                dict.fromkeys(
                    template.format(sep=re.escape(sep))
                    for sep in sep_pattern
                    for template in tables_pattern
                )
            ),
        )

        # Phrase matcher used by `get_table` to locate cell delimiters:
        # end-of-line markers and the column separators.
        self.term_matcher = EDSPhraseMatcher(
            nlp.vocab, attr=attr, ignore_excluded=ignore_excluded
        )
        self.term_matcher.build_patterns(
            nlp,
            {
                "eol_pattern": "\n",
                "sep_pattern": sep_pattern,
            },
        )

        # Expose `span._.to_pd_table(...)` backed by this component.
        if not Span.has_extension("to_pd_table"):
            Span.set_extension("to_pd_table", method=self.to_pd_table)
173+ @classmethod
174+ def set_extensions (cls ) -> None :
175+ """
176+ Set extensions for the tables pipeline.
177+ """
178+
179+ if not Span .has_extension ("table" ):
180+ Span .set_extension ("table" , default = None )
181+
182+ def get_table (self , table ):
183+ """
184+ Convert spans of tables to dictionaries
185+ Parameters
186+ ----------
187+ table : Span
188+
189+ Returns
190+ -------
191+ List[Span]
192+ """
193+
194+ # We store each row in a list and store each of hese lists
195+ # in processed_table for post processing
196+ # considering the self.col_names and self.row_names var
197+ processed_table = []
198+ delimiters = [
199+ delimiter
200+ for delimiter in self .term_matcher (table , as_spans = True )
201+ if delimiter .start >= table .start and delimiter .end <= table .end
202+ ]
203+
204+ last = table .start
205+ row = []
206+ # Parse the table to match each cell thanks to delimiters
207+ for delimiter in delimiters :
208+ row .append (table [last - table .start : delimiter .start - table .start ])
209+ last = delimiter .end
210+
211+ # End the actual row if there is an end of line
212+ if delimiter .label_ == "eol_pattern" :
213+ processed_table .append (row )
214+ row = []
215+
216+ # Remove first or last column in case the separator pattern is
217+ # also used in the raw table to draw the outlines
218+ max_len = max (len (row ) for row in processed_table )
219+ if all (row [0 ].start == row [0 ].end for row in processed_table ):
220+ processed_table = [row [1 :] for row in processed_table ]
221+ if all (
222+ row [- 1 ].start == row [- 1 ].end
223+ for row in processed_table
224+ if len (row ) == max_len
225+ ):
226+ processed_table = [row [:- 1 ] for row in processed_table ]
227+
228+ return processed_table
155229
156230 def __call__ (self , doc : Doc ) -> Doc :
157- """Find spans that contain tables
231+ """
232+ Find spans that contain tables
158233
159234 Parameters
160235 ----------
@@ -164,21 +239,40 @@ def __call__(self, doc: Doc) -> Doc:
164239 -------
165240 Doc
166241 """
167- matches = self .process (doc )
168- tables = get_spans ( matches , "tables" )
169- # parsed = self.parse(tables=tables)
242+ matches = list ( self .regex_matcher (doc , as_spans = True ) )
243+ doc . spans [ " tables" ] = matches
244+ return doc
170245
171- doc .spans ["tables" ] = tables
246+ def to_pd_table (
247+ self ,
248+ span ,
249+ as_spans = False ,
250+ header : bool = False ,
251+ index : bool = False ,
252+ ) -> pd .DataFrame :
253+ """
254+ Return pandas DataFrame
172255
173- return doc
256+ Parameters
257+ ----------
258+ span : Span
259+ The span containing the table
260+ as_spans : bool
261+ Whether to return the table cells as spans
262+ header : bool
263+ Whether the table has a header
264+ index : bool
265+ Whether the table has an index
266+ """
267+ table = self .get_table (span )
268+ if not as_spans :
269+ table = [[str (cell ) for cell in data ] for data in table ]
174270
175- def to_pd_table (self , span ) -> pd .DataFrame :
176- table_str_io = StringIO (span .text )
177- parsed = pd .read_csv (
178- table_str_io ,
179- sep = self .sep ,
180- engine = "python" ,
181- header = None ,
182- on_bad_lines = "skip" ,
183- )
184- return parsed
271+ table = pd .DataFrame .from_records (table )
272+ if header :
273+ table .columns = [str (k ) for k in table .iloc [0 ]]
274+ table = table [1 :]
275+ if index :
276+ table .index = [str (k ) for k in table .iloc [:, 0 ]]
277+ table = table .iloc [:, 1 :]
278+ return table
0 commit comments