Skip to content

Commit 32472ae

Browse files
committed
feat: new markup_to_doc converter
1 parent 114482a commit 32472ae

File tree

5 files changed

+255
-10
lines changed

5 files changed

+255
-10
lines changed

changelog.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
- Handling intra-word linebreak as pollution : adds a pollution pattern that detects intra-word linebreak, which can then be removed in the `get_text` method
88
- Qualifiers can process `Span` or `Doc` : this feature especially makes it easier to nest qualifiers components in other components
99
- New `label_weights` parameter in `eds.span_classifier`, which allows the user to set per label-value loss weights during training
10+
- New `edsnlp.data.converters.MarkupToDocConverter` to convert Markdown or XML-like markup to documents, which is particularly useful to create annotated documents from scratch (e.g., for testing purposes).
1011

1112
### Fixed
1213

docs/data/converters.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,3 +218,13 @@ one per entity, that can be used to write to a dataframe. The schema of each pro
218218
options:
219219
heading_level: 4
220220
show_source: false
221+
222+
## Markup (`converter="markup"`) {: #edsnlp.data.converters.MarkupToDocConverter }
223+
224+
This converter is used to convert markup data, such as Markdown or XML-like markup, into documents.
225+
This can be particularly useful when you want to create annotated documents from scratch (e.g., for testing purposes).
226+
227+
::: edsnlp.data.converters.MarkupToDocConverter
228+
options:
229+
heading_level: 4
230+
show_source: false

docs/data/index.md

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,10 @@ At the moment, we support the following data sources:
4646

4747
and the following schemas:
4848

49-
| Schema | Snippet |
50-
|:---------------------------------------------------------------------------|------------------------|
51-
| [Custom](./converters/#custom) | `converter=custom_fn` |
52-
| [OMOP](./converters/#omop) | `converter="omop"` |
53-
| [Standoff](./converters/#standoff) | `converter="standoff"` |
54-
| [Ents](./converters/#edsnlp.data.converters.EntsDoc2DictConverter) | `converter="ents"` |
49+
| Schema | Snippet |
50+
|:--------------------------------------------------------------------|------------------------|
51+
| [Custom](./converters/#custom) | `converter=custom_fn` |
52+
| [OMOP](./converters/#omop) | `converter="omop"` |
53+
| [Standoff](./converters/#standoff) | `converter="standoff"` |
54+
| [Ents](./converters/#edsnlp.data.converters.EntsDoc2DictConverter) | `converter="ents"` |
55+
| [Markup](./converters/#edsnlp.data.converters.MarkupToDocConverter) | `converter="markup"` |

edsnlp/data/converters.py

Lines changed: 236 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
from confit.registry import ValidatedFunction
2525
from spacy.tokenizer import Tokenizer
2626
from spacy.tokens import Doc, Span
27+
from typing_extensions import Literal
2728

2829
import edsnlp
2930
from edsnlp import registry
@@ -707,6 +708,225 @@ def __call__(self, doc):
707708
]
708709

709710

711+
# ex: `[The [cat](ANIMAL) is [black](COLOR hex="#000000")]`


@registry.factory.register("eds.markup_to_doc", spacy_compatible=False)
class MarkupToDocConverter:
    """
    Convert Markdown-like or XML-like markup into annotated `Doc` objects.

    Examples
    --------
    ```python
    import edsnlp

    # Any kind of reader (`edsnlp.data.read/from_...`) can be used here
    # If input items are dicts, the converter expects a "text" key/column.
    docs = list(
        edsnlp.data.from_iterable(
            [
                "This [is](VERB negation=True) not a [test](NOUN).",
                "This is another [test](NOUN).",
            ],
            converter="markup",
            span_setter="entities",
        ),
    )
    print(docs[0].spans["entities"])
    # Out: [is, test]
    ```

    You can also use it directly on a string:

    ```python
    from edsnlp.data.converters import MarkupToDocConverter

    converter = MarkupToDocConverter(
        span_setter={"verb": "VERB", "noun": "NOUN"},
        preset="xml",
    )
    doc = converter("This <VERB negation=True>is</VERB> not a <NOUN>test</NOUN>.")
    print(doc.spans["verb"])
    # Out: [is]
    print(doc.spans["verb"][0]._.negation)
    # Out: True
    ```

    Parameters
    ----------
    preset: Literal["md", "xml"]
        The preset to use for the markup format. Defaults to "md" (Markdown-like
        syntax). Use "xml" for XML-like syntax.
    opener: Optional[str]
        The regex pattern to match the opening tag of the markup. Defaults to the
        preset's opener.
    closer: Optional[str]
        The regex pattern to match the closing tag of the markup. Defaults to the
        preset's closer.
    tokenizer: Optional[Tokenizer]
        The tokenizer instance used to tokenize the documents. Likely not needed since
        by default it uses the current context tokenizer :

        - the tokenizer of the next pipeline run by `.map_pipeline` in a
          [Stream][edsnlp.core.stream.Stream].
        - or the `eds` tokenizer by default.
    span_setter: SpanSetterArg
        The span setter to use when setting the spans in the documents. Defaults to
        setting the spans in the `ents` attribute and creates a new span group for
        each JSON entity label.
    span_attributes: Optional[AttributesMappingArg]
        Mapping from markup attributes to Span extensions (can be a list too).
        By default, all attributes are imported as Span extensions with the same name.
    keep_raw_attribute_values: bool
        Whether to keep the raw attribute values (as strings) or to convert them to
        Python objects (e.g. booleans).
    default_attributes: AttributesMappingArg
        How to set attributes on spans for which no attribute value was found in the
        input format. This is especially useful for negation, or frequent attributes
        values (e.g. "negated" is often False, "temporal" is often "present"), that
        annotators may not want to annotate every time.
    bool_attributes: AsList[str]
        List of boolean attributes to set to False by default. This is useful for
        attributes that are often not annotated, but you want to have a default value
        for them.
    """

    # Named groups: "opener"/"closer" delimit the tag itself; "*_label" and
    # "*_attrs" capture the entity label and its key=value attributes. In the
    # "md" preset the label/attrs live on the closer (`](LABEL k=v)`), in the
    # "xml" preset they live on the opener (`<LABEL k=v>`).
    PRESETS = {
        "md": {
            "opener": r"(?P<opener>\[)",
            "closer": r"(?P<closer>\]\(\s*(?P<closer_label>[a-zA-Z0-9]+)\s*(?P<closer_attrs>.*?)\))",  # noqa: E501
        },
        "xml": {
            "opener": r"(?P<opener><(?P<opener_label>[a-zA-Z0-9]+)(?P<opener_attrs>.*?)>)",  # noqa: E501
            "closer": r"(?P<closer></(?P<closer_label>[a-zA-Z0-9]+)>)",
        },
    }

    def __init__(
        self,
        *,
        tokenizer: Optional[Tokenizer] = None,
        span_setter: SpanSetterArg = {"ents": True, "*": True},
        span_attributes: Optional[AttributesMappingArg] = None,
        keep_raw_attribute_values: bool = False,
        default_attributes: AttributesMappingArg = {},
        bool_attributes: AsList[str] = [],
        preset: Literal["md", "xml"] = "md",
        opener: Optional[str] = None,
        closer: Optional[str] = None,
    ):
        self.tokenizer = tokenizer
        self.span_setter = span_setter
        self.span_attributes = span_attributes
        self.keep_raw_attribute_values = keep_raw_attribute_values
        # Copy before mutating: `default_attributes` is a shared default.
        self.default_attributes = dict(default_attributes)
        for attr in bool_attributes:
            self.default_attributes[attr] = False
        # Explicit `opener`/`closer` patterns override the preset's.
        self.opener = opener or self.PRESETS[preset]["opener"]
        self.closer = closer or self.PRESETS[preset]["closer"]

    def _as_python(self, value: str):
        """Best-effort conversion of a raw attribute string to a Python object.

        Tries `ast.literal_eval` first (numbers, quoted strings, ...), then
        bare `true`/`false` (any case), and falls back to the raw string.
        """
        import ast

        if self.keep_raw_attribute_values:
            return value
        try:
            return ast.literal_eval(value)
        except Exception:
            if value.lower() == "true":
                return True
            elif value.lower() == "false":
                return False
        return value

    def _parse(self, inline_text: str):
        """Strip the markup from `inline_text`.

        Returns `(text, entities)` where `text` is the plain text and
        `entities` is a list of `(start, end, label, attrs)` tuples with
        character offsets into `text`, sorted by position.
        """
        import re

        last_inline_offset = 0
        # Stack of pending opening tags: (offset in plain text, label, attrs).
        starts = []
        text = ""
        entities = []
        for sep in re.finditer(self.opener + "|" + self.closer, inline_text):
            is_opener = bool(sep["opener"])
            groups = sep.groupdict()
            inline_start = sep.start("opener") if is_opener else sep.start("closer")
            inline_end = sep.end("opener") if is_opener else sep.end("closer")
            # NB: groupdict() maps *unmatched* groups to None (the key is always
            # present), so `groups.get(a, groups.get(b))` would never fall back
            # to the second group. Chain with `or` so that, e.g., the "xml"
            # preset's opener label and attributes are actually picked up.
            label = groups.get("closer_label") or groups.get("opener_label")
            raw_attrs = groups.get("closer_attrs") or groups.get("opener_attrs") or ""
            attrs = {
                k: self._as_python(v)
                for k, v in (kv.split("=") for kv in raw_attrs.split())
            }
            text += inline_text[last_inline_offset:inline_start]
            if is_opener:
                starts.append((len(text), label, attrs))
            else:
                try:
                    # Match the innermost pending opener with the same label;
                    # an unlabeled opener (md preset) or closer matches anything.
                    idx = next(
                        i
                        for i in range(len(starts) - 1, -1, -1)
                        if starts[i][1] == label or not label or not starts[i][1]
                    )
                except StopIteration:
                    warnings.warn(f"Unmatched closing tag for '{sep.group()}'")
                    continue
                start, start_label, start_attrs = starts.pop(idx)
                entities.append(
                    (start, len(text), start_label or label, {**attrs, **start_attrs})
                )
            last_inline_offset = inline_end
        if last_inline_offset < len(inline_text):
            text += inline_text[last_inline_offset:]
        if starts:
            # Report plain-text offsets (labels may be None, e.g. md openers).
            warnings.warn(
                "Unmatched opening tags at indices "
                + ", ".join(str(start) for start, _, _ in starts)
            )
        # Sort on offsets only: labels can be None and attrs are dicts, neither
        # of which support ordering comparisons on ties.
        entities = sorted(entities, key=lambda ent: (ent[0], ent[1]))
        return text, entities

    def __call__(self, obj, tokenizer=None):
        """Convert a markup string (or a dict with a "text" key) to a Doc."""
        tok = tokenizer or self.tokenizer or get_current_tokenizer()
        if isinstance(obj, str):
            obj = {"text": obj}
        annotated = obj["text"]
        plain, raw_ents = self._parse(annotated)

        doc = tok(plain)
        doc._.note_id = obj.get("doc_id", obj.get(FILENAME))

        # Pre-register extensions for mapped and default attributes.
        for dst in (
            *(() if self.span_attributes is None else self.span_attributes.values()),
            *self.default_attributes,
        ):
            if not Span.has_extension(dst):
                Span.set_extension(dst, default=None)

        spans = []
        for start, end, label, attrs in raw_ents:
            # "expand" snaps char offsets to token boundaries instead of
            # dropping entities that fall inside a token.
            span = doc.char_span(start, end, label=label, alignment_mode="expand")
            if span is None:
                continue
            for k, v in attrs.items():
                new_k = (
                    self.span_attributes.get(k)
                    if self.span_attributes is not None
                    else k
                )
                # Without an explicit mapping, import every attribute as an
                # extension of the same name; with a mapping, unmapped
                # attributes (new_k is None) are skipped.
                if self.span_attributes is None and not Span.has_extension(new_k):
                    Span.set_extension(new_k, default=None)
                if new_k:
                    span._.set(new_k, v)
            spans.append(span)

        set_spans(doc, spans, span_setter=self.span_setter)
        # Fill in defaults only where the markup did not provide a value.
        for attr, value in self.default_attributes.items():
            for span in spans:
                if span._.get(attr) is None:
                    span._.set(attr, value)

        return doc
928+
929+
710930
def get_dict2doc_converter(
711931
converter: Union[str, Callable], kwargs
712932
) -> Tuple[Callable, Dict]:
@@ -716,7 +936,11 @@ def get_dict2doc_converter(
716936
filtered = [
717937
name
718938
for name in available
719-
if converter == name or (converter in name and "dict2doc" in name)
939+
if converter == name
940+
or (
941+
converter in name
942+
and (name.endswith("2doc") or name.endswith("to_doc"))
943+
)
720944
]
721945
converter = edsnlp.registry.factory.get(filtered[0])
722946
nlp = kwargs.pop("nlp", None)
@@ -726,7 +950,9 @@ def get_dict2doc_converter(
726950
kwargs = {}
727951
return converter, kwargs
728952
except (KeyError, IndexError):
729-
available = [v for v in available if "dict2doc" in v]
953+
available = [
954+
v for v in available if (v.endswith("2doc") or v.endswith("to_doc"))
955+
]
730956
raise ValueError(
731957
f"Cannot find converter for format {converter}. "
732958
f"Available converters are {', '.join(available)}"
@@ -745,14 +971,20 @@ def get_doc2dict_converter(
745971
filtered = [
746972
name
747973
for name in available
748-
if converter == name or (converter in name and "doc2dict" in name)
974+
if converter == name
975+
or (
976+
converter in name
977+
and (name.endswith("2dict") or name.endswith("to_dict"))
978+
)
749979
]
750980
converter = edsnlp.registry.factory.get(filtered[0])
751981
converter = converter(**kwargs)
752982
kwargs = {}
753983
return converter, kwargs
754984
except (KeyError, IndexError):
755-
available = [v for v in available if "doc2dict" in v]
985+
available = [
986+
v for v in available if (v.endswith("2dict") or v.endswith("to_dict"))
987+
]
756988
raise ValueError(
757989
f"Cannot find converter for format {converter}. "
758990
f"Available converters are {', '.join(available)}"

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,7 @@ where = ["."]
212212
"eds.omop_dict2doc" = "edsnlp.data.converters:OmopDict2DocConverter"
213213
"eds.omop_doc2dict" = "edsnlp.data.converters:OmopDoc2DictConverter"
214214
"eds.ents_doc2dict" = "edsnlp.data.converters:EntsDoc2DictConverter"
215+
"eds.markup_to_doc" = "edsnlp.data.converters:MarkupToDocConverter"
215216

216217
# Deprecated (links to the same factories as above)
217218
"SOFA" = "edsnlp.pipes.ner.scores.sofa.factory:create_component"

0 commit comments

Comments
 (0)