2424from confit .registry import ValidatedFunction
2525from spacy .tokenizer import Tokenizer
2626from spacy .tokens import Doc , Span
27+ from typing_extensions import Literal
2728
2829import edsnlp
2930from edsnlp import registry
@@ -707,6 +708,225 @@ def __call__(self, doc):
707708 ]
708709
709710
# ex: `[The [cat](ANIMAL) is [black](COLOR hex="#000000")].


@registry.factory.register("eds.markup_to_doc", spacy_compatible=False)
class MarkupToDocConverter:
    """
    Convert markup-annotated text (Markdown-like or XML-like) into spaCy Docs,
    extracting the tagged spans and their attributes.

    Examples
    --------
    ```python
    import edsnlp

    # Any kind of reader (`edsnlp.data.read/from_...`) can be used here
    # If input items are dicts, the converter expects a "text" key/column.
    docs = list(
        edsnlp.data.from_iterable(
            [
                "This [is](VERB negation=True) not a [test](NOUN).",
                "This is another [test](NOUN).",
            ],
            converter="markup",
            span_setter="entities",
        ),
    )
    print(docs[0].spans["entities"])
    # Out: [is, test]
    ```

    You can also use it directly on a string:

    ```python
    from edsnlp.data.converters import MarkupToDocConverter

    converter = MarkupToDocConverter(
        span_setter={"verb": "VERB", "noun": "NOUN"},
        preset="xml",
    )
    doc = converter("This <VERB negation=True>is</VERB> not a <NOUN>test</NOUN>.")
    print(doc.spans["verb"])
    # Out: [is]
    print(doc.spans["verb"][0]._.negation)
    # Out: True
    ```

    Parameters
    ----------
    preset: Literal["md", "xml"]
        The preset to use for the markup format. Defaults to "md" (Markdown-like
        syntax). Use "xml" for XML-like syntax.
    opener: Optional[str]
        The regex pattern to match the opening tag of the markup. Defaults to the
        preset's opener.
    closer: Optional[str]
        The regex pattern to match the closing tag of the markup. Defaults to the
        preset's closer.
    tokenizer: Optional[Tokenizer]
        The tokenizer instance used to tokenize the documents. Likely not needed since
        by default it uses the current context tokenizer :

        - the tokenizer of the next pipeline run by `.map_pipeline` in a
          [Stream][edsnlp.core.stream.Stream].
        - or the `eds` tokenizer by default.
    span_setter: SpanSetterArg
        The span setter to use when setting the spans in the documents. Defaults to
        setting the spans in the `ents` attribute and creates a new span group for
        each JSON entity label.
    span_attributes: Optional[AttributesMappingArg]
        Mapping from markup attributes to Span extensions (can be a list too).
        By default, all attributes are imported as Span extensions with the same name.
    keep_raw_attribute_values: bool
        Whether to keep the raw attribute values (as strings) or to convert them to
        Python objects (e.g. booleans).
    default_attributes: AttributesMappingArg
        How to set attributes on spans for which no attribute value was found in the
        input format. This is especially useful for negation, or frequent attributes
        values (e.g. "negated" is often False, "temporal" is often "present"), that
        annotators may not want to annotate every time.
    bool_attributes: AsList[str]
        List of boolean attributes to set to False by default. This is useful for
        attributes that are often not annotated, but you want to have a default value
        for them.
    """

    PRESETS = {
        "md": {
            "opener": r"(?P<opener>\[)",
            "closer": r"(?P<closer>\]\(\s*(?P<closer_label>[a-zA-Z0-9]+)\s*(?P<closer_attrs>.*?)\))",  # noqa: E501
        },
        "xml": {
            "opener": r"(?P<opener><(?P<opener_label>[a-zA-Z0-9]+)(?P<opener_attrs>.*?)>)",  # noqa: E501
            "closer": r"(?P<closer></(?P<closer_label>[a-zA-Z0-9]+)>)",
        },
    }

    def __init__(
        self,
        *,
        tokenizer: Optional[Tokenizer] = None,
        span_setter: SpanSetterArg = {"ents": True, "*": True},
        span_attributes: Optional[AttributesMappingArg] = None,
        keep_raw_attribute_values: bool = False,
        default_attributes: AttributesMappingArg = {},
        bool_attributes: AsList[str] = [],
        preset: Literal["md", "xml"] = "md",
        opener: Optional[str] = None,
        closer: Optional[str] = None,
    ):
        self.tokenizer = tokenizer
        self.span_setter = span_setter
        self.span_attributes = span_attributes
        self.keep_raw_attribute_values = keep_raw_attribute_values
        self.default_attributes = dict(default_attributes)
        # bool_attributes is sugar for default_attributes[attr] = False
        for attr in bool_attributes:
            self.default_attributes[attr] = False
        self.opener = opener or self.PRESETS[preset]["opener"]
        self.closer = closer or self.PRESETS[preset]["closer"]

    def _as_python(self, value: str):
        """
        Best-effort conversion of a raw attribute string to a Python object
        (literals via `ast.literal_eval`, plus case-insensitive true/false).
        Returns the raw string when conversion is disabled or fails.
        """
        import ast

        if self.keep_raw_attribute_values:
            return value
        try:
            return ast.literal_eval(value)
        except Exception:
            # literal_eval rejects bare true/True-like words with odd casing
            if value.lower() == "true":
                return True
            elif value.lower() == "false":
                return False
        return value

    def _parse(self, inline_text: str):
        """
        Strip the markup from `inline_text` and return `(plain_text, entities)`
        where each entity is a `(start, end, label, attrs)` tuple with character
        offsets into the plain (markup-free) text.
        """
        import re

        last_inline_offset = 0
        starts = []
        text = ""
        seps = list(re.finditer(self.opener + "|" + self.closer, inline_text))
        entities = []
        for sep in seps:
            is_opener = bool(sep["opener"])
            groups = sep.groupdict()
            inline_start = sep.start("opener") if is_opener else sep.start("closer")
            inline_end = sep.end("opener") if is_opener else sep.end("closer")
            # groupdict() contains *all* named groups, with None for the ones
            # that did not participate in the match, so dict.get's default is
            # never used: chain with `or` to actually fall back from the
            # closer's groups to the opener's (e.g. XML opener labels/attrs).
            label = groups.get("closer_label") or groups.get("opener_label")
            raw_attrs = (
                groups.get("closer_attrs") or groups.get("opener_attrs") or ""
            )
            # split("=", 1) keeps values that themselves contain "=" intact
            attrs = {
                k: self._as_python(v)
                for k, v in (kv.split("=", 1) for kv in raw_attrs.split())
            }
            text += inline_text[last_inline_offset:inline_start]
            if is_opener:
                starts.append((len(text), label, attrs))
            else:
                try:
                    # Match the closest pending opener with the same label;
                    # an unlabeled opener or closer matches anything.
                    idx = next(
                        i
                        for i in range(len(starts) - 1, -1, -1)
                        if starts[i][1] == label or not label or not starts[i][1]
                    )
                except StopIteration:
                    warnings.warn(f"Unmatched closing tag for '{sep.group()}'")
                    continue
                start, start_label, start_attrs = starts.pop(idx)
                # Opener attributes take precedence over closer attributes
                entities.append(
                    (start, len(text), start_label or label, {**attrs, **start_attrs})
                )
            last_inline_offset = inline_end
        if last_inline_offset < len(inline_text):
            text += inline_text[last_inline_offset:]
        if starts:
            # str() guards against unlabeled (None) openers in the message
            warnings.warn(
                "Unmatched opening tags: "
                + ", ".join(str(s[1]) for s in starts)
            )
        # Sort on offsets only: the attrs dicts at index 3 are not orderable,
        # so comparing whole tuples could raise TypeError on identical spans.
        entities.sort(key=lambda e: (e[0], e[1]))
        return text, entities

    def __call__(self, obj, tokenizer=None):
        """
        Convert a markup-annotated string (or a dict with a "text" key) into a
        spaCy Doc, setting the extracted spans and their attributes.
        """
        tok = tokenizer or self.tokenizer or get_current_tokenizer()
        if isinstance(obj, str):
            obj = {"text": obj}
        annotated = obj["text"]
        plain, raw_ents = self._parse(annotated)

        doc = tok(plain)
        doc._.note_id = obj.get("doc_id", obj.get(FILENAME))

        # Pre-declare Span extensions for mapped and default attributes
        for dst in (
            *(() if self.span_attributes is None else self.span_attributes.values()),
            *self.default_attributes,
        ):
            if not Span.has_extension(dst):
                Span.set_extension(dst, default=None)

        spans = []
        for start, end, label, attrs in raw_ents:
            # alignment_mode="expand" snaps char offsets to token boundaries
            span = doc.char_span(start, end, label=label, alignment_mode="expand")
            if span is None:
                continue
            for k, v in attrs.items():
                new_k = (
                    self.span_attributes.get(k)
                    if self.span_attributes is not None
                    else k
                )
                # Without an explicit mapping, import every attribute as an
                # extension with the same name, creating it on the fly.
                if self.span_attributes is None and not Span.has_extension(new_k):
                    Span.set_extension(new_k, default=None)
                if new_k:
                    span._.set(new_k, v)
            spans.append(span)

        set_spans(doc, spans, span_setter=self.span_setter)
        # Fill in defaults for attributes the markup did not set
        for attr, value in self.default_attributes.items():
            for span in spans:
                if span._.get(attr) is None:
                    span._.set(attr, value)

        return doc
928+
929+
710930def get_dict2doc_converter (
711931 converter : Union [str , Callable ], kwargs
712932) -> Tuple [Callable , Dict ]:
@@ -716,7 +936,11 @@ def get_dict2doc_converter(
716936 filtered = [
717937 name
718938 for name in available
719- if converter == name or (converter in name and "dict2doc" in name )
939+ if converter == name
940+ or (
941+ converter in name
942+ and (name .endswith ("2doc" ) or name .endswith ("to_doc" ))
943+ )
720944 ]
721945 converter = edsnlp .registry .factory .get (filtered [0 ])
722946 nlp = kwargs .pop ("nlp" , None )
@@ -726,7 +950,9 @@ def get_dict2doc_converter(
726950 kwargs = {}
727951 return converter , kwargs
728952 except (KeyError , IndexError ):
729- available = [v for v in available if "dict2doc" in v ]
953+ available = [
954+ v for v in available if (v .endswith ("2doc" ) or v .endswith ("to_doc" ))
955+ ]
730956 raise ValueError (
731957 f"Cannot find converter for format { converter } . "
732958 f"Available converters are { ', ' .join (available )} "
@@ -745,14 +971,20 @@ def get_doc2dict_converter(
745971 filtered = [
746972 name
747973 for name in available
748- if converter == name or (converter in name and "doc2dict" in name )
974+ if converter == name
975+ or (
976+ converter in name
977+ and (name .endswith ("2dict" ) or name .endswith ("to_dict" ))
978+ )
749979 ]
750980 converter = edsnlp .registry .factory .get (filtered [0 ])
751981 converter = converter (** kwargs )
752982 kwargs = {}
753983 return converter , kwargs
754984 except (KeyError , IndexError ):
755- available = [v for v in available if "doc2dict" in v ]
985+ available = [
986+ v for v in available if (v .endswith ("2dict" ) or v .endswith ("to_dict" ))
987+ ]
756988 raise ValueError (
757989 f"Cannot find converter for format { converter } . "
758990 f"Available converters are { ', ' .join (available )} "
0 commit comments