@@ -566,52 +566,99 @@ def test_EOF_in_charref(self):
566566 for html , expected in data :
567567 self ._run_check (html , expected )
568568
569- def test_broken_comments (self ):
def test_EOF_in_comments_or_decls(self):
    # Markup that stops partway through a comment, CDATA section or
    # DOCTYPE opener is expected to come back unchanged as a single
    # 'data' event, so each expectation is derived from its input.
    truncated_inputs = (
        '<!',
        '<!-',
        '<!--',
        '<![',
        '<![CDATA[',
        '<![CDATA[x',
        '<!DOCTYPE',
        '<!DOCTYPE HTML',
    )
    for fragment in truncated_inputs:
        self._run_check(fragment, [('data', fragment)])

def test_bogus_comments(self):
    # The input is the concatenation of several '<!...>' constructs that
    # are not well-formed comments; every one of them is expected to be
    # reported as a 'comment' event.
    fragments = [
        '<! not really a comment >',
        '<! not a comment either -->',
        '<! -- close enough -->',
        # two constructs back to back: an empty one, then '<!<-- ... >'
        '<!><!<-- this was an empty comment>',
        '<!!! another bogus comment !!!>',
        # see #32876
        '<![with square brackets]!>',
        '<![\n multiline\n bogusness\n ]!>',
        '<![more brackets]-[and a hyphen]!>',
        '<![cdata[should be uppercase]]>',
        '<![CDATA [whitespaces are not ignored]]>',
        '<![CDATA]]>',  # required '[' after CDATA
    ]
    comment_texts = [
        ' not really a comment ',
        ' not a comment either --',
        ' -- close enough --',
        '',
        '<-- this was an empty comment',
        '!! another bogus comment !!!',
        '[with square brackets]!',
        '[\n multiline\n bogusness\n ]!',
        '[more brackets]-[and a hyphen]!',
        '[cdata[should be uppercase]]',
        '[CDATA [whitespaces are not ignored]]',
        '[CDATA]]',
    ]
    html = ''.join(fragments)
    expected = [('comment', text) for text in comment_texts]
    self._run_check(html, expected)
584611
def test_broken_condcoms(self):
    # these condcoms are missing the '--' after '<!' and before the '>'
    # and they are considered bogus comments according to
    # "8.2.4.42. Markup declaration open state"
    html = ('<![if !(IE)]>broken condcom<![endif]>'
            '<![if ! IE]><link href="favicon.tiff"/><![endif]>'
            '<![if !IE 6]><img src="firefox.png" /><![endif]>'
            '<![if !ie 6]><b>foo</b><![endif]>'
            '<![if (!IE)|(lt IE 9)]><img src="mammoth.bmp" /><![endif]>')
    # Each expected comment text is exactly what sits between '<!' and
    # '>' in the input, square brackets included and with no padding;
    # the event name is the bare string 'comment'.
    expected = [
        ('comment', '[if !(IE)]'),
        ('data', 'broken condcom'),
        ('comment', '[endif]'),
        ('comment', '[if ! IE]'),
        ('startendtag', 'link', [('href', 'favicon.tiff')]),
        ('comment', '[endif]'),
        ('comment', '[if !IE 6]'),
        ('startendtag', 'img', [('src', 'firefox.png')]),
        ('comment', '[endif]'),
        ('comment', '[if !ie 6]'),
        ('starttag', 'b', []),
        ('data', 'foo'),
        ('endtag', 'b'),
        ('comment', '[endif]'),
        ('comment', '[if (!IE)|(lt IE 9)]'),
        ('startendtag', 'img', [('src', 'mammoth.bmp')]),
        ('comment', '[endif]'),
    ]
    self._run_check(html, expected)
641+
def test_cdata_declarations(self):
    # More tests should be added. See also "8.2.4.42. Markup
    # declaration open state", "8.2.4.69. CDATA section state",
    # and issue 32876
    #
    # A complete CDATA section is expected to produce one
    # 'unknown decl' event whose text starts with 'CDATA[' and omits
    # the closing ']]'.
    self._run_check(
        '<![CDATA[just some plain text]]>',
        [('unknown decl', 'CDATA[just some plain text')],
    )
649+
def test_cdata_declarations_multiline(self):
    # A CDATA section nested in <code> whose body contains '<', '>',
    # '&&' and tag-like text. The whole body is expected to arrive in a
    # single 'unknown decl' event between the <code> start and end tags.
    # The pieces below are joined without separators, so the resulting
    # markup contains no newline characters.
    cdata_body = ''.join([
        ' if (a < b && a > b) {',
        ' printf("[<marquee>How?</marquee>]");',
        ' }',
    ])
    html = '<code><![CDATA[' + cdata_body + ']]></code>'
    expected = [
        ('starttag', 'code', []),
        ('unknown decl', 'CDATA[' + cdata_body),
        ('endtag', 'code'),
    ]
    self._run_check(html, expected)
617664
0 commit comments