@@ -394,8 +394,8 @@ protected function extractFootnotes(array $lines): void
394394
395395 continue ;
396396 }
397- // Check if line has at least base indentation
398- if (preg_match ('/^[ ]{ ' . $ baseIndent . '}(.*)$/ ' , $ nextLine , $ contMatch )) {
397+ // Check if line has at least base indentation (2 spaces or 1 tab)
398+ if (preg_match ('/^(?: [ ]{ ' . $ baseIndent . '}|\t) (.*)$/ ' , $ nextLine , $ contMatch )) {
399399 $ contentLines [] = $ contMatch [1 ];
400400 $ hasContent = true ;
401401 $ j ++;
@@ -449,8 +449,9 @@ protected function extractHeadingReferences(array $lines): void
449449 continue ;
450450 }
451451
452- // Match heading: optional leading spaces, 1-6 # characters, followed by content
453- if (preg_match ('/^[ ]{0,3}(#{1,6})(?:\s+(.*))?$/ ' , $ line , $ matches )) {
452+ // Match heading: optional leading spaces, 1-6 # characters, followed by space(s) and content
453+ // Space after # is syntax delimiter, not indentation - must be space(s) per spec, not tab
454+ if (preg_match ('/^[ ]{0,3}(#{1,6})(?: +(.*))?$/ ' , $ line , $ matches )) {
454455 $ headingText = isset ($ matches [2 ]) ? trim ($ matches [2 ]) : '' ;
455456
456457 // Collect continuation lines
@@ -935,8 +936,8 @@ protected function tryParseRawBlock(Node $parent, array $lines, int $start): ?in
935936 return null ;
936937 }
937938
938- // Match opening fence with =format: ``` =html or ```=html
939- if (!preg_match ('/^(`{3,})\s +=(\w+)\s *$/ ' , $ line , $ matches )) {
939+ // Match opening fence with =format: ``` =html (space before = is syntax delimiter)
940+ if (!preg_match ('/^(`{3,}) +=(\w+) *$/ ' , $ line , $ matches )) {
940941 return null ;
941942 }
942943
@@ -1085,9 +1086,10 @@ protected function tryParseHeading(Node $parent, array $lines, int $start): ?int
10851086 return null ;
10861087 }
10871088
1088- // Match heading: optional leading spaces, 1-6 # characters, optionally followed by space and content
1089+ // Match heading: optional leading spaces, 1-6 # characters, optionally followed by space(s) and content
10891090 // Can be: "## Heading", "##", " ## Heading", "##\n", etc.
1090- if (!preg_match ('/^[ ]{0,3}(#{1,6})(?:\s+(.*))?$/ ' , $ line , $ matches )) {
1091+ // Space after # is syntax delimiter - must be space(s) per spec, not tab
1092+ if (!preg_match ('/^[ ]{0,3}(#{1,6})(?: +(.*))?$/ ' , $ line , $ matches )) {
10911093 return null ;
10921094 }
10931095
@@ -1106,7 +1108,7 @@ protected function tryParseHeading(Node $parent, array $lines, int $start): ?int
11061108 }
11071109
11081110 // Check for continuation with # prefix (same level or less)
1109- if (preg_match ('/^[ ]{0,3}#{1, ' . $ level . '}\s +(.+)$/ ' , $ nextLine , $ contMatch )) {
1111+ if (preg_match ('/^[ ]{0,3}#{1, ' . $ level . '} +(.+)$/ ' , $ nextLine , $ contMatch )) {
11101112 if ($ content !== '' ) {
11111113 $ content .= "\n" ;
11121114 }
@@ -1206,7 +1208,7 @@ protected function tryParseBlockQuote(Node $parent, array $lines, int $start): ?
12061208 break ;
12071209 }
12081210
1209- // Continue with "> " prefix (space required)
1211+ // Continue with "> " prefix (space required per spec )
12101212 if (preg_match ('/^> (.*)$/ ' , $ currentLine , $ matches )) {
12111213 $ innerLines [] = $ matches [1 ];
12121214 $ i ++;
@@ -1261,7 +1263,7 @@ protected function tryParseDefinitionList(Node $parent, array $lines, int $start
12611263 }
12621264
12631265 // Next line must start with : (definition marker)
1264- if (!preg_match ('/^:\s +(.*)$/ ' , $ defLine )) {
1266+ if (!preg_match ('/^: +(.*)$/ ' , $ defLine )) {
12651267 return null ;
12661268 }
12671269
@@ -1282,7 +1284,7 @@ protected function tryParseDefinitionList(Node $parent, array $lines, int $start
12821284 // Check if this line is a term (followed by : definition)
12831285 if ($ i + 1 < $ count && !preg_match ('/^[>#\-*+\d`:|]/ ' , $ currentLine )) {
12841286 $ nextLine = $ lines [$ i + 1 ];
1285- if (preg_match ('/^:\s +(.*)$/ ' , $ nextLine )) {
1287+ if (preg_match ('/^: +(.*)$/ ' , $ nextLine )) {
12861288 // Parse term
12871289 $ term = new DefinitionTerm ();
12881290 $ this ->inlineParser ->parse ($ term , trim ($ currentLine ), $ i );
@@ -1292,7 +1294,7 @@ protected function tryParseDefinitionList(Node $parent, array $lines, int $start
12921294 // Parse definitions (can have multiple)
12931295 while ($ i < $ count ) {
12941296 $ defLineContent = $ lines [$ i ];
1295- if (preg_match ('/^:\s +(.*)$/ ' , $ defLineContent , $ defMatch )) {
1297+ if (preg_match ('/^: +(.*)$/ ' , $ defLineContent , $ defMatch )) {
12961298 $ defContent = $ defMatch [1 ];
12971299
12981300 // Collect continuation lines
@@ -1306,7 +1308,7 @@ protected function tryParseDefinitionList(Node $parent, array $lines, int $start
13061308 if (preg_match ('/^\s+(.+)$/ ' , $ contLine , $ contMatch )) {
13071309 $ defLines [] = $ contMatch [1 ];
13081310 $ i ++;
1309- } elseif (preg_match ('/^:\s +/ ' , $ contLine )) {
1311+ } elseif (preg_match ('/^: +/ ' , $ contLine )) {
13101312 // Another definition
13111313 break ;
13121314 } else {
@@ -1435,8 +1437,8 @@ protected function tryParseList(Node $parent, array $lines, int $start): ?int
14351437 $ lineIndent = $ this ->getLeadingSpaces ($ subLine );
14361438 // Check if line has at least the subIndent level
14371439 if ($ lineIndent >= $ subIndent ) {
1438- // Remove subIndent spaces of indentation
1439- $ subLines [] = substr ($ subLine , $ subIndent );
1440+ // Remove subIndent worth of indentation (handling tabs)
1441+ $ subLines [] = $ this -> stripLeadingIndent ($ subLine , $ subIndent );
14401442 $ sawBlankLine = false ;
14411443 $ i ++;
14421444 } elseif ($ lineIndent >= $ baseIndent ) {
@@ -1556,7 +1558,7 @@ protected function tryParseList(Node $parent, array $lines, int $start): ?int
15561558 // In djot, " - b" after "- a" (no blank line) is literal text, not a nested list
15571559 if ($ nextIndent >= $ contentIndent ) {
15581560 // Properly indented continuation - include with original indentation relative to content
1559- $ itemLines [] = substr ($ nextLine , $ contentIndent );
1561+ $ itemLines [] = $ this -> stripLeadingIndent ($ nextLine , $ contentIndent );
15601562 } else {
15611563 // Lazy continuation (not properly indented but not at base level either)
15621564 $ itemLines [] = $ nextTrimmed ;
@@ -1609,16 +1611,56 @@ protected function tryParseList(Node $parent, array $lines, int $start): ?int
16091611 }
16101612
/**
 * Measure the leading indentation of a line in space-equivalent columns.
 *
 * Only the unbroken run of spaces and tabs at the very start of the line
 * is considered. Each space contributes 1 column and each tab contributes
 * 2 columns (one indentation level), which lets tab-indented nested
 * structures be compared against space-based indent widths.
 *
 * @see https://github.com/jgm/djot/issues/255
 *
 * @param string $line Line to inspect
 *
 * @return int Width of the leading whitespace in space-equivalents
 */
protected function getLeadingSpaces(string $line): int
{
    // Length in bytes of the leading run made up solely of spaces and tabs.
    $runLength = strspn($line, " \t");

    // Every character in the run is worth 1 column; a tab is worth 1 extra.
    $tabCount = substr_count(substr($line, 0, $runLength), "\t");

    return $runLength + $tabCount;
}
1639+
/**
 * Remove up to $amount space-equivalent columns of leading indentation.
 *
 * A space counts as 1 column and a tab as 2 columns, so mixed spaces and
 * tabs are handled. Stripping stops at the first character that is neither
 * a space nor a tab, or once $amount columns have been consumed.
 *
 * Because a tab is worth 2 columns it can straddle the boundary (only one
 * column left to strip, next character is a tab). In that case the tab is
 * consumed and the one surplus column is given back as a single space, so
 * the remainder keeps its indentation relative to the stripped level
 * instead of silently losing a column.
 *
 * @param string $line Line to strip
 * @param int $amount Number of space-equivalent columns to remove;
 *                    values <= 0 leave the line untouched
 *
 * @return string The line with the leading indentation removed
 */
protected function stripLeadingIndent(string $line, int $amount): string
{
    $length = strlen($line);
    $removed = 0;
    $pos = 0;

    while ($pos < $length && $removed < $amount) {
        $char = $line[$pos];
        if ($char === ' ') {
            $removed++;
        } elseif ($char === "\t") {
            // Tab counts as 2 spaces (one indentation level)
            $removed += 2;
        } else {
            break;
        }
        $pos++;
    }

    // Cast guards against substr() returning false on older PHP versions
    // when the whole line was whitespace.
    $rest = (string) substr($line, $pos);

    // A boundary-straddling tab over-strips by exactly one column; restore it.
    $surplus = $removed - $amount;

    return $surplus > 0 ? str_repeat(' ', $surplus) . $rest : $rest;
}
16231665
16241666 /**
@@ -1681,8 +1723,8 @@ protected function tryParseDjotDefinitionList(Node $parent, array $lines, int $s
16811723 continue ;
16821724 }
16831725
1684- // Must start with ": "
1685- if (!preg_match ('/^:\s +(.*)$/ ' , $ line , $ matches )) {
1726+ // Must start with ": " (space is syntax delimiter, not tab)
1727+ if (!preg_match ('/^: +(.*)$/ ' , $ line , $ matches )) {
16861728 break ;
16871729 }
16881730
@@ -1740,8 +1782,8 @@ protected function tryParseDjotDefinitionList(Node $parent, array $lines, int $s
17401782 continue ;
17411783 }
17421784
1743- // Check for next term
1744- if (preg_match ('/^:\s +/ ' , $ defLine )) {
1785+ // Check for next term (space is syntax delimiter, not tab)
1786+ if (preg_match ('/^: +/ ' , $ defLine )) {
17451787 break ;
17461788 }
17471789
@@ -1893,7 +1935,8 @@ protected function disambiguateListStyle(array $listInfo, array $lines, int $sta
18931935 protected function parseListItemMarker (string $ line ): ?array
18941936 {
18951937 // Task list: - [ ] or - [x] or - [X]
1896- if (preg_match ('/^[-*+]\s+\[([ xX])\]\s+(.*)$/ ' , $ line , $ matches )) {
1938+ // Space after marker is syntax delimiter - must be space(s) per spec, not tab
1939+ if (preg_match ('/^[-*+] +\[([ xX])\] +(.*)$/ ' , $ line , $ matches )) {
18971940 return [
18981941 'type ' => ListBlock::TYPE_TASK ,
18991942 'marker ' => '- ' ,
@@ -1903,7 +1946,8 @@ protected function parseListItemMarker(string $line): ?array
19031946 }
19041947
19051948 // Bullet list: -, +, or *
1906- if (preg_match ('/^([-*+])\s+(.*)$/ ' , $ line , $ matches )) {
1949+ // Space after marker is syntax delimiter - must be space(s) per spec, not tab
1950+ if (preg_match ('/^([-*+]) +(.*)$/ ' , $ line , $ matches )) {
19071951 $ marker = $ matches [1 ];
19081952 $ content = $ matches [2 ];
19091953
@@ -1928,7 +1972,8 @@ protected function parseListItemMarker(string $line): ?array
19281972 }
19291973
19301974 // Ordered list: 1. or 1) or (1)
1931- if (preg_match ('/^(\d+)([.)])\s+(.*)$/ ' , $ line , $ matches )) {
1975+ // Space after marker is syntax delimiter - must be space(s) per spec, not tab
1976+ if (preg_match ('/^(\d+)([.)]) +(.*)$/ ' , $ line , $ matches )) {
19321977 return [
19331978 'type ' => ListBlock::TYPE_ORDERED ,
19341979 'marker ' => $ matches [2 ],
@@ -1937,7 +1982,7 @@ protected function parseListItemMarker(string $line): ?array
19371982 ];
19381983 }
19391984
1940- if (preg_match ('/^\((\d+)\)\s +(.*)$/ ' , $ line , $ matches )) {
1985+ if (preg_match ('/^\((\d+)\) +(.*)$/ ' , $ line , $ matches )) {
19411986 return [
19421987 'type ' => ListBlock::TYPE_ORDERED ,
19431988 'marker ' => '() ' ,
@@ -1949,7 +1994,8 @@ protected function parseListItemMarker(string $line): ?array
19491994 // Roman numeral ordered list: i. or I. or i) or I) or (i) or (I)
19501995 // Single letters are ambiguous - could be alpha or roman
19511996 // Return both possibilities and let the list parser disambiguate based on subsequent items
1952- if (preg_match ('/^([ivxlcdmIVXLCDM]+)([.)])\s+(.*)$/ ' , $ line , $ matches )) {
1997+ // Space after marker is syntax delimiter - must be space(s) per spec, not tab
1998+ if (preg_match ('/^([ivxlcdmIVXLCDM]+)([.)]) +(.*)$/ ' , $ line , $ matches )) {
19531999 $ roman = $ matches [1 ];
19542000 $ isLower = ctype_lower ($ roman [0 ]);
19552001 $ start = $ this ->romanToInt (strtoupper ($ roman ));
@@ -1973,7 +2019,7 @@ protected function parseListItemMarker(string $line): ?array
19732019 }
19742020 }
19752021
1976- if (preg_match ('/^\(([ivxlcdmIVXLCDM]+)\)\s +(.*)$/ ' , $ line , $ matches )) {
2022+ if (preg_match ('/^\(([ivxlcdmIVXLCDM]+)\) +(.*)$/ ' , $ line , $ matches )) {
19772023 $ roman = $ matches [1 ];
19782024 $ isLower = ctype_lower ($ roman [0 ]);
19792025 $ start = $ this ->romanToInt (strtoupper ($ roman ));
@@ -1999,7 +2045,8 @@ protected function parseListItemMarker(string $line): ?array
19992045
20002046 // Alpha ordered list: a. or A. or a) or A) or (a) or (A)
20012047 // Only single letters - multi-letter checked above as roman
2002- if (preg_match ('/^([a-zA-Z])([.)])\s+(.*)$/ ' , $ line , $ matches )) {
2048+ // Space after marker is syntax delimiter - must be space(s) per spec, not tab
2049+ if (preg_match ('/^([a-zA-Z])([.)]) +(.*)$/ ' , $ line , $ matches )) {
20032050 $ letter = $ matches [1 ];
20042051 $ isLower = ctype_lower ($ letter );
20052052 $ start = ord (strtolower ($ letter )) - ord ('a ' ) + 1 ;
@@ -2013,7 +2060,7 @@ protected function parseListItemMarker(string $line): ?array
20132060 ];
20142061 }
20152062
2016- if (preg_match ('/^\(([a-zA-Z])\)\s +(.*)$/ ' , $ line , $ matches )) {
2063+ if (preg_match ('/^\(([a-zA-Z])\) +(.*)$/ ' , $ line , $ matches )) {
20172064 $ letter = $ matches [1 ];
20182065 $ isLower = ctype_lower ($ letter );
20192066 $ start = ord (strtolower ($ letter )) - ord ('a ' ) + 1 ;
@@ -2028,7 +2075,8 @@ protected function parseListItemMarker(string $line): ?array
20282075 }
20292076
20302077 // Definition list: :
2031- if (preg_match ('/^:\s+(.*)$/ ' , $ line , $ matches )) {
2078+ // Space after marker is syntax delimiter - must be space(s) per spec, not tab
2079+ if (preg_match ('/^: +(.*)$/ ' , $ line , $ matches )) {
20322080 return [
20332081 'type ' => ListBlock::TYPE_DEFINITION ,
20342082 'marker ' => ': ' ,
@@ -2210,7 +2258,8 @@ protected function tryParseTable(Node $parent, array $lines, int $start): ?int
22102258 $ captionStart ++;
22112259 }
22122260
2213- if ($ captionStart < $ count && preg_match ('/^\^\s+(.+)$/ ' , $ lines [$ captionStart ], $ captionMatch )) {
2261+ // Table caption: ^ followed by space(s), not tab (syntax delimiter)
2262+ if ($ captionStart < $ count && preg_match ('/^\^ +(.+)$/ ' , $ lines [$ captionStart ], $ captionMatch )) {
22142263 $ captionLines = [$ captionMatch [1 ]];
22152264 $ captionStart ++;
22162265
0 commit comments