Skip to content

Commit 94e7a96

Browse files
authored
Merge pull request #7 from php-collective/fix/tab-indentation-support
Add tab indentation support
2 parents 1a4fe7e + 9bd3fc4 commit 94e7a96

File tree

2 files changed

+389
-36
lines changed

2 files changed

+389
-36
lines changed

src/Parser/BlockParser.php

Lines changed: 85 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -394,8 +394,8 @@ protected function extractFootnotes(array $lines): void
394394

395395
continue;
396396
}
397-
// Check if line has at least base indentation
398-
if (preg_match('/^[ ]{' . $baseIndent . '}(.*)$/', $nextLine, $contMatch)) {
397+
// Check if line has at least base indentation (2 spaces or 1 tab)
398+
if (preg_match('/^(?:[ ]{' . $baseIndent . '}|\t)(.*)$/', $nextLine, $contMatch)) {
399399
$contentLines[] = $contMatch[1];
400400
$hasContent = true;
401401
$j++;
@@ -449,8 +449,9 @@ protected function extractHeadingReferences(array $lines): void
449449
continue;
450450
}
451451

452-
// Match heading: optional leading spaces, 1-6 # characters, followed by content
453-
if (preg_match('/^[ ]{0,3}(#{1,6})(?:\s+(.*))?$/', $line, $matches)) {
452+
// Match heading: optional leading spaces, 1-6 # characters, followed by space(s) and content
453+
// Space after # is syntax delimiter, not indentation - must be space(s) per spec, not tab
454+
if (preg_match('/^[ ]{0,3}(#{1,6})(?: +(.*))?$/', $line, $matches)) {
454455
$headingText = isset($matches[2]) ? trim($matches[2]) : '';
455456

456457
// Collect continuation lines
@@ -935,8 +936,8 @@ protected function tryParseRawBlock(Node $parent, array $lines, int $start): ?in
935936
return null;
936937
}
937938

938-
// Match opening fence with =format: ``` =html or ```=html
939-
if (!preg_match('/^(`{3,})\s+=(\w+)\s*$/', $line, $matches)) {
939+
// Match opening fence with =format: ``` =html (space before = is syntax delimiter)
940+
if (!preg_match('/^(`{3,}) +=(\w+) *$/', $line, $matches)) {
940941
return null;
941942
}
942943

@@ -1085,9 +1086,10 @@ protected function tryParseHeading(Node $parent, array $lines, int $start): ?int
10851086
return null;
10861087
}
10871088

1088-
// Match heading: optional leading spaces, 1-6 # characters, optionally followed by space and content
1089+
// Match heading: optional leading spaces, 1-6 # characters, optionally followed by space(s) and content
10891090
// Can be: "## Heading", "##", " ## Heading", "##\n", etc.
1090-
if (!preg_match('/^[ ]{0,3}(#{1,6})(?:\s+(.*))?$/', $line, $matches)) {
1091+
// Space after # is syntax delimiter - must be space(s) per spec, not tab
1092+
if (!preg_match('/^[ ]{0,3}(#{1,6})(?: +(.*))?$/', $line, $matches)) {
10911093
return null;
10921094
}
10931095

@@ -1106,7 +1108,7 @@ protected function tryParseHeading(Node $parent, array $lines, int $start): ?int
11061108
}
11071109

11081110
// Check for continuation with # prefix (same level or less)
1109-
if (preg_match('/^[ ]{0,3}#{1,' . $level . '}\s+(.+)$/', $nextLine, $contMatch)) {
1111+
if (preg_match('/^[ ]{0,3}#{1,' . $level . '} +(.+)$/', $nextLine, $contMatch)) {
11101112
if ($content !== '') {
11111113
$content .= "\n";
11121114
}
@@ -1206,7 +1208,7 @@ protected function tryParseBlockQuote(Node $parent, array $lines, int $start): ?
12061208
break;
12071209
}
12081210

1209-
// Continue with "> " prefix (space required)
1211+
// Continue with "> " prefix (space required per spec)
12101212
if (preg_match('/^> (.*)$/', $currentLine, $matches)) {
12111213
$innerLines[] = $matches[1];
12121214
$i++;
@@ -1261,7 +1263,7 @@ protected function tryParseDefinitionList(Node $parent, array $lines, int $start
12611263
}
12621264

12631265
// Next line must start with : (definition marker)
1264-
if (!preg_match('/^:\s+(.*)$/', $defLine)) {
1266+
if (!preg_match('/^: +(.*)$/', $defLine)) {
12651267
return null;
12661268
}
12671269

@@ -1282,7 +1284,7 @@ protected function tryParseDefinitionList(Node $parent, array $lines, int $start
12821284
// Check if this line is a term (followed by : definition)
12831285
if ($i + 1 < $count && !preg_match('/^[>#\-*+\d`:|]/', $currentLine)) {
12841286
$nextLine = $lines[$i + 1];
1285-
if (preg_match('/^:\s+(.*)$/', $nextLine)) {
1287+
if (preg_match('/^: +(.*)$/', $nextLine)) {
12861288
// Parse term
12871289
$term = new DefinitionTerm();
12881290
$this->inlineParser->parse($term, trim($currentLine), $i);
@@ -1292,7 +1294,7 @@ protected function tryParseDefinitionList(Node $parent, array $lines, int $start
12921294
// Parse definitions (can have multiple)
12931295
while ($i < $count) {
12941296
$defLineContent = $lines[$i];
1295-
if (preg_match('/^:\s+(.*)$/', $defLineContent, $defMatch)) {
1297+
if (preg_match('/^: +(.*)$/', $defLineContent, $defMatch)) {
12961298
$defContent = $defMatch[1];
12971299

12981300
// Collect continuation lines
@@ -1306,7 +1308,7 @@ protected function tryParseDefinitionList(Node $parent, array $lines, int $start
13061308
if (preg_match('/^\s+(.+)$/', $contLine, $contMatch)) {
13071309
$defLines[] = $contMatch[1];
13081310
$i++;
1309-
} elseif (preg_match('/^:\s+/', $contLine)) {
1311+
} elseif (preg_match('/^: +/', $contLine)) {
13101312
// Another definition
13111313
break;
13121314
} else {
@@ -1435,8 +1437,8 @@ protected function tryParseList(Node $parent, array $lines, int $start): ?int
14351437
$lineIndent = $this->getLeadingSpaces($subLine);
14361438
// Check if line has at least the subIndent level
14371439
if ($lineIndent >= $subIndent) {
1438-
// Remove subIndent spaces of indentation
1439-
$subLines[] = substr($subLine, $subIndent);
1440+
// Remove subIndent worth of indentation (handling tabs)
1441+
$subLines[] = $this->stripLeadingIndent($subLine, $subIndent);
14401442
$sawBlankLine = false;
14411443
$i++;
14421444
} elseif ($lineIndent >= $baseIndent) {
@@ -1556,7 +1558,7 @@ protected function tryParseList(Node $parent, array $lines, int $start): ?int
15561558
// In djot, " - b" after "- a" (no blank line) is literal text, not a nested list
15571559
if ($nextIndent >= $contentIndent) {
15581560
// Properly indented continuation - include with original indentation relative to content
1559-
$itemLines[] = substr($nextLine, $contentIndent);
1561+
$itemLines[] = $this->stripLeadingIndent($nextLine, $contentIndent);
15601562
} else {
15611563
// Lazy continuation (not properly indented but not at base level either)
15621564
$itemLines[] = $nextTrimmed;
@@ -1609,16 +1611,56 @@ protected function tryParseList(Node $parent, array $lines, int $start): ?int
16091611
}
16101612

16111613
/**
 * Measure a line's leading indentation in space-equivalent columns.
 *
 * Only the unbroken run of spaces and tabs at the very start of the line
 * is considered. Each space contributes 1 column and each tab contributes
 * 2 columns (one indentation level), so tab-indented nested structures
 * are measured the same way as space-indented ones.
 *
 * @see https://github.com/jgm/djot/issues/255
 *
 * @param string $line Line to inspect
 *
 * @return int Indentation width; 0 when the line does not start with a space or tab
 */
protected function getLeadingSpaces(string $line): int
{
    // Length of the leading run made up solely of spaces and tabs.
    $runLength = strspn($line, " \t");
    if ($runLength === 0) {
        return 0;
    }

    // Every character in the run is worth 1 column; a tab is worth one extra.
    $tabCount = substr_count($line, "\t", 0, $runLength);

    return $runLength + $tabCount;
}
1639+
1640+
/**
 * Strip leading indentation from a line, up to the given space-equivalent width.
 *
 * A space is 1 column wide and a tab is 2 columns wide. Scanning stops at the
 * first character that is neither a space nor a tab, so a line with less
 * indentation than requested simply loses all of its leading whitespace.
 *
 * When a tab straddles the requested width (only 1 column is left to remove
 * but the next character is a 2-column tab), the tab is consumed and the
 * surplus column is put back as a space. This way exactly $amount columns are
 * removed, matching what substr() does for space-only indentation, and nested
 * content keeps its indentation relative to the parent.
 *
 * @param string $line Line to strip
 * @param int $amount Number of space-equivalent columns to remove; values <= 0 leave the line untouched
 *
 * @return string The line with the requested indentation removed
 */
protected function stripLeadingIndent(string $line, int $amount): string
{
    $remaining = $amount;
    $len = strlen($line);
    $i = 0;

    while ($i < $len && $remaining > 0) {
        $char = $line[$i];
        if ($char === ' ') {
            $remaining--;
        } elseif ($char === "\t") {
            // May push $remaining to -1 when the tab is wider than what is left.
            $remaining -= 2;
        } else {
            break;
        }
        $i++;
    }

    $rest = substr($line, $i);

    // A negative remainder means a tab overshot the target width:
    // restore the surplus columns so no indentation is silently lost.
    if ($remaining < 0) {
        return str_repeat(' ', -$remaining) . $rest;
    }

    return $rest;
}
16231665

16241666
/**
@@ -1681,8 +1723,8 @@ protected function tryParseDjotDefinitionList(Node $parent, array $lines, int $s
16811723
continue;
16821724
}
16831725

1684-
// Must start with ": "
1685-
if (!preg_match('/^:\s+(.*)$/', $line, $matches)) {
1726+
// Must start with ": " (space is syntax delimiter, not tab)
1727+
if (!preg_match('/^: +(.*)$/', $line, $matches)) {
16861728
break;
16871729
}
16881730

@@ -1740,8 +1782,8 @@ protected function tryParseDjotDefinitionList(Node $parent, array $lines, int $s
17401782
continue;
17411783
}
17421784

1743-
// Check for next term
1744-
if (preg_match('/^:\s+/', $defLine)) {
1785+
// Check for next term (space is syntax delimiter, not tab)
1786+
if (preg_match('/^: +/', $defLine)) {
17451787
break;
17461788
}
17471789

@@ -1893,7 +1935,8 @@ protected function disambiguateListStyle(array $listInfo, array $lines, int $sta
18931935
protected function parseListItemMarker(string $line): ?array
18941936
{
18951937
// Task list: - [ ] or - [x] or - [X]
1896-
if (preg_match('/^[-*+]\s+\[([ xX])\]\s+(.*)$/', $line, $matches)) {
1938+
// Space after marker is syntax delimiter - must be space(s) per spec, not tab
1939+
if (preg_match('/^[-*+] +\[([ xX])\] +(.*)$/', $line, $matches)) {
18971940
return [
18981941
'type' => ListBlock::TYPE_TASK,
18991942
'marker' => '-',
@@ -1903,7 +1946,8 @@ protected function parseListItemMarker(string $line): ?array
19031946
}
19041947

19051948
// Bullet list: -, +, or *
1906-
if (preg_match('/^([-*+])\s+(.*)$/', $line, $matches)) {
1949+
// Space after marker is syntax delimiter - must be space(s) per spec, not tab
1950+
if (preg_match('/^([-*+]) +(.*)$/', $line, $matches)) {
19071951
$marker = $matches[1];
19081952
$content = $matches[2];
19091953

@@ -1928,7 +1972,8 @@ protected function parseListItemMarker(string $line): ?array
19281972
}
19291973

19301974
// Ordered list: 1. or 1) or (1)
1931-
if (preg_match('/^(\d+)([.)])\s+(.*)$/', $line, $matches)) {
1975+
// Space after marker is syntax delimiter - must be space(s) per spec, not tab
1976+
if (preg_match('/^(\d+)([.)]) +(.*)$/', $line, $matches)) {
19321977
return [
19331978
'type' => ListBlock::TYPE_ORDERED,
19341979
'marker' => $matches[2],
@@ -1937,7 +1982,7 @@ protected function parseListItemMarker(string $line): ?array
19371982
];
19381983
}
19391984

1940-
if (preg_match('/^\((\d+)\)\s+(.*)$/', $line, $matches)) {
1985+
if (preg_match('/^\((\d+)\) +(.*)$/', $line, $matches)) {
19411986
return [
19421987
'type' => ListBlock::TYPE_ORDERED,
19431988
'marker' => '()',
@@ -1949,7 +1994,8 @@ protected function parseListItemMarker(string $line): ?array
19491994
// Roman numeral ordered list: i. or I. or i) or I) or (i) or (I)
19501995
// Single letters are ambiguous - could be alpha or roman
19511996
// Return both possibilities and let the list parser disambiguate based on subsequent items
1952-
if (preg_match('/^([ivxlcdmIVXLCDM]+)([.)])\s+(.*)$/', $line, $matches)) {
1997+
// Space after marker is syntax delimiter - must be space(s) per spec, not tab
1998+
if (preg_match('/^([ivxlcdmIVXLCDM]+)([.)]) +(.*)$/', $line, $matches)) {
19531999
$roman = $matches[1];
19542000
$isLower = ctype_lower($roman[0]);
19552001
$start = $this->romanToInt(strtoupper($roman));
@@ -1973,7 +2019,7 @@ protected function parseListItemMarker(string $line): ?array
19732019
}
19742020
}
19752021

1976-
if (preg_match('/^\(([ivxlcdmIVXLCDM]+)\)\s+(.*)$/', $line, $matches)) {
2022+
if (preg_match('/^\(([ivxlcdmIVXLCDM]+)\) +(.*)$/', $line, $matches)) {
19772023
$roman = $matches[1];
19782024
$isLower = ctype_lower($roman[0]);
19792025
$start = $this->romanToInt(strtoupper($roman));
@@ -1999,7 +2045,8 @@ protected function parseListItemMarker(string $line): ?array
19992045

20002046
// Alpha ordered list: a. or A. or a) or A) or (a) or (A)
20012047
// Only single letters - multi-letter checked above as roman
2002-
if (preg_match('/^([a-zA-Z])([.)])\s+(.*)$/', $line, $matches)) {
2048+
// Space after marker is syntax delimiter - must be space(s) per spec, not tab
2049+
if (preg_match('/^([a-zA-Z])([.)]) +(.*)$/', $line, $matches)) {
20032050
$letter = $matches[1];
20042051
$isLower = ctype_lower($letter);
20052052
$start = ord(strtolower($letter)) - ord('a') + 1;
@@ -2013,7 +2060,7 @@ protected function parseListItemMarker(string $line): ?array
20132060
];
20142061
}
20152062

2016-
if (preg_match('/^\(([a-zA-Z])\)\s+(.*)$/', $line, $matches)) {
2063+
if (preg_match('/^\(([a-zA-Z])\) +(.*)$/', $line, $matches)) {
20172064
$letter = $matches[1];
20182065
$isLower = ctype_lower($letter);
20192066
$start = ord(strtolower($letter)) - ord('a') + 1;
@@ -2028,7 +2075,8 @@ protected function parseListItemMarker(string $line): ?array
20282075
}
20292076

20302077
// Definition list: :
2031-
if (preg_match('/^:\s+(.*)$/', $line, $matches)) {
2078+
// Space after marker is syntax delimiter - must be space(s) per spec, not tab
2079+
if (preg_match('/^: +(.*)$/', $line, $matches)) {
20322080
return [
20332081
'type' => ListBlock::TYPE_DEFINITION,
20342082
'marker' => ':',
@@ -2210,7 +2258,8 @@ protected function tryParseTable(Node $parent, array $lines, int $start): ?int
22102258
$captionStart++;
22112259
}
22122260

2213-
if ($captionStart < $count && preg_match('/^\^\s+(.+)$/', $lines[$captionStart], $captionMatch)) {
2261+
// Table caption: ^ followed by space(s), not tab (syntax delimiter)
2262+
if ($captionStart < $count && preg_match('/^\^ +(.+)$/', $lines[$captionStart], $captionMatch)) {
22142263
$captionLines = [$captionMatch[1]];
22152264
$captionStart++;
22162265

0 commit comments

Comments
 (0)