|
| 1 | +<?php |
| 2 | + |
| 3 | +declare(strict_types=1); |
| 4 | + |
| 5 | +namespace Djot\Converter; |
| 6 | + |
| 7 | +use RuntimeException; |
| 8 | + |
/**
 * Converts Markdown syntax to Djot syntax
 *
 * This performs a line-by-line, regex-based source-to-source transformation,
 * not parsing. It handles common Markdown patterns and converts them to their
 * Djot equivalents:
 *
 * - `**bold**` / `__bold__`              => `*bold*`
 * - `*italic*`                           => `_italic_`
 * - `***both***` / `___both___`          => `*_both_*`
 * - `~~strike~~`                         => `{-strike-}`
 * - `==highlight==`                      => `{=highlight=}`
 * - `^super^` / `~sub~`                  => `{^super^}` / `{~sub~}`
 *
 * Fenced code blocks, inline code spans, link destinations, autolinks,
 * thematic breaks and already-Djot `{-…-}`-style spans are passed through
 * untouched. Being a heuristic, it cannot be perfect on every input.
 */
class MarkdownToDjot
{
    /**
     * Convert Markdown text to Djot text
     *
     * The input is split on "\n" and processed one line at a time; lines inside
     * fenced code blocks are copied verbatim. An unterminated fence keeps the
     * remainder of the document verbatim.
     *
     * @param string $markdown Markdown source
     *
     * @return string Djot source with the same number of lines as the input
     */
    public function convert(string $markdown): string
    {
        $result = [];
        $fenceChar = '';
        $fenceLength = 0; // 0 means "not inside a fenced code block"

        foreach (explode("\n", $markdown) as $line) {
            if ($fenceLength > 0) {
                if ($this->isClosingFence($line, $fenceChar, $fenceLength)) {
                    $fenceChar = '';
                    $fenceLength = 0;
                }
                $result[] = $line;

                continue;
            }

            $fence = $this->matchOpeningFence($line);
            if ($fence !== null) {
                [$fenceChar, $fenceLength] = $fence;
                $result[] = $line;

                continue;
            }

            // Thematic breaks such as "* * *" contain no inline content; running
            // the emphasis rules over them would only corrupt them.
            $result[] = $this->isThematicBreak($line)
                ? $line
                : $this->convertInlineFormatting($line);
        }

        return implode("\n", $result);
    }

    /**
     * Convert inline Markdown formatting on a single line to Djot
     *
     * Order matters: verbatim regions are hidden first, strong spans are
     * converted and stashed so the single-asterisk emphasis rule cannot
     * re-match the `*strong*` they produce, then the remaining span types are
     * converted and everything hidden is put back.
     *
     * @param string $line One line of Markdown (outside any fenced code block)
     *
     * @return string The line with inline formatting rewritten as Djot
     */
    protected function convertInlineFormatting(string $line): string
    {
        $protected = [];
        $protect = function (array $match) use (&$protected): string {
            $placeholder = "\x00PROTECTED" . count($protected) . "\x00";
            $protected[$placeholder] = $match[0];

            return $placeholder;
        };

        $verbatimPatterns = [
            // Inline code spans, including multi-backtick delimiters (``a ` b``)
            '/(?<!`)(`+)(?!`).+?(?<!`)\1(?!`)/',
            // Link / image destinations: URLs may legitimately contain __, == or ~
            '/\]\([^)]*\)/',
            // Autolinks such as <https://example.com/a__b>
            '/<[A-Za-z][A-Za-z0-9+.-]*:[^<>\s]*>/',
            // Existing Djot spans: {-text-}, {+text+}, {=text=}, {^text^}, {~text~}
            '/\{[-+=^~][^}]+[-+=^~]\}/',
        ];
        foreach ($verbatimPatterns as $pattern) {
            $line = preg_replace_callback($pattern, $protect, $line) ?? $line;
        }

        $strong = [];
        $stash = function (string $djot) use (&$strong): string {
            $placeholder = "\x00STRONG" . count($strong) . "\x00";
            $strong[$placeholder] = $djot;

            return $placeholder;
        };

        // Delimiters must hug their content (no whitespace just inside them),
        // otherwise list bullets and arithmetic such as "2 ** 3" would match.

        // ___bold italic___ => *_bold italic_*
        $line = preg_replace_callback(
            '/___(?!\s)(.+?)(?<!\s)___/',
            function (array $match) use ($stash): string {
                return $stash('*_' . $match[1] . '_*');
            },
            $line
        ) ?? $line;

        // ***bold italic*** => *_bold italic_* (3+ asterisks on each side)
        $line = preg_replace_callback(
            '/(\*{3,})(?!\s)(.+?)(?<!\s)(\*{3,})/',
            function (array $match) use ($stash): string {
                return $stash('*_' . $match[2] . '_*');
            },
            $line
        ) ?? $line;

        // **bold** => *bold*, converting any *italic* inside to _italic_
        $line = preg_replace_callback(
            '/\*\*(?![\s*])(.+?)(?<!\s)\*\*/',
            function (array $match) use ($stash): string {
                return $stash('*' . $this->convertEmphasis($match[1]) . '*');
            },
            $line
        ) ?? $line;

        // __bold__ => *bold*, converting any *italic* inside to _italic_
        $line = preg_replace_callback(
            '/__(?![\s_])(.+?)(?<!\s)__/',
            function (array $match) use ($stash): string {
                return $stash('*' . $this->convertEmphasis($match[1]) . '*');
            },
            $line
        ) ?? $line;

        // *italic* => _italic_ on whatever is left outside the stashed spans
        $line = $this->convertEmphasis($line);

        // Bring the strong spans back before the remaining rules run so that
        // e.g. ~~strike~~ nested inside **bold** is converted as well.
        $line = $this->restorePlaceholders($line, $strong);

        // ~~strikethrough~~ => {-strikethrough-} (Djot delete)
        $line = preg_replace('/~~([^~]+)~~/', '{-$1-}', $line) ?? $line;

        // ==highlight== => {=highlight=} (Djot highlight)
        $line = preg_replace('/==([^=]+)==/', '{=$1=}', $line) ?? $line;

        // ^superscript^ => {^superscript^}, unless already wrapped in braces
        $line = preg_replace('/(?<!\{)\^([^^]+)\^(?!\})/', '{^$1^}', $line) ?? $line;

        // ~subscript~ => {~subscript~}; single tildes only, not already braced
        $line = preg_replace('/(?<![~{])~([^~}]+)~(?![~}])/', '{~$1~}', $line) ?? $line;

        return $this->restorePlaceholders($line, $protected);
    }

    /**
     * Convert a Markdown file to Djot
     *
     * @param string $inputPath Path of the Markdown file to read
     *
     * @throws \RuntimeException If file cannot be read
     *
     * @return string The converted Djot text
     */
    public function convertFile(string $inputPath): string
    {
        if (!is_file($inputPath)) {
            throw new RuntimeException("File not found: {$inputPath}");
        }

        $content = file_get_contents($inputPath);
        if ($content === false) {
            throw new RuntimeException("Failed to read file: {$inputPath}");
        }

        return $this->convert($content);
    }

    /**
     * Convert a Markdown file and save as Djot
     *
     * When no output path is given, a trailing ".md" (case-insensitive) is
     * replaced by ".djot"; any other input name gets ".djot" appended.
     *
     * @param string $inputPath Path of the Markdown file to read
     * @param string|null $outputPath Destination path, or null to derive it from $inputPath
     *
     * @throws \RuntimeException If file cannot be read or written
     */
    public function convertFileAndSave(string $inputPath, ?string $outputPath = null): void
    {
        $djot = $this->convertFile($inputPath);

        if ($outputPath === null) {
            $outputPath = preg_replace('/\.md$/i', '.djot', $inputPath) ?? $inputPath;
            if ($outputPath === $inputPath) {
                // No ".md" suffix to swap: never overwrite the input file
                $outputPath .= '.djot';
            }
        }

        if (file_put_contents($outputPath, $djot) === false) {
            throw new RuntimeException("Failed to write file: {$outputPath}");
        }
    }

    /**
     * Detect the opening line of a fenced code block
     *
     * Leading spaces/tabs are allowed so fences nested in list items are found.
     * A backtick fence whose info string contains a backtick is not a fence
     * (it is an inline code span that happens to start the line).
     *
     * @return array{0: string, 1: int}|null Fence character and fence length, or null
     */
    private function matchOpeningFence(string $line): ?array
    {
        if (preg_match('/^[ \t]*(`{3,}|~{3,})(.*)/', $line, $matches) !== 1) {
            return null;
        }

        $fenceChar = $matches[1][0];
        if ($fenceChar === '`' && strpos($matches[2], '`') !== false) {
            return null;
        }

        return [$fenceChar, strlen($matches[1])];
    }

    /**
     * Check whether a line closes the currently open fenced code block
     *
     * The closing fence must use the same character as the opener and be at
     * least as long, so a shorter fence inside the block is ordinary content.
     */
    private function isClosingFence(string $line, string $fenceChar, int $fenceLength): bool
    {
        $pattern = '/^[ \t]*' . preg_quote($fenceChar, '/') . '{' . $fenceLength . ',}\s*$/';

        return preg_match($pattern, $line) === 1;
    }

    /**
     * Check whether a line is a Markdown thematic break
     *
     * That is three or more of the same `-`, `*` or `_` character, optionally
     * separated by spaces or tabs, and nothing else.
     */
    private function isThematicBreak(string $line): bool
    {
        return preg_match('/^[ \t]*([-*_])(?:[ \t]*\1){2,}[ \t]*\r?$/', $line) === 1;
    }

    /**
     * Convert Markdown `*italic*` to Djot `_italic_`
     *
     * Only single asterisks are considered, and each delimiter must touch its
     * content, so `* list item` and `2 * 3 * 4` are left alone.
     */
    private function convertEmphasis(string $text): string
    {
        return preg_replace('/(?<!\*)\*(?!\s)([^*]+?)(?<!\s)\*(?!\*)/', '_$1_', $text) ?? $text;
    }

    /**
     * Substitute stashed content back for its placeholders
     *
     * A placeholder's content can only contain placeholders created before it,
     * so restoring newest-first guarantees nested placeholders are revealed
     * before their own turn comes.
     *
     * @param array<string, string> $placeholders Map of placeholder => original content, in creation order
     */
    private function restorePlaceholders(string $line, array $placeholders): string
    {
        foreach (array_reverse($placeholders, true) as $placeholder => $content) {
            $line = str_replace($placeholder, $content, $line);
        }

        return $line;
    }
}
0 commit comments