Skip to content

Commit 35e92fd

Browse files
committed
test: Add comprehensive tests for RemoveComments functionality, ensuring proper handling of comments, regex, and template literals
1 parent 9b43d6a commit 35e92fd

File tree

3 files changed

+609
-67
lines changed

3 files changed

+609
-67
lines changed

src/Middleware/RemoveComments.php

Lines changed: 202 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -92,15 +92,9 @@ protected function removeCommentsFromTag($tag)
9292
$content = $matches[2];
9393
$closingTag = $matches[3];
9494

95-
// Split content by lines and process each line
96-
$lines = preg_split('/\r\n|\r|\n/', $content);
97-
$processedLines = [];
98-
99-
foreach ($lines as $line) {
100-
$processedLines[] = $this->removeSingleLineCommentFromLine($line);
101-
}
102-
103-
$processedContent = implode($lineEnding, $processedLines);
95+
// Process the whole content at once (supports multi-line template
96+
// literals and complex regex literals) for correctness and performance.
97+
$processedContent = $this->removeSingleLineCommentsFromContent($content);
10498

10599
// Reconstruct the tag with processed content
106100
return $openingTag . $processedContent . $closingTag;
@@ -120,82 +114,223 @@ protected function removeCommentsFromTag($tag)
120114
*/
121115
protected function removeSingleLineCommentFromLine($line)
{
    // Fast path: a line without "//" cannot contain a single-line comment.
    if (strpos($line, '//') === false) {
        return $line;
    }

    // When the line has no string/template delimiters and no escaped slash
    // (the usual sign of a regex literal such as /http:\/\//), a plain
    // substring scan is enough and avoids the heavier PCRE pass below.
    $hasStringDelimiter = strpbrk($line, "\"'`") !== false;
    $hasEscapedSlash = strpos($line, '\\/') !== false;

    if (!$hasStringDelimiter && !$hasEscapedSlash) {
        $offset = 0;

        while (($pos = strpos($line, '//', $offset)) !== false) {
            // "://" is assumed to be a protocol separator (http://...),
            // so it is skipped rather than treated as a comment start.
            if ($pos > 0 && $line[$pos - 1] === ':') {
                $offset = $pos + 2;
                continue;
            }

            // Everything from the first real "//" onwards is the comment.
            return substr($line, 0, $pos);
        }

        return $line;
    }

    // Constructs whose content must never be inspected for "//". Each one
    // tolerates backslash escapes, including an escaped backtick inside a
    // template literal (previously the literal was cut short at "\`" and a
    // "//" later in the same literal was stripped as a comment).
    $protected = implode('|', [
        '"(?:\\\\.|[^"\\\\])*"',             // double-quoted string
        "'(?:\\\\.|[^'\\\\])*'",             // single-quoted string
        '`(?:\\\\.|[^`\\\\])*`',             // template literal on one line
        '\/(?:\\\\.|[^\/\\\\])+\/[a-zA-Z]*', // regex literal with optional flags
    ]);

    // (*SKIP)(*F) discards a protected construct and resumes matching after
    // it; the second branch matches a "//" comment not preceded by ":".
    // The pattern is pure ASCII, so the "u" modifier is deliberately omitted:
    // with it, a line that is not valid UTF-8 made preg_replace() return null
    // and the comment was silently left in place.
    $pattern = '/(?:' . $protected . ')(*SKIP)(*F)|(?<!:)\/\/[^\r\n]*/s';

    $result = preg_replace($pattern, '', $line);

    // preg_replace() returns null on a PCRE error (e.g. backtrack limit);
    // in that case the line is returned untouched rather than damaged.
    return $result === null ? $line : $result;
}
162+
163+
/**
 * Remove // comments from full content (possibly multi-line) while preserving
 * strings, template literals, regex literals and block comments.
 *
 * The content is scanned once, character by character, without splitting it
 * into lines, so multi-line template literals (backticks) stay intact.
 * Line terminators following a removed comment are kept, so the number of
 * lines does not change.
 *
 * Heuristics applied by the scanner:
 * - "//" directly preceded by ":" is kept (assumed to be a URL such as http://).
 * - "/" starts a regex literal only at the start of the content or after one
 *   of a fixed set of operator/punctuation characters; otherwise it is
 *   treated as a division operator.
 * - Block comments are copied through verbatim; this method does not remove them.
 * - Quoted strings and regex literals cannot span a raw line break, so an
 *   unterminated one is closed at the end of its line instead of swallowing
 *   the rest of the content.
 *
 * @param string $content
 * @return string
 */
protected function removeSingleLineCommentsFromContent($content)
{
    $length = strlen($content);
    $out = '';

    // Scanner state; at most one of the first four is active at a time.
    $inSingle = false;
    $inDouble = false;
    $inBacktick = false;
    $inRegex = false;
    $inRegexCharClass = false;

    // A "/" following one of these (ignoring whitespace) opens a regex literal.
    $regexPrefixChars = ['=', '(', '[', ',', ':', '?', '!', '{', '}', ';', '+', '-', '*', '/', '%'];
    $regexFlagChars = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ';

    for ($i = 0; $i < $length; $i++) {
        $char = $content[$i];
        $next = $i + 1 < $length ? $content[$i + 1] : '';

        // A backslash escapes the next character in every state, so the pair
        // is copied as-is and never inspected as a delimiter.
        if ($char === '\\') {
            $out .= $char;
            if ($next !== '') {
                $out .= $next;
                $i++;
                // A CRLF line continuation is one logical escape: keep the
                // "\n" with it so the string is not reset on the next step.
                if ($next === "\r" && $i + 1 < $length && $content[$i + 1] === "\n") {
                    $out .= "\n";
                    $i++;
                }
            }
            continue;
        }

        $isLineBreak = $char === "\n" || $char === "\r";

        if ($inSingle) {
            // Closing quote, or a raw line break ending an unterminated string.
            if ($char === "'" || $isLineBreak) {
                $inSingle = false;
            }
            $out .= $char;
            continue;
        }

        if ($inDouble) {
            if ($char === '"' || $isLineBreak) {
                $inDouble = false;
            }
            $out .= $char;
            continue;
        }

        if ($inBacktick) {
            // Template literals may legitimately span lines.
            if ($char === '`') {
                $inBacktick = false;
            }
            $out .= $char;
            continue;
        }

        if ($inRegex) {
            if ($isLineBreak) {
                // Regex literals cannot contain a line break: the "/" that
                // opened this one was misclassified, so recover here.
                $inRegex = false;
                $inRegexCharClass = false;
            } elseif ($inRegexCharClass) {
                // Inside [...] a "/" does not terminate the literal.
                if ($char === ']') {
                    $inRegexCharClass = false;
                }
            } elseif ($char === '[') {
                $inRegexCharClass = true;
            } elseif ($char === '/') {
                $inRegex = false;
                $out .= $char;
                // Copy trailing flags (g, i, m, ...) in one step.
                $flagLength = strspn($content, $regexFlagChars, $i + 1);
                if ($flagLength > 0) {
                    $out .= substr($content, $i + 1, $flagLength);
                    $i += $flagLength;
                }
                continue;
            }

            $out .= $char;
            continue;
        }

        // From here on we are in plain code context.

        if ($char === "'") {
            $inSingle = true;
            $out .= $char;
            continue;
        }

        if ($char === '"') {
            $inDouble = true;
            $out .= $char;
            continue;
        }

        if ($char === '`') {
            $inBacktick = true;
            $out .= $char;
            continue;
        }

        // Block comment: copy verbatim up to and including "*/" (or to the
        // end of the content if unterminated) so that slashes, quotes or
        // "//" inside it cannot desynchronise the scanner.
        if ($char === '/' && $next === '*') {
            $end = strpos($content, '*/', $i + 2);
            $stop = $end === false ? $length : $end + 2;
            $out .= substr($content, $i, $stop - $i);
            $i = $stop - 1;
            continue;
        }

        if ($char === '/' && $next === '/') {
            $prevChar = $out === '' ? '' : $out[strlen($out) - 1];

            if ($prevChar === ':') {
                // URL-like "://": keep BOTH slashes. Consuming only the first
                // one let the second be mistaken for the start of a regex.
                $out .= '//';
                $i++;
                continue;
            }

            // Drop the comment text; stop just before the line terminator so
            // the next iteration copies it and the line structure is kept.
            $i += 1 + strcspn($content, "\r\n", $i + 2);
            continue;
        }

        if ($char === '/') {
            // Decide regex vs. division from the last non-whitespace
            // character emitted so far.
            $k = strlen($out) - 1;
            while ($k >= 0 && ctype_space($out[$k])) {
                $k--;
            }
            $prevSignificant = $k >= 0 ? $out[$k] : '';

            if ($prevSignificant === '' || in_array($prevSignificant, $regexPrefixChars, true)) {
                $inRegex = true;
            }
            // Otherwise it is a division operator and is copied like any
            // other character.
        }

        $out .= $char;
    }

    return $out;
}
201336
}

0 commit comments

Comments
 (0)