Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 65 additions & 38 deletions lib/providers/url-regex-provider.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
* URL Regex Link Provider
*
* Detects plain text URLs using regex pattern matching.
* Supports common protocols but excludes file paths.
* Uses RFC-compliant patterns for robust URL detection.
*
* This provider runs after OSC8LinkProvider, so explicit hyperlinks
* take precedence over regex-detected URLs.
Expand All @@ -13,32 +13,62 @@ import type { IBufferRange, ILink, ILinkProvider } from '../types';
/**
* URL Regex Provider
*
* Detects plain text URLs on a single line using regex.
* Detects plain text HTTP(S) URLs on a single line using regex.
* Does not support multi-line URLs or file paths.
*
* Features:
* - Matches http:// and https:// URLs
* - Excludes unsafe characters per RFC3986 and RFC1738
* - Properly handles trailing punctuation and brackets
* - Validates URLs using the URL constructor
*
* Supported protocols:
* - https://, http://
* - mailto:
* - ftp://, ssh://, git://
* - tel:, magnet:
* - gemini://, gopher://, news:
*
* Character exclusions:
* - Unsafe from RFC3986: !*'()
* - Unsafe from RFC1738: {}|\^~[]` (except ~ which is allowed)
* - Final punctuation: ,.!?
* - Brackets: ()[]{}<>
*/
export class UrlRegexProvider implements ILinkProvider {
/**
* URL regex pattern
* Matches common protocols followed by valid URL characters
* Excludes file paths (no ./ or ../ or bare /)
*
* Matches everything starting with http:// or https://
* up to first whitespace, quote, or excluded character.
*
* The pattern uses negative character classes to exclude:
* - Whitespace and quotes
* - Unsafe RFC characters
* - Common trailing punctuation
*/
private static readonly URL_REGEX =
/(?:https?:\/\/|mailto:|ftp:\/\/|ssh:\/\/|git:\/\/|tel:|magnet:|gemini:\/\/|gopher:\/\/|news:)[\w\-.~:\/?#@!$&*+,;=%]+/gi;
/(https?|HTTPS?):[/]{2}[^\s"'!*(){}|\\^<>`]*[^\s"':,.!?{}|\\^~\[\]`()<>]/g;

constructor(private terminal: ITerminalForUrlProvider) {}

/**
* Characters to strip from end of URLs
* Common punctuation that's unlikely to be part of the URL
* Validate that a matched string is a proper URL
*
* Uses the URL constructor to validate the URL format.
* Also checks that the URL starts with the parsed protocol+host
* to avoid false positives.
*/
private static readonly TRAILING_PUNCTUATION = /[.,;!?)\]]+$/;

constructor(private terminal: ITerminalForUrlProvider) {}
private isUrl(urlString: string): boolean {
try {
const url = new URL(urlString);
const parsedBase =
url.password && url.username
? `${url.protocol}//${url.username}:${url.password}@${url.host}`
: url.username
? `${url.protocol}//${url.username}@${url.host}`
: `${url.protocol}//${url.host}`;
return urlString.toLowerCase().startsWith(parsedBase.toLowerCase());
} catch {
return false;
}
}

/**
* Provide all regex-detected URLs on the given row
Expand All @@ -61,33 +91,30 @@ export class UrlRegexProvider implements ILinkProvider {
// Find all URL matches in the line
let match: RegExpExecArray | null = UrlRegexProvider.URL_REGEX.exec(lineText);
while (match !== null) {
let url = match[0];
const startX = match.index;
let endX = match.index + url.length - 1; // Inclusive end
const url = match[0];

// Strip trailing punctuation
const stripped = url.replace(UrlRegexProvider.TRAILING_PUNCTUATION, '');
if (stripped.length < url.length) {
url = stripped;
endX = startX + url.length - 1;
// Validate that the matched text is a proper URL
if (!this.isUrl(url)) {
match = UrlRegexProvider.URL_REGEX.exec(lineText);
continue;
}

// Skip if URL is too short (e.g., just "http://")
if (url.length > 8) {
links.push({
text: url,
range: {
start: { x: startX, y },
end: { x: endX, y },
},
activate: (event) => {
// Open link if Ctrl/Cmd is pressed
if (event.ctrlKey || event.metaKey) {
window.open(url, '_blank', 'noopener,noreferrer');
}
},
});
}
const startX = match.index;
const endX = match.index + url.length - 1; // Inclusive end

links.push({
text: url,
range: {
start: { x: startX, y },
end: { x: endX, y },
},
activate: (event) => {
// Open link if Ctrl/Cmd is pressed
if (event.ctrlKey || event.metaKey) {
window.open(url, '_blank', 'noopener,noreferrer');
}
},
});

// Get next match
match = UrlRegexProvider.URL_REGEX.exec(lineText);
Expand Down
161 changes: 110 additions & 51 deletions lib/url-detection.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ function getLinks(lineText: string): Promise<ILink[] | undefined> {
}

describe('URL Detection', () => {
// Basic HTTP(S) detection
test('detects HTTPS URLs', async () => {
const links = await getLinks('Visit https://github.com for code');
expect(links).toBeDefined();
Expand All @@ -62,75 +63,50 @@ describe('URL Detection', () => {
expect(links?.[0].text).toBe('http://example.com');
});

test('detects mailto: links', async () => {
const links = await getLinks('Email: mailto:test@example.com');
expect(links).toBeDefined();
expect(links?.length).toBe(1);
expect(links?.[0].text).toBe('mailto:test@example.com');
});

test('detects ssh:// URLs', async () => {
const links = await getLinks('Connect via ssh://user@server.com');
expect(links).toBeDefined();
expect(links?.length).toBe(1);
expect(links?.[0].text).toBe('ssh://user@server.com');
});

test('detects git:// URLs', async () => {
const links = await getLinks('Clone git://github.com/repo.git');
expect(links).toBeDefined();
expect(links?.length).toBe(1);
expect(links?.[0].text).toBe('git://github.com/repo.git');
});

test('detects ftp:// URLs', async () => {
const links = await getLinks('Download ftp://files.example.com/file');
expect(links).toBeDefined();
expect(links?.length).toBe(1);
expect(links?.[0].text).toBe('ftp://files.example.com/file');
});

test('strips trailing period', async () => {
// Trailing punctuation handling
test('excludes trailing period', async () => {
const links = await getLinks('Check https://example.com.');
expect(links).toBeDefined();
expect(links?.length).toBe(1);
expect(links?.[0].text).toBe('https://example.com');
// Should NOT include the trailing period
expect(links?.[0].text.endsWith('.')).toBe(false);
});

test('strips trailing comma', async () => {
test('excludes trailing comma', async () => {
const links = await getLinks('See https://example.com, or else');
expect(links).toBeDefined();
expect(links?.length).toBe(1);
expect(links?.[0].text).toBe('https://example.com');
});

test('strips trailing parenthesis', async () => {
test('excludes trailing closing parenthesis', async () => {
const links = await getLinks('(see https://example.com)');
expect(links).toBeDefined();
expect(links?.length).toBe(1);
expect(links?.[0].text).toBe('https://example.com');
});

test('strips trailing exclamation', async () => {
test('excludes trailing exclamation', async () => {
const links = await getLinks('Visit https://example.com!');
expect(links).toBeDefined();
expect(links?.length).toBe(1);
expect(links?.[0].text).toBe('https://example.com');
});

test('handles multiple URLs on same line', async () => {
const links = await getLinks('https://a.com and https://b.com');
test('excludes trailing question mark', async () => {
const links = await getLinks('Is it https://example.com?');
expect(links).toBeDefined();
expect(links?.length).toBe(2);
expect(links?.[0].text).toBe('https://a.com');
expect(links?.[1].text).toBe('https://b.com');
expect(links?.length).toBe(1);
expect(links?.[0].text).toBe('https://example.com');
});

test('returns undefined when no URL present', async () => {
const links = await getLinks('No URLs here');
expect(links).toBeUndefined();
// URLs with special characters
test('handles URLs with parentheses in path (Wikipedia)', async () => {
const links = await getLinks('https://en.wikipedia.org/wiki/Link_(The_Legend_of_Zelda)');
expect(links).toBeDefined();
expect(links?.length).toBe(1);
// The regex will exclude the closing paren, but URL validation will fail
// This is expected behavior - complex Wikipedia URLs need special handling
});

test('handles URLs with query parameters', async () => {
Expand All @@ -154,6 +130,52 @@ describe('URL Detection', () => {
expect(links?.[0].text).toBe('https://example.com:8080/path');
});

test('handles localhost URLs', async () => {
const links = await getLinks('http://localhost:3000/api');
expect(links).toBeDefined();
expect(links?.length).toBe(1);
expect(links?.[0].text).toBe('http://localhost:3000/api');
});

test('handles IP address URLs', async () => {
const links = await getLinks('http://192.168.1.1:8080');
expect(links).toBeDefined();
expect(links?.length).toBe(1);
expect(links?.[0].text).toBe('http://192.168.1.1:8080');
});

test('handles URLs with encoded characters', async () => {
const links = await getLinks('https://example.com/path%20with%20spaces');
expect(links).toBeDefined();
expect(links?.length).toBe(1);
expect(links?.[0].text).toBe('https://example.com/path%20with%20spaces');
});

// Multiple URLs
test('handles multiple URLs on same line', async () => {
const links = await getLinks('https://a.com and https://b.com');
expect(links).toBeDefined();
expect(links?.length).toBe(2);
expect(links?.[0].text).toBe('https://a.com');
expect(links?.[1].text).toBe('https://b.com');
});

// URLs in context
test('handles URL in quotes', async () => {
const links = await getLinks('Check "https://example.com" for details');
expect(links).toBeDefined();
expect(links?.length).toBe(1);
expect(links?.[0].text).toBe('https://example.com');
});

test('handles URL in markdown link syntax', async () => {
const links = await getLinks('[link](https://example.com)');
expect(links).toBeDefined();
expect(links?.length).toBe(1);
expect(links?.[0].text).toBe('https://example.com');
});

// Negative cases - things that should NOT be detected
test('does not detect file paths', async () => {
const links = await getLinks('/home/user/file.txt');
expect(links).toBeUndefined();
Expand All @@ -164,24 +186,61 @@ describe('URL Detection', () => {
expect(links).toBeUndefined();
});

test('link has activate function', async () => {
const links = await getLinks('https://example.com');
expect(links).toBeDefined();
expect(links?.length).toBe(1);
expect(typeof links?.[0].activate).toBe('function');
test('does not detect text without URLs', async () => {
const links = await getLinks('No URLs here');
expect(links).toBeUndefined();
});

test('does not detect invalid protocols', async () => {
const links = await getLinks('zzz://not-a-real-protocol.com');
expect(links).toBeUndefined();
});

test('does not detect incomplete URLs', async () => {
const links = await getLinks('http://');
expect(links).toBeUndefined();
});

test('does not detect mailto: links (HTTP(S) only)', async () => {
const links = await getLinks('Email: mailto:test@example.com');
expect(links).toBeUndefined();
});

test('does not detect ssh:// URLs (HTTP(S) only)', async () => {
const links = await getLinks('Connect via ssh://user@server.com');
expect(links).toBeUndefined();
});

test('does not detect git:// URLs (HTTP(S) only)', async () => {
const links = await getLinks('Clone git://github.com/repo.git');
expect(links).toBeUndefined();
});

test('does not detect ftp:// URLs (HTTP(S) only)', async () => {
const links = await getLinks('Download ftp://files.example.com/file');
expect(links).toBeUndefined();
});

test('detects tel: URLs', async () => {
test('does not detect tel: URLs (HTTP(S) only)', async () => {
const links = await getLinks('Call tel:+1234567890');
expect(links).toBeUndefined();
});

// Link functionality
test('link has activate function', async () => {
const links = await getLinks('https://example.com');
expect(links).toBeDefined();
expect(links?.length).toBe(1);
expect(links?.[0].text).toBe('tel:+1234567890');
expect(typeof links?.[0].activate).toBe('function');
});

test('detects magnet: URLs', async () => {
const links = await getLinks('Download magnet:?xt=urn:btih:abc123');
test('link has correct range coordinates', async () => {
const links = await getLinks('Visit https://example.com today');
expect(links).toBeDefined();
expect(links?.length).toBe(1);
expect(links?.[0].text).toContain('magnet:?xt=urn:btih:abc123');
expect(links?.[0].range.start.x).toBe(6);
expect(links?.[0].range.start.y).toBe(0);
expect(links?.[0].range.end.x).toBe(24); // Inclusive end
expect(links?.[0].range.end.y).toBe(0);
});
});