Skip to content

Commit 3fa0662

Browse files
committed
fix: fix URL parsing in docstrings
Fixes #95.

- Restrict the _RE_ARGSTART pattern to valid Python identifiers only.
- Add heuristics to detect non-argument lines containing URLs.
- Prevent lines with URLs from being incorrectly parsed as arguments.

This fixes the issue where URLs in docstrings were being split and wrapped with incorrect HTML tags.
1 parent fc1b6fe commit 3fa0662

File tree

1 file changed

+36
-5
lines changed

1 file changed

+36
-5
lines changed

src/lazydocs/generation.py

Lines changed: 36 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -33,7 +33,8 @@
3333
)
3434

3535
_RE_TYPED_ARGSTART = re.compile(r"^([\w\[\]_]{1,}?)[ ]*?\((.*?)\):[ ]+(.{2,})", re.IGNORECASE)
36-
_RE_ARGSTART = re.compile(r"^(.+):[ ]+(.{2,})$", re.IGNORECASE)
36+
# Restrict to valid Python identifier-like patterns to avoid matching URLs
37+
_RE_ARGSTART = re.compile(r"^([\w\[\]_]+):[ ]+(.{2,})$", re.IGNORECASE)
3738

3839
_RE_CODE_TEXT = re.compile(r"^```[\w\-\.]*[ ]*$", re.IGNORECASE)
3940

@@ -583,11 +584,41 @@ def _lines_isvalid(lines: list, start_index: int, blockindent: int,
583584
argindent = indent
584585
elif arg_list and not literal_block and _RE_ARGSTART.match(line):
585586
# start of an exception-type block
586-
out.append(
587-
"- "
588-
+ _RE_ARGSTART.sub(r"<b>`\1`</b>: \2", line)
587+
# Check if this looks like a URL being incorrectly parsed
588+
match = _RE_ARGSTART.match(line)
589+
# Check if the part before the colon contains URL indicators or
590+
# is likely descriptive text rather than an argument name
591+
before_colon = match.group(1) if match else ""
592+
after_colon = match.group(2) if match else ""
593+
594+
# Heuristics to detect non-argument lines:
595+
# 1. The text before colon contains "http" (part of a URL)
596+
# 2. The line contains "://" (URL protocol)
597+
# 3. The text before colon is too long to be an argument name (>40 chars)
598+
# 4. The text before colon contains common English words that aren't argument names
599+
is_not_argument = (
600+
"http" in before_colon.lower() or
601+
"://" in line or
602+
len(before_colon) > 40 or
603+
# Check for common descriptive phrases (without trailing space)
604+
any(word in before_colon.lower() for word in ["see", "to find", "refer", "documentation", "available"])
589605
)
590-
argindent = indent
606+
607+
if match and is_not_argument:
608+
# This is likely descriptive text with a colon, not an argument
609+
# Treat it as regular text continuation
610+
if argindent > 0:
611+
padding = max(indent - argindent + offset, 0)
612+
out.append(" " * padding + line.replace("\n", "\n" + " " * padding))
613+
else:
614+
out.append(line)
615+
else:
616+
# This is a real argument
617+
out.append(
618+
"- "
619+
+ _RE_ARGSTART.sub(r"<b>`\1`</b>: \2", line)
620+
)
621+
argindent = indent
591622
elif indent > argindent:
592623
# attach docs text of argument
593624
# * (blockindent + 2)

0 commit comments

Comments
 (0)