Commit 2399830 (v0.1.1)

Added concurrency for improved performance, added timeouts (timed-out links are counted as bad links), fixed the imports, added docstrings following best-practice guidance, and updated README.md.

Parent commit: 7253551

File tree: 7 files changed (+520, -46 lines)

README.md

Lines changed: 3 additions & 3 deletions
```diff
@@ -67,9 +67,9 @@ MIT License - see LICENSE file for details.
 
 ## Changelog
 
-### v0.1.0
-- Initial release
+### v0.1.1
 - Basic link checking functionality
 - Dead link removal option
 - CLI interface with Click
-- GitHub Actions support
+- Concurrency for improved performance
+- Link timeouts
```

pyproject.toml

Lines changed: 1 addition & 1 deletion
```diff
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "link-sweep"
-version = "0.1.0"
+version = "0.1.1"
 description = "A tool for checking and cleaning dead links in Markdown files for SSGs"
 readme = "README.md"
 requires-python = ">=3.8"
```

src/link_sweep/__init__.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,3 +1,3 @@
 """A fast, reliable tool for checking and cleaning dead links in Markdown files."""
 
-__version__ = "0.1.0"
+__version__ = "0.1.1"
```

src/link_sweep/cli.py

Lines changed: 27 additions & 6 deletions
```diff
@@ -1,11 +1,17 @@
-import click
+"""Command line interface for Link Sweep."""
+
+import asyncio
 import logging
-from link_checker import LinkChecker
 import sys
 
+import click
+
+from .link_checker import LinkChecker
+
 
 @click.group()
 def main():
+    """Link Sweep CLI - A tool for checking and cleaning dead links in Markdown files."""  # noqa: E501
     pass
 
 
@@ -14,8 +20,15 @@ def main():
 @click.option(
     "--remove-dead", "-rmd", is_flag=True, help="Remove dead links from the source"
 )
+@click.option(
+    "--timeout",
+    "-t",
+    default=10.0,
+    type=float,
+    help="Timeout in seconds for HTTP requests (default: 10.0)",
+)
 @click.argument("directory", default="content/")  # Default for most SSGs
-def check_links(verbose, remove_dead, directory):
+def check_links(verbose, remove_dead, timeout, directory):
     """Check for dead links in the provided source."""
 
     # Set up logging - minimal for normal mode, detailed for verbose
@@ -36,22 +49,30 @@ def check_links(verbose, remove_dead, directory):
 
     try:
         click.echo(f"🔍 Checking links in: {directory}")
-        checker = LinkChecker(directory)
+        click.echo(f"⏱️ Using timeout: {timeout} seconds")
+
+        checker = LinkChecker(directory, timeout=timeout)
+
+        # Run the async check_links method
+        asyncio.run(checker.check_links())
 
-        checker.check_links()
         total_links = len(checker.links)
         bad_links_count = len(checker.bad_links)
+        timed_out_count = len(checker.timed_out_links)
         good_links_count = total_links - bad_links_count
 
         click.echo("\n📊 Results:")
         click.echo(f" Total links checked: {total_links}")
         click.echo(f" ✅ Good links: {good_links_count}")
         click.echo(f" ❌ Bad links: {bad_links_count}")
+        if timed_out_count > 0:
+            click.echo(f" ⏰ Timed out links: {timed_out_count}")
 
         if bad_links_count > 0:
             click.echo("\n💥 Bad links found:")
             for link in checker.bad_links:
-                click.echo(f" - {link}")
+                status_icon = "⏰" if link in checker.timed_out_links else "❌"
+                click.echo(f" {status_icon} {link}")
 
         if remove_dead:
             click.echo(f"\n🔧 Removing {bad_links_count} bad links...")
```
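
A quick way to exercise the new option is Click's built-in test runner. This is a minimal sketch, assuming `check_links` is registered as a command on the `main` group (the registration decorator sits outside these hunks) and that a `content/` directory with markdown files exists:

```python
from click.testing import CliRunner

from link_sweep.cli import check_links

runner = CliRunner()
# Exercise the new --timeout option alongside the existing directory argument
result = runner.invoke(check_links, ["--timeout", "5", "content/"])
print(result.output)  # 📊 Results summary, with ⏰ markers for timed-out links
```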

src/link_sweep/link_checker.py

Lines changed: 57 additions & 21 deletions
```diff
@@ -1,12 +1,21 @@
-import httpx
-import markdown_parser
+"""Asynchronous link checker module for markdown files."""
+
+import asyncio
 import logging
 
+import httpx
+
+from . import markdown_parser
+
 
 class LinkChecker:
-    def __init__(self, directory):
+    """Asynchronous link checker for markdown files."""
+
+    def __init__(self, directory, timeout=10.0):
+        """Initialize LinkChecker with directory path and timeout settings."""
         self.logger = logging.getLogger(__name__)
         self.logger.info(f"Initializing LinkChecker for {directory}")
+        self.timeout = timeout
 
         self.pages = markdown_parser.get_files(directory)  # content/
         self.links_data = markdown_parser.get_md_links(self.pages)
@@ -17,32 +26,59 @@ def __init__(self, directory):
         self.links = [link["url"] for link in self.links_data]
 
         self.bad_links = []
+        self.timed_out_links = []
         self._checked = False
 
-    def check_links(self):
-        self.logger.info(f"Starting link check for {len(self.links)} links")
-
-        for i, link in enumerate(self.links):
-            self.logger.debug(f"Checking link {i + 1}/{len(self.links)}: {link}")
-            try:
-                response = httpx.get(link, timeout=10.0)
-                if response.status_code >= 400:
-                    self.logger.warning(
-                        f"Dead link found: {link} (status: {response.status_code})"
-                    )
-                    self.bad_links.append(link)
+    async def _check_single_link(self, session, link):
+        """Check a single link asynchronously."""
+        try:
+            response = await session.get(link, timeout=self.timeout)
+            if response.status_code >= 400:
+                self.logger.warning(
+                    f"Dead link found: {link} (status: {response.status_code})"
+                )
+                return link, "dead"
+            else:
+                self.logger.debug(f"Link OK: {link} (status: {response.status_code})")
+                return link, "ok"
+        except (httpx.TimeoutException, asyncio.TimeoutError):
+            self.logger.warning(f"Link timed out: {link}")
+            return link, "timeout"
+        except Exception as e:
+            self.logger.error(f"Error checking {link}: {e}")
+            return link, "error"
+
+    async def check_links(self):
+        """Check all links asynchronously."""
+        self.logger.info(f"Starting async link check for {len(self.links)} links")
+
+        async with httpx.AsyncClient() as session:
+            # Create tasks for all links
+            tasks = [self._check_single_link(session, link) for link in self.links]
+
+            # Execute all tasks concurrently
+            results = await asyncio.gather(*tasks, return_exceptions=True)
+
+            # Process results
+            for i, result in enumerate(results):
+                if isinstance(result, BaseException):
+                    self.logger.error(f"Unexpected error for {self.links[i]}: {result}")
+                    self.bad_links.append(self.links[i])
                 else:
-                    self.logger.debug(
-                        f"Link OK: {link} (status: {response.status_code})"
-                    )
-            except Exception as e:
-                self.logger.error(f"Error checking {link}: {e}")
+                    link, status = result
+                    if status in ["dead", "error"]:
+                        self.bad_links.append(link)
+                    elif status == "timeout":
+                        self.bad_links.append(link)
+                        self.timed_out_links.append(link)
 
         self._checked = True
 
     def remove_bad_links(self):
+        """Remove bad links from markdown files by replacing them with their text content."""  # noqa: E501
         if not self._checked:
-            self.check_links()
+            # Run the async method synchronously if not already checked
+            asyncio.run(self.check_links())
 
         if not self.bad_links:
             self.logger.info("No bad links to remove")
```
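
The speedup comes from sharing one `httpx.AsyncClient` across all requests and awaiting them together via `asyncio.gather(..., return_exceptions=True)`, so a single failing URL cannot abort the whole batch. A minimal sketch of driving the class directly (assuming the package is installed and a `content/` directory exists):

```python
import asyncio

from link_sweep.link_checker import LinkChecker

# timeout is passed through to each request via session.get(link, timeout=...)
checker = LinkChecker("content/", timeout=5.0)
asyncio.run(checker.check_links())  # all links are checked concurrently

print("bad:", checker.bad_links)
print("timed out:", checker.timed_out_links)  # also counted in bad_links
```

Timed-out links land in both lists, which is what the commit message means by timeouts being counted as bad links.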

src/link_sweep/markdown_parser.py

Lines changed: 6 additions & 3 deletions
```diff
@@ -1,9 +1,11 @@
+"""Markdown file parsing utilities for link extraction and replacement."""
+
 from pathlib import Path
 import re
 
 
 def get_files(directory):
-    # Simple recursive search for .md files in target directory
+    """Simple recursive search for .md files in target directory."""
     p = Path(directory)
     md_files = []
     for file in p.rglob("*.md"):
@@ -14,7 +16,7 @@ def get_files(directory):
 
 
 def get_md_links(md_file_path: list):
-    # Creates two groups out of markdown links
+    """Creates two groups out of markdown links."""
     pattern = r'\[([^\]]+)\]\((https?:\/\/[^\s\)"]+)\)'
     links = []
 
@@ -33,6 +35,7 @@ def get_md_links(md_file_path: list):
 
 
 def replace_link(md_file_path, bad_links):
+    """Replace bad links with their text content in markdown files."""
    try:
         with open(md_file_path, "r", encoding="utf-8") as f:
             content = f.read()
@@ -51,4 +54,4 @@ def replace_link(md_file_path, bad_links):
 
     except Exception as e:
         print(f"Error updating {md_file_path}: {e}")
-        return False
+        return False
```
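
The two capture groups are what make `replace_link` possible: group 1 is the link text and group 2 is the URL, so a dead link can be collapsed to its text. A small illustration with a hypothetical input string:

```python
import re

# Same pattern as get_md_links: group 1 = link text, group 2 = http(s) URL
pattern = r'\[([^\]]+)\]\((https?:\/\/[^\s\)"]+)\)'
sample = "See [the docs](https://example.com/docs) and [old page](http://old.example.com)."

for text, url in re.findall(pattern, sample):
    print(text, "->", url)
# the docs -> https://example.com/docs
# old page -> http://old.example.com
```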
