Commit 2399830 (v0.1.1)

Added concurrency for improved performance, added timeouts (timed-out links are counted as bad links), fixed the imports, added docstrings following best-practice guidance, and updated README.md.

Parent commit: 7253551

File tree: 7 files changed (+520, -46 lines)

README.md

Lines changed: 3 additions & 3 deletions
```diff
@@ -67,9 +67,9 @@ MIT License - see LICENSE file for details.
 
 ## Changelog
 
-### v0.1.0
-- Initial release
+### v0.1.1
 - Basic link checking functionality
 - Dead link removal option
 - CLI interface with Click
-- GitHub Actions support
+- Concurrency for improved performance
+- Link timeouts
```

pyproject.toml

Lines changed: 1 addition & 1 deletion
```diff
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "link-sweep"
-version = "0.1.0"
+version = "0.1.1"
 description = "A tool for checking and cleaning dead links in Markdown files for SSGs"
 readme = "README.md"
 requires-python = ">=3.8"
```

src/link_sweep/__init__.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,3 +1,3 @@
 """A fast, reliable tool for checking and cleaning dead links in Markdown files."""
 
-__version__ = "0.1.0"
+__version__ = "0.1.1"
```

src/link_sweep/cli.py

Lines changed: 27 additions & 6 deletions
```diff
@@ -1,11 +1,17 @@
-import click
+"""Command line interface for Link Sweep."""
+
+import asyncio
 import logging
-from link_checker import LinkChecker
 import sys
 
+import click
+
+from .link_checker import LinkChecker
+
 
 @click.group()
 def main():
+    """Link Sweep CLI - A tool for checking and cleaning dead links in Markdown files."""  # noqa: E501
     pass
 
 
@@ -14,8 +20,15 @@ def main():
 @click.option(
     "--remove-dead", "-rmd", is_flag=True, help="Remove dead links from the source"
 )
+@click.option(
+    "--timeout",
+    "-t",
+    default=10.0,
+    type=float,
+    help="Timeout in seconds for HTTP requests (default: 10.0)",
+)
 @click.argument("directory", default="content/")  # Default for most SSGs
-def check_links(verbose, remove_dead, directory):
+def check_links(verbose, remove_dead, timeout, directory):
     """Check for dead links in the provided source."""
 
     # Set up logging - minimal for normal mode, detailed for verbose
@@ -36,22 +49,30 @@ def check_links(verbose, remove_dead, directory):
 
     try:
         click.echo(f"🔍 Checking links in: {directory}")
-        checker = LinkChecker(directory)
+        click.echo(f"⏱️ Using timeout: {timeout} seconds")
+
+        checker = LinkChecker(directory, timeout=timeout)
+
+        # Run the async check_links method
+        asyncio.run(checker.check_links())
 
-        checker.check_links()
         total_links = len(checker.links)
         bad_links_count = len(checker.bad_links)
+        timed_out_count = len(checker.timed_out_links)
         good_links_count = total_links - bad_links_count
 
         click.echo("\n📊 Results:")
         click.echo(f" Total links checked: {total_links}")
         click.echo(f" ✅ Good links: {good_links_count}")
         click.echo(f" ❌ Bad links: {bad_links_count}")
+        if timed_out_count > 0:
+            click.echo(f" ⏰ Timed out links: {timed_out_count}")
 
         if bad_links_count > 0:
             click.echo("\n💥 Bad links found:")
             for link in checker.bad_links:
-                click.echo(f" - {link}")
+                status_icon = "⏰" if link in checker.timed_out_links else "❌"
+                click.echo(f" {status_icon} {link}")
 
         if remove_dead:
             click.echo(f"\n🔧 Removing {bad_links_count} bad links...")
```
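
A quick way to exercise the new option is Click's built-in test runner. This is a minimal sketch, assuming `check_links` is registered as a command on the `main` group (the registration decorator sits outside these hunks) and that a `content/` directory with markdown files exists:

```python
from click.testing import CliRunner

from link_sweep.cli import check_links

runner = CliRunner()
# Exercise the new --timeout option alongside the existing directory argument
result = runner.invoke(check_links, ["--timeout", "5", "content/"])
print(result.output)  # 📊 Results summary, with ⏰ markers for timed-out links
```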

src/link_sweep/link_checker.py

Lines changed: 57 additions & 21 deletions
```diff
@@ -1,12 +1,21 @@
-import httpx
-import markdown_parser
+"""Asynchronous link checker module for markdown files."""
+
+import asyncio
 import logging
 
+import httpx
+
+from . import markdown_parser
+
 
 class LinkChecker:
-    def __init__(self, directory):
+    """Asynchronous link checker for markdown files."""
+
+    def __init__(self, directory, timeout=10.0):
+        """Initialize LinkChecker with directory path and timeout settings."""
         self.logger = logging.getLogger(__name__)
         self.logger.info(f"Initializing LinkChecker for {directory}")
+        self.timeout = timeout
 
         self.pages = markdown_parser.get_files(directory)  # content/
         self.links_data = markdown_parser.get_md_links(self.pages)
@@ -17,32 +26,59 @@ def __init__(self, directory):
         self.links = [link["url"] for link in self.links_data]
 
         self.bad_links = []
+        self.timed_out_links = []
         self._checked = False
 
-    def check_links(self):
-        self.logger.info(f"Starting link check for {len(self.links)} links")
-
-        for i, link in enumerate(self.links):
-            self.logger.debug(f"Checking link {i + 1}/{len(self.links)}: {link}")
-            try:
-                response = httpx.get(link, timeout=10.0)
-                if response.status_code >= 400:
-                    self.logger.warning(
-                        f"Dead link found: {link} (status: {response.status_code})"
-                    )
-                    self.bad_links.append(link)
+    async def _check_single_link(self, session, link):
+        """Check a single link asynchronously."""
+        try:
+            response = await session.get(link, timeout=self.timeout)
+            if response.status_code >= 400:
+                self.logger.warning(
+                    f"Dead link found: {link} (status: {response.status_code})"
+                )
+                return link, "dead"
+            else:
+                self.logger.debug(f"Link OK: {link} (status: {response.status_code})")
+                return link, "ok"
+        except (httpx.TimeoutException, asyncio.TimeoutError):
+            self.logger.warning(f"Link timed out: {link}")
+            return link, "timeout"
+        except Exception as e:
+            self.logger.error(f"Error checking {link}: {e}")
+            return link, "error"
+
+    async def check_links(self):
+        """Check all links asynchronously."""
+        self.logger.info(f"Starting async link check for {len(self.links)} links")
+
+        async with httpx.AsyncClient() as session:
+            # Create tasks for all links
+            tasks = [self._check_single_link(session, link) for link in self.links]
+
+            # Execute all tasks concurrently
+            results = await asyncio.gather(*tasks, return_exceptions=True)
+
+            # Process results
+            for i, result in enumerate(results):
+                if isinstance(result, BaseException):
+                    self.logger.error(f"Unexpected error for {self.links[i]}: {result}")
+                    self.bad_links.append(self.links[i])
                 else:
-                    self.logger.debug(
-                        f"Link OK: {link} (status: {response.status_code})"
-                    )
-            except Exception as e:
-                self.logger.error(f"Error checking {link}: {e}")
+                    link, status = result
+                    if status in ["dead", "error"]:
+                        self.bad_links.append(link)
+                    elif status == "timeout":
+                        self.bad_links.append(link)
+                        self.timed_out_links.append(link)
 
         self._checked = True
 
     def remove_bad_links(self):
+        """Remove bad links from markdown files by replacing them with their text content."""  # noqa: E501
         if not self._checked:
-            self.check_links()
+            # Run the async method synchronously if not already checked
+            asyncio.run(self.check_links())
 
         if not self.bad_links:
             self.logger.info("No bad links to remove")
```
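
The speedup comes from sharing one `httpx.AsyncClient` across all requests and awaiting them together via `asyncio.gather(..., return_exceptions=True)`, so a single failing URL cannot abort the whole batch. A minimal sketch of driving the class directly (assuming the package is installed and a `content/` directory exists):

```python
import asyncio

from link_sweep.link_checker import LinkChecker

# timeout is passed through to each request via session.get(link, timeout=...)
checker = LinkChecker("content/", timeout=5.0)
asyncio.run(checker.check_links())  # all links are checked concurrently

print("bad:", checker.bad_links)
print("timed out:", checker.timed_out_links)  # also counted in bad_links
```

Timed-out links land in both lists, which is what the commit message means by timeouts being counted as bad links.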

src/link_sweep/markdown_parser.py

Lines changed: 6 additions & 3 deletions
```diff
@@ -1,9 +1,11 @@
+"""Markdown file parsing utilities for link extraction and replacement."""
+
 from pathlib import Path
 import re
 
 
 def get_files(directory):
-    # Simple recursive search for .md files in target directory
+    """Simple recursive search for .md files in target directory."""
     p = Path(directory)
     md_files = []
     for file in p.rglob("*.md"):
@@ -14,7 +16,7 @@ def get_files(directory):
 
 
 def get_md_links(md_file_path: list):
-    # Creates two groups out of markdown links
+    """Creates two groups out of markdown links."""
     pattern = r'\[([^\]]+)\]\((https?:\/\/[^\s\)"]+)\)'
     links = []
 
@@ -33,6 +35,7 @@ def get_md_links(md_file_path: list):
 
 
 def replace_link(md_file_path, bad_links):
+    """Replace bad links with their text content in markdown files."""
    try:
         with open(md_file_path, "r", encoding="utf-8") as f:
             content = f.read()
@@ -51,4 +54,4 @@ def replace_link(md_file_path, bad_links):
 
     except Exception as e:
         print(f"Error updating {md_file_path}: {e}")
-        return False
+        return False
```
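
The two capture groups are what make `replace_link` possible: group 1 is the link text and group 2 is the URL, so a dead link can be collapsed to its text. A small illustration with a hypothetical input string:

```python
import re

# Same pattern as get_md_links: group 1 = link text, group 2 = http(s) URL
pattern = r'\[([^\]]+)\]\((https?:\/\/[^\s\)"]+)\)'
sample = "See [the docs](https://example.com/docs) and [old page](http://old.example.com)."

for text, url in re.findall(pattern, sample):
    print(text, "->", url)
# the docs -> https://example.com/docs
# old page -> http://old.example.com
```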
