1- import httpx
2- import markdown_parser
1+ """Asynchronous link checker module for markdown files."""
2+
3+ import asyncio
34import logging
45
6+ import httpx
7+
8+ from . import markdown_parser
9+
510
611class LinkChecker :
7- def __init__ (self , directory ):
12+ """Asynchronous link checker for markdown files."""
13+
14+ def __init__ (self , directory , timeout = 10.0 ):
15+ """Initialize LinkChecker with directory path and timeout settings."""
816 self .logger = logging .getLogger (__name__ )
917 self .logger .info (f"Initializing LinkChecker for { directory } " )
18+ self .timeout = timeout
1019
1120 self .pages = markdown_parser .get_files (directory ) # content/
1221 self .links_data = markdown_parser .get_md_links (self .pages )
@@ -17,32 +26,59 @@ def __init__(self, directory):
1726 self .links = [link ["url" ] for link in self .links_data ]
1827
1928 self .bad_links = []
29+ self .timed_out_links = []
2030 self ._checked = False
2131
22- def check_links (self ):
23- self .logger .info (f"Starting link check for { len (self .links )} links" )
24-
25- for i , link in enumerate (self .links ):
26- self .logger .debug (f"Checking link { i + 1 } /{ len (self .links )} : { link } " )
27- try :
28- response = httpx .get (link , timeout = 10.0 )
29- if response .status_code >= 400 :
30- self .logger .warning (
31- f"Dead link found: { link } (status: { response .status_code } )"
32- )
33- self .bad_links .append (link )
async def _check_single_link(self, session, link):
    """Check a single link asynchronously and classify the outcome.

    Args:
        session: Client whose awaitable ``get(url, timeout=...)`` returns a
            response exposing ``status_code``.
        link: The URL to request.

    Returns:
        A ``(link, status)`` tuple. ``status`` is ``"ok"`` for a status
        code below 400, ``"dead"`` for 400 and above, ``"timeout"`` when
        the request timed out, ``"error"`` for any other failure, and
        ``"skipped"`` when the link is not an http(s) URL.
    """
    # Anchors, relative paths, mailto: and similar targets cannot be fetched
    # over HTTP; requesting them would only fail and report a perfectly
    # valid link as an "error". Non-string values deliberately fall through
    # to the request below so they are still reported as errors.
    if isinstance(link, str) and not link.lstrip().lower().startswith(
        ("http://", "https://")
    ):
        self.logger.debug(f"Skipping non-HTTP link: {link}")
        return link, "skipped"

    try:
        response = await session.get(link, timeout=self.timeout)
    except (httpx.TimeoutException, asyncio.TimeoutError):
        self.logger.warning(f"Link timed out: {link}")
        return link, "timeout"
    except Exception as e:
        # Best effort: one unreachable host must not abort the whole run.
        self.logger.error(f"Error checking {link}: {e}")
        return link, "error"

    if response.status_code >= 400:
        self.logger.warning(
            f"Dead link found: {link} (status: {response.status_code})"
        )
        return link, "dead"

    self.logger.debug(f"Link OK: {link} (status: {response.status_code})")
    return link, "ok"
50+
async def check_links(self, max_concurrency=20):
    """Check all links asynchronously and record the ones that failed.

    Every entry of ``self.links`` is passed to ``_check_single_link``
    through one shared ``httpx.AsyncClient``. Links reported as
    ``"dead"``, ``"error"`` or ``"timeout"`` are appended to
    ``self.bad_links``; timed-out links are additionally appended to
    ``self.timed_out_links``. ``self._checked`` is set to ``True`` at the
    end.

    Args:
        max_concurrency: Upper bound on requests in flight at once;
            values below 1 are treated as 1.
    """
    self.logger.info(f"Starting async link check for {len(self.links)} links")

    # Start from a clean slate so calling this method again does not
    # append the same links a second time.
    self.bad_links = []
    self.timed_out_links = []

    # Launching every request at once leaves most of them queued for a
    # pooled connection, and httpx counts that wait against the timeout,
    # so healthy links could be reported as timed out. Capping the number
    # of in-flight requests avoids this as long as the cap stays within
    # the client's connection pool size.
    limiter = asyncio.Semaphore(max(1, max_concurrency))

    async def _bounded_check(session, link):
        async with limiter:
            return await self._check_single_link(session, link)

    async with httpx.AsyncClient() as session:
        results = await asyncio.gather(
            *(_bounded_check(session, link) for link in self.links),
            return_exceptions=True,
        )

    # gather() preserves input order, so results line up with self.links.
    for link, result in zip(self.links, results):
        if isinstance(result, BaseException):
            self.logger.error(f"Unexpected error for {link}: {result}")
            self.bad_links.append(link)
            continue

        checked_link, status = result
        if status in ("dead", "error", "timeout"):
            self.bad_links.append(checked_link)
        if status == "timeout":
            self.timed_out_links.append(checked_link)

    self._checked = True
4276
4377 def remove_bad_links (self ):
78+ """Remove bad links from markdown files by replacing them with their text content.""" # noqa: E501
4479 if not self ._checked :
45- self .check_links ()
80+ # Run the async method synchronously if not already checked
81+ asyncio .run (self .check_links ())
4682
4783 if not self .bad_links :
4884 self .logger .info ("No bad links to remove" )
0 commit comments