diff --git a/scripts/check-urls.py b/scripts/check-urls.py
index 846b6ee..00beed9 100644
--- a/scripts/check-urls.py
+++ b/scripts/check-urls.py
@@ -2,16 +2,14 @@
 import fileinput
 import os
 import re
-import subprocess
 import sys
-import threading
-import time
 import typing
 import urllib.parse
-from queue import Queue, Empty
 
 from github_job_summary import JobSummary
 from subdomains import Subdomains
+from curl_wrapper import CurlExitCodes
+from url_checker import UrlChecker
 
 """
 Read file names from stdin (feed from git ls-files)
@@ -20,40 +18,23 @@ Check them with CURL
 """
 
 
-# To avoid 403 responses
-USER_AGENT = "Googlebot/2.1 (+http://www.google.com/bot.html)"
-
-CONNECT_TIMEOUT_SEC = 5
-MAX_TIME_SEC = 10
 JOIN_TIMEOUT_SEC = 120
 
-
-class Curl:
-    """
-    See: https://curl.se/libcurl/c/libcurl-errors.html
-    """
-
-    CURL_STDERR_HTTP_RE = re.compile(r"^curl: \(22\) The requested URL returned error: (?P<http_code>\d+)")
-    OK = 0
-    COULDNT_RESOLVE_HOST = 6
-    HTTP_RETURNED_ERROR = 22
-
-
-CURL_EXIT_CODES_AND_HTTP_CODES = {
-    "https://api.aspose.cloud/connect/token": (Curl.HTTP_RETURNED_ERROR, 400),
-    "https://api.aspose.cloud/v3.0": (Curl.HTTP_RETURNED_ERROR, 404),
-    "https://api.aspose.cloud/v4.0": (Curl.HTTP_RETURNED_ERROR, 404),
-    "https://api.aspose.cloud/v4.0/": (Curl.HTTP_RETURNED_ERROR, 404),
-    "https://id.aspose.cloud/connect/token": (Curl.HTTP_RETURNED_ERROR, 400),
+EXIT_CODE_EXPECTATIONS: dict[str, tuple[int, int | None]] = {
+    "https://api.aspose.cloud/connect/token": (CurlExitCodes.HTTP_RETURNED_ERROR, 400),
+    "https://api.aspose.cloud/v3.0": (CurlExitCodes.HTTP_RETURNED_ERROR, 404),
+    "https://api.aspose.cloud/v4.0": (CurlExitCodes.HTTP_RETURNED_ERROR, 404),
+    "https://api.aspose.cloud/v4.0/": (CurlExitCodes.HTTP_RETURNED_ERROR, 404),
+    "https://id.aspose.cloud/connect/token": (CurlExitCodes.HTTP_RETURNED_ERROR, 400),
     # TODO: Temporary fix
-    "https://dashboard.aspose.cloud/applications": (Curl.HTTP_RETURNED_ERROR, 404),
+    "https://dashboard.aspose.cloud/applications": (CurlExitCodes.HTTP_RETURNED_ERROR, 404),
 }
 
 REGEX_TO_IGNORE: list[re.Pattern[str]] = [
     re.compile(r"^https://github\.com/(?P<user>[^/]+)/(?P<repo>[^/]+)/(?:blob|issues)/\S+$"),
 ]
 
-URLS_TO_IGNORE: frozenset[str] = frozenset(
+URLS_TO_IGNORE = frozenset(
     [
         "https://api.aspose.cloud",
         "https://www.aspose.cloud/404",
@@ -170,140 +151,29 @@ def text_extractor(files: list[str]) -> typing.Generator[tuple[str, str], None,
             raise
 
 
-class Task:
-    _proc: subprocess.Popen[bytes]
-    _stderr: str | None
-
-    def __init__(self, url: str):
-        self.url = url
-        self._proc = subprocess.Popen(
-            [
-                "curl",
-                "-sSf",
-                "--output",
-                "-",
-                "--connect-timeout",
-                str(CONNECT_TIMEOUT_SEC),
-                "--max-time",
-                str(MAX_TIME_SEC),
-                "--user-agent",
-                USER_AGENT,
-                self.url,
-            ],
-            stdout=open(os.devnull, "w"),
-            stderr=subprocess.PIPE,
-        )
-        self._stderr = None
-        self._started = time.time()
-
-    @property
-    def running(self) -> bool:
-        return self._proc.poll() is None
-
-    @property
-    def ret_code(self) -> int:
-        assert not self.running
-        return self._proc.returncode
-
-    @property
-    def stderr(self) -> str:
-        assert not self.running
-        if self._stderr is None:
-            self._stderr = self._proc.stderr.read().decode()
-        return self._stderr
-
-    @property
-    def age(self) -> float:
-        return time.time() - self._started
-
-
-def create_new_task(url: str) -> Task:
-    # print("Create task:", url)
-    return Task(url)
-
-
-def process_finished_task(task: Task) -> None:
-    # print("Finish task:", task.url)
-    expected_ret_code, expected_http_code = CURL_EXIT_CODES_AND_HTTP_CODES.get(task.url, (0, None))
-    if task.ret_code == 0 or task.ret_code == expected_ret_code:
-        print("OK:", "'%s' %.2fs" % (task.url, task.age))
-        JOB_SUMMARY.add_success(task.url)
-        return
-
-    if task.ret_code == Curl.HTTP_RETURNED_ERROR and expected_http_code:
-        # Try parse stderr for HTTP code
-        match = Curl.CURL_STDERR_HTTP_RE.match(task.stderr)
-        assert match, "Unexpected output: %s" % task.stderr
-        http_code = int(match.groupdict()["http_code"])
-        if http_code == expected_http_code:
-            print("OK HTTP:", "'%s' %.2fs" % (task.url, task.age))
-            JOB_SUMMARY.add_success(task.url)
-            return
-
-    print(
-        "Expected %d got %d for '%s': %s" % (expected_ret_code, task.ret_code, task.url, task.stderr),
-        file=sys.stderr,
-    )
-    JOB_SUMMARY.add_error(f"Broken URL '{task.url}': {task.stderr}Files: {EXTRACTED_URLS_WITH_FILES[task.url]}")
-
-
-WORKER_QUEUE: Queue[str | None] = Queue()
-
-
-def url_checker(num_workers: int = 8) -> None:
-    next_report_age_sec = 5
-    workers: list[Task | None] = [None for _ in range(num_workers)]
-
-    queue_is_empty = False
-
-    while not queue_is_empty or any(workers):
-        for i, task in enumerate(workers):
-            if task is None:
-                continue
-            if not task.running:
-                process_finished_task(task)
-                workers[i] = None
-            elif task.age > next_report_age_sec:
-                print("Long request: '%s' %.2fs" % (task.url, task.age))
-                next_report_age_sec += 3
-
-        if not queue_is_empty:
-            for i in (i for (i, w) in enumerate(workers) if w is None):
-                # Avoid blocking forever if the queue is currently empty
-                try:
-                    item = WORKER_QUEUE.get_nowait()
-                except Empty:
-                    break
-                if item is None:
-                    queue_is_empty = True
-                    print("--- url queue is over ---")
-                    break
-                url = item
-                workers[i] = create_new_task(url)
-        time.sleep(0.2)
-    print("Worker finished")
-
-
 JOB_SUMMARY = JobSummary(os.environ.get("GITHUB_STEP_SUMMARY", "step_summary.md"))
 JOB_SUMMARY.add_header("Test all URLs")
 
 
 def main(files: list[str]) -> int:
-    checker = threading.Thread(target=url_checker, daemon=True)
-    checker.start()
+    url_checker = UrlChecker(
+        expectations=EXIT_CODE_EXPECTATIONS,
+    )
 
-    for filename, text in text_extractor(files):
-        for url in url_extractor(text, filename):
-            # print("In:", url)
-            WORKER_QUEUE.put_nowait(url)
-    WORKER_QUEUE.put_nowait(None)
-    checker.join(timeout=JOIN_TIMEOUT_SEC)
-    if checker.is_alive():
-        print(
-            f"URL checker did not finish within {JOIN_TIMEOUT_SEC}s; exiting early.",
-            file=sys.stderr,
-            flush=True,
-        )
+    with url_checker.start() as checker:
+        for filename, text in text_extractor(files):
+            for url in url_extractor(text, filename):
+                checker.add_url(url)
+        checker.wait(JOIN_TIMEOUT_SEC)
+    results = url_checker.results
+
+    # Collect results and write summary
+    for res in results:
+        if res.ok:
+            JOB_SUMMARY.add_success(res.url)
+        else:
+            src_files = EXTRACTED_URLS_WITH_FILES.get(res.url, [])
+            JOB_SUMMARY.add_error(f"Broken URL '{res.url}': {res.stderr}Files: {src_files}")
 
     JOB_SUMMARY.finalize("Checked {total} failed **{failed}**\nGood={success}")
     if JOB_SUMMARY.has_errors:
diff --git a/scripts/check_all_urls.sh b/scripts/check_all_urls.sh
index c7b58e1..79f9f59 100755
--- a/scripts/check_all_urls.sh
+++ b/scripts/check_all_urls.sh
@@ -5,9 +5,6 @@ set -euo pipefail
 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
 ROOT_DIR="$( cd "${SCRIPT_DIR}/.." &> /dev/null && pwd )"
&> /dev/null && pwd )" -check_file () { - echo "$1" -} pushd "${ROOT_DIR}" git ls-files --recurse-submodules --exclude-standard --full-name | grep -v 'package-lock.json$' | python "${SCRIPT_DIR}/check-urls.py" popd diff --git a/scripts/curl_wrapper.py b/scripts/curl_wrapper.py new file mode 100644 index 0000000..95d01e7 --- /dev/null +++ b/scripts/curl_wrapper.py @@ -0,0 +1,96 @@ +import contextlib +import os +import re +import subprocess +import time +from typing import Optional + +# To avoid 403 responses (default); caller may override per instance +DEFAULT_USER_AGENT = "Googlebot/2.1 (+http://www.google.com/bot.html)" + + +class CurlExitCodes: + """ + See: https://curl.se/libcurl/c/libcurl-errors.html + """ + + OK = 0 + COULDNT_RESOLVE_HOST = 6 + HTTP_RETURNED_ERROR = 22 + + +class CurlWrapper: + """ + Encapsulates a single curl execution with timeouts and helpers. + """ + + CURL_STDERR_HTTP_RE = re.compile(r"^curl: \(22\) The requested URL returned error: (?P\d+)") + + def __init__( + self, + url: str, + *, + user_agent: str = DEFAULT_USER_AGENT, + connect_timeout: int = 5, + max_time: int = 10, + max_redirects: int = 3, + ) -> None: + self.url = url + self._stderr: Optional[str] = None + self._started = time.time() + self._proc = subprocess.Popen( + [ + "curl", + "-sSf", + "-L", # follow redirects + "--max-redirs", + f"{max_redirects}", # limit number of redirects + # "--proto", "=https", # (optional) only allow https for the initial URL + "--proto-redir", + "=all,https", # only allow https after redirects; http will fail + "--output", + "-", # discard body + "--connect-timeout", + f"{connect_timeout}", + "--max-time", + f"{max_time}", + "--user-agent", + f"{user_agent}", + self.url, + ], + stdout=open(os.devnull, "w"), + stderr=subprocess.PIPE, + ) + + @property + def running(self) -> bool: + return self._proc.poll() is None + + @property + def ret_code(self) -> int: + assert not self.running + return self._proc.returncode + + @property + def stderr(self) -> str: + assert not self.running + if self._stderr is None: + assert self._proc.stderr is not None + self._stderr = self._proc.stderr.read().decode() + return self._stderr + + @property + def age(self) -> float: + return time.time() - self._started + + def terminate(self, timeout: float | None = None) -> None: + try: + self._proc.terminate() + if timeout is not None: + self._proc.wait(timeout=timeout) + except Exception: + pass + + def kill(self) -> None: + with contextlib.suppress(Exception): + self._proc.kill() diff --git a/scripts/url_checker.py b/scripts/url_checker.py new file mode 100644 index 0000000..697a453 --- /dev/null +++ b/scripts/url_checker.py @@ -0,0 +1,185 @@ +import contextlib +import sys +import threading +import time +from dataclasses import dataclass +from queue import Queue, Empty +from types import TracebackType +from typing import Callable, Optional + +from curl_wrapper import CurlWrapper, CurlExitCodes + + +@dataclass +class CheckResult: + url: str + ok: bool + ret_code: int + age: float + stderr: str + expected_ret_code: int + expected_http_code: int | None + http_code: int | None + + +class UrlChecker: + def __init__( + self, + *, + num_workers: int = 8, + hard_kill_sec: int = 15, + expectations: dict[str, tuple[int, int | None]] | None = None, + worker_factory: Optional[Callable[[str], CurlWrapper]] = None, + ) -> None: + self.num_workers = num_workers + self.hard_kill_sec = hard_kill_sec + self.expectations = expectations or {} + self.worker_factory = worker_factory or (lambda url: CurlWrapper(url)) + + 
+        self.queue: Queue[str | None] = Queue()
+        self.workers: list[CurlWrapper | None] = [None for _ in range(self.num_workers)]
+        self.stop_event = False
+        self.next_report_age_sec = 5
+        self.results: list[CheckResult] = []
+        self._thread: threading.Thread | None = None
+        self._closed: bool = False
+
+    def add_url(self, url: str) -> None:
+        self.queue.put_nowait(url)
+
+    def _close(self) -> None:
+        if not self._closed:
+            self._closed = True
+            self.queue.put_nowait(None)
+
+    def _stop(self) -> None:
+        self.stop_event = True
+        with contextlib.suppress(Exception):
+            self.queue.put_nowait(None)
+
+    def _run(self) -> None:
+        queue_is_empty = False
+        while not queue_is_empty or any(self.workers):
+            # Graceful stop: cancel running curls
+            if self.stop_event:
+                queue_is_empty = True
+                for t in self.workers:
+                    if t is not None and t.running:
+                        t.terminate(timeout=1)
+                        if t.running:
+                            t.kill()
+
+            # Tick workers
+            for i, task in enumerate(self.workers):
+                if task is None:
+                    continue
+                if not task.running:
+                    self._process_finished(task)
+                    self.workers[i] = None
+                elif task.age > self.next_report_age_sec:
+                    print("Long request: '%s' %.2fs" % (task.url, task.age))
+                    self.next_report_age_sec += 3
+                    if task.age > self.hard_kill_sec:
+                        task.terminate(timeout=2)
+                        if task.running:
+                            task.kill()
+                        print("Killed long request: '%s' %.2fs" % (task.url, task.age))
+
+            # Fill idle workers
+            if not queue_is_empty:
+                for i in (i for (i, w) in enumerate(self.workers) if w is None):
+                    try:
+                        item = self.queue.get_nowait()
+                    except Empty:
+                        break
+                    if item is None:
+                        queue_is_empty = True
+                        print("--- url queue is over ---")
+                        break
+                    url = item
+                    self.workers[i] = self.worker_factory(url)
+            time.sleep(0.2)
+        print("Worker finished")
+
+    # Context management and user-friendly API
+    def start(self) -> "UrlChecker":
+        if self._thread is not None:
+            return self
+        self._thread = threading.Thread(target=self._run, daemon=True)
+        self._thread.start()
+        return self
+
+    def __enter__(self) -> "UrlChecker":
+        return self.start()
+
+    def __exit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc: BaseException | None,
+        tb: TracebackType | None,
+    ) -> None:
+        # Ensure we signal end of input and wait for completion
+        self._close()
+        self.wait()
+
+    def wait(self, join_timeout_sec: float | None = None) -> None:
+        # Ensure end-of-input signaled before waiting
+        self._close()
+        t = self._thread
+        if t is None:
+            return
+        if join_timeout_sec is not None:
+            t.join(timeout=join_timeout_sec)
+            if t.is_alive():
+                # Try to stop gracefully and inform user
+                self._stop()
+                print(
+                    f"URL checker did not finish within {join_timeout_sec}s; exiting early.",
+                    file=sys.stderr,
+                    flush=True,
+                )
+        else:
+            t.join()
+
+    def _process_finished(self, task: CurlWrapper) -> None:
+        expected_ret_code, expected_http_code = self.expectations.get(task.url, (0, None))
+
+        ok: bool = False
+        http_code_val: int | None = None
+        stderr_out: str = task.stderr
+
+        # Fast path: exact expected ret code or success
+        if task.ret_code == 0 or task.ret_code == expected_ret_code:
+            print("OK:", "'%s' %.2fs" % (task.url, task.age))
+            ok = True
+            stderr_out = ""
+        else:
+            # If curl reports HTTP error (22), attempt to parse HTTP code to compare
+            if task.ret_code == CurlExitCodes.HTTP_RETURNED_ERROR and expected_http_code:
+                match = CurlWrapper.CURL_STDERR_HTTP_RE.match(task.stderr)
+                assert match, "Unexpected output: %s" % task.stderr
+                http_code_val = int(match.groupdict()["http_code"])
+                if http_code_val == expected_http_code:
+                    print("OK HTTP:", "'%s' %.2fs" % (task.url, task.age))
+                    ok = True
+
+            if not ok:
+                # Otherwise, report error
+                print(
+                    "Expected %d got %d for '%s': %s" % (expected_ret_code, task.ret_code, task.url, task.stderr),
+                    file=sys.stderr,
+                )
+
+        # Append exactly once
+        self.results.append(
+            CheckResult(
+                url=task.url,
+                ok=ok,
+                ret_code=task.ret_code,
+                age=task.age,
+                stderr=stderr_out,
+                expected_ret_code=expected_ret_code,
+                expected_http_code=expected_http_code,
+                http_code=http_code_val,
+            )
+        )
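A minimal usage sketch of the new UrlChecker API, assuming curl is installed and the scripts/ modules are importable (the expectations entry shown is one of the whitelisted URLs from check-urls.py):

    from curl_wrapper import CurlExitCodes
    from url_checker import UrlChecker

    checker = UrlChecker(
        num_workers=4,
        # Per-URL expected (curl exit code, HTTP code) pairs are treated as OK
        expectations={"https://api.aspose.cloud/v3.0": (CurlExitCodes.HTTP_RETURNED_ERROR, 404)},
    )
    with checker.start() as c:
        c.add_url("https://example.com")
        c.add_url("https://api.aspose.cloud/v3.0")
        c.wait(60)  # signal end of input and join with a timeout

    for res in checker.results:
        print(res.url, "OK" if res.ok else "failed (ret=%d)" % res.ret_code)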