Skip to content

Commit 12a2442

Browse files
authored
Check urls refactoring (#130)
* Split into files
* Refactoring
* Using context
* Renames
* Code cleanup
* Added max-redirects
1 parent 350cc5d commit 12a2442

File tree

4 files changed

+308
-160
lines changed

4 files changed

+308
-160
lines changed

scripts/check-urls.py

Lines changed: 27 additions & 157 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,14 @@
22
import fileinput
33
import os
44
import re
5-
import subprocess
65
import sys
7-
import threading
8-
import time
96
import typing
107
import urllib.parse
11-
from queue import Queue, Empty
128

139
from github_job_summary import JobSummary
1410
from subdomains import Subdomains
11+
from curl_wrapper import CurlExitCodes
12+
from url_checker import UrlChecker
1513

1614
"""
1715
Read file names from stdin (feed from git ls-files)
@@ -20,40 +18,23 @@
2018
Check them with CURL
2119
"""
2220

23-
# To avoid 403 responses
24-
USER_AGENT = "Googlebot/2.1 (+http://www.google.com/bot.html)"
25-
26-
CONNECT_TIMEOUT_SEC = 5
27-
MAX_TIME_SEC = 10
2821
JOIN_TIMEOUT_SEC = 120
2922

30-
31-
class Curl:
32-
"""
33-
See: https://curl.se/libcurl/c/libcurl-errors.html
34-
"""
35-
36-
CURL_STDERR_HTTP_RE = re.compile(r"^curl: \(22\) The requested URL returned error: (?P<http_code>\d+)")
37-
OK = 0
38-
COULDNT_RESOLVE_HOST = 6
39-
HTTP_RETURNED_ERROR = 22
40-
41-
42-
CURL_EXIT_CODES_AND_HTTP_CODES = {
43-
"https://api.aspose.cloud/connect/token": (Curl.HTTP_RETURNED_ERROR, 400),
44-
"https://api.aspose.cloud/v3.0": (Curl.HTTP_RETURNED_ERROR, 404),
45-
"https://api.aspose.cloud/v4.0": (Curl.HTTP_RETURNED_ERROR, 404),
46-
"https://api.aspose.cloud/v4.0/": (Curl.HTTP_RETURNED_ERROR, 404),
47-
"https://id.aspose.cloud/connect/token": (Curl.HTTP_RETURNED_ERROR, 400),
23+
EXIT_CODE_EXPECTATIONS: dict[str, tuple[int, int | None]] = {
24+
"https://api.aspose.cloud/connect/token": (CurlExitCodes.HTTP_RETURNED_ERROR, 400),
25+
"https://api.aspose.cloud/v3.0": (CurlExitCodes.HTTP_RETURNED_ERROR, 404),
26+
"https://api.aspose.cloud/v4.0": (CurlExitCodes.HTTP_RETURNED_ERROR, 404),
27+
"https://api.aspose.cloud/v4.0/": (CurlExitCodes.HTTP_RETURNED_ERROR, 404),
28+
"https://id.aspose.cloud/connect/token": (CurlExitCodes.HTTP_RETURNED_ERROR, 400),
4829
# TODO: Temporary fix
49-
"https://dashboard.aspose.cloud/applications": (Curl.HTTP_RETURNED_ERROR, 404),
30+
"https://dashboard.aspose.cloud/applications": (CurlExitCodes.HTTP_RETURNED_ERROR, 404),
5031
}
5132

5233
REGEX_TO_IGNORE: list[re.Pattern[str]] = [
5334
re.compile(r"^https://github\.com/(?P<user>[^/]+)/(?P<repo>[^/]+)/(?:blob|issues)/\S+$"),
5435
]
5536

56-
URLS_TO_IGNORE: frozenset[str] = frozenset(
37+
URLS_TO_IGNORE = frozenset(
5738
[
5839
"https://api.aspose.cloud",
5940
"https://www.aspose.cloud/404",
@@ -170,140 +151,29 @@ def text_extractor(files: list[str]) -> typing.Generator[tuple[str, str], None,
170151
raise
171152

172153

173-
class Task:
174-
_proc: subprocess.Popen[bytes]
175-
_stderr: str | None
176-
177-
def __init__(self, url: str):
178-
self.url = url
179-
self._proc = subprocess.Popen(
180-
[
181-
"curl",
182-
"-sSf",
183-
"--output",
184-
"-",
185-
"--connect-timeout",
186-
str(CONNECT_TIMEOUT_SEC),
187-
"--max-time",
188-
str(MAX_TIME_SEC),
189-
"--user-agent",
190-
USER_AGENT,
191-
self.url,
192-
],
193-
stdout=open(os.devnull, "w"),
194-
stderr=subprocess.PIPE,
195-
)
196-
self._stderr = None
197-
self._started = time.time()
198-
199-
@property
200-
def running(self) -> bool:
201-
return self._proc.poll() is None
202-
203-
@property
204-
def ret_code(self) -> int:
205-
assert not self.running
206-
return self._proc.returncode
207-
208-
@property
209-
def stderr(self) -> str:
210-
assert not self.running
211-
if self._stderr is None:
212-
self._stderr = self._proc.stderr.read().decode()
213-
return self._stderr
214-
215-
@property
216-
def age(self) -> float:
217-
return time.time() - self._started
218-
219-
220-
def create_new_task(url: str) -> Task:
221-
# print("Create task:", url)
222-
return Task(url)
223-
224-
225-
def process_finished_task(task: Task) -> None:
226-
# print("Finish task:", task.url)
227-
expected_ret_code, expected_http_code = CURL_EXIT_CODES_AND_HTTP_CODES.get(task.url, (0, None))
228-
if task.ret_code == 0 or task.ret_code == expected_ret_code:
229-
print("OK:", "'%s' %.2fs" % (task.url, task.age))
230-
JOB_SUMMARY.add_success(task.url)
231-
return
232-
233-
if task.ret_code == Curl.HTTP_RETURNED_ERROR and expected_http_code:
234-
# Try parse stderr for HTTP code
235-
match = Curl.CURL_STDERR_HTTP_RE.match(task.stderr)
236-
assert match, "Unexpected output: %s" % task.stderr
237-
http_code = int(match.groupdict()["http_code"])
238-
if http_code == expected_http_code:
239-
print("OK HTTP:", "'%s' %.2fs" % (task.url, task.age))
240-
JOB_SUMMARY.add_success(task.url)
241-
return
242-
243-
print(
244-
"Expected %d got %d for '%s': %s" % (expected_ret_code, task.ret_code, task.url, task.stderr),
245-
file=sys.stderr,
246-
)
247-
JOB_SUMMARY.add_error(f"Broken URL '{task.url}': {task.stderr}Files: {EXTRACTED_URLS_WITH_FILES[task.url]}")
248-
249-
250-
WORKER_QUEUE: Queue[str | None] = Queue()
251-
252-
253-
def url_checker(num_workers: int = 8) -> None:
254-
next_report_age_sec = 5
255-
workers: list[Task | None] = [None for _ in range(num_workers)]
256-
257-
queue_is_empty = False
258-
259-
while not queue_is_empty or any(workers):
260-
for i, task in enumerate(workers):
261-
if task is None:
262-
continue
263-
if not task.running:
264-
process_finished_task(task)
265-
workers[i] = None
266-
elif task.age > next_report_age_sec:
267-
print("Long request: '%s' %.2fs" % (task.url, task.age))
268-
next_report_age_sec += 3
269-
270-
if not queue_is_empty:
271-
for i in (i for (i, w) in enumerate(workers) if w is None):
272-
# Avoid blocking forever if the queue is currently empty
273-
try:
274-
item = WORKER_QUEUE.get_nowait()
275-
except Empty:
276-
break
277-
if item is None:
278-
queue_is_empty = True
279-
print("--- url queue is over ---")
280-
break
281-
url = item
282-
workers[i] = create_new_task(url)
283-
time.sleep(0.2)
284-
print("Worker finished")
285-
286-
287154
JOB_SUMMARY = JobSummary(os.environ.get("GITHUB_STEP_SUMMARY", "step_summary.md"))
288155
JOB_SUMMARY.add_header("Test all URLs")
289156

290157

291158
def main(files: list[str]) -> int:
292-
checker = threading.Thread(target=url_checker, daemon=True)
293-
checker.start()
159+
url_checker = UrlChecker(
160+
expectations=EXIT_CODE_EXPECTATIONS,
161+
)
294162

295-
for filename, text in text_extractor(files):
296-
for url in url_extractor(text, filename):
297-
# print("In:", url)
298-
WORKER_QUEUE.put_nowait(url)
299-
WORKER_QUEUE.put_nowait(None)
300-
checker.join(timeout=JOIN_TIMEOUT_SEC)
301-
if checker.is_alive():
302-
print(
303-
f"URL checker did not finish within {JOIN_TIMEOUT_SEC}s; exiting early.",
304-
file=sys.stderr,
305-
flush=True,
306-
)
163+
with url_checker.start() as checker:
164+
for filename, text in text_extractor(files):
165+
for url in url_extractor(text, filename):
166+
checker.add_url(url)
167+
checker.wait(JOIN_TIMEOUT_SEC)
168+
results = url_checker.results
169+
170+
# Collect results and write summary
171+
for res in results:
172+
if res.ok:
173+
JOB_SUMMARY.add_success(res.url)
174+
else:
175+
src_files = EXTRACTED_URLS_WITH_FILES.get(res.url, [])
176+
JOB_SUMMARY.add_error(f"Broken URL '{res.url}': {res.stderr}Files: {src_files}")
307177

308178
JOB_SUMMARY.finalize("Checked {total} failed **{failed}**\nGood={success}")
309179
if JOB_SUMMARY.has_errors:

scripts/check_all_urls.sh

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,6 @@ set -euo pipefail
55
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
66
ROOT_DIR="$( cd "${SCRIPT_DIR}/.." &> /dev/null && pwd )"
77

8-
check_file () {
9-
echo "$1"
10-
}
118
pushd "${ROOT_DIR}"
129
git ls-files --recurse-submodules --exclude-standard --full-name | grep -v 'package-lock.json$' | python "${SCRIPT_DIR}/check-urls.py"
1310
popd

scripts/curl_wrapper.py

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
import contextlib
import enum
import os
import re
import subprocess
import time
from typing import Optional
7+
8+
# Default User-Agent header: some sites return 403 to curl's built-in agent
# string. Callers may override it per CurlWrapper instance via ``user_agent``.
DEFAULT_USER_AGENT = "Googlebot/2.1 (+http://www.google.com/bot.html)"
10+
11+
12+
class CurlExitCodes(enum.IntEnum):
    """curl process exit codes this project distinguishes.

    Members are ``IntEnum`` so they compare equal to the plain ``int``
    return codes reported by ``subprocess`` — fully backward compatible
    with the previous bare ``int`` class attributes, while gaining enum
    repr/iteration for debugging.

    See: https://curl.se/libcurl/c/libcurl-errors.html
    """

    OK = 0
    COULDNT_RESOLVE_HOST = 6
    HTTP_RETURNED_ERROR = 22
20+
21+
22+
class CurlWrapper:
    """A single asynchronous curl invocation against one URL.

    The subprocess is spawned in ``__init__`` and never blocks the caller;
    poll ``running`` and read ``ret_code`` / ``stderr`` once it finishes.
    The response body is discarded — only the exit code (and, for HTTP
    errors, the status parsed from stderr) matters.
    """

    # Matches curl's stderr line for exit code 22 and captures the numeric
    # HTTP status, e.g. "curl: (22) The requested URL returned error: 404".
    CURL_STDERR_HTTP_RE = re.compile(r"^curl: \(22\) The requested URL returned error: (?P<http_code>\d+)")

    def __init__(
        self,
        url: str,
        *,
        user_agent: str = DEFAULT_USER_AGENT,
        connect_timeout: int = 5,
        max_time: int = 10,
        max_redirects: int = 3,
    ) -> None:
        """Spawn curl for *url* without waiting for it.

        :param url: the URL to check.
        :param user_agent: User-Agent header (some sites 403 curl's default).
        :param connect_timeout: seconds allowed for connection establishment.
        :param max_time: overall transfer deadline in seconds.
        :param max_redirects: maximum number of redirects curl may follow.
        """
        self.url = url
        self._stderr: Optional[str] = None  # lazily read & cached by `stderr`
        self._started = time.time()
        self._proc = subprocess.Popen(
            [
                "curl",
                "-sSf",
                "-L",  # follow redirects
                "--max-redirs",
                f"{max_redirects}",  # limit number of redirects
                # "--proto", "=https",  # (optional) only allow https for the initial URL
                "--proto-redir",
                "=all,https",  # only allow https after redirects; http will fail
                "--output",
                "-",  # body goes to stdout, which is discarded below
                "--connect-timeout",
                f"{connect_timeout}",
                "--max-time",
                f"{max_time}",
                "--user-agent",
                f"{user_agent}",
                self.url,
            ],
            # DEVNULL instead of open(os.devnull, "w"): the previous file
            # object was never closed, leaking one descriptor per URL checked.
            stdout=subprocess.DEVNULL,
            stderr=subprocess.PIPE,
        )

    @property
    def running(self) -> bool:
        """True while the curl subprocess has not yet exited."""
        return self._proc.poll() is None

    @property
    def ret_code(self) -> int:
        """curl's exit code; only valid once the process has finished."""
        assert not self.running
        return self._proc.returncode

    @property
    def stderr(self) -> str:
        """Decoded stderr of the finished process (cached after first read)."""
        assert not self.running
        if self._stderr is None:
            assert self._proc.stderr is not None
            self._stderr = self._proc.stderr.read().decode()
        return self._stderr

    @property
    def age(self) -> float:
        """Seconds elapsed since the subprocess was started."""
        return time.time() - self._started

    def terminate(self, timeout: float | None = None) -> None:
        """Best-effort SIGTERM; optionally wait up to *timeout* seconds.

        Never raises — a process that already exited (or a wait timeout)
        is silently ignored, mirroring ``kill``.
        """
        with contextlib.suppress(Exception):
            self._proc.terminate()
            if timeout is not None:
                self._proc.wait(timeout=timeout)

    def kill(self) -> None:
        """Best-effort SIGKILL; never raises."""
        with contextlib.suppress(Exception):
            self._proc.kill()

0 commit comments

Comments
 (0)