@@ -2,16 +2,14 @@
 import fileinput
 import os
 import re
-import subprocess
 import sys
-import threading
-import time
 import typing
 import urllib.parse
-from queue import Queue, Empty
 
 from github_job_summary import JobSummary
 from subdomains import Subdomains
+from curl_wrapper import CurlExitCodes
+from url_checker import UrlChecker
 
 """
 Read file names from stdin (feed from git ls-files)
@@ -20,40 +18,23 @@
 Check them with CURL
 """
 
-# To avoid 403 responses
-USER_AGENT = "Googlebot/2.1 (+http://www.google.com/bot.html)"
-
-CONNECT_TIMEOUT_SEC = 5
-MAX_TIME_SEC = 10
 JOIN_TIMEOUT_SEC = 120
 
-
-class Curl:
-    """
-    See: https://curl.se/libcurl/c/libcurl-errors.html
-    """
-
-    CURL_STDERR_HTTP_RE = re.compile(r"^curl: \(22\) The requested URL returned error: (?P<http_code>\d+)")
-    OK = 0
-    COULDNT_RESOLVE_HOST = 6
-    HTTP_RETURNED_ERROR = 22
-
-
-CURL_EXIT_CODES_AND_HTTP_CODES = {
-    "https://api.aspose.cloud/connect/token": (Curl.HTTP_RETURNED_ERROR, 400),
-    "https://api.aspose.cloud/v3.0": (Curl.HTTP_RETURNED_ERROR, 404),
-    "https://api.aspose.cloud/v4.0": (Curl.HTTP_RETURNED_ERROR, 404),
-    "https://api.aspose.cloud/v4.0/": (Curl.HTTP_RETURNED_ERROR, 404),
-    "https://id.aspose.cloud/connect/token": (Curl.HTTP_RETURNED_ERROR, 400),
+EXIT_CODE_EXPECTATIONS: dict[str, tuple[int, int | None]] = {
+    "https://api.aspose.cloud/connect/token": (CurlExitCodes.HTTP_RETURNED_ERROR, 400),
+    "https://api.aspose.cloud/v3.0": (CurlExitCodes.HTTP_RETURNED_ERROR, 404),
+    "https://api.aspose.cloud/v4.0": (CurlExitCodes.HTTP_RETURNED_ERROR, 404),
+    "https://api.aspose.cloud/v4.0/": (CurlExitCodes.HTTP_RETURNED_ERROR, 404),
+    "https://id.aspose.cloud/connect/token": (CurlExitCodes.HTTP_RETURNED_ERROR, 400),
     # TODO: Temporary fix
-    "https://dashboard.aspose.cloud/applications": (Curl.HTTP_RETURNED_ERROR, 404),
+    "https://dashboard.aspose.cloud/applications": (CurlExitCodes.HTTP_RETURNED_ERROR, 404),
 }
 
 REGEX_TO_IGNORE: list[re.Pattern[str]] = [
     re.compile(r"^https://github\.com/(?P<user>[^/]+)/(?P<repo>[^/]+)/(?:blob|issues)/\S+$"),
 ]
 
-URLS_TO_IGNORE: frozenset[str] = frozenset(
+URLS_TO_IGNORE = frozenset(
     [
         "https://api.aspose.cloud",
         "https://www.aspose.cloud/404",
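
The removed Curl helper moves into a new curl_wrapper module that is not part of this diff; of its members, only CurlExitCodes.HTTP_RETURNED_ERROR is referenced above. A minimal sketch of what that module presumably exports, carried over from the deleted class (the exact module shape is an assumption):

    import re


    class CurlExitCodes:
        """
        Subset of curl exit codes; see: https://curl.se/libcurl/c/libcurl-errors.html
        """

        OK = 0
        COULDNT_RESOLVE_HOST = 6
        # What curl -f/--fail exits with when the server answers HTTP >= 400
        HTTP_RETURNED_ERROR = 22

        # Pulls the HTTP status out of curl's stderr when --fail trips,
        # e.g. 'curl: (22) The requested URL returned error: 404'
        CURL_STDERR_HTTP_RE = re.compile(r"^curl: \(22\) The requested URL returned error: (?P<http_code>\d+)")

EXIT_CODE_EXPECTATIONS then pairs each known-special URL with the curl exit code and HTTP status it is allowed to return, so endpoints that legitimately answer with an error status still count as successes.
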
@@ -170,140 +151,29 @@ def text_extractor(files: list[str]) -> typing.Generator[tuple[str, str], None,
         raise
 
 
-class Task:
-    _proc: subprocess.Popen[bytes]
-    _stderr: str | None
-
-    def __init__(self, url: str):
-        self.url = url
-        self._proc = subprocess.Popen(
-            [
-                "curl",
-                "-sSf",
-                "--output",
-                "-",
-                "--connect-timeout",
-                str(CONNECT_TIMEOUT_SEC),
-                "--max-time",
-                str(MAX_TIME_SEC),
-                "--user-agent",
-                USER_AGENT,
-                self.url,
-            ],
-            stdout=open(os.devnull, "w"),
-            stderr=subprocess.PIPE,
-        )
-        self._stderr = None
-        self._started = time.time()
-
-    @property
-    def running(self) -> bool:
-        return self._proc.poll() is None
-
-    @property
-    def ret_code(self) -> int:
-        assert not self.running
-        return self._proc.returncode
-
-    @property
-    def stderr(self) -> str:
-        assert not self.running
-        if self._stderr is None:
-            self._stderr = self._proc.stderr.read().decode()
-        return self._stderr
-
-    @property
-    def age(self) -> float:
-        return time.time() - self._started
-
-
-def create_new_task(url: str) -> Task:
-    # print("Create task:", url)
-    return Task(url)
-
-
-def process_finished_task(task: Task) -> None:
-    # print("Finish task:", task.url)
-    expected_ret_code, expected_http_code = CURL_EXIT_CODES_AND_HTTP_CODES.get(task.url, (0, None))
-    if task.ret_code == 0 or task.ret_code == expected_ret_code:
-        print("OK:", "'%s' %.2fs" % (task.url, task.age))
-        JOB_SUMMARY.add_success(task.url)
-        return
-
-    if task.ret_code == Curl.HTTP_RETURNED_ERROR and expected_http_code:
-        # Try parse stderr for HTTP code
-        match = Curl.CURL_STDERR_HTTP_RE.match(task.stderr)
-        assert match, "Unexpected output: %s" % task.stderr
-        http_code = int(match.groupdict()["http_code"])
-        if http_code == expected_http_code:
-            print("OK HTTP:", "'%s' %.2fs" % (task.url, task.age))
-            JOB_SUMMARY.add_success(task.url)
-            return
-
-    print(
-        "Expected %d got %d for '%s': %s" % (expected_ret_code, task.ret_code, task.url, task.stderr),
-        file=sys.stderr,
-    )
-    JOB_SUMMARY.add_error(f"Broken URL '{task.url}': {task.stderr}Files: {EXTRACTED_URLS_WITH_FILES[task.url]}")
-
-
-WORKER_QUEUE: Queue[str | None] = Queue()
-
-
-def url_checker(num_workers: int = 8) -> None:
-    next_report_age_sec = 5
-    workers: list[Task | None] = [None for _ in range(num_workers)]
-
-    queue_is_empty = False
-
-    while not queue_is_empty or any(workers):
-        for i, task in enumerate(workers):
-            if task is None:
-                continue
-            if not task.running:
-                process_finished_task(task)
-                workers[i] = None
-            elif task.age > next_report_age_sec:
-                print("Long request: '%s' %.2fs" % (task.url, task.age))
-                next_report_age_sec += 3
-
-        if not queue_is_empty:
-            for i in (i for (i, w) in enumerate(workers) if w is None):
-                # Avoid blocking forever if the queue is currently empty
-                try:
-                    item = WORKER_QUEUE.get_nowait()
-                except Empty:
-                    break
-                if item is None:
-                    queue_is_empty = True
-                    print("--- url queue is over ---")
-                    break
-                url = item
-                workers[i] = create_new_task(url)
-        time.sleep(0.2)
-    print("Worker finished")
-
-
 JOB_SUMMARY = JobSummary(os.environ.get("GITHUB_STEP_SUMMARY", "step_summary.md"))
 JOB_SUMMARY.add_header("Test all URLs")
 
 
 def main(files: list[str]) -> int:
-    checker = threading.Thread(target=url_checker, daemon=True)
-    checker.start()
+    url_checker = UrlChecker(
+        expectations=EXIT_CODE_EXPECTATIONS,
+    )
 
-    for filename, text in text_extractor(files):
-        for url in url_extractor(text, filename):
-            # print("In:", url)
-            WORKER_QUEUE.put_nowait(url)
-    WORKER_QUEUE.put_nowait(None)
-    checker.join(timeout=JOIN_TIMEOUT_SEC)
-    if checker.is_alive():
-        print(
-            f"URL checker did not finish within {JOIN_TIMEOUT_SEC}s; exiting early.",
-            file=sys.stderr,
-            flush=True,
-        )
+    with url_checker.start() as checker:
+        for filename, text in text_extractor(files):
+            for url in url_extractor(text, filename):
+                checker.add_url(url)
+        checker.wait(JOIN_TIMEOUT_SEC)
+    results = url_checker.results
+
+    # Collect results and write summary
+    for res in results:
+        if res.ok:
+            JOB_SUMMARY.add_success(res.url)
+        else:
+            src_files = EXTRACTED_URLS_WITH_FILES.get(res.url, [])
+            JOB_SUMMARY.add_error(f"Broken URL '{res.url}': {res.stderr}Files: {src_files}")
 
     JOB_SUMMARY.finalize("Checked {total} failed **{failed}**\nGood={success}")
     if JOB_SUMMARY.has_errors:
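
The Task/worker-queue machinery deleted above now lives in url_checker.UrlChecker, which this commit also does not show. Grounded in the diff is only the surface main() touches: UrlChecker(expectations=...), start() used as a context manager, add_url(), wait(timeout), and a results list whose entries expose url, ok, and stderr. A hypothetical sketch of that interface, modeled on the removed code but swapping the hand-rolled polling loop for a ThreadPoolExecutor (every name beyond the ones just listed is invented):

    import re
    import subprocess
    from concurrent.futures import ThreadPoolExecutor, wait as wait_futures
    from contextlib import contextmanager
    from dataclasses import dataclass

    HTTP_RETURNED_ERROR = 22  # mirrors CurlExitCodes.HTTP_RETURNED_ERROR
    STDERR_HTTP_RE = re.compile(r"^curl: \(22\) The requested URL returned error: (?P<http_code>\d+)")


    @dataclass
    class UrlResult:
        url: str
        ok: bool
        stderr: str = ""


    class UrlChecker:
        def __init__(self, expectations: dict[str, tuple[int, int | None]], num_workers: int = 8):
            self._expectations = expectations
            self._pool = ThreadPoolExecutor(max_workers=num_workers)
            self._futures = []
            self.results: list[UrlResult] = []

        @contextmanager
        def start(self):
            # Hand the checker to the with-block; stop accepting work on exit.
            try:
                yield self
            finally:
                self._pool.shutdown(wait=False)

        def add_url(self, url: str) -> None:
            self._futures.append(self._pool.submit(self._check, url))

        def wait(self, timeout: float) -> None:
            # Keep whatever finished inside the budget; stragglers are dropped,
            # like the old checker.join(timeout=JOIN_TIMEOUT_SEC) behaviour.
            done, _ = wait_futures(self._futures, timeout=timeout)
            self.results = [f.result() for f in done]

        def _check(self, url: str) -> UrlResult:
            # Same curl invocation the removed Task used (assumes curl is installed).
            proc = subprocess.run(
                ["curl", "-sSf", "--output", "-", "--connect-timeout", "5", "--max-time", "10",
                 "--user-agent", "Googlebot/2.1 (+http://www.google.com/bot.html)", url],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.PIPE,
                text=True,
            )
            expected_code, expected_http = self._expectations.get(url, (0, None))
            ok = proc.returncode == 0 or proc.returncode == expected_code
            if ok and expected_http and proc.returncode == HTTP_RETURNED_ERROR:
                # When an HTTP status is pinned, check it against curl's stderr too.
                match = STDERR_HTTP_RE.match(proc.stderr)
                ok = bool(match) and int(match["http_code"]) == expected_http
            return UrlResult(url=url, ok=ok, stderr=proc.stderr)

Whatever the real module does, a shape like this explains the call sites in main(): URLs stream into the pool while text_extractor yields files, wait(JOIN_TIMEOUT_SEC) bounds the whole run, and the ok/stderr fields feed the job summary.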