|
| 1 | +#!/usr/bin/env python3 |
| 2 | +"""Audit project metadata for deprecation signals. |
| 3 | +
|
| 4 | +This script analyses each YAML file in `_data/projects` and verifies whether |
| 5 | +its linked resources show indications of deprecation: |
| 6 | +
|
| 7 | +* The linked `source` or `homepage` mentions common deprecation keywords. |
| 8 | +* The most recent commit on a GitHub repository is older than two years. |
| 9 | +* The linked pages respond successfully. |
| 10 | +
|
| 11 | +Projects that fail the checks are marked as deprecated by adding the |
| 12 | +`deprecated` category and removing the `recommended: true` flag when present. |
| 13 | +""" |
| 14 | + |
| 15 | +from __future__ import annotations |
| 16 | + |
| 17 | +import asyncio |
| 18 | +import datetime as dt |
| 19 | +import re |
| 20 | +import sys |
| 21 | +from dataclasses import dataclass |
| 22 | +from pathlib import Path |
| 23 | +from typing import Dict, Iterable, List, Optional, Set |
| 24 | +from xml.etree import ElementTree as ET |
| 25 | + |
| 26 | +import aiohttp |
| 27 | +from ruamel.yaml import YAML |
| 28 | +from ruamel.yaml.comments import CommentedSeq |
| 29 | + |
# Directory holding one YAML metadata file per listed project.
PROJECT_DIR = Path(__file__).resolve().parent.parent / "_data" / "projects"
# Phrases whose presence on a linked page counts as a deprecation signal.
KEYWORDS = ("deprecated", "obsolete", "no further development")
# Staleness cutoff: repositories with no commit since this moment are stale.
TWO_YEARS_AGO = dt.datetime.now(dt.timezone.utc) - dt.timedelta(days=365 * 2)
# Identify the audit script politely to the servers we probe.
HEADERS = {
    "User-Agent": "drupaltools-audit-script/1.0 (+https://drupaltools.github.io)"
}
# Per-request timeout; CONCURRENCY caps both open connections and in-flight requests.
REQUEST_TIMEOUT = aiohttp.ClientTimeout(total=12)
CONCURRENCY = 12
| 38 | + |
| 39 | + |
def load_yaml(path: Path) -> Dict:
    """Read *path* as round-trip YAML so comments and quoting survive a rewrite."""
    parser = YAML(typ="rt")
    parser.preserve_quotes = True
    with path.open("r", encoding="utf-8") as stream:
        return parser.load(stream)
| 45 | + |
| 46 | + |
def dump_yaml(path: Path, data: Dict) -> None:
    """Write *data* back to *path*, matching the repository's YAML formatting."""
    writer = YAML(typ="rt")
    writer.preserve_quotes = True
    writer.default_flow_style = False
    writer.width = 4096  # effectively disable line wrapping
    writer.indent(sequence=2, offset=2)
    with path.open("w", encoding="utf-8") as stream:
        writer.dump(data, stream)
| 55 | + |
| 56 | + |
@dataclass
class UrlCheck:
    """Outcome of probing a single project URL."""

    # True when the GET returned a 2xx/3xx status.
    valid: bool
    # True when the fetched page body contained one of KEYWORDS.
    keyword_hit: bool
| 61 | + |
| 62 | + |
async def fetch_page(session: aiohttp.ClientSession, semaphore: asyncio.Semaphore, url: str) -> UrlCheck:
    """GET *url* and report whether it responded OK and mentions a deprecation keyword.

    Network failures and timeouts are folded into an invalid result instead of
    being raised, so a single broken link cannot abort the whole audit.
    """
    async with semaphore:
        try:
            async with session.get(url, timeout=REQUEST_TIMEOUT) as response:
                ok = 200 <= response.status < 400
                # Only download the body for a successful response.
                body = await response.text(errors="ignore") if ok else ""
        except (aiohttp.ClientError, asyncio.TimeoutError):
            return UrlCheck(valid=False, keyword_hit=False)

    haystack = body.lower()
    return UrlCheck(valid=ok, keyword_hit=any(word in haystack for word in KEYWORDS))
| 75 | + |
| 76 | + |
async def fetch_commit_date(session: aiohttp.ClientSession, semaphore: asyncio.Semaphore, repo: str) -> Optional[dt.datetime]:
    """Return the timestamp of the latest activity on *repo*'s commit feed.

    Reads the public Atom feed at ``github.com/<repo>/commits.atom`` (the
    feed-level ``<updated>`` element), so no API token is required.  Returns
    None when the feed is unreachable, malformed, or carries no timestamp.
    """
    feed_url = f"https://github.com/{repo}/commits.atom"
    async with semaphore:
        try:
            async with session.get(feed_url, timeout=REQUEST_TIMEOUT) as response:
                if response.status >= 400:
                    return None
                payload = await response.text(errors="ignore")
        except (aiohttp.ClientError, asyncio.TimeoutError):
            return None

    try:
        feed = ET.fromstring(payload)
        node = feed.find("atom:updated", {"atom": "http://www.w3.org/2005/Atom"})
        if node is None or not node.text:
            return None
        # The feed uses a trailing "Z"; fromisoformat needs an explicit offset.
        return dt.datetime.fromisoformat(node.text.replace("Z", "+00:00"))
    except (ET.ParseError, ValueError):
        return None
| 100 | + |
| 101 | + |
def github_repo(url: str) -> Optional[str]:
    """Extract ``owner/repo`` from a GitHub URL, or return None for other hosts.

    Extra path segments, fragments, and query strings after the repository
    name are ignored, and a ``.git`` suffix on the name is dropped.
    """
    match = re.match(r"https?://github\.com/([^/]+)/([^/#?]+)", url)
    if not match:
        return None
    owner, repo = match.group(1), match.group(2)
    # BUG FIX: str.rstrip(".git") strips *any* trailing '.', 'g', 'i', 't'
    # characters (e.g. "digit" -> "di", "git" -> ""), mangling repo names.
    # Use a real suffix check instead.
    if repo.endswith(".git"):
        repo = repo[: -len(".git")]
    return f"{owner}/{repo}"
| 109 | + |
| 110 | + |
async def collect_url_data(urls: Set[str]) -> Dict[str, UrlCheck]:
    """Probe every URL concurrently and map each one to its UrlCheck result."""
    gate = asyncio.Semaphore(CONCURRENCY)
    connector = aiohttp.TCPConnector(limit=CONCURRENCY)
    async with aiohttp.ClientSession(headers=HEADERS, connector=connector) as session:
        ordered = list(urls)
        checks = await asyncio.gather(*(fetch_page(session, gate, url) for url in ordered))
    return dict(zip(ordered, checks))
| 120 | + |
| 121 | + |
async def collect_commit_data(repos: Set[str]) -> Dict[str, Optional[dt.datetime]]:
    """Look up the latest-commit date of every repository concurrently."""
    gate = asyncio.Semaphore(CONCURRENCY)
    connector = aiohttp.TCPConnector(limit=CONCURRENCY)
    async with aiohttp.ClientSession(headers=HEADERS, connector=connector) as session:
        ordered = list(repos)
        dates = await asyncio.gather(*(fetch_commit_date(session, gate, repo) for repo in ordered))
    return dict(zip(ordered, dates))
| 131 | + |
| 132 | + |
def decide_deprecation(urls: Iterable[str], url_checks: Dict[str, UrlCheck], commit_dates: Dict[str, Optional[dt.datetime]]) -> Dict[str, bool]:
    """Summarise the deprecation signals for one project's URLs.

    Returns three flags:
      ``keyword_hit`` -- some linked page mentions a deprecation keyword.
      ``valid``       -- at least one linked page responded successfully.
      ``stale``       -- the newest GitHub commit is older than two years
                         (only evaluated when no keyword was found).
    """
    # BUG FIX: *urls* is typed Iterable and was iterated twice; a generator
    # argument would be exhausted by the first pass, silently skipping the
    # staleness check.  Materialise it once up front.
    url_list = [url for url in urls if url]

    checks = [url_checks[url] for url in url_list if url in url_checks]
    keyword_hit = any(check.keyword_hit for check in checks)
    valid = any(check.valid for check in checks)

    stale = False
    if not keyword_hit:
        for url in url_list:
            repo = github_repo(url)
            if not repo:
                continue
            commit_date = commit_dates.get(repo)
            if commit_date and commit_date < TWO_YEARS_AGO:
                stale = True
                break

    return {
        "keyword_hit": keyword_hit,
        "valid": valid,
        "stale": stale,
    }
| 154 | + |
| 155 | + |
def update_project(path: Path, data: Dict, assessment: Dict[str, bool]) -> bool:
    """Apply *assessment* to one project file; return True if it was rewritten.

    A project is deprecated when a keyword was spotted, its repository is
    stale, or none of its links responded.  Deprecation adds the "deprecated"
    category and drops a ``recommended: true`` flag; a healthy project has any
    leftover "deprecated" category removed.
    """
    deprecate = (
        assessment["keyword_hit"]
        or assessment["stale"]
        or not assessment["valid"]
    )

    if data.get("category") is None:
        data["category"] = CommentedSeq()
    categories = data["category"]

    changed = False
    if deprecate:
        if "deprecated" not in categories:
            categories.append("deprecated")
            changed = True
        if data.get("recommended") is True:
            del data["recommended"]
            changed = True
    elif "deprecated" in categories:
        categories.remove("deprecated")
        changed = True

    # Only touch the file when something actually changed.
    if changed:
        dump_yaml(path, data)
    return changed
| 181 | + |
| 182 | + |
async def audit_projects(project_files: List[Path]) -> int:
    """Audit every project file and return how many files were modified."""
    # First pass: load each YAML file and collect its candidate URLs.
    loaded = []
    candidate_urls: Set[str] = set()
    for path in project_files:
        data = load_yaml(path)
        links = []
        for field in ("source", "homepage"):
            raw = data.get(field)
            if isinstance(raw, str) and raw.strip():
                links.append(raw.strip())
        candidate_urls.update(links)
        loaded.append((path, data, links))

    # Probe all URLs, then all GitHub commit feeds, in bulk.
    url_checks = await collect_url_data(candidate_urls)
    repos = set()
    for url in candidate_urls:
        repo = github_repo(url)
        if repo:
            repos.add(repo)
    commit_dates = await collect_commit_data(repos)

    # Second pass: assess each project and rewrite the files that changed.
    updated = 0
    for path, data, links in loaded:
        assessment = decide_deprecation(links, url_checks, commit_dates)
        if update_project(path, data, assessment):
            updated += 1
    return updated
| 206 | + |
| 207 | + |
def main() -> int:
    """Entry point: audit all project YAML files and report the tally."""
    files = sorted(PROJECT_DIR.glob("*.yml"))
    count = asyncio.run(audit_projects(files))
    print(f"Updated {count} project files.")
    return 0
| 213 | + |
| 214 | + |
# Allow running as a script; the process exit status mirrors main()'s return.
if __name__ == "__main__":
    sys.exit(main())
0 commit comments