Skip to content

Commit 82c4ee4

Browse files
Merge pull request #29 from drupaltools/codex/update-project-categories-based-on-web-search
2 parents 5c01ded + 9855f38 commit 82c4ee4

File tree

1 file changed

+216
-0
lines changed

1 file changed

+216
-0
lines changed
Lines changed: 216 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,216 @@
1+
#!/usr/bin/env python3
2+
"""Audit project metadata for deprecation signals.
3+
4+
This script analyses each YAML file in `_data/projects` and verifies whether
5+
its linked resources show indications of deprecation:
6+
7+
* The linked `source` or `homepage` mentions common deprecation keywords.
8+
* The most recent commit on a GitHub repository is older than two years.
9+
* The linked pages respond successfully.
10+
11+
Projects that fail the checks are marked as deprecated by adding the
12+
`deprecated` category and removing the `recommended: true` flag when present.
13+
"""
14+
15+
from __future__ import annotations
16+
17+
import asyncio
18+
import datetime as dt
19+
import re
20+
import sys
21+
from dataclasses import dataclass
22+
from pathlib import Path
23+
from typing import Dict, Iterable, List, Optional, Set
24+
from xml.etree import ElementTree as ET
25+
26+
import aiohttp
27+
from ruamel.yaml import YAML
28+
from ruamel.yaml.comments import CommentedSeq
29+
30+
# Directory with one YAML file per project: `<two levels above this file>/_data/projects`.
PROJECT_DIR = Path(__file__).resolve().parents[1].joinpath("_data", "projects")

# Lower-case phrases searched for in the lower-cased body of each fetched page.
KEYWORDS = ("deprecated", "obsolete", "no further development")

# Staleness cut-off, computed once at import time as a timezone-aware UTC value
# so it can be compared with the offset-aware timestamps parsed from feeds.
TWO_YEARS_AGO = dt.datetime.now(tz=dt.timezone.utc) - dt.timedelta(days=2 * 365)

# Sent with every request so the audit identifies itself to remote servers.
HEADERS = {
    "User-Agent": "drupaltools-audit-script/1.0 (+https://drupaltools.github.io)",
}

# Overall per-request time budget, in seconds.
REQUEST_TIMEOUT = aiohttp.ClientTimeout(total=12)

# Upper bound used both for the connector pool and the request semaphore.
CONCURRENCY = 12
38+
39+
40+
def load_yaml(path: Path) -> Dict:
    """Parse the YAML document stored at *path*.

    The round-trip (``rt``) loader is used with quote preservation enabled
    so the document can later be written back with its quoting intact.
    """
    parser = YAML(typ="rt")
    parser.preserve_quotes = True
    with open(path, mode="r", encoding="utf-8") as stream:
        document = parser.load(stream)
    return document
45+
46+
47+
def dump_yaml(path: Path, data: Dict) -> None:
    """Serialise *data* to *path* as block-style, round-trip YAML.

    The target file is overwritten. Sequences are emitted with an indent
    of 2 and a dash offset of 2.
    """
    emitter = YAML(typ="rt")
    emitter.preserve_quotes = True
    emitter.default_flow_style = False
    # Very large width -- presumably chosen so long scalars are never wrapped.
    emitter.width = 4096
    emitter.indent(sequence=2, offset=2)
    with open(path, mode="w", encoding="utf-8") as stream:
        emitter.dump(data, stream)
55+
56+
57+
@dataclass
class UrlCheck:
    """Outcome of probing a single URL."""

    # True when the request completed with a status in the 200-399 range.
    valid: bool
    # True when the page body contained at least one deprecation keyword.
    keyword_hit: bool
61+
62+
63+
async def fetch_page(session: aiohttp.ClientSession, semaphore: asyncio.Semaphore, url: str) -> UrlCheck:
    """Download *url* and report its reachability and keyword matches.

    The request runs while holding *semaphore*. A response counts as valid
    when its status is in the 200-399 range; only valid responses have their
    body read and scanned for ``KEYWORDS`` (case-insensitively). Client
    errors and timeouts yield ``UrlCheck(valid=False, keyword_hit=False)``.
    """
    async with semaphore:
        try:
            async with session.get(url, timeout=REQUEST_TIMEOUT) as reply:
                if 200 <= reply.status < 400:
                    reachable = True
                    # Undecodable bytes are dropped rather than raising.
                    body = await reply.text(errors="ignore")
                else:
                    reachable = False
                    body = ""
        except (aiohttp.ClientError, asyncio.TimeoutError):
            return UrlCheck(valid=False, keyword_hit=False)

    haystack = body.lower()
    found = False
    for phrase in KEYWORDS:
        if phrase in haystack:
            found = True
            break
    return UrlCheck(valid=reachable, keyword_hit=found)
75+
76+
77+
async def fetch_commit_date(session: aiohttp.ClientSession, semaphore: asyncio.Semaphore, repo: str) -> Optional[dt.datetime]:
    """Return the feed-level ``updated`` timestamp of a GitHub commit feed.

    *repo* is an ``owner/name`` slug. The repository's ``commits.atom`` feed
    is downloaded while holding *semaphore*, and the ``<updated>`` element
    directly under the feed root is parsed as an ISO-8601 datetime.

    Returns ``None`` when the request fails or times out, the status is
    400 or above, the XML cannot be parsed, the element is missing or
    empty, or its text is not a valid ISO-8601 timestamp.
    """
    feed_url = f"https://github.com/{repo}/commits.atom"
    async with semaphore:
        try:
            async with session.get(feed_url, timeout=REQUEST_TIMEOUT) as reply:
                if reply.status >= 400:
                    return None
                feed_xml = await reply.text(errors="ignore")
        except (aiohttp.ClientError, asyncio.TimeoutError):
            return None

    namespaces = {"atom": "http://www.w3.org/2005/Atom"}
    try:
        feed = ET.fromstring(feed_xml)
        stamp = feed.find("atom:updated", namespaces)
        if stamp is None or not stamp.text:
            return None
        # Rewrite a trailing "Z" as an explicit UTC offset before parsing.
        return dt.datetime.fromisoformat(stamp.text.replace("Z", "+00:00"))
    except (ET.ParseError, ValueError):
        return None
100+
101+
102+
def github_repo(url: str) -> Optional[str]:
    """Extract the ``owner/repo`` slug from a GitHub URL.

    Args:
        url: Any URL; only ``http(s)://github.com/<owner>/<repo>...`` matches.

    Returns:
        ``"owner/repo"`` with a trailing ``.git`` suffix removed, or ``None``
        when *url* does not point at a GitHub repository.
    """
    match = re.match(r"https?://github\.com/([^/]+)/([^/#?]+)", url)
    if not match:
        return None
    owner, repo = match.groups()
    # str.rstrip(".git") removes any trailing '.', 'g', 'i' or 't' characters
    # (turning "widget" into "widge"); strip only a literal ".git" suffix.
    if repo.endswith(".git"):
        repo = repo[: -len(".git")]
    return f"{owner}/{repo}"
109+
110+
111+
async def collect_url_data(urls: Set[str]) -> Dict[str, UrlCheck]:
    """Probe every URL in *urls* and map each one to its ``UrlCheck``.

    All requests share one HTTP session whose connector is limited to
    ``CONCURRENCY`` connections, and one semaphore of the same size.
    """
    gate = asyncio.Semaphore(CONCURRENCY)
    pool = aiohttp.TCPConnector(limit=CONCURRENCY)
    async with aiohttp.ClientSession(headers=HEADERS, connector=pool) as session:
        # Schedule every probe before awaiting any of them so they overlap.
        scheduled = [
            (url, asyncio.create_task(fetch_page(session, gate, url)))
            for url in urls
        ]
        return {url: await job for url, job in scheduled}
120+
121+
122+
async def collect_commit_data(repos: Set[str]) -> Dict[str, Optional[dt.datetime]]:
    """Look up the commit-feed timestamp for every slug in *repos*.

    Returns a mapping from each ``owner/name`` slug to the value produced by
    ``fetch_commit_date`` (``None`` when no timestamp could be obtained).
    """
    gate = asyncio.Semaphore(CONCURRENCY)
    pool = aiohttp.TCPConnector(limit=CONCURRENCY)
    async with aiohttp.ClientSession(headers=HEADERS, connector=pool) as session:
        slugs = list(repos)
        # Start every lookup first; the awaits below only collect results.
        jobs = [
            asyncio.create_task(fetch_commit_date(session, gate, slug))
            for slug in slugs
        ]
        dates: Dict[str, Optional[dt.datetime]] = {}
        for slug, job in zip(slugs, jobs):
            dates[slug] = await job
    return dates
131+
132+
133+
def decide_deprecation(urls: Iterable[str], url_checks: Dict[str, UrlCheck], commit_dates: Dict[str, Optional[dt.datetime]]) -> Dict[str, bool]:
    """Combine per-URL and per-repository findings for one project.

    Args:
        urls: The project's linked URLs; empty entries are ignored.
        url_checks: Probe results keyed by URL; URLs without an entry are
            skipped.
        commit_dates: Commit-feed timestamps keyed by ``owner/repo`` slug.

    Returns:
        A dict with three boolean flags:

        * ``keyword_hit`` -- some checked URL matched a deprecation keyword.
        * ``valid`` -- at least one checked URL responded successfully
          (``False`` when there are no checked URLs at all).
        * ``stale`` -- some linked GitHub repository has a known timestamp
          older than ``TWO_YEARS_AGO``. Only evaluated when ``keyword_hit``
          is ``False``; otherwise reported as ``False``.
    """
    # `urls` is only promised to be an Iterable yet is traversed twice below;
    # materialise it once so a one-shot iterator is not exhausted early.
    url_list = [url for url in urls if url]

    checks = [check for check in map(url_checks.get, url_list) if check]
    keyword_hit = any(check.keyword_hit for check in checks)
    valid = any(check.valid for check in checks)

    stale = False
    if not keyword_hit:
        for url in url_list:
            repo = github_repo(url)
            if not repo:
                continue
            commit_date = commit_dates.get(repo)
            if commit_date and commit_date < TWO_YEARS_AGO:
                stale = True
                break

    return {
        "keyword_hit": keyword_hit,
        "valid": valid,
        "stale": stale,
    }
154+
155+
156+
def update_project(path: Path, data: Dict, assessment: Dict[str, bool]) -> bool:
    """Apply an assessment to a project's metadata, saving it if it changed.

    A project is deprecated when the assessment reports a keyword hit, a
    stale repository, or no valid URL. Deprecated projects gain the
    ``deprecated`` category and lose a ``recommended: true`` flag; all other
    projects have the ``deprecated`` category removed if it is present.
    A missing ``category`` key is initialised to an empty sequence.

    Returns ``True`` when *data* was modified and written back to *path*.
    """
    healthy = (
        not assessment["keyword_hit"]
        and not assessment["stale"]
        and assessment["valid"]
    )

    tags = data.get("category")
    if tags is None:
        tags = CommentedSeq()
        data["category"] = tags

    changed = False
    if healthy:
        if "deprecated" in tags:
            tags.remove("deprecated")
            changed = True
    else:
        if "deprecated" not in tags:
            tags.append("deprecated")
            changed = True
        # Only an explicit boolean True is dropped; other values are kept.
        if data.get("recommended") is True:
            data.pop("recommended")
            changed = True

    if not changed:
        return False
    dump_yaml(path, data)
    return True
181+
182+
183+
async def audit_projects(project_files: List[Path]) -> int:
    """Audit every project file and rewrite those whose status changed.

    Each file is loaded and its ``source`` and ``homepage`` URLs collected.
    All distinct URLs are probed, then commit dates are fetched for the
    GitHub repositories among them. Finally each project is assessed and
    updated.

    Args:
        project_files: Paths of the YAML project files to audit.

    Returns:
        The number of files that were modified and written back. This
        counts files changed in either direction, not only newly
        deprecated ones.
    """
    projects = []
    all_urls: Set[str] = set()
    for path in project_files:
        data = load_yaml(path)
        if not isinstance(data, dict):
            # An empty file loads as None and a top-level list has no .get();
            # skip such files instead of aborting the whole audit.
            print(f"Skipping {path}: top-level YAML value is not a mapping.", file=sys.stderr)
            continue
        urls = []
        for key in ("source", "homepage"):
            value = data.get(key)
            if isinstance(value, str) and value.strip():
                urls.append(value.strip())
        all_urls.update(urls)
        projects.append((path, data, urls))

    url_checks = await collect_url_data(all_urls)
    repos = {repo for url in all_urls if (repo := github_repo(url))}
    commit_dates = await collect_commit_data(repos)

    modified_count = 0
    for path, data, urls in projects:
        assessment = decide_deprecation(urls, url_checks, commit_dates)
        if update_project(path, data, assessment):
            modified_count += 1
    return modified_count
206+
207+
208+
def main() -> int:
    """Audit all ``*.yml`` files in ``PROJECT_DIR`` and print a summary.

    Files are processed in sorted path order. Always returns exit status 0.
    """
    candidates = sorted(PROJECT_DIR.glob("*.yml"))
    changed = asyncio.run(audit_projects(candidates))
    print(f"Updated {changed} project files.")
    return 0
213+
214+
215+
# Run the audit only when executed as a script; main()'s return value
# becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())

0 commit comments

Comments
 (0)