Skip to content

Commit 995b7ed

Browse files
authored
Merge pull request #454 from Iamrodos/http-451
Skip DMCA'd repos which return a 451 response
2 parents 6fb0d86 + 7840528 commit 995b7ed

File tree

2 files changed

+201
-29
lines changed

2 files changed

+201
-29
lines changed

github_backup/github_backup.py

Lines changed: 58 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,15 @@
3737
FILE_URI_PREFIX = "file://"
3838
logger = logging.getLogger(__name__)
3939

40+
41+
class RepositoryUnavailableError(Exception):
    """Signal that a repository cannot be accessed for legal reasons.

    A DMCA takedown is the typical example.

    Attributes:
        dmca_url: Value supplied by the raiser as ``dmca_url`` (a link to
            the takedown notice, judging by the name), or ``None`` when
            not provided.
    """

    def __init__(self, message, dmca_url=None):
        # Record the notice URL first; the base class stores the message.
        self.dmca_url = dmca_url
        super().__init__(message)
47+
48+
4049
# Setup SSL context with fallback chain
4150
https_ctx = ssl.create_default_context()
4251
if https_ctx.get_ca_certs():
@@ -612,6 +621,19 @@ def retrieve_data_gen(args, template, query_args=None, single_request=False):
612621

613622
status_code = int(r.getcode())
614623

624+
# Handle DMCA takedown (HTTP 451) - raise exception to skip entire repository
625+
if status_code == 451:
626+
dmca_url = None
627+
try:
628+
response_data = json.loads(r.read().decode("utf-8"))
629+
dmca_url = response_data.get("block", {}).get("html_url")
630+
except Exception:
631+
pass
632+
raise RepositoryUnavailableError(
633+
"Repository unavailable due to legal reasons (HTTP 451)",
634+
dmca_url=dmca_url
635+
)
636+
615637
# Check if we got correct data
616638
try:
617639
response = json.loads(r.read().decode("utf-8"))
@@ -1668,40 +1690,47 @@ def backup_repositories(args, output_directory, repositories):
16681690

16691691
continue # don't try to back anything else for a gist; it doesn't exist
16701692

1671-
download_wiki = args.include_wiki or args.include_everything
1672-
if repository["has_wiki"] and download_wiki:
1673-
fetch_repository(
1674-
repository["name"],
1675-
repo_url.replace(".git", ".wiki.git"),
1676-
os.path.join(repo_cwd, "wiki"),
1677-
skip_existing=args.skip_existing,
1678-
bare_clone=args.bare_clone,
1679-
lfs_clone=args.lfs_clone,
1680-
no_prune=args.no_prune,
1681-
)
1682-
if args.include_issues or args.include_everything:
1683-
backup_issues(args, repo_cwd, repository, repos_template)
1693+
try:
1694+
download_wiki = args.include_wiki or args.include_everything
1695+
if repository["has_wiki"] and download_wiki:
1696+
fetch_repository(
1697+
repository["name"],
1698+
repo_url.replace(".git", ".wiki.git"),
1699+
os.path.join(repo_cwd, "wiki"),
1700+
skip_existing=args.skip_existing,
1701+
bare_clone=args.bare_clone,
1702+
lfs_clone=args.lfs_clone,
1703+
no_prune=args.no_prune,
1704+
)
1705+
if args.include_issues or args.include_everything:
1706+
backup_issues(args, repo_cwd, repository, repos_template)
16841707

1685-
if args.include_pulls or args.include_everything:
1686-
backup_pulls(args, repo_cwd, repository, repos_template)
1708+
if args.include_pulls or args.include_everything:
1709+
backup_pulls(args, repo_cwd, repository, repos_template)
16871710

1688-
if args.include_milestones or args.include_everything:
1689-
backup_milestones(args, repo_cwd, repository, repos_template)
1711+
if args.include_milestones or args.include_everything:
1712+
backup_milestones(args, repo_cwd, repository, repos_template)
16901713

1691-
if args.include_labels or args.include_everything:
1692-
backup_labels(args, repo_cwd, repository, repos_template)
1714+
if args.include_labels or args.include_everything:
1715+
backup_labels(args, repo_cwd, repository, repos_template)
16931716

1694-
if args.include_hooks or args.include_everything:
1695-
backup_hooks(args, repo_cwd, repository, repos_template)
1717+
if args.include_hooks or args.include_everything:
1718+
backup_hooks(args, repo_cwd, repository, repos_template)
16961719

1697-
if args.include_releases or args.include_everything:
1698-
backup_releases(
1699-
args,
1700-
repo_cwd,
1701-
repository,
1702-
repos_template,
1703-
include_assets=args.include_assets or args.include_everything,
1704-
)
1720+
if args.include_releases or args.include_everything:
1721+
backup_releases(
1722+
args,
1723+
repo_cwd,
1724+
repository,
1725+
repos_template,
1726+
include_assets=args.include_assets or args.include_everything,
1727+
)
1728+
except RepositoryUnavailableError as e:
1729+
logger.warning(f"Repository {repository['full_name']} is unavailable (HTTP 451)")
1730+
if e.dmca_url:
1731+
logger.warning(f"DMCA notice: {e.dmca_url}")
1732+
logger.info(f"Skipping remaining resources for {repository['full_name']}")
1733+
continue
17051734

17061735
if args.incremental:
17071736
if last_update == "0000-00-00T00:00:00Z":

tests/test_http_451.py

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
"""Tests for HTTP 451 (DMCA takedown) handling."""
2+
3+
import json
4+
from unittest.mock import Mock, patch
5+
6+
import pytest
7+
8+
from github_backup import github_backup
9+
10+
11+
class TestHTTP451Exception:
    """Test suite for HTTP 451 DMCA takedown exception handling.

    Each test feeds ``retrieve_data_gen`` a canned mock response by patching
    ``github_backup._get_response`` and then checks which exception
    propagates out of the generator.
    """

    DMCA_URL = "https://github.com/github/dmca/blob/master/2024/11/2024-11-04-source-code.md"
    DMCA_ISSUES_URL = "https://api.github.com/repos/test/dmca/issues"

    @staticmethod
    def _make_args():
        """Return a mock ``args`` namespace with auth and throttling disabled."""
        # presumably these are the attributes retrieve_data_gen reads for
        # auth/throttling -- verify against github_backup if it changes.
        args = Mock()
        args.as_app = False
        args.token_fine = None
        args.token_classic = None
        args.username = None
        args.password = None
        args.osx_keychain_item_name = None
        args.osx_keychain_item_account = None
        args.throttle_limit = None
        args.throttle_pause = 0
        return args

    @staticmethod
    def _make_response(status_code, body, reason):
        """Return a mock HTTP response with the given status, raw body and reason."""
        response = Mock()
        response.getcode.return_value = status_code
        response.read.return_value = body
        response.headers = {"x-ratelimit-remaining": "5000"}
        response.reason = reason
        return response

    def _retrieve_all(self, response, url):
        """Exhaust ``retrieve_data_gen`` for *url* while ``_get_response`` yields *response*.

        Any exception raised by the generator propagates to the caller.
        """

        def fake_get_response(request, auth, template):
            # Same (response, errors) pair shape the tests always supplied.
            return response, []

        with patch(
            "github_backup.github_backup._get_response",
            side_effect=fake_get_response,
        ):
            return list(github_backup.retrieve_data_gen(self._make_args(), url))

    def test_repository_unavailable_error_raised(self):
        """HTTP 451 should raise RepositoryUnavailableError with DMCA URL."""
        dmca_data = {
            "message": "Repository access blocked",
            "block": {
                "reason": "dmca",
                "created_at": "2024-11-12T14:38:04Z",
                "html_url": self.DMCA_URL,
            },
        }
        response = self._make_response(
            451,
            json.dumps(dmca_data).encode("utf-8"),
            "Unavailable For Legal Reasons",
        )

        with pytest.raises(github_backup.RepositoryUnavailableError) as exc_info:
            self._retrieve_all(response, self.DMCA_ISSUES_URL)

        # Check exception has DMCA URL
        assert exc_info.value.dmca_url == self.DMCA_URL
        assert "451" in str(exc_info.value)

    def test_repository_unavailable_error_without_dmca_url(self):
        """HTTP 451 without DMCA details should still raise exception."""
        response = self._make_response(
            451,
            b'{"message": "Blocked"}',
            "Unavailable For Legal Reasons",
        )

        with pytest.raises(github_backup.RepositoryUnavailableError) as exc_info:
            self._retrieve_all(response, self.DMCA_ISSUES_URL)

        # Exception raised even without DMCA URL
        assert exc_info.value.dmca_url is None
        assert "451" in str(exc_info.value)

    def test_repository_unavailable_error_with_malformed_json(self):
        """HTTP 451 with malformed JSON should still raise exception."""
        response = self._make_response(
            451,
            b"invalid json {",
            "Unavailable For Legal Reasons",
        )

        with pytest.raises(github_backup.RepositoryUnavailableError):
            self._retrieve_all(response, self.DMCA_ISSUES_URL)

    def test_other_http_errors_unchanged(self):
        """Other HTTP errors should still raise generic Exception."""
        response = self._make_response(404, b'{"message": "Not Found"}', "Not Found")

        # Should raise generic Exception, not RepositoryUnavailableError
        with pytest.raises(Exception) as exc_info:
            self._retrieve_all(
                response, "https://api.github.com/repos/test/notfound/issues"
            )

        assert not isinstance(exc_info.value, github_backup.RepositoryUnavailableError)
        assert "404" in str(exc_info.value)
140+
141+
142+
if __name__ == "__main__":
    # Allow running this test module directly: verbose pytest run of this file only.
    cli_args = [__file__, "-v"]
    pytest.main(cli_args)

0 commit comments

Comments
 (0)