Skip to content

Commit b35132a

Browse files
committed
Skip DMCA'd repos which return a 451 response
Log a warning and the link to the DMCA notice. Continue backing up other repositories instead of crashing. Closes #163
1 parent 6fb0d86 commit b35132a

File tree

2 files changed

+204
-29
lines changed

2 files changed

+204
-29
lines changed

github_backup/github_backup.py

Lines changed: 57 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,14 @@
3737
FILE_URI_PREFIX = "file://"
3838
logger = logging.getLogger(__name__)
3939

40+
41+
class RepositoryUnavailableError(Exception):
    """Raised when a repository is unavailable due to legal reasons (e.g., DMCA takedown).

    Attributes:
        dmca_url: URL of the legal/takedown notice supplied by the raiser,
            or ``None`` when no such link could be determined.
    """

    def __init__(self, message, dmca_url=None):
        """Create the error.

        Args:
            message: Human-readable description; becomes ``str(error)``.
            dmca_url: Optional link to the takedown notice.
        """
        super().__init__(message)
        # Kept as a plain attribute so handlers can log the notice link
        # without having to parse it back out of the message text.
        self.dmca_url = dmca_url
47+
4048
# Setup SSL context with fallback chain
4149
https_ctx = ssl.create_default_context()
4250
if https_ctx.get_ca_certs():
@@ -612,6 +620,19 @@ def retrieve_data_gen(args, template, query_args=None, single_request=False):
612620

613621
status_code = int(r.getcode())
614622

623+
# Handle DMCA takedown (HTTP 451) - raise exception to skip entire repository
624+
if status_code == 451:
625+
dmca_url = None
626+
try:
627+
response_data = json.loads(r.read().decode("utf-8"))
628+
dmca_url = response_data.get("block", {}).get("html_url")
629+
except Exception:
630+
pass
631+
raise RepositoryUnavailableError(
632+
"Repository unavailable due to legal reasons (HTTP 451)",
633+
dmca_url=dmca_url
634+
)
635+
615636
# Check if we got correct data
616637
try:
617638
response = json.loads(r.read().decode("utf-8"))
@@ -1668,40 +1689,47 @@ def backup_repositories(args, output_directory, repositories):
16681689

16691690
continue # don't try to back anything else for a gist; it doesn't exist
16701691

1671-
download_wiki = args.include_wiki or args.include_everything
1672-
if repository["has_wiki"] and download_wiki:
1673-
fetch_repository(
1674-
repository["name"],
1675-
repo_url.replace(".git", ".wiki.git"),
1676-
os.path.join(repo_cwd, "wiki"),
1677-
skip_existing=args.skip_existing,
1678-
bare_clone=args.bare_clone,
1679-
lfs_clone=args.lfs_clone,
1680-
no_prune=args.no_prune,
1681-
)
1682-
if args.include_issues or args.include_everything:
1683-
backup_issues(args, repo_cwd, repository, repos_template)
1692+
try:
1693+
download_wiki = args.include_wiki or args.include_everything
1694+
if repository["has_wiki"] and download_wiki:
1695+
fetch_repository(
1696+
repository["name"],
1697+
repo_url.replace(".git", ".wiki.git"),
1698+
os.path.join(repo_cwd, "wiki"),
1699+
skip_existing=args.skip_existing,
1700+
bare_clone=args.bare_clone,
1701+
lfs_clone=args.lfs_clone,
1702+
no_prune=args.no_prune,
1703+
)
1704+
if args.include_issues or args.include_everything:
1705+
backup_issues(args, repo_cwd, repository, repos_template)
16841706

1685-
if args.include_pulls or args.include_everything:
1686-
backup_pulls(args, repo_cwd, repository, repos_template)
1707+
if args.include_pulls or args.include_everything:
1708+
backup_pulls(args, repo_cwd, repository, repos_template)
16871709

1688-
if args.include_milestones or args.include_everything:
1689-
backup_milestones(args, repo_cwd, repository, repos_template)
1710+
if args.include_milestones or args.include_everything:
1711+
backup_milestones(args, repo_cwd, repository, repos_template)
16901712

1691-
if args.include_labels or args.include_everything:
1692-
backup_labels(args, repo_cwd, repository, repos_template)
1713+
if args.include_labels or args.include_everything:
1714+
backup_labels(args, repo_cwd, repository, repos_template)
16931715

1694-
if args.include_hooks or args.include_everything:
1695-
backup_hooks(args, repo_cwd, repository, repos_template)
1716+
if args.include_hooks or args.include_everything:
1717+
backup_hooks(args, repo_cwd, repository, repos_template)
16961718

1697-
if args.include_releases or args.include_everything:
1698-
backup_releases(
1699-
args,
1700-
repo_cwd,
1701-
repository,
1702-
repos_template,
1703-
include_assets=args.include_assets or args.include_everything,
1704-
)
1719+
if args.include_releases or args.include_everything:
1720+
backup_releases(
1721+
args,
1722+
repo_cwd,
1723+
repository,
1724+
repos_template,
1725+
include_assets=args.include_assets or args.include_everything,
1726+
)
1727+
except RepositoryUnavailableError as e:
1728+
logger.warning(f"Repository {repository['full_name']} is unavailable (HTTP 451)")
1729+
if e.dmca_url:
1730+
logger.warning(f"DMCA notice: {e.dmca_url}")
1731+
logger.info(f"Skipping remaining resources for {repository['full_name']}")
1732+
continue
17051733

17061734
if args.incremental:
17071735
if last_update == "0000-00-00T00:00:00Z":

tests/test_http_451.py

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
"""Tests for HTTP 451 (DMCA takedown) handling."""
2+
3+
import json
4+
from unittest.mock import Mock, patch
5+
6+
import pytest
7+
8+
from github_backup import github_backup
9+
10+
11+
class TestHTTP451Exception:
    """Test suite for HTTP 451 DMCA takedown exception handling."""

    @staticmethod
    def _make_args():
        """Return mock CLI args with no credentials and throttling disabled."""
        args = Mock()
        args.as_app = False
        args.token_fine = None
        args.token_classic = None
        args.username = None
        args.password = None
        args.osx_keychain_item_name = None
        args.osx_keychain_item_account = None
        args.throttle_limit = None
        args.throttle_pause = 0
        return args

    @staticmethod
    def _make_response(status_code, body, reason):
        """Return a mock HTTP response with the given status, raw body and reason."""
        response = Mock()
        response.getcode.return_value = status_code
        response.read.return_value = body
        response.headers = {"x-ratelimit-remaining": "5000"}
        response.reason = reason
        return response

    def _retrieve(self, response, url):
        """Drain ``retrieve_data_gen`` for *url* with ``_get_response`` stubbed.

        The stub mirrors the ``(request, auth, template)`` call signature and
        always hands back *response* together with an empty error list.
        """

        def mock_get_response(request, auth, template):
            return response, []

        with patch(
            "github_backup.github_backup._get_response",
            side_effect=mock_get_response,
        ):
            # The generator is lazy; list() forces the request to happen
            # while the patch is still active.
            return list(github_backup.retrieve_data_gen(self._make_args(), url))

    def test_repository_unavailable_error_raised(self):
        """HTTP 451 should raise RepositoryUnavailableError with DMCA URL."""
        dmca_data = {
            "message": "Repository access blocked",
            "block": {
                "reason": "dmca",
                "created_at": "2024-11-12T14:38:04Z",
                "html_url": "https://github.com/github/dmca/blob/master/2024/11/2024-11-04-source-code.md",
            },
        }
        mock_response = self._make_response(
            451,
            json.dumps(dmca_data).encode("utf-8"),
            "Unavailable For Legal Reasons",
        )

        with pytest.raises(github_backup.RepositoryUnavailableError) as exc_info:
            self._retrieve(mock_response, "https://api.github.com/repos/test/dmca/issues")

        # Check exception has DMCA URL
        assert exc_info.value.dmca_url == "https://github.com/github/dmca/blob/master/2024/11/2024-11-04-source-code.md"
        assert "451" in str(exc_info.value)

    def test_repository_unavailable_error_without_dmca_url(self):
        """HTTP 451 without DMCA details should still raise exception."""
        mock_response = self._make_response(
            451,
            b'{"message": "Blocked"}',
            "Unavailable For Legal Reasons",
        )

        with pytest.raises(github_backup.RepositoryUnavailableError) as exc_info:
            self._retrieve(mock_response, "https://api.github.com/repos/test/dmca/issues")

        # Exception raised even without DMCA URL
        assert exc_info.value.dmca_url is None
        assert "451" in str(exc_info.value)

    def test_repository_unavailable_error_with_malformed_json(self):
        """HTTP 451 with malformed JSON should still raise exception."""
        mock_response = self._make_response(
            451,
            b"invalid json {",
            "Unavailable For Legal Reasons",
        )

        with pytest.raises(github_backup.RepositoryUnavailableError):
            self._retrieve(mock_response, "https://api.github.com/repos/test/dmca/issues")

    def test_other_http_errors_unchanged(self):
        """Other HTTP errors should still raise generic Exception."""
        mock_response = self._make_response(
            404,
            b'{"message": "Not Found"}',
            "Not Found",
        )

        # Should raise generic Exception, not RepositoryUnavailableError
        with pytest.raises(Exception) as exc_info:
            self._retrieve(mock_response, "https://api.github.com/repos/test/notfound/issues")

        assert not isinstance(exc_info.value, github_backup.RepositoryUnavailableError)
        assert "404" in str(exc_info.value)
144+
145+
146+
if __name__ == "__main__":
    # pytest.main() returns the run's exit code; re-raise it as the process
    # exit status so running this file directly fails when a test fails.
    raise SystemExit(pytest.main([__file__, "-v"]))

0 commit comments

Comments
 (0)