
Commit 5103ff0

Avoid rewriting unchanged JSON files for labels, milestones, releases, hooks, followers, and following
This change reduces unnecessary writes when backing up metadata that changes infrequently. The implementation compares existing file content before writing and skips the write if the content is identical, preserving file timestamps.

Key changes:

- Added json_dump_if_changed() helper that compares content before writing
- Uses atomic writes (temp file + rename) for all metadata files
- NOT applied to issues/pulls (they use incremental_by_files logic)
- Made log messages consistent and past tense ("Saved" instead of "Saving")
- Added informative logging showing skip counts

Fixes josegonzalez#133
1 parent 8b7512c commit 5103ff0
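To make the mechanism concrete, here is a minimal, self-contained sketch of the compare-before-write pattern described above (a simplified standalone illustration, not code from this repository; the real helper is json_dump_if_changed() in the diff below):

import json
import os
import tempfile


def dump_if_changed(data, path):
    # Serialize first, then skip the write when the file already holds
    # identical text, leaving the modification timestamp untouched.
    new_content = json.dumps(data, sort_keys=True, indent=4)
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as f:
            if f.read() == new_content:
                return False  # unchanged: no write performed
    tmp = path + ".temp"  # temp file + rename keeps the write atomic
    with open(tmp, "w", encoding="utf-8") as f:
        f.write(new_content)
    os.rename(tmp, path)
    return True


# The second call is a no-op, so the file's mtime is preserved.
with tempfile.TemporaryDirectory() as d:
    target = os.path.join(d, "example.json")
    assert dump_if_changed({"name": "bug"}, target) is True
    assert dump_if_changed({"name": "bug"}, target) is False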

File tree

2 files changed: +287, -11 lines


github_backup/github_backup.py

Lines changed: 85 additions & 11 deletions
@@ -1898,11 +1898,21 @@ def backup_milestones(args, repo_cwd, repository, repos_template):
     for milestone in _milestones:
         milestones[milestone["number"]] = milestone
 
-    logger.info("Saving {0} milestones to disk".format(len(list(milestones.keys()))))
+    written_count = 0
     for number, milestone in list(milestones.items()):
         milestone_file = "{0}/{1}.json".format(milestone_cwd, number)
-        with codecs.open(milestone_file, "w", encoding="utf-8") as f:
-            json_dump(milestone, f)
+        if json_dump_if_changed(milestone, milestone_file):
+            written_count += 1
+
+    total = len(milestones)
+    if written_count == total:
+        logger.info("Saved {0} milestones to disk".format(total))
+    elif written_count == 0:
+        logger.info("{0} milestones unchanged, skipped write".format(total))
+    else:
+        logger.info("Saved {0} of {1} milestones to disk ({2} unchanged)".format(
+            written_count, total, total - written_count
+        ))
 
 
 def backup_labels(args, repo_cwd, repository, repos_template):
@@ -1955,19 +1965,17 @@ def backup_releases(args, repo_cwd, repository, repos_template, include_assets=F
             reverse=True,
         )
         releases = releases[: args.number_of_latest_releases]
-        logger.info("Saving the latest {0} releases to disk".format(len(releases)))
-    else:
-        logger.info("Saving {0} releases to disk".format(len(releases)))
 
     # for each release, store it
+    written_count = 0
    for release in releases:
         release_name = release["tag_name"]
         release_name_safe = release_name.replace("/", "__")
         output_filepath = os.path.join(
             release_cwd, "{0}.json".format(release_name_safe)
         )
-        with codecs.open(output_filepath, "w+", encoding="utf-8") as f:
-            json_dump(release, f)
+        if json_dump_if_changed(release, output_filepath):
+            written_count += 1
 
         if include_assets:
             assets = retrieve_data(args, release["assets_url"])
@@ -1984,6 +1992,17 @@ def backup_releases(args, repo_cwd, repository, repos_template, include_assets=F
                 fine=True if args.token_fine is not None else False,
             )
 
+    # Log the results
+    total = len(releases)
+    if written_count == total:
+        logger.info("Saved {0} releases to disk".format(total))
+    elif written_count == 0:
+        logger.info("{0} releases unchanged, skipped write".format(total))
+    else:
+        logger.info("Saved {0} of {1} releases to disk ({2} unchanged)".format(
+            written_count, total, total - written_count
+        ))
+
 
 def fetch_repository(
     name,
@@ -2108,9 +2127,10 @@ def _backup_data(args, name, template, output_file, output_directory):
         mkdir_p(output_directory)
         data = retrieve_data(args, template)
 
-        logger.info("Writing {0} {1} to disk".format(len(data), name))
-        with codecs.open(output_file, "w", encoding="utf-8") as f:
-            json_dump(data, f)
+        if json_dump_if_changed(data, output_file):
+            logger.info("Saved {0} {1} to disk".format(len(data), name))
+        else:
+            logger.info("{0} {1} unchanged, skipped write".format(len(data), name))
 
 
 def json_dump(data, output_file):
@@ -2122,3 +2142,57 @@ def json_dump(data, output_file):
         indent=4,
         separators=(",", ": "),
     )
+
+
+def json_dump_if_changed(data, output_file_path):
+    """
+    Write JSON data to file only if content has changed.
+
+    Compares the serialized JSON data with the existing file content
+    and only writes if different. This prevents unnecessary file
+    modification timestamp updates and disk writes.
+
+    Uses atomic writes (temp file + rename) to prevent corruption
+    if the process is interrupted during the write.
+
+    Args:
+        data: The data to serialize as JSON
+        output_file_path: The path to the output file
+
+    Returns:
+        True if file was written (content changed or new file)
+        False if write was skipped (content unchanged)
+    """
+    # Serialize new data with consistent formatting matching json_dump()
+    new_content = json.dumps(
+        data,
+        ensure_ascii=False,
+        sort_keys=True,
+        indent=4,
+        separators=(",", ": "),
+    )
+
+    # Check if file exists and compare content
+    if os.path.exists(output_file_path):
+        try:
+            with codecs.open(output_file_path, "r", encoding="utf-8") as f:
+                existing_content = f.read()
+            if existing_content == new_content:
+                logger.debug(
+                    "Content unchanged, skipping write: {0}".format(output_file_path)
+                )
+                return False
+        except (OSError, UnicodeDecodeError) as e:
+            # If we can't read the existing file, write the new one
+            logger.debug(
+                "Error reading existing file {0}, will overwrite: {1}".format(
+                    output_file_path, e
+                )
+            )
+
+    # Write the file atomically using temp file + rename
+    temp_file = output_file_path + ".temp"
+    with codecs.open(temp_file, "w", encoding="utf-8") as f:
+        f.write(new_content)
+    os.rename(temp_file, output_file_path)  # Atomic on POSIX systems
+    return True
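A note on the final rename: os.rename over an existing destination is atomic on POSIX but raises FileExistsError on Windows, while os.replace provides the same overwrite-and-rename semantics on both platforms. If portability mattered, the write step could use it instead (a sketch; atomic_write is a hypothetical name, not part of this commit):

import os


def atomic_write(path, content):
    # Hypothetical cross-platform variant of the temp-file-plus-rename step:
    # os.replace overwrites an existing destination atomically where the
    # filesystem supports it, on POSIX and Windows alike.
    tmp = path + ".temp"
    with open(tmp, "w", encoding="utf-8") as f:
        f.write(content)
    os.replace(tmp, path)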

tests/test_json_dump_if_changed.py

Lines changed: 202 additions & 0 deletions
@@ -0,0 +1,202 @@
+"""Tests for json_dump_if_changed functionality."""
+
+import codecs
+import json
+import os
+import tempfile
+from unittest.mock import Mock, patch
+
+import pytest
+
+from github_backup import github_backup
+
+
+class TestJsonDumpIfChanged:
+    """Test suite for json_dump_if_changed function."""
+
+    def test_writes_new_file(self):
+        """Should write file when it doesn't exist."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            output_file = os.path.join(tmpdir, "test.json")
+            test_data = {"key": "value", "number": 42}
+
+            result = github_backup.json_dump_if_changed(test_data, output_file)
+
+            assert result is True
+            assert os.path.exists(output_file)
+
+            # Verify content matches expected format
+            with codecs.open(output_file, "r", encoding="utf-8") as f:
+                content = f.read()
+            loaded = json.loads(content)
+            assert loaded == test_data
+
+    def test_skips_unchanged_file(self):
+        """Should skip write when content is identical."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            output_file = os.path.join(tmpdir, "test.json")
+            test_data = {"key": "value", "number": 42}
+
+            # First write
+            result1 = github_backup.json_dump_if_changed(test_data, output_file)
+            assert result1 is True
+
+            # Get the initial mtime
+            mtime1 = os.path.getmtime(output_file)
+
+            # Second write with same data
+            result2 = github_backup.json_dump_if_changed(test_data, output_file)
+            assert result2 is False
+
+            # File should not have been modified
+            mtime2 = os.path.getmtime(output_file)
+            assert mtime1 == mtime2
+
+    def test_writes_when_content_changed(self):
+        """Should write file when content has changed."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            output_file = os.path.join(tmpdir, "test.json")
+            test_data1 = {"key": "value1"}
+            test_data2 = {"key": "value2"}
+
+            # First write
+            result1 = github_backup.json_dump_if_changed(test_data1, output_file)
+            assert result1 is True
+
+            # Second write with different data
+            result2 = github_backup.json_dump_if_changed(test_data2, output_file)
+            assert result2 is True
+
+            # Verify new content
+            with codecs.open(output_file, "r", encoding="utf-8") as f:
+                loaded = json.load(f)
+            assert loaded == test_data2
+
+    def test_uses_consistent_formatting(self):
+        """Should use same JSON formatting as json_dump."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            output_file = os.path.join(tmpdir, "test.json")
+            test_data = {"z": "last", "a": "first", "m": "middle"}
+
+            github_backup.json_dump_if_changed(test_data, output_file)
+
+            with codecs.open(output_file, "r", encoding="utf-8") as f:
+                content = f.read()
+
+            # Check for consistent formatting:
+            # - sorted keys
+            # - 4-space indent
+            # - comma-colon-space separator
+            expected = json.dumps(
+                test_data,
+                ensure_ascii=False,
+                sort_keys=True,
+                indent=4,
+                separators=(",", ": "),
+            )
+            assert content == expected
+
+    def test_atomic_write_always_used(self):
+        """Should always use temp file and rename for atomic writes."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            output_file = os.path.join(tmpdir, "test.json")
+            test_data = {"key": "value"}
+
+            result = github_backup.json_dump_if_changed(test_data, output_file)
+
+            assert result is True
+            assert os.path.exists(output_file)
+
+            # Temp file should not exist after atomic write
+            temp_file = output_file + ".temp"
+            assert not os.path.exists(temp_file)
+
+            # Verify content
+            with codecs.open(output_file, "r", encoding="utf-8") as f:
+                loaded = json.load(f)
+            assert loaded == test_data
+
+    def test_handles_unicode_content(self):
+        """Should correctly handle Unicode content."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            output_file = os.path.join(tmpdir, "test.json")
+            test_data = {
+                "emoji": "🚀",
+                "chinese": "你好",
+                "arabic": "مرحبا",
+                "cyrillic": "Привет"
+            }
+
+            result = github_backup.json_dump_if_changed(test_data, output_file)
+            assert result is True
+
+            # Verify Unicode is preserved
+            with codecs.open(output_file, "r", encoding="utf-8") as f:
+                loaded = json.load(f)
+            assert loaded == test_data
+
+            # Second write should skip
+            result2 = github_backup.json_dump_if_changed(test_data, output_file)
+            assert result2 is False
+
+    def test_handles_complex_nested_data(self):
+        """Should handle complex nested data structures."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            output_file = os.path.join(tmpdir, "test.json")
+            test_data = {
+                "users": [
+                    {"id": 1, "name": "Alice", "tags": ["admin", "user"]},
+                    {"id": 2, "name": "Bob", "tags": ["user"]}
+                ],
+                "metadata": {
+                    "version": "1.0",
+                    "nested": {"deep": {"value": 42}}
+                }
+            }
+
+            result = github_backup.json_dump_if_changed(test_data, output_file)
+            assert result is True
+
+            # Verify structure is preserved
+            with codecs.open(output_file, "r", encoding="utf-8") as f:
+                loaded = json.load(f)
+            assert loaded == test_data
+
+    def test_overwrites_on_unicode_decode_error(self):
+        """Should overwrite if existing file has invalid UTF-8."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            output_file = os.path.join(tmpdir, "test.json")
+            test_data = {"key": "value"}
+
+            # Write invalid UTF-8 bytes
+            with open(output_file, "wb") as f:
+                f.write(b'\xff\xfe invalid utf-8')
+
+            # Should catch UnicodeDecodeError and overwrite
+            result = github_backup.json_dump_if_changed(test_data, output_file)
+            assert result is True
+
+            # Verify new content was written
+            with codecs.open(output_file, "r", encoding="utf-8") as f:
+                loaded = json.load(f)
+            assert loaded == test_data
+
+    def test_key_order_independence(self):
+        """Should treat differently-ordered dicts as same if keys/values match."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            output_file = os.path.join(tmpdir, "test.json")
+
+            # Write first dict
+            data1 = {"z": 1, "a": 2, "m": 3}
+            github_backup.json_dump_if_changed(data1, output_file)
+
+            # Try to write same data but different order
+            data2 = {"a": 2, "m": 3, "z": 1}
+            result = github_backup.json_dump_if_changed(data2, output_file)
+
+            # Should skip because content is the same (keys are sorted)
+            assert result is False
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])
