github_backup/github_backup.py: 85 additions & 11 deletions
@@ -1898,11 +1898,21 @@ def backup_milestones(args, repo_cwd, repository, repos_template):
     for milestone in _milestones:
         milestones[milestone["number"]] = milestone

-    logger.info("Saving {0} milestones to disk".format(len(list(milestones.keys()))))
+    written_count = 0
     for number, milestone in list(milestones.items()):
         milestone_file = "{0}/{1}.json".format(milestone_cwd, number)
-        with codecs.open(milestone_file, "w", encoding="utf-8") as f:
-            json_dump(milestone, f)
+        if json_dump_if_changed(milestone, milestone_file):
+            written_count += 1
+
+    total = len(milestones)
+    if written_count == total:
+        logger.info("Saved {0} milestones to disk".format(total))
+    elif written_count == 0:
+        logger.info("{0} milestones unchanged, skipped write".format(total))
+    else:
+        logger.info("Saved {0} of {1} milestones to disk ({2} unchanged)".format(
+            written_count, total, total - written_count
+        ))


 def backup_labels(args, repo_cwd, repository, repos_template):
@@ -1955,19 +1965,17 @@ def backup_releases(args, repo_cwd, repository, repos_template, include_assets=F
             reverse=True,
         )
         releases = releases[: args.number_of_latest_releases]
-        logger.info("Saving the latest {0} releases to disk".format(len(releases)))
-    else:
-        logger.info("Saving {0} releases to disk".format(len(releases)))

     # for each release, store it
+    written_count = 0
     for release in releases:
         release_name = release["tag_name"]
         release_name_safe = release_name.replace("/", "__")
         output_filepath = os.path.join(
             release_cwd, "{0}.json".format(release_name_safe)
         )
-        with codecs.open(output_filepath, "w+", encoding="utf-8") as f:
-            json_dump(release, f)
+        if json_dump_if_changed(release, output_filepath):
+            written_count += 1

         if include_assets:
             assets = retrieve_data(args, release["assets_url"])
@@ -1984,6 +1992,17 @@
                     fine=True if args.token_fine is not None else False,
                 )

+    # Log the results
+    total = len(releases)
+    if written_count == total:
+        logger.info("Saved {0} releases to disk".format(total))
+    elif written_count == 0:
+        logger.info("{0} releases unchanged, skipped write".format(total))
+    else:
+        logger.info("Saved {0} of {1} releases to disk ({2} unchanged)".format(
+            written_count, total, total - written_count
+        ))
+

 def fetch_repository(
     name,
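
Review note: the same three-branch summary now appears in both backup_milestones and backup_releases. A follow-up could factor it into a shared helper; a minimal sketch (the name log_write_summary is hypothetical, not part of this PR):

def log_write_summary(kind, written_count, total):
    # One log line summarizing how many JSON files were actually rewritten.
    if written_count == total:
        logger.info("Saved {0} {1} to disk".format(total, kind))
    elif written_count == 0:
        logger.info("{0} {1} unchanged, skipped write".format(total, kind))
    else:
        logger.info("Saved {0} of {1} {2} to disk ({3} unchanged)".format(
            written_count, total, kind, total - written_count
        ))

Each backup function would then end with a single call such as log_write_summary("milestones", written_count, len(milestones)).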
@@ -2108,9 +2127,10 @@ def _backup_data(args, name, template, output_file, output_directory):
     mkdir_p(output_directory)
     data = retrieve_data(args, template)

-    logger.info("Writing {0} {1} to disk".format(len(data), name))
-    with codecs.open(output_file, "w", encoding="utf-8") as f:
-        json_dump(data, f)
+    if json_dump_if_changed(data, output_file):
+        logger.info("Saved {0} {1} to disk".format(len(data), name))
+    else:
+        logger.info("{0} {1} unchanged, skipped write".format(len(data), name))


 def json_dump(data, output_file):
@@ -2122,3 +2142,57 @@ def json_dump(data, output_file):
         indent=4,
         separators=(",", ": "),
     )
+
+
+def json_dump_if_changed(data, output_file_path):
+    """
+    Write JSON data to file only if content has changed.
+
+    Compares the serialized JSON data with the existing file content
+    and only writes if different. This prevents unnecessary file
+    modification timestamp updates and disk writes.
+
+    Uses atomic writes (temp file + rename) to prevent corruption
+    if the process is interrupted during the write.
+
+    Args:
+        data: The data to serialize as JSON
+        output_file_path: The path to the output file
+
+    Returns:
+        True if file was written (content changed or new file)
+        False if write was skipped (content unchanged)
+    """
+    # Serialize new data with consistent formatting matching json_dump()
+    new_content = json.dumps(
+        data,
+        ensure_ascii=False,
+        sort_keys=True,
+        indent=4,
+        separators=(",", ": "),
+    )
+
+    # Check if file exists and compare content
+    if os.path.exists(output_file_path):
+        try:
+            with codecs.open(output_file_path, "r", encoding="utf-8") as f:
+                existing_content = f.read()
+            if existing_content == new_content:
+                logger.debug(
+                    "Content unchanged, skipping write: {0}".format(output_file_path)
+                )
+                return False
+        except (OSError, UnicodeDecodeError) as e:
+            # If we can't read the existing file, write the new one
+            logger.debug(
+                "Error reading existing file {0}, will overwrite: {1}".format(
+                    output_file_path, e
+                )
+            )
+
+    # Write the file atomically using temp file + rename
+    temp_file = output_file_path + ".temp"
+    with codecs.open(temp_file, "w", encoding="utf-8") as f:
+        f.write(new_content)
+    os.rename(temp_file, output_file_path)  # Atomic on POSIX systems
+    return True
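
Review note: os.rename is atomic on POSIX but raises FileExistsError on Windows when the destination already exists; os.replace (available since Python 3.3) atomically overwrites on both platforms. A sketch of the portable variant, assuming the same temp-file convention as above:

    temp_file = output_file_path + ".temp"
    with codecs.open(temp_file, "w", encoding="utf-8") as f:
        f.write(new_content)
    os.replace(temp_file, output_file_path)  # atomic overwrite on POSIX and Windows
    return True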
tests/test_json_dump_if_changed.py: 198 additions & 0 deletions (new file)
@@ -0,0 +1,198 @@
"""Tests for json_dump_if_changed functionality."""

import codecs
import json
import os
import tempfile

import pytest

from github_backup import github_backup


class TestJsonDumpIfChanged:
"""Test suite for json_dump_if_changed function."""

def test_writes_new_file(self):
"""Should write file when it doesn't exist."""
with tempfile.TemporaryDirectory() as tmpdir:
output_file = os.path.join(tmpdir, "test.json")
test_data = {"key": "value", "number": 42}

result = github_backup.json_dump_if_changed(test_data, output_file)

assert result is True
assert os.path.exists(output_file)

# Verify content matches expected format
with codecs.open(output_file, "r", encoding="utf-8") as f:
content = f.read()
loaded = json.loads(content)
assert loaded == test_data

def test_skips_unchanged_file(self):
"""Should skip write when content is identical."""
with tempfile.TemporaryDirectory() as tmpdir:
output_file = os.path.join(tmpdir, "test.json")
test_data = {"key": "value", "number": 42}

# First write
result1 = github_backup.json_dump_if_changed(test_data, output_file)
assert result1 is True

# Get the initial mtime
mtime1 = os.path.getmtime(output_file)

# Second write with same data
result2 = github_backup.json_dump_if_changed(test_data, output_file)
assert result2 is False

# File should not have been modified
mtime2 = os.path.getmtime(output_file)
assert mtime1 == mtime2

def test_writes_when_content_changed(self):
"""Should write file when content has changed."""
with tempfile.TemporaryDirectory() as tmpdir:
output_file = os.path.join(tmpdir, "test.json")
test_data1 = {"key": "value1"}
test_data2 = {"key": "value2"}

# First write
result1 = github_backup.json_dump_if_changed(test_data1, output_file)
assert result1 is True

# Second write with different data
result2 = github_backup.json_dump_if_changed(test_data2, output_file)
assert result2 is True

# Verify new content
with codecs.open(output_file, "r", encoding="utf-8") as f:
loaded = json.load(f)
assert loaded == test_data2

def test_uses_consistent_formatting(self):
"""Should use same JSON formatting as json_dump."""
with tempfile.TemporaryDirectory() as tmpdir:
output_file = os.path.join(tmpdir, "test.json")
test_data = {"z": "last", "a": "first", "m": "middle"}

github_backup.json_dump_if_changed(test_data, output_file)

with codecs.open(output_file, "r", encoding="utf-8") as f:
content = f.read()

# Check for consistent formatting:
# - sorted keys
# - 4-space indent
# - comma-colon-space separator
expected = json.dumps(
test_data,
ensure_ascii=False,
sort_keys=True,
indent=4,
separators=(",", ": "),
)
assert content == expected

def test_atomic_write_always_used(self):
"""Should always use temp file and rename for atomic writes."""
with tempfile.TemporaryDirectory() as tmpdir:
output_file = os.path.join(tmpdir, "test.json")
test_data = {"key": "value"}

result = github_backup.json_dump_if_changed(test_data, output_file)

assert result is True
assert os.path.exists(output_file)

# Temp file should not exist after atomic write
temp_file = output_file + ".temp"
assert not os.path.exists(temp_file)

# Verify content
with codecs.open(output_file, "r", encoding="utf-8") as f:
loaded = json.load(f)
assert loaded == test_data

def test_handles_unicode_content(self):
"""Should correctly handle Unicode content."""
with tempfile.TemporaryDirectory() as tmpdir:
output_file = os.path.join(tmpdir, "test.json")
test_data = {
"emoji": "🚀",
"chinese": "你好",
"arabic": "مرحبا",
"cyrillic": "Привет",
}

result = github_backup.json_dump_if_changed(test_data, output_file)
assert result is True

# Verify Unicode is preserved
with codecs.open(output_file, "r", encoding="utf-8") as f:
loaded = json.load(f)
assert loaded == test_data

# Second write should skip
result2 = github_backup.json_dump_if_changed(test_data, output_file)
assert result2 is False

def test_handles_complex_nested_data(self):
"""Should handle complex nested data structures."""
with tempfile.TemporaryDirectory() as tmpdir:
output_file = os.path.join(tmpdir, "test.json")
test_data = {
"users": [
{"id": 1, "name": "Alice", "tags": ["admin", "user"]},
{"id": 2, "name": "Bob", "tags": ["user"]},
],
"metadata": {"version": "1.0", "nested": {"deep": {"value": 42}}},
}

result = github_backup.json_dump_if_changed(test_data, output_file)
assert result is True

# Verify structure is preserved
with codecs.open(output_file, "r", encoding="utf-8") as f:
loaded = json.load(f)
assert loaded == test_data

def test_overwrites_on_unicode_decode_error(self):
"""Should overwrite if existing file has invalid UTF-8."""
with tempfile.TemporaryDirectory() as tmpdir:
output_file = os.path.join(tmpdir, "test.json")
test_data = {"key": "value"}

# Write invalid UTF-8 bytes
with open(output_file, "wb") as f:
f.write(b"\xff\xfe invalid utf-8")

# Should catch UnicodeDecodeError and overwrite
result = github_backup.json_dump_if_changed(test_data, output_file)
assert result is True

# Verify new content was written
with codecs.open(output_file, "r", encoding="utf-8") as f:
loaded = json.load(f)
assert loaded == test_data

def test_key_order_independence(self):
"""Should treat differently-ordered dicts as same if keys/values match."""
with tempfile.TemporaryDirectory() as tmpdir:
output_file = os.path.join(tmpdir, "test.json")

# Write first dict
data1 = {"z": 1, "a": 2, "m": 3}
github_backup.json_dump_if_changed(data1, output_file)

# Try to write same data but different order
data2 = {"a": 2, "m": 3, "z": 1}
result = github_backup.json_dump_if_changed(data2, output_file)

# Should skip because content is the same (keys are sorted)
assert result is False


if __name__ == "__main__":
pytest.main([__file__, "-v"])
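
Usage note: the __main__ guard lets the file run directly; assuming a standard checkout layout, the suite can also be invoked through pytest:

python -m pytest tests/test_json_dump_if_changed.py -v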