Skip to content

Commit 34196c0

Browse files
Save PDFs and SVGs with content hash (#3024)
Related to https://linear.app/getsentry/issue/EME-550/ios-insights-duplicate-files — Write PDF/SVG raw data to disk during asset catalog parsing to enable duplicate detection of SVG and PDF files.
1 parent 03edd6d commit 34196c0

File tree

1 file changed

+28
-7
lines changed

1 file changed

+28
-7
lines changed

apple-catalog-parsing/native/swift/AssetCatalogParser/Sources/AssetCatalogParser/AssetCatalogReader.swift

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import CoreGraphics
2+
import CryptoKit
23
import Foundation
34
import ImageIO
45
import UniformTypeIdentifiers
@@ -40,6 +41,7 @@ struct AssetCatalogEntry: Encodable {
4041
let type: AssetType?
4142
let idiom: String?
4243
let colorspace: String?
44+
let contentHash: String?
4345
}
4446

4547
enum Error: Swift.Error {
@@ -112,7 +114,7 @@ enum AssetUtil {
112114

113115
let (structuredThemeStore, assetKeys) = initializeCatalog(from: file)
114116

115-
var images: [String: (cgImage: CGImage, format: String)] = [:]
117+
var cgImages: [String: (cgImage: CGImage, format: String)] = [:]
116118

117119
// First pass: Build map of multisize sets and cache renditions for performance
118120
var multisizeSets: [MultisizeSetInfo] = []
@@ -216,17 +218,23 @@ enum AssetUtil {
216218
var width: Int?
217219
var height: Int?
218220
var unslicedImage: CGImage?
221+
var contentHash: String? = nil
219222

220223
if isMultisizeImageSet {
221224
continue
222225
} else {
223226
// Get image dimensions from regular rendition
224227
(width, height, unslicedImage) = resolveImageDimensions(rendition, isVector)
225228

226-
// Skip SVGs, but save images even if they don't have an extension (default to png)
227-
if fileExtension != "svg", let unslicedImage = unslicedImage {
229+
// Compute content hash for PDFs/SVGs without saving to disk
230+
if fileExtension == "pdf" || fileExtension == "svg" {
231+
// Hash PDFs/SVGs in-memory (Python can't access _srcData without parsing binary .car format)
232+
contentHash = data.sha256Hash()
233+
}
234+
// Save images that can be converted to CGImage (excluding PDFs/SVGs)
235+
else if let unslicedImage = unslicedImage {
228236
let format = fileExtension.isEmpty ? "png" : fileExtension
229-
images[imageId] = (cgImage: unslicedImage, format: format)
237+
cgImages[imageId] = (cgImage: unslicedImage, format: format)
230238
}
231239
}
232240

@@ -251,7 +259,8 @@ enum AssetUtil {
251259
filename: renditionTypeName,
252260
type: assetType,
253261
idiom: idiomToString(idiomValue),
254-
colorspace: colorSpaceIDToString(colorSpaceID)
262+
colorspace: colorSpaceIDToString(colorSpaceID),
263+
contentHash: contentHash
255264
)
256265
assets.append(asset)
257266
}
@@ -266,7 +275,8 @@ enum AssetUtil {
266275
filename: nil,
267276
type: nil,
268277
idiom: nil,
269-
colorspace: nil
278+
colorspace: nil,
279+
contentHash: nil
270280
))
271281

272282
let data = try! JSONEncoder().encode(assets)
@@ -275,7 +285,7 @@ enum AssetUtil {
275285
.appendingPathComponent("Assets")
276286
.appendingPathExtension("json")
277287
try! data.write(to: url, options: [])
278-
for (id, imageInfo) in images {
288+
for (id, imageInfo) in cgImages {
279289
let format = imageInfo.format
280290
let cgImage = imageInfo.cgImage
281291
let fileURL = folder.appendingPathComponent(id).appendingPathExtension(format)
@@ -460,6 +470,17 @@ enum AssetUtil {
460470
}
461471
}
462472

473+
private extension Data {
    /// Returns the SHA-256 digest of these bytes as a lowercase hexadecimal string.
    ///
    /// Yields an empty string when CryptoKit's `SHA256` is unavailable
    /// (macOS earlier than 10.15).
    func sha256Hash() -> String {
        guard #available(macOS 10.15, *) else {
            // Fallback for older macOS (shouldn't happen with version 13+ requirement)
            return ""
        }

        // Render each digest byte as two lowercase hex characters, in order.
        var hex = ""
        for byte in SHA256.hash(data: self) {
            hex += String(format: "%02x", byte)
        }
        return hex
    }
}
483+
463484
private extension NSObject {
464485
func getUInt(forKey key: String) -> UInt? {
465486
if let result = perform(Selector(key)) {

0 commit comments

Comments
 (0)