Skip to content

Commit cfab9d0

Browse files
committed
Add skip_transfer_syntaxes parameter to HTJ2K transcoding
Introduces a skip_transfer_syntaxes parameter to transcode_dicom_to_htj2k() that allows skipping transcoding for files that are already in a desired format. Files with the specified transfer syntaxes are copied directly to the output directory, avoiding unnecessary re-encoding of already-compressed data.

Default skip list includes:
- HTJ2K transfer syntaxes (to avoid re-encoding)
- Lossy JPEG 2000 (1.2.840.10008.1.2.4.91)
- Lossy JPEG formats (1.2.840.10008.1.2.4.50, 1.2.840.10008.1.2.4.51)

Also simplifies the Basic Offset Table conditional logic and adds comprehensive unit tests covering skip behavior, statistics tracking, and edge cases.

Signed-off-by: Joaquin Anton Guirao <janton@nvidia.com>
1 parent 09c0e33 commit cfab9d0

File tree

2 files changed

+459
-12
lines changed

2 files changed

+459
-12
lines changed

monailabel/datastore/utils/convert_htj2k.py

Lines changed: 50 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -258,6 +258,16 @@ def transcode_dicom_to_htj2k(
258258
progression_order: str = "RPCL",
259259
max_batch_size: int = 256,
260260
add_basic_offset_table: bool = True,
261+
skip_transfer_syntaxes: list = (
262+
_get_transfer_syntax_constants()['HTJ2K'] |
263+
frozenset([
264+
# Lossy JPEG 2000
265+
"1.2.840.10008.1.2.4.91", # JPEG 2000 Image Compression (lossy allowed)
266+
# Lossy JPEG
267+
"1.2.840.10008.1.2.4.50", # JPEG Baseline (Process 1) - always lossy
268+
"1.2.840.10008.1.2.4.51", # JPEG Extended (Process 2 & 4, can be lossy)
269+
])
270+
),
261271
) -> str:
262272
"""
263273
Transcode DICOM files to HTJ2K (High Throughput JPEG 2000) lossless compression.
@@ -280,7 +290,7 @@ def transcode_dicom_to_htj2k(
280290
in memory simultaneously.
281291
282292
Supported source transfer syntaxes:
283-
- HTJ2K (High-Throughput JPEG 2000) - decoded and re-encoded to add BOT if needed
293+
- HTJ2K (High-Throughput JPEG 2000) - copied unchanged when listed in skip_transfer_syntaxes (the default); otherwise decoded and re-encoded (adding a BOT if requested)
284294
- JPEG 2000 (lossless and lossy)
285295
- JPEG (baseline, extended, lossless)
286296
- Uncompressed (Explicit/Implicit VR Little/Big Endian)
@@ -307,6 +317,10 @@ def transcode_dicom_to_htj2k(
307317
add_basic_offset_table: If True, creates Basic Offset Table for multi-frame DICOMs (default: True)
308318
BOT enables O(1) frame access without parsing entire pixel data stream
309319
Per DICOM Part 5 Section A.4. Only affects multi-frame files.
320+
skip_transfer_syntaxes: Optional collection of Transfer Syntax UIDs for which transcoding is skipped (default: HTJ2K, lossy JPEG 2000, and lossy JPEG)
321+
Files with these transfer syntaxes will be copied directly to output
322+
without transcoding. Useful for preserving already-compressed formats.
323+
Example: ["1.2.840.10008.1.2.4.201", "1.2.840.10008.1.2.4.202"]
310324
311325
Returns:
312326
str: Path to output directory containing transcoded DICOM files
@@ -337,6 +351,12 @@ def transcode_dicom_to_htj2k(
337351
... max_batch_size=5
338352
... )
339353
354+
>>> # Skip transcoding for files already in HTJ2K format
355+
>>> output_dir = transcode_dicom_to_htj2k(
356+
... input_dir="/path/to/dicoms",
357+
... skip_transfer_syntaxes=["1.2.840.10008.1.2.4.201", "1.2.840.10008.1.2.4.202"]
358+
... )
359+
340360
Note:
341361
Requires nvidia-nvimgcodec to be installed:
342362
pip install nvidia-nvimgcodec-cu{XX}[all]
@@ -396,8 +416,17 @@ def transcode_dicom_to_htj2k(
396416
ts_constants = _get_transfer_syntax_constants()
397417
NVIMGCODEC_SYNTAXES = ts_constants['NVIMGCODEC']
398418

419+
# Initialize skip list
420+
if skip_transfer_syntaxes is None:
421+
skip_transfer_syntaxes = []
422+
else:
423+
# Convert to set of strings for faster lookup
424+
skip_transfer_syntaxes = set(str(ts) for ts in skip_transfer_syntaxes)
425+
logger.info(f"Files with these transfer syntaxes will be copied without transcoding: {skip_transfer_syntaxes}")
426+
399427
start_time = time.time()
400428
transcoded_count = 0
429+
skipped_count = 0
401430

402431
# Calculate batch info for logging
403432
total_files = len(valid_dicom_files)
@@ -411,20 +440,37 @@ def transcode_dicom_to_htj2k(
411440
batch_datasets = [pydicom.dcmread(file) for file in batch_files]
412441
nvimgcodec_batch = []
413442
pydicom_batch = []
443+
skip_batch = [] # Indices of files to skip (copy directly)
414444

415445
for idx, ds in enumerate(batch_datasets):
416446
current_ts = getattr(ds, 'file_meta', {}).get('TransferSyntaxUID', None)
417447
if current_ts is None:
418448
raise ValueError(f"DICOM file {os.path.basename(batch_files[idx])} does not have a Transfer Syntax UID")
419449

420450
ts_str = str(current_ts)
451+
452+
# Check if this transfer syntax should be skipped
453+
if ts_str in skip_transfer_syntaxes:
454+
skip_batch.append(idx)
455+
logger.info(f" Skipping {os.path.basename(batch_files[idx])} (Transfer Syntax: {ts_str})")
456+
continue
457+
421458
if ts_str in NVIMGCODEC_SYNTAXES:
422459
if not hasattr(ds, "PixelData") or ds.PixelData is None:
423460
raise ValueError(f"DICOM file {os.path.basename(batch_files[idx])} does not have a PixelData member")
424461
nvimgcodec_batch.append(idx)
425462
else:
426463
pydicom_batch.append(idx)
427464

465+
# Handle skip_batch: copy files directly to output
466+
if skip_batch:
467+
for idx in skip_batch:
468+
source_file = batch_files[idx]
469+
output_file = os.path.join(output_dir, os.path.basename(source_file))
470+
shutil.copy2(source_file, output_file)
471+
skipped_count += 1
472+
logger.info(f" Copied {os.path.basename(source_file)} to output (skipped transcoding)")
473+
428474
num_frames = []
429475
encoded_data = []
430476

@@ -545,12 +591,7 @@ def transcode_dicom_to_htj2k(
545591

546592
# Update dataset with HTJ2K encoded data
547593
# Create Basic Offset Table for multi-frame files if requested
548-
if add_basic_offset_table and nframes > 1:
549-
batch_datasets[dataset_idx].PixelData = pydicom.encaps.encapsulate(encoded_frames, has_bot=True)
550-
logger.info(f" ✓ Basic Offset Table included for efficient frame access")
551-
else:
552-
batch_datasets[dataset_idx].PixelData = pydicom.encaps.encapsulate(encoded_frames)
553-
594+
batch_datasets[dataset_idx].PixelData = pydicom.encaps.encapsulate(encoded_frames, has_bot=add_basic_offset_table)
554595
batch_datasets[dataset_idx].file_meta.TransferSyntaxUID = pydicom.uid.UID(target_transfer_syntax)
555596

556597
# Update PhotometricInterpretation to RGB for YBR images since we decoded with RGB color_spec
@@ -572,6 +613,7 @@ def transcode_dicom_to_htj2k(
572613
logger.info(f"Transcoding complete:")
573614
logger.info(f" Total files: {len(valid_dicom_files)}")
574615
logger.info(f" Successfully transcoded: {transcoded_count}")
616+
logger.info(f" Skipped (copied without transcoding): {skipped_count}")
575617
logger.info(f" Time elapsed: {elapsed_time:.2f} seconds")
576618
logger.info(f" Output directory: {output_dir}")
577619

@@ -910,11 +952,7 @@ def convert_single_frame_dicom_series_to_multiframe(
910952
if encoded_frames_bytes is not None:
911953
# Encapsulated data (HTJ2K or preserved compressed format)
912954
# Use Basic Offset Table for multi-frame efficiency
913-
if add_basic_offset_table:
914-
output_ds.PixelData = pydicom.encaps.encapsulate(encoded_frames_bytes, has_bot=True)
915-
logger.info(f" ✓ Basic Offset Table included for efficient frame access")
916-
else:
917-
output_ds.PixelData = pydicom.encaps.encapsulate(encoded_frames_bytes)
955+
output_ds.PixelData = pydicom.encaps.encapsulate(encoded_frames_bytes, has_bot=add_basic_offset_table)
918956
else:
919957
# Uncompressed mode: combine all frames into a 3D array
920958
# Stack frames: (frames, rows, cols)

0 commit comments

Comments
 (0)