Skip to content

Commit e00d77b

Browse files
committed
Skip datasets without pixel data
Signed-off-by: Joaquin Anton Guirao <janton@nvidia.com>
1 parent 2bd3b9e commit e00d77b

File tree

2 files changed

+208
-36
lines changed

2 files changed

+208
-36
lines changed

monailabel/datastore/utils/convert_htj2k.py

Lines changed: 63 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -507,7 +507,9 @@ def transcode_dicom_to_htj2k(
507507
has_pixel_data = hasattr(ds, "PixelData") and ds.PixelData is not None
508508
if ts_str in skip_transfer_syntaxes or not has_pixel_data:
509509
skip_batch.append(idx)
510-
logger.info(f" Skipping {os.path.basename(batch_in[idx])} (Transfer Syntax: {ts_str}, has_pixel_data: {has_pixel_data})")
510+
logger.info(
511+
f" Skipping {os.path.basename(batch_in[idx])} (Transfer Syntax: {ts_str}, has_pixel_data: {has_pixel_data})"
512+
)
511513
continue
512514

513515
assert has_pixel_data, f"DICOM file {os.path.basename(batch_in[idx])} does not have a PixelData member"
@@ -918,6 +920,22 @@ def convert_single_frame_dicom_series_to_multiframe(
918920
file_paths = [fp for _, fp in file_list]
919921
datasets = [pydicom.dcmread(fp) for fp in file_paths]
920922

923+
# Filter out datasets without PixelData (e.g., DICOM SR, Presentation States, corrupted files)
924+
datasets_with_pixels = []
925+
for idx, ds in enumerate(datasets):
926+
if hasattr(ds, "PixelData") and ds.PixelData is not None:
927+
datasets_with_pixels.append(ds)
928+
else:
929+
logger.warning(f" Skipping file {file_paths[idx]} (no PixelData found)")
930+
931+
if not datasets_with_pixels:
932+
logger.error(f" Series {series_uid}: No valid datasets with PixelData found, skipping series")
933+
continue
934+
935+
# Replace datasets with filtered list
936+
datasets = datasets_with_pixels
937+
logger.info(f" Loaded {len(datasets)} valid datasets with PixelData")
938+
921939
# CRITICAL: Sort datasets by ImagePositionPatient Z-coordinate
922940
# This ensures Frame[0] is the first slice, Frame[N] is the last slice
923941
if all(hasattr(ds, "ImagePositionPatient") for ds in datasets):
@@ -946,9 +964,11 @@ def convert_single_frame_dicom_series_to_multiframe(
946964
logger.info(f" Using original transfer syntax: {target_transfer_syntax}")
947965

948966
# Check if we're dealing with encapsulated (compressed) data
967+
has_pixel_data = hasattr(template_ds, "PixelData") and template_ds.PixelData is not None
968+
# At this point we have filtered out datasets without PixelData, so this should never happen
969+
assert has_pixel_data, f"Template dataset {file_paths[0]} does not have a PixelData member"
949970
is_encapsulated = (
950-
hasattr(template_ds, "PixelData")
951-
and template_ds.file_meta.TransferSyntaxUID != pydicom.uid.ExplicitVRLittleEndian
971+
has_pixel_data and template_ds.file_meta.TransferSyntaxUID != pydicom.uid.ExplicitVRLittleEndian
952972
)
953973

954974
# Determine color_spec for this series based on PhotometricInterpretation
@@ -994,21 +1014,22 @@ def convert_single_frame_dicom_series_to_multiframe(
9941014
if first_ts in NVIMGCODEC_SYNTAXES or pydicom.encaps.encapsulate_extended:
9951015
# Encapsulated data - extract compressed frames
9961016
for ds in datasets:
997-
if hasattr(ds, "PixelData"):
998-
try:
999-
# Extract compressed frames
1000-
frames = [fragment for fragment in pydicom.encaps.generate_frames(ds.PixelData)]
1001-
all_frames.extend(frames)
1002-
except:
1003-
# Fall back to pixel_array for uncompressed
1004-
pixel_array = ds.pixel_array
1005-
if not isinstance(pixel_array, np.ndarray):
1006-
pixel_array = np.array(pixel_array)
1007-
if pixel_array.ndim == 2:
1008-
all_frames.append(pixel_array)
1009-
elif pixel_array.ndim == 3:
1010-
for frame_idx in range(pixel_array.shape[0]):
1011-
all_frames.append(pixel_array[frame_idx, :, :])
1017+
has_pixel_data = hasattr(ds, "PixelData") and ds.PixelData is not None
1018+
assert has_pixel_data, f"Dataset {file_paths[idx]} does not have a PixelData member"
1019+
try:
1020+
# Extract compressed frames
1021+
frames = [fragment for fragment in pydicom.encaps.generate_frames(ds.PixelData)]
1022+
all_frames.extend(frames)
1023+
except:
1024+
# Fall back to pixel_array for uncompressed
1025+
pixel_array = ds.pixel_array
1026+
if not isinstance(pixel_array, np.ndarray):
1027+
pixel_array = np.array(pixel_array)
1028+
if pixel_array.ndim == 2:
1029+
all_frames.append(pixel_array)
1030+
elif pixel_array.ndim == 3:
1031+
for frame_idx in range(pixel_array.shape[0]):
1032+
all_frames.append(pixel_array[frame_idx, :, :])
10121033
else:
10131034
# Uncompressed data - use pixel arrays
10141035
for ds in datasets:
@@ -1049,9 +1070,13 @@ def convert_single_frame_dicom_series_to_multiframe(
10491070
# Save ImageOrientationPatient and ImagePositionPatient BEFORE creating output_ds
10501071
# The shallow copy + delattr will affect the original datasets objects
10511072
# Save these values now so we can use them in functional groups later
1052-
original_image_orientation = datasets[0].ImageOrientationPatient if hasattr(datasets[0], "ImageOrientationPatient") else None
1053-
original_image_positions = [ds.ImagePositionPatient if hasattr(ds, "ImagePositionPatient") else None for ds in datasets]
1054-
1073+
original_image_orientation = (
1074+
datasets[0].ImageOrientationPatient if hasattr(datasets[0], "ImageOrientationPatient") else None
1075+
)
1076+
original_image_positions = [
1077+
ds.ImagePositionPatient if hasattr(ds, "ImagePositionPatient") else None for ds in datasets
1078+
]
1079+
10551080
# Create SIMPLE multi-frame DICOM file (like the user's example)
10561081
# Use first dataset as template, keeping its metadata
10571082
logger.info(f" Creating simple multi-frame DICOM from {total_frame_count} frames...")
@@ -1108,19 +1133,28 @@ def convert_single_frame_dicom_series_to_multiframe(
11081133
# CRITICAL: Remove top-level ImagePositionPatient and ImageOrientationPatient
11091134
# Working files (that display correctly in OHIF MPR) have NEITHER at top level
11101135
# These should ONLY exist in functional groups for Enhanced CT
1111-
1136+
11121137
if hasattr(output_ds, "ImagePositionPatient"):
11131138
delattr(output_ds, "ImagePositionPatient")
11141139
logger.info(f" ✓ Removed top-level ImagePositionPatient (use per-frame only)")
1115-
1140+
11161141
if hasattr(output_ds, "ImageOrientationPatient"):
11171142
delattr(output_ds, "ImageOrientationPatient")
11181143
logger.info(f" ✓ Removed top-level ImageOrientationPatient (use SharedFunctionalGroupsSequence only)")
11191144
# Set correct SOPClassUID for multi-frame (Enhanced/Multiframe) conversion
11201145
sopclass_map = {
1121-
"1.2.840.10008.5.1.4.1.1.2": ("1.2.840.10008.5.1.4.1.1.2.1", "Enhanced CT Image Storage"), # CT -> Enhanced CT
1122-
"1.2.840.10008.5.1.4.1.1.4": ("1.2.840.10008.5.1.4.1.1.4.1", "Enhanced MR Image Storage"), # MR -> Enhanced MR
1123-
"1.2.840.10008.5.1.4.1.1.6.1": ("1.2.840.10008.5.1.4.1.1.3.1", "Ultrasound Multi-frame Image Storage"), # US -> Ultrasound Multi-frame
1146+
"1.2.840.10008.5.1.4.1.1.2": (
1147+
"1.2.840.10008.5.1.4.1.1.2.1",
1148+
"Enhanced CT Image Storage",
1149+
), # CT -> Enhanced CT
1150+
"1.2.840.10008.5.1.4.1.1.4": (
1151+
"1.2.840.10008.5.1.4.1.1.4.1",
1152+
"Enhanced MR Image Storage",
1153+
), # MR -> Enhanced MR
1154+
"1.2.840.10008.5.1.4.1.1.6.1": (
1155+
"1.2.840.10008.5.1.4.1.1.3.1",
1156+
"Ultrasound Multi-frame Image Storage",
1157+
), # US -> Ultrasound Multi-frame
11241158
}
11251159

11261160
original_sopclass = getattr(datasets[0], "SOPClassUID", None)
@@ -1186,7 +1220,9 @@ def convert_single_frame_dicom_series_to_multiframe(
11861220
else:
11871221
# If missing, use default (0,0,frame_idx * spacing)
11881222
# This shouldn't happen for valid CT series, but ensures MPR compatibility
1189-
default_spacing = float(output_ds.SpacingBetweenSlices) if hasattr(output_ds, 'SpacingBetweenSlices') else 1.0
1223+
default_spacing = (
1224+
float(output_ds.SpacingBetweenSlices) if hasattr(output_ds, "SpacingBetweenSlices") else 1.0
1225+
)
11901226
plane_pos_item.ImagePositionPatient = [0.0, 0.0, frame_idx * default_spacing]
11911227
logger.warning(f" Frame {frame_idx} missing ImagePositionPatient, using default")
11921228
frame_item.PlanePositionSequence = Sequence([plane_pos_item])

tests/unit/datastore/test_convert_htj2k.py

Lines changed: 145 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
# limitations under the License.
1111

1212
import os
13+
import shutil
1314
import tempfile
1415
import unittest
1516
from pathlib import Path
@@ -786,7 +787,9 @@ def test_transcode_dicom_to_htj2k_multiframe_metadata(self):
786787
first_original = original_datasets[0]
787788

788789
# Check ImagePositionPatient is NOT there at top level DICOM file
789-
self.assertFalse(hasattr(ds_multiframe, "ImagePositionPatient"), "Should not have ImagePositionPatient at top level")
790+
self.assertFalse(
791+
hasattr(ds_multiframe, "ImagePositionPatient"), "Should not have ImagePositionPatient at top level"
792+
)
790793

791794
# Check PixelSpacing
792795
self.assertTrue(hasattr(ds_multiframe, "PixelSpacing"), "Should have PixelSpacing")
@@ -812,31 +815,31 @@ def test_transcode_dicom_to_htj2k_multiframe_metadata(self):
812815
# Check SOPClassUID conversion to Enhanced/Multi-frame
813816
self.assertTrue(hasattr(ds_multiframe, "SOPClassUID"), "Should have SOPClassUID")
814817
self.assertTrue(hasattr(first_original, "SOPClassUID"), "Original should have SOPClassUID")
815-
818+
816819
# Map of single-frame to enhanced/multi-frame SOPClassUIDs
817820
sopclass_map = {
818-
"1.2.840.10008.5.1.4.1.1.2": "1.2.840.10008.5.1.4.1.1.2.1", # CT -> Enhanced CT
819-
"1.2.840.10008.5.1.4.1.1.4": "1.2.840.10008.5.1.4.1.1.4.1", # MR -> Enhanced MR
820-
"1.2.840.10008.5.1.4.1.1.6.1": "1.2.840.10008.5.1.4.1.1.3.1", # US -> Ultrasound Multi-frame
821+
"1.2.840.10008.5.1.4.1.1.2": "1.2.840.10008.5.1.4.1.1.2.1", # CT -> Enhanced CT
822+
"1.2.840.10008.5.1.4.1.1.4": "1.2.840.10008.5.1.4.1.1.4.1", # MR -> Enhanced MR
823+
"1.2.840.10008.5.1.4.1.1.6.1": "1.2.840.10008.5.1.4.1.1.3.1", # US -> Ultrasound Multi-frame
821824
}
822-
825+
823826
original_sopclass = str(first_original.SOPClassUID)
824827
multiframe_sopclass = str(ds_multiframe.SOPClassUID)
825-
828+
826829
if original_sopclass in sopclass_map:
827830
expected_sopclass = sopclass_map[original_sopclass]
828831
self.assertEqual(
829832
multiframe_sopclass,
830833
expected_sopclass,
831-
f"SOPClassUID should be converted from {original_sopclass} to {expected_sopclass}"
834+
f"SOPClassUID should be converted from {original_sopclass} to {expected_sopclass}",
832835
)
833836
print(f"✓ SOPClassUID converted: {original_sopclass} -> {multiframe_sopclass}")
834837
else:
835838
# If not in map, should remain unchanged
836839
self.assertEqual(
837840
multiframe_sopclass,
838841
original_sopclass,
839-
"SOPClassUID should remain unchanged if not in conversion map"
842+
"SOPClassUID should remain unchanged if not in conversion map",
840843
)
841844
print(f"✓ SOPClassUID unchanged: {multiframe_sopclass}")
842845

@@ -1974,6 +1977,139 @@ def collate_paths(batch):
19741977
shutil.rmtree(input_dir, ignore_errors=True)
19751978
shutil.rmtree(output_dir, ignore_errors=True)
19761979

1980+
def test_convert_multiframe_handles_missing_pixeldata(self):
    """Check that multiframe conversion ignores datasets that lack PixelData.

    Writes a synthetic CT series into a temporary input directory: three
    slices carrying 64x64 16-bit pixel data, plus two datasets that share
    the same Study/Series UIDs but have no PixelData element. The directory
    is passed to ``convert_single_frame_dicom_series_to_multiframe`` with
    ``convert_to_htj2k=True`` and the test asserts that:

    * exactly one ``.dcm`` file is found under the returned directory,
    * its ``NumberOfFrames`` equals 3,
    * its ``PerFrameFunctionalGroupsSequence`` has 3 items.

    The test is skipped when ``HAS_NVIMGCODEC`` is false. Both temporary
    directories are removed in the ``finally`` clause.
    """
    if not HAS_NVIMGCODEC:
        self.skipTest(
            "nvimgcodec not available. Install nvidia-nvimgcodec-cu{XX} matching your CUDA version (e.g., nvidia-nvimgcodec-cu13 for CUDA 13.x)"
        )

    # Create temporary directory with mixed DICOM files
    input_dir = tempfile.mkdtemp(prefix="test_missing_pixeldata_")
    output_dir = tempfile.mkdtemp(prefix="test_missing_pixeldata_output_")

    try:
        # Every file written below belongs to the same study and series
        study_uid = pydicom.uid.generate_uid()
        series_uid = pydicom.uid.generate_uid()

        def write_dataset(filename, instance_number, slice_index, with_pixel_data):
            """Write one DICOM Part 10 file into ``input_dir`` and return its path.

            ``slice_index`` determines the Z component of ImagePositionPatient
            (``slice_index * 2.5``). When ``with_pixel_data`` is false, the image
            module attributes and the PixelData element are omitted.
            """
            ds = pydicom.Dataset()
            ds.StudyInstanceUID = study_uid
            ds.SeriesInstanceUID = series_uid
            ds.SOPInstanceUID = pydicom.uid.generate_uid()
            ds.SOPClassUID = "1.2.840.10008.5.1.4.1.1.2"  # CT Image Storage
            ds.InstanceNumber = instance_number
            ds.Modality = "CT"
            ds.PatientName = "Test^Patient"
            ds.PatientID = "12345"

            # Spatial metadata is present on every file, with or without pixels
            ds.ImagePositionPatient = [0.0, 0.0, float(slice_index * 2.5)]
            ds.ImageOrientationPatient = [1.0, 0.0, 0.0, 0.0, 1.0, 0.0]

            if with_pixel_data:
                ds.PixelSpacing = [0.5, 0.5]
                ds.SliceThickness = 2.5

                # Image description
                ds.Rows = 64
                ds.Columns = 64
                ds.SamplesPerPixel = 1
                ds.PhotometricInterpretation = "MONOCHROME2"
                ds.BitsAllocated = 16
                ds.BitsStored = 16
                ds.HighBit = 15
                ds.PixelRepresentation = 0

                # Uncompressed random pixel data
                pixel_array = np.random.randint(0, 1000, (64, 64), dtype=np.uint16)
                ds.PixelData = pixel_array.tobytes()

            # File meta information for the saved file
            ds.file_meta = pydicom.dataset.FileMetaDataset()
            ds.file_meta.FileMetaInformationVersion = b"\x00\x01"
            ds.file_meta.TransferSyntaxUID = pydicom.uid.ExplicitVRLittleEndian
            ds.file_meta.MediaStorageSOPClassUID = ds.SOPClassUID
            ds.file_meta.MediaStorageSOPInstanceUID = ds.SOPInstanceUID
            ds.file_meta.ImplementationClassUID = pydicom.uid.PYDICOM_IMPLEMENTATION_UID

            filepath = os.path.join(input_dir, filename)
            # Use save_as which properly writes DICOM Part 10 format with preamble
            ds.save_as(filepath, enforce_file_format=True)
            return filepath

        print("\nCreating test series with mixed PixelData presence...")

        # Create 3 valid DICOM files with PixelData
        valid_files = []
        for i in range(3):
            filepath = write_dataset(
                f"valid_{i:03d}.dcm",
                instance_number=i + 1,
                slice_index=i,
                with_pixel_data=True,
            )
            valid_files.append(filepath)
            print(f" Created valid file: {os.path.basename(filepath)}")

        # Create 2 DICOM files WITHOUT PixelData (like SR or metadata-only)
        files_without_pixels = []
        for i in range(2):
            filepath = write_dataset(
                f"no_pixel_{i:03d}.dcm",
                instance_number=i + 10,
                slice_index=i + 10,
                with_pixel_data=False,
            )
            files_without_pixels.append(filepath)
            print(f" Created file without PixelData: {os.path.basename(filepath)}")

        print(
            f"✓ Created {len(valid_files)} valid files and "
            f"{len(files_without_pixels)} files without PixelData"
        )

        # Convert to multiframe - should skip files without PixelData
        result_dir = convert_single_frame_dicom_series_to_multiframe(
            input_dir=input_dir,
            output_dir=output_dir,
            convert_to_htj2k=True,
        )

        # Verify multiframe file was created
        multiframe_files = list(Path(result_dir).rglob("*.dcm"))
        self.assertEqual(len(multiframe_files), 1, "Should create one multiframe file")
        print(f"✓ Created multiframe file: {multiframe_files[0]}")

        # Load and verify the multiframe file
        ds_multiframe = pydicom.dcmread(str(multiframe_files[0]))

        # Should have 3 frames (only the valid files)
        self.assertTrue(hasattr(ds_multiframe, "NumberOfFrames"), "Should have NumberOfFrames")
        num_frames = int(ds_multiframe.NumberOfFrames)
        self.assertEqual(num_frames, 3, "Should have 3 frames (files without PixelData excluded)")
        print(f"✓ NumberOfFrames: {num_frames} (correctly excluded files without PixelData)")

        # Verify PerFrameFunctionalGroupsSequence has correct number of items
        self.assertTrue(
            hasattr(ds_multiframe, "PerFrameFunctionalGroupsSequence"),
            "Should have PerFrameFunctionalGroupsSequence",
        )
        per_frame_seq = ds_multiframe.PerFrameFunctionalGroupsSequence
        self.assertEqual(len(per_frame_seq), 3, "Should have 3 per-frame items")
        print(f"✓ PerFrameFunctionalGroupsSequence has {len(per_frame_seq)} items")

        print("✓ Test passed: Files without PixelData were correctly skipped")

    finally:
        # Clean up
        shutil.rmtree(input_dir, ignore_errors=True)
        shutil.rmtree(output_dir, ignore_errors=True)
2112+
19772113

19782114
if __name__ == "__main__":
19792115
unittest.main()

0 commit comments

Comments
 (0)