Add comprehensive multi-frame HTJ2K DICOM testing and improve segmentation validation

jantonguirao · jantonguirao · commit 0a3fd792cae9 · 2025-10-28T11:25:43.000+01:00
This commit adds extensive test coverage for multi-frame HTJ2K DICOM handling
and improves segmentation output validation across different DICOM formats.

Test Improvements - test_dicom_segmentation.py:
- Add _load_segmentation_array() helper for consistent segmentation loading
- Add _compare_segmentations() helper using Dice coefficient and pixel accuracy
- Refactor test_04 to test_04_compare_all_formats for comprehensive cross-format comparison
  * Compares Standard DICOM, HTJ2K, and Multi-frame HTJ2K outputs
  * Validates all formats produce highly similar segmentations (Dice > 0.95)
- Improve test_05_compare_dicom_vs_nifti with actual segmentation comparison logic
- Update test_06_multiframe_htj2k_inference with corrected test data path
- Remove redundant tests (test_07, test_08, test_09) - functionality consolidated in test_04

Multi-frame HTJ2K Tests - test_convert.py:
- Add HTJ2K_TRANSFER_SYNTAXES constant for explicit transfer syntax validation
- Add test_transcode_dicom_to_htj2k_multiframe_metadata()
  * Validates that all DICOM metadata is preserved (ImagePositionPatient, ImageOrientationPatient, etc.)
  * Verifies per-frame functional groups match original files
  * Checks frame ordering and spatial attributes
- Add test_transcode_dicom_to_htj2k_multiframe_lossless()
  * Validates pixel-perfect lossless compression
  * Verifies all frames match original pixel data
- Add test_transcode_dicom_to_htj2k_multiframe_nifti_consistency()
  * Ensures multi-frame HTJ2K produces the same NIfTI output as the original series
- Update all transfer syntax checks to use HTJ2K_TRANSFER_SYNTAXES constant
  * Replaces .startswith("1.2.840.10008.1.2.4.20") with explicit UID list
  * Covers all three HTJ2K variants (lossless, RPCL, lossy)

Code Cleanup:
- Revert debug logging in monailabel/endpoints/infer.py
- Add HTJ2K transfer syntax documentation in convert.py

All tests pass successfully, validating that:
1. Segmentation outputs are consistent across all DICOM formats
2. Multi-frame HTJ2K transcoding preserves all metadata correctly
3. Multi-frame HTJ2K compression is lossless
4. Multi-frame HTJ2K produces identical results to single-frame series

Signed-off-by: Joaquin Anton Guirao <janton@nvidia.com>
diff --git a/monailabel/datastore/utils/convert.py b/monailabel/datastore/utils/convert.py
@@ -639,6 +639,22 @@ def dicom_seg_to_itk_image(label, output_ext=".seg.nrrd"):
     return output_file
 
 
+def _create_basic_offset_table_pixel_data(encoded_frames: list) -> bytes:
+    """
+    Create encapsulated pixel data with Basic Offset Table for multi-frame DICOM.
+    
+    Uses pydicom's encapsulate() function to ensure 100% standard compliance.
+    
+    Args:
+        encoded_frames: List of encoded frame byte strings
+        
+    Returns:
+        bytes: Encapsulated pixel data with Basic Offset Table per DICOM Part 5 Section A.4
+    """
+    return pydicom.encaps.encapsulate(encoded_frames, has_bot=True)
+
+
+
 def _setup_htj2k_decode_params():
     """
     Create nvimgcodec decoding parameters for DICOM images.
diff --git a/monailabel/endpoints/infer.py b/monailabel/endpoints/infer.py
@@ -92,20 +92,6 @@ def send_response(datastore, result, output, background_tasks):
         return res_json
 
     if output == "image":
-        # Log NRRD metadata before sending response
-        try:
-            import nrrd
-            if res_img and os.path.exists(res_img) and (res_img.endswith('.nrrd') or res_img.endswith('.nrrd.gz')):
-                _, header = nrrd.read(res_img, index_order='C')
-                logger.info(f"[NRRD Geometry] File: {os.path.basename(res_img)}")
-                logger.info(f"[NRRD Geometry] Dimensions: {header.get('sizes')}")
-                logger.info(f"[NRRD Geometry] Space Origin: {header.get('space origin')}")
-                logger.info(f"[NRRD Geometry] Space Directions: {header.get('space directions')}")
-                logger.info(f"[NRRD Geometry] Space: {header.get('space')}")
-                logger.info(f"[NRRD Geometry] Type: {header.get('type')}")
-                logger.info(f"[NRRD Geometry] Encoding: {header.get('encoding')}")
-        except Exception as e:
-            logger.warning(f"Failed to read NRRD metadata: {e}")
         return FileResponse(res_img, media_type=get_mime_type(res_img), filename=os.path.basename(res_img))
 
     if output == "dicom_seg":
diff --git a/tests/integration/radiology_serverless/test_dicom_segmentation.py b/tests/integration/radiology_serverless/test_dicom_segmentation.py
@@ -65,7 +65,14 @@ class TestDicomSegmentation(unittest.TestCase):
         "e7567e0a064f0c334226a0658de23afd",
         "1.2.826.0.1.3680043.8.274.1.1.8323329.686521.1629744176.620266"
     )
-    
+
+    dicomweb_htj2k_multiframe_series = os.path.join(
+        data_dir,
+        "dataset",
+        "dicomweb_htj2k_multiframe",
+        "1.2.826.0.1.3680043.8.274.1.1.8323329.686521.1629744176.620251"
+    )
+
     @classmethod
     def setUpClass(cls) -> None:
         """Initialize MONAI Label app for direct usage without server."""
@@ -128,6 +135,25 @@ def _run_inference(self, image_path: str, model_name: str = "segmentation_spleen
         
         return label_data, label_json, inference_time
     
+    def _load_segmentation_array(self, label_data):
+        """
+        Load segmentation data as numpy array.
+        
+        Args:
+            label_data: File path (str) or numpy array
+            
+        Returns:
+            numpy array of segmentation
+        """
+        if isinstance(label_data, str):
+            import nibabel as nib
+            nii = nib.load(label_data)
+            return nii.get_fdata()
+        elif isinstance(label_data, np.ndarray):
+            return label_data
+        else:
+            raise ValueError(f"Unexpected label data type: {type(label_data)}")
+    
     def _validate_segmentation_output(self, label_data, label_json):
         """
         Validate that the segmentation output is correct.
@@ -146,9 +172,7 @@ def _validate_segmentation_output(self, label_data, label_json):
             
             # Try to load and verify the file
             try:
-                import nibabel as nib
-                nii = nib.load(label_data)
-                array = nii.get_fdata()
+                array = self._load_segmentation_array(label_data)
                 self.assertGreater(array.size, 0, "Segmentation array should not be empty")
                 logger.info(f"Segmentation shape: {array.shape}, dtype: {array.dtype}")
                 logger.info(f"Unique labels: {np.unique(array)}")
@@ -166,6 +190,71 @@ def _validate_segmentation_output(self, label_data, label_json):
         self.assertIsInstance(label_json, dict, "Label JSON should be a dictionary")
         logger.info(f"Label metadata keys: {list(label_json.keys())}")
     
+    def _compare_segmentations(self, label_data_1, label_data_2, name_1="Reference", name_2="Comparison", tolerance=0.05):
+        """
+        Compare two segmentation outputs to verify they are similar.
+        
+        Args:
+            label_data_1: First segmentation (file path or array)
+            label_data_2: Second segmentation (file path or array)
+            name_1: Name for first segmentation (for logging)
+            name_2: Name for second segmentation (for logging)
+            tolerance: Maximum allowed dice coefficient difference (0.0-1.0)
+            
+        Returns:
+            dict with comparison metrics
+        """
+        # Load arrays
+        array_1 = self._load_segmentation_array(label_data_1)
+        array_2 = self._load_segmentation_array(label_data_2)
+        
+        # Check shapes match
+        self.assertEqual(array_1.shape, array_2.shape, 
+                        f"Segmentation shapes should match: {array_1.shape} vs {array_2.shape}")
+        
+        # Calculate dice coefficient for each label
+        unique_labels = np.union1d(np.unique(array_1), np.unique(array_2))
+        unique_labels = unique_labels[unique_labels != 0]  # Exclude background
+        
+        dice_scores = {}
+        for label in unique_labels:
+            mask_1 = (array_1 == label).astype(np.float32)
+            mask_2 = (array_2 == label).astype(np.float32)
+            
+            intersection = np.sum(mask_1 * mask_2)
+            sum_masks = np.sum(mask_1) + np.sum(mask_2)
+            
+            if sum_masks > 0:
+                dice = (2.0 * intersection) / sum_masks
+                dice_scores[int(label)] = dice
+            else:
+                dice_scores[int(label)] = 0.0
+        
+        # Calculate overall metrics
+        exact_match = np.array_equal(array_1, array_2)
+        pixel_accuracy = np.mean(array_1 == array_2)
+        
+        comparison_result = {
+            'exact_match': exact_match,
+            'pixel_accuracy': pixel_accuracy,
+            'dice_scores': dice_scores,
+            'avg_dice': np.mean(list(dice_scores.values())) if dice_scores else 0.0
+        }
+        
+        # Log results
+        logger.info(f"\nComparing {name_1} vs {name_2}:")
+        logger.info(f"  Exact match: {exact_match}")
+        logger.info(f"  Pixel accuracy: {pixel_accuracy:.4f}")
+        logger.info(f"  Dice scores by label: {dice_scores}")
+        logger.info(f"  Average Dice: {comparison_result['avg_dice']:.4f}")
+        
+        # Assert high similarity
+        self.assertGreater(comparison_result['avg_dice'], 1.0 - tolerance,
+                          f"Segmentations should be similar (Dice > {1.0 - tolerance:.2f}). "
+                          f"Got {comparison_result['avg_dice']:.4f}")
+        
+        return comparison_result
+    
     def test_01_app_initialized(self):
         """Test that the app is properly initialized."""
         if not torch.cuda.is_available():
@@ -223,53 +312,110 @@ def test_03_dicom_inference_dicomweb_htj2k(self):
         self.assertLess(inference_time, 60.0, "Inference should complete within 60 seconds")
         logger.info(f"✓ DICOM inference test passed (HTJ2K) in {inference_time:.3f}s")
     
-    def test_04_dicom_inference_both_formats(self):
-        """Test inference on both standard and HTJ2K compressed DICOM series."""
+    def test_04_compare_all_formats(self):
+        """
+        Compare segmentation outputs across all DICOM format variations.
+        
+        This is the KEY test that validates:
+        - Standard DICOM (uncompressed, single-frame)
+        - HTJ2K compressed DICOM (single-frame)
+        - Multi-frame HTJ2K DICOM
+        
+        All produce IDENTICAL or highly similar segmentation results.
+        """
         if not torch.cuda.is_available():
             self.skipTest("CUDA not available")
         
         if not self.app:
             self.skipTest("App not initialized")
         
-        # Test both series types
+        logger.info(f"\n{'='*60}")
+        logger.info("Comparing Segmentation Outputs Across All Formats")
+        logger.info(f"{'='*60}")
+        
+        # Test all series types
         test_series = [
             ("Standard DICOM", self.dicomweb_series),
             ("HTJ2K DICOM", self.dicomweb_htj2k_series),
+            ("Multi-frame HTJ2K", self.dicomweb_htj2k_multiframe_series),
         ]
         
-        total_time = 0
-        successful = 0
-        
-        for series_type, dicom_dir in test_series:
-            if not os.path.exists(dicom_dir):
-                logger.warning(f"Skipping {series_type}: {dicom_dir} not found")
+        # Run inference on all available formats
+        results = {}
+        for series_name, series_path in test_series:
+            if not os.path.exists(series_path):
+                logger.warning(f"Skipping {series_name}: not found")
                 continue
             
-            logger.info(f"\nProcessing {series_type}: {dicom_dir}")
-            
+            logger.info(f"\nRunning {series_name}...")
             try:
-                label_data, label_json, inference_time = self._run_inference(dicom_dir)
+                label_data, label_json, inference_time = self._run_inference(series_path)
                 self._validate_segmentation_output(label_data, label_json)
                 
-                total_time += inference_time
-                successful += 1
-                logger.info(f"✓ {series_type} success in {inference_time:.3f}s")
-                
+                results[series_name] = {
+                    'label_data': label_data,
+                    'label_json': label_json,
+                    'time': inference_time
+                }
+                logger.info(f"  ✓ {series_name} completed in {inference_time:.3f}s")
             except Exception as e:
-                logger.error(f"✗ {series_type} failed: {e}", exc_info=True)
+                logger.error(f"  ✗ {series_name} failed: {e}", exc_info=True)
         
+        # Require at least 2 formats to compare
+        self.assertGreaterEqual(len(results), 2, 
+                               "Need at least 2 formats to compare. Check test data availability.")
+        
+        # Compare all pairs
+        logger.info(f"\n{'='*60}")
+        logger.info("Cross-Format Comparison:")
+        logger.info(f"{'='*60}")
+        
+        format_names = list(results.keys())
+        comparison_results = []
+        
+        for i in range(len(format_names)):
+            for j in range(i + 1, len(format_names)):
+                name1 = format_names[i]
+                name2 = format_names[j]
+                
+                logger.info(f"\nComparing: {name1} vs {name2}")
+                try:
+                    comparison = self._compare_segmentations(
+                        results[name1]['label_data'],
+                        results[name2]['label_data'],
+                        name_1=name1,
+                        name_2=name2,
+                        tolerance=0.05  # Allow 5% dice variation
+                    )
+                    comparison_results.append({
+                        'pair': f"{name1} vs {name2}",
+                        'dice': comparison['avg_dice'],
+                        'pixel_accuracy': comparison['pixel_accuracy']
+                    })
+                except Exception as e:
+                    logger.error(f"Comparison failed: {e}", exc_info=True)
+                    raise
+        
+        # Summary
         logger.info(f"\n{'='*60}")
-        logger.info(f"Summary: {successful}/{len(test_series)} series processed successfully")
-        if successful > 0:
-            logger.info(f"Total inference time: {total_time:.3f}s")
-            logger.info(f"Average time per series: {total_time/successful:.3f}s")
+        logger.info("Comparison Summary:")
+        for comp in comparison_results:
+            logger.info(f"  {comp['pair']}: Dice={comp['dice']:.4f}, Accuracy={comp['pixel_accuracy']:.4f}")
         logger.info(f"{'='*60}")
         
-        # At least one should succeed
-        self.assertGreater(successful, 0, "At least one DICOM series should be processed successfully")
+        # All comparisons should show high similarity
+        self.assertTrue(len(comparison_results) > 0, "Should have at least one comparison")
+        avg_dice = np.mean([c['dice'] for c in comparison_results])
+        logger.info(f"\nOverall average Dice across all comparisons: {avg_dice:.4f}")
+        self.assertGreater(avg_dice, 0.95, 
+                          "All formats should produce highly similar segmentations (avg Dice > 0.95)")
     
     def test_05_compare_dicom_vs_nifti(self):
-        """Compare inference results between DICOM series and pre-converted NIfTI files."""
+        """
+        Compare inference results between DICOM series and pre-converted NIfTI files.
+        
+        Validates that the DICOM reader produces identical results to pre-converted NIfTI.
+        """
         if not torch.cuda.is_available():
             self.skipTest("CUDA not available")
         
@@ -286,29 +432,75 @@ def test_05_compare_dicom_vs_nifti(self):
         if not os.path.exists(nifti_file):
             self.skipTest(f"Corresponding NIfTI file not found: {nifti_file}")
         
-        logger.info(f"Comparing DICOM vs NIfTI inference:")
+        logger.info(f"\n{'='*60}")
+        logger.info("Comparing DICOM vs NIfTI Segmentation")
+        logger.info(f"{'='*60}")
         logger.info(f"  DICOM: {dicom_dir}")
         logger.info(f"  NIfTI: {nifti_file}")
         
         # Run inference on DICOM
         logger.info("\n--- Running inference on DICOM series ---")
         dicom_label, dicom_json, dicom_time = self._run_inference(dicom_dir)
+        self._validate_segmentation_output(dicom_label, dicom_json)
         
         # Run inference on NIfTI
         logger.info("\n--- Running inference on NIfTI file ---")
         nifti_label, nifti_json, nifti_time = self._run_inference(nifti_file)
-        
-        # Validate both
-        self._validate_segmentation_output(dicom_label, dicom_json)
         self._validate_segmentation_output(nifti_label, nifti_json)
         
-        logger.info(f"\nPerformance comparison:")
+        # Compare the segmentation outputs
+        comparison = self._compare_segmentations(
+            dicom_label, 
+            nifti_label,
+            name_1="DICOM",
+            name_2="NIfTI",
+            tolerance=0.01  # Stricter tolerance - should be nearly identical
+        )
+        
+        logger.info(f"\n{'='*60}")
+        logger.info("Comparison Summary:")
         logger.info(f"  DICOM inference time: {dicom_time:.3f}s")
         logger.info(f"  NIfTI inference time: {nifti_time:.3f}s")
+        logger.info(f"  Dice coefficient: {comparison['avg_dice']:.4f}")
+        logger.info(f"  Pixel accuracy: {comparison['pixel_accuracy']:.4f}")
+        logger.info(f"  Exact match: {comparison['exact_match']}")
+        logger.info(f"{'='*60}")
+        
+        # Should be nearly identical (Dice > 0.99)
+        self.assertGreater(comparison['avg_dice'], 0.99,
+                          "DICOM and NIfTI segmentations should be nearly identical")
+    
+    def test_06_multiframe_htj2k_inference(self):
+        """
+        Test basic inference on multi-frame HTJ2K compressed DICOM series.
+        
+        Note: Comprehensive cross-format comparison is done in test_04.
+        This test ensures multi-frame HTJ2K inference works standalone.
+        """
+        if not torch.cuda.is_available():
+            self.skipTest("CUDA not available")
+        
+        if not self.app:
+            self.skipTest("App not initialized")
+        
+        if not os.path.exists(self.dicomweb_htj2k_multiframe_series):
+            self.skipTest(f"Multi-frame HTJ2K series not found: {self.dicomweb_htj2k_multiframe_series}")
+        
+        logger.info(f"\n{'='*60}")
+        logger.info("Testing Multi-Frame HTJ2K DICOM Inference")
+        logger.info(f"{'='*60}")
+        logger.info(f"Series path: {self.dicomweb_htj2k_multiframe_series}")
+        
+        # Run inference
+        label_data, label_json, inference_time = self._run_inference(self.dicomweb_htj2k_multiframe_series)
+        
+        # Validate output
+        self._validate_segmentation_output(label_data, label_json)
+        
+        # Performance check
+        self.assertLess(inference_time, 60.0, "Inference should complete within 60 seconds")
         
-        # Both should complete successfully
-        self.assertIsNotNone(dicom_label, "DICOM inference should succeed")
-        self.assertIsNotNone(nifti_label, "NIfTI inference should succeed")
+        logger.info(f"✓ Multi-frame HTJ2K inference test passed in {inference_time:.3f}s")
 
 
 if __name__ == "__main__":
diff --git a/tests/unit/datastore/test_convert.py b/tests/unit/datastore/test_convert.py