Draft
Changes from all commits
97 commits
d9121f8
feat: add sharding support for GeoZarr conversion and CLI
emmanuelmathot Sep 26, 2025
8b83e77
update launch configurations for GeoZarr conversion with new data sou…
emmanuelmathot Sep 26, 2025
2693532
Merge branch 'launch' into sharding
emmanuelmathot Sep 26, 2025
367b146
feat: enable sharding in GeoZarr conversion launch configuration
emmanuelmathot Sep 26, 2025
30d6fb0
fix: update sharding codec handling in _create_sharded_encoding function
emmanuelmathot Sep 26, 2025
6f3db70
Merge branch 'sharding' of https://github.com/EOPF-Explorer/data-mode…
emmanuelmathot Sep 26, 2025
4f38d68
refactor: streamline sharding configuration in _create_geozarr_encodi…
emmanuelmathot Sep 26, 2025
37ad2c0
feat: enhance sharding logic in _create_geozarr_encoding and add _cal…
emmanuelmathot Sep 26, 2025
67faa58
feat: improve sharding configuration and validation in _create_geozar…
emmanuelmathot Sep 26, 2025
cccd8fd
fix: refine shard dimension calculation and improve divisor check in …
emmanuelmathot Sep 26, 2025
064917a
Add dataset tree structure and test script for sharding fix
emmanuelmathot Sep 26, 2025
82305a6
feat: enable sharding in Dask cluster setup and enhance chunking logi…
emmanuelmathot Sep 26, 2025
4486f81
Merge branch 'sharding' of https://github.com/EOPF-Explorer/data-mode…
emmanuelmathot Sep 26, 2025
3232a8e
Add Sentinel-2 Optimization Module with CLI Integration and Data Proc…
emmanuelmathot Sep 26, 2025
4fc14a3
feat: enhance S2 data consolidator with comprehensive extraction meth…
emmanuelmathot Sep 26, 2025
3d1ea51
Add comprehensive tests for S2MultiscalePyramid class
emmanuelmathot Sep 26, 2025
1ae2c19
feat: simplify chunk alignment and sharding logic in S2MultiscalePyramid
emmanuelmathot Sep 26, 2025
fca840e
feat: integrate S2 optimization commands into CLI and enhance convert…
emmanuelmathot Sep 26, 2025
72546bc
feat: add S2L2A optimized conversion command to CLI and update launch…
emmanuelmathot Sep 26, 2025
4d07907
feat: enhance S2 converter and multiscale pyramid with optimized enco…
emmanuelmathot Sep 27, 2025
89bc6cf
feat: enhance sharding logic to ensure compatibility with chunk dimen…
emmanuelmathot Sep 27, 2025
3aff8d3
feat: add downsampling for 10m data and adjust dataset creation for l…
emmanuelmathot Sep 27, 2025
1cd4281
feat: add support for Dask cluster in S2 optimization commands and en…
emmanuelmathot Sep 28, 2025
dfca7e1
feat: add compression level option for GeoZarr conversion
emmanuelmathot Sep 28, 2025
68d6715
Merge branch 'new_s2' of https://github.com/EOPF-Explorer/data-model …
emmanuelmathot Sep 28, 2025
d4c7487
feat: implement Dask parallelization for multiscale pyramid creation …
emmanuelmathot Sep 28, 2025
a1539fb
feat: enhance multiscale pyramid creation with streaming Dask paralle…
emmanuelmathot Sep 28, 2025
ded0f61
feat: configure Dask client to use 3 workers with 8GB memory each for…
emmanuelmathot Sep 28, 2025
0a2cc41
fix: update import path for geozarr functions in S2OptimizedConverter
emmanuelmathot Sep 28, 2025
19cabb8
feat: refactor multiscales metadata handling and root consolidation i…
emmanuelmathot Sep 28, 2025
75be5f7
feat: add comprehensive unit tests for S2OptimizedConverter and relat…
emmanuelmathot Sep 28, 2025
c409af1
feat: implement geographic metadata writing in S2MultiscalePyramid an…
emmanuelmathot Sep 28, 2025
7df85bd
feat: skip duplicate variables during downsampling in S2MultiscalePyr…
emmanuelmathot Sep 28, 2025
e7896c4
feat: enhance CRS handling by adding grid mapping variable to dataset…
emmanuelmathot Sep 28, 2025
eec6b27
feat: add grid mapping variable writing for datasets in S2MultiscaleP…
emmanuelmathot Sep 28, 2025
d4a5a95
feat: skip already present variables during downsampling in S2Multisc…
emmanuelmathot Sep 28, 2025
df005ef
feat: reduce memory limit for Dask client to 4GB and add geographic m…
emmanuelmathot Sep 28, 2025
728168d
Merge branch 'main' into new_s2
emmanuelmathot Sep 29, 2025
0a265aa
Refactor test cases and improve code formatting in S2 resampling test…
emmanuelmathot Sep 29, 2025
4849e3f
feat: update memory limit for Dask client to 8GB and adjust spatial c…
emmanuelmathot Sep 29, 2025
e388b26
Merge branch 'new_s2' of https://github.com/EOPF-Explorer/data-model …
emmanuelmathot Sep 29, 2025
3d02eab
feat: add new CLI command for converting to GeoZarr S2L2A optimized f…
emmanuelmathot Sep 29, 2025
48f5dd8
feat: implement batched parallel downsampling for S2 datasets and imp…
emmanuelmathot Sep 29, 2025
94f62f9
Merge branch 'new_s2' of https://github.com/EOPF-Explorer/data-model …
emmanuelmathot Sep 29, 2025
e20c411
fix: update measurement group keys and enhance dataset loading with d…
emmanuelmathot Sep 30, 2025
e33f035
feat: add streaming support for multiscale pyramid creation in S2 con…
emmanuelmathot Sep 30, 2025
0307d0d
feat: add --enable-streaming option for experimental streaming mode i…
emmanuelmathot Sep 30, 2025
1d6a922
fix: avoid passing coordinates in lazy dataset creation to prevent al…
emmanuelmathot Sep 30, 2025
c32aaef
feat: implement Zarr v3 compatible encoding for optimized datasets
emmanuelmathot Sep 30, 2025
2ccb110
fix: enhance measurements group writing by consolidating metadata and…
emmanuelmathot Sep 30, 2025
a4952e7
feat: enhance streaming write with advanced chunking and sharding sup…
emmanuelmathot Sep 30, 2025
9d97eee
feat: enhance encoding for streaming writes with advanced chunking an…
emmanuelmathot Sep 30, 2025
76cde29
fix: improve root-level metadata consolidation with proper Zarr group…
emmanuelmathot Sep 30, 2025
52e516d
feat: add streaming support to S2 optimized converter and update meas…
emmanuelmathot Sep 30, 2025
73bfea2
Merge branch 'new_s2' of https://github.com/EOPF-Explorer/data-model …
emmanuelmathot Sep 30, 2025
5e58251
fix: change root Zarr group creation mode from 'w' to 'a' for appendi…
emmanuelmathot Sep 30, 2025
8250bc7
refactor: streamline Zarr group handling and metadata consolidation i…
emmanuelmathot Sep 30, 2025
deff685
fix: streamline root Zarr group creation by removing existence check …
emmanuelmathot Sep 30, 2025
56596e2
Merge branch 'new_s2' of https://github.com/EOPF-Explorer/data-model …
emmanuelmathot Sep 30, 2025
cb4ada1
fix: correct multiscales attribute assignment and update group prefix…
emmanuelmathot Sep 30, 2025
16c245b
feat: add downsampled coordinates creation for multiscale pyramid levels
emmanuelmathot Sep 30, 2025
42a72fb
fix: update launch configuration for S2A MSIL2A dataset and adjust gr…
emmanuelmathot Oct 1, 2025
e16d9f7
Refactor downsample factor calculation in S2StreamingMultiscalePyramid
emmanuelmathot Oct 1, 2025
94f55bc
Merge branch 'new_s2' of https://github.com/EOPF-Explorer/data-model …
emmanuelmathot Oct 1, 2025
e7d6bf0
Merge branch 'main' into new_s2
emmanuelmathot Oct 25, 2025
fb2b60d
streaming as default
emmanuelmathot Oct 25, 2025
bc692c7
refactor: update S2 optimization process to preserve original data st…
emmanuelmathot Oct 25, 2025
5559a91
refactor: enhance multiscale creation by preserving all original grou…
emmanuelmathot Oct 25, 2025
9bdd615
refactor: enhance group writing by preserving original chunking and e…
emmanuelmathot Oct 25, 2025
1c276a9
refactor: preserve original chunking during dataset writing by rechun…
emmanuelmathot Oct 26, 2025
66c7937
refactor: enhance downsampling process by organizing resolution group…
emmanuelmathot Oct 26, 2025
b499995
refactor: improve error handling and verbosity in downsampling proces…
emmanuelmathot Oct 26, 2025
18de7ab
refactor: update band mapping for Sentinel-2 by adding 'b10' to nativ…
emmanuelmathot Oct 26, 2025
e9cf962
refactor: update tile dimensions calculation and enhance multiscales …
emmanuelmathot Nov 3, 2025
cc8eaa3
refactor: simplify multiscales metadata addition by removing unnecess…
emmanuelmathot Nov 3, 2025
8d1c89d
refactor: simplify variable naming in multiscale pyramid creation and…
emmanuelmathot Nov 3, 2025
70fe228
Merge branch 'new_s2' of https://github.com/EOPF-Explorer/data-model …
emmanuelmathot Nov 3, 2025
935b043
refactor: streamline zarr group creation and multiscales metadata han…
emmanuelmathot Nov 3, 2025
b9bcc31
refactor: change Zarr write mode from 'a' to 'r+' in S2 converter and…
emmanuelmathot Nov 3, 2025
c54b253
refactor: change Zarr write mode from 'r+' to 'a' and simplify DataTr…
emmanuelmathot Nov 3, 2025
efd8fa2
fix: correct parameter name from 'modea' to 'mode' in DataTree zarr w…
emmanuelmathot Nov 3, 2025
f8271a3
feat: add missing parent groups creation in root-level metadata conso…
emmanuelmathot Nov 3, 2025
117795b
feat: enhance root-level group creation by identifying and creating m…
emmanuelmathot Nov 3, 2025
59ccbc0
fix: correct parameter name from 'zqarr_format' to 'zarr_format' in S…
emmanuelmathot Nov 3, 2025
5d9057d
fix: store result of multiscales metadata addition in processed_groups
emmanuelmathot Nov 3, 2025
b3cede6
fix: update NATIVE_BANDS to include 'b10' and adjust pyramid levels c…
emmanuelmathot Nov 3, 2025
870e28f
fix: update coordinate creation in downsampling for consistency and i…
emmanuelmathot Nov 3, 2025
d751282
refactor: remove unused fixture and update tests for pyramid levels a…
emmanuelmathot Nov 3, 2025
5c583f1
Merge branch 'main' into new_s2
emmanuelmathot Nov 3, 2025
d9a3d2d
Remove Sentinel-2 Zarr Conversion Optimization Plan and associated te…
emmanuelmathot Nov 3, 2025
7d6fa51
Implement feature X to enhance user experience and fix bug Y in module Z
emmanuelmathot Nov 3, 2025
2290b8e
delete: remove dataset_tree_simplified.txt as it is no longer needed
emmanuelmathot Nov 3, 2025
a0feee9
Merge branch 'main' into new_s2
emmanuelmathot Nov 24, 2025
81d7f62
Refactor code structure for improved readability and maintainability
emmanuelmathot Nov 24, 2025
e347fcd
Refactor S2 optimization commands and enhance CRS handling
emmanuelmathot Nov 24, 2025
a630631
Add geo metadata writing for /measurements/ groups in S2 multiscale p…
emmanuelmathot Nov 24, 2025
4487090
Add support for additional S3 storage options and update launch confi…
emmanuelmathot Nov 25, 2025
669 changes: 357 additions & 312 deletions .vscode/launch.json

Large diffs are not rendered by default.

10 changes: 0 additions & 10 deletions src/eopf_geozarr/cli.py
@@ -1182,14 +1182,6 @@ def add_s2_optimization_commands(subparsers: Any) -> None:
         choices=range(1, 10),
         help="Compression level 1-9 (default: 3)",
     )
-    s2_parser.add_argument(
-        "--skip-geometry", action="store_true", help="Skip creating geometry group"
-    )
-    s2_parser.add_argument(
-        "--skip-meteorology",
-        action="store_true",
-        help="Skip creating meteorology group",
-    )
     s2_parser.add_argument(
         "--skip-validation", action="store_true", help="Skip output validation"
     )
@@ -1229,8 +1221,6 @@ def convert_s2_optimized_command(args: Any) -> int:
         enable_sharding=args.enable_sharding,
         spatial_chunk=args.spatial_chunk,
         compression_level=args.compression_level,
-        create_geometry_group=not args.skip_geometry,
-        create_meteorology_group=not args.skip_meteorology,
         validate_output=not args.skip_validation,
         verbose=args.verbose,
     )
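The two group toggles are gone now that the converter preserves the original group structure. For orientation, a minimal sketch of driving the same conversion from Python with the options that remain — the converter's exact constructor signature is not shown in this diff, so the keyword names here are assumptions based on what the CLI forwards:

```python
# A minimal sketch, not the exact CLI wiring: open an EOPF Sentinel-2 product
# as a DataTree and run the optimized conversion with the remaining options.
# The input path is a placeholder.
import xarray as xr

from eopf_geozarr.s2_optimization.s2_converter import S2OptimizedConverter

dt_input = xr.open_datatree("S2A_MSIL2A_example.zarr", engine="zarr")

converter = S2OptimizedConverter(
    enable_sharding=True,  # --enable-sharding
    spatial_chunk=256,     # --spatial-chunk
)
result_dt = converter.convert_s2_optimized(
    dt_input,
    "output_optimized.zarr",
    validate_output=True,  # inverted by --skip-validation
    verbose=True,
)
```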
15 changes: 15 additions & 0 deletions src/eopf_geozarr/conversion/fs_utils.py
@@ -116,6 +116,9 @@ def get_s3_storage_options(s3_path: str, **s3_kwargs: Any) -> S3FsOptions:
             "client_kwargs": {
                 "region_name": os.environ.get("AWS_DEFAULT_REGION", "us-east-1")
             },
+            "s3_additional_kwargs": {
+                "StorageClass": "EXPRESS_ONEZONE",
+            },
         }
 
     # Add custom endpoint support (e.g., for OVH Cloud)
@@ -209,6 +212,9 @@ def write_s3_json_metadata(
             "client_kwargs": {
                 "region_name": os.environ.get("AWS_DEFAULT_REGION", "us-east-1")
             },
+            "s3_additional_kwargs": {
+                "StorageClass": "EXPRESS_ONEZONE",
+            },
         }
 
     # Add custom endpoint support (e.g., for OVH Cloud)
@@ -251,6 +257,9 @@ def read_s3_json_metadata(s3_path: str, **s3_kwargs: Any) -> dict[str, Any]:
             "client_kwargs": {
                 "region_name": os.environ.get("AWS_DEFAULT_REGION", "us-east-1")
             },
+            "s3_additional_kwargs": {
+                "StorageClass": "EXPRESS_ONEZONE",
+            },
         }
 
     # Add custom endpoint support (e.g., for OVH Cloud)
@@ -293,6 +302,9 @@ def s3_path_exists(s3_path: str, **s3_kwargs: Any) -> bool:
             "client_kwargs": {
                 "region_name": os.environ.get("AWS_DEFAULT_REGION", "us-east-1")
             },
+            "s3_additional_kwargs": {
+                "StorageClass": "EXPRESS_ONEZONE",
+            },
         }
 
     # Add custom endpoint support (e.g., for OVH Cloud)
@@ -380,6 +392,9 @@ def validate_s3_access(s3_path: str, **s3_kwargs: Any) -> tuple[bool, str | None]:
             "client_kwargs": {
                 "region_name": os.environ.get("AWS_DEFAULT_REGION", "us-east-1")
             },
+            "s3_additional_kwargs": {
+                "StorageClass": "EXPRESS_ONEZONE",
+            },
         }
 
     # Add custom endpoint support (e.g., for OVH Cloud)
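All five helpers now inject the same `s3_additional_kwargs` block, so objects written through the resulting filesystem are created with the S3 Express One Zone storage class. A self-contained sketch of the pattern, assuming bucket and key placeholders (on AWS, `EXPRESS_ONEZONE` only applies to directory buckets; other S3-compatible providers may ignore or reject it):

```python
# Minimal sketch of the storage-options pattern repeated in each helper above:
# pin the region and request the S3 Express One Zone storage class on uploads.
import os

import s3fs

storage_options = {
    "client_kwargs": {
        "region_name": os.environ.get("AWS_DEFAULT_REGION", "us-east-1"),
    },
    # Forwarded by s3fs to the underlying put/upload calls, so every object
    # written through this filesystem carries the requested storage class.
    "s3_additional_kwargs": {
        "StorageClass": "EXPRESS_ONEZONE",
    },
}

fs = s3fs.S3FileSystem(**storage_options)
with fs.open("my-bucket/example/zarr.json", "w") as f:  # hypothetical path
    f.write("{}")
```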
48 changes: 35 additions & 13 deletions src/eopf_geozarr/s2_optimization/s2_converter.py
@@ -6,6 +6,7 @@
 import time
 from typing import Any, Dict
 
+from pyproj import CRS
 import structlog
 import xarray as xr
 
@@ -37,13 +38,12 @@ def __init__(
         # Initialize components - streaming is always enabled
         self.pyramid_creator = S2MultiscalePyramid(enable_sharding, spatial_chunk)
         self.validator = S2OptimizationValidator()
+        self.crs: CRS | None = None
 
     def convert_s2_optimized(
         self,
         dt_input: xr.DataTree,
         output_path: str,
-        create_geometry_group: bool = True,
-        create_meteorology_group: bool = True,
         validate_output: bool = True,
         verbose: bool = False,
     ) -> xr.DataTree:
@@ -75,28 +75,30 @@ def convert_s2_optimized(
             raise ValueError("Input dataset is not a Sentinel-2 product")
 
         # Step 1: Process data while preserving original structure
-        log.info("Step 1: Processing data with original structure preserved")
+        log.info("Step 1: Preparing data (getting CRS, etc.)...")
+        self._init_crs_for_groups(dt_input)
 
         # Step 2: Create multiscale pyramids for each group in the original structure
-        log.info("Step 2: Creating multiscale pyramids (preserving original hierarchy)")
+        log.info(
+            "Step 2: Creating multiscale pyramids (preserving original hierarchy)..."
+        )
         datasets = self.pyramid_creator.create_multiscale_from_datatree(
-            dt_input, output_path, verbose
+            dt_input, output_path, verbose, self.crs
         )
 
-        log.info("Created multiscale pyramids", num_groups=len(datasets))
+        log.info(f"  Created multiscale pyramids for {len(datasets)} groups")
 
         # Step 3: Root-level consolidation
-        log.info("Step 3: Final root-level metadata consolidation")
+        log.info("Step 3: Final root-level metadata consolidation...")
         self._simple_root_consolidation(output_path, datasets)
 
         # Step 4: Validation
         if validate_output:
-            log.info("Step 4: Validating optimized dataset")
+            log.info("Step 4: Validating optimized dataset...")
             validation_results = self.validator.validate_optimized_dataset(output_path)
             if not validation_results["is_valid"]:
-                log.warning(
-                    "Validation issues found", issues=validation_results["issues"]
-                )
+                log.info("  Warning: Validation issues found:")
+                for issue in validation_results["issues"]:
+                    log.info(f"    - {issue}")
 
         # Create result DataTree
         result_dt = self._create_result_datatree(output_path)
@@ -108,6 +110,27 @@
         self._print_optimization_summary(dt_input, result_dt, output_path)
 
         return result_dt
+
+    def _init_crs_for_groups(self, dt: xr.DataTree) -> None:
+        epsg: int | None = None
+
+        # For CPM >= 2.6.0, the EPSG code is stored in attributes
+        epsg_CPM_260 = dt.attrs.get("other_metadata", {}).get("horizontal_CRS_code", None)
+        if epsg_CPM_260 is not None:
+            epsg = int(epsg_CPM_260.split(":")[-1])
+        # For older CPM versions, look for proj:epsg attribute in data variables
+        else:
+            for group in dt.groups.values():
+                for var in group.to_dataset().data_vars.values():
+                    if "proj:epsg" in var.attrs:
+                        epsg = int(var.attrs["proj:epsg"])
+                        break
+                if epsg is not None:
+                    break
+
+        self.crs = CRS.from_epsg(epsg) if epsg is not None else None
+        self.pyramid_creator.crs = self.crs
+
 
     def _is_sentinel2_dataset(self, dt: xr.DataTree) -> bool:
         """Check if dataset is Sentinel-2."""
@@ -147,7 +170,6 @@ def _simple_root_consolidation(
                 parent_path = "/" + "/".join(parts[:i])
                 if parent_path not in datasets:
                     missing_groups.add(parent_path)
-
         for group_path in missing_groups:
             dt_parent = xr.DataTree()
             dt_parent.to_zarr(
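The new `_init_crs_for_groups` resolves the product CRS once, up front, instead of re-deriving it per dataset. The lookup order can be exercised in isolation; `resolve_crs` below is a hypothetical stand-in that mirrors the two code paths from the diff (the CPM >= 2.6.0 root attribute first, then the legacy per-variable `proj:epsg` attribute):

```python
# Standalone sketch of the CRS lookup added above. resolve_crs is a
# hypothetical helper; the attribute layouts are taken from the new
# _init_crs_for_groups method.
from pyproj import CRS


def resolve_crs(root_attrs: dict, variable_attrs: list[dict]) -> CRS | None:
    # CPM >= 2.6.0: a code such as "EPSG:32633" under other_metadata
    code = root_attrs.get("other_metadata", {}).get("horizontal_CRS_code")
    if code is not None:
        return CRS.from_epsg(int(code.split(":")[-1]))
    # Older CPM versions: first proj:epsg attribute found on a data variable
    for attrs in variable_attrs:
        if "proj:epsg" in attrs:
            return CRS.from_epsg(int(attrs["proj:epsg"]))
    return None


crs = resolve_crs({"other_metadata": {"horizontal_CRS_code": "EPSG:32633"}}, [])
print(crs.to_epsg())  # 32633
```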
58 changes: 23 additions & 35 deletions src/eopf_geozarr/s2_optimization/s2_multiscale.py
@@ -46,6 +46,7 @@ def __init__(self, enable_sharding: bool = True, spatial_chunk: int = 256):
         self.enable_sharding = enable_sharding
         self.spatial_chunk = spatial_chunk
         self.resampler = S2ResamplingEngine()
+        self.crs: CRS | None = None
 
         # Define pyramid levels: resolution in meters
         self.pyramid_levels = {
@@ -58,7 +59,7 @@
         }
 
     def create_multiscale_from_datatree(
-        self, dt_input: xr.DataTree, output_path: str, verbose: bool = False
+        self, dt_input: xr.DataTree, output_path: str, verbose: bool = False, crs: CRS | None = None
     ) -> dict[str, dict]:
         """
         Create multiscale versions preserving original structure.
@@ -68,12 +69,10 @@
             dt_input: Input DataTree with original structure
             output_path: Base output path
             verbose: Enable verbose logging
-
         Returns:
             Dictionary of processed groups
         """
         processed_groups = {}
-
         # Step 1: Copy all original groups as-is
         for group_path in dt_input.groups:
             if group_path == ".":
@@ -90,11 +89,11 @@
                 # Skip empty groups
                 if not dataset.data_vars:
                     if verbose:
-                        log.info("  Skipping empty group: {}", group_path=group_path)
+                        log.info("  Skipping empty group", group_path=group_path)
                     continue
 
                 if verbose:
-                    log.info("  Copying original group: {}", group_path=group_path)
+                    log.info("  Copying original group", group_path=group_path)
 
                 output_group_path = f"{output_path}{group_path}"
 
@@ -124,15 +123,13 @@
             # Only process groups under /measurements/reflectance
             if not group_path.startswith(base_path):
                 continue
-
             group_name = group_path.split("/")[-1]
             if group_name in ["r10m", "r20m", "r60m"]:
                 resolution_groups[group_name] = processed_groups[group_path]
 
         # Find the coarsest resolution (r60m > r20m > r10m)
         source_dataset = None
         source_resolution = None
-
         for res in ["r60m", "r20m", "r10m"]:
             if res in resolution_groups:
                 source_dataset = resolution_groups[res]
@@ -160,7 +157,7 @@
             r120m_path = f"{base_path}/r120m"
             factor = 120 // source_resolution
             if verbose:
-                log.info("  Creating r120m with factor {}", factor=factor)
+                log.info("  Creating r120m with factor", factor=factor)
 
             r120m_dataset = self._create_downsampled_resolution_group(
                 source_dataset, factor=factor, verbose=verbose
@@ -169,7 +166,7 @@
             if r120m_dataset and len(r120m_dataset.data_vars) > 0:
                 output_path_120 = f"{output_path}{r120m_path}"
                 if verbose:
-                    log.info("  Writing r120m to {}", output_path_120=output_path_120)
+                    log.info("  Writing r120m", output_path_120=output_path_120)
                 encoding_120 = self._create_measurements_encoding(r120m_dataset)
                 ds_120 = self._stream_write_dataset(
                     r120m_dataset, output_path_120, encoding_120
@@ -191,7 +188,7 @@
                     output_path_360 = f"{output_path}{r360m_path}"
                     if verbose:
                         log.info(
-                            "  Writing r360m to {}",
+                            "  Writing r360m",
                            output_path_360=output_path_360,
                         )
                     encoding_360 = self._create_measurements_encoding(r360m_dataset)
@@ -215,7 +212,7 @@
                         output_path_720 = f"{output_path}{r720m_path}"
                         if verbose:
                             log.info(
-                                "  Writing r720m to {}",
+                                "  Writing r720m",
                                 output_path_720=output_path_720,
                             )
                         encoding_720 = self._create_measurements_encoding(
@@ -240,7 +237,7 @@
                         log.info("  r360m dataset is empty, skipping")
                 except Exception as e:
                     log.warning(
-                        "Could not create r360m for {}: {}", base_path=base_path, e=e
+                        "Could not create r360m", base_path=base_path, e=e
                     )
             # Track r120m for multiscales if created
             if verbose:
@@ -249,7 +246,7 @@
             if verbose:
                 log.info("  r120m dataset is empty, skipping")
         except Exception as e:
-            log.warning("Could not create r120m for {}: {}", base_path=base_path, e=e)
+            log.warning("Could not create r120m", base_path=base_path, e=e)
 
         # Step 3: Add multiscales metadata to parent groups
         if verbose:
@@ -262,7 +259,7 @@
                 processed_groups[base_path] = dt_multiscale
             except Exception as e:
                 log.warning(
-                    "Could not add multiscales metadata to {}: {}", base_path=base_path, e=e
+                    "Could not add multiscales metadata to parent groups", base_path=base_path, e=e
                 )
 
         return processed_groups
@@ -302,7 +299,6 @@ def _create_downsampled_resolution_group(
         """Create a downsampled version of a dataset by given factor."""
         if not source_dataset or len(source_dataset.data_vars) == 0:
             return xr.Dataset()
-
         # Get reference dimensions
         ref_var = next(iter(source_dataset.data_vars.values()))
         if ref_var.ndim < 2:
@@ -319,13 +315,11 @@
         downsampled_coords = self._create_downsampled_coordinates(
             source_dataset, target_height, target_width, factor
         )
-
         # Downsample all variables using existing lazy operations
         lazy_vars = {}
         for var_name, var_data in source_dataset.data_vars.items():
             if var_data.ndim < 2:
                 continue
-
             lazy_downsampled = self._create_lazy_downsample_operation_from_existing(
                 var_data, target_height, target_width
             )
@@ -403,15 +397,16 @@ def _stream_write_dataset(
                 )
                 return existing_ds
 
-        log.info("  Streaming computation and write to {}", dataset_path=dataset_path)
-        log.info("Variables", variables=list(dataset.data_vars.keys()))
+        log.info("  Streaming computation and write", dataset_path=dataset_path)
+        log.info("  Variables", variables=list(dataset.data_vars.keys()))
 
         # Rechunk dataset to align with encoding when sharding is enabled
         if self.enable_sharding:
             dataset = self._rechunk_dataset_for_encoding(dataset, encoding)
 
-        # Add the geo metadata before writing
-        self._write_geo_metadata(dataset)
+        # Add the geo metadata before writing for /measurements/ groups
+        if "/measurements/" in dataset_path:
+            self._write_geo_metadata(dataset)
 
         # Write with streaming computation and progress tracking
         # The to_zarr operation will trigger all lazy computations
@@ -430,14 +425,14 @@
             try:
                 distributed.progress(write_job, notebook=False)
             except Exception as e:
-                log.warning("Could not display progress bar: {}", e=e)
+                log.warning("Could not display progress bar", e=e)
                 write_job.compute()
         else:
             log.info("  Writing zarr file...")
             write_job.compute()
 
         log.info(
-            "  Streaming write complete for dataset {}", dataset_path=dataset_path
+            "  Streaming write complete for dataset", dataset_path=dataset_path
         )
         return dataset
@@ -740,7 +735,6 @@ def _add_multiscales_metadata_to_parent(
             "resampling_method": "average",
             "tile_matrix_limits": tile_matrix_limits,
         }
-
         # Create parent group path
         parent_group_path = f"{output_path}{base_path}"
         dt_multiscale = xr.DataTree()
@@ -766,19 +760,13 @@ def _write_geo_metadata(
     ) -> None:
         """Write geographic metadata to the dataset."""
         # Implementation same as original
-        crs = None
-        for var in dataset.data_vars.values():
-            if hasattr(var, "rio") and var.rio.crs:
-                crs = var.rio.crs
-                break
-            elif "proj:epsg" in var.attrs:
-                epsg = var.attrs["proj:epsg"]
-                crs = CRS.from_epsg(epsg)
-                break
+        if self.crs is None:
+            log.warning("CRS is not set, skipping geo metadata writing")
+            return
 
-        if crs is not None:
+        if self.crs is not None:
             dataset.rio.write_crs(
-                crs, grid_mapping_name=grid_mapping_var_name, inplace=True
+                self.crs, grid_mapping_name=grid_mapping_var_name, inplace=True
             )
             dataset.rio.write_grid_mapping(grid_mapping_var_name, inplace=True)
             dataset.attrs["grid_mapping"] = grid_mapping_var_name
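The overview factors fall out of integer division against the coarsest native resolution group (the `for res in ["r60m", "r20m", "r10m"]` loop prefers r60m). A worked example of the arithmetic; the factor computations for the r360m and r720m levels are truncated in this diff, so they are assumed below to follow the same pattern:

```python
# Factor arithmetic for the derived pyramid levels, assuming r60m is present
# and therefore selected as the source (coarsest-first preference).
source_resolution = 60

factor_120 = 120 // source_resolution
print(factor_120)  # 2: r120m is a 2x downsample of r60m

# Assumed: the coarser levels chain on top of the previous one the same way.
print(360 // 120)  # 3: r360m from r120m
print(720 // 360)  # 2: r720m from r360m
```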
1 change: 0 additions & 1 deletion src/eopf_geozarr/s2_optimization/s2_resampling.py
@@ -102,7 +102,6 @@ def _downsample_classification(
         # Take the center pixel of each block as representative
         center_h = block_h // 2
         center_w = block_w // 2
-
         if data.ndim == 3:
             # Sample every block_h and block_w pixels, starting from center
             downsampled = data.values[:, center_h::block_h, center_w::block_w]
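The touched code is `_downsample_classification`, which picks each block's center pixel rather than averaging, since averaging categorical class codes would produce values that are not valid classes. A small NumPy sketch of that sampling, assuming the array dimensions divide evenly by the block size:

```python
# Sketch of the center-pixel block sampling used for classification bands.
import numpy as np


def downsample_classification(data: np.ndarray, block_h: int, block_w: int) -> np.ndarray:
    # Take the center pixel of each block_h x block_w block as representative.
    center_h = block_h // 2
    center_w = block_w // 2
    if data.ndim == 3:
        return data[:, center_h::block_h, center_w::block_w]
    return data[center_h::block_h, center_w::block_w]


scl = np.arange(36).reshape(6, 6)  # toy 6x6 "scene classification" band
print(downsample_classification(scl, 2, 2))
# [[ 7  9 11]
#  [19 21 23]
#  [31 33 35]]
```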