From b5817f2d8993c5f3059a09dfe3d3c3454df23888 Mon Sep 17 00:00:00 2001
From: nicolasK
Date: Tue, 5 Nov 2024 12:30:03 +0100
Subject: [PATCH] fix(cloudmask): when several products share the same
 cloudmask

---
 CHANGELOG.md                          | 6 ++++++
 earthdaily/__init__.py                | 2 +-
 earthdaily/earthdatastore/__init__.py | 37 +++++++++++++++++++++++++++
 3 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 89db3e1d..df9b7add 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.2.13] - 2024-11-05
+
+### Fixed
+
+- Manage when several products have the same cloudmask
+
 ## [0.2.12] - 2024-10-31
 
 ### Added
diff --git a/earthdaily/__init__.py b/earthdaily/__init__.py
index 1304908a..e35caac5 100644
--- a/earthdaily/__init__.py
+++ b/earthdaily/__init__.py
@@ -7,7 +7,7 @@
 # to hide warnings from rioxarray or nano seconds conversion
 # warnings.filterwarnings("ignore")
 
-__version__ = "0.2.12"
+__version__ = "0.2.13"
 
 
 def EarthDataStore(
diff --git a/earthdaily/earthdatastore/__init__.py b/earthdaily/earthdatastore/__init__.py
index c1c2b726..053d5c3c 100644
--- a/earthdaily/earthdatastore/__init__.py
+++ b/earthdaily/earthdatastore/__init__.py
@@ -70,6 +70,41 @@ def post_query_items(items, query):
         items = ItemCollection(items_)
     return items
+def _select_last_common_occurrences(first, second):
+    """
+    For each date in the second dataset, select the last N occurrences of that date from the first dataset,
+    where N is the count of that date in the second dataset.
+
+    Parameters:
+    first (xarray.Dataset): Source dataset
+    second (xarray.Dataset): Dataset containing the dates to match and their counts
+
+    Returns:
+    xarray.Dataset: Subset of first dataset with selected time indices
+    """
+    # Convert times to datetime64[ns] if they aren't already
+    first_times = first.time.astype("datetime64[ns]")
+    second_times = second.time.astype("datetime64[ns]")
+
+    # Get unique dates and their counts from second dataset
+    unique_dates, counts = np.unique(second_times.values, return_counts=True)
+
+    # Initialize list to store selected indices
+    selected_indices = []
+
+    # For each unique date in second
+    for date, count in zip(unique_dates, counts):
+        # Find all indices where this date appears in first
+        date_indices = np.where(first_times == date)[0]
+        # Take the last 'count' number of indices
+        selected_indices.extend(date_indices[-count:])
+
+    # Sort the selected indices in descending (reverse temporal) order
+    selected_indices = sorted(selected_indices, reverse=True)
+
+    # Select these indices from the first dataset
+    return first.isel(time=selected_indices)
+
 
 
 def _cloud_path_to_http(cloud_path):
     """Convert a cloud path to HTTP URL.
@@ -963,6 +998,8 @@ def datacube(
                 **kwargs,
             )
             xr_datacube["time"] = xr_datacube.time.astype("M8[ns]")
+            if xr_datacube.time.size != acm_datacube.time.size:
+                xr_datacube = _select_last_common_occurrences(xr_datacube, acm_datacube)
             acm_datacube["time"] = xr_datacube["time"].time
             acm_datacube = cube_utils._match_xy_dims(acm_datacube, xr_datacube)
             xr_datacube = xr.merge((xr_datacube, acm_datacube), compat="override")