190 changes: 56 additions & 134 deletions data/forest_loss_driver/config_studio_annotation.json

Large diffs are not rendered by default.

14 changes: 8 additions & 6 deletions rslp/forest_loss_driver/scripts/add_area_to_studio_tasks.py
@@ -20,7 +20,7 @@
from rslearn.utils.geometry import STGeometry
from rslearn.utils.get_utm_ups_crs import get_utm_ups_projection

BASE_URL = "https://earth-system-studio.allen.ai/api/v1"
BASE_URL = "https://olmoearth.allenai.org/api/v1"

# Arbitrary user ID to save the annotation under.
# This one is ES Studio User.
@@ -37,8 +37,10 @@
# Get the annotation metadata field ID for the Area field.
url = f"{BASE_URL}/projects/{project_id}"
response = requests.get(url, headers=headers, timeout=10)
-assert response.status_code == 200
-project_data = response.json()
+response.raise_for_status()
+json_data = response.json()
+assert len(json_data["records"]) == 1
+project_data = json_data["records"][0]
metadata_field_id = None
for metadata_field in project_data["template"]["annotation_metadata_fields"]:
if metadata_field["name"] != "Area":
@@ -50,13 +52,13 @@
# Now iterate through tasks.
url = f"{BASE_URL}/projects/{project_id}/tasks?limit=1000"
response = requests.get(url, headers=headers, timeout=10)
-assert response.status_code == 200
+response.raise_for_status()
item_list = response.json()["items"]
for task in tqdm.tqdm(item_list):
task_id = task["id"]
url = f"{BASE_URL}/tasks/{task_id}/annotations"
response = requests.get(url, headers=headers, timeout=10)
-assert response.status_code == 200
+response.raise_for_status()
fc = response.json()
if len(fc["features"]) != 1:
continue
@@ -106,4 +108,4 @@

url = f"{BASE_URL}/annotations/{annotation_id}"
response = requests.put(url, json.dumps(post_data), headers=headers, timeout=10)
-assert response.status_code == 200
+response.raise_for_status()
87 changes: 87 additions & 0 deletions rslp/forest_loss_driver/scripts/peru_20260112/README.md
@@ -0,0 +1,87 @@
This project is for populating examples for a new phase of Peru annotation.

## Get Predictions

First we get predictions in Peru for a five-year period. `integrated_config.yaml`
contains the YAML config used for the integrated inference pipeline in
olmoearth_projects:

```
python -m olmoearth_projects.main projects.forest_loss_driver.deploy integrated_pipeline --config ../rslearn_projects/rslp/forest_loss_driver/scripts/peru_20260112/integrated_config.yaml
```

We only need to run it up to the point where it collects the events across the
Studio jobs; from that run we got this file:

```
/weka/dfive-default/rslearn-eai/datasets/forest_loss_driver/dataset_v1/peru_20260112/inference/dataset_20260109/events_from_studio_jobs.geojson
```

## Select Examples

Then we select examples for annotation:

```
python rslp/forest_loss_driver/scripts/peru_20260112/select_examples_for_annotation.py
```

This script will read the events from the file above and write out an rslearn dataset
here:

```
/weka/dfive-default/rslearn-eai/datasets/forest_loss_driver/dataset_v1/peru_20260112/rslearn_dataset_for_selected_events/
```

The rslearn dataset should first be created with the config file from
`data/forest_loss_driver/config_studio_annotation.json`.
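
Creating the dataset amounts to placing that config at the dataset root. A minimal sketch, assuming rslearn's convention of reading a `config.json` from the root of the dataset directory:

```
import shutil
from pathlib import Path

# Hypothetical sketch: initialize the rslearn dataset directory with the
# annotation config (assumes rslearn reads config.json from the dataset root).
ds_root = Path(
    "/weka/dfive-default/rslearn-eai/datasets/forest_loss_driver/dataset_v1/peru_20260112/rslearn_dataset_for_selected_events/"
)
ds_root.mkdir(parents=True, exist_ok=True)
shutil.copy(
    "data/forest_loss_driver/config_studio_annotation.json",
    ds_root / "config.json",
)
```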

The selection is done by randomly sampling 100 forest loss events that were predicted
as each of logging/burned/none/river/airstrip (500 total), and another 500 where the
maximum probability is <0.5 (indicating the model was not confident).
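
The core of that selection looks roughly like the sketch below; it assumes each GeoJSON feature carries the `new_label` and `probs` properties produced by inference. The full version, which also enforces a minimum distance between selected events, is `select_examples_for_annotation.py` in this change:

```
import json
import random

RARE_CATEGORIES = ["logging", "burned", "none", "river", "airstrip"]
PROB_THRESHOLD = 0.5

with open("events_from_studio_jobs.geojson") as f:
    features = json.load(f)["features"]

# Partition events into per-category pools and a low-confidence pool.
by_class = {category: [] for category in RARE_CATEGORIES}
low_confidence = []
for feat in features:
    props = feat["properties"]
    if props["new_label"] in RARE_CATEGORIES:
        by_class[props["new_label"]].append(feat)
    elif max(props["probs"]) < PROB_THRESHOLD:
        low_confidence.append(feat)

# Sample 100 per predicted category (500 total) plus 500 low-confidence events.
selected = []
for pool in by_class.values():
    selected.extend(random.sample(pool, min(100, len(pool))))
selected.extend(random.sample(low_confidence, min(500, len(low_confidence))))
```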

## Prepare and Materialize

Make sure to set the `PLANET_API_KEY` environment variable, since it is used in the dataset config. Then:

```
rslearn dataset prepare --root /weka/dfive-default/rslearn-eai/datasets/forest_loss_driver/dataset_v1/peru_20260112/rslearn_dataset_for_selected_events/ --workers 128 --retry-max-attempts 10 --retry-backoff-seconds 5
rslearn dataset materialize --root /weka/dfive-default/rslearn-eai/datasets/forest_loss_driver/dataset_v1/peru_20260112/rslearn_dataset_for_selected_events/ --workers 128 --retry-max-attempts 10 --retry-backoff-seconds 5 --ignore-errors
```

## Additional Steps

Afterwards there are a few additional steps we need to do, because we forgot to
include them in the initial example selection script.

First, rename the tasks so they have the format `[#113] 2024-05-13 at -8.9846, -76.7046 prediction:burned`:

```
python rslp/forest_loss_driver/scripts/peru_20260112/rename_tasks.py
```

Then, add the label layer (forest loss polygon):

```
python rslp/forest_loss_driver/scripts/peru_20260112/add_label.py
```

## Sync to Studio

Copy to GCS:

```
gsutil -m rsync -r /weka/dfive-default/rslearn-eai/datasets/forest_loss_driver/dataset_v1/peru_20260112/rslearn_dataset_for_selected_events/ gs://ai2-rslearn-projects-data/datasets/forest_loss_driver/dataset_v1/peru_20260112/rslearn_dataset_for_selected_events/
```

Then make a request to have Studio import the dataset (the project needs to be created in Studio first):

```
curl https://olmoearth.allenai.org/api/v1/datasets/ingest --request PUT --header 'Content-Type: application/json' --header "Authorization: Bearer $STUDIO_API_TOKEN" --data '{"dataset_path": "gs://ai2-rslearn-projects-data/datasets/forest_loss_driver/dataset_v1/peru_20260112/rslearn_dataset_for_selected_events/", "project_id": "60e16f40-dbe8-4932-af1b-3f762572530d", "layer_source_names": {}, "prediction_layer_names": []}'
```
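
If it is more convenient, the same ingest request can be issued from Python; this is just a translation of the curl command above and assumes `STUDIO_API_TOKEN` is set in the environment:

```
import os

import requests

response = requests.put(
    "https://olmoearth.allenai.org/api/v1/datasets/ingest",
    headers={"Authorization": f"Bearer {os.environ['STUDIO_API_TOKEN']}"},
    json={
        "dataset_path": "gs://ai2-rslearn-projects-data/datasets/forest_loss_driver/dataset_v1/peru_20260112/rslearn_dataset_for_selected_events/",
        "project_id": "60e16f40-dbe8-4932-af1b-3f762572530d",
        "layer_source_names": {},
        "prediction_layer_names": [],
    },
    timeout=60,
)
response.raise_for_status()
```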

After the project is populated, copy the annotation metadata fields from another
project (they should include a Confidence enum with High/Medium/Low and an Area
number with range 0-9999), and use `../add_area_to_studio_tasks.py` to set the
area in hectares for each polygon.
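
For reference, the per-polygon area can be computed roughly as follows. This sketch uses pyproj's geodesic area rather than the UTM-projection helpers that `add_area_to_studio_tasks.py` actually imports from rslearn, so it is illustrative rather than the script's exact code:

```
from pyproj import Geod
from shapely.geometry import shape

GEOD = Geod(ellps="WGS84")


def polygon_area_hectares(geojson_geometry: dict) -> float:
    """Geodesic area of a WGS84 GeoJSON polygon, in hectares."""
    area_m2, _perimeter = GEOD.geometry_area_perimeter(shape(geojson_geometry))
    # geometry_area_perimeter returns a signed area; take the absolute value
    # and convert square meters to hectares.
    return abs(area_m2) / 10_000.0
```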

On 2026-01-20 we sent the project to ACA and they are now looking at it. Once
annotation is completed, we will need to look into retraining the model.
74 changes: 74 additions & 0 deletions rslp/forest_loss_driver/scripts/peru_20260112/add_label.py
@@ -0,0 +1,74 @@
"""Add the label polygon since we forgot to include it initially."""

import multiprocessing
from datetime import datetime, timedelta

import tqdm
from rasterio.crs import CRS
from rslearn.dataset import Dataset
from rslearn.utils.feature import Feature
from rslearn.utils.geometry import Projection
from rslearn.utils.grid_index import GridIndex
from rslearn.utils.vector_format import GeojsonCoordinateMode, GeojsonVectorFormat
from upath import UPath

PREDICTION_FNAME = "/weka/dfive-default/rslearn-eai/datasets/forest_loss_driver/dataset_v1/peru_20260112/inference/dataset_20260109/events_from_studio_jobs.geojson"
OUTPUT_DATASET_PATH = "/weka/dfive-default/rslearn-eai/datasets/forest_loss_driver/dataset_v1/peru_20260112/rslearn_dataset_for_selected_events/"
NUM_WORKERS = 128

# Web Mercator projection that all windows are in.
PROJECTION = Projection(CRS.from_epsg(3857), 9.554628535647032, -9.554628535647032)


def reproject_feature(feat: Feature) -> Feature:
"""Helper function to re-project a feature to the WebMercator projection."""
return Feature(feat.geometry.to_projection(PROJECTION), feat.properties)


if __name__ == "__main__":
multiprocessing.set_start_method("forkserver")

# Load features (predictions) and windows.
features = GeojsonVectorFormat().decode_from_file(UPath(PREDICTION_FNAME))
dataset = Dataset(UPath(OUTPUT_DATASET_PATH))
windows = dataset.load_windows(show_progress=True, workers=128)

# We need to find the feature that corresponds to each window so we can add it as
# the label layer. So we create a grid index over the features. We use Web Mercator
# for the grid index since the index needs everything in one projection.
p = multiprocessing.Pool(NUM_WORKERS)
reprojected_features = p.imap_unordered(reproject_feature, features)
grid_index = GridIndex(size=100)
for feat in tqdm.tqdm(
reprojected_features, desc="Creating grid index", total=len(features)
):
grid_index.insert(feat.geometry.shp.bounds, feat)
p.close()

# Now iterate over windows and find the closest feature.
# We make sure that the dates line up.
for window in tqdm.tqdm(windows, desc="Adding labels"):
candidates: list[Feature] = grid_index.query(window.bounds)
best_feat = None
best_distance: int | None = None
for candidate in candidates:
candidate_point = candidate.geometry.to_projection(PROJECTION).shp.centroid
distance = window.get_geometry().shp.centroid.distance(candidate_point)
if best_distance is None or distance < best_distance:
best_feat = candidate
best_distance = distance

if best_feat is None or best_distance is None or best_distance > 10:

[Collaborator comment] Could this be too strict? I mean computing the distance between the centroids of two geometries, is it likely that the window is within a polygon but their centroid distance is more than 10?

raise ValueError(f"no spatially matching feature for window {window.name}")

feat_datetime = datetime.fromisoformat(best_feat.properties["oe_start_time"])
if abs(feat_datetime - window.time_range[0]) > timedelta(days=1):
raise ValueError(f"no tempoarlly matching feature for window {window.name}")

layer_dir = window.get_layer_dir("label")
# Reset the label so it is marked unlabeled.
best_feat.properties["new_label"] = "unlabeled"
GeojsonVectorFormat(coordinate_mode=GeojsonCoordinateMode.WGS84).encode_vector(
layer_dir, [best_feat]
)
window.mark_layer_completed("label")
17 changes: 17 additions & 0 deletions rslp/forest_loss_driver/scripts/peru_20260112/integrated_config.yaml
@@ -0,0 +1,17 @@
integrated_config:
  weka_base_dir: "/weka/dfive-default/rslearn-eai/datasets/forest_loss_driver/dataset_v1/peru_20260112/inference/"
  gcs_base_dir: "gs://ai2-rslearn-projects-data/forest_loss_driver/dataset_v1/peru_20260112/inference/"
  extract_alerts_args:
    gcs_tiff_filenames:
      - "070W_10S_060W_00N.tif"
      - "070W_20S_060W_10S.tif"
      - "080W_10S_070W_00N.tif"
      - "080W_20S_070W_10S.tif"
    out_fname: "placeholder"
    country_data_path: "/weka/dfive-default/rslearn-eai/artifacts/natural_earth_countries/20240830/ne_10m_admin_0_countries.shp"
    countries: ["PE"]
    days: 1825
    max_number_of_events: 200000
    asset_workers: 128
    make_tiles_workers: 128
    write_individual_events_workers: 128
28 changes: 28 additions & 0 deletions rslp/forest_loss_driver/scripts/peru_20260112/rename_tasks.py
@@ -0,0 +1,28 @@
"""We initially named the tasks differently so we rename it to better format."""

import random
import shutil

import tqdm
from rslearn.dataset.dataset import Dataset, Window
from upath import UPath

DATASET_PATH = "/weka/dfive-default/rslearn-eai/datasets/forest_loss_driver/dataset_v1/peru_20260112/rslearn_dataset_for_selected_events/"


if __name__ == "__main__":
ds_path = UPath(DATASET_PATH)
dataset = Dataset(ds_path)
windows = dataset.load_windows()
random.shuffle(windows)
for idx, window in enumerate(tqdm.tqdm(windows)):
src_name = window.name
_, lon_str, lat_str, predicted_category = src_name.split("_")
date_time_str = window.time_range[0].strftime("%Y-%m-%d")
dst_name = f"[#{idx+1:04d}] {date_time_str} at {float(lat_str):.04f}, {float(lon_str):.04f} prediction:{predicted_category}"
shutil.move(
Window.get_window_root(ds_path, window.group, src_name),
Window.get_window_root(ds_path, window.group, dst_name),
)
window.name = dst_name
window.save()
130 changes: 130 additions & 0 deletions rslp/forest_loss_driver/scripts/peru_20260112/select_examples_for_annotation.py
@@ -0,0 +1,130 @@
"""Select examples for this new Peru annotation.

Based on predictions in Peru over a five-year period:
- Select 100 for each of logging/burned/none/river/airstrip
- Select 500 where max(probs) < 0.5
"""

[Collaborator comment] Suggestion: add "from other categories" so that it's clear that they're targeting different categories.

import random
from datetime import datetime

from rasterio.crs import CRS
from rslearn.const import WGS84_PROJECTION
from rslearn.dataset import Dataset, Window
from rslearn.utils.feature import Feature
from rslearn.utils.geometry import Projection
from rslearn.utils.grid_index import GridIndex
from rslearn.utils.vector_format import GeojsonVectorFormat
from upath import UPath

PREDICTION_FNAME = "/weka/dfive-default/rslearn-eai/datasets/forest_loss_driver/dataset_v1/peru_20260112/inference/dataset_20260109/events_from_studio_jobs.geojson"
OUTPUT_DATASET_PATH = "/weka/dfive-default/rslearn-eai/datasets/forest_loss_driver/dataset_v1/peru_20260112/rslearn_dataset_for_selected_events/"
TARGET_GROUP = "20260112_peru"
RARE_CATEGORIES = ["logging", "burned", "none", "river", "airstrip"]
PROB_THRESHOLD = 0.5
DISTANCE_THRESHOLD = 1000 / 111111
WINDOW_SIZE = 128


if __name__ == "__main__":
# Load predictions.
predictions = GeojsonVectorFormat().decode_from_file(UPath(PREDICTION_FNAME))

# Create candidates for the different selection criteria.
by_class_options: dict[str, list[Feature]] = {
category: [] for category in RARE_CATEGORIES
}
by_prob_options: list[Feature] = []
for feat in predictions:
category = feat.properties["new_label"]
if category in RARE_CATEGORIES:
by_class_options[category].append(feat)
elif max(feat.properties["probs"]) < PROB_THRESHOLD:
by_prob_options.append(feat)

for category, candidates in by_class_options.items():
print(f"got {len(candidates)} options by class for category={category}")
print(f"got {len(by_prob_options)} options by prob")

# Select windows, we make sure their center points are at least 500 m away from
# each other.
grid_index = GridIndex(size=DISTANCE_THRESHOLD)
selected: list[Feature] = []

def contains_bbox(box: tuple[float, float, float, float]) -> bool:
"""Check whether the box intersects a point in grid_index."""
for other in grid_index.query(box):
if (
other[0] > box[0]
and other[1] > box[1]
and other[0] < box[2]
and other[1] < box[3]
):
return True
return False

def add_random_sample_of_features(features: list[Feature], max_count: int) -> int:
"""Add a random sample of windows from the list to the selected set."""
# Add up to max_count from the features list.
random.shuffle(features)
cur_selected: list[Feature] = []
for feat in features:
center_point = feat.geometry.to_projection(WGS84_PROJECTION).shp.centroid
if contains_bbox(
(
center_point.x - DISTANCE_THRESHOLD,
center_point.y - DISTANCE_THRESHOLD,
center_point.x + DISTANCE_THRESHOLD,
center_point.y + DISTANCE_THRESHOLD,
)
):
continue

cur_selected.append(feat)
grid_index.insert(
(center_point.x, center_point.y, center_point.x, center_point.y),
(center_point.x, center_point.y),
)
if len(cur_selected) >= max_count:
break

selected.extend(cur_selected)
return len(cur_selected)

for category, candidates in by_class_options.items():
count = add_random_sample_of_features(candidates, 100)
print(f"by class category={category} picked {count}/{len(candidates)} windows")
count = add_random_sample_of_features(by_prob_options, 500)
print(f"by prob picked {count}/{len(by_prob_options)} windows")
print(f"got {len(selected)} total to remap")

# Create windows in the destination dataset for these features.
dataset = Dataset(UPath(OUTPUT_DATASET_PATH))
dst_proj = Projection(CRS.from_epsg(3857), 9.554628535647032, -9.554628535647032)
random.shuffle(selected)
for idx, feat in enumerate(selected):
wgs84_geom = feat.geometry.to_projection(WGS84_PROJECTION)
lon = wgs84_geom.shp.centroid.x
lat = wgs84_geom.shp.centroid.y
predicted_category = feat.properties["new_label"]
window_name = f"[#{idx}]_{lon:.04f}_{lat:.04f}_predicted:{predicted_category}"

# Get bounds in our WebMercator projection.
dst_geom = feat.geometry.to_projection(dst_proj)
dst_bounds = (
int(dst_geom.shp.centroid.x) - WINDOW_SIZE // 2,
int(dst_geom.shp.centroid.y) - WINDOW_SIZE // 2,
int(dst_geom.shp.centroid.x) + WINDOW_SIZE // 2,
int(dst_geom.shp.centroid.y) + WINDOW_SIZE // 2,
)

ts = datetime.fromisoformat(feat.properties["oe_start_time"])
window = Window(
storage=dataset.storage,
group=TARGET_GROUP,
name=window_name,
projection=dst_proj,
bounds=dst_bounds,
time_range=(ts, ts),
)
window.save()
4 changes: 2 additions & 2 deletions rslp/sentinel2_vessel_attribute/train.py
@@ -618,9 +618,9 @@ def apply_state(
"""
for k in image_keys:
if state["horizontal"]:
-d[k] = torch.flip(d[k], dims=[-1])
+d[k].image = torch.flip(d[k].image, dims=[-1])
if state["vertical"]:
-d[k] = torch.flip(d[k], dims=[-2])
+d[k].image = torch.flip(d[k].image, dims=[-2])

if update_heading:
if self.heading_mode == HeadingMode.XY: