plantnet · tlarcher · Apr 16, 2025 · Nov 22, 2024 · Apr 7, 2025 · Apr 7, 2025
diff --git a/.github/workflows/dispatch-test-examples.yml b/.github/workflows/dispatch-test-examples.yml
@@ -19,11 +19,11 @@ jobs:
                 python-version: ${{ matrix.python-version }}
                 cache: 'pip' # caching pip dependencies
             - name: install cartopy system packages
-              run: sudo apt-get install libgeos-dev=3.10.2-1
+              run: sudo apt-get install libgeos-dev
             - name: pip install -r requirements_python3.10.txt
               run: pip install -r ./requirements_python3.10.txt
             - name: pip install -e .
               run: pip install -e .
             - name: Pytest
               run: |
-                pytest malpolon/tests/test_examples.py
+                pytest malpolon/tests/test_examples.py
diff --git a/.github/workflows/dispatch-test.yml b/.github/workflows/dispatch-test.yml
@@ -19,7 +19,7 @@ jobs:
                 python-version: ${{ matrix.python-version }}
                 cache: 'pip' # caching pip dependencies
             - name: install cartopy system packages
-              run: sudo apt-get install libgeos-dev=3.10.2-1
+              run: sudo apt-get install libgeos-dev
             - name: pip install -r requirements_python3.10.txt
               run: pip install -r ./requirements_python3.10.txt
             - name: pip install -e .

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -27,7 +27,7 @@ jobs:
                 python-version: ${{ matrix.python-version }}
                 cache: 'pip' # caching pip dependencies
             - name: install cartopy system packages
-              run: sudo apt-get install libgeos-dev=3.10.2-1
+              run: sudo apt-get install libgeos-dev
             - name: pip install -r requirements_python3.10.txt
               run: pip install -r ./requirements_python3.10.txt
             - name: pip install -e .

diff --git a/examples/benchmarks/geolifeclef/geolifeclef2025_pre_extracted/.gitignore b/examples/benchmarks/geolifeclef/geolifeclef2025_pre_extracted/.gitignore
@@ -0,0 +1,10 @@
+# Data
+dataset/geolifeclef-2025/BioclimTimeSeries
+dataset/geolifeclef-2025/EnvironmentalValues
+dataset/geolifeclef-2025/SatellitePatches
+dataset/geolifeclef-2025/SatelitePatches
+dataset/geolifeclef-2025/SatelliteTimeSeries-Landsat
+dataset/geolifeclef-2025/SateliteTimeSeries-Landsat
+dataset/geolifeclef-2025/*.csv
+
+dataset/geolifeclef-2025/stats/fps*
diff --git a/...marks/geolifeclef/geolifeclef2025_pre_extracted/config/glc25_cnn_multimodal_ensemble.yaml b/...marks/geolifeclef/geolifeclef2025_pre_extracted/config/glc25_cnn_multimodal_ensemble.yaml
@@ -0,0 +1,91 @@
+# Hydra configuration of the GLC25 multimodal ensemble (MME) example.
+hydra:
+  run:
+    dir: outputs/${hydra.job.name}/${now:%Y-%m-%d_%H-%M-%S}
+
+run:
+  predict: false
+  checkpoint_path: null  # e.g. "outputs/glc25_cnn_multimodal_ensemble/2025-04-08_14-52-54/last.ckpt"
+
+data:
+  root: "dataset/geolifeclef-2025/"
+  data_paths:
+    train:
+      landsat_data_dir: "${data.root}SateliteTimeSeries-Landsat/cubes/PA-train/"
+      bioclim_data_dir: "${data.root}BioclimTimeSeries/cubes/PA-train/"
+      sentinel_data_dir: "${data.root}SatelitePatches/PA-train/"
+    test:
+      landsat_data_dir: "${data.root}SateliteTimeSeries-Landsat/cubes/PA-test/"
+      bioclim_data_dir: "${data.root}BioclimTimeSeries/cubes/PA-test/"
+      sentinel_data_dir: "${data.root}SatelitePatches/PA-test/"
+  metadata_paths:
+    train: "${data.root}GLC25_PA_metadata_train_train-0.6min.csv"
+    val: "${data.root}GLC25_PA_metadata_train_val-0.6min.csv"
+    test: "${data.root}GLC25_PA_metadata_test.csv"
+  # Anchored so that the model head and every metric reuse the same value
+  num_classes: &num_classes 11255
+  download_data: true
+  train_batch_size: 64
+  inference_batch_size: 16
+  num_workers: 16
+
+task:
+  task: "classification_multilabel"  # ['classification_binary', 'classification_multiclass', 'classification_multilabel']
+
+trainer:
+  # gpus: 1  # Deprecated since pytorchlightning 1.7, removed in 2.0. Replaced by the 2 next attributes
+  accelerator: "cpu"
+  devices: "auto"
+  max_epochs: 21  # if resuming training from our pre-trained MME model, needs to be > 19
+  val_check_interval: 100
+  check_val_every_n_epoch: 1
+  log_every_n_steps: 100
+
+model:
+  provider_name: "malpolon"  # choose from ["malpolon", "timm", "torchvision"]
+  model_name: "glc24_multimodal_ensemble"  # The GLC24 model is used for GLC25
+  model_kwargs:
+    pretrained: true  # Deprecated in torchvision since 0.13 (replaced by "weights") but used by timm
+  modifiers:
+    change_last_layer:
+      num_outputs: *num_classes
+
+optim:
+  loss_kwargs:
+    pos_weight: 10.0
+  optimizer:
+    adamw:
+      kwargs:
+        lr: 0.00025
+      scheduler:
+        cosine_annealing_lr:
+          kwargs:
+            T_max: 25
+            verbose: true
+  metrics:
+    multilabel_accuracy:
+      # callable: 'Fmetrics.classification.multilabel_accuracy'
+      kwargs:
+        num_labels: *num_classes
+        # threshold: 0.1
+        average: micro
+    multilabel_recall:
+      callable: 'Fmetrics.classification.multilabel_recall'
+      kwargs:
+        num_labels: *num_classes
+        # threshold: 0.1
+        average: micro
+    multilabel_precision:
+      callable: 'Fmetrics.classification.multilabel_precision'
+      kwargs:
+        num_labels: *num_classes
+        # threshold: 0.1
+        average: micro
+    multilabel_f1-score:
+      callable: 'Fmetrics.classification.multilabel_f1_score'
+      kwargs:
+        num_labels: *num_classes
+        # threshold: 0.1
+        average: micro
+
+loggers:
+  exp_name: "GLC25_MME"  # Name of your experiment
+  log_dir_name: "tensorboard_logs/"  # Name of the logs directory
diff --git a/...ks/geolifeclef/geolifeclef2025_pre_extracted/dataset/geolifeclef-2025/README.md b/...ks/geolifeclef/geolifeclef2025_pre_extracted/dataset/geolifeclef-2025/README.md
@@ -0,0 +1,60 @@
+### A. Spatially splitting the dataset
+To split the observation dataset into _train_ and _val_ while avoiding spatial autocorrelation, we use Malpolon's toolbox method `split_obs_spatially.py` based on the library `verde`. The method takes as input an observation CSV file with **lon** and **lat** columns, and evenly splits the data subsets with respect to a spacing radius (by default: 10/60 degrees).
+
+The radius can be any real value, but it should be consistent with the CRS of the dataset to split. In the case of GLC25, observation coordinates are registered in WGS84 (EPSG:4326), so a spacing value of 10/60 corresponds to ~0.17 degrees, or 10 arcminutes. Over France, this corresponds to a spacing of around 17km.
+
+In this repository, we chose to split with a spacing of 0.01 degrees, or 0.6 arcminutes which, over France, corresponds to a spacing of around 1.1km.
+
+### B. Computing dataset moments
+To compute the mean and standard deviation of each modality of our dataset, we use the method `compute_mean_std_iteratively_from_sample.py` from Malpolon's toolbox, which approximates the true mean and std with an iterative computation based on a list of file paths.
+
+1. Produce text files containing the file paths to each data element of the dataset for each modality.
+
+In a Python terminal session:
+```python
+import os
+import pandas as pd
+
+def construct_patch_path(data_path, survey_id):
+    path = data_path
+    for d in (str(survey_id)[-2:], str(survey_id)[-4:-2]):
+        path = os.path.join(path, d)
+    path = os.path.join(path, f"{survey_id}.tiff")
+    return path
+
+df_train = pd.read_csv('GLC25_PA_metadata_train_train-0.6min.csv')
+df_val = pd.read_csv('GLC25_PA_metadata_train_val-0.6min.csv')
+
+# Example for bioclim rasters
+fps_train_bioclim = list(df_train['surveyId'].apply(lambda x: f'BioclimTimeSeries/cubes/PA-train/GLC25-PA-train-bioclimatic_monthly_{x}_cube.pt').values)
+with open('fps_bioclim_train_train-0.6min.txt', 'w') as f:
+    for string in fps_train_bioclim:
+        f.write(string + '\n')
+
+# Example for landsat time series
+fps_train_landsat = list(df_train['surveyId'].apply(lambda x: f'SatelliteTimeSeries-Landsat/cubes/PA-train/GLC25-PA-train-landsat-time-series_{x}_cube.pt').values)
+with open('fps_landsat_train_train-0.6min.txt', 'w') as f:
+    for string in fps_train_landsat:
+        f.write(string + '\n')
+
+# Example for satellite patches
+fps_val_satellite = list(df_val['surveyId'].apply(lambda x: construct_patch_path('SatellitePatches/PA-train/', x)).values)
+with open('fps_satellite_train_val-0.6min.txt', 'w') as f:
+    for string in fps_val_satellite:
+        f.write(string + '\n')
+```
+
+2. Run the moments computation script.
+
+```bash
+python ../../../../../../toolbox/compute_mean_std_iteratively_from_sample.py -p fps_bioclim_train_val-0.6min.txt -o Stats_bioclim_val.csv --type tiff --max_items 10000
+```
+
+For Satellite patches (Sentinel-2A), add the argument `--per_channel` to compute the moments for each of the 4 channels: red, green, blue, nir. The output CSV contains the values for those channels in the same order row-wise. You can verify the order of bands with the command `gdalinfo <path>/<patch_name>.tiff`.
+
+### Glossary
+- fps: filepaths
+- PA: Presence Absence
+- PO: Presence Only
+- CRS: Coordinate Reference System
+- xxx\_train\_[train,val]-\d.\dmin: spatial split, either the train or validation part, of the observation dataset, with a spatial spacing of \d.\d arcminutes (with respect to the WGS84 CRS)
diff --git a/...e_extracted/dataset/geolifeclef-2025/Stats/Satellite_min-max_values_linear_approx-100.npy b/...e_extracted/dataset/geolifeclef-2025/Stats/Satellite_min-max_values_linear_approx-100.npy
diff --git a/...25_pre_extracted/dataset/geolifeclef-2025/Stats/Satellite_quantiles_linear_approx-100.npy b/...25_pre_extracted/dataset/geolifeclef-2025/Stats/Satellite_quantiles_linear_approx-100.npy
diff --git a/...eclef/geolifeclef2025_pre_extracted/dataset/geolifeclef-2025/Stats/Stats_bioclim_test.csv b/...eclef/geolifeclef2025_pre_extracted/dataset/geolifeclef-2025/Stats/Stats_bioclim_test.csv
@@ -0,0 +1,2 @@
+mean,std
+3932.149871972656,3490.3687862811103
diff --git a/...clef/geolifeclef2025_pre_extracted/dataset/geolifeclef-2025/Stats/Stats_bioclim_train.csv b/...clef/geolifeclef2025_pre_extracted/dataset/geolifeclef-2025/Stats/Stats_bioclim_train.csv
@@ -0,0 +1,2 @@
+mean,std
+3914.8479827880924,3080.6445717511765
diff --git a/...feclef/geolifeclef2025_pre_extracted/dataset/geolifeclef-2025/Stats/Stats_bioclim_val.csv b/...feclef/geolifeclef2025_pre_extracted/dataset/geolifeclef-2025/Stats/Stats_bioclim_val.csv
@@ -0,0 +1,2 @@
+mean,std
+3955.529410424809,3234.002077993207
diff --git a/...eclef/geolifeclef2025_pre_extracted/dataset/geolifeclef-2025/Stats/Stats_landsat_test.csv b/...eclef/geolifeclef2025_pre_extracted/dataset/geolifeclef-2025/Stats/Stats_landsat_test.csv
@@ -0,0 +1,2 @@
+mean,std
+26.188058348891673,29.624102936518728
diff --git a/...clef/geolifeclef2025_pre_extracted/dataset/geolifeclef-2025/Stats/Stats_landsat_train.csv b/...clef/geolifeclef2025_pre_extracted/dataset/geolifeclef-2025/Stats/Stats_landsat_train.csv
@@ -0,0 +1,2 @@
+mean,std
+30.654699535584506,25.70223457928363
diff --git a/...feclef/geolifeclef2025_pre_extracted/dataset/geolifeclef-2025/Stats/Stats_landsat_val.csv b/...feclef/geolifeclef2025_pre_extracted/dataset/geolifeclef-2025/Stats/Stats_landsat_val.csv
@@ -0,0 +1,2 @@
+mean,std
+30.269297362566068,25.212980775818476
diff --git a/...lef/geolifeclef2025_pre_extracted/dataset/geolifeclef-2025/Stats/Stats_satellite_test.csv b/...lef/geolifeclef2025_pre_extracted/dataset/geolifeclef-2025/Stats/Stats_satellite_test.csv
@@ -0,0 +1,5 @@
+mean,std
+517.7869262695312,530.5372924804688
+565.6556396484375,497.5302734375
+376.7779541015625,427.4356994628906
+2289.86279296875,1510.10400390625
diff --git a/...ef/geolifeclef2025_pre_extracted/dataset/geolifeclef-2025/Stats/Stats_satellite_train.csv b/...ef/geolifeclef2025_pre_extracted/dataset/geolifeclef-2025/Stats/Stats_satellite_train.csv
@@ -0,0 +1,5 @@
+mean,std
+629.6244506835938,435.9951171875
+691.8153076171875,371.3965759277344
+460.6056823730469,342.8971252441406
+2959.370361328125,925.369140625
diff --git a/...clef/geolifeclef2025_pre_extracted/dataset/geolifeclef-2025/Stats/Stats_satellite_val.csv b/...clef/geolifeclef2025_pre_extracted/dataset/geolifeclef-2025/Stats/Stats_satellite_val.csv
@@ -0,0 +1,5 @@
+mean,std
+633.1102905273438,465.04644775390625
+692.7642211914062,398.9754333496094
+462.1891784667969,370.75921630859375
+2950.603515625,927.0215454101562
diff --git a/examples/benchmarks/geolifeclef/geolifeclef2025_pre_extracted/evaluate_inference_MME.py b/examples/benchmarks/geolifeclef/geolifeclef2025_pre_extracted/evaluate_inference_MME.py
@@ -0,0 +1,91 @@
+"""This script computes metrics off of model inference predictions.
+
+It computes the Precision, Recall, F1-score (micro, samples and macro)
+for the top-25 predictions of a model inference predictions (in a CSV);
+as well as the AUC (micro, samples and macro) for all the probabilities
+(not just the top-25).
+
+Author: Theo Larcher <theo.larcher@inria.fr>
+        Alexis Joly <alexis.joly@inria.fr>
+"""
+from copy import deepcopy
+
+import numpy as np
+import pandas as pd
+from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
+from tqdm import tqdm
+
+# 0. Load data
+df_gt = pd.read_csv('predictions_and_evaluation/GLC24_SOLUTION_FILE.csv')
+df_preds = pd.read_csv('predictions_and_evaluation/predictions_GLC24_SOLUTION_FILE.csv', sep=';')
+for rowi, row in deepcopy(df_gt).iterrows():
+    tsi = np.array(row['target_species_ids'].split()).astype(int)  # Split the predictions string by space and convert to int
+    inds = np.where(tsi > 11254)[0]
+    vals = tsi[inds]
+    if inds.size > 0:
+        df_gt = df_gt.drop(rowi)
+        df_preds = df_preds.drop(rowi)
+        print(f"obs {rowi} of surveyId {row['surveyId']} removed because target_species_ids value {vals} out of range")
+
+
+# 1. Convert data to usable types and compute one-hot encodings
+res = pd.DataFrame(columns=['Precision_micro', 'Recall_micro', 'F1_micro',
+                            'Precision_samples', 'Recall_samples', 'F1_samples',
+                            'Precision_macro', 'Recall_macro', 'F1_macro',
+                            'AUC_micro', 'AUC_samples', 'AUC_macro'])
+obs_id = df_gt['surveyId']
+
+targets = df_gt['target_species_ids']
+targets = [list(map(int, x.split())) for x in targets]
+
+preds = df_preds['predictions']
+preds = np.array([list(map(int, x.split())) for x in preds])
+
+probas = df_preds['probas']
+probas = np.array([list(map(float, x.split())) for x in probas])
+
+all_targets_oh = np.zeros((len(df_gt), 11255))
+all_probas = np.zeros_like(probas)
+all_predictions_top25_oh = np.zeros((len(df_preds), 11255))
+
+for k, (p, t) in tqdm(enumerate(zip(preds, targets)), total=len(targets)):
+    all_probas[k] = probas[k][np.argsort(p)]
+    for t2 in t:
+        all_targets_oh[k, t2] = 1
+    for p2 in p[:25]:
+        all_predictions_top25_oh[k, p2] = 1
+
+# 2. Compute Precision / Recall / F1-score
+print('\nComputing Precision, Recall, F1-scores...')
+prfs = {}
+for avg in ['micro', 'samples', 'macro']:
+    prf = precision_recall_fscore_support(all_targets_oh, all_predictions_top25_oh, average=avg, zero_division=np.nan)[:3]
+    prfs[f'Precision_{avg}'] = prf[0]
+    prfs[f'Recall_{avg}'] = prf[1]
+    prfs[f'F1_{avg}'] = prf[2]
+    print(f"{avg.upper()}: Precision, Recall, F1", prf)
+
+
+# 3. Compute AUCs
+print('\nComputing AUCs...')
+# Find rows and columns with all zeros in both arrays, that is to say
+# species that are never observed in any plot according to the ground truth
+zero_cols_targets = np.all(all_targets_oh == 0, axis=0)
+ones_cols_targets = np.all(all_targets_oh == 1, axis=0)
+zero_cols = zero_cols_targets | ones_cols_targets
+# Filter out rows and columns containing only zeros
+filtered_targets = all_targets_oh[:][:, ~zero_cols]
+filtered_probas = all_probas[:][:, ~zero_cols]
+filtered_predictions_top25 = all_predictions_top25_oh[:][:, ~zero_cols]
+
+aucs = {}
+for avg in ['micro', 'samples', 'macro']:
+    auc = roc_auc_score(filtered_targets, filtered_probas, average=avg)
+    aucs[f'AUC_{avg}'] = auc
+    print(f"{avg.upper()}: AUC", auc)
+
+
+# 4. Save results
+res.loc[0] = prfs | aucs
+res.to_csv('Inference_PRC-AUC.csv', index=False)
+print('\nResults saved to Inference_PRC-AUC.csv')
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		mean,std
		3914.8479827880924,3080.6445717511765
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		mean,std
		26.188058348891673,29.624102936518728
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		mean,std
		30.269297362566068,25.212980775818476