Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/dispatch-test-examples.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,11 @@ jobs:
python-version: ${{ matrix.python-version }}
cache: 'pip' # caching pip dependencies
- name: install cartopy system packages
run: sudo apt-get install libgeos-dev=3.10.2-1
run: sudo apt-get install libgeos-dev
- name: pip install -r requirements_python3.10.txt
run: pip install -r ./requirements_python3.10.txt
- name: pip install -e .
run: pip install -e .
- name: Pytest
run: |
pytest malpolon/tests/test_examples.py
pytest malpolon/tests/test_examples.py
2 changes: 1 addition & 1 deletion .github/workflows/dispatch-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ jobs:
python-version: ${{ matrix.python-version }}
cache: 'pip' # caching pip dependencies
- name: install cartopy system packages
run: sudo apt-get install libgeos-dev=3.10.2-1
run: sudo apt-get install libgeos-dev
- name: pip install -r requirements_python3.10.txt
run: pip install -r ./requirements_python3.10.txt
- name: pip install -e .
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ jobs:
python-version: ${{ matrix.python-version }}
cache: 'pip' # caching pip dependencies
- name: install cartopy system packages
run: sudo apt-get install libgeos-dev=3.10.2-1
run: sudo apt-get install libgeos-dev
- name: pip install -r requirements_python3.10.txt
run: pip install -r ./requirements_python3.10.txt
- name: pip install -e .
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Data
dataset/geolifeclef-2025/BioclimTimeSeries
dataset/geolifeclef-2025/EnvironmentalValues
dataset/geolifeclef-2025/SatellitePatches
dataset/geolifeclef-2025/SatelitePatches
dataset/geolifeclef-2025/SatelliteTimeSeries-Landsat
dataset/geolifeclef-2025/SateliteTimeSeries-Landsat
dataset/geolifeclef-2025/*.csv

dataset/geolifeclef-2025/stats/fps*
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
hydra:
run:
dir: outputs/${hydra.job.name}/${now:%Y-%m-%d_%H-%M-%S}

run:
predict: false
checkpoint_path: null  # e.g. "outputs/glc25_cnn_multimodal_ensemble/2025-04-08_14-52-54/last.ckpt"

data:
root: "dataset/geolifeclef-2025/"
data_paths:
train:
landsat_data_dir: "${data.root}SateliteTimeSeries-Landsat/cubes/PA-train/"
bioclim_data_dir: "${data.root}BioclimTimeSeries/cubes/PA-train/"
sentinel_data_dir: "${data.root}SatelitePatches/PA-train/"
test:
landsat_data_dir: "${data.root}SateliteTimeSeries-Landsat/cubes/PA-test/"
bioclim_data_dir: "${data.root}BioclimTimeSeries/cubes/PA-test/"
sentinel_data_dir: "${data.root}SatelitePatches/PA-test/"
metadata_paths:
train: "${data.root}GLC25_PA_metadata_train_train-0.6min.csv"
val: "${data.root}GLC25_PA_metadata_train_val-0.6min.csv"
test: "${data.root}GLC25_PA_metadata_test.csv"
num_classes: &num_classes 11255
download_data: true
train_batch_size: 64
inference_batch_size: 16
num_workers: 16

task:
task: "classification_multilabel" # ['classification_binary', 'classification_multiclass', 'classification_multilabel']

trainer:
# gpus: 1 # Deprecated since pytorchlightning 1.7, removed in 2.0. Replaced by the 2 next attributes
accelerator: "cpu"
devices: 'auto'
max_epochs: 21 # if resuming training from our pre-trained MME model, needs to be > 19
val_check_interval: 100
check_val_every_n_epoch: 1
log_every_n_steps: 100

model:
provider_name: "malpolon" # choose from ["malpolon", "timm", "torchvision"]
model_name: "glc24_multimodal_ensemble" # The GLC24 model is used for GLC25
model_kwargs:
pretrained: true # Deprecated in torchvision since 0.13 (replaced by "weights") but used by timm
modifiers:
change_last_layer:
num_outputs: *num_classes

optim:
loss_kwargs:
pos_weight: 10.0
optimizer:
adamw:
kwargs:
lr: 0.00025
scheduler:
cosine_annealing_lr:
kwargs:
T_max: 25
verbose: true
metrics:
multilabel_accuracy:
# callable: 'Fmetrics.classification.multilabel_accuracy'
kwargs:
num_labels: *num_classes
# threshold: 0.1
average: micro
multilabel_recall:
callable: 'Fmetrics.classification.multilabel_recall'
kwargs:
num_labels: *num_classes
# threshold: 0.1
average: micro
multilabel_precision:
callable: 'Fmetrics.classification.multilabel_precision'
kwargs:
num_labels: *num_classes
# threshold: 0.1
average: micro
multilabel_f1-score:
callable: 'Fmetrics.classification.multilabel_f1_score'
kwargs:
num_labels: *num_classes
# threshold: 0.1
average: micro

loggers:
exp_name: "GLC25_MME" # Name of your experiment
log_dir_name: "tensorboard_logs/" # Name of the logs directory
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
### A. Spatially splitting the dataset
To split the observation dataset into _train_ and _val_ while avoiding spatial autocorrelation, we use Malpolon's toolbox method `split_obs_spatially.py`, which is based on the library `verde`. The method takes as input an observation CSV file with **lon** and **lat** columns, and evenly splits the data into subsets with respect to a spacing radius (by default: 10/60 degrees).

The radius can be any real number, but it should be consistent with the CRS of the dataset to split. In the case of GLC25, observation coordinates are registered in WGS84 (EPSG:4326), so a spacing value of 10/60 corresponds to ~0.17 degrees, or 10 arcminutes. Over France, this corresponds to a spacing of around 17km.

In this repository, we chose to split with a spacing of 0.01 degrees, or 0.6 arcminutes which, over France, corresponds to a spacing of around 1.1km.

### B. Computing dataset moments
To compute the mean and standard deviation values of each modality of our dataset, we use the method `compute_mean_std_iteratively_from_sample.py` from Malpolon's toolbox, which approximates the real values of mean & std with an iterative computation based on a list of file paths.

1. Produce text files containing the file paths to each data element of the dataset for each modality.

In a Python terminal session:
```python
import os
import pandas as pd

def construct_patch_path(data_path, survey_id):
    """Build the path of the ``.tiff`` patch of a survey.

    The patch is located under two nested sub-directories of ``data_path``:
    first the last two characters of the survey id, then the two characters
    preceding them.

    Args:
        data_path: root directory of the patches.
        survey_id: survey identifier (converted with ``str``).

    Returns:
        The path ``<data_path>/<id[-2:]>/<id[-4:-2]>/<survey_id>.tiff``.
    """
    sid = str(survey_id)
    first_level, second_level = sid[-2:], sid[-4:-2]
    return os.path.join(data_path, first_level, second_level, f"{survey_id}.tiff")

# Metadata of the spatially split train / val observation subsets.
df_train = pd.read_csv('GLC25_PA_metadata_train_train-0.6min.csv')
df_val = pd.read_csv('GLC25_PA_metadata_train_val-0.6min.csv')

# Bioclim rasters of the train split: one line per survey cube.
fps_train_bioclim = [
    f'BioclimTimeSeries/cubes/PA-train/GLC25-PA-train-bioclimatic_monthly_{survey_id}_cube.pt'
    for survey_id in df_train['surveyId']
]
with open('fps_bioclim_train_train-0.6min.txt', 'w') as f:
    f.writelines(fp + '\n' for fp in fps_train_bioclim)

# Landsat time series of the train split: one line per survey cube.
fps_train_landsat = [
    f'SatelliteTimeSeries-Landsat/cubes/PA-train/GLC25-PA-train-landsat-time-series_{survey_id}_cube.pt'
    for survey_id in df_train['surveyId']
]
with open('fps_landsat_train_train-0.6min.txt', 'w') as f:
    f.writelines(fp + '\n' for fp in fps_train_landsat)

# Satellite patches of the val split: paths come from construct_patch_path.
fps_val_satellite = [
    construct_patch_path('SatellitePatches/PA-train/', survey_id)
    for survey_id in df_val['surveyId']
]
with open('fps_satellite_train_val-0.6min.txt', 'w') as f:
    f.writelines(fp + '\n' for fp in fps_val_satellite)
```

2. Run the moments computation script.

```bash
python ../../../../../../toolbox/compute_mean_std_iteratively_from_sample.py -p fps_bioclim_train_val-0.6min.txt -o Stats_bioclim_val.csv --type tiff --max_items 10000
```

For Satellite patches (Sentinel-2A), add the argument `--per_channel` to compute the moments for each of the 4 channels: red, green, blue, nir. The output CSV contains the values for those channels in the same order row-wise. You can verify the order of bands with the command `gdalinfo <path>/<patch_name>.tiff`.

### Glossary
- fps: filepaths
- PA: Presence Absence
- PO: Presence Only
- CRS: Coordinate Reference System
- xxx\_train\_[train,val]-\d.\dmin: spatial split, either the train or validation part, of the observation dataset, with a spatial spacing of \d.\d arcminutes (with respect to the WGS84 CRS)
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
mean,std
3932.149871972656,3490.3687862811103
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
mean,std
3914.8479827880924,3080.6445717511765
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
mean,std
3955.529410424809,3234.002077993207
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
mean,std
26.188058348891673,29.624102936518728
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
mean,std
30.654699535584506,25.70223457928363
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
mean,std
30.269297362566068,25.212980775818476
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
mean,std
517.7869262695312,530.5372924804688
565.6556396484375,497.5302734375
376.7779541015625,427.4356994628906
2289.86279296875,1510.10400390625
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
mean,std
629.6244506835938,435.9951171875
691.8153076171875,371.3965759277344
460.6056823730469,342.8971252441406
2959.370361328125,925.369140625
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
mean,std
633.1102905273438,465.04644775390625
692.7642211914062,398.9754333496094
462.1891784667969,370.75921630859375
2950.603515625,927.0215454101562
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
"""This script computes metrics off of model inference predictions.

It computes the Precision, Recall, F1-score (micro, samples and macro)
for the top-25 predictions of a model inference predictions (in a CSV);
as well as the AUC (micro, samples and macro) for all the probabilities
(not just the top-25).

Author: Theo Larcher <theo.larcher@inria.fr>
Alexis Joly <alexis.joly@inria.fr>
"""
from copy import deepcopy

import numpy as np
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
from tqdm import tqdm

# Number of species classes: valid species ids are 0 .. NUM_CLASSES - 1.
NUM_CLASSES = 11255
# Number of leading predictions per survey scored for Precision/Recall/F1.
TOP_K = 25

# 0. Load data
df_gt = pd.read_csv('predictions_and_evaluation/GLC24_SOLUTION_FILE.csv')
df_preds = pd.read_csv('predictions_and_evaluation/predictions_GLC24_SOLUTION_FILE.csv', sep=';')

# Collect every ground-truth row holding a species id outside the valid range,
# then drop them all at once (one copy of each frame instead of one per row).
# NOTE(review): the same index labels are dropped from both frames, which
# presumes df_gt and df_preds list the surveys in the same order -- confirm.
rows_out_of_range = []
for rowi, row in df_gt.iterrows():
    tsi = np.array(row['target_species_ids'].split()).astype(int)  # Split the ids string by space and convert to int
    vals = tsi[tsi >= NUM_CLASSES]
    if vals.size > 0:
        rows_out_of_range.append(rowi)
        print(f"obs {rowi} of surveyId {row['surveyId']} removed because target_species_ids value {vals} out of range")
df_gt = df_gt.drop(rows_out_of_range)
df_preds = df_preds.drop(rows_out_of_range)


# 1. Convert data to usable types and compute one-hot encodings
res = pd.DataFrame(columns=['Precision_micro', 'Recall_micro', 'F1_micro',
                            'Precision_samples', 'Recall_samples', 'F1_samples',
                            'Precision_macro', 'Recall_macro', 'F1_macro',
                            'AUC_micro', 'AUC_samples', 'AUC_macro'])

# Space-separated strings -> lists / arrays of numbers.
targets = [list(map(int, x.split())) for x in df_gt['target_species_ids']]
preds = np.array([list(map(int, x.split())) for x in df_preds['predictions']])
probas = np.array([list(map(float, x.split())) for x in df_preds['probas']])

all_targets_oh = np.zeros((len(df_gt), NUM_CLASSES))
all_probas = np.zeros_like(probas)
all_predictions_top25_oh = np.zeros((len(df_preds), NUM_CLASSES))

for k, (p, t) in tqdm(enumerate(zip(preds, targets)), total=len(targets)):
    # Reorder the probabilities by ascending predicted species id.
    # NOTE(review): presumably each `predictions` row lists every species id
    # once, so that column j ends up holding the probability of species j --
    # confirm against the predictions file.
    all_probas[k] = probas[k][np.argsort(p)]
    all_targets_oh[k, t] = 1
    all_predictions_top25_oh[k, p[:TOP_K]] = 1

# 2. Compute Precision / Recall / F1-score
print('\nComputing Precision, Recall, F1-scores...')
prfs = {}
for avg in ['micro', 'samples', 'macro']:
    prf = precision_recall_fscore_support(all_targets_oh, all_predictions_top25_oh, average=avg, zero_division=np.nan)[:3]
    prfs[f'Precision_{avg}'] = prf[0]
    prfs[f'Recall_{avg}'] = prf[1]
    prfs[f'F1_{avg}'] = prf[2]
    print(f"{avg.upper()}: Precision, Recall, F1", prf)


# 3. Compute AUCs
print('\nComputing AUCs...')
# Species (columns) whose ground truth is constant over all surveys -- never
# observed or observed everywhere -- are excluded from the AUC computation.
# Only columns are filtered; every survey (row) is kept.
never_observed = np.all(all_targets_oh == 0, axis=0)
always_observed = np.all(all_targets_oh == 1, axis=0)
kept_cols = ~(never_observed | always_observed)
filtered_targets = all_targets_oh[:, kept_cols]
filtered_probas = all_probas[:, kept_cols]

aucs = {}
for avg in ['micro', 'samples', 'macro']:
    auc = roc_auc_score(filtered_targets, filtered_probas, average=avg)
    aucs[f'AUC_{avg}'] = auc
    print(f"{avg.upper()}: AUC", auc)


# 4. Save results
res.loc[0] = prfs | aucs
res.to_csv('Inference_PRC-AUC.csv', index=False)
print('\nResults saved to Inference_PRC-AUC.csv')
Loading