Using cloud-native tools to map invasive species with supervised machine learning and Sentinel 2¶
Overview¶
Aims¶
This notebook demonstrates a complete end-to-end workflow for mapping invasive species across large geographic areas using cloud-native geospatial tools and supervised machine learning. The specific objectives are:
- Learn how to discover and access satellite imagery from cloud-based data repositories using STAC catalogs
- Extract spectral information from satellite data at validated field locations
- Train a machine learning classifier (XGBoost) to distinguish invasive species from other land cover types
- Deploy the trained model efficiently across entire satellite scenes using parallel processing
- Generate spatially-explicit predictions of invasive species distribution for conservation planning and monitoring
Structure¶
The notebook is organized into a data pipeline with three main phases:
- Data Preparation (Sections 2-5): Load validation data, search for and retrieve Sentinel-2 imagery from AWS, and extract spectral features at known locations
- Model Development (Section 6): Train and evaluate an XGBoost classifier on held-out test data
- Deployment & Export (Section 7): Apply the trained model across the entire study area using parallel processing and export results as a cloud-optimized GeoTIFF
The example focuses on mapping invasive Pine trees in the Greater Cape Town water fund area, demonstrating how these techniques can support conservation monitoring and resource management decisions.
1. Load Python packages¶
#core python geospatial packages
import xarray as xr
import numpy as np
import rioxarray as riox
import geopandas as gpd
import xvec
from shapely.geometry import box, mapping
import os
os.environ["AWS_NO_SIGN_REQUEST"] = "YES"
#data search
import stackstac
import pystac_client
#plotting
import hvplot.xarray
import holoviews as hv
#interactive plots
hvplot.extension('bokeh')
import geoviews
#static plots
#hvplot.extension('matplotlib')
#import matplotlib.pyplot as plt
#%matplotlib inline
#ml
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, ConfusionMatrixDisplay
#other
from dask.diagnostics import ProgressBar
import warnings # don't print warnings
warnings.filterwarnings('ignore')
2. Load invasive plant data¶
First we load our land cover and invasive plant location data. We created this dataset using field data and/or manually inspected high-resolution imagery.
We will use the Python package geopandas to handle spatial vector data. GeoPandas is a Python library designed to handle and analyze geospatial data, similar to how ArcGIS or the sf package in R work. It extends the popular pandas library to support spatial data, allowing you to work with vector data formats like shapefiles, GeoJSON, GeoPackage and more. GeoPandas integrates well with other Python libraries and lets you perform spatial operations like overlays, joins, and buffering in a way that's familiar if you're used to sf or ArcGIS workflows.
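As a quick illustration of the GeoDataFrame workflow (using made-up points, not the notebook's data), we can build a GeoDataFrame from scratch and apply a spatial operation like buffering:

```python
import geopandas as gpd
from shapely.geometry import Point

# Minimal sketch: a GeoDataFrame built by hand with two toy points
toy = gpd.GeoDataFrame(
    {"name": ["a", "b"]},
    geometry=[Point(0, 0), Point(10, 10)],
    crs="EPSG:32734",  # a projected CRS, so buffer distances are in metres
)
buffered = toy.buffer(100)  # 100 m buffer around each point
print(buffered.area.round(0))  # each area is approximately pi * 100**2
```

Because the CRS is projected, distances and areas are in metres, which is why we reproject our real data to EPSG:32734 below.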
#read data in geopackage with geopandas
# we will read directly from s3
gdf = gpd.read_file('s3://ocs-training-2026/advanced/gctwf_invasive.gpkg')
gdf = gdf.to_crs("EPSG:32734")
bbox = gdf.total_bounds
gdf
| | class | group | name | geometry |
|---|---|---|---|---|
| 0 | 0 | 2 | Bare Ground | POINT (330491.921 6236818.616) |
| 1 | 0 | 2 | Bare Ground | POINT (328453.584 6236416.828) |
| 2 | 0 | 2 | Bare Ground | POINT (329668.747 6237445.797) |
| 3 | 0 | 2 | Bare Ground | POINT (328198.792 6236338.431) |
| 4 | 0 | 2 | Bare Ground | POINT (329146.913 6237124.856) |
| ... | ... | ... | ... | ... |
| 1568 | 9 | 2 | Water | POINT (354133.714 6264141.153) |
| 1569 | 9 | 2 | Water | POINT (354091.305 6265816.336) |
| 1570 | 9 | 1 | Water | POINT (354119.578 6260811.992) |
| 1571 | 9 | 0 | Water | POINT (354161.987 6264741.957) |
| 1572 | 9 | 2 | Water | POINT (353992.349 6262070.146) |
1573 rows × 4 columns
Plot the data¶
#interactive plot
gdf[['name','geometry']].explore('name',tiles='https://mt1.google.com/vt/lyrs=s&x={x}&y={y}&z={z}', attr='Google')
#We will need this later to turn codes back to names
# Get unique values
name_table = gdf[['class', 'name']].drop_duplicates().sort_values(by=['class'])
3. Search for remote sensing data using STAC¶
STAC (SpatioTemporal Asset Catalog) is a standardized specification for describing geospatial datasets and their metadata, making it easier to discover, access, and share spatiotemporal data like satellite imagery, aerial photos, and elevation models. STAC catalogs and APIs provide structured, searchable metadata that allow users to query datasets based on criteria like geographic location, time range, resolution, or cloud coverage.
On AWS, STAC datasets can be found in the Registry of Open Data, which hosts numerous public geospatial datasets in STAC format. Examples include satellite imagery from Landsat, Sentinel-2, and MODIS. You can use tools like the pystac-client Python library or STAC browser interfaces to explore and retrieve data directly from AWS S3 buckets.
Once we have used STAC to filter the data we want, we get URLs to the location of that data on AWS S3 object storage. If we have our own data directly stored on S3, we can skip this part and just use that URL directly, or we can create our own STAC catalog if we have a large collection of data.
#the location of the catalog we want to search (find this on AWS Registry of Open Data)
URL = "https://earth-search.aws.element84.com/v1"
catalog = pystac_client.Client.open(URL)
#we want data that intersect with this location
lat, lon = -33.80, 19.20
#in this time
datefilter = "2018-09-01/2018-09-30"
#search!
items = catalog.search(
intersects=dict(type="Point", coordinates=[lon, lat]),
collections=["sentinel-2-l2a"],
datetime=datefilter,
query={"eo:cloud_cover": {"lt": 10}} # Filter for cloud cover less than 10%
).item_collection()
#how many matches?
len(items)
1
Let's print some info about this Sentinel-2 scene.
items[0]
4. Load our data¶
Now that we have the data we want, we can load it into an xarray. We could pass the S3 URL of the data directly to xarray, but the package stackstac does a bunch of additional data handling to return a neat result with lots of helpful metadata. We can, for example, give stackstac a list of multiple Sentinel-2 scenes and it will align and stack them along a time dimension.
stack = stackstac.stack(
items[0],
bounds=(bbox[0], bbox[1], bbox[2], bbox[3]),
rescale = False,
fill_value = np.int16(-9999),
dtype = np.int16,
epsg=32734)
#stackstac creates a time dim, but we only have 1 date so we drop this
stack = stack.squeeze()
#select only the bands we need
stack = stack.sel(band=['blue', 'coastal', 'green', 'nir', 'nir08', 'nir09', 'red', 'rededge1', 'rededge2', 'rededge3', 'swir16', 'swir22'])
#combine bands into one chunk
stack = stack.chunk({'band':-1})
#look at the xarray
stack
<xarray.DataArray 'stackstac-517c8c49216796f44cbdc961a21c9dfb' (band: 12,
y: 2953, x: 4784)> Size: 339MB
dask.array<rechunk-merge, shape=(12, 2953, 4784), dtype=int16, chunksize=(12, 1024, 1024), chunktype=numpy.ndarray>
Coordinates: (12/52)
time datetime64[ns] 8B 2018-09-23T08:...
id <U24 96B 'S2A_34HCH_20180923_0_L2A'
* band (band) <U12 576B 'blue' ... 'swi...
* x (x) float64 38kB 3.136e+05 ... 3...
* y (y) float64 24kB 6.267e+06 ... 6...
s2:unclassified_percentage float64 8B 2.227
... ...
title (band) <U31 1kB dask.array<chunksize=(12,), meta=np.ndarray>
gsd (band) object 96B dask.array<chunksize=(12,), meta=np.ndarray>
common_name (band) object 96B dask.array<chunksize=(12,), meta=np.ndarray>
center_wavelength (band) object 96B dask.array<chunksize=(12,), meta=np.ndarray>
full_width_half_max (band) object 96B dask.array<chunksize=(12,), meta=np.ndarray>
epsg int64 8B 32734
Attributes:
spec: RasterSpec(epsg=32734, bounds=(313578.3159344659, 6236098...
crs: epsg:32734
transform: | 10.31, 0.00, 313578.32|\n| 0.00,-10.31, 6266531.03|\n| ...
    resolution_xy: (10.30626161619884, 10.305641846406953)
Let's make a plot to see what it looks like in true color. We will use the package hvplot, which makes it very easy to create interactive plots from xarrays.
Note: this will take some time on mybinder. So perhaps wait until we have completed the lesson
rgb = stack.sel(band=['red','green','blue']).astype(float) / 10000
rgb = rgb.clip(0, 1)
#rgb.hvplot.rgb(
# x='x', y='y', bands='band',rasterize=True,data_aspect=1,title="True colour")
#free up memory
del rgb
Shadow Masking¶
In the image above, some areas are shadowed by mountains. We are unlikely to be able to predict land cover in these areas, so let's mask them out. We will use a simple rule: if reflectance in both the red and near-infrared bands is below a threshold, drop the pixel.
#select red and nir bands
red = stack.sel(band='red')
nir = stack.sel(band='nir')
# Set the reflectance threshold
threshold = 500
# Create a shadow mask: identify pixels that are dark in both red and NIR
shadow_condition = (red < threshold) & (nir < threshold)
# Set shadowed pixels to nodata (use fill_value to preserve int16 dtype)
stack = stack.where(~shadow_condition, other=np.int16(-9999))
5. Extract Sentinel-2 data at point locations¶
Now we will extract the reflectance data at the locations where we have validated land cover. The package xvec makes this easy. It returns the result as an xarray.
# Extract points
point = stack.xvec.extract_points(gdf['geometry'], x_coords="x", y_coords="y",index=True)
point = point.swap_dims({'geometry': 'index'}).to_dataset(name='s2')
#lets actually run this and get the result
with ProgressBar():
point = point.compute()
[########################################] | 100% Completed | 8.51 s
#drop points that contain nodata
condition = (point.s2 != -9999).all(dim='band') & (point.s2.notnull().all(dim='band'))
# Apply the mask to keep only the valid indices
point = point.where(condition, drop=True)
#add label from geopandas
gxr = gdf[['class','group']].to_xarray()
point = point.merge(gxr.astype(int), join='left')
point
<xarray.Dataset> Size: 126kB
Dimensions: (band: 12, index: 1517)
Coordinates: (12/52)
* band (band) <U12 576B 'blue' ... 'swi...
* index (index) int64 12kB 0 1 ... 1572
time datetime64[ns] 8B 2018-09-23T08:...
id <U24 96B 'S2A_34HCH_20180923_0_L2A'
s2:unclassified_percentage float64 8B 2.227
s2:granule_id <U62 248B 'S2A_OPER_MSI_L2A_TL_S...
... ...
gsd (band) object 96B 10 60 ... 20 20
common_name (band) object 96B 'blue' ... 'sw...
center_wavelength (band) object 96B 0.49 ... 2.19
full_width_half_max (band) object 96B 0.098 ... 0.242
epsg int64 8B 32734
geometry (index) object 12kB POINT (33049...
Data variables:
s2 (band, index) float32 73kB 1.014...
class (index) int64 12kB 0 0 0 ... 9 9 9
    group (index) int64 12kB 2 2 2 ... 1 0 2
Let's select a single point and visualize the data we will be using to train our model.
pointp = point.isel(index=0)
pointp['center_wavelength'] = pointp['center_wavelength'].astype(float)
pointp['s2'].hvplot.scatter(x='center_wavelength',by='index',
color='green',ylim=(0,3000),alpha=0.5,legend=False,title = "Single point data")
Finally, our model will be trained on data covering natural vegetation in a specific area. It is important that we only predict in the areas that match our training data. We will therefore mask out non-natural vegetation using a polygon.
geodf = gpd.read_file('s3://ocs-training-2026/advanced/aoi.gpkg').to_crs("EPSG:32734")
geoms = geodf.geometry.apply(mapping)
6. Train ML model¶
We will be using a model called xgboost. There are many, many different kinds of ML models. xgboost is a class of models called gradient boosted trees, related to random forests. When used for classification, random forests work by creating multiple decision trees, each trained on a random subset of the data and features, and then averaging their predictions to improve accuracy and reduce overfitting. Gradient boosted trees differ in that they build trees sequentially, with each new tree focusing on correcting the errors of the previous ones. This sequential approach allows xgboost to create highly accurate models by iteratively refining predictions and addressing the weaknesses of earlier trees.
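To make the sequential error-correction idea concrete, here is a minimal hand-rolled boosting loop on synthetic data (illustrative only; XGBoost adds regularization, second-order gradients, and many other refinements):

```python
import numpy as np
from sklearn.tree import DecisionTreeRegressor

rng = np.random.default_rng(0)
X = rng.uniform(0, 10, size=(200, 1))
y = np.sin(X[:, 0]) + rng.normal(0, 0.1, size=200)

learning_rate = 0.3
pred = np.full_like(y, y.mean())  # start from a constant prediction
errors = []
for _ in range(20):
    residual = y - pred  # the errors of the current ensemble
    tree = DecisionTreeRegressor(max_depth=2).fit(X, residual)
    pred += learning_rate * tree.predict(X)  # each new tree corrects the last
    errors.append(np.mean((y - pred) ** 2))

print(f"MSE after 1 tree: {errors[0]:.4f}, after 20 trees: {errors[-1]:.4f}")
```

Each tree is fit to the residuals of the ensemble so far, so the mean squared error shrinks as trees are added; this is the core mechanism behind gradient boosting.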
Our dataset has a label indicating which set (training or test) our data belong to. We will use this to split it.
#split into train and test
dtrain = point.where(point['group']==1,drop=True)
dtest = point.where(point['group']==2,drop=True)
#create separate datasets for labels and features
y_train = dtrain['class'].values.astype(int)
y_test = dtest['class'].values.astype(int)
X_train = dtrain['s2'].values.T
X_test = dtest['s2'].values.T
#free up memory
del point
del dtrain
del dtest
del geodf
del gdf
To train the model, we:
- Create an XGBoost classifier object using the XGBClassifier class from the XGBoost library, specifying a set of reasonable hyperparameters.
- Fit the model to our training data (X_train and y_train).
The model learns to associate spectral signatures with land cover classes. Once training is complete, it is ready to be evaluated on the held-out test set and deployed across the full study area.
This will take a few seconds.
# Create and train the XGBoost model
model = xgb.XGBClassifier(
max_depth=7,
learning_rate=0.1,
subsample=0.8,
n_estimators=100,
tree_method='hist'
)
# Fit the model to the training data
model.fit(X_train, y_train)
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.1, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=7, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=100, n_jobs=None,
num_parallel_tree=None, objective='multi:softprob', ...)
Model Evaluation¶
We will use our trained model to predict the classes of the test data and calculate accuracy.
Precision and Recall¶
Next, we assess how well the model performs for predicting Pine trees by calculating its precision and recall:
Precision measures the accuracy of the positive predictions
- "Of all the instances we labeled as Pines, how many were actually Pines?"
- Also known as User's Accuracy
Recall measures the model's ability to identify all actual positive instances
- "Of all the actual Pines, how many did we correctly identify?"
- Also known as Producer's Accuracy
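These two definitions can be computed directly from counts of true positives, false positives, and false negatives. The numbers below are hypothetical, chosen only to illustrate the formulas (they are not the notebook's actual results):

```python
# Hypothetical counts for the Pine class
tp, fp, fn = 30, 15, 8  # true positives, false positives, false negatives

precision = tp / (tp + fp)  # of everything predicted Pine, the fraction truly Pine
recall = tp / (tp + fn)     # of all actual Pines, the fraction we found

print(f"precision={precision:.3f}, recall={recall:.3f}")
```

Note the trade-off: a model that labels almost everything as Pine will have high recall but low precision, and vice versa.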
Confusion Matrix¶
Finally, we create and display a confusion matrix to visualize the model's prediction accuracy across all classes.
y_pred = model.predict(X_test)
# Step 2: Calculate acc and F1 score for the entire dataset
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc}")
# Step 3: Calculate precision and recall for Pine
precision_pine = precision_score(y_test, y_pred, labels=[2], average='macro', zero_division=0)
recall_pine = recall_score(y_test, y_pred, labels=[2], average='macro', zero_division=0)
print(f"Precision for Pines: {precision_pine}")
print(f"Recall for Pines: {recall_pine}")
# Step 4: Plot the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred,normalize='pred')
ConfusionMatrixDisplay(confusion_matrix=conf_matrix,display_labels=list(name_table['name'])).plot()
Accuracy: 0.5754422476586889
Precision for Pines: 0.6730769230769231
Recall for Pines: 0.7894736842105263
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7fba2f75b1a0>
7. Predict over entire study area¶
We now have a trained model and are ready to deploy it across an entire Sentinel-2 scene to map the distribution of invasive plants. This involves a large volume of data, so rather than processing it all at once, we will apply the model's .predict() method in parallel across manageable chunks of the Sentinel-2 xarray.
Parallel vs. sequential processing¶
In serial processing, tasks execute one after another, so total runtime is the sum of all steps. In parallel processing, the work is split into independent chunks that run simultaneously across multiple workers, with results combined at the end. This can dramatically reduce processing time — a 3-hour serial job might take roughly 1 hour across 3 workers. Geospatial workflows are naturally suited to this approach because operations on individual tiles or regions are typically independent of one another.
How parallel processing works with Dask¶
When we loaded our Sentinel-2 data earlier, xarray automatically divided it into smaller pieces called chunks — think of them as tiles in a mosaic. Dask, the parallel computing library working behind the scenes, uses these chunks to orchestrate efficient computation.
Rather than computing results immediately, Dask builds a task graph — a recipe mapping out which operations to perform on which chunks. When we define operations (like applying our model), Dask records the instructions without executing them. Only when we call .compute() does Dask execute the graph, processing multiple chunks simultaneously across available CPU cores.
This has two key advantages: (1) we can work with datasets larger than memory because only the active chunks need to be loaded at any given time, and (2) operations run faster because chunks are processed concurrently rather than one at a time.
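The lazy-evaluation behaviour is easy to see with a toy dask array (a sketch independent of our satellite data):

```python
import dask.array as da

# A 2000x2000 array split into four 1000x1000 chunks
arr = da.ones((2000, 2000), chunks=(1000, 1000))

# Defining an operation only records it in the task graph; nothing runs yet
result = (arr * 2).mean()
print(type(result))  # still a lazy dask object

# compute() executes the graph, processing chunks in parallel
value = result.compute()
print(value)
```

Until `.compute()` is called, `result` holds only the recipe for the calculation, which is why the Sentinel-2 operations above return instantly even on gigabytes of data.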
Monitoring progress with the Dask Dashboard: You can track the progress of Dask operations in real time using the Dask Dashboard. In Jupyter environments like SageMaker Studio Lab, look for a dashboard link in the output. It displays task execution, memory usage, and worker activity — useful for spotting bottlenecks and confirming your computation is progressing as expected.
Learning more about Dask: To go deeper, see the Dask documentation for tutorials on task graphs, distributed computing, and optimization. The Dask tutorial is also an excellent interactive starting point.
Here is the function that we will actually apply to each chunk. Simple really. The hard work is getting the data into and out of this function.
def predict_on_chunk(chunk, model):
probabilities = model.predict_proba(chunk)
return probabilities
Now we define the function that takes as input the Sentinel-2 xarray and passes it to the predict function. This is composed of three parts:
Part 1: Applies all the transformations that need to be done before the data goes to the model. It sets a condition to identify valid data points where reflectance values are greater than zero and stacks the spatial dimensions (x and y) into a single dimension.
Part 2: Applies the machine learning model to the data in parallel, predicting class probabilities for each data point using xarray's apply_ufunc method. Most of the function involves defining what to do with the dimensions of the old dataset and the new output.
Part 3: Unstacks the data to restore its original dimensions, sets spatial dimensions and coordinate reference system (CRS), clips the data, and transposes the data to match expected formats before returning the results.
#crop to small area
minx, miny, maxx, maxy = 328205.96, 6236363.01, 341315.31, 6248964.32724
stack = stack.sel(x=slice(minx, maxx), y=slice(maxy, miny))
def predict_xr(ds,geometries):
#part 1 - data prep
#condition to use for masking no data later
condition = (ds > 0).any(dim='band')
#stack the data into a single dimension. This will be important for applying the model later
ds = ds.stack(sample=('x','y'))
#part 2 - apply the model over chunks
result = xr.apply_ufunc(
predict_on_chunk,
ds,
input_core_dims=[['band']],#input dim with features
output_core_dims=[['class']], # name for the new output dim
exclude_dims=set(('band',)), #dims to drop in result
output_sizes={'class': 10}, #length of the new dimension
output_dtypes=[np.float32],
dask="parallelized",
kwargs={'model': model}
)
#part 3 - post-processing
result = result.unstack('sample') #remove the stack
result = result.rio.set_spatial_dims(x_dim='x',y_dim='y') #set the spatial dims
result = result.rio.write_crs("EPSG:32734") #set the CRS
result = result.rio.clip(geometries).where(condition) #clip to the protected areas and no data
result = result.transpose('class', 'y', 'x') #transpose the data - rio expects it this way
return result.compute()
Now we can actually run this. It should take about 30–60 seconds (to go through a 10 GB Sentinel-2 scene!).
with ProgressBar():
predicted = predict_xr(stack,geoms)
[########################################] | 100% Completed | 7.93 s
predicted
<xarray.DataArray 'stackstac-517c8c49216796f44cbdc961a21c9dfb' (class: 10,
y: 1223, x: 1272)> Size: 62MB
array([[[0.6283501 , 0.06953339, 0.07159919, ..., 0.00232456,
0.001604 , 0.00595605],
[0.84500855, 0.0792312 , 0.43602976, ..., 0.21599539,
0.00543848, 0.00553337],
[0.4364925 , 0.03451966, 0.04002046, ..., 0.00353127,
0.00574716, 0.04065739],
...,
[0.9310988 , 0.92980164, 0.93017477, ..., nan,
nan, nan],
[0.9290078 , 0.93440473, 0.93440473, ..., nan,
nan, nan],
[0.9286357 , 0.93478155, 0.93440473, ..., nan,
nan, nan]],
[[0.01203063, 0.02595829, 0.02778718, ..., 0.00491252,
0.00575299, 0.10540175],
[0.00717155, 0.02561428, 0.01535589, ..., 0.00684872,
0.03172698, 0.03228056],
[0.00747023, 0.02699131, 0.03054993, ..., 0.00926773,
0.01287293, 0.05682084],
...
[0.00617054, 0.00644014, 0.00644273, ..., nan,
nan, nan],
[0.00935319, 0.00940753, 0.00940753, ..., nan,
nan, nan],
[0.00934944, 0.00941132, 0.00940753, ..., nan,
nan, nan]],
[[0.0045573 , 0.0098332 , 0.010526 , ..., 0.00336898,
0.00231379, 0.00859167],
[0.00271664, 0.00970289, 0.00581693, ..., 0.00435008,
0.00690316, 0.00702361],
[0.00422494, 0.01022452, 0.01091577, ..., 0.0042513 ,
0.00694853, 0.02564234],
...,
[0.00137509, 0.00137318, 0.00137373, ..., nan,
nan, nan],
[0.00137201, 0.00137998, 0.00137998, ..., nan,
nan, nan],
[0.00137146, 0.00138053, 0.00137998, ..., nan,
nan, nan]]], dtype=float32)
Coordinates: (12/46)
* x (x) float64 10kB 3.282e+05 ... 3...
* y (y) float64 10kB 6.249e+06 ... 6...
time datetime64[ns] 8B 2018-09-23T08:...
id <U24 96B 'S2A_34HCH_20180923_0_L2A'
s2:unclassified_percentage float64 8B 2.227
s2:granule_id <U62 248B 'S2A_OPER_MSI_L2A_TL_S...
... ...
s2:degraded_msi_data_percentage int64 8B 0
s2:sequence <U1 4B '0'
view:sun_azimuth float64 8B 41.53
mgrs:utm_zone int64 8B 34
epsg int64 8B 32734
spatial_ref int64 8B 0
Dimensions without coordinates: class
Now we can view our result. We will plot the probability that a pixel is covered in invasive Pine trees.
#reproject
predicted = predicted.rio.reproject("EPSG:4326",nodata=np.nan)
#select only pines
predicted_plot = predicted.isel({'class':2})
#set low probability to NaN
predicted_plot = predicted_plot.where(predicted_plot > 0.5, np.nan)
#plot with a satellite basemap
predicted_plot.hvplot(tiles=hv.element.tiles.EsriImagery(),
project=True,clim=(0,1),
cmap='magma',frame_width=800,data_aspect=1,alpha=0.7,title='Pine probability')
Lastly, we export to a GeoTIFF using rioxarray. We can then explore the map in our desktop GIS if desired.
predicted.rio.to_raster('gctwf_invasive.tiff',driver="COG")
This writes the file to the disk of our machine. If we are doing this on the cloud, it is wiser to write to cloud storage (S3), as this is much cheaper than local disk and almost infinitely scalable. We can also access this data from other machines on AWS, or share it with external collaborators.
Note: This code is for demonstration purposes only. It will not work as written because the bucket does not have write access.
import s3fs
fs = s3fs.S3FileSystem(anon=True)
with fs.open("s3://ocs-training-2026/advanced/invasive_data/gctwf_invasive.tiff", "wb") as f:
predicted.rio.to_raster(f, driver="COG")
Write to Icechunk¶
We could also use Icechunk to store this data. Choosing between Icechunk/Zarr and GeoTIFF depends on your use case:
GeoTIFF:
- Single-file format, good for simple, static raster data
- Limited to 2D or 3D arrays
- Scales poorly to very large datasets
- No built-in versioning
Icechunk/Zarr:
- Chunked storage, enabling efficient partial data access
- Scales to massive, multi-dimensional datasets
- Cloud-native, parallel I/O for faster processing
- Supports versioning (Icechunk) and collaborative workflows
- Ideal for time-series analysis or complex data cubes
Note: This code is for demonstration purposes only. It will not work as written because the bucket does not have write access.
import icechunk
from icechunk.xarray import to_icechunk
# 1. Configure S3 storage with anonymous access
storage = icechunk.s3_storage(
bucket="ocs-training-2026",
prefix="advanced/invasive_data/gctwf_invasive",
region="us-west-2",
anonymous=True,
)
# 2. Create a new repository
repo = icechunk.Repository.create(storage)
# 3. Open a writable session on the main branch
session = repo.writable_session("main")
# 4. Write our predictions (to_icechunk expects an xarray Dataset)
to_icechunk(predicted.to_dataset(name="probability"), session)
# 5. Commit
snapshot_id = session.commit("initial write")
Credits¶
This lesson has borrowed heavily from the following resources, which are also a great place to learn more about handling large geospatial data in Python:
- The Carpentries Geospatial Python lesson by Ryan Avery
- The xarray user guide
- An Introduction to Earth and Environmental Data Science
Another good place to start learning more is the Cloud-Native Geospatial Foundation, which curates a community using and developing cloud-native geospatial tools.
A deeper dive into using remote sensing for invasive species mapping can be found on the NASA Applied Remote Sensing Training Program. This training contains an example based on this notebook that dives deeper into some of the machine learning concepts and data analysis.