Skip to content

Acquisition Reference

darts_acquisition

Acquisition of data from various sources for the DARTS dataset.

__version__ = importlib.metadata.version('darts-nextgen') module-attribute

download_admin_files(admin_dir)

Download the admin files for the regions.

Files will be stored under [admin_dir]/adm1.shp and [admin_dir]/adm2.shp.

Parameters:

Name Type Description Default
admin_dir Path

The path to the admin files.

required
Source code in darts-acquisition/src/darts_acquisition/admin.py
def download_admin_files(admin_dir: Path):
    """Download the admin files for the regions.

    Files will be stored under [admin_dir]/adm1.shp and [admin_dir]/adm2.shp.

    Args:
        admin_dir (Path): The path to the admin files.

    """
    tick_fstart = time.perf_counter()

    # Download the admin files
    admin_1_url = "https://github.com/wmgeolab/geoBoundaries/raw/main/releaseData/CGAZ/geoBoundariesCGAZ_ADM1.zip"
    admin_2_url = "https://github.com/wmgeolab/geoBoundaries/raw/main/releaseData/CGAZ/geoBoundariesCGAZ_ADM2.zip"

    admin_dir.mkdir(exist_ok=True, parents=True)

    logger.debug(f"Downloading {admin_1_url} to {admin_dir.resolve()}")
    _download_zip(admin_1_url, admin_dir)

    logger.debug(f"Downloading {admin_2_url} to {admin_dir.resolve()}")
    _download_zip(admin_2_url, admin_dir)

    tick_fend = time.perf_counter()
    logger.info(f"Downloaded admin files in {tick_fend - tick_fstart:.2f} seconds")

load_arcticdem(geobox, data_dir, resolution, buffer=0, persist=True)

Load the ArcticDEM for the given geobox, fetch new data from the STAC server if necessary.

Parameters:

Name Type Description Default
geobox GeoBox

The geobox for which the tile should be loaded.

required
data_dir Path | str

The directory where the ArcticDEM data is stored.

required
resolution Literal[2, 10, 32]

The resolution of the ArcticDEM data in m.

required
buffer int

The buffer around the projected (epsg:3413) geobox in pixels. Defaults to 0.

0
persist bool

If the data should be persisted in memory. If not, this will return a Dask backed Dataset. Defaults to True.

True

Returns:

Type Description
Dataset

xr.Dataset: The ArcticDEM tile, with a buffer applied. Note: The buffer is applied in the arcticdem dataset's CRS, hence the orientation might be different. Final dataset is NOT matched to the reference CRS and resolution.

Warning

Geobox must be in a meter based CRS.

Usage

Since the API of the load_arcticdem is based on GeoBox, one can load a specific ROI based on an existing Xarray DataArray:

import xarray as xr
import odc.geo.xr

from darts_aquisition import load_arcticdem

# Assume "optical" is an already loaded s2 based dataarray

arcticdem = load_arcticdem(
    optical.odc.geobox,
    "/path/to/arcticdem-parent-directory",
    resolution=2,
    buffer=ceil(self.tpi_outer_radius / 2 * sqrt(2))
)

# Now we can for example match the resolution and extent of the optical data:
arcticdem = arcticdem.odc.reproject(optical.odc.geobox, resampling="cubic")

The buffer parameter is used to extend the region of interest by a certain amount of pixels. This comes handy when calculating e.g. the Topographic Position Index (TPI), which requires a buffer around the region of interest to remove edge effects.

Source code in darts-acquisition/src/darts_acquisition/arcticdem/datacube.py
def load_arcticdem(
    geobox: GeoBox,
    data_dir: Path | str,
    resolution: RESOLUTIONS,
    buffer: int = 0,
    persist: bool = True,
) -> xr.Dataset:
    """Load the ArcticDEM for the given geobox, fetch new data from the STAC server if necessary.

    Args:
        geobox (GeoBox): The geobox for which the tile should be loaded.
        data_dir (Path | str): The directory where the ArcticDEM data is stored.
        resolution (Literal[2, 10, 32]): The resolution of the ArcticDEM data in m.
        buffer (int, optional): The buffer around the projected (epsg:3413) geobox in pixels. Defaults to 0.
        persist (bool, optional): If the data should be persisted in memory.
            If not, this will return a Dask backed Dataset. Defaults to True.

    Returns:
        xr.Dataset: The ArcticDEM tile, with a buffer applied.
            Note: The buffer is applied in the arcticdem dataset's CRS, hence the orientation might be different.
            Final dataset is NOT matched to the reference CRS and resolution.

    Warning:
        Geobox must be in a meter based CRS.

    Usage:
        Since the API of the `load_arcticdem` is based on GeoBox, one can load a specific ROI based on an existing Xarray DataArray:

        ```python
        import xarray as xr
        import odc.geo.xr

        from darts_aquisition import load_arcticdem

        # Assume "optical" is an already loaded s2 based dataarray

        arcticdem = load_arcticdem(
            optical.odc.geobox,
            "/path/to/arcticdem-parent-directory",
            resolution=2,
            buffer=ceil(self.tpi_outer_radius / 2 * sqrt(2))
        )

        # Now we can for example match the resolution and extent of the optical data:
        arcticdem = arcticdem.odc.reproject(optical.odc.geobox, resampling="cubic")
        ```

        The `buffer` parameter is used to extend the region of interest by a certain amount of pixels.
        This comes handy when calculating e.g. the Topographic Position Index (TPI), which requires a buffer around the region of interest to remove edge effects.

    """  # noqa: E501
    tick_fstart = time.perf_counter()

    data_dir = Path(data_dir) if isinstance(data_dir, str) else data_dir

    datacube_fpath = data_dir / f"datacube_{resolution}m_v4.1.zarr"
    storage = zarr.storage.FSStore(datacube_fpath)
    logger.debug(f"Getting ArcticDEM tile from {datacube_fpath.resolve()}")

    # ! The geobox must be in a meter based CRS
    logger.debug(f"Found a reference resolution of {geobox.resolution.x}m")

    # Check if the zarr data already exists
    if not datacube_fpath.exists():
        logger.debug(f"Creating a new zarr datacube at {datacube_fpath.resolve()} with {storage=}")
        create_empty_datacube(
            "ArcticDEM Data Cube",
            storage,
            DATA_EXTENT[resolution],
            CHUNK_SIZE,
            DATA_VARS,
            DATA_VARS_META,
            DATA_VARS_ENCODING,
        )

    # Get the adjacent arcticdem tiles
    # Note: We could also use pystac here, but this would result in a slight performance decrease
    # because of the network overhead, hence we use the extent file
    # Download the extent, download if the file does not exist
    extent_fpath = data_dir / f"ArcticDEM_Mosaic_Index_v4_1_{resolution}m.parquet"
    with download_lock:
        if not extent_fpath.exists():
            download_arcticdem_extent(data_dir)
    extent = gpd.read_parquet(extent_fpath)

    # Add a buffer around the geobox to get the adjacent tiles
    reference_geobox = geobox.to_crs("epsg:3413", resolution=resolution).pad(buffer)
    adjacent_tiles = extent[extent.intersects(reference_geobox.extent.geom)]

    # Download the adjacent tiles (if necessary)
    with download_lock:
        procedural_download_datacube(storage, adjacent_tiles)

    # Load the datacube and set the spatial_ref since it is set as a coordinate within the zarr format
    chunks = None if persist else "auto"
    arcticdem_datacube = xr.open_zarr(storage, mask_and_scale=False, chunks=chunks).set_coords("spatial_ref")

    # Get an AOI slice of the datacube
    arcticdem_aoi = arcticdem_datacube.odc.crop(reference_geobox.extent, apply_mask=False)

    # The following code would load the lazy zarr data from disk into memory
    if persist:
        tick_sload = time.perf_counter()
        arcticdem_aoi = arcticdem_aoi.load()
        tick_eload = time.perf_counter()
        logger.debug(f"ArcticDEM AOI loaded from disk in {tick_eload - tick_sload:.2f} seconds")

    # Change dtype of the datamask to uint8 for later reproject_match
    arcticdem_aoi["datamask"] = arcticdem_aoi.datamask.astype("uint8")

    logger.info(
        f"ArcticDEM tile {'loaded' if persist else 'lazy-opened'} in {time.perf_counter() - tick_fstart:.2f} seconds"
    )
    return arcticdem_aoi

load_arcticdem_from_vrt(slope_vrt, elevation_vrt, reference_dataset)

Load ArcticDEM data and reproject it to match the reference dataset.

Parameters:

Name Type Description Default
slope_vrt Path

Path to the ArcticDEM slope VRT file.

required
elevation_vrt Path

Path to the ArcticDEM elevation VRT file.

required
reference_dataset Dataset

The reference dataset to reproject, resampled and cropped the ArcticDEM data to.

required

Returns:

Type Description
Dataset

xr.Dataset: The ArcticDEM data reprojected, resampled and cropped to match the reference dataset.

Source code in darts-acquisition/src/darts_acquisition/arcticdem/vrt.py
def load_arcticdem_from_vrt(slope_vrt: Path, elevation_vrt: Path, reference_dataset: xr.Dataset) -> xr.Dataset:
    """Load ArcticDEM data and reproject it to match the reference dataset.

    Args:
        slope_vrt (Path): Path to the ArcticDEM slope VRT file.
        elevation_vrt (Path): Path to the ArcticDEM elevation VRT file.
        reference_dataset (xr.Dataset): The reference dataset to reproject, resampled and cropped the ArcticDEM data to.

    Returns:
        xr.Dataset: The ArcticDEM data reprojected, resampled and cropped to match the reference dataset.


    """
    start_time = time.time()
    logger.debug(f"Loading ArcticDEM slope from {slope_vrt.resolve()} and elevation from {elevation_vrt.resolve()}")

    slope = load_vrt(slope_vrt, reference_dataset)
    slope: xr.Dataset = (
        slope.assign_attrs({"data_source": "arcticdem", "long_name": "Slope"})
        .rio.write_nodata(float("nan"))
        .astype("float32")
        .to_dataset(name="slope")
    )

    relative_elevation = load_vrt(elevation_vrt, reference_dataset)
    relative_elevation: xr.Dataset = (
        relative_elevation.assign_attrs({"data_source": "arcticdem", "long_name": "Relative Elevation", "units": "m"})
        .fillna(0)
        .rio.write_nodata(0)
        .astype("int16")
        .to_dataset(name="relative_elevation")
    )

    articdem_ds = xr.merge([relative_elevation, slope])
    logger.debug(f"Loaded ArcticDEM data in {time.time() - start_time} seconds.")
    return articdem_ds

load_planet_masks(fpath)

Load the valid and quality data masks from a Planet scene.

Parameters:

Name Type Description Default
fpath str | Path

The file path to the Planet scene from which to derive the masks.

required

Raises:

Type Description
FileNotFoundError

If no matching UDM-2 TIFF file is found in the specified path.

Returns:

Type Description
Dataset

xr.Dataset: A merged xarray Dataset containing two data masks: - 'valid_data_mask': A mask indicating valid (1) and no data (0). - 'quality_data_mask': A mask indicating high quality (1) and low quality (0).

Source code in darts-acquisition/src/darts_acquisition/planet.py
def load_planet_masks(fpath: str | Path) -> xr.Dataset:
    """Load the valid and quality data masks from a Planet scene.

    Args:
        fpath (str | Path): The file path to the Planet scene from which to derive the masks.

    Raises:
        FileNotFoundError: If no matching UDM-2 TIFF file is found in the specified path.

    Returns:
        xr.Dataset: A merged xarray Dataset containing two data masks:
            - 'valid_data_mask': A mask indicating valid (1) and no data (0).
            - 'quality_data_mask': A mask indicating high quality (1) and low quality (0).

    """
    start_time = time.time()

    # Convert to Path object if a string is provided
    fpath = fpath if isinstance(fpath, Path) else Path(fpath)

    logger.debug(f"Loading data masks from {fpath.resolve()}")

    # Get imagepath
    udm_path = next(fpath.glob("*_udm2.tif"), None)
    if not udm_path:
        udm_path = next(fpath.glob("*_udm2_clip.tif"), None)
    if not udm_path:
        raise FileNotFoundError(f"No matching UDM-2 TIFF files found in {fpath.resolve()} (.glob('*_udm2.tif'))")

    # See udm classes here: https://developers.planet.com/docs/data/udm-2/
    da_udm = xr.open_dataarray(udm_path)

    invalids = da_udm.sel(band=8).fillna(0) != 0
    low_quality = da_udm.sel(band=[2, 3, 4, 5, 6]).max(axis=0) == 1
    high_quality = ~low_quality & ~invalids
    qa_ds = xr.Dataset(coords={c: da_udm.coords[c] for c in da_udm.coords})
    qa_ds["quality_data_mask"] = (
        xr.zeros_like(da_udm.sel(band=8)).where(invalids, 0).where(low_quality, 1).where(high_quality, 2)
    )
    qa_ds["quality_data_mask"].attrs = {
        "data_source": "planet",
        "long_name": "Quality data mask",
        "description": "0 = Invalid, 1 = Low Quality, 2 = High Quality",
    }
    logger.debug(f"Loaded data masks in {time.time() - start_time} seconds.")
    return qa_ds

load_planet_scene(fpath)

Load a PlanetScope satellite GeoTIFF file and return it as an xarray datset.

Parameters:

Name Type Description Default
fpath str | Path

The path to the directory containing the TIFF files or a specific path to the TIFF file.

required

Returns:

Type Description
Dataset

xr.Dataset: The loaded dataset

Raises:

Type Description
FileNotFoundError

If no matching TIFF file is found in the specified path.

Source code in darts-acquisition/src/darts_acquisition/planet.py
def load_planet_scene(fpath: str | Path) -> xr.Dataset:
    """Load a PlanetScope satellite GeoTIFF file and return it as an xarray datset.

    Args:
        fpath (str | Path): The path to the directory containing the TIFF files or a specific path to the TIFF file.

    Returns:
        xr.Dataset: The loaded dataset

    Raises:
        FileNotFoundError: If no matching TIFF file is found in the specified path.

    """
    start_time = time.time()

    # Convert to Path object if a string is provided
    fpath = fpath if isinstance(fpath, Path) else Path(fpath)

    # Check if the directory contains a PSOrthoTile or PSScene
    planet_type = parse_planet_type(fpath)
    logger.debug(f"Loading Planet PS {planet_type.capitalize()} from {fpath.resolve()}")

    # Get imagepath
    ps_image = next(fpath.glob("*_SR.tif"), None)
    if not ps_image:
        ps_image = next(fpath.glob("*_SR_clip.tif"), None)
    if not ps_image:
        raise FileNotFoundError(f"No matching TIFF files found in {fpath.resolve()} (.glob('*_SR.tif'))")

    # Define band names and corresponding indices
    planet_da = xr.open_dataarray(ps_image)

    # Create a dataset with the bands
    bands = ["blue", "green", "red", "nir"]
    ds_planet = (
        planet_da.fillna(0).rio.write_nodata(0).astype("uint16").assign_coords({"band": bands}).to_dataset(dim="band")
    )
    for var in ds_planet.variables:
        ds_planet[var].assign_attrs(
            {
                "long_name": f"PLANET {var.capitalize()}",
                "data_source": "planet",
                "planet_type": planet_type,
                "units": "Reflectance",
            }
        )
    ds_planet.attrs = {"tile_id": fpath.parent.stem if planet_type == "orthotile" else fpath.stem}
    logger.debug(f"Loaded Planet scene in {time.time() - start_time} seconds.")
    return ds_planet

load_s2_masks(fpath, reference_geobox)

Load the valid and quality data masks from a Sentinel 2 scene.

Parameters:

Name Type Description Default
fpath str | Path

The path to the directory containing the TIFF files.

required
reference_geobox GeoBox

The reference geobox to reproject, resample and crop the masks data to.

required

Returns:

Type Description
Dataset

xr.Dataset: A merged xarray Dataset containing two data masks: - 'valid_data_mask': A mask indicating valid (1) and no data (0). - 'quality_data_mask': A mask indicating high quality (1) and low quality (0).

Source code in darts-acquisition/src/darts_acquisition/s2.py
def load_s2_masks(fpath: str | Path, reference_geobox: GeoBox) -> xr.Dataset:
    """Load the valid and quality data masks from a Sentinel 2 scene.

    Args:
        fpath (str | Path): The path to the directory containing the TIFF files.
        reference_geobox (GeoBox): The reference geobox to reproject, resample and crop the masks data to.


    Returns:
        xr.Dataset: A merged xarray Dataset containing two data masks:
            - 'valid_data_mask': A mask indicating valid (1) and no data (0).
            - 'quality_data_mask': A mask indicating high quality (1) and low quality (0).

    """
    start_time = time.time()

    # Convert to Path object if a string is provided
    fpath = fpath if isinstance(fpath, Path) else Path(fpath)

    logger.debug(f"Loading data masks from {fpath.resolve()}")

    # TODO: SCL band in SR file
    try:
        scl_path = next(fpath.glob("*_SCL*.tif"))
    except StopIteration:
        logger.warning("Found no data quality mask (SCL). No masking will occur.")
        valid_data_mask = (odc.geo.xr.xr_zeros(reference_geobox, dtype="uint8") + 1).to_dataset(name="valid_data_mask")
        valid_data_mask.attrs = {"data_source": "s2", "long_name": "Valid Data Mask"}
        quality_data_mask = odc.geo.xr.xr_zeros(reference_geobox, dtype="uint8").to_dataset(name="quality_data_mask")
        quality_data_mask.attrs = {"data_source": "s2", "long_name": "Quality Data Mask"}
        qa_ds = xr.merge([valid_data_mask, quality_data_mask])
        return qa_ds

    # See scene classes here: https://custom-scripts.sentinel-hub.com/custom-scripts/sentinel-2/scene-classification/
    da_scl = xr.open_dataarray(scl_path)

    da_scl = da_scl.odc.reproject(reference_geobox, sampling="nearest")

    # Match crs
    da_scl = da_scl.rio.write_crs(reference_geobox.crs)

    # TODO: new masking method
    qa_ds = xr.Dataset(coords={c: da_scl.coords[c] for c in da_scl.coords})
    qa_ds = da_scl.sel(band=1).fillna(0)
    qa_ds = convert_masks(qa_ds)

    logger.debug(f"Loaded data masks in {time.time() - start_time} seconds.")
    return qa_ds

load_s2_scene(fpath)

Load a Sentinel 2 satellite GeoTIFF file and return it as an xarray datset.

Parameters:

Name Type Description Default
fpath str | Path

The path to the directory containing the TIFF files.

required

Returns:

Type Description
Dataset

xr.Dataset: The loaded dataset

Raises:

Type Description
FileNotFoundError

If no matching TIFF file is found in the specified path.

Source code in darts-acquisition/src/darts_acquisition/s2.py
def load_s2_scene(fpath: str | Path) -> xr.Dataset:
    """Load a Sentinel 2 satellite GeoTIFF file and return it as an xarray datset.

    Args:
        fpath (str | Path): The path to the directory containing the TIFF files.

    Returns:
        xr.Dataset: The loaded dataset

    Raises:
        FileNotFoundError: If no matching TIFF file is found in the specified path.

    """
    start_time = time.time()

    # Convert to Path object if a string is provided
    fpath = fpath if isinstance(fpath, Path) else Path(fpath)

    logger.debug(f"Loading Sentinel 2 scene from {fpath.resolve()}")

    # Get imagepath
    try:
        s2_image = next(fpath.glob("*_SR*.tif"))
    except StopIteration:
        raise FileNotFoundError(f"No matching TIFF files found in {fpath.resolve()} (.glob('*_SR*.tif'))")

    # Define band names and corresponding indices
    s2_da = xr.open_dataarray(s2_image)

    # Create a dataset with the bands
    bands = ["blue", "green", "red", "nir"]
    ds_s2 = s2_da.fillna(0).rio.write_nodata(0).astype("uint16").assign_coords({"band": bands}).to_dataset(dim="band")

    for var in ds_s2.data_vars:
        ds_s2[var].assign_attrs(
            {"data_source": "s2", "long_name": f"Sentinel 2 {var.capitalize()}", "units": "Reflectance"}
        )

    planet_crop_id, s2_tile_id, tile_id = parse_s2_tile_id(fpath)
    ds_s2.attrs["planet_crop_id"] = planet_crop_id
    ds_s2.attrs["s2_tile_id"] = s2_tile_id
    ds_s2.attrs["tile_id"] = tile_id
    logger.debug(f"Loaded Sentinel 2 scene in {time.time() - start_time} seconds.")
    return ds_s2

load_tcvis(geobox, data_dir, buffer=0, persist=True)

Load the TCVIS for the given geobox, fetch new data from GEE if necessary.

Parameters:

Name Type Description Default
geobox GeoBox

The geobox to load the data for.

required
data_dir Path | str

The directory to store the downloaded data for faster access for consecutive calls.

required
buffer int

The buffer around the geobox in pixels. Defaults to 0.

0
persist bool

If the data should be persisted in memory. If not, this will return a Dask backed Dataset. Defaults to True.

True

Returns:

Type Description
Dataset

xr.Dataset: The TCVIS dataset.

Usage

Since the API of the load_tcvis is based on GeoBox, one can load a specific ROI based on an existing Xarray DataArray:

import xarray as xr
import odc.geo.xr

from darts_aquisition import load_tcvis

# Assume "optical" is an already loaded s2 based dataarray

tcvis = load_tcvis(
    optical.odc.geobox,
    "/path/to/tcvis-parent-directory",
)

# Now we can for example match the resolution and extent of the optical data:
tcvis = tcvis.odc.reproject(optical.odc.geobox, resampling="cubic")
Source code in darts-acquisition/src/darts_acquisition/tcvis.py
def load_tcvis(
    geobox: GeoBox,
    data_dir: Path | str,
    buffer: int = 0,
    persist: bool = True,
) -> xr.Dataset:
    """Load the TCVIS for the given geobox, fetch new data from GEE if necessary.

    Args:
        geobox (GeoBox): The geobox to load the data for.
        data_dir (Path | str): The directory to store the downloaded data for faster access for consecutive calls.
        buffer (int, optional): The buffer around the geobox in pixels. Defaults to 0.
        persist (bool, optional): If the data should be persisted in memory.
            If not, this will return a Dask backed Dataset. Defaults to True.

    Returns:
        xr.Dataset: The TCVIS dataset.

    Usage:
        Since the API of the `load_tcvis` is based on GeoBox, one can load a specific ROI based on an existing Xarray DataArray:

        ```python
        import xarray as xr
        import odc.geo.xr

        from darts_aquisition import load_tcvis

        # Assume "optical" is an already loaded s2 based dataarray

        tcvis = load_tcvis(
            optical.odc.geobox,
            "/path/to/tcvis-parent-directory",
        )

        # Now we can for example match the resolution and extent of the optical data:
        tcvis = tcvis.odc.reproject(optical.odc.geobox, resampling="cubic")
        ```

    """  # noqa: E501
    tick_fstart = time.perf_counter()

    data_dir = Path(data_dir) if isinstance(data_dir, str) else data_dir

    datacube_fpath = data_dir / "tcvis_2000-2019.zarr"
    storage = zarr.storage.FSStore(datacube_fpath)
    logger.debug(f"Loading TCVis from {datacube_fpath.resolve()}")

    if not datacube_fpath.exists():
        logger.debug(f"Creating a new zarr datacube at {datacube_fpath.resolve()} with {storage=}")
        create_empty_datacube(
            title="Landsat Trends TCVIS 2000-2019",
            storage=storage,
            geobox=DATA_EXTENT,
            chunk_size=CHUNK_SIZE,
            data_vars=DATA_VARS,
            meta=DATA_VARS_META,
            var_encoding=DATA_VARS_ENCODING,
        )

    # Download the adjacent tiles (if necessary)
    reference_geobox = geobox.to_crs("epsg:4326", resolution=DATA_EXTENT.resolution.x).pad(buffer)
    with download_lock:
        procedural_download_datacube(storage, reference_geobox)

    # Load the datacube and set the spatial_ref since it is set as a coordinate within the zarr format
    chunks = None if persist else "auto"
    tcvis_datacube = xr.open_zarr(storage, mask_and_scale=False, chunks=chunks).set_coords("spatial_ref")

    # Get an AOI slice of the datacube
    tcvis_aoi = tcvis_datacube.odc.crop(reference_geobox.extent, apply_mask=False)

    # The following code would load the lazy zarr data from disk into memory
    if persist:
        tick_sload = time.perf_counter()
        tcvis_aoi = tcvis_aoi.load()
        tick_eload = time.perf_counter()
        logger.debug(f"TCVIS AOI loaded from disk in {tick_eload - tick_sload:.2f} seconds")

    logger.info(
        f"TCVIS tile {'loaded' if persist else 'lazy-opened'} in {time.perf_counter() - tick_fstart:.2f} seconds"
    )
    return tcvis_aoi