Skip to content

ftw module

Utilities for downloading and preparing the Fields of The World (FTW) dataset.

The Fields of The World (FTW) dataset is a large-scale benchmark for agricultural field boundary instance segmentation. It contains Sentinel-2 imagery (4 bands: Red, Green, Blue, NIR at 10 m resolution) paired with instance segmentation masks across 25 countries.

Reference

Kerner et al., "Fields of The World: A Machine Learning Benchmark Dataset For Global Agricultural Field Boundary Delineation", 2024. https://fieldsofthe.world/

display_ftw_samples(data_dir, country='luxembourg', num_samples=4, split='train', window='window_a', clip_value=3000, figsize=None, cmap='tab20', save_path=None)

Display FTW image-mask pairs from the raw dataset.

Shows Sentinel-2 RGB images alongside their corresponding instance segmentation masks for visual inspection of the training data.

Parameters:

Name Type Description Default
data_dir str

Path to the root FTW data directory (containing country subdirectories as downloaded by download_ftw).

required
country str

Country subset to display. Defaults to "luxembourg".

'luxembourg'
num_samples int

Number of image-mask pairs to display. Defaults to 4.

4
split str

Dataset split to sample from ("train", "val", or "test"). Defaults to "train".

'train'
window str

Which temporal acquisition to display. The FTW dataset provides two Sentinel-2 images per chip from different dates ("window_a" and "window_b"). Defaults to "window_a".

'window_a'
clip_value int

Upper bound for Sentinel-2 reflectance used for RGB visualization. Defaults to 3000.

3000
figsize Optional[Tuple[int, int]]

Figure size as (width, height) in inches. If None, auto-calculated based on num_samples.

None
cmap str

Colormap for instance mask display. Defaults to "tab20".

'tab20'
save_path Optional[str]

If provided, save figure to this path instead of displaying. Defaults to None.

None
Example

>>> import geoai
>>> geoai.download_ftw(countries=["luxembourg"])
>>> geoai.display_ftw_samples("ftw_data", country="luxembourg", num_samples=6)

Source code in geoai/ftw.py
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
def display_ftw_samples(
    data_dir: str,
    country: str = "luxembourg",
    num_samples: int = 4,
    split: str = "train",
    window: str = "window_a",
    clip_value: int = 3000,
    figsize: Optional[Tuple[int, int]] = None,
    cmap: str = "tab20",
    save_path: Optional[str] = None,
) -> None:
    """Display FTW image-mask pairs from the raw dataset.

    Shows Sentinel-2 RGB images (top row) alongside their corresponding
    instance segmentation masks (bottom row) for visual inspection of the
    training data.

    Args:
        data_dir: Path to the root FTW data directory (containing country
            subdirectories as downloaded by ``download_ftw``).
        country: Country subset to display. Defaults to "luxembourg".
        num_samples: Number of image-mask pairs to display. Capped at the
            number of chips available in ``split``. Defaults to 4.
        split: Dataset split to sample from ("train", "val", or "test").
            Defaults to "train".
        window: Which temporal acquisition to display. The FTW dataset
            provides two Sentinel-2 images per chip from different dates
            (``"window_a"`` and ``"window_b"``).  Defaults to
            ``"window_a"``.
        clip_value: Upper bound for Sentinel-2 reflectance used for RGB
            visualization. Must be positive. Defaults to 3000.
        figsize: Figure size as (width, height) in inches. If None,
            auto-calculated based on ``num_samples``.
        cmap: Colormap for instance mask display. Defaults to "tab20".
        save_path: If provided, save figure to this path instead of
            displaying. Defaults to None.

    Raises:
        ValueError: If ``clip_value`` is not positive.
        FileNotFoundError: If the country directory or its parquet metadata
            file is not found.
        ImportError: If pandas is not installed.

    Example:
        >>> import geoai
        >>> geoai.download_ftw(countries=["luxembourg"])
        >>> geoai.display_ftw_samples("ftw_data", country="luxembourg", num_samples=6)
    """
    # Pixel values are divided by clip_value below, so zero or a negative
    # value would yield NaN/inf or an all-black image.
    if clip_value <= 0:
        raise ValueError(f"clip_value must be positive, got {clip_value}")

    country = country.lower()
    country_dir = os.path.join(data_dir, country)

    # Handle alternate directory structure from ftw-tools
    if not os.path.isdir(country_dir):
        alt_dir = os.path.join(data_dir, "ftw", country)
        if os.path.isdir(alt_dir):
            country_dir = alt_dir
        else:
            raise FileNotFoundError(
                f"Country directory not found: {country_dir}. "
                f"Run download_ftw(countries=['{country}'], "
                f"output_dir='{data_dir}') first."
            )

    parquet_path = os.path.join(country_dir, f"chips_{country}.parquet")
    if not os.path.exists(parquet_path):
        raise FileNotFoundError(
            f"Metadata file not found: {parquet_path}. "
            f"The FTW download may be incomplete."
        )

    # Heavy optional dependencies are imported only after the cheap path
    # checks so that a wrong path fails fast with a clear message.
    try:
        import pandas as pd
    except ImportError as exc:
        raise ImportError(
            "pandas is required for display_ftw_samples. "
            "Install it with: pip install pandas"
        ) from exc

    import matplotlib.pyplot as plt
    import rasterio

    chips_df = pd.read_parquet(parquet_path)

    aois = chips_df[chips_df["split"] == split]["aoi_id"].tolist()
    num_samples = min(num_samples, len(aois))

    # "<= 0" also covers a negative num_samples passed by the caller, which
    # would otherwise reach plt.subplots and fail there.
    if num_samples <= 0:
        print(f"No samples found for split='{split}'")
        return

    if figsize is None:
        figsize = (3 * num_samples, 6)

    # squeeze=False always yields a 2-D axes array, including num_samples == 1.
    fig, axes = plt.subplots(2, num_samples, figsize=figsize, squeeze=False)

    for i, aoi in enumerate(aois[:num_samples]):
        img_path = os.path.join(country_dir, "s2_images", window, f"{aoi}.tif")
        mask_path = os.path.join(country_dir, "label_masks", "instance", f"{aoi}.tif")

        with rasterio.open(img_path) as src:
            # Bands 1-3, moved to channel-last layout for imshow.
            img = src.read([1, 2, 3]).transpose(1, 2, 0)
        img = np.clip(img / float(clip_value), 0, 1)

        with rasterio.open(mask_path) as src:
            mask = src.read(1)

        axes[0, i].imshow(img)
        axes[0, i].set_title(f"Image {i + 1}")

        axes[1, i].imshow(mask, cmap=cmap, interpolation="nearest")
        axes[1, i].set_title(f"Mask {i + 1}")

        # Hide ticks and frame instead of calling axis("off"): turning the
        # axis off would also hide the row labels set below.
        for ax in (axes[0, i], axes[1, i]):
            ax.set_xticks([])
            ax.set_yticks([])
            for spine in ax.spines.values():
                spine.set_visible(False)

    axes[0, 0].set_ylabel("Sentinel-2 RGB", fontsize=12)
    axes[1, 0].set_ylabel("Instance Mask", fontsize=12)
    plt.tight_layout()

    if save_path:
        # Documented contract: save *instead of* displaying. Close the figure
        # so repeated calls do not accumulate open figures.
        fig.savefig(save_path, dpi=150, bbox_inches="tight")
        plt.close(fig)
    else:
        plt.show()

download_ftw(countries=None, output_dir='ftw_data', overwrite=False)

Download the Fields of The World (FTW) dataset for specified countries.

Downloads Sentinel-2 imagery and instance segmentation masks from the FTW dataset hosted on Source Cooperative. Each country subset includes 256x256 pixel chips with 4-band (Red, Green, Blue, NIR) GeoTIFF images captured at two different dates (window_a and window_b) and corresponding instance mask GeoTIFFs. The two temporal windows allow models to exploit seasonal vegetation differences for better field boundary detection.

Parameters:

Name Type Description Default
countries Optional[List[str]]

List of country names to download. If None, downloads Luxembourg (smallest European subset). Use FTW_COUNTRIES for the full list of available countries.

None
output_dir str

Directory to save downloaded data. Defaults to "ftw_data".

'ftw_data'
overwrite bool

If True, re-download even if data already exists. Defaults to False.

False

Returns:

Type Description
str

Path to the output directory containing downloaded country subsets.

Raises:

Type Description
ValueError

If any country name is not in the list of available countries.

Example

>>> import geoai
>>> geoai.download_ftw(countries=["luxembourg"], output_dir="ftw_data")
'ftw_data'

Source code in geoai/ftw.py
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
def download_ftw(
    countries: Optional[List[str]] = None,
    output_dir: str = "ftw_data",
    overwrite: bool = False,
) -> str:
    """Download the Fields of The World (FTW) dataset for specified countries.

    Downloads Sentinel-2 imagery and instance segmentation masks from the
    FTW dataset hosted on Source Cooperative. Each country subset includes
    256x256 pixel chips with 4-band (Red, Green, Blue, NIR) GeoTIFF images
    captured at two different dates (``window_a`` and ``window_b``) and
    corresponding instance mask GeoTIFFs.  The two temporal windows allow
    models to exploit seasonal vegetation differences for better field
    boundary detection.

    Each country archive is saved as ``<output_dir>/<country>.zip`` and
    extracted to ``<output_dir>/<country>``. If the downloaded file is
    missing or is not a valid zip archive, a warning is printed and that
    country is skipped.

    Args:
        countries: List of country names to download. A single country name
            given as a plain string is also accepted. If None, downloads
            Luxembourg (smallest European subset). Use ``FTW_COUNTRIES`` for
            the full list of available countries.
        output_dir: Directory to save downloaded data. Defaults to "ftw_data".
        overwrite: If True, re-download even if data already exists.
            Defaults to False.

    Returns:
        Path to the output directory containing downloaded country subsets.

    Raises:
        ValueError: If any country name is not in the list of available
            countries.

    Example:
        >>> import geoai
        >>> geoai.download_ftw(countries=["luxembourg"], output_dir="ftw_data")
        'ftw_data'
    """
    import zipfile

    from .utils.download import download_file

    if countries is None:
        countries = ["luxembourg"]
    elif isinstance(countries, str):
        # A bare string would otherwise be iterated character by character.
        countries = [countries]

    # Validate country names
    invalid = [c for c in countries if c.lower() not in FTW_COUNTRIES]
    if invalid:
        raise ValueError(
            f"Invalid country names: {invalid}. "
            f"Available countries: {FTW_COUNTRIES}"
        )

    # Normalize case and drop duplicates (order preserved) so that a repeated
    # name does not trigger a second delete-and-download when overwrite=True.
    countries = list(dict.fromkeys(c.lower() for c in countries))

    os.makedirs(output_dir, exist_ok=True)

    for country in countries:
        country_dir = os.path.join(output_dir, country)

        if os.path.exists(country_dir):
            if not overwrite:
                print(f"FTW {country} already exists at {country_dir}, skipping.")
                continue
            # Remove existing directory when overwriting
            shutil.rmtree(country_dir)

        url = FTW_BASE_URL.format(country=country)
        zip_path = os.path.join(output_dir, f"{country}.zip")

        # Download the zip file
        print(f"Downloading FTW {country} dataset...")
        download_file(url, output_path=zip_path, overwrite=overwrite, unzip=False)

        if not (os.path.isfile(zip_path) and zipfile.is_zipfile(zip_path)):
            # Surface the problem instead of silently reporting success.
            print(
                f"Warning: {zip_path} is missing or not a valid zip archive; "
                f"FTW {country} was not extracted."
            )
            continue

        # Extract into a temp dir first to handle flat zip structure. Start
        # from an empty directory: leftovers from an interrupted run would
        # otherwise be mixed in and confuse the layout detection below.
        tmp_dir = os.path.join(output_dir, f"_tmp_{country}")
        if os.path.isdir(tmp_dir):
            shutil.rmtree(tmp_dir)
        os.makedirs(tmp_dir)

        try:
            print(f"Extracting {zip_path}...")
            with zipfile.ZipFile(zip_path, "r") as zf:
                zf.extractall(tmp_dir)

            # Check if the zip contained a single top-level directory
            top_items = os.listdir(tmp_dir)
            if len(top_items) == 1 and os.path.isdir(
                os.path.join(tmp_dir, top_items[0])
            ):
                # Single directory — move it
                shutil.move(os.path.join(tmp_dir, top_items[0]), country_dir)
            else:
                # Flat structure — rename the temp dir
                shutil.move(tmp_dir, country_dir)
        finally:
            # Removes the emptied temp dir, or a partial extraction after a
            # failure. No-op when the temp dir itself became country_dir.
            shutil.rmtree(tmp_dir, ignore_errors=True)

        print(f"Extracted to {country_dir}")

    return output_dir

prepare_ftw(data_dir, country='luxembourg', output_dir=None, window='window_a', clip_value=3000, num_test=5, verbose=True)

Prepare FTW data for training with geoai's instance segmentation pipeline.

Rescales Sentinel-2 reflectance images from 0-10000 to uint8 (0-255) and organizes them into images/ and labels/ directories compatible with geoai.train_instance_segmentation_model().

Parameters:

Name Type Description Default
data_dir str

Path to the root FTW data directory (containing country subdirectories as downloaded by download_ftw).

required
country str

Country subset to prepare. Defaults to "luxembourg".

'luxembourg'
output_dir Optional[str]

Directory to write prepared images and labels. If None, defaults to "field_boundaries".

None
window str

Which temporal window to use for imagery. The FTW dataset provides two Sentinel-2 acquisitions from different dates for each chip so that seasonal vegetation differences can help delineate field boundaries. "window_a" and "window_b" correspond to these two dates. Use one window for 4-band input or stack both externally for 8-band input. Defaults to "window_a".

'window_a'
clip_value int

Upper bound for Sentinel-2 reflectance clipping before rescaling to 0-255. Defaults to 3000.

3000
num_test int

Number of test chips to prepare for inference. Set to 0 to skip test data preparation. Defaults to 5.

5
verbose bool

If True, print progress information. Defaults to True.

True

Returns:

Type Description
Dict[str, Any]

Dictionary with keys:
- images_dir: Path to prepared training images.
- labels_dir: Path to prepared training labels.
- test_dir: Path to prepared test images (or None if num_test=0).
- num_train: Number of training chips prepared.
- num_test: Number of test chips prepared.

Raises:

Type Description
FileNotFoundError

If the country directory or parquet metadata file is not found.

Example

>>> import geoai
>>> geoai.download_ftw(countries=["luxembourg"])
>>> result = geoai.prepare_ftw("ftw_data", country="luxembourg")
>>> print(result["images_dir"], result["num_train"])

Source code in geoai/ftw.py
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
def prepare_ftw(
    data_dir: str,
    country: str = "luxembourg",
    output_dir: Optional[str] = None,
    window: str = "window_a",
    clip_value: int = 3000,
    num_test: int = 5,
    verbose: bool = True,
) -> Dict[str, Any]:
    """Prepare FTW data for training with geoai's instance segmentation pipeline.

    Rescales Sentinel-2 reflectance images from 0-10000 to uint8 (0-255) and
    organizes them into ``images/`` and ``labels/`` directories compatible with
    ``geoai.train_instance_segmentation_model()``.

    Chips from the dataset's "train" and "val" splits are both written to the
    training directories. A chip is skipped when its image or its instance
    mask file is missing on disk.

    Args:
        data_dir: Path to the root FTW data directory (containing country
            subdirectories as downloaded by ``download_ftw``).
        country: Country subset to prepare. Defaults to "luxembourg".
        output_dir: Directory to write prepared images and labels. If None,
            defaults to ``"field_boundaries"``.
        window: Which temporal window to use for imagery. The FTW dataset
            provides two Sentinel-2 acquisitions from different dates for
            each chip so that seasonal vegetation differences can help
            delineate field boundaries.  ``"window_a"`` and ``"window_b"``
            correspond to these two dates.  Use one window for 4-band
            input or stack both externally for 8-band input.
            Defaults to ``"window_a"``.
        clip_value: Upper bound for Sentinel-2 reflectance clipping before
            rescaling to 0-255. Must be positive. Defaults to 3000.
        num_test: Number of test chips to prepare for inference. Set to 0 to
            skip test data preparation. Test chips whose image file is
            missing are passed over, and later chips of the test split are
            used instead, so up to ``num_test`` chips are prepared whenever
            enough exist on disk. Defaults to 5.
        verbose: If True, print progress information. Defaults to True.

    Returns:
        Dictionary with keys:
            - ``images_dir``: Path to prepared training images.
            - ``labels_dir``: Path to prepared training labels.
            - ``test_dir``: Path to prepared test images (or None if
              ``num_test=0``).
            - ``num_train``: Number of training chips prepared.
            - ``num_test``: Number of test chips prepared.

    Raises:
        ValueError: If ``clip_value`` is not positive.
        FileNotFoundError: If the country directory or parquet metadata
            file is not found.
        ImportError: If pandas is not installed.

    Example:
        >>> import geoai
        >>> geoai.download_ftw(countries=["luxembourg"])
        >>> result = geoai.prepare_ftw("ftw_data", country="luxembourg")
        >>> print(result["images_dir"], result["num_train"])
    """
    # clip_value is the divisor of the 0-255 rescaling (presumably inside
    # _rescale_sentinel2_image), so reject values that cannot scale.
    if clip_value <= 0:
        raise ValueError(f"clip_value must be positive, got {clip_value}")

    if output_dir is None:
        output_dir = "field_boundaries"

    country = country.lower()
    country_dir = os.path.join(data_dir, country)

    # Handle case where ftw-tools creates an extra subdirectory
    if not os.path.isdir(country_dir):
        alt_dir = os.path.join(data_dir, "ftw", country)
        if os.path.isdir(alt_dir):
            country_dir = alt_dir
        else:
            raise FileNotFoundError(
                f"Country directory not found: {country_dir}. "
                f"Run download_ftw(countries=['{country}'], "
                f"output_dir='{data_dir}') first."
            )

    parquet_path = os.path.join(country_dir, f"chips_{country}.parquet")
    if not os.path.exists(parquet_path):
        raise FileNotFoundError(
            f"Metadata file not found: {parquet_path}. "
            f"The FTW download may be incomplete."
        )

    # Dependencies are imported after the cheap path checks (a wrong path
    # fails fast) but before any output directory is created.
    try:
        import pandas as pd
    except ImportError as exc:
        raise ImportError(
            "pandas is required for prepare_ftw. Install it with: pip install pandas"
        ) from exc

    # Not referenced directly in this function; kept as an up-front dependency
    # check (presumably needed by _rescale_sentinel2_image).
    import rasterio  # noqa: F401

    chips_df = pd.read_parquet(parquet_path)

    images_dir = os.path.join(output_dir, "images")
    labels_dir = os.path.join(output_dir, "labels")
    test_dir = os.path.join(output_dir, "test") if num_test > 0 else None

    os.makedirs(images_dir, exist_ok=True)
    os.makedirs(labels_dir, exist_ok=True)
    if test_dir:
        os.makedirs(test_dir, exist_ok=True)

    # Get split AOI IDs
    train_aois = chips_df[chips_df["split"] == "train"]["aoi_id"].tolist()
    val_aois = chips_df[chips_df["split"] == "val"]["aoi_id"].tolist()
    test_aois = chips_df[chips_df["split"] == "test"]["aoi_id"].tolist()

    # Use train + val for training (geoai handles its own val_split)
    all_train_aois = train_aois + val_aois

    if verbose:
        print(f"FTW {country}: {len(chips_df)} total chips")
        print(
            f"  Train: {len(train_aois)}, Val: {len(val_aois)}, Test: {len(test_aois)}"
        )
        print(f"  Using {len(all_train_aois)} chips for training")
        print("Preparing training data...")

    image_root = os.path.join(country_dir, "s2_images", window)
    mask_root = os.path.join(country_dir, "label_masks", "instance")

    # Process training chips
    prepared_train = 0
    for aoi in all_train_aois:
        src_img = os.path.join(image_root, f"{aoi}.tif")
        src_mask = os.path.join(mask_root, f"{aoi}.tif")

        # A training chip is only usable when both image and mask exist.
        if not os.path.exists(src_img) or not os.path.exists(src_mask):
            continue

        _rescale_sentinel2_image(
            src_img, os.path.join(images_dir, f"{aoi}.tif"), clip_value
        )
        shutil.copy2(src_mask, os.path.join(labels_dir, f"{aoi}.tif"))
        prepared_train += 1

    if verbose:
        skipped = len(all_train_aois) - prepared_train
        print(f"Prepared {prepared_train} training chips (skipped {skipped})")

    # Process test chips
    prepared_test = 0
    if test_dir and num_test > 0:
        # Walk the whole test split and stop once num_test chips are written,
        # so a missing file does not reduce the number of prepared chips.
        for aoi in test_aois:
            if prepared_test >= num_test:
                break
            src_img = os.path.join(image_root, f"{aoi}.tif")
            if not os.path.exists(src_img):
                continue
            _rescale_sentinel2_image(
                src_img, os.path.join(test_dir, f"{aoi}.tif"), clip_value
            )
            prepared_test += 1

        if verbose:
            print(f"Prepared {prepared_test} test chips")

    return {
        "images_dir": images_dir,
        "labels_dir": labels_dir,
        "test_dir": test_dir,
        "num_train": prepared_train,
        "num_test": prepared_test,
    }