Skip to content

Instantly share code, notes, and snippets.

@calebrob6
Created April 17, 2025 19:15
Show Gist options
  • Save calebrob6/98ba0ebe36df18e049244064136a359e to your computer and use it in GitHub Desktop.
Save calebrob6/98ba0ebe36df18e049244064136a359e to your computer and use it in GitHub Desktop.
Script for benchmarking polygonization through gdal command line calls vs. in-memory with rasterio and friends.
#!/usr/bin/env python3
"""
benchmark_sieve.py
Benchmark sieving, polygonizing, and simplifying GeoTIFFs using two methods:
- "gdal": subprocess calls to gdal_sieve.py, gdal_polygonize.py, and ogr2ogr
- "python": pure Python using rasterio.features, shapely, and fiona
"""
import os
import sys
import glob
import argparse
import time
import subprocess
import random
import tempfile
from tqdm import tqdm
import rasterio
import rasterio.features
from shapely.geometry import shape, mapping
import fiona
THRESHOLD = 25 # minimum pixel regions (for sieve)
CONNECTIVITY = 4 # connectivity for gdal_sieve (4 or 8)
SIMPLIFY_TOLERANCE = 5.0 # tolerance in CRS units for geometry simplification
FIELD_RASTER_VAL = 2 # raster value to keep (for polygonization)
def run_gdal_pipeline(tif_path, out_gpkg):
basename = os.path.basename(tif_path)
tmp_sieved = os.path.join(
tempfile.gettempdir(), basename.replace(".tif", "_sieved.tif")
)
subprocess.run(
[
"gdal_sieve.py",
"-q",
"-st",
str(THRESHOLD),
f"-{CONNECTIVITY}",
"-of",
"GTiff",
tif_path,
tmp_sieved,
],
check=True,
)
tmp_gpkg = os.path.join(
tempfile.gettempdir(), basename.replace(".tif", "_tmp.gpkg")
)
subprocess.run(
[
"gdal_polygonize.py",
"-q",
"-of",
"GPKG",
tmp_sieved,
tmp_gpkg,
],
check=True,
)
subprocess.run(
[
"ogr2ogr",
"-f",
"GPKG",
out_gpkg,
tmp_gpkg,
"-where",
f"DN = {FIELD_RASTER_VAL}",
"-simplify",
str(SIMPLIFY_TOLERANCE),
],
check=True,
)
os.remove(tmp_sieved)
os.remove(tmp_gpkg)
def run_python_pipeline(tif_path, out_gpkg):
with rasterio.open(tif_path) as src:
arr = src.read(1)
transform = src.transform
mask = arr == FIELD_RASTER_VAL
arr = rasterio.features.sieve(
arr, size=THRESHOLD, connectivity=CONNECTIVITY, mask=mask
)
shapes = rasterio.features.shapes(arr, mask=mask, transform=transform)
rows = []
for geom, val in shapes:
poly = shape(geom)
if val == FIELD_RASTER_VAL:
simp = poly.simplify(SIMPLIFY_TOLERANCE, preserve_topology=True)
rows.append({"geometry": mapping(simp), "properties": {}})
schema = {"geometry": "Polygon", "properties": {}}
with fiona.open(
out_gpkg, mode="w", driver="GPKG", crs=src.crs, schema=schema
) as dst:
dst.writerecords(rows)
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--input_dir", required=True)
parser.add_argument("--output_dir", required=True)
parser.add_argument("--method", required=True, choices=["gdal", "python"])
parser.add_argument("-n", "--sample_size", type=int, default=100)
args = parser.parse_args()
os.makedirs(args.output_dir, exist_ok=True)
tifs = glob.glob(os.path.join(args.input_dir, "*.tif"))
if not tifs:
print(f"No .tif files found in {args.input_dir}")
sys.exit(1)
sample = random.sample(tifs, min(args.sample_size, len(tifs)))
print(f"Processing {len(sample)} files using method '{args.method}'...")
start = time.perf_counter()
for tif in tqdm(sample):
base = os.path.splitext(os.path.basename(tif))[0]
out_gpkg = os.path.join(args.output_dir, f"{base}.gpkg")
if args.method == "gdal":
run_gdal_pipeline(tif, out_gpkg)
else:
run_python_pipeline(tif, out_gpkg)
elapsed = time.perf_counter() - start
print(
f"Method '{args.method}' processed {len(sample)} files in {elapsed:.2f} seconds."
)
if __name__ == "__main__":
main()
@calebrob6
Copy link
Author

Python in-memory is the winner!

$ python benchmark_polygonization.py --input_dir output/full_ethiopia_predictions-ethiopia_2022_stack_9/ --output_dir tmp/ --method gdal -n 100
Processing 100 files using method 'gdal'...
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:40<00:00,  2.46it/s]
Method 'gdal' processed 100 files in 40.61 seconds.

$ python benchmark_polygonization.py --input_dir output/full_ethiopia_predictions-ethiopia_2022_stack_9/ --output_dir tmp2/ --method python -n 100
Processing 100 files using method 'python'...
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:03<00:00, 28.16it/s]
Method 'python' processed 100 files in 3.56 seconds.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment