Created
April 17, 2025 19:15
-
-
Save calebrob6/98ba0ebe36df18e049244064136a359e to your computer and use it in GitHub Desktop.
Script for benchmarking polygonization through gdal command line calls vs. in-memory with rasterio and friends.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
""" | |
benchmark_sieve.py | |
Benchmark sieving, polygonizing, and simplifying GeoTIFFs using two methods: | |
- "gdal": subprocess calls to gdal_sieve.py, gdal_polygonize.py, and ogr2ogr | |
- "python": pure Python using rasterio.features, shapely, and fiona | |
""" | |
import os | |
import sys | |
import glob | |
import argparse | |
import time | |
import subprocess | |
import random | |
import tempfile | |
from tqdm import tqdm | |
import rasterio | |
import rasterio.features | |
from shapely.geometry import shape, mapping | |
import fiona | |
THRESHOLD = 25 # minimum pixel regions (for sieve) | |
CONNECTIVITY = 4 # connectivity for gdal_sieve (4 or 8) | |
SIMPLIFY_TOLERANCE = 5.0 # tolerance in CRS units for geometry simplification | |
FIELD_RASTER_VAL = 2 # raster value to keep (for polygonization) | |
def run_gdal_pipeline(tif_path, out_gpkg): | |
basename = os.path.basename(tif_path) | |
tmp_sieved = os.path.join( | |
tempfile.gettempdir(), basename.replace(".tif", "_sieved.tif") | |
) | |
subprocess.run( | |
[ | |
"gdal_sieve.py", | |
"-q", | |
"-st", | |
str(THRESHOLD), | |
f"-{CONNECTIVITY}", | |
"-of", | |
"GTiff", | |
tif_path, | |
tmp_sieved, | |
], | |
check=True, | |
) | |
tmp_gpkg = os.path.join( | |
tempfile.gettempdir(), basename.replace(".tif", "_tmp.gpkg") | |
) | |
subprocess.run( | |
[ | |
"gdal_polygonize.py", | |
"-q", | |
"-of", | |
"GPKG", | |
tmp_sieved, | |
tmp_gpkg, | |
], | |
check=True, | |
) | |
subprocess.run( | |
[ | |
"ogr2ogr", | |
"-f", | |
"GPKG", | |
out_gpkg, | |
tmp_gpkg, | |
"-where", | |
f"DN = {FIELD_RASTER_VAL}", | |
"-simplify", | |
str(SIMPLIFY_TOLERANCE), | |
], | |
check=True, | |
) | |
os.remove(tmp_sieved) | |
os.remove(tmp_gpkg) | |
def run_python_pipeline(tif_path, out_gpkg): | |
with rasterio.open(tif_path) as src: | |
arr = src.read(1) | |
transform = src.transform | |
mask = arr == FIELD_RASTER_VAL | |
arr = rasterio.features.sieve( | |
arr, size=THRESHOLD, connectivity=CONNECTIVITY, mask=mask | |
) | |
shapes = rasterio.features.shapes(arr, mask=mask, transform=transform) | |
rows = [] | |
for geom, val in shapes: | |
poly = shape(geom) | |
if val == FIELD_RASTER_VAL: | |
simp = poly.simplify(SIMPLIFY_TOLERANCE, preserve_topology=True) | |
rows.append({"geometry": mapping(simp), "properties": {}}) | |
schema = {"geometry": "Polygon", "properties": {}} | |
with fiona.open( | |
out_gpkg, mode="w", driver="GPKG", crs=src.crs, schema=schema | |
) as dst: | |
dst.writerecords(rows) | |
def main(): | |
parser = argparse.ArgumentParser() | |
parser.add_argument("--input_dir", required=True) | |
parser.add_argument("--output_dir", required=True) | |
parser.add_argument("--method", required=True, choices=["gdal", "python"]) | |
parser.add_argument("-n", "--sample_size", type=int, default=100) | |
args = parser.parse_args() | |
os.makedirs(args.output_dir, exist_ok=True) | |
tifs = glob.glob(os.path.join(args.input_dir, "*.tif")) | |
if not tifs: | |
print(f"No .tif files found in {args.input_dir}") | |
sys.exit(1) | |
sample = random.sample(tifs, min(args.sample_size, len(tifs))) | |
print(f"Processing {len(sample)} files using method '{args.method}'...") | |
start = time.perf_counter() | |
for tif in tqdm(sample): | |
base = os.path.splitext(os.path.basename(tif))[0] | |
out_gpkg = os.path.join(args.output_dir, f"{base}.gpkg") | |
if args.method == "gdal": | |
run_gdal_pipeline(tif, out_gpkg) | |
else: | |
run_python_pipeline(tif, out_gpkg) | |
elapsed = time.perf_counter() - start | |
print( | |
f"Method '{args.method}' processed {len(sample)} files in {elapsed:.2f} seconds." | |
) | |
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Python in-memory is the winner!