Created
August 3, 2018 18:59
-
-
Save jnhansen/fa474a536201561653f60ea33045f4e2 to your computer and use it in GitHub Desktop.
Auto-merge xarray datasets along multiple dimensions
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import glob
import itertools

import numpy as np
import xarray as xr
def auto_merge(datasets):
    """
    Automatically merge a split xarray Dataset.

    This is designed to behave like `xarray.open_mfdataset`, except it
    supports concatenation along multiple dimensions.

    Parameters
    ----------
    datasets : str or list of str or list of xarray.Dataset
        Either a glob expression or list of paths as you would pass to
        xarray.open_mfdataset, or a list of xarray datasets. If a list of
        datasets is passed, you should make sure that they are represented
        as dask arrays to avoid reading the whole dataset into memory.

    Returns
    -------
    xarray.Dataset
        The merged dataset. If exactly one dataset is given it is returned
        unchanged.

    Raises
    ------
    ValueError
        If there is nothing to merge (empty list, or a glob expression that
        matches no files), or if the pieces cannot be reduced to a single
        dataset by concatenating along the dimensions on which their first
        coordinate values differ.
    """
    # Treat `datasets` as a glob expression; otherwise materialise the
    # input so that it can be indexed and measured below.
    if isinstance(datasets, str):
        datasets = glob.glob(datasets)
    else:
        datasets = list(datasets)

    if not datasets:
        raise ValueError("auto_merge: no datasets to merge")

    # Treat `datasets` as a list of file paths
    if isinstance(datasets[0], str):
        # Pass chunks={} to ensure the dataset is read as a dask array
        datasets = [xr.open_dataset(path, chunks={}) for path in datasets]

    def _first_coords(ds, dims):
        # The first coordinate value along each of `dims` is used to tell
        # the pieces apart and to order them.
        return tuple(ds[d].values[0] for d in dims)

    def _combine_along_last_dim(pieces):
        # Determine the dimensions along which the dataset is split: those
        # whose first coordinate value is not the same in every piece.
        split_dims = [
            d for d in pieces[0].dims
            if len(np.unique([ds[d].values[0] for ds in pieces])) > 1
        ]
        if not split_dims:
            raise ValueError(
                "auto_merge: cannot determine a dimension to concatenate "
                "along; all datasets start at the same coordinates"
            )

        # Concatenate along one of the split dimensions
        concat_dim = split_dims[-1]
        group_dims = split_dims[:-1]

        # Group along the remaining dimensions and concatenate within each
        # group. groupby only merges adjacent items, so sort first; sorting
        # by all split dimensions also orders each group along concat_dim.
        ordered = sorted(
            pieces, key=lambda ds: _first_coords(ds, split_dims)
        )
        combined = []
        for _, group in itertools.groupby(
            ordered, key=lambda ds: _first_coords(ds, group_dims)
        ):
            # groupby yields a one-shot iterator; materialise it.
            members = list(group)
            if len(members) == 1:
                combined.append(members[0])
            else:
                # xr.concat is used instead of the legacy xr.auto_combine.
                # NOTE(review): this assumes all pieces in a group hold the
                # same variables -- confirm for your data.
                combined.append(xr.concat(members, dim=concat_dim))
        return combined

    merged = datasets
    while len(merged) > 1:
        combined = _combine_along_last_dim(merged)
        if len(combined) >= len(merged):
            # Every group contained a single piece, so another pass would
            # produce the same result again: stop instead of looping forever.
            raise ValueError(
                "auto_merge: datasets do not form a regular grid and "
                "cannot be merged"
            )
        merged = combined
    return merged[0]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi @ShukhratSh, this function was written for an older version of xarray. In newer versions, combining datasets along multiple dimensions is supported natively through `xarray.combine_by_coords()`.