jnhansen · August 3, 2018 18:59 · jnhansen · Aug 3, 2018 · FaustinCarter · Sep 12, 2018
diff --git a/xr_auto_merge.py b/xr_auto_merge.py
 import glob
 import xarray as xr
 import itertools
 import numpy as np


 def auto_merge(datasets):
     """
     Automatically merge a split xarray Dataset. This is designed to behave like
     `xarray.open_mfdataset`, except it supports concatenation along multiple
     dimensions.

     The datasets are ordered by the first coordinate value they hold along
     every dimension in which those values differ, and are then concatenated
     one dimension at a time until a single dataset remains.

     Parameters
     ----------
     datasets : str or list of str or list of xarray.Dataset
         Either a glob expression or list of paths as you would pass to
         xarray.open_mfdataset, or a list of xarray datasets. If a list of
         datasets is passed, you should make sure that they are represented
         as dask arrays to avoid reading the whole dataset into memory.

     Returns
     -------
     xarray.Dataset
         The merged dataset.

     Raises
     ------
     ValueError
         If there is nothing to merge (an empty list, or a glob expression
         that matches no file), if several datasets share the same first
         coordinate value along every dimension, or if grouping the datasets
         by their leading split dimensions yields only single-dataset groups,
         in which case no concatenation step could ever reduce their number.
     """
     # Treat `datasets` as a glob expression
     if isinstance(datasets, str):
         datasets = glob.glob(datasets)

     # Fail with a clear message instead of an IndexError on `datasets[0]`.
     if len(datasets) == 0:
         raise ValueError(
             "no datasets to merge (empty list, or glob expression without "
             "matching files)"
         )

     # Treat `datasets` as a list of file paths
     if isinstance(datasets[0], str):
         # Pass chunks={} to ensure the dataset is read as a dask array
         datasets = [xr.open_dataset(path, chunks={}) for path in datasets]

     def _first_coords(ds, dims):
         # First coordinate value of `ds` along each dimension in `dims`.
         return tuple(ds[d].values[0] for d in dims)

     def _combine_along_last_dim(datasets):
         # Determine the dimensions along which the dataset is split: those
         # where the first coordinate value is not the same in every piece.
         split_dims = [d for d in datasets[0].dims if
                       len(np.unique([ds[d].values[0] for ds in datasets])) > 1]
         if not split_dims:
             raise ValueError(
                 "cannot merge {} datasets: their first coordinate values do "
                 "not differ along any dimension".format(len(datasets))
             )

         # Concatenate along one of the split dimensions
         concat_dim = split_dims[-1]
         outer_dims = split_dims[:-1]

         # Group along the remaining dimensions. itertools.groupby only
         # groups adjacent items, hence the sort on the full key first.
         ordered = sorted(datasets,
                          key=lambda ds: _first_coords(ds, split_dims))
         groups = [
             list(members)
             for _, members in itertools.groupby(
                 ordered, key=lambda ds: _first_coords(ds, outer_dims))
         ]

         # If every group holds a single dataset, concatenating changes
         # nothing and the caller's loop would never terminate.
         if len(groups) == len(datasets):
             raise ValueError(
                 "cannot merge datasets: no two of them can be concatenated "
                 "along {!r} because each has a unique position along "
                 "{!r}".format(concat_dim, outer_dims)
             )

         # `xr.auto_combine` is deprecated and missing from newer xarray
         # releases; `xr.combine_nested` is its documented replacement when
         # the concatenation dimension is given explicitly.
         combine = getattr(xr, "auto_combine", None)
         if combine is None:
             combine = xr.combine_nested

         # Concatenate within each group.
         return [combine(group, concat_dim=concat_dim) for group in groups]

     merged = datasets
     # Each pass removes one split dimension (or raises), so this terminates.
     while len(merged) > 1:
         merged = _combine_along_last_dim(merged)

     return merged[0]
	import glob
	import xarray as xr
	import itertools
	import numpy as np


	def auto_merge(datasets):
	    """
	    Automatically merge a split xarray Dataset. This is designed to behave like
	    `xarray.open_mfdataset`, except it supports concatenation along multiple
	    dimensions.

	    The datasets are ordered by the first coordinate value they hold along
	    every dimension in which those values differ, and are then concatenated
	    one dimension at a time until a single dataset remains.

	    Parameters
	    ----------
	    datasets : str or list of str or list of xarray.Dataset
	        Either a glob expression or list of paths as you would pass to
	        xarray.open_mfdataset, or a list of xarray datasets. If a list of
	        datasets is passed, you should make sure that they are represented
	        as dask arrays to avoid reading the whole dataset into memory.

	    Returns
	    -------
	    xarray.Dataset
	        The merged dataset.

	    Raises
	    ------
	    ValueError
	        If there is nothing to merge (an empty list, or a glob expression
	        that matches no file), if several datasets share the same first
	        coordinate value along every dimension, or if grouping the datasets
	        by their leading split dimensions yields only single-dataset groups,
	        in which case no concatenation step could ever reduce their number.
	    """
	    # Treat `datasets` as a glob expression
	    if isinstance(datasets, str):
	        datasets = glob.glob(datasets)

	    # Fail with a clear message instead of an IndexError on `datasets[0]`.
	    if len(datasets) == 0:
	        raise ValueError(
	            "no datasets to merge (empty list, or glob expression without "
	            "matching files)"
	        )

	    # Treat `datasets` as a list of file paths
	    if isinstance(datasets[0], str):
	        # Pass chunks={} to ensure the dataset is read as a dask array
	        datasets = [xr.open_dataset(path, chunks={}) for path in datasets]

	    def _first_coords(ds, dims):
	        # First coordinate value of `ds` along each dimension in `dims`.
	        return tuple(ds[d].values[0] for d in dims)

	    def _combine_along_last_dim(datasets):
	        # Determine the dimensions along which the dataset is split: those
	        # where the first coordinate value is not the same in every piece.
	        split_dims = [d for d in datasets[0].dims if
	                      len(np.unique([ds[d].values[0] for ds in datasets])) > 1]
	        if not split_dims:
	            raise ValueError(
	                "cannot merge {} datasets: their first coordinate values do "
	                "not differ along any dimension".format(len(datasets))
	            )

	        # Concatenate along one of the split dimensions
	        concat_dim = split_dims[-1]
	        outer_dims = split_dims[:-1]

	        # Group along the remaining dimensions. itertools.groupby only
	        # groups adjacent items, hence the sort on the full key first.
	        ordered = sorted(datasets,
	                         key=lambda ds: _first_coords(ds, split_dims))
	        groups = [
	            list(members)
	            for _, members in itertools.groupby(
	                ordered, key=lambda ds: _first_coords(ds, outer_dims))
	        ]

	        # If every group holds a single dataset, concatenating changes
	        # nothing and the caller's loop would never terminate.
	        if len(groups) == len(datasets):
	            raise ValueError(
	                "cannot merge datasets: no two of them can be concatenated "
	                "along {!r} because each has a unique position along "
	                "{!r}".format(concat_dim, outer_dims)
	            )

	        # `xr.auto_combine` is deprecated and missing from newer xarray
	        # releases; `xr.combine_nested` is its documented replacement when
	        # the concatenation dimension is given explicitly.
	        combine = getattr(xr, "auto_combine", None)
	        if combine is None:
	            combine = xr.combine_nested

	        # Concatenate within each group.
	        return [combine(group, concat_dim=concat_dim) for group in groups]

	    merged = datasets
	    # Each pass removes one split dimension (or raises), so this terminates.
	    while len(merged) > 1:
	        merged = _combine_along_last_dim(merged)

	    return merged[0]