Skip to content

Commit 31a71b9

Browse files
authored
Consistently skip identical indexes in align (#972)
Also give a more informative error message identifying the offending dimension if reindex/align cannot succeed due to duplicate values along an index. Fixes GH956
1 parent 4295b93 commit 31a71b9

File tree

5 files changed

+74
-53
lines changed

5 files changed

+74
-53
lines changed

doc/whats-new.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ Enhancements
3535
using ``join=outer``. By `Guido Imperiale <https://github.com/crusaderky>`_.
3636
- Better error message when assigning variables without dimensions
3737
(:issue:`971`). By `Stephan Hoyer <https://github.com/shoyer>`_.
38+
- Better error message when reindex/align fails due to duplicate index values
39+
(:issue:`956`). By `Stephan Hoyer <https://github.com/shoyer>`_.
3840

3941
Bug fixes
4042
~~~~~~~~~
@@ -53,6 +55,8 @@ Bug fixes
5355
By `Stephan Hoyer <https://github.com/shoyer>`_.
5456
- ``groupby_bins`` sorted bin labels as strings (:issue:`952`).
5557
By `Stephan Hoyer <https://github.com/shoyer>`_.
58+
- Fix bug introduced by v0.8.0 that broke assignment to datasets when both the
59+
left and right side have the same non-unique index values (:issue:`956`).
5660

5761
.. _whats-new.0.8.1:
5862

xarray/core/alignment.py

Lines changed: 28 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -25,26 +25,6 @@ def _get_joiner(join):
2525
raise ValueError('invalid value for join: %s' % join)
2626

2727

28-
def _get_all_indexes(objects, exclude=set()):
29-
all_indexes = defaultdict(list)
30-
for obj in objects:
31-
for k, v in iteritems(obj.indexes):
32-
if k not in exclude:
33-
all_indexes[k].append(v)
34-
return all_indexes
35-
36-
37-
def _join_indexes(join, objects, exclude=set()):
38-
joiner = _get_joiner(join)
39-
indexes = _get_all_indexes(objects, exclude=exclude)
40-
# exclude dimensions with all equal indices (the usual case) to avoid
41-
# unnecessary reindexing work.
42-
# TODO: don't bother to check equals for left or right joins
43-
joined_indexes = dict((k, joiner(v)) for k, v in iteritems(indexes)
44-
if any(not v[0].equals(idx) for idx in v[1:]))
45-
return joined_indexes
46-
47-
4828
def align(*objects, **kwargs):
4929
"""align(*objects, join='inner', copy=True)
5030
@@ -87,15 +67,36 @@ def align(*objects, **kwargs):
8767
copy = kwargs.pop('copy', True)
8868
indexes = kwargs.pop('indexes', None)
8969
exclude = kwargs.pop('exclude', None)
70+
if indexes is None:
71+
indexes = {}
9072
if exclude is None:
9173
exclude = set()
9274
if kwargs:
9375
raise TypeError('align() got unexpected keyword arguments: %s'
9476
% list(kwargs))
9577

96-
joined_indexes = _join_indexes(join, objects, exclude=exclude)
97-
if indexes is not None:
98-
joined_indexes.update(indexes)
78+
all_indexes = defaultdict(list)
79+
for obj in objects:
80+
for dim, index in iteritems(obj.indexes):
81+
if dim not in exclude:
82+
all_indexes[dim].append(index)
83+
84+
# We don't join over dimensions with all equal indexes for two reasons:
85+
# - It's faster for the usual case (already aligned objects).
86+
# - It ensures it's possible to do operations that don't require alignment
87+
# on indexes with duplicate values (which cannot be reindexed with
88+
# pandas). This is useful, e.g., for overwriting such duplicate indexes.
89+
joiner = _get_joiner(join)
90+
joined_indexes = {}
91+
for dim, dim_indexes in iteritems(all_indexes):
92+
if dim in indexes:
93+
index = utils.safe_cast_to_index(indexes[dim])
94+
if any(not index.equals(other) for other in dim_indexes):
95+
joined_indexes[dim] = index
96+
else:
97+
if any(not dim_indexes[0].equals(other)
98+
for other in dim_indexes[1:]):
99+
joined_indexes[dim] = joiner(dim_indexes)
99100

100101
result = []
101102
for obj in objects:
@@ -163,6 +164,10 @@ def reindex_variables(variables, indexes, indexers, method=None,
163164
to_shape[name] = index.size
164165
if name in indexers:
165166
target = utils.safe_cast_to_index(indexers[name])
167+
if not index.is_unique:
168+
raise ValueError(
169+
'cannot reindex or align along dimension %r because the '
170+
'index has duplicate values' % name)
166171
indexer = index.get_indexer(target, method=method,
167172
**get_indexer_kwargs)
168173

xarray/core/coordinates.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,8 +63,7 @@ def to_index(self, ordered_dims=None):
6363
def update(self, other):
6464
other_vars = getattr(other, 'variables', other)
6565
coords = merge_coords([self.variables, other_vars],
66-
priority_arg=1, indexes=self.indexes,
67-
indexes_from_arg=0)
66+
priority_arg=1, indexes=self.indexes)
6867
self._update_coords(coords)
6968

7069
def _merge_raw(self, other):

xarray/core/merge.py

Lines changed: 6 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -281,8 +281,7 @@ def merge_coords_without_align(objs, priority_vars=None):
281281
return variables
282282

283283

284-
def _align_for_merge(input_objects, join, copy, indexes=None,
285-
indexes_from_arg=None):
284+
def _align_for_merge(input_objects, join, copy, indexes=None):
286285
"""Align objects for merging, recursing into dictionary values.
287286
"""
288287
if indexes is None:
@@ -314,14 +313,6 @@ def is_alignable(obj):
314313
targets.append(v)
315314
out.append(OrderedDict(variables))
316315

317-
if not targets or (len(targets) == 1 and
318-
(indexes is None or positions == [indexes_from_arg])):
319-
# Don't align if we only have one object to align and it's already
320-
# aligned to itself. This ensures it's possible to overwrite index
321-
# coordinates with non-unique values, which cannot be reindexed:
322-
# https://github.com/pydata/xarray/issues/943
323-
return input_objects
324-
325316
aligned = align(*targets, join=join, copy=copy, indexes=indexes)
326317

327318
for position, key, aligned_obj in zip(positions, keys, aligned):
@@ -366,7 +357,7 @@ def _get_priority_vars(objects, priority_arg, compat='equals'):
366357

367358

368359
def merge_coords(objs, compat='minimal', join='outer', priority_arg=None,
369-
indexes=None, indexes_from_arg=False):
360+
indexes=None):
370361
"""Merge coordinate variables.
371362
372363
See merge_core below for argument descriptions. This works similarly to
@@ -375,8 +366,7 @@ def merge_coords(objs, compat='minimal', join='outer', priority_arg=None,
375366
"""
376367
_assert_compat_valid(compat)
377368
coerced = coerce_pandas_values(objs)
378-
aligned = _align_for_merge(coerced, join=join, copy=False, indexes=indexes,
379-
indexes_from_arg=indexes_from_arg)
369+
aligned = _align_for_merge(coerced, join=join, copy=False, indexes=indexes)
380370
expanded = expand_variable_dicts(aligned)
381371
priority_vars = _get_priority_vars(aligned, priority_arg, compat=compat)
382372
variables = merge_variables(expanded, priority_vars, compat=compat)
@@ -393,7 +383,7 @@ def merge_data_and_coords(data, coords, compat='broadcast_equals',
393383

394384

395385
def merge_core(objs, compat='broadcast_equals', join='outer', priority_arg=None,
396-
explicit_coords=None, indexes=None, indexes_from_arg=False):
386+
explicit_coords=None, indexes=None):
397387
"""Core logic for merging labeled objects.
398388
399389
This is not public API.
@@ -412,10 +402,6 @@ def merge_core(objs, compat='broadcast_equals', join='outer', priority_arg=None,
412402
An explicit list of variables from `objs` that are coordinates.
413403
indexes : dict, optional
414404
Dictionary with values given by pandas.Index objects.
415-
indexes_from_arg : boolean, optional
416-
If indexes were provided, were these taken from one of the objects in
417-
``objs``? This allows us to skip alignment if this object is the only
418-
one to be aligned.
419405
420406
Returns
421407
-------
@@ -435,8 +421,7 @@ def merge_core(objs, compat='broadcast_equals', join='outer', priority_arg=None,
435421
_assert_compat_valid(compat)
436422

437423
coerced = coerce_pandas_values(objs)
438-
aligned = _align_for_merge(coerced, join=join, copy=False, indexes=indexes,
439-
indexes_from_arg=indexes_from_arg)
424+
aligned = _align_for_merge(coerced, join=join, copy=False, indexes=indexes)
440425
expanded = expand_variable_dicts(aligned)
441426

442427
coord_names, noncoord_names = determine_coords(coerced)
@@ -552,5 +537,4 @@ def dataset_merge_method(dataset, other, overwrite_vars=frozenset(),
552537

553538
def dataset_update_method(dataset, other):
554539
"""Guts of the Dataset.update method"""
555-
return merge_core([dataset, other], priority_arg=1, indexes=dataset.indexes,
556-
indexes_from_arg=0)
540+
return merge_core([dataset, other], priority_arg=1, indexes=dataset.indexes)

xarray/test/test_dataset.py

Lines changed: 35 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1066,20 +1066,29 @@ def test_align(self):
10661066
align(left, right, foo='bar')
10671067

10681068
def test_align_exclude(self):
1069-
x = Dataset({'foo': DataArray([[1, 2],[3, 4]], dims=['x', 'y'], coords={'x': [1, 2], 'y': [3, 4]})})
1070-
y = Dataset({'bar': DataArray([[1, 2],[3, 4]], dims=['x', 'y'], coords={'x': [1, 3], 'y': [5, 6]})})
1069+
x = Dataset({'foo': DataArray([[1, 2],[3, 4]], dims=['x', 'y'],
1070+
coords={'x': [1, 2], 'y': [3, 4]})})
1071+
y = Dataset({'bar': DataArray([[1, 2],[3, 4]], dims=['x', 'y'],
1072+
coords={'x': [1, 3], 'y': [5, 6]})})
10711073
x2, y2 = align(x, y, exclude=['y'], join='outer')
10721074

1073-
expected_x2 = Dataset({'foo': DataArray([[1, 2], [3, 4], [np.nan, np.nan]], dims=['x', 'y'], coords={'x': [1, 2, 3], 'y': [3, 4]})})
1074-
expected_y2 = Dataset({'bar': DataArray([[1, 2], [np.nan, np.nan], [3, 4]], dims=['x', 'y'], coords={'x': [1, 2, 3], 'y': [5, 6]})})
1075+
expected_x2 = Dataset(
1076+
{'foo': DataArray([[1, 2], [3, 4], [np.nan, np.nan]],
1077+
dims=['x', 'y'],
1078+
coords={'x': [1, 2, 3], 'y': [3, 4]})})
1079+
expected_y2 = Dataset(
1080+
{'bar': DataArray([[1, 2], [np.nan, np.nan], [3, 4]],
1081+
dims=['x', 'y'],
1082+
coords={'x': [1, 2, 3], 'y': [5, 6]})})
10751083
self.assertDatasetIdentical(expected_x2, x2)
10761084
self.assertDatasetIdentical(expected_y2, y2)
10771085

10781086
def test_align_nocopy(self):
10791087
x = Dataset({'foo': DataArray([1, 2, 3], coords={'x': [1, 2, 3]})})
10801088
y = Dataset({'foo': DataArray([1, 2], coords={'x': [1, 2]})})
10811089
expected_x2 = x
1082-
expected_y2 = Dataset({'foo': DataArray([1, 2, np.nan], coords={'x': [1, 2, 3]})})
1090+
expected_y2 = Dataset({'foo': DataArray([1, 2, np.nan],
1091+
coords={'x': [1, 2, 3]})})
10831092

10841093
x2, y2 = align(x, y, copy=False, join='outer')
10851094
self.assertDatasetIdentical(expected_x2, x2)
@@ -1097,6 +1106,15 @@ def test_align_indexes(self):
10971106
expected_x2 = Dataset({'foo': DataArray([2, 3, 1], coords={'x': [2, 3, 1]})})
10981107
self.assertDatasetIdentical(expected_x2, x2)
10991108

1109+
def test_align_non_unique(self):
1110+
x = Dataset({'foo': ('x', [3, 4, 5]), 'x': [0, 0, 1]})
1111+
x1, x2 = align(x, x)
1112+
assert x1.identical(x) and x2.identical(x)
1113+
1114+
y = Dataset({'bar': ('x', [6, 7]), 'x': [0, 1]})
1115+
with self.assertRaisesRegexp(ValueError, 'cannot reindex or align'):
1116+
align(x, y)
1117+
11001118
def test_broadcast(self):
11011119
ds = Dataset({'foo': 0, 'bar': ('x', [1]), 'baz': ('y', [2, 3])},
11021120
{'c': ('x', [4])})
@@ -1573,7 +1591,7 @@ def test_assign(self):
15731591
expected = expected.set_coords('c')
15741592
self.assertDatasetIdentical(actual, expected)
15751593

1576-
def test_setitem_non_unique_index(self):
1594+
def test_setitem_original_non_unique_index(self):
15771595
# regression test for GH943
15781596
original = Dataset({'data': ('x', np.arange(5))},
15791597
coords={'x': [0, 1, 2, 0, 1]})
@@ -1591,6 +1609,17 @@ def test_setitem_non_unique_index(self):
15911609
actual.coords['x'] = list(range(5))
15921610
self.assertDatasetIdentical(actual, expected)
15931611

1612+
def test_setitem_both_non_unique_index(self):
1613+
# regression test for GH956
1614+
names = ['joaquin', 'manolo', 'joaquin']
1615+
values = np.random.randint(0, 256, (3, 4, 4))
1616+
array = DataArray(values, dims=['name', 'row', 'column'],
1617+
coords=[names, range(4), range(4)])
1618+
expected = Dataset({'first': array, 'second': array})
1619+
actual = array.rename('first').to_dataset()
1620+
actual['second'] = array
1621+
self.assertDatasetIdentical(expected, actual)
1622+
15941623
def test_delitem(self):
15951624
data = create_test_data()
15961625
all_items = set(data)

0 commit comments

Comments
 (0)