Fix printing of unicode attributes on Python 2 (#914)

shoyer · web-flow · commit 70611a930f82 · 2016-07-23T19:55:40.000-07:00
* Fix printing of unicode attributes on Python 2

Fixes GH834

* Add What's New entry

* Add ground truth for test_unicode_data
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
@@ -98,7 +98,10 @@ Bug fixes
 - ``Variable.copy(deep=True)`` no longer converts MultiIndex into a base Index
   (:issue:`769`). By `Benoit Bovy <https://github.com/benbovy>`_.
 
-- Fixed incorrect test for dask version :issue:`891`. By
+- Fix printing datasets with unicode attributes on Python 2 (:issue:`892`). By
+  `Stephan Hoyer <https://github.com/shoyer>`_.
+
+- Fixed incorrect test for dask version (:issue:`891`). By
   `Stephan Hoyer <https://github.com/shoyer>`_.
 
 - :py:func:`~xarray.plot.contour` now plots the correct number of contours
diff --git a/xarray/core/common.py b/xarray/core/common.py
@@ -109,7 +109,7 @@ def wrapped_func(self, **kwargs):
         return wrapped_func
 
 
-class AbstractArray(ImplementsArrayReduce):
+class AbstractArray(ImplementsArrayReduce, formatting.ReprMixin):
     def __bool__(self):
         return bool(self.values)
 
diff --git a/xarray/core/coordinates.py b/xarray/core/coordinates.py
@@ -37,7 +37,7 @@ def _dim_shape(var):
     return [(dim, size) for dim, size in zip(var.dims, var.shape)]
 
 
-class AbstractCoordinates(Mapping):
+class AbstractCoordinates(Mapping, formatting.ReprMixin):
     def __getitem__(self, key):
         if (key in self._names or
             (isinstance(key, basestring) and
@@ -62,7 +62,7 @@ def __len__(self):
     def __contains__(self, key):
         return key in self._names
 
-    def __repr__(self):
+    def __unicode__(self):
         return formatting.coords_repr(self)
 
     @property
@@ -219,7 +219,7 @@ def __delitem__(self, key):
         del self._data._coords[key]
 
 
-class Indexes(Mapping):
+class Indexes(Mapping, formatting.ReprMixin):
     def __init__(self, source):
         self._source = source
 
@@ -238,5 +238,5 @@ def __getitem__(self, key):
         else:
             raise KeyError(key)
 
-    def __repr__(self):
+    def __unicode__(self):
         return formatting.indexes_repr(self)
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
@@ -105,7 +105,7 @@ def as_dataset(obj):
     return obj
 
 
-class DataVariables(Mapping):
+class DataVariables(Mapping, formatting.ReprMixin):
     def __init__(self, dataset):
         self._dataset = dataset
 
@@ -126,7 +126,7 @@ def __getitem__(self, key):
         else:
             raise KeyError(key)
 
-    def __repr__(self):
+    def __unicode__(self):
         return formatting.vars_repr(self)
 
 
@@ -140,7 +140,8 @@ def __getitem__(self, key):
         return self.dataset.sel(**key)
 
 
-class Dataset(Mapping, ImplementsDatasetReduce, BaseDataObject):
+class Dataset(Mapping, ImplementsDatasetReduce, BaseDataObject,
+              formatting.ReprMixin):
     """A multi-dimensional, in memory, array database.
 
     A dataset resembles an in-memory representation of a NetCDF file, and
@@ -810,7 +811,7 @@ def to_netcdf(self, path=None, mode='w', format=None, group=None,
     dump = utils.function_alias(to_netcdf, 'dump')
     dumps = utils.function_alias(to_netcdf, 'dumps')
 
-    def __repr__(self):
+    def __unicode__(self):
         return formatting.dataset_repr(self)
 
     @property
diff --git a/xarray/core/formatting.py b/xarray/core/formatting.py
@@ -1,23 +1,33 @@
+"""String formatting routines for __repr__.
+
+All internal formatting is done with unicode strings, which classes return
+from the __unicode__ special method. ReprMixin builds __repr__ on top of it,
+encoding the result to UTF-8 bytes on Python 2 (see ensure_valid_repr).
+"""
 from datetime import datetime, timedelta
 import functools
 
 import numpy as np
 import pandas as pd
 
 from .options import OPTIONS
-from .pycompat import iteritems, unicode_type, bytes_type, dask_array_type
+from .pycompat import PY2, iteritems, unicode_type, bytes_type, dask_array_type
 
 
 def pretty_print(x, numchars):
     """Given an object `x`, call `str(x)` and format the returned string so
     that it is numchars long, padding with trailing spaces or truncating with
     ellipses as necessary
     """
-    s = str(x)
-    if len(s) > numchars:
-        return s[:(numchars - 3)] + '...'
-    else:
-        return s + ' ' * (numchars - len(s))
+    s = maybe_truncate(x, numchars)
+    return s + ' ' * max(numchars - len(s), 0)
+
+
+def maybe_truncate(obj, maxlen=500):
+    s = unicode_type(obj)
+    if len(s) > maxlen:
+        s = s[:(maxlen - 3)] + u'...'
+    return s
 
 
 def wrap_indent(text, start='', length=None):
@@ -27,6 +37,23 @@ def wrap_indent(text, start='', length=None):
     return start + indent.join(x for x in text.splitlines())
 
 
+def ensure_valid_repr(string):
+    """Ensure that the given value is valid for the result of __repr__.
+
+    On Python 2, this means we need to convert unicode to bytes. We won't need
+    this function once we drop Python 2.7 support.
+    """
+    if PY2 and isinstance(string, unicode):
+        string = string.encode('utf-8')
+    return string
+
+
+class ReprMixin(object):
+    """Mixin that defines __repr__ for a class that already has __unicode__."""
+    def __repr__(self):
+        return ensure_valid_repr(self.__unicode__())
+
+
 def _get_indexer_at_least_n_items(shape, n_desired):
     assert 0 < n_desired <= np.prod(shape)
     cum_items = np.cumprod(shape[::-1])
@@ -58,7 +85,7 @@ def first_n_items(x, n_desired):
 
 def format_timestamp(t):
     """Cast given object to a Timestamp and return a nicely formatted string"""
-    datetime_str = str(pd.Timestamp(t))
+    datetime_str = unicode_type(pd.Timestamp(t))
     try:
         date_str, time_str = datetime_str.split()
     except ValueError:
@@ -73,7 +100,7 @@ def format_timestamp(t):
 
 def format_timedelta(t, timedelta_format=None):
     """Cast given object to a Timestamp and return a nicely formatted string"""
-    timedelta_str = str(pd.Timedelta(t))
+    timedelta_str = unicode_type(pd.Timedelta(t))
     try:
         days_str, time_str = timedelta_str.split(' days ')
     except ValueError:
@@ -97,9 +124,9 @@ def format_item(x, timedelta_format=None, quote_strings=True):
     elif isinstance(x, (unicode_type, bytes_type)):
         return repr(x) if quote_strings else x
     elif isinstance(x, (float, np.float)):
-        return '{0:.4}'.format(x)
+        return u'{0:.4}'.format(x)
     else:
-        return str(x)
+        return unicode_type(x)
 
 
 def format_items(x):
@@ -135,27 +162,30 @@ def format_array_flat(items_ndarray, max_width):
     cum_len = np.cumsum([len(s) + 1 for s in pprint_items]) - 1
     if (max_possibly_relevant < items_ndarray.size or
             (cum_len > max_width).any()):
-        end_padding = ' ...'
+        end_padding = u' ...'
         count = max(np.argmax((cum_len + len(end_padding)) > max_width), 1)
         pprint_items = pprint_items[:count]
     else:
-        end_padding = ''
+        end_padding = u''
 
-    pprint_str = ' '.join(pprint_items) + end_padding
+    pprint_str = u' '.join(pprint_items) + end_padding
     return pprint_str
 
 
 def _summarize_var_or_coord(name, var, col_width, show_values=True,
                             marker=' ', max_width=None):
     if max_width is None:
         max_width = OPTIONS['display_width']
-    first_col = pretty_print('  %s %s ' % (marker, name), col_width)
-    dims_str = '(%s) ' % ', '.join(map(str, var.dims)) if var.dims else ''
-    front_str = first_col + dims_str + ('%s ' % var.dtype)
+    first_col = pretty_print(u'  %s %s ' % (marker, name), col_width)
+    if var.dims:
+        dims_str = u'(%s) ' % u', '.join(map(unicode_type, var.dims))
+    else:
+        dims_str = u''
+    front_str = u'%s%s%s ' % (first_col, dims_str, var.dtype)
     if show_values:
         values_str = format_array_flat(var, max_width - len(front_str))
     else:
-        values_str = '...'
+        values_str = u'...'
     return front_str + values_str
 
 
@@ -177,79 +207,73 @@ def summarize_var(name, var, col_width):
 def summarize_coord(name, var, col_width):
     is_index = name in var.dims
     show_values = is_index or _not_remote(var)
-    marker = '*' if is_index else ' '
+    marker = u'*' if is_index else u' '
     return _summarize_var_or_coord(name, var, col_width, show_values, marker)
 
 
-def _maybe_truncate(obj, maxlen=500):
-    s = str(obj)
-    if len(s) > maxlen:
-        s = s[:(maxlen - 3)] + '...'
-    return s
-
-
 def summarize_attr(key, value, col_width=None):
     # ignore col_width for now to more clearly distinguish attributes
-    return '    %s: %s' % (key, _maybe_truncate(value))
+    return u'    %s: %s' % (key, maybe_truncate(value))
 
 
-EMPTY_REPR = '    *empty*'
+EMPTY_REPR = u'    *empty*'
 
 
 def _calculate_col_width(mapping):
-    max_name_length = max(len(str(k)) for k in mapping) if mapping else 0
+    max_name_length = (max(len(unicode_type(k)) for k in mapping)
+                       if mapping else 0)
     col_width = max(max_name_length, 7) + 6
     return col_width
 
 
 def _mapping_repr(mapping, title, summarizer, col_width=None):
     if col_width is None:
         col_width = _calculate_col_width(mapping)
-    summary = ['%s:' % title]
+    summary = [u'%s:' % title]
     if mapping:
         summary += [summarizer(k, v, col_width) for k, v in mapping.items()]
     else:
         summary += [EMPTY_REPR]
-    return '\n'.join(summary)
+    return u'\n'.join(summary)
 
 
-coords_repr = functools.partial(_mapping_repr, title='Coordinates',
+coords_repr = functools.partial(_mapping_repr, title=u'Coordinates',
                                 summarizer=summarize_coord)
 
 
-vars_repr = functools.partial(_mapping_repr, title='Data variables',
+vars_repr = functools.partial(_mapping_repr, title=u'Data variables',
                               summarizer=summarize_var)
 
 
-attrs_repr = functools.partial(_mapping_repr, title='Attributes',
+attrs_repr = functools.partial(_mapping_repr, title=u'Attributes',
                                summarizer=summarize_attr)
 
 
 def indexes_repr(indexes):
     summary = []
     for k, v in indexes.items():
         summary.append(wrap_indent(repr(v), '%s: ' % k))
-    return '\n'.join(summary)
+    return u'\n'.join(summary)
 
 
 def array_repr(arr):
     # used for DataArray, Variable and Coordinate
     if hasattr(arr, 'name') and arr.name is not None:
         name_str = '%r ' % arr.name
     else:
-        name_str = ''
-    dim_summary = ', '.join('%s: %s' % (k, v) for k, v
+        name_str = u''
+    dim_summary = u', '.join(u'%s: %s' % (k, v) for k, v
                             in zip(arr.dims, arr.shape))
 
-    summary = ['<xarray.%s %s(%s)>'
+    summary = [u'<xarray.%s %s(%s)>'
                % (type(arr).__name__, name_str, dim_summary)]
 
     if isinstance(getattr(arr, 'variable', arr)._data, dask_array_type):
         summary.append(repr(arr.data))
     elif arr._in_memory or arr.size < 1e5:
         summary.append(repr(arr.values))
     else:
-        summary.append('[%s values with dtype=%s]' % (arr.size, arr.dtype))
+        summary.append(u'[%s values with dtype=%s]' % (arr.size, arr.dtype))
 
     if hasattr(arr, 'coords'):
         if arr.coords:
@@ -258,21 +282,21 @@ def array_repr(arr):
     if arr.attrs:
         summary.append(attrs_repr(arr.attrs))
 
-    return '\n'.join(summary)
+    return u'\n'.join(summary)
 
 
 def dataset_repr(ds):
-    summary = ['<xarray.%s>' % type(ds).__name__]
+    summary = [u'<xarray.%s>' % type(ds).__name__]
 
     col_width = _calculate_col_width(ds)
 
-    dims_start = pretty_print('Dimensions:', col_width)
-    all_dim_strings = ['%s: %s' % (k, v) for k, v in iteritems(ds.dims)]
-    summary.append('%s(%s)' % (dims_start, ', '.join(all_dim_strings)))
+    dims_start = pretty_print(u'Dimensions:', col_width)
+    all_dim_strings = [u'%s: %s' % (k, v) for k, v in iteritems(ds.dims)]
+    summary.append(u'%s(%s)' % (dims_start, ', '.join(all_dim_strings)))
 
     summary.append(coords_repr(ds.coords, col_width=col_width))
     summary.append(vars_repr(ds.data_vars, col_width=col_width))
     if ds.attrs:
         summary.append(attrs_repr(ds.attrs))
 
-    return '\n'.join(summary)
+    return u'\n'.join(summary)
diff --git a/xarray/core/pycompat.py b/xarray/core/pycompat.py
@@ -1,5 +1,6 @@
 import sys
 
+PY2 = sys.version_info[0] < 3
 PY3 = sys.version_info[0] >= 3
 
 if PY3:  # pragma: no cover
diff --git a/xarray/test/test_dataset.py b/xarray/test/test_dataset.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 from copy import copy, deepcopy
 from textwrap import dedent
 try:
@@ -16,7 +17,7 @@
                     DataArray, Variable, Coordinate, auto_combine,
                     open_dataset, set_options)
 from xarray.core import indexing, utils
-from xarray.core.pycompat import iteritems, OrderedDict
+from xarray.core.pycompat import iteritems, OrderedDict, unicode_type
 
 from . import (TestCase, unittest, InaccessibleArray, UnexpectedDataAccess,
                requires_dask)
@@ -114,6 +115,23 @@ def test_repr_period_index(self):
         # check that creating the repr doesn't raise an error #GH645
         repr(data)
 
+    def test_unicode_data(self):
+        # regression test for GH834
+        data = Dataset({u'foø': [u'ba®']}, attrs={u'å': u'∑'})
+        repr(data)  # should not raise
+
+        expected = dedent(u"""\
+        <xarray.Dataset>
+        Dimensions:  (foø: 1)
+        Coordinates:
+          * foø      (foø) <U3 %r
+        Data variables:
+            *empty*
+        Attributes:
+            å: ∑""" % u'ba®')
+        actual = unicode_type(data)
+        self.assertEqual(expected, actual)
+
     def test_constructor(self):
         x1 = ('x', 2 * np.arange(100))
         x2 = ('x', np.arange(1000))
diff --git a/xarray/test/test_formatting.py b/xarray/test/test_formatting.py