Skip to content

Commit

Permalink
Fix __str__ behavior
Browse files Browse the repository at this point in the history
- Adds implements_to_string decorator that makes __str__
return unicode. This fixes print statements so the output is
as expected.
  • Loading branch information
sloria committed Sep 25, 2013
1 parent 56ea4ec commit 84099ad
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 129 deletions.
1 change: 1 addition & 0 deletions HISTORY.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ Changelog
- Basic extensions framework in place. TextBlob has been refactored to make it easier to develop extensions.
- Add ``text.classifiers.PositiveNaiveBayesClassifier``.
- Update NLTK.
- Fix ``__str__`` behavior. ``print blob`` should now print expected output in both Python 2 and 3.
- *Backwards-incompatible*: All abstract base classes have been moved to the ``text.base`` module.
- *Backwards-incompatible*: ``PerceptronTagger`` will now be maintained as an extension, ``textblob-aptagger``. Instantiating a ``text.taggers.PerceptronTagger()`` will raise a ``DeprecationWarning``.

Expand Down
3 changes: 1 addition & 2 deletions text/blob.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
from text.utils import lowerstrip, PUNCTUATION_REGEX
from text.inflect import singularize as _singularize, pluralize as _pluralize
from text.mixins import BlobComparableMixin, StringlikeMixin
from text.compat import unicode, basestring, python_2_unicode_compatible
from text.compat import unicode, basestring
from text.base import (BaseNPExtractor, BaseTagger, BaseTokenizer,
BaseSentimentAnalyzer, BaseParser)
from text.np_extractors import FastNPExtractor
Expand Down Expand Up @@ -287,7 +287,6 @@ def _initialize_models(obj, tokenizer, pos_tagger,
obj.classifier = classifier


@python_2_unicode_compatible
class BaseBlob(StringlikeMixin, BlobComparableMixin):

'''An abstract base class that all text.blob classes will inherit from.
Expand Down
130 changes: 11 additions & 119 deletions text/compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,15 @@ def u(s):
imap = imap
izip = izip
import unicodecsv as csv

def implements_to_string(cls):
    '''Class decorator that makes ``__str__`` return UTF-8 encoded bytes.

    The class's text-returning ``__str__`` is kept under the name
    ``__unicode__`` and ``__str__`` is replaced by a wrapper that returns
    ``self.__unicode__().encode('utf-8')``.

    The decorator is idempotent: if ``cls.__str__`` is already a wrapper
    installed by this decorator (directly or inherited from a decorated
    base class), the class is returned unchanged. Without this guard the
    byte-returning wrapper would be copied into ``__unicode__`` and then
    encoded a second time.

    :param cls: the class to modify in place.
    :returns: the same class object.
    '''
    # The marker is set on the wrapper function below; attribute reads on
    # a method object are forwarded to the underlying function.
    if getattr(cls.__str__, '_implements_to_string', False):
        return cls

    cls.__unicode__ = cls.__str__

    def __str__(self):
        return self.__unicode__().encode('utf-8')

    __str__._implements_to_string = True
    cls.__str__ = __str__
    return cls

else: # PY3
def b(s):
return s.encode("latin-1")
Expand All @@ -35,6 +44,8 @@ def u(s):
izip = zip
import csv

def implements_to_string(x):
    '''No-op class decorator used on the Python 3 branch.

    Returns its argument unchanged, so a class decorated with it keeps
    its ``__str__`` exactly as written.
    '''
    return x


def add_metaclass(metaclass):
"""Class decorator for creating a class with a metaclass.
Expand All @@ -48,122 +59,3 @@ def wrapper(cls):
orig_vars.pop(slots_var)
return metaclass(cls.__name__, cls.__bases__, orig_vars)
return wrapper

# ======= Compatibility layer for __str__ and __repr__ from NLTK ==========

import unicodedata
import functools

def remove_accents(text):
    """Return ``text`` with combining marks removed.

    Byte strings are first decoded as ASCII. The text is then decomposed
    with NFKD normalization and every character whose Unicode category is
    ``'Mn'`` is dropped.
    """
    if isinstance(text, bytes):
        text = text.decode('ascii')

    decomposed = unicodedata.normalize('NFKD', text)
    # Bind the lookup once; the original notes a small (~10%) speedup.
    category_of = unicodedata.category
    kept = [ch for ch in decomposed if category_of(ch) != 'Mn']
    return ''.join(kept)

# Select the best transliteration method:
try:
# Older versions of Unidecode are licensed under Artistic License;
# assume an older version is installed.
from unidecode import unidecode as transliterate
except ImportError:
try:
# text-unidecode implementation is worse than Unidecode
# implementation so Unidecode is preferred.
from text_unidecode import unidecode as transliterate
except ImportError:
# This transliteration method should be enough
# for many Western languages.
transliterate = remove_accents


def python_2_unicode_compatible(klass):
    """
    Class decorator that adds ``__unicode__`` and ``unicode_repr`` to a
    class and, under Python 2, replaces ``__str__`` and ``__repr__``
    with wrappers built by ``_7bit`` (and ``_transliterated`` for
    ``__str__``).

    To support Python 2 and 3 from one code base, write ``__str__`` and
    ``__repr__`` so that they return unicode text and apply this
    decorator to the class. The original ``__str__`` and ``__repr__``
    stay reachable as ``__unicode__`` and ``unicode_repr`` under both
    Python 2 and Python 3.

    Raises ``ValueError`` when ``klass`` is not a subclass of ``object``.
    """
    if not issubclass(klass, object):
        raise ValueError("This decorator doesn't work for old-style classes")

    # Both __unicode__ and unicode_repr are public because they may be
    # useful in a console under Python 2.x.
    #
    # If __str__ or __repr__ is not overridden in a subclass, it may
    # already have been fixed by this decorator on a parent class, and
    # it must not be fixed a second time.

    str_needs_fix = not _was_fixed(klass.__str__)
    if str_needs_fix:
        klass.__unicode__ = klass.__str__
        if PY2:
            transliterated_str = _transliterated(klass.__unicode__)
            klass.__str__ = _7bit(transliterated_str)

    repr_needs_fix = not _was_fixed(klass.__repr__)
    if repr_needs_fix:
        klass.unicode_repr = klass.__repr__
        if PY2:
            klass.__repr__ = _7bit(klass.unicode_repr)

    return klass


def unicode_repr(obj):
    """
    Return a representation of ``obj``.

    When ``PY2`` is false this is plain ``repr(obj)``. When ``PY2`` is
    true:

    - objects that have a ``unicode_repr`` attribute (for example
      classes fixed with ``@python_2_unicode_compatible``) return
      ``obj.unicode_repr()``;
    - ``unicode`` strings return ``repr(obj)`` without its first
      character, i.e. without the "u" prefix, so the output matches
      Python 3.x;
    - everything else returns ``repr(obj)``.
    """
    if PY2:
        if hasattr(obj, 'unicode_repr'):
            return obj.unicode_repr()
        if isinstance(obj, unicode):
            # strip the leading "u" letter from the output
            return repr(obj)[1:]
    return repr(obj)


def _transliterated(method):
    """Wrap ``method`` so that its result is passed through ``transliterate``.

    The wrapper takes over ``__name__`` and ``__doc__`` from ``method``,
    carries over an existing ``_nltk_compat_7bit`` flag, and is tagged
    with ``_nltk_compat_transliterated = True``.
    """
    @functools.wraps(method, assigned=("__name__", "__doc__"))
    def wrapper(self):
        return transliterate(method(self))

    if hasattr(method, "_nltk_compat_7bit"):
        wrapper._nltk_compat_7bit = method._nltk_compat_7bit
    wrapper._nltk_compat_transliterated = True

    return wrapper


def _7bit(method):
    """Wrap ``method`` so that its result is encoded to ASCII bytes.

    Characters that cannot be encoded are written as backslash escapes
    (``'backslashreplace'``). The wrapper takes over ``__name__`` and
    ``__doc__`` from ``method``, carries over an existing
    ``_nltk_compat_transliterated`` flag, and is tagged with
    ``_nltk_compat_7bit = True``.
    """
    @functools.wraps(method, assigned=("__name__", "__doc__"))
    def wrapper(self):
        text = method(self)
        return text.encode('ascii', 'backslashreplace')

    if hasattr(method, "_nltk_compat_transliterated"):
        wrapper._nltk_compat_transliterated = method._nltk_compat_transliterated
    wrapper._nltk_compat_7bit = True

    return wrapper


def _was_fixed(method):
    """Report whether ``method`` carries one of the compat wrapper flags.

    Returns the value of ``_nltk_compat_7bit`` if it is truthy, otherwise
    the value of ``_nltk_compat_transliterated`` (``False`` when the
    attribute is absent).
    """
    seven_bit = getattr(method, "_nltk_compat_7bit", False)
    if seven_bit:
        return seven_bit
    return getattr(method, "_nltk_compat_transliterated", False)
13 changes: 5 additions & 8 deletions text/mixins.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
import sys
from text.compat import basestring, u
from text.compat import basestring, implements_to_string


class ComparableMixin(object):
Expand Down Expand Up @@ -46,6 +46,7 @@ def _compare(self, other, method):
return super(BlobComparableMixin, self)._compare(other, method)


@implements_to_string
class StringlikeMixin(object):

'''Make blob objects behave like Python strings.
Expand All @@ -61,18 +62,14 @@ def __repr__(self):
return "{cls}({text})".format(cls=class_name,
text=repr(self._strkey()))

def __len__(self):
'''Returns the length of the raw text.'''
return len(self._strkey())

def __str__(self):
'''Returns a string representation used in print statements
or str(my_blob).'''
return self._strkey()

def __unicode__(self):
'''Returns the unicode representation of the blob.'''
return u(self._strkey())
def __len__(self):
'''Returns the length of the raw text.'''
return len(self._strkey())

def __iter__(self):
'''Makes the object iterable as if it were a string,
Expand Down

0 comments on commit 84099ad

Please sign in to comment.