diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py
index 7b7697ff47fff..14eab8e4af945 100644
--- a/sklearn/feature_extraction/tests/test_text.py
+++ b/sklearn/feature_extraction/tests/test_text.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 from collections.abc import Mapping
 import re
+import unicodedata
 import warnings
 
 import pytest
@@ -98,6 +99,20 @@ def test_strip_accents():
     assert strip_accents_unicode(a) == expected
 
 
+def test_strip_accents_unicode_nfkd_inputs():
+    assert strip_accents_unicode('ñ') == 'n'
+    assert strip_accents_unicode('n' + '\u0303') == 'n'
+
+    assert strip_accents_unicode('e' + '\u0301' + '\u0308') == 'e'
+
+    pre_normalized = unicodedata.normalize('NFKD', 'é')
+    assert strip_accents_unicode(pre_normalized) == 'e'
+
+    mixed = '\u0625' + 'ñ' + '\uFF21'
+    expected = '\u0627' + 'n' + 'A'
+    assert strip_accents_unicode(mixed) == expected
+
+
 def test_to_ascii():
     # check some classical latin accentuated symbols
     a = 'àáâãäåçèéêë'
diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index bb5a9d646789c..11236d9582a01 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -130,10 +130,7 @@ def strip_accents_unicode(s):
         ASCII equivalent.
     """
     normalized = unicodedata.normalize('NFKD', s)
-    if normalized == s:
-        return s
-    else:
-        return ''.join([c for c in normalized if not unicodedata.combining(c)])
+    return ''.join([c for c in normalized if not unicodedata.combining(c)])