Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions sklearn/feature_extraction/tests/test_text.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
from collections.abc import Mapping
import re
import unicodedata
import warnings

import pytest
Expand Down Expand Up @@ -98,6 +99,20 @@ def test_strip_accents():
assert strip_accents_unicode(a) == expected


def test_strip_accents_unicode_nfkd_inputs():
    """strip_accents_unicode must also strip input that is already decomposed.

    Each case pairs an input string with the accent-free text expected back.
    """
    cases = [
        # The literal accented letter, then the same letter spelled as a
        # base character followed by the combining mark U+0303.
        ('ñ', 'n'),
        ('n' + '\u0303', 'n'),
        # Two combining marks stacked on a single base letter.
        ('e' + '\u0301' + '\u0308', 'e'),
        # Text that went through NFKD normalization before the call.
        (unicodedata.normalize('NFKD', 'é'), 'e'),
        # One string mixing U+0625, an accented Latin letter and U+FF21;
        # the expected result is U+0627, 'n' and a plain 'A'.
        ('\u0625' + 'ñ' + '\uFF21', '\u0627' + 'n' + 'A'),
    ]
    for text, stripped in cases:
        assert strip_accents_unicode(text) == stripped


def test_to_ascii():
# check some classical latin accentuated symbols
a = 'àáâãäåçèéêë'
Expand Down
5 changes: 1 addition & 4 deletions sklearn/feature_extraction/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,10 +130,7 @@ def strip_accents_unicode(s):
ASCII equivalent.
"""
normalized = unicodedata.normalize('NFKD', s)
if normalized == s:
return s
else:
return ''.join([c for c in normalized if not unicodedata.combining(c)])
return ''.join([c for c in normalized if not unicodedata.combining(c)])


def strip_accents_ascii(s):
Expand Down