|
| 1 | +# -*- coding: utf-8 -*- |
| 2 | + |
1 | 3 | """
|
2 |
| - initialization package imports |
| 4 | +Georgian Language Toolkit for Python 3 |
| 5 | +
|
| 6 | +Source: <https://github.com/Lh4cKg/simple-geolang-toolkit> |
3 | 7 | """
|
4 | 8 |
|
5 |
| -from geolang.geolang import ( |
6 |
| - __author__, |
7 |
| - __version__, |
8 |
| - encode_slugify, |
9 |
| - encode_text, |
10 |
| - GeoLangToolKit, |
11 |
| -) |
| 9 | +import re |
| 10 | +from typing import Dict, Any, Iterable, Union, List, Tuple |
| 11 | +from functools import lru_cache, partial |
| 12 | +from unicodedata import normalize |
| 13 | + |
| 14 | + |
| 15 | +__author__ = 'Lasha Gogua' |
| 16 | +__email__ = 'Lh4cKg@gmail.com' |
| 17 | +__version__ = '0.2.1' |
| 18 | + |
| 19 | +__all__ = ['GeoLangToolKit', 'encode_slugify', 'encode_text'] |
| 20 | + |
| 21 | + |
class GeoLangToolKit(object):
    """Transliterate text between the Georgian and Latin scripts.

    Each letter of ``ka_script`` is paired, by position, with one entry of
    ``latin_script``.  The default Latin script follows the national
    romanization system *without* its apostrophes, so several Georgian
    letters share one Latin spelling:

        თ, ტ - t        კ, ქ - k        პ, ფ - p
        ც, წ - ts       ჩ, ჭ - ch
        ჟ - zh   ღ - gh   ყ - q   შ - sh   ძ - dz   ხ - kh

    Because of those shared spellings the Latin -> Georgian direction is
    lossy: when two Georgian letters map to the same Latin entry, the letter
    that appears later in ``ka_script`` wins (``t`` -> ტ, ``k`` -> ქ,
    ``p`` -> ფ).
    """

    def __init__(
            self,
            latin_script: Union[str, List[str], Tuple[str, ...], None] = None
    ) -> None:
        """
        :param latin_script: Latin replacements, one per Georgian letter and
            in the order of ``ka_script``.  Either a list/tuple (used as
            given, without validation) or a comma separated string.
            ``None`` selects the default script described on the class.

        :raises ValueError: if a comma separated string holds fewer entries
            than there are Georgian letters (33).
        """
        self.ka_script: str = 'აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰ'

        if isinstance(latin_script, (list, tuple)):
            script = latin_script
        elif isinstance(latin_script, str):
            # Split first: the requirement is on the number of entries,
            # not on the number of characters in the raw string.
            script = latin_script.split(',')
            if len(script) < len(self.ka_script):
                raise ValueError(
                    'Wrong latin script characters, available list, '
                    'tuple or comma separated string, '
                    'at least 33 entries required.'
                )
        else:
            script = (
                'a', 'b', 'g', 'd', 'e', 'v', 'z', 't', 'i', 'k', 'l', 'm',
                'n', 'o', 'p', 'zh', 'r', 's', 't', 'u', 'p', 'k', 'gh', 'q',
                'sh', 'ch', 'ts', 'dz', 'ts', 'ch', 'kh', 'j', 'h'
            )
        self.latin_script: Iterable[str] = script

    @property
    def ka2lat_map(self) -> Dict[str, str]:
        """
        :return characters map of georgian to latin

        Built on first access and then stored on the instance, so the cache
        lives and dies with the object instead of pinning it in a global
        ``lru_cache``.
        """
        mapping = self.__dict__.get('_ka2lat_map')
        if mapping is None:
            mapping = dict(zip(self.ka_script, self.latin_script))
            self.__dict__['_ka2lat_map'] = mapping
        return mapping

    @property
    def lat2ka_map(self) -> Dict[str, str]:
        """
        :return characters map of latin to georgian

        Duplicate Latin entries collapse to a single key; the Georgian
        letter paired with the last duplicate is the one kept.  Built on
        first access and then stored on the instance.
        """
        mapping = self.__dict__.get('_lat2ka_map')
        if mapping is None:
            mapping = dict(zip(self.latin_script, self.ka_script))
            self.__dict__['_lat2ka_map'] = mapping
        return mapping

    @staticmethod
    def _to_text(value: Any) -> str:
        """Return *value* as ``str``; ``bytes`` are decoded as strict UTF-8."""
        if isinstance(value, bytes):
            return str(value, 'utf-8', 'strict')
        return str(value)

    @staticmethod
    def _transliterate(
            value: str,
            mapping: Dict[str, str],
            na_value: Union[str, None]) -> str:
        """Replace every character of *value* through *mapping*.

        A character without a mapping is kept unchanged when *na_value* is
        ``None`` and replaced by *na_value* otherwise.  The test is
        ``is None`` so that an empty string can be used to drop unknown
        characters.
        """
        if na_value is None:
            return ''.join(mapping.get(char, char) for char in value)
        return ''.join(mapping.get(char, na_value) for char in value)

    def lat2ka(self, value: str, na_value: str = None) -> str:
        """
        convert the given string from latin into georgian chars

        The lookup is done one character at a time, so multi-letter entries
        of the latin script (``sh``, ``ch``, ...) are never matched here.

        :param value: Georgian or Latin text
        :param na_value: N/A value if could not find character, default None
            (keep the original character).

        :return converted text

        >>> # example
        >>> self.lat2ka('laSas uyvars ana da piToni lol ))')
        "ლაSას უyვარს ანა და ფიTონი ლოლ ))"
        """

        return self._transliterate(value, self.lat2ka_map, na_value)

    def ka2lat(self, value: str, na_value: str = None) -> str:
        """
        convert the given name from georgian into latin chars

        :param value: Georgian or Latin text
        :param na_value: N/A value if could not find character, default None
            (keep the original character).

        :return converted text

        >>> # example
        >>> self.ka2lat('მე მიყვარს ანა!')
        "me miqvars ana!"
        """

        return self._transliterate(value, self.ka2lat_map, na_value)

    def _replace_str(self, ka2latin: bool, match: re.Match) -> str:
        """
        ``re.sub`` callback: return the latin replacement of the matched
        character when *ka2latin* is true and a mapping exists, otherwise
        return the matched text unchanged.
        """

        char = match.group()

        if ka2latin and char in self.ka2lat_map:
            return self.ka2lat_map[char]

        return char

    @staticmethod
    def _slugify(value: Any) -> str:
        """
        Converts to ASCII. Converts spaces to hyphens.
        Removes characters that
        aren't alphanumerics, underscores, or hyphens.
        Also strips leading and trailing whitespace.

        Non-ASCII characters that survive NFKD decomposition are dropped,
        not transliterated.
        """

        s = GeoLangToolKit._to_text(value)
        s = normalize('NFKD', s).encode('ascii', 'ignore').decode('ascii')
        return re.sub(r'[-\s]+', '-', re.sub(r'[^\w\s-]', '', s).strip())

    def encode_slugify(self, value: str, ka2latin: bool = False) -> str:
        """
        Build a slug from *value*.

        Convert Georgian letters to latin if 'ka2latin' is True.
        Remove characters that aren't (Unicode) alphanumerics, underscores,
        whitespace or hyphens.
        Strip leading and trailing whitespace and convert to lowercase.
        Collapse every run of whitespace and hyphens into a single hyphen.

        :param value: Georgian or Latin text (``bytes`` are decoded as UTF-8)
        :param ka2latin: if True, value with Georgian letters will be converted
            to Latin letters, default False.
        :return: the slug

        >>> encode_slugify("მე'მიყვარს-ანი და ის/ჩემი ცხოვბრებაა! ჩ", True)
        "memiqvars-ani-da-ischemi-tskhovbrebaa-ch"
        >>> encode_slugify("პითონი და ჯანგო")
        "პითონი-და-ჯანგო"
        >>> encode_slugify("adé jcà lr'huété")  # non-georgian letters are kept
        "adé-jcà-lrhuété"
        >>> encode_slugify("更新时间")  # non-georgian letters are kept
        "更新时间"
        """

        s = self._to_text(value)
        if ka2latin:
            # Same per-character lookup as ka2lat(); unmapped characters
            # pass through untouched.
            s = self._transliterate(s, self.ka2lat_map, None)

        return re.sub(r'[-\s]+', '-', re.sub(r'[^\w\s-]', '', s).strip().lower())

    def encode_text(
            self,
            value: str,
            ka2latin: bool = True,
            latin2ka: bool = False,
            na_value: str = None) -> str:
        """
        Transliterate *value* in the requested direction.

        When both flags are true, ``latin2ka`` takes precedence.

        :param value: Georgian or Latin text (``bytes`` are decoded as UTF-8)
        :param ka2latin: if True, value with Georgian letters will be converted
            to Latin letters, default True.
        :param latin2ka: if True, value with Latin letters will be converted
            to Georgian letters, default False.
        :param na_value: N/A value if could not find character, default None.
        :return: georgian or latin letters
        :raises ValueError: if both ``ka2latin`` and ``latin2ka`` are false.
        """

        if not ka2latin and not latin2ka:
            raise ValueError(
                'Missing required argument, '
                'Choose one `ka2latin` or `latin2ka`'
            )

        value = self._to_text(value)

        if latin2ka:
            return self.lat2ka(value, na_value)

        return self.ka2lat(value, na_value)


# Default toolkit (national script without apostrophes) backing the
# module-level convenience functions below.
instance: GeoLangToolKit = GeoLangToolKit()
encode_slugify = instance.encode_slugify
encode_text = instance.encode_text
0 commit comments