# ellyChar.py

#!/usr/bin/python3
# -*- coding: utf-8 -*-
#
# PyElly - rule-based tool for analyzing natural language (Python v3.8)
#
# ellyChar.py : 17nov2019 CPM
# ------------------------------------------------------------------------------
# Copyright (c) 2019, Clinton Prentiss Mah
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#   Redistributions of source code must retain the above copyright notice, this
#   list of conditions and the following disclaimer.
#
#   Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# -----------------------------------------------------------------------------

"""
for defining PyElly subset of Unicode for input text
"""

# single-character constants referenced by the predicates and scanner below

DOT = '.'         # Unicode period
COM = ','         # Unicode comma
COL = ':'         # Unicode colon
AST = '*'         # Unicode asterisk
APO = chr(39)     # Unicode apostrophe
APX = '\u2019'    # Unicode formatted apostrophe
USC = '_'         # Unicode underscore
LBR = '['         # Unicode left bracket
RBR = ']'         # Unicode right bracket
SLA = '/'         # Unicode slash
BSL = '\\'        # Unicode backslash
SPC = ' '         # Unicode space
AMP = '&'         # Unicode ampersand
HYM = '-'         # Unicode minus, hyphen
PLS = '+'         # Unicode plus
DEG = '\u00B0'    # Unicode degree symbol
NBS = '\u00A0'    # Unicode no-break space
THS = '\u2009'    # Unicode thin space
TAB = '\u0009'    # ASCII horizontal tab
RS  = '\u001E'    # ASCII record separator with special significance for parsing

SHARP = '\u266F'  # musical accidental: sharp sign
FLAT  = '\u266D'  # musical accidental: flat sign

LSQm = '\u2018'   # left  single quote
RSQm = '\u2019'   # right single quote (same as APX)
LDQm = '\u201C'   # left  double quote
RDQm = '\u201D'   # right double quote
PRME = '\u2032'   # prime
ELLP = '\u2026'   # horizontal ellipsis
NDSH = '\u2013'   # en dash
MDSH = '\u2014'   # em dash
ABrL = '\u3008'   # left angle  bracket
ABrR = '\u3009'   # right angle bracket
HYPH = '\u2010'   # Unicode hyphen only

EURO = '\u20AC'   # euro currency sign

# char lists consulted by findExtendedBreak() and isText()

Exc = [ AMP , HYM , PLS , PRME ]           # extension of span for token

Spc = [ HYM , NDSH , PLS ]                 # special joining

Apd = [ AST , PLS , HYM ]                  # marks appending to token

Pnc = [ '“' , '”' , '‘' , '’' , '–' ,      # special punctuation
        '—' , '…' , '™' , '′' ]

Opn = [ '“' , '‘' , '"' , "'" , '[' , '(' ]  # opening quotes and brackets
Cls = [ '”' , '’' , '"' , "'" , ']' , ')' ]  # closing quotes and brackets

Grk = [            # small Greek letters, not treated as alphabetic
    'α','β','γ','δ','ε','ζ','η','θ',
    'ι','κ','λ','μ','ν','ξ','ο','π',
    'ρ','σ','τ','υ','φ','χ','ψ','ω'
]

Misc = [ THS , SHARP , FLAT , ABrL , ABrR , HYPH , EURO ]  # other chars accepted by isText()
Spm  = [ SHARP , FLAT ]                    # musical accidentals

Quo  = [ LSQm , LDQm , RSQm , RDQm , '"' , "'" ]  # any quotation mark

Lim = 0x01D5      # main limit of Unicode alphabetic chars recognized
                  # (the code-point lookup tables below are indexed up to Lim - 1)

LaS = 0x0080      # start of Latin supplements
LaT = 0x00A0      # end of supplement control chars

######## The alphabet currently defined for PyElly is ASCII plus Latin-1
######## Supplement and Latin Extended A plus parts of Latin Extended B.
######## These are in the first four blocks of Unicode chars as follows:

# 0000  . . . . . . . . . . . . . . . .     . . . . . . . . . . . . . . . .
# 0020    ! " # $ % & ' ( ) * + , - . /     0 1 2 3 4 5 6 7 8 9 : ; < = > ?
# 0040  @ A B C D E F G H I J K L M N O     P Q R S T U V W X Y Z [ \ ] ^ _
# 0060  ` a b c d e f g h i j k l m n o     p q r s t u v w x y z { | } ~

# 0080  . . . . . . . . . . . . . . . .     . . . . . . . . . . . . . . . .
# 00A0    ¡ ¢ £ ¤ ¥ ¦ § ¨ © ª « ¬   ® ¯     ° ± ² ³ ´ µ ¶ · ¸ ¹ º » ¼ ½ ¾ ¿
# 00C0  À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï     Ð Ñ Ò Ó Ô Õ Ö × Ø Ù Ú Û Ü Ý Þ ß
# 00E0  à á â ã ä å æ ç è é ê ë ì í î ï     ð ñ ò ó ô õ ö ÷ ø ù ú û ü ý þ ÿ

# 0100  Ā ā Ă ă Ą ą Ć ć Ĉ ĉ Ċ ċ Č č Ď ď     Đ đ Ē ē Ĕ ĕ Ė ė Ę ę Ě ě Ĝ ĝ Ğ ğ
# 0120  Ġ ġ Ģ ģ Ĥ ĥ Ħ ħ Ĩ ĩ Ī ī Ĭ ĭ Į į     İ ı Ĳ ĳ Ĵ ĵ Ķ ķ ĸ Ĺ ĺ Ļ ļ Ľ ľ Ŀ
# 0140  ŀ Ł ł Ń ń Ņ ņ Ň ň ŉ Ŋ ŋ Ō ō Ŏ ŏ     Ő ő Œ œ Ŕ ŕ Ŗ ŗ Ř ř Ś ś Ŝ ŝ Ş ş
# 0160  Š š Ţ ţ Ť ť Ŧ ŧ Ũ ũ Ū ū Ŭ ŭ Ů ů     Ű ű Ų ų Ŵ ŵ Ŷ ŷ Ÿ Ź ź Ż ż Ž ž ſ

# 0180  . . . . . . . . . . . . . . . .     . . . . . . . . . . . . . . . .
# 01A0  . . . . . . . . . . . . . . . .     . . . . . . . . . . . . . . . .
# 01C0  . . . . . . . . . . . . . Ǎ ǎ Ǐ     ǐ Ǒ ǒ Ǔ ǔ

######## The entire PyElly character set also includes Unicode punctuation,
######## spaces, and other special characters outside this range.

def isStrongConsonant ( x ):
    """
    check for a consonant other than Y

    arguments:
        x - the char
    returns:
        True if x is a consonant and is neither 'Y' nor 'y', False otherwise
    """
    return isConsonant(x) and x not in ( 'Y' , 'y' )

def isConsonant ( x ):
    """
    check for a consonant, with Y counted as one

    arguments:
        x - the char
    returns:
        True if x is a letter that is not a vowel, False otherwise
    """
    return isLetter(x) and not isVowel(x)

## ASCII plus Latin-1 vowels
## (table is indexed by Unicode code point, 32 entries per row, and
##  is consulted by isStrictVowel() only for code points below Lim)

T = True    # shorthand to keep the lookup tables compact
F = False

# NOTE: the entries for plain ASCII 'U' and 'u' are F here; isVowel()
# recognizes U separately through Uindex
Vowel = [
    F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,F, F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,
    F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,F, F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,
    F,T,F,F,F,T,F,F,F,T,F,F,F,F,F,T, F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,
    F,T,F,F,F,T,F,F,F,T,F,F,F,F,F,T, F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,

    F,F,F,F,F,F,F,F,F,F,F,F,T,F,F,F, F,F,F,F,F,F,F,F,F,F,F,F,T,F,F,F,
    F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,F, F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,
    T,T,T,T,T,T,T,F,T,T,T,T,T,T,T,T, F,F,T,T,T,T,T,F,T,T,T,T,T,T,F,T,
    T,T,T,T,T,T,T,F,T,T,T,T,T,T,T,T, F,F,T,T,T,T,T,F,T,T,T,T,T,F,F,F,

    T,T,T,T,T,T,F,F,F,F,F,F,F,F,F,F, F,F,T,T,T,T,T,T,T,T,T,T,F,F,F,F,
    F,F,F,F,F,F,F,F,T,T,T,T,T,T,T,T, T,T,T,T,F,F,F,F,F,F,F,F,F,F,F,F,
    F,F,F,F,F,F,F,F,F,F,F,F,T,T,T,T, T,T,T,T,F,F,F,F,F,F,F,F,F,F,F,F,
    F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,F, F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,

    F,F,F,F,F,F,T,T,F,F,F,F,F,F,T,T, T,F,F,F,F,F,T,T,F,F,F,F,F,F,F,T,
    T,T,T,T,F,F,F,F,F,F,F,F,F,F,F,T, T,T,T,T,F,F,F,F,F,F,F,F,F,F,F,F,
    F,F,F,F,F,F,F,F,F,F,F,F,F,T,T,T, T,T,T,T,T
]

# 1-based alphabet position of 'u' (= 21), compared against toIndex() in isVowel()
Uindex = ord('u') - ord('a') + 1

def isStrictVowel ( x ):
    """
    test whether char is marked as a vowel in the Vowel table,
    which leaves out plain U and u

    arguments:
        x - the char, or an empty string
    returns:
        True if vowel according to the Vowel table, False otherwise
        (including for an empty string or a code point at or above Lim)
    """
    if x == '':                 # ord('') would raise TypeError; treat as no vowel,
        return False            # consistent with isLetter() and isLetterOrDigit()
    code = ord(x)
    return code < Lim and Vowel[code]

def isVowel ( x ):
    """
    test whether char is a vowel of either case, with U included

    arguments:
        x - the char
    returns:
        True if x is a strict vowel or maps to the same index as u,
        False otherwise
    """
    if isStrictVowel(x):
        return True
    return toIndex(x) == Uindex  # any char indexed like 'u' also counts

## chars allowed in tokens

def isCombining ( x ):
    """
    check for a char allowed in a multi-char token

    arguments:
        x - the char
    returns:
        True if x is a letter, digit, underscore, apostrophe (plain or
        formatted), backslash, or no-break space; False otherwise
    """
    if isLetterOrDigit(x):
        return True
    return x in ( USC , APO , APX , BSL , NBS )

def isEmbeddedCombining ( x ):
    """
    check for a special char that may sit inside a token

    arguments:
        x - the char
    returns:
        True if x is one of the embeddable punctuation chars, False otherwise
    """
    embeddable = ( DOT , COM , COL , APO , APX , AST , AMP , SLA , DEG )
    return x in embeddable

def isPureCombining ( x ):
    """
    check for the strictest kind of token char: no punctuation, no apostrophe

    arguments:
        x - the char
    returns:
        True if x is a letter, digit, underscore, or no-break space;
        False otherwise
    """
    if isLetterOrDigit(x):
        return True
    return x in ( USC , NBS )

def isSpace ( x ):
    """
    check for a char acting as a space in text, underscore included

    arguments:
        x - the char
    returns:
        True if x is a space, no-break space, underscore, or thin space;
        False otherwise
    """
    return x == SPC or x == NBS or x == USC or x == THS

## for replacing standard library char typing and case conversion

# Letter[cp] is True when code point cp is treated as alphabetic by isLetter();
# rows hold 32 entries each and the table is read only for cp < Lim
Letter = [
    F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,F, F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,
    F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,F, F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,
    F,T,T,T,T,T,T,T,T,T,T,T,T,T,T,T, T,T,T,T,T,T,T,T,T,T,T,F,F,F,F,F,
    F,T,T,T,T,T,T,T,T,T,T,T,T,T,T,T, T,T,T,T,T,T,T,T,T,T,T,F,F,F,F,F,

    F,F,F,F,F,F,F,F,F,F,T,F,T,F,T,F, F,F,F,F,F,F,F,F,F,F,F,F,T,F,T,T,
    F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,F, F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,
    T,T,T,T,T,T,T,T,T,T,T,T,T,T,T,T, T,T,T,T,T,T,T,F,T,T,T,T,T,T,T,T,
    T,T,T,T,T,T,T,T,T,T,T,T,T,T,T,T, T,T,T,T,T,T,T,F,T,T,T,T,T,T,T,T,

    T,T,T,T,T,T,T,T,T,T,T,T,T,T,T,T, T,T,T,T,T,T,T,T,T,T,T,T,T,T,T,T,
    T,T,T,T,T,T,T,T,T,T,T,T,T,T,T,T, T,T,T,T,T,T,T,T,T,T,T,T,T,T,T,T,
    T,T,T,T,T,T,T,T,T,T,T,T,T,T,T,T, T,T,T,T,T,T,T,T,T,T,T,T,T,T,T,T,
    T,T,T,T,T,T,T,T,T,T,T,T,T,T,T,T, T,T,T,T,T,T,T,T,T,T,T,T,T,T,T,T,

    T,T,T,T,F,F,T,T,T,T,T,T,T,T,T,T, T,T,T,T,T,T,T,T,T,T,T,T,T,T,T,T,
    T,T,T,T,T,T,T,F,F,T,T,T,T,T,T,T, T,T,T,T,T,T,T,T,T,T,T,T,F,F,F,F,
    F,F,F,F,T,T,T,T,T,T,T,T,T,T,T,T, T,T,T,T,T
]

# LetterOrDigit[cp] is the table behind isLetterOrDigit(); compared with
# Letter it also marks the ASCII digits 0-9 and the superscripts ¹ ² ³
LetterOrDigit = [
    F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,F, F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,
    F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,F, T,T,T,T,T,T,T,T,T,T,F,F,F,F,F,F,
    F,T,T,T,T,T,T,T,T,T,T,T,T,T,T,T, T,T,T,T,T,T,T,T,T,T,T,F,F,F,F,F,
    F,T,T,T,T,T,T,T,T,T,T,T,T,T,T,T, T,T,T,T,T,T,T,T,T,T,T,F,F,F,F,F,

    F,F,F,F,F,F,F,F,F,F,T,F,T,F,T,F, F,F,F,F,F,F,F,F,F,F,F,F,T,F,T,T,
    F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,F, F,F,T,T,F,F,F,F,F,T,F,F,F,F,F,F,
    T,T,T,T,T,T,T,T,T,T,T,T,T,T,T,T, T,T,T,T,T,T,T,F,T,T,T,T,T,T,T,T,
    T,T,T,T,T,T,T,T,T,T,T,T,T,T,T,T, T,T,T,T,T,T,T,F,T,T,T,T,T,T,T,T,

    T,T,T,T,T,T,T,T,T,T,T,T,T,T,T,T, T,T,T,T,T,T,T,T,T,T,T,T,T,T,T,T,
    T,T,T,T,T,T,T,T,T,T,T,T,T,T,T,T, T,T,T,T,T,T,T,T,T,T,T,T,T,T,T,T,
    T,T,T,T,T,T,T,T,T,T,T,T,T,T,T,T, T,T,T,T,T,T,T,T,T,T,T,T,T,T,T,T,
    T,T,T,T,T,T,T,T,T,T,T,T,T,T,T,T, T,T,T,T,T,T,T,T,T,T,T,T,T,T,T,T,

    T,T,T,T,F,F,T,T,T,T,T,T,T,T,T,T, T,T,T,T,T,T,T,T,T,T,T,T,T,T,T,T,
    T,T,T,T,T,T,T,F,F,T,T,T,T,T,T,T, T,T,T,T,T,T,T,T,T,T,T,T,F,F,F,F,
    F,F,F,F,T,T,T,T,T,T,T,T,T,T,T,T, T,T,T,T,T
]

def isLetterOrDigit ( x ):
    """
    check for an alphanumeric char of the PyElly alphabet

    arguments:
        x - the char, or an empty string
    returns:
        table entry for x if its code point is below Lim,
        otherwise True only if x is listed in Digits
    """
    if x != '':
        code = ord(x)
        if code < Lim:
            return LetterOrDigit[code]
    return x in Digits  # fallback for empty string and chars beyond the table

def isNotLetterOrDigit ( x ):
    """
    check for a char that is neither letter nor digit

    arguments:
        x - the char
    returns:
        False if isLetterOrDigit() accepts x, True otherwise
    """
    if isLetterOrDigit(x):
        return False
    return True

def isLetter ( x ):
    """
    check for a letter of the PyElly alphabet

    arguments:
        x - the char, or an empty string
    returns:
        True if the code point of x is below Lim and marked in the
        Letter table, False otherwise
    """
    if x == '':
        return False
    code = ord(x)
    return code < Lim and Letter[code]

# ASCII digits followed by the Latin-1 superscript digits, one char per entry
Digits = list('0123456789¹²³')

def isDigit ( x ):
    """
    check for an ASCII digit or a Latin-1 superscript digit

    arguments:
        x - the char
    returns:
        True if x is listed in Digits, False otherwise
    """
    return x in Digits

# Space[cp] is True for the ASCII code points that isWhiteSpace() accepts:
# NUL (0), TAB through CR (9-13), and SPC (32); 32 entries per row
Space = [
    T,F,F,F,F,F,F,F,F,T,T,T,T,T,F,F, F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,
    T,F,F,F,F,F,F,F,F,F,F,F,F,F,F,F, F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,
    F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,F, F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,
    F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,F, F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,F
]

LimA = len(Space)  # code points at or above this are outside the Space table

def isWhiteSpace ( x ):
    """
    check for white space, with thin space accepted but NOT no-break space

    arguments:
        x - the char, an empty string, or None
    returns:
        True if x is a thin space or is marked in the Space table,
        False otherwise
    """
    if x == THS:
        return True
    if x is None or x == '':
        return False
    code = ord(x)
    return code < LimA and Space[code]

def isApostrophe ( x ):
    """
    check for any of the apostrophe variants

    arguments:
        x - the char
    returns:
        True if x is a plain apostrophe, a formatted apostrophe, or a prime;
        False otherwise
    """
    return x in ( APO , APX , PRME )

## for equating PyElly Unicode with 26 ASCII letters when indexing
## (table is indexed by code point, 16 entries per row: 0 = no mapping,
##  1-26 = letters a-z, 27-36 = digits 0-9)

Mapping = [
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   27,28,29,30,31,32,33,34,35,36, 0, 0, 0, 0, 0, 0,
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,
   16,17,18,19,20,21,22,23,24,25,26, 0, 0, 0, 0, 0,
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,
   16,17,18,19,20,21,22,23,24,25,26, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,19, 0,15, 0,26, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,19, 0,15, 0,26,25,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0,29,30, 0, 0, 0, 0, 0,28, 0, 0, 0, 0, 0, 0,
    1, 1, 1, 1, 1, 1, 1, 3, 5, 5, 5, 5, 9, 9, 9, 9,
   20,14,15,15,15,15,15, 0,15,21,21,21,21,25,20,19,
    1, 1, 1, 1, 1, 1, 1, 3, 5, 5, 5, 5, 9, 9, 9, 9,
   20,14,15,15,15,15,15, 0,15,21,21,21,21,25,20,25,

    1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4,
    4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 7, 7, 7, 7,
    7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9,
    9, 9, 9, 9,10,10,11,11,11,12,12,12,12,12,12,12,
   12,12,12,14,14,14,14,14,14,14,14,14,15,15,15,15,
   15,15, 1, 1,18,18,18,18,18,18,19,19,19,19,19,19,
   19,19,20,20,20,20,20,20,21,21,21,21,21,21,21,21,
   21,21,21,21,23,23,25,25,25,26,26,26,26,26,26,19,

    2, 2, 2, 2, 0, 0,15,15, 3, 4, 4, 4, 4, 4, 5, 5,
    5, 6, 6, 7, 7, 8, 9, 9,11,11,12,12,13,14,14,15,
   15,15,15,15,16,16,18, 0, 0,19,19,20,20,20,20,21,
   21,21,22,25,25,26,26,26,26,26,26,20, 0, 0, 0, 0,
    0, 0, 0, 0,10,10,10,12,12,12,14,14,14, 1, 1, 9,
    9,15,15,21,21
]

## letters for codes

# Unmapping[i] is the ASCII char for Mapping value i: index 0 is a blank
# placeholder, 1-26 are a-z, 27-36 are 0-9.  The digit run must be exactly
# "0123456789"; a stray extra '3' previously shifted the chars for 32-36
# so that, for example, the index of '5' decoded to '3'.
Unmapping = " abcdefghijklmnopqrstuvwxyz0123456789"  # ASCII string!

# maximum mapping defined; since Mapping also assigns 27-36 to digits,
# this is the highest digit code (36), not just the alphabetic count
Max  = max(Mapping)

DigB = Max + 1 # first index past the mapped range, base for the extra digit codes in toChar()

def toIndex ( x ):
    """
    get the index code that equates an alphanumeric char with its
    base ASCII letter or digit

    arguments:
        x - the char
    returns:
        Mapping entry for x if it is a letter or digit, otherwise 0
    """
    return Mapping[ord(x)] if isLetterOrDigit(x) else 0

def toChar ( k ):
    """
    turn an index value back into a base ASCII letter or digit

    arguments:
        k - numerical index
    returns:
        char represented by index, or a hyphen if the index is out of range
    """

    if k > Max + 10 or k <= 0:       # outside every defined code
        return HYM
    if k <= Max:                     # inside the range covered by Unmapping
        return Unmapping[k]
    return chr(ord('0') + k - DigB)  # up to ten digit codes past Max

# Lower[cp] is True when code point cp is treated as a lowercase letter;
# used by isLowerCaseLetter() and isUpperCaseLetter() for cp < Lim,
# with 32 entries per row
Lower = [
    F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,     F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,
    F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,     F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,
    F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,     F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,
    F,T,T,T,T,T,T,T,T,T,T,T,T,T,T,T,     T,T,T,T,T,T,T,T,T,T,T,F,F,F,F,F,

    F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,     F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,
    F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,     F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,
    F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,     F,F,F,F,F,F,F,F,F,F,F,F,F,F,F,T,
    T,T,T,T,T,T,T,T,T,T,T,T,T,T,T,T,     T,T,T,T,T,T,T,T,T,T,T,T,T,T,T,T,

    F,T,F,T,F,T,F,T,F,T,F,T,F,T,F,T,     F,T,F,T,F,T,F,T,F,T,F,T,F,T,F,T,
    F,T,F,T,F,T,F,T,F,T,F,T,F,T,F,T,     F,T,F,T,F,T,F,T,T,F,T,F,T,F,T,F,
    T,F,T,F,T,F,T,F,T,T,F,T,F,T,F,T,     F,T,F,T,F,T,F,T,F,T,F,T,F,T,F,T,
    F,T,F,T,F,T,F,T,F,T,F,T,F,T,F,T,     F,T,F,T,F,T,F,T,F,F,T,F,T,F,T,T,

    T,F,F,T,F,F,F,F,F,T,F,F,F,F,F,F,     F,F,T,F,F,F,F,F,F,T,F,F,F,F,F,F,
    F,T,F,T,F,T,F,F,F,F,F,F,F,F,F,F,     T,F,F,F,T,F,T,F,F,T,F,F,F,F,F,F,
    T,T,F,F,F,F,T,F,F,T,F,F,T,F,T,F,     T,F,T,F,T
]

# substitutions applied by toLowerCaseASCII() before classifying a char
Quoting  = { LSQm : "'" , RSQm : "'" , LDQm : '"' , RDQm : '"' }  # curly quote -> ASCII quote
Exponent = { '¹' : '1' , '²' : '2' , '³' : '3' }                  # superscript -> ASCII digit

def toLowerCaseASCII ( ls , alph=False ):
    """
    rewrite a list of chars in place toward lowercase ASCII, optionally
    keeping alphabetic chars only, with placeholder . or _ substituted
    for chars that are dropped

    each char is first normalized (formatted quote to ASCII quote,
    superscript to digit, Unicode hyphen to '-') and then classified;
    the list element is overwritten only in the cases handled below,
    so a normalized quote or digit is left as it was unless alph is set

    arguments:
        ls   - a Unicode list, both input and output
        alph - True if alphabetic conversion only
    """

    for pos , ch in enumerate(ls):
        if ch in Quoting:
            ch = Quoting[ch]
        elif ch in Exponent:
            ch = Exponent[ch]
        elif ch == HYPH:
            ch = '-'

        code = ord(ch)
        if code >= Lim or ch == '_':         # outside alphabet, or underscore itself
            ls[pos] = '_'
            continue

        if not isLetterOrDigit(ch):          # punctuation and other symbols
            if alph or code > 127:
                ls[pos] = '.'
            elif ch == '-':
                ls[pos] = ch
            continue

        if isDigit(ch):                      # digits survive unless alphabetic only
            if alph or code > 127:
                ls[pos] = '_'
            continue

        ls[pos] = Unmapping[Mapping[code]]   # letter: reduce to base ASCII lowercase

def isLowerCaseLetter ( x ):
    """
    check for a letter that is not capitalized

    arguments:
        x - the char, or an empty string
    returns:
        True if the code point of x is below Lim and marked in the
        Lower table, False otherwise
    """
    if x == '':
        return False
    code = ord(x)
    return code < Lim and Lower[code]

def isUpperCaseLetter ( x ):
    """
    check for a capitalized letter

    arguments:
        x - the char, or an empty string
    returns:
        True if upper case letter, i.e. marked in the Letter table but
        not in the Lower table; False otherwise
    """
    code = 0                    # entry 0 serves as fallback for unusable input
    if x != '':
        code = ord(x)
        if code >= Lim:
            code = 0
    return Letter[code] and not Lower[code]

def isText ( x ):
    """
    check for a char that PyElly accepts as text: anything below Lim
    that is not a pure control char, plus the special punctuation,
    small Greek letters, and miscellaneous chars listed in this module

    arguments:
        x - the char, or an empty string
    returns:
        True if in PyElly text chars, False otherwise
    """
    if x == '':
        return False
    if isPureControl(x):
        return False
    if ord(x) < Lim:
        return True
    return x in Pnc or x in Grk or x in Misc

# control[cp] for cp in 0..31: True marks a non-text control char;
# TAB (9), LF (10), and CR (13) are the only entries marked F
control = [
    T,T,T,T,T,T,T,T,T,F,F,T,T,F,T,T,
    T,T,T,T,T,T,T,T,T,T,T,T,T,T,T,T
]

def isPureControl ( x ):
    """
    check for a control char that is not usable as text

    arguments:
        x - the char
    returns:
        True if non-text control, False otherwise
    """
    code = ord(x)
    if code >= LaS:
        return code < LaT                # control range at start of Latin supplement
    return x < SPC and control[code]     # ASCII: below space and flagged in table

# punctuation that is left out of a token when it is followed by a space
# or ends the text (unless it is the very first char scanned)
termina = [ COL , COM ]

def findExtendedBreak ( text , offset=0 , nspace=0 ):

    """
    look for next break in text from given offset
    possibly skipping over a specified number of spaces

    ordinary token chars (isPureCombining) are always scanned over;
    embeddable punctuation and joining chars are scanned over when the
    next char continues the token; two such chars in a row force a break

    arguments:
        text   - what to scan in text stream
        offset - starting offset
        nspace - how many spaces can be included in scan
    returns:
        remaining char count in text if no break is found
        otherwise, count of chars to next break if nonzero, but 1 if zero,
    """
    nalnm = False                                  # preceding char not alphanumeric?
    k = offset
    n = len(text)
#   print ('find break k=' , k , 'n=' , n , 'nspace=' , nspace )
    while k < n:
        x = text[k]                                # iterate on next chars in input
#       print ( 'char=' , x , '@' , k )
        if not isPureCombining(x):                 # check if not ordinary token char
#           print ( 'special checking needed' )
            if (x in Spc or
                isEmbeddedCombining(x)):           # check if embeddable punctuation
#               print ( 'nalnm=' , nalnm )
                if nalnm:                          # second special char in a row
                    break
                nalnm = True
#               print ( 'k=' , k , 'n=' , n )
                if k + 1 < n:
                    c = text[k+1]                  # look at next char in input
#                   print ( 'next char=' , c )
                    if isApostrophe(c) or isLetterOrDigit(c) or c in Exc:
                        nalnm = False              # must fib here
                        k += 1
#                       print ( 'k=' , k )
                        continue
                    elif isSpace(c):               # if next char is space, is it expected?
#                       print ( 'is space, nspace=' , nspace )
                        if nspace > 0:
                            k += 2                 # allow for space in token
                            nspace -= 1
                            continue
                        elif not x in termina:     # look for a token break
#                           print ( 'non breaking' , x )
                            k += 1                 # if none, continue scan
                            break
                    if x in Cls: k += 1            # keep a closing quote with the token
#                   print ( "done" )
                elif not x in termina:             # the above code must be repeated
#                   print ( 'non breaking' , x )   # since the elif code is paired
                    k += 1                         # with a different if
                    break
            elif x in Spm:                         # music accidentals
#               print ( 'special breaking' )
                k += 1
                break
            elif x in Quo:                         # quotation marks
#               print ( 'special case of any quotation mark' )
                k += 1
                nalnm = True
                continue
            elif k == offset and x in Opn:
#               print ( 'look for short bracketed segment' )
                j  = k + 1                         # search at most 3 chars ahead
                jl = k + 4                         # for a closing quote or bracket
                if jl > n: jl = n
#               print ( 'j=' , j , 'jl=' , jl )
#               print ( 'input=' , list(text) )
                while j < jl:
                    if text[j] in Cls: break
                    j += 1
                if j < jl:
#                   print ( 'segment found' )
                    k = j                          # take segment through its closer
                k += 1
#               print ( 'done k=' , k )
                break

            # reached when no branch above did a break or continue
#           print ( 'space check, nspace=' , nspace  )
            if nspace > 0 and isSpace(x):
                k += 1
                nspace -= 1
                nalnm = False
                continue
            if k == offset:
                k += 1      # if immediate break, take 1 char anyway
            break
        # NOTE(review): this branch is tried only when isPureCombining(x) is
        # true (letter, digit, underscore, or no-break space); no member of
        # Pnc appears to satisfy that, so it looks unreachable - confirm
        elif x in Pnc:
#           print ( 'punctuation break, x=' , x )
            if x in [ RSQm , RDQm ]:
                k += 1
            break
        else:
            nalnm = False
        k += 1
#   print ( 'break=' , k , 'offset=' , offset )
    return k - offset

############################ special CJK  ############################

def isCJK ( x ):

    """
    check whether a char falls in the Unicode range U+4E00 through U+9FFF,
    which is the only CJK range recognized here

    arguments:
        x   - char to check
    returns:
        True if CJK, otherwise False
    """

    return 0x4E00 <= ord(x) < 0xA000

############################ unit testing ############################

if __name__ == "__main__":

    # sample chars from each part of the PyElly alphabet, plus Greek
    samples = ''.join([
        "pqrstuvwxyz{|}~." ,
        "ðñòóôõö÷øùúûüýþÿ" ,
        "ŰűŲųŴŵŶŷŸŹźŻżŽžſ" ,
        "ǀǁǂǃǄǅǆǇǈǉǊǋǌǍǎĚěǏǐǑǒǓǔ" ,
        "ßÿĸŉŸſ" ,
        "αβγδεζ"
    ])

    # show code point, index mapping, and vowel status for each sample
    for cx in samples:
        mapped = toChar(toIndex(cx))
        vowel  = 'vowel' if isVowel(cx) else ''
        print ( f'<{cx}/{cx.lower()}> ord=' , f'{ord(cx):3d}' , 'map=' , mapped , vowel )

    # CJK recognition
    for han in ( '寶' , '譽' , '禮' ):
        print ( han + '=CJK' , isCJK(han) , isText(han) )
    print ( '… =CJK' , isCJK('…') )

    # space classification
    print ( 'thin space' , isWhiteSpace(THS) )
    print ( 'thin space' , isSpace(THS) )
    print ( 'ASCII space' , isWhiteSpace(' ') , ord(' ') )
    print ( 'ASCII space' , isSpace(' ') , ord(' ') )

    # special text chars
    for label , ch in ( ( 'text \u266F' , SHARP ) ,
                        ( 'text \u266D' , FLAT  ) ,
                        ( 'text \u2032' , PRME  ) ):
        print ( label , isText(ch) )

    # superscript digits
    for sup in ( '¹' , '²' , '³' ):
        print ( 'digit ' + sup , isDigit(sup) , isLetterOrDigit(sup) ,
                toIndex(sup) , toChar(toIndex(sup)) )