Skip to content

Commit

Permalink
Merge pull request #18 from neelsmith/wip
Browse files Browse the repository at this point in the history
Add Latin25, Latin24 types
  • Loading branch information
neelsmith authored Feb 4, 2024
2 parents 2cde318 + 850c5df commit 3aaed21
Show file tree
Hide file tree
Showing 9 changed files with 374 additions and 25 deletions.
8 changes: 4 additions & 4 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "LatinOrthography"
uuid = "1e3032c9-fa1e-4efb-a2df-a06f238f6146"
authors = ["nsmith "]
version = "0.6.0"
version = "0.7.0"

[deps]
CitableBase = "d6f014bd-995c-41bd-9893-703339864534"
Expand All @@ -15,8 +15,8 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
[compat]
CitableBase = "10"
CitableCorpus = "0.13"
CitableText = "0.15"
CitableText = "0.16"
DocStringExtensions = "0.9"
Documenter = "0.27"
Orthography = "0.17, 0.18"
Documenter = "1"
Orthography = "0.21"
julia = "1"
6 changes: 6 additions & 0 deletions src/LatinOrthography.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,12 @@ import Orthography: tokentypes
import Orthography: codepoints
import Orthography: tokenize


export LatinOrthographicSystem

export Latin23, latin23
export Latin24, latin24
export Latin25, latin25
export codepoints, tokentypes, tokenize
export alphabetic, punctuation, whitespace

Expand All @@ -19,6 +22,9 @@ export EncliticToken


include("orthography.jl")
include("common.jl")
include("latin23.jl")
include("latin24.jl")
include("latin25.jl")

end # module
16 changes: 16 additions & 0 deletions src/common.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@

"""Define recognized punctuation characters for Latin23 orthography.
$(SIGNATURES)
"""
function latinpunctuation()
".,;:?"
end

"""Define recognized punctuation characters for Latin23 orthography.
$(SIGNATURES)
"""
function latinwhitespace()
" \n\t"
end
23 changes: 5 additions & 18 deletions src/latin23.jl
Original file line number Diff line number Diff line change
Expand Up @@ -56,20 +56,14 @@ function latin23alphabet()
join(ranges,"")
end

"""Define recognized punctuation characters for Latin23 orthography.

$(SIGNATURES)
"""
function latin23punctuation()
".,;:?"
end

"""Define recognized punctuation characters for Latin23 orthography.
$(SIGNATURES)
"""
function punctuation(ortho::Latin23)
latin23punctuation()
latinpunctuation()
end


Expand All @@ -79,25 +73,16 @@ end
$(SIGNATURES)
"""
function whitespace(ortho::Latin23)
latin23whitespace()
latinwhitespace()
end

"""Define recognized punctuation characters for Latin23 orthography.
$(SIGNATURES)
"""
function latin23whitespace()
" \n\t"
end


"""Instantiate a Latin23 with correct code points and token types.
$(SIGNATURES)
"""
function latin23()

cps = latin23alphabet() * latin23punctuation() * latin23whitespace() * "+"
cps = latin23alphabet() * latinpunctuation() * latinwhitespace() * "+"
ttypes = [
Orthography.LexicalToken,
#Orthography.NumericToken,
Expand Down Expand Up @@ -131,6 +116,8 @@ function tokenizeLatin23(s::AbstractString)
tkns = map(t -> tokenforstring(t, latin23()), tknstrings)
end



"""True if all characters in s are alphabetic.
$(SIGNATURES)
Expand Down
152 changes: 152 additions & 0 deletions src/latin24.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@

"""
An orthographic system for encoding Latin with a 24-character alphabet.
`i` represents both vocalic and semi-vocalic or consonantal values.
"""
struct Latin24 <: LatinOrthographicSystem
codepoints
tokencategories
tokenizer
end

# Trait method: mark the `Latin24` type as an orthographic system.
function OrthographyTrait(::Type{Latin24})
    return IsOrthographicSystem()
end

"""Implement Orthography's tokenize function for `Latin24`.
$(SIGNATURES)
"""
function tokenize(s::AbstractString, o::Latin24)
tokenizeLatin24(s)
end

"""Implement `codepoints` function of MID `OrthographicSystem` interface.
$(SIGNATURES)
"""
function codepoints(ortho::Latin24)
ortho.codepoints
end

"""Implement `tokentypes` function of MID `OrthographicSystem` interface.
$(SIGNATURES)
"""
function tokentypes(ortho::Latin24)
ortho.tokencategories
end

"""Define range of alphabetic character for Latin24 orthography.
$(SIGNATURES)
"""
function alphabetic(ortho::Latin24)
latinalphabet()
end


"""Define range of alphabetic character for Latin24 orthography.
$(SIGNATURES)
"""
function latin24alphabet()
ranges = [
'a':'i' ; 'k':'v'; 'x':'z';
'A':'I' ; 'K':'V'; 'X':'Z';
]
join(ranges,"")
end





"""Define recognized punctuation characters for Latin orthographies.
$(SIGNATURES)
"""
function whitespace(ortho::Latin24)
latinwhitespace()
end



"""Instantiate a Latin24 with correct code points and token types.
$(SIGNATURES)
"""
function latin24()

cps = latin24alphabet() * latinpunctuation() * latinwhitespace() * "+"
ttypes = [
Orthography.LexicalToken,
#Orthography.NumericToken,
Orthography.PunctuationToken,
EncliticToken
]
Latin24(cps, ttypes, tokenizeLatin24)
end

"""Define recognized punctuation characters for Latin orthographies.
$(SIGNATURES)
"""
function punctuation(ortho::Latin24)
latinpunctuation()
end


"""Tokenize Latin text.
$(SIGNATURES)
"""
function tokenizeLatin24(s::AbstractString)
wsdelimited = split(s)

depunctuated = map(nows -> splitPunctuation(nows, latin24()), wsdelimited) |> Iterators.flatten |> collect

tknstrings = []
for depunctedstr in depunctuated
parts = split(depunctedstr, "+")
if length(parts) == 2
push!(tknstrings, parts[1])
push!(tknstrings, string("+", parts[2]))
else
push!(tknstrings, depunctedstr)
end

end
tkns = map(t -> tokenforstring(t, latin24()), tknstrings)
end

"""True if all characters in s are alphabetic.
$(SIGNATURES)
"""
function isAlphabetic(s::AbstractString, ortho::Latin24)
#chlist = split(s,"")
alphas = alphabetic(ortho)
tfs = []
for i in collect(eachindex(s))
push!(tfs, occursin(s[i], alphas))
end
nogood = false in tfs

!nogood
end

"""True if all characters in s are punctuation.
$(SIGNATURES)
"""
function isPunctuation(s::AbstractString, ortho::Latin24)::Bool
#chlist = split(s,"")
puncts = punctuation(ortho)
tfs = []
for i in collect(eachindex(s))
push!(tfs, occursin(s[i], puncts))
end
nogood = false in tfs

!nogood
end

Loading

0 comments on commit 3aaed21

Please sign in to comment.