-
Notifications
You must be signed in to change notification settings - Fork 1.3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add CoNLL-U language support, see #3790
- Loading branch information
Showing
9 changed files
with
1,561 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,162 @@ | ||
(function (Prism) {

	Prism.languages.conllu = {
		// Comment lines ("# sent_id = 2", "# a free-form remark", …).
		// Anchored to the start of the line: CoNLL-U comments must begin
		// the line, and the previous unanchored /#(?:[^\n])*/ also fired
		// on a '#' occurring inside a token field (e.g. a hashtag FORM).
		comment: {
			pattern: /^#.*/m,
			inside: {
				// "# key = value" style metadata comments
				metadata: {
					pattern: /(?:\w+)\s*=\s*.*/,
					inside: {
						key: {
							pattern: /\w+(?=\s*=)/,
							alias: 'property',
						},
						value: {
							pattern: /(\s*=\s*)\S.*$/,
							lookbehind: true,
							alias: 'string',
						},
						operator: /=/,
					}
				},
				punctuation: /^#/,
			}
		},
		// Separator between two sentence blocks (a blank line). The matched
		// newline is stripped again via lookbehind, so the resulting token is
		// empty and only serves as a structural marker in the token stream.
		'sentence-separator': {
			pattern: /(\r?\n)(?=\r?\n)/,
			lookbehind: true,
		},
		// Word lines: an ID column followed by tab-separated fields.
		token: {
			pattern: /.+/,
			inside: {
				// word index, multiword-token range "1-2", or empty node "5.1"
				id: {
					pattern: /^\d+(?:[.-]\d+)?/,
					alias: 'number',
				},
				// form / lemma / upos / xpos / feats / head / deprel / deps / misc;
				// each column is re-typed by position in the after-tokenize hook below
				value: {
					pattern: /^(\t)[^\t]*(?=\t|$)/,
					lookbehind: true,
				},
			},
		},
	};

	// Sub-grammar for the FEATS and MISC columns: "Key=Value" pairs joined by '|'.
	const featKeyExp = /[A-Z][A-Za-z0-9]*(?:\[[a-z0-9]+\])?/;
	// Values are matched permissively (MISC allows almost anything),
	// not with the stricter /[A-Z0-9][A-Za-z0-9]*/ from the spec.
	const featValueExp = /.+/;
	const featsGrammar = {
		punctuation: /\|/,
		feature: {
			pattern: RegExp('^' + featKeyExp.source + '=' + '.*' + '$'),
			inside: {
				key: {
					pattern: RegExp(featKeyExp.source + '(?==)'),
					alias: 'property',
				},
				value: [
					{
						// boolean-ish values get a dedicated class
						pattern: /(=)(?:yes|no)$/i,
						lookbehind: true,
						alias: 'boolean',
					},
					{
						pattern: RegExp('(=)' + featValueExp.source + '$'),
						lookbehind: true,
						alias: 'string',
					}
				],
				operator: /=/,
			},
		},
	};

	// Sub-grammar for the DEPS column: "head:relation" pairs joined by '|'.
	// The relation part is captured permissively instead of validating it
	// against the full UD relation syntax.
	const depsGrammar = {
		punctuation: /\|/,
		dep: {
			pattern: /^\S+$/,
			inside: {
				head: {
					pattern: /\d+(?=:)/,
					alias: 'number',
				},
				punctuation: /^:/,
				relation: {
					pattern: /.+/,
					alias: 'symbol',
				},
			}
		},
	};

	// Column metadata driving the hook below: the nine field names that
	// follow the ID column, an optional extra CSS alias per column, and an
	// optional sub-grammar used to re-tokenize that column's content.
	const entryTypes = ['form', 'lemma', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc'];
	const entryTypesAlias = [null, null, 'symbol', 'symbol', null, 'number', 'symbol', null, null];
	const entryTypeInside = [null, null, null, null, featsGrammar, null, null, depsGrammar, featsGrammar];

	// After tokenizing, re-label each generic "value" token with its column
	// name and run column-specific sub-grammars (FEATS/DEPS/MISC).
	Prism.hooks.add('after-tokenize', function (env) {
		if (env.language !== 'conllu') {
			return;
		}

		for (const row of env.tokens) {
			// only word lines carry fields, not comments/sentence separators
			if (row.type !== 'token') {
				continue;
			}

			let entryTypeCounter = 0;
			for (const field of row.content) {
				// skip the plain-string separators (tabs) between fields
				if (typeof field === 'string') { continue; }
				// only re-label value fields, not the leading id token
				if (field?.type !== 'value') { continue; }

				// normalize Token.alias (undefined | string | string[]) to an array
				if (field.alias === undefined) { field.alias = []; }
				if (typeof field.alias === 'string') { field.alias = [field.alias]; }

				// "_" marks an unspecified field in CoNLL-U
				if (field.content === '_') {
					field.alias.push('unspecified');
				}

				// assign a role to the value based on its column position
				if (entryTypeCounter < entryTypes.length) {
					// keep the generic "value" class as an alias …
					field.alias.push(field.type);
					// … and promote the column name to be the token type
					field.type = entryTypes[entryTypeCounter];
					if (entryTypesAlias[entryTypeCounter] !== null) {
						field.alias.push(entryTypesAlias[entryTypeCounter]);
					} else if (entryTypeInside[entryTypeCounter] === null) {
						// columns without a sub-grammar default to string styling
						field.alias.push('string');
					}

					// re-tokenize with the column's sub-grammar, but leave
					// unspecified ("_") fields alone
					if (field.content !== '_' && entryTypeInside[entryTypeCounter] !== null) {
						field.content = Prism.tokenize(field.content, entryTypeInside[entryTypeCounter]);
					}
				}

				entryTypeCounter++;
			}
		}
	});

}(Prism));
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
<p>Full details can be found at <a href="https://universaldependencies.org/format.html" target="_blank">Universal Dependencies - Format</a>.</p>
|
||
<h2>Comments</h2> | ||
|
||
<pre><code># sent_id = 2 | ||
# text = I have no clue. | ||
# or a simple string</code></pre> | ||
|
||
<h2>Full Example</h2> | ||
|
||
<pre><code># sent_id = 2 | ||
# text = I have no clue. | ||
1 I I PRON PRP Case=Nom|Number=Sing|Person=1 2 nsubj _ _ | ||
2 have have VERB VBP Number=Sing|Person=1|Tense=Pres 0 root _ _ | ||
3 no no DET DT PronType=Neg 4 det _ _ | ||
4 clue clue NOUN NN Number=Sing 2 obj _ SpaceAfter=No | ||
5 . . PUNCT . _ 2 punct _ _</code></pre> | ||
|
||
<h2>Words, Tokens and Empty Nodes</h2> | ||
|
||
<pre><code>1-2 vámonos _ | ||
1 vamos ir | ||
2 nos nosotros | ||
3-4 al _ | ||
3 a a | ||
4 el el | ||
5 mar mar</code></pre> | ||
|
||
<pre><code>1 Sue Sue | ||
2 likes like | ||
3 coffee coffee | ||
4 and and | ||
5 Bill Bill | ||
5.1 likes like | ||
6 tea tea</code></pre> | ||
|
||
<pre><code>1 nosotros nosotros | ||
2 vamos ir | ||
3-4 al _ | ||
3 a a | ||
4 el el | ||
5 mar mar | ||
6 y y | ||
7 vosotros vosotros | ||
7.1 vais ir | ||
8-9 al _ | ||
8 a a | ||
9 el el | ||
10 parque parque</code></pre> | ||
|
||
<h2>Morphological Annotation</h2> | ||
|
||
<pre><code>1 Då då ADV AB _ | ||
2 var vara VERB VB.PRET.ACT Tense=Past|Voice=Act | ||
3 han han PRON PN.UTR.SIN.DEF.NOM Case=Nom|Definite=Def|Gender=Com|Number=Sing | ||
4 elva elva NUM RG.NOM Case=Nom|NumType=Card | ||
5 år år NOUN NN.NEU.PLU.IND.NOM Case=Nom|Definite=Ind|Gender=Neut|Number=Plur | ||
6 . . PUNCT DL.MAD _</code></pre> | ||
|
||
<h2>Syntactic Annotation</h2> | ||
|
||
<pre><code>1 They they PRON PRP Case=Nom|Number=Plur 2 nsubj 2:nsubj|4:nsubj | ||
2 buy buy VERB VBP Number=Plur|Person=3|Tense=Pres 0 root 0:root | ||
3 and and CCONJ CC _ 4 cc 4:cc | ||
4 sell sell VERB VBP Number=Plur|Person=3|Tense=Pres 2 conj 0:root|2:conj | ||
5 books book NOUN NNS Number=Plur 2 obj 2:obj|4:obj | ||
6 . . PUNCT . _ 2 punct 2:punct</code></pre> | ||
|
||
<h2>Untokenized Text</h2> | ||
|
||
<pre><code># text = Er arbeitet fürs FBI (deutsch etwa: „Bundesamt für Ermittlung“). | ||
# text_en = He works for the FBI (German approx: “Bundesamt für Ermittlung”). | ||
1 Er er PRON … _ | ||
2 arbeitet arbeiten VERB … _ | ||
3-4 fürs _ _ … _ | ||
3 für für ADP … _ | ||
4 das der DET … _ | ||
5 FBI FBI PROPN … _ | ||
6 ( ( PUNCT … SpaceAfter=No | ||
7 deutsch deutsch ADV … _ | ||
8 etwa etwa ADV … SpaceAfter=No | ||
9 : : PUNCT … _ | ||
10 „ „ PUNCT … SpaceAfter=No | ||
11 Bundesamt Bundesamt NOUN … _ | ||
12 für für ADP … _ | ||
13 Ermittlung Ermittlung NOUN … SpaceAfter=No | ||
14 “ “ PUNCT … SpaceAfter=No | ||
15 ) ) PUNCT … SpaceAfter=No | ||
16 . . PUNCT … _</code></pre> | ||
|
||
<h2>Sentence Boundaries and Comments</h2> | ||
|
||
<pre><code># sent_id = 1 | ||
# text = They buy and sell books. | ||
1 They they PRON PRP Case=Nom|Number=Plur 2 nsubj 2:nsubj|4:nsubj _ | ||
2 buy buy VERB VBP Number=Plur|Person=3|Tense=Pres 0 root 0:root _ | ||
3 and and CCONJ CC _ 4 cc 4:cc _ | ||
4 sell sell VERB VBP Number=Plur|Person=3|Tense=Pres 2 conj 0:root|2:conj _ | ||
5 books book NOUN NNS Number=Plur 2 obj 2:obj|4:obj SpaceAfter=No | ||
6 . . PUNCT . _ 2 punct 2:punct _ | ||
|
||
# sent_id = 2 | ||
# text = I have no clue. | ||
1 I I PRON PRP Case=Nom|Number=Sing|Person=1 2 nsubj _ _ | ||
2 have have VERB VBP Number=Sing|Person=1|Tense=Pres 0 root _ _ | ||
3 no no DET DT PronType=Neg 4 det _ _ | ||
4 clue clue NOUN NN Number=Sing 2 obj _ SpaceAfter=No | ||
5 . . PUNCT . _ 2 punct _ _ | ||
|
||
# sent_id = panc0.s4 | ||
# text = तत् यथानुश्रूयते। | ||
# translit = tat yathānuśrūyate. | ||
# text_fr = Voilà ce qui nous est parvenu par la tradition orale. | ||
# text_en = This is what is heard. | ||
1 तत् तद् DET _ Case=Nom|…|PronType=Dem 3 nsubj _ Translit=tat|LTranslit=tad|Gloss=it | ||
2-3 यथानुश्रूयते _ _ _ _ _ _ _ SpaceAfter=No | ||
2 यथा यथा ADV _ PronType=Rel 3 advmod _ Translit=yathā|LTranslit=yathā|Gloss=how | ||
3 अनुश्रूयते अनु-श्रु VERB _ Mood=Ind|…|Voice=Pass 0 root _ Translit=anuśrūyate|LTranslit=anu-śru|Gloss=it-is-heard | ||
4 । । PUNCT _ _ 3 punct _ Translit=.|LTranslit=.|Gloss=.</code></pre> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
1-2 vámonos _ | ||
1 vamos ir | ||
2 nos nosotros | ||
3-4 al _ | ||
3 a a | ||
4 el el | ||
5 mar mar | ||
|
||
1 Sue Sue | ||
2 likes like | ||
3 coffee coffee | ||
4 and and | ||
5 Bill Bill | ||
5.1 likes like | ||
6 tea tea | ||
|
||
---------------------------------------------------- | ||
|
||
[ | ||
["token", [ | ||
["id", "1-2"], | ||
["form", "vámonos"], | ||
["lemma", "_"] | ||
]], | ||
["token", [ | ||
["id", "1"], | ||
["form", "vamos"], | ||
["lemma", "ir"] | ||
]], | ||
["token", [ | ||
["id", "2"], | ||
["form", "nos"], | ||
["lemma", "nosotros"] | ||
]], | ||
["token", [ | ||
["id", "3-4"], | ||
["form", "al"], | ||
["lemma", "_"] | ||
]], | ||
["token", [ | ||
["id", "3"], | ||
["form", "a"], | ||
["lemma", "a"] | ||
]], | ||
["token", [ | ||
["id", "4"], | ||
["form", "el"], | ||
["lemma", "el"] | ||
]], | ||
["token", [ | ||
["id", "5"], | ||
["form", "mar"], | ||
["lemma", "mar"] | ||
]], | ||
["sentence-separator", ""], | ||
["token", [ | ||
["id", "1"], | ||
["form", "Sue"], | ||
["lemma", "Sue"] | ||
]], | ||
["token", [ | ||
["id", "2"], | ||
["form", "likes"], | ||
["lemma", "like"] | ||
]], | ||
["token", [ | ||
["id", "3"], | ||
["form", "coffee"], | ||
["lemma", "coffee"] | ||
]], | ||
["token", [ | ||
["id", "4"], | ||
["form", "and"], | ||
["lemma", "and"] | ||
]], | ||
["token", [ | ||
["id", "5"], | ||
["form", "Bill"], | ||
["lemma", "Bill"] | ||
]], | ||
["token", [ | ||
["id", "5.1"], | ||
["form", "likes"], | ||
["lemma", "like"] | ||
]], | ||
["token", [ | ||
["id", "6"], | ||
["form", "tea"], | ||
["lemma", "tea"] | ||
]] | ||
] | ||
|
||
---------------------------------------------------- | ||
|
||
Testing indexing schemes. | ||
|
||
https://universaldependencies.org/format.html |
Oops, something went wrong.