From cd6641a261f83e20761ba352f61b12452029e25b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Santanch=C3=A8?= Date: Thu, 21 Dec 2023 21:20:24 -0300 Subject: [PATCH] feat (annotator): BILOU export --- .../public/report/annotations/index.html | 10 +- .../public/report/js/report-annotations.js | 183 ++++++++++++++++-- 2 files changed, 180 insertions(+), 13 deletions(-) diff --git a/src/adonisjs/public/report/annotations/index.html b/src/adonisjs/public/report/annotations/index.html index 868ac0e2..704f583e 100644 --- a/src/adonisjs/public/report/annotations/index.html +++ b/src/adonisjs/public/report/annotations/index.html @@ -23,6 +23,7 @@ + @@ -61,7 +62,9 @@ - + + + @@ -98,7 +101,10 @@ - + + + diff --git a/src/adonisjs/public/report/js/report-annotations.js b/src/adonisjs/public/report/js/report-annotations.js index f798548d..459e48cd 100644 --- a/src/adonisjs/public/report/js/report-annotations.js +++ b/src/adonisjs/public/report/js/report-annotations.js @@ -1,7 +1,7 @@ class ReportManager { start () { - this._download = this._download.bind(this) - MessageBus.i.subscribe('report/download', this._download) + MessageBus.i.subscribe('report/download', this._downloadAnalysis.bind(this)) + MessageBus.i.subscribe('report/bilou', this._downloadBILOU.bind(this)) this._roomId = new URL(document.location).searchParams.get('roomid') } @@ -78,6 +78,141 @@ class ReportManager { return result } + async _buildBILOU (caseId, annotations) { + const tt = await this._tokenize(caseId) + const tokens = tt.tokens + + const ranges = [] + for (const an of annotations) { + // find a proper category + let cat = null + for (const c of an.categories) + if (ReportManager.catList.includes(c)) { + cat = c + break + } + + if (cat != null) { + if (!this._rangeConflict(ranges, an.fragments)) { + this._addRanges(ranges, an.fragments) + for (let f = 0; f < an.fragments.length; f++) { + let firstMatch = true + let firstToken = null + let prevLast = null + for (const tk of tokens) { + if (tk[1] >= an.fragments[f].start && + tk[1] <= an.fragments[f].start + an.fragments[f].size - 1) { + tk[3] = + (an.fragments.length == 1 && firstMatch) + ? 'U' + : ((f == 0 && firstMatch) + ? 'B' + : ((f+1 < an.fragments.length) || (tk[2] < an.fragments[f].last)) + ? 'I' + : 'L') + tk[4] = cat + if (firstMatch) + firstToken = tk + else if (firstToken != -1) { + firstToken[3] = 'B' + firstToken = -1 + } + firstMatch = false + if (prevLast != null) { + prevLast[3] = 'I' + prevLast = null + } + if (tk[3] == 'L') + prevLast = tk + } + } + } + } + } + } + + console.log('=== tokens NER') + console.log(tokens) + + return { + doc_id: caseId, + text: tt.text, + labels: tokens + } + } + + async _tokenize (caseId) { + + const tokens = [] + + let result = {} + + const cs = await MessageBus.i.request('case/source/get', + {room_id: this._roomId, case_id: caseId}) + + if (cs != null && cs.message != null) { + const compiled = + await Translator.instance.compileMarkdown( + cs.message.id, cs.message.source) + + let text = '' + for (const knot in compiled.knots) { + const mkHTML = + await Translator.instance.generateHTML(compiled.knots[knot]) + text += mkHTML + } + + text = text.replace(/<[^>]+>/gm,'') + .replace(/<\/[^>]+>/gm, '') + .replace(/^[\r\n][\r\n]?/, '') + + let c = 0 + let tk = '' + let tks = -1 + while (c <= text.length) { + if (c == text.length || ReportManager.separators.includes(text[c])) { + if (tks != -1) { + tokens.push([tk, tks, c-1, 'O', null]) + tk = '' + tks = -1 + } + } else { + if (tks == -1) + tks = c + tk += text[c] + } + c++ + } + + result = { + text: text, + tokens: tokens + } + } + + return result + } + + _rangeConflict (ranges, fragments) { + const start = fragments[0].start + const final = fragments[fragments.length-1].start + fragments[fragments.length-1].size - 1 + let r = 0 + while (r < ranges.length && start < ranges[r][1]) { + if ((start >= ranges[r][0] && start <= ranges[r][1]) || + (final >= ranges[r][0] && final <= ranges[r][1]) || + (ranges[r][0] >= start && ranges[r][0] <= final) || + (ranges[r][1] >= start && ranges[r][1] <= final)) + return true + r++ + } + return false + } + + _addRanges (ranges, fragments) { + for (const f of fragments) + ranges.push([f.start, f.start + f.size - 1]) + } + _calculateMetrics (annotations) { let ctideas = 0, ctright = 0, ctinfright = 0 let ctwrong = 0, ctrightencap = 0, ctinfrightencap = 0, ctwrongencap = 0 @@ -141,7 +276,26 @@ class ReportManager { `${(ctideas == 0) ? 0 : selfOrder.score / ctideas},${clustering}${countCat},"${o1csv}","${o2csv}"` } - async _download () { + async _downloadBILOU () { + const tprefix = document.querySelector('#tprefix').value + + const cases = await this._requestCases() + + console.log(cases) + let table = '' + + if (cases != null) { + for (const c of cases.message) { + const ant = await this._loadAnnotations(c.id) + const bilou = await this._buildBILOU(c.id, ant.annotations) + table += JSON.stringify(bilou) + '\n' + } + + this._download(table) + } + } + + async _downloadAnalysis () { const tprefix = document.querySelector('#tprefix').value const cases = await this._requestCases() @@ -172,21 +326,28 @@ class ReportManager { metrics + '\n' } - const element = document.createElement('a') - element.setAttribute('href', - 'data:text/plain;charset=utf-8,' + encodeURIComponent(table)) - element.setAttribute('download', 'annotations.csv') - element.style.display = 'none' - document.body.appendChild(element) - element.click() - document.body.removeChild(element) + this._download(table) } } + + _download (table) { + const element = document.createElement('a') + element.setAttribute('href', + 'data:text/plain;charset=utf-8,' + encodeURIComponent(table)) + element.setAttribute('download', 'annotations.csv') + element.style.display = 'none' + document.body.appendChild(element) + element.click() + document.body.removeChild(element) + } } (function () { ReportManager.i = new ReportManager() + ReportManager.separators = [ + ' ', '\n', '\r', '\t', '.', ',', ';', ':', '(', ')', '[', ']', '{', '}'] + // Copy of the same constants in annotator.js // isc: Illness Script Components ReportManager.category = {