diff --git a/mostfiles/parse_dutch.dat b/mostfiles/parse_dutch.dat index b224e6f..a501094 100644 --- a/mostfiles/parse_dutch.dat +++ b/mostfiles/parse_dutch.dat @@ -384,6 +384,7 @@ geraakt meldt meld melden +meldd gemeld slaan slaat diff --git a/mostfiles/process_text.nim b/mostfiles/process_text.nim index b4a9eb7..a5c79eb 100644 --- a/mostfiles/process_text.nim +++ b/mostfiles/process_text.nim @@ -1105,7 +1105,7 @@ proc extractSentencesFromText(input_tekst, languagest:string, # add the first sentences always to the summary if sentencecountit <= introductionit: if sentencest.len < stringsizeit: - summarysq.add(sentencest) + summarysq.add(sentencest & ". ") else: processingbo = false # header not yet reached @@ -1134,15 +1134,15 @@ proc extractSentencesFromText(input_tekst, languagest:string, countit = count(sentencest, '.') if countit == 0 or countit > 1: summarysq.add("
" & $sentencecountit & " ===============================" & "

") - summarysq.add(sentencest & ".") + summarysq.add(sentencest & ". ") elif countit == 1: summarysq.add("
" & $sentencecountit & " ===============================" & "

") linesq = sentencest.split('.') leftpartst = linesq[0] rightpartst = linesq[1] - if leftpartst.contains(line): summarysq.add(leftpartst & ".") - if rightpartst.contains(line): summarysq.add(rightpartst & ".") + if leftpartst.contains(line): summarysq.add(leftpartst & ". ") + if rightpartst.contains(line): summarysq.add(rightpartst & ". ") # to prevent more adds for more extraction-words @@ -1161,7 +1161,9 @@ proc extractSentencesFromText(input_tekst, languagest:string, # concatenate extracted sentences to text summaryst = "Number of extractions: " & $linecountit & "

" for senst in summarysq: - summaryst &= strip(senst, true, true) + # summaryst &= strip(senst, true, true) + summaryst &= senst + except IOError: echo "IO error!" diff --git a/mostfiles/settings_flashread.conf b/mostfiles/settings_flashread.conf index 588ca4c..c85ded7 100644 --- a/mostfiles/settings_flashread.conf +++ b/mostfiles/settings_flashread.conf @@ -20,7 +20,7 @@ port-number___5050___the port thru which to connect to the web-server___ fr_checkset1___none___default values of the named checkboxset___false,,false,,false,,false,,false >>>Other<<< -abbreviations___none___abbreviations from which the dots must be pre-removed___Dr. ,,U.S.,,Mr. ,,etc. ,,B.C.,,A.D. +abbreviations___none___abbreviations from which the dots must be pre-removed___Dr. ,,U.S.,,Mr. ,,etc. ,,B.C.,,A.D.,,D.C. >>>Test<<< diff --git a/mostfiles/source_files.nim b/mostfiles/source_files.nim index 73dc693..93bec34 100644 --- a/mostfiles/source_files.nim +++ b/mostfiles/source_files.nim @@ -4,19 +4,30 @@ -to limt file-access ]# -import strutils +import strutils, sequtils import tables import os import fr_tools - type DataFileType* = enum datFileLanguage datFileSummary datFileAll + FileSpecs = object of RootObj + fsName: string + fsVersion: float + + FilePhase = object of FileSpecs + phaNameFull: string # block-header-name + phaSequenceNum: int # order-num in phase-list + phaNameCount: int # should be 1 + phaItemCount: int # preferably > 0 + phaHasEmptyItem: bool # = zero-length line; must be false + phaEndMarkerFound: bool # must be true + var versionfl:float = 0.2 @@ -27,6 +38,31 @@ var sourcefilestatust*: string = "" faultsfoundbo: bool = false + parse_file_phasesq = @[ + "PUNCTUATION OF SENTENCES TO HANDLE", + "PUNCTUATION OF SENTENCE-PARTS TO HANDLE", + "PRONOUNS TO HANDLE", + "VERBS TO HANDLE", + "LINK-WORDS TO HANDLE", + "PREPOSITIONS TO HANDLE", + "NOUN-ANNOUNCERS TO HANDLE", + "NOUN-REPLACERS TO HANDLE", + "AMBIGUOUS WORD-FUNCTIONS TO HANDLE"] + + summary_file_phasesq = @["SIGNAL-WORDS TO HANDLE"] + + + +template withFile*(f, fn, mode, actions: untyped): untyped = + var f: File + if open(f, fn, mode): + try: + actions + finally: + close(f) + else: + quit("cannot open: " & fn) + proc addLanguageFilesToList() = # Dynamicly add the language.dat files from the config-file @@ -99,8 +135,132 @@ proc writeFilePatternToSeq*(filestartwithst: string): seq[string] = proc evaluateDataFiles*(filetypeu: DataFileType): string = + #[ + - gather the files + - move thru the lines + - search the first / next cat-header + if not found then report + test the items until end-marker + report if no or empty items + ]# + + var + parse_lang_filesq, summary_filesq, all_filesq: seq[string] + file_reportta = initOrderedTable[string, FilePhase]() + tablekeyst: string + reportst: string = "Validation of the datafiles (no comment = OK):\p
" + phasecountit, itemcountit: int + inphasebo: bool = false + endmarkerst: string = ">----------------------------------<" + phasesq: seq[string] + + + parse_lang_filesq = writeFilePatternToSeq("parse_") + summary_filesq = writeFilePatternToSeq("summary_") + all_filesq = concat(parse_lang_filesq, summary_filesq) + + if filetypeu == datFileAll or filetypeu == datFileLanguage: + # parse_lang_filesq = writeFilePatternToSeq("parse_") + for filest in all_filesq: + phasecountit = 1 + # select correct phase-sequence + case filest[0..4] + of "parse": + phasesq = parse_file_phasesq + of "summa": + phasesq = summary_file_phasesq + + for phasest in phasesq: + tablekeyst = filest & "___" & phasest[0..phasest.len - 11] + # preset objects for file + file_reportta[tablekeyst] = FilePhase( + fsName: filest, + phaNameFull: phasest, + phaNameCount: 0, + phaSequenceNum: phasecountit, + phaItemCount: 0, + phaHasEmptyItem: false, + phaEndMarkerFound: false + ) + phasecountit += 1 + + withFile(txt, filest, fmRead): + for linest in txt.lines: + if linest in phasesq: + inphasebo = true + itemcountit = 0 + # blockphase reached; update object + tablekeyst = filest & "___" & linest[0..linest.len - 11] + file_reportta[tablekeyst].phaNameCount += 1 + elif inphasebo: + if linest == endmarkerst: + file_reportta[tablekeyst].phaItemCount = itemcountit + file_reportta[tablekeyst].phaEndMarkerFound = true + inphasebo = false + else: # walking thru items + if linest.len == 0: + file_reportta[tablekeyst].phaHasEmptyItem = true + file_reportta[tablekeyst].phaItemCount = itemcountit + itemcountit += 1 + + + # echo file_reportta + + var + curfilest, formerfilest: string + curphasest, formerphasest: string + complaintst, endst, startst: string + faultfoundbo: bool = false + verbosebo: bool = false + skip_othersbo: bool = false + + startst = "
\p" + endst = "
\p" + + for keyst, valob in file_reportta: + + curfilest = valob.fsName + curphasest = valob.phaNameFull + if curfilest != formerfilest: + reportst &= curfilest & endst + + complaintst = "" + + if valob.phaNameCount == 0: + complaintst &= "---- This block-phase is not found (or mis-spelled)" & endst + faultfoundbo = true + skip_othersbo = true + elif valob.phaNameCount > 1: + complaintst &= "---- This block-phase occurs multiple times: " & $valob.phaNameCount & endst + faultfoundbo = true + skip_othersbo = true + if not valob.phaEndMarkerFound: + if not skip_othersbo: + complaintst &= "---- This block-phase has no (valid) end-marker" & endst + faultfoundbo = true + skip_othersbo = true + if valob.phaItemCount == 0: + if not skip_othersbo: + complaintst &= "---- This block-phase has NO items (no lines)" & endst + faultfoundbo = true + if valob.phaHasEmptyItem: + if not skip_othersbo: + complaintst &= "---- This block-phase has EMPTY items (zero-length lines)" & endst + faultfoundbo = true + + formerfilest = valob.fsName + + if faultfoundbo or verbosebo: + reportst &= "++ " & curphasest & endst + reportst &= complaintst + if verbosebo: + reportst &= $valob & endst + + faultfoundbo = false + skip_othersbo = false + + result = reportst - result = "Nothing evaluated yet" @@ -111,5 +271,6 @@ loadTextSourceFiles() when isMainModule: # echo textsourcefileta["dutch.dat"] - echo sourcefilestatust + # echo sourcefilestatust + echo evaluateDataFiles(datFileAll) \ No newline at end of file diff --git a/mostfiles/summary_english_small.dat b/mostfiles/summary_english_small.dat new file mode 100644 index 0000000..fc63f3e --- /dev/null +++ b/mostfiles/summary_english_small.dat @@ -0,0 +1,35 @@ +Extraction-strings for summarizing with Flashread. +Language: english +Summary-type: generic +version: 1.3 + + + +SIGNAL-WORDS TO HANDLE +scenario +theor +hypothe +assum +caus +effect +result +consequence +conclu +plan +plans +goal +agenda +summar +outcome +input +output +relat +variabl +question +answer +evidence +proof +prove +>----------------------------------< + + diff --git a/mostfiles/summary_english_tiny.dat b/mostfiles/summary_english_tiny.dat new file mode 100644 index 0000000..c391d64 --- /dev/null +++ b/mostfiles/summary_english_tiny.dat @@ -0,0 +1,20 @@ +Extraction-strings for summarizing with Flashread. +Language: english +Summary-type: generic +version: 1.3 + + + + +SIGNAL-WORDS TO HANDLE +scenario +theor +hypothe +plan +plans +goal +agenda +>----------------------------------< + + +