Skip to content

Commit

Permalink
(1) Automatic validation of data-files (.dat) on startup (2) tiny sum…
Browse files Browse the repository at this point in the history
…mary-improvement
  • Loading branch information
some-avail committed Sep 19, 2023
1 parent 871de97 commit 065be2d
Show file tree
Hide file tree
Showing 6 changed files with 229 additions and 10 deletions.
1 change: 1 addition & 0 deletions mostfiles/parse_dutch.dat
Original file line number Diff line number Diff line change
Expand Up @@ -384,6 +384,7 @@ geraakt
meldt
meld
melden
meldd
gemeld
slaan
slaat
Expand Down
12 changes: 7 additions & 5 deletions mostfiles/process_text.nim
Original file line number Diff line number Diff line change
Expand Up @@ -1105,7 +1105,7 @@ proc extractSentencesFromText(input_tekst, languagest:string,
# add the first sentences always to the summary
if sentencecountit <= introductionit:
if sentencest.len < stringsizeit:
summarysq.add(sentencest)
summarysq.add(sentencest & ". ")
else:
processingbo = false # header not yet reached

Expand Down Expand Up @@ -1134,15 +1134,15 @@ proc extractSentencesFromText(input_tekst, languagest:string,
countit = count(sentencest, '.')
if countit == 0 or countit > 1:
summarysq.add("<br>" & $sentencecountit & " ===============================" & "<br><br>")
summarysq.add(sentencest & ".")
summarysq.add(sentencest & ". ")

elif countit == 1:
summarysq.add("<br>" & $sentencecountit & " ===============================" & "<br><br>")
linesq = sentencest.split('.')
leftpartst = linesq[0]
rightpartst = linesq[1]
if leftpartst.contains(line): summarysq.add(leftpartst & ".")
if rightpartst.contains(line): summarysq.add(rightpartst & ".")
if leftpartst.contains(line): summarysq.add(leftpartst & ". ")
if rightpartst.contains(line): summarysq.add(rightpartst & ". ")


# to prevent more adds for more extraction-words
Expand All @@ -1161,7 +1161,9 @@ proc extractSentencesFromText(input_tekst, languagest:string,
# concatenate extracted sentences to text
summaryst = "Number of extractions: " & $linecountit & "<br><br>"
for senst in summarysq:
summaryst &= strip(senst, true, true)
# summaryst &= strip(senst, true, true)
summaryst &= senst


except IOError:
echo "IO error!"
Expand Down
2 changes: 1 addition & 1 deletion mostfiles/settings_flashread.conf
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ port-number___5050___the port thru which to connect to the web-server___
fr_checkset1___none___default values of the named checkboxset___false,,false,,false,,false,,false

>>>Other<<<
abbreviations___none___abbreviations from which the dots must be pre-removed___Dr. ,,U.S.,,Mr. ,,etc. ,,B.C.,,A.D.
abbreviations___none___abbreviations from which the dots must be pre-removed___Dr. ,,U.S.,,Mr. ,,etc. ,,B.C.,,A.D.,,D.C.


>>>Test<<<
Expand Down
169 changes: 165 additions & 4 deletions mostfiles/source_files.nim
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,30 @@
-to limit file-access
]#

import strutils
import strutils, sequtils
import tables
import os
import fr_tools



type
  DataFileType* = enum
    ## Selects which group of data-files is meant.
    datFileLanguage
    datFileSummary
    datFileAll

  FileSpecs = object of RootObj
    ## Base record with the general properties of a file.
    fsName: string
    fsVersion: float

  FilePhase = object of FileSpecs
    ## Validation record for one block-phase of one file.
    phaNameFull: string       ## block-header-name
    phaSequenceNum: int       ## order-num in phase-list
    phaNameCount: int         ## should be 1
    phaItemCount: int         ## preferably > 0
    phaHasEmptyItem: bool     ## = zero-length line; must be false
    phaEndMarkerFound: bool   ## must be true


var
versionfl:float = 0.2
Expand All @@ -27,6 +38,31 @@ var
sourcefilestatust*: string = ""
faultsfoundbo: bool = false

parse_file_phasesq = @[
"PUNCTUATION OF SENTENCES TO HANDLE",
"PUNCTUATION OF SENTENCE-PARTS TO HANDLE",
"PRONOUNS TO HANDLE",
"VERBS TO HANDLE",
"LINK-WORDS TO HANDLE",
"PREPOSITIONS TO HANDLE",
"NOUN-ANNOUNCERS TO HANDLE",
"NOUN-REPLACERS TO HANDLE",
"AMBIGUOUS WORD-FUNCTIONS TO HANDLE"]

summary_file_phasesq = @["SIGNAL-WORDS TO HANDLE"]



template withFile*(f, fn, mode, actions: untyped): untyped =
  ## Open the file `fn` in `mode`, bind the handle to the new variable `f`
  ## and run `actions`; the file is closed afterwards, also when `actions`
  ## raises. When the file cannot be opened the program is ended with a
  ## "cannot open" message.
  var f: File
  if not open(f, fn, mode):
    quit("cannot open: " & fn)
  try:
    actions
  finally:
    close(f)


proc addLanguageFilesToList() =
# Dynamically add the language.dat files from the config-file
Expand Down Expand Up @@ -99,8 +135,132 @@ proc writeFilePatternToSeq*(filestartwithst: string): seq[string] =


proc evaluateDataFiles*(filetypeu: DataFileType): string =
  ## Validate the block structure of the data-files and return the findings
  ## as an HTML-formatted report (a phase without comment is OK).
  ##
  ## `filetypeu` selects what is checked: `datFileLanguage` the "parse_"
  ## files, `datFileSummary` the "summary_" files and `datFileAll` both.
  ##
  ## For every expected block-phase of a file is checked that:
  ## - the block-header occurs exactly once
  ## - the block is closed by the end-marker
  ## - the block has items, none of which is a zero-length line
  ##
  ## Only the first applicable complaint of the list above is reported for
  ## a phase; the item-complaints can occur together.
  ##
  ## Note: a file that cannot be opened ends the program (see `withFile`).

  const
    endmarkerst = ">----------------------------------<"
    headersuffixlenit = " TO HANDLE".len    # left out of the table-key
    endst = "<br>\p"
    verbosebo = false                       # true: also dump every record

  proc phaseKey(filest, phasest: string): string =
    # table-key = file-name plus the phase-name without " TO HANDLE"
    filest & "___" & phasest[0 ..< phasest.len - headersuffixlenit]

  var
    filesq: seq[string]
    file_reportta = initOrderedTable[string, FilePhase]()

  # gather the files that belong to the requested type
  if filetypeu in {datFileLanguage, datFileAll}:
    filesq.add(writeFilePatternToSeq("parse_"))
  if filetypeu in {datFileSummary, datFileAll}:
    filesq.add(writeFilePatternToSeq("summary_"))

  for filest in filesq:
    # select correct phase-sequence
    var phasesq: seq[string]
    if filest.startsWith("parse"):
      phasesq = parse_file_phasesq
    elif filest.startsWith("summa"):
      phasesq = summary_file_phasesq
    else:
      continue    # unknown kind of file; no phase-list to validate against

    # preset one record per expected phase, so that a missing phase shows up
    for indexit, phasest in phasesq:
      file_reportta[phaseKey(filest, phasest)] = FilePhase(
        fsName: filest,
        phaNameFull: phasest,
        phaSequenceNum: indexit + 1)

    # The scan-state is kept per file; otherwise a missing end-marker in one
    # file lets the header-lines of the next file be counted as items.
    var
      inphasebo = false
      tablekeyst = ""

    withFile(txt, filest, fmRead):
      for linest in txt.lines:
        if linest in phasesq:
          # block-header reached; (re)start the item-count of this phase
          inphasebo = true
          tablekeyst = phaseKey(filest, linest)
          file_reportta[tablekeyst].phaNameCount += 1
          file_reportta[tablekeyst].phaItemCount = 0
        elif inphasebo:
          if linest == endmarkerst:
            file_reportta[tablekeyst].phaEndMarkerFound = true
            inphasebo = false
          else:
            # walking thru items; a zero-length line counts as a (faulty) item
            if linest.len == 0:
              file_reportta[tablekeyst].phaHasEmptyItem = true
            file_reportta[tablekeyst].phaItemCount += 1

  # compose the report from the gathered records
  result = "<b>Validation of the datafiles (no comment = OK):</b>\p<br>"
  var formerfilest = ""

  for valob in file_reportta.values:
    # the file-name is printed once, above its phases
    if valob.fsName != formerfilest:
      result &= valob.fsName & endst
      formerfilest = valob.fsName

    var complaintst = ""
    if valob.phaNameCount == 0:
      complaintst = "---- This block-phase is not found (or mis-spelled)" & endst
    elif valob.phaNameCount > 1:
      complaintst = "---- This block-phase occurs multiple times: " &
          $valob.phaNameCount & endst
    elif not valob.phaEndMarkerFound:
      complaintst = "---- This block-phase has no (valid) end-marker" & endst
    else:
      # the item-checks are only meaningful for a well-delimited block
      if valob.phaItemCount == 0:
        complaintst &= "---- This block-phase has NO items (no lines)" & endst
      if valob.phaHasEmptyItem:
        complaintst &= "---- This block-phase has EMPTY items (zero-length lines)" & endst

    if complaintst.len > 0 or verbosebo:
      result &= "++ " & valob.phaNameFull & endst
      result &= complaintst
      if verbosebo:
        result &= $valob & endst



Expand All @@ -111,5 +271,6 @@ loadTextSourceFiles()

when isMainModule:
  # Manual check; only executed when this module is compiled as the main program.
  # echo textsourcefileta["dutch.dat"]
  echo sourcefilestatust
  # echo sourcefilestatust

  # Print the validation report for all data-file types.
  echo evaluateDataFiles(datFileAll)
35 changes: 35 additions & 0 deletions mostfiles/summary_english_small.dat
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
Extraction-strings for summarizing with Flashread.
Language: english
Summary-type: generic
version: 1.3



SIGNAL-WORDS TO HANDLE
scenario
theor
hypothe
assum
caus
effect
result
consequence
conclu
plan
plans
goal
agenda
summar
outcome
input
output
relat
variabl
question
answer
evidence
proof
prove
>----------------------------------<


20 changes: 20 additions & 0 deletions mostfiles/summary_english_tiny.dat
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
Extraction-strings for summarizing with Flashread.
Language: english
Summary-type: generic
version: 1.3




SIGNAL-WORDS TO HANDLE
scenario
theor
hypothe
plan
plans
goal
agenda
>----------------------------------<



0 comments on commit 065be2d

Please sign in to comment.