(1) Automatic validation of data-files (.dat) on startup (2) tiny summary-improvement
some-avail · Sep 19, 2023 · 065be2d · 065be2d
1 parent 871de97
commit 065be2d
Show file tree

Hide file tree

Showing 6 changed files with 229 additions and 10 deletions.
diff --git a/mostfiles/parse_dutch.dat b/mostfiles/parse_dutch.dat
@@ -384,6 +384,7 @@ geraakt
 meldt 
 meld 
 melden 
+meldd
 gemeld 
 slaan
 slaat

diff --git a/mostfiles/process_text.nim b/mostfiles/process_text.nim
@@ -1105,7 +1105,7 @@ proc extractSentencesFromText(input_tekst, languagest:string,
         # add the first sentences always to the summary
         if sentencecountit <= introductionit:
           if sentencest.len < stringsizeit:
-            summarysq.add(sentencest)
+            summarysq.add(sentencest & ". ")
         else:
           processingbo = false  # header not yet reached
 
@@ -1134,15 +1134,15 @@ proc extractSentencesFromText(input_tekst, languagest:string,
                     countit = count(sentencest, '.')
                     if countit == 0 or  countit > 1:
                       summarysq.add("<br>" & $sentencecountit & " ===============================" & "<br><br>")
-                      summarysq.add(sentencest & ".")
+                      summarysq.add(sentencest & ". ")
 
                     elif countit == 1:
                       summarysq.add("<br>" &  $sentencecountit & " ===============================" & "<br><br>")
                       linesq = sentencest.split('.')
                       leftpartst = linesq[0]
                       rightpartst = linesq[1]
-                      if leftpartst.contains(line): summarysq.add(leftpartst & ".")
-                      if rightpartst.contains(line): summarysq.add(rightpartst & ".")
+                      if leftpartst.contains(line): summarysq.add(leftpartst & ". ")
+                      if rightpartst.contains(line): summarysq.add(rightpartst & ". ")
 
 
                   # to prevent more adds for more extraction-words
@@ -1161,7 +1161,9 @@ proc extractSentencesFromText(input_tekst, languagest:string,
       # concatenate extracted sentences to text
       summaryst = "Number of extractions: " & $linecountit & "<br><br>"
       for senst in summarysq:
-        summaryst &= strip(senst, true, true)
+        # summaryst &= strip(senst, true, true)
+        summaryst &= senst
+
 
     except IOError:
       echo "IO error!"

diff --git a/mostfiles/settings_flashread.conf b/mostfiles/settings_flashread.conf
@@ -20,7 +20,7 @@ port-number___5050___the port thru which to connect to the web-server___
 fr_checkset1___none___default values of the named checkboxset___false,,false,,false,,false,,false
 
 >>>Other<<<
-abbreviations___none___abbreviations from which the dots must be pre-removed___Dr. ,,U.S.,,Mr. ,,etc. ,,B.C.,,A.D.
+abbreviations___none___abbreviations from which the dots must be pre-removed___Dr. ,,U.S.,,Mr. ,,etc. ,,B.C.,,A.D.,,D.C.
 
 
 >>>Test<<<

diff --git a/mostfiles/source_files.nim b/mostfiles/source_files.nim
@@ -4,19 +4,30 @@
   -to limt file-access
  ]#
 
-import strutils
+import strutils, sequtils
 import tables
 import os
 import fr_tools
 
 
-
 type
   DataFileType* = enum
     datFileLanguage
     datFileSummary
     datFileAll
 
+  FileSpecs = object of RootObj
+    fsName: string        # name of the data-file this record describes
+    fsVersion: float      # NOTE(review): not assigned anywhere in the visible code
+
+  FilePhase = object of FileSpecs
+    phaNameFull: string   # block-header-name, as the line must appear in the file
+    phaSequenceNum: int   # order-num in phase-list (counting starts at 1)
+    phaNameCount: int     # times the block-header was found; should be 1
+    phaItemCount: int     # lines counted between header and end-marker; preferably > 0
+    phaHasEmptyItem: bool    # = zero-length line inside the block; must be false
+    phaEndMarkerFound: bool   # end-marker-line seen after the header; must be true
+
 
 var
   versionfl:float = 0.2
@@ -27,6 +38,31 @@ var
   sourcefilestatust*: string = ""
   faultsfoundbo: bool = false
 
+  parse_file_phasesq = @[
+        "PUNCTUATION OF SENTENCES TO HANDLE",
+        "PUNCTUATION OF SENTENCE-PARTS TO HANDLE",
+        "PRONOUNS TO HANDLE",
+        "VERBS TO HANDLE",
+        "LINK-WORDS TO HANDLE",
+        "PREPOSITIONS TO HANDLE",
+        "NOUN-ANNOUNCERS TO HANDLE",
+        "NOUN-REPLACERS TO HANDLE",
+        "AMBIGUOUS WORD-FUNCTIONS TO HANDLE"]
+
+  summary_file_phasesq = @["SIGNAL-WORDS TO HANDLE"]
+
+
+
+template withFile*(f, fn, mode, actions: untyped): untyped =
+  ## Declare `f` as a `File`, open the file named `fn` in `mode`, run
+  ## `actions` and close the file afterwards, also when `actions` raises.
+  ##
+  ## `f` is the identifier chosen by the caller; `actions` uses it to reach
+  ## the opened file. When the file cannot be opened the program is ended
+  ## through `quit` with the message "cannot open: " followed by `fn`.
+  var f: File
+  if not open(f, fn, mode):
+    # quit does not return, so nothing below runs for an unopened file
+    quit("cannot open: " & fn)
+  try:
+    actions
+  finally:
+    close(f)
+
 
 proc addLanguageFilesToList() =
   # Dynamicly add the language.dat files from the config-file 
@@ -99,8 +135,132 @@ proc writeFilePatternToSeq*(filestartwithst: string): seq[string] =
 
 
 proc evaluateDataFiles*(filetypeu: DataFileType): string = 
+  ## Validate the block-structure of the data-files (.dat) and return the
+  ## findings as an html-formatted report. Every checked file is listed by
+  ## name; a block-phase is only listed when something is wrong with it.
+  ##
+  ## `filetypeu` selects what is checked: `datFileLanguage` the files found
+  ## with pattern "parse_", `datFileSummary` those found with "summary_",
+  ## and `datFileAll` both (parse-files first).
+  ##
+  ## Per file every expected block-header must occur exactly once, must be
+  ## closed by the end-marker and should hold at least one item and no
+  ## zero-length items. Only the most fundamental problem of a block is
+  ## reported: item-checks are skipped when header or end-marker is wrong.
+  ##
+  ## A file that cannot be opened ends the program (behaviour of `withFile`).
+
+  const
+    endmarkerst = ">----------------------------------<"
+    endst = "<br>\p"
+    verbosebo = false   # true = list every phase and dump its object
+
+  var
+    file_reportta = initOrderedTable[string, FilePhase]()
+    tablekeyst: string
+    phasesq: seq[string]
+    itemcountit: int
+    inphasebo: bool
+
+  # gather only the files that were asked for
+  let filesq =
+    case filetypeu
+    of datFileLanguage: writeFilePatternToSeq("parse_")
+    of datFileSummary: writeFilePatternToSeq("summary_")
+    of datFileAll:
+      concat(writeFilePatternToSeq("parse_"),
+             writeFilePatternToSeq("summary_"))
+
+  for filest in filesq:
+    # select the phase-sequence that belongs to this kind of file
+    phasesq =
+      if filest.startsWith("parse"): parse_file_phasesq
+      else: summary_file_phasesq
+
+    # preset one report-object per expected phase; the key is the file-name
+    # plus the header without its trailing " TO HANDLE" (10 chars)
+    for phasenumit, phasest in phasesq:
+      tablekeyst = filest & "___" & phasest[0..phasest.len - 11]
+      file_reportta[tablekeyst] = FilePhase(
+        fsName: filest,
+        phaNameFull: phasest,
+        phaNameCount: 0,
+        phaSequenceNum: phasenumit + 1,
+        phaItemCount: 0,
+        phaHasEmptyItem: false,
+        phaEndMarkerFound: false
+        )
+
+    # The scan-state is per file. Without this reset an unterminated last
+    # block would book the opening lines of the next file on the wrong phase.
+    inphasebo = false
+    itemcountit = 0
+
+    withFile(txt, filest, fmRead):
+      for linest in txt.lines:
+        if linest in phasesq:
+          # block-header reached; the following lines are its items
+          inphasebo = true
+          itemcountit = 0
+          tablekeyst = filest & "___" & linest[0..linest.len - 11]
+          file_reportta[tablekeyst].phaNameCount += 1
+        elif inphasebo:
+          if linest == endmarkerst:
+            file_reportta[tablekeyst].phaItemCount = itemcountit
+            file_reportta[tablekeyst].phaEndMarkerFound = true
+            inphasebo = false
+          else:   # walking thru items
+            # count first, then store, so that the stored number is also
+            # right for a block that never reaches its end-marker
+            itemcountit += 1
+            file_reportta[tablekeyst].phaItemCount = itemcountit
+            if linest.len == 0:
+              file_reportta[tablekeyst].phaHasEmptyItem = true
+
+  # echo file_reportta
+
+  result = "<b>Validation of the datafiles (no comment = OK):</b>\p<br>"
+
+  var formerfilest: string
+
+  for valob in file_reportta.values:
+    # print the file-name once, above the phases of that file
+    if valob.fsName != formerfilest:
+      result &= valob.fsName & endst
+      formerfilest = valob.fsName
+
+    # header-problems hide end-marker-problems, which in turn hide the
+    # item-problems; the two item-checks are independent of each other
+    var complaintst = ""
+    if valob.phaNameCount == 0:
+      complaintst &= "---- This block-phase is not found (or mis-spelled)" & endst
+    elif valob.phaNameCount > 1:
+      complaintst &= "---- This block-phase occurs multiple times: " & $valob.phaNameCount & endst
+    elif not valob.phaEndMarkerFound:
+      complaintst &= "---- This block-phase has no (valid) end-marker" & endst
+    else:
+      if valob.phaItemCount == 0:
+        complaintst &= "---- This block-phase has NO items (no lines)" & endst
+      if valob.phaHasEmptyItem:
+        complaintst &= "---- This block-phase has EMPTY items (zero-length lines)" & endst
+
+    if complaintst.len > 0 or verbosebo:
+      result &= "++ " & valob.phaNameFull & endst
+      result &= complaintst
+      if verbosebo:
+        result &= $valob & endst
 
-  result = "Nothing evaluated yet"
 
 
 
@@ -111,5 +271,6 @@ loadTextSourceFiles()
 
 when isMainModule:
   # echo textsourcefileta["dutch.dat"]
-  echo sourcefilestatust
+  # echo sourcefilestatust
 
+  echo evaluateDataFiles(datFileAll)   # stand-alone run: print the validation-report of all data-files
diff --git a/mostfiles/summary_english_small.dat b/mostfiles/summary_english_small.dat
@@ -0,0 +1,35 @@
+Extraction-strings for summarizing with Flashread.
+Language: english
+Summary-type: generic
+version: 1.3
+
+
+
+SIGNAL-WORDS TO HANDLE
+scenario
+theor
+hypothe
+assum
+caus
+effect
+result
+consequence
+conclu
+plan 
+plans 
+goal
+agenda
+summar
+outcome
+input
+output
+relat
+variabl
+question
+answer
+evidence
+proof
+prove
+>----------------------------------<
+
+
diff --git a/mostfiles/summary_english_tiny.dat b/mostfiles/summary_english_tiny.dat
@@ -0,0 +1,20 @@
+Extraction-strings for summarizing with Flashread.
+Language: english
+Summary-type: generic
+version: 1.3
+
+
+
+
+SIGNAL-WORDS TO HANDLE
+scenario
+theor
+hypothe
+plan 
+plans 
+goal
+agenda
+>----------------------------------<
+
+
+
-Original file line number
+Diff line change
@@ Expand Up / @@ -384,6 +384,7 @@ geraakt @@
     meldt
     meld
     melden
+    meldd
     gemeld
     slaan
     slaat
@@ Expand Down @@