-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.go
345 lines (307 loc) · 8.95 KB
/
main.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
package main
import (
"bytes"
"fmt"
"io"
"log"
"os"
"os/exec"
"path/filepath"
"regexp"
"strings"
"github.com/PuerkitoBio/goquery"
)
// HKA_SOURCE is the plain-text HKA concordance that maps notebook names to
// systematic aphorism citations (parsed by MapHKA).
const HKA_SOURCE = "sources/HKA.txt"

// NF_GLOB matches the downloaded Nachlass fragment pages.
const NF_GLOB = "sources/html/NF-*.html"

// WERKE_GLOB matches the downloaded published-works pages.
const WERKE_GLOB = "sources/html/W-*.html"

// Mock fixtures — presumably for local testing; not referenced in this file
// (TODO confirm against the rest of the package).
const MOCK_NF = "sources/mock/NF-1885,39.html"
const MOCK_WERKE = "sources/mock/WA.html"
// we expect later that all stored strings are already html.
//
// Entry is one aphorism/text block extracted from an eKGWB page.
type Entry struct {
	h2   string // heading markup built from the block's anchor, e.g. "<h2>NF-1885,39[1]</h2>"
	html string // the block's inner HTML after anchors/h2/h3 have been stripped
}
// represents a complete eKGW grouping as downloaded from nietzschesource.org
type eKGWDoc struct {
	h1      string  // title as an HTML fragment: div.titel contents, or an <h1> built from p.Gruppe
	entries []Entry // one Entry per div.txt_block, in document order
}
// ParseWithGoquery extracts the title (h1) and the per-aphorism text blocks
// from a downloaded eKGWB page.
//
// The cleanup selections mutate doc in place (tooltips, anchors and headers
// are removed as a side effect), so doc should not be reused afterwards.
func ParseWithGoquery(doc *goquery.Document) eKGWDoc {
	var ekgw eKGWDoc
	// clean it first
	doc.Find("div.tooltip").Each(func(i int, s *goquery.Selection) {
		s.Remove()
	})
	doc.Find("div.head").Each(func(i int, s *goquery.Selection) {
		// leaving h2 as it appears in the div.titel
		s.Find("h2").Remove()
	})
	// NOTE(review): span.bold is mapped to <em> and span.bolditalic to <b>,
	// which looks swapped at first sight — presumably deliberate for the
	// downstream markdown rendering (Sperrung shown as emphasis); confirm.
	doc.Find("span.bold").Each(func(i int, s *goquery.Selection) {
		s.ReplaceWithHtml("<em>" + s.Text() + "</em>")
	})
	doc.Find("span.bolditalic").Each(func(i int, s *goquery.Selection) {
		s.ReplaceWithHtml("<b>" + s.Text() + "</b>")
	})
	// TODO: replace with <ul> ?
	// tables are flattened to the div.p paragraphs they contain:
	doc.Find("table").Each(func(i int, s *goquery.Selection) {
		s.ReplaceWithSelection(s.Find("div.p"))
	})
	// h1
	// in the case of published works:
	title := doc.Find("div.titel")
	// remove anchors
	title.Find("a").Each(func(i int, s *goquery.Selection) {
		s.Remove()
	})
	// we get the whole block, since it might contain h2s
	title_html, err := doc.Find("div.titel").Html()
	if err != nil {
		panic(err)
	}
	// or the Nachlass: the last p.Gruppe holds the notebook name
	p := doc.Find("p.Gruppe").Last().Text()
	if title_html != "" {
		ekgw.h1 = title_html
	} else {
		ekgw.h1 = "<h1>" + p + "</h1>"
	}
	// entries : h2 and html
	doc.Find("div.txt_block").Each(func(i int, s *goquery.Selection) {
		var e Entry
		// the first anchor seems to be the most reliable place to get the h2:
		// .Attr() will just get the value from the first element:
		id, ok := s.Find("a[name]").Attr("name")
		// skip blocks without an anchor and the group ("Gruppe") headers:
		if !ok || strings.Contains(id, "Gruppe") {
			return
		}
		e.h2 = "<h2>" + id + "</h2>"
		// remove all extraneous stuff now that we have what we want:
		s.Find("a").Each(func(i int, s *goquery.Selection) {
			s.Remove()
		})
		s.Find("h2").Each(func(i int, s *goquery.Selection) {
			s.Remove()
		})
		s.Find("h3").Each(func(i int, s *goquery.Selection) {
			s.Remove()
		})
		// then get the rest:
		inner, err := s.Html()
		if err != nil {
			panic(err)
		}
		e.html = inner
		ekgw.entries = append(ekgw.entries, e)
	})
	return ekgw
}
// Render serializes an eKGWDoc back into a single HTML string: the h1 block,
// then each entry's h2 followed by its body, each heading on its own line.
func Render(ekgw eKGWDoc) string {
	var b strings.Builder
	b.WriteString(fmt.Sprintln(ekgw.h1))
	for _, entry := range ekgw.entries {
		b.WriteString(fmt.Sprintln(entry.h2))
		b.WriteString(entry.html)
	}
	return b.String()
}
// PreCleanupHtml drops every "<" and ">" sequence from the raw page bytes
// before parsing.
// NOTE(review): removing literal angle brackets would flatten real markup;
// presumably these were HTML entities in the upstream source — confirm.
func PreCleanupHtml(content []byte) []byte {
	withoutOpen := bytes.ReplaceAll(content, []byte("<"), nil)
	return bytes.ReplaceAll(withoutOpen, []byte(">"), nil)
}
// CleanupMd post-processes pandoc's markdown output: strips the backslash
// escapes pandoc inserts, turns escaped "#eKGWB" citations back into plain
// "eKGWB", replaces non-breaking spaces with plain spaces, and collapses
// runs of blank lines.
func CleanupMd(content string) (out string) {
	out = strings.ReplaceAll(content, `\`, "")
	out = strings.ReplaceAll(out, `#eKGWB`, "eKGWB")
	// non-breaking space (U+00A0) -> plain space:
	out = strings.ReplaceAll(out, "\u00a0", " ")
	// left behind by empty divs and whatnot. Loop until stable: a single
	// pass only halves longer runs (e.g. 8 newlines -> 4 and stop).
	for strings.Contains(out, "\n\n\n\n") {
		out = strings.ReplaceAll(out, "\n\n\n\n", "\n\n")
	}
	return out
}
// RunPandoc converts the given HTML string to markdown by piping it through
// an external pandoc process. It panics if pandoc cannot be run or exits
// non-zero.
func RunPandoc(content string) string {
	cmd := exec.Command("pandoc", "--wrap=none", "--from=html-native_divs-native_spans", "--to=markdown-smart")
	// https://pkg.go.dev/os/exec#Cmd.StdoutPipe
	stdin, err := cmd.StdinPipe()
	if err != nil {
		panic(err)
	}
	// pipe it in:
	go func() {
		defer stdin.Close()
		// write error is best-effort: a pandoc failure surfaces via Output below
		io.WriteString(stdin, content)
	}()
	// Keep pandoc's warnings on our stderr. The previous CombinedOutput
	// merged stderr into the returned markdown, corrupting the output
	// whenever pandoc emitted a warning.
	cmd.Stderr = os.Stderr
	// get stdout only:
	out, err := cmd.Output()
	if err != nil {
		panic(err)
	}
	return string(out)
}
// Creates a map from a notebook name to a set of aphorism citations.
//
// The somewhat odd and arbitrary notebook naming, which might date back to the original Nietzsche
// archive, is the only reliable link between the HKA and the eKGWB. Here we're trying to create a
// map from those names to the aphorism citations of the HKA, which are systematic and consistent,
// so that the eKGWB mappings, which are consistent but incomplete and largely useless, can be
// replaced.
//
// Keys are normalized via TrimBookName; values are the raw "Aphorism ..."
// lines in file order. Panics if HKA_SOURCE cannot be read.
func MapHKA() map[string][]string {
	books := map[string][]string{}
	// book headers look like:
	//   [ 30 = Z II 5, 83. Z II 7b. Z II 6b. Herbst 1884 — Anfang 1885 ]
	//   [ 31 = Z II 8. Winter 1884 — 85 ]
	// MustCompile instead of Compile with a discarded error: a bad pattern
	// is a programmer bug and must not fail silently.
	book_rx := regexp.MustCompile(`(?m)^\[(.+)\]$`)
	// aphorism lines look like:
	//   Aphorism n=9963 id='VII.31[1]' kgw='VII-3.71' ksa='11.359'
	aphorism_rx := regexp.MustCompile(`(?m)^Aphorism .* kgw='.*' ksa='.*'$`)
	dat, err := os.ReadFile(HKA_SOURCE)
	if err != nil {
		panic(err)
	}
	s := string(dat)
	res := book_rx.FindAllStringIndex(s, -1)
	for j, indices := range res {
		// current match
		book := TrimBookName(s[indices[0]:indices[1]])
		// slice forward to the next book header (or EOF) to collect this
		// book's aphorism lines:
		end := len(s)
		if j+1 < len(res) {
			end = res[j+1][0]
		}
		sub := s[indices[1]:end]
		books[book] = append(books[book], aphorism_rx.FindAllString(sub, -1)...)
	}
	return books
}
// Transforms both the HKA and eKGWB book listings into a minimal string, ignoring:
// wrapping [], all whitespace, and case. Cuts at first "."
//
// so this:
//
//	[ 31 = Z II 8. Winter 1884 — 85 ]
//
// becomes:
//
//	31=zii8
func TrimBookName(book string) string {
	// 31 = Z II 8. Winter 1884 — 85
	book = strings.Trim(book, "[] ")
	// up to the first period:
	// 31 = Z II 8
	book, _, _ = strings.Cut(book, ".")
	// collapse ALL whitespace (tabs, NBSP, ...), matching the doc comment;
	// the previous ReplaceAll only removed ASCII spaces:
	book = strings.Join(strings.Fields(book), "")
	return strings.ToLower(book)
}
// Find the unique id mapping the eKGWB to the HKA by progressively shortening the key. See MapHKA()
//
// Returns (nil, false) when no key in books matches, exactly or shortened.
//
// TODO: manually fix the outliers. eKGWB differs in about 20 books. Unless I can fuzzy match.
// https://github.com/lithammer/fuzzysearch
func FindAphorisms(books map[string][]string, book string) (aphs []string, found bool) {
	origbook := book
	book = TrimBookName(book)
	short_book := book
	// exact match first:
	aphs, ok := books[book]
	// then chop i characters off both the book and every key until
	// something lines up (never below 3 characters, to avoid junk matches):
	for i := 1; !ok && len(short_book) > 2; i++ {
		short_book = book[:len(book)-i]
		for key := range books { // idiomatic: drop the unused value
			j := len(key) - i
			if j < 2 {
				continue
			}
			short_key := key[:j]
			if short_book == short_key {
				aphs, ok = books[key]
				log.Println("INFO: found book by shortening:", short_key, "original:", origbook)
				break
			}
		}
	}
	if !ok {
		log.Println("WARN: didn't find the book within the books map:", origbook)
		return nil, false
	}
	return aphs, true
}
// takes the markdown rendered string and replaces the bullshit eKGWB citations with the proper KGW
// numbers mapped from the HKA.
//
// book_rx must capture the book name in group 1; aphorism_rx matches the
// "## eKGWB/..." headers. The input is returned unchanged when the book
// cannot be located in books.
func AnnotateKGW(markdown string, books map[string][]string, book_rx *regexp.Regexp, aphorism_rx *regexp.Regexp) string {
	book_match := book_rx.FindStringSubmatch(markdown)
	if len(book_match) < 2 { // covers nil too: len(nil) == 0
		// bound the preview: markdown[:10] panicked on inputs < 10 bytes
		preview := markdown
		if len(preview) > 10 {
			preview = preview[:10]
		}
		log.Println("WARN: didn't find the book within the markdown", preview)
		return markdown
	}
	// get the submatch only:
	book := book_match[1]
	aphs, ok := FindAphorisms(books, book)
	if !ok {
		return markdown
	}
	out := markdown
	h2s := aphorism_rx.FindAllString(markdown, -1)
	for i, header := range h2s {
		// the aphorism number follows the comma: "## eKGWB/NF-1888,15[1]"
		_, number, ok := strings.Cut(header, ",")
		if !ok {
			log.Println("WARN: didn't find the header number in the header", header)
			continue
		}
		// NOTE: happens when the eKGWB combines multiple books, eg NF-1884,28.html:
		if i >= len(aphs) {
			log.Printf("WARN: more eKGW h2 headers: %v found than HKA headers: %v. %v", len(h2s), len(aphs), book)
			break
		}
		// NOTE: the index here is assumed to match the []string from the map:
		if strings.Contains(aphs[i], number) {
			idx := strings.Index(out, header)
			if idx < 0 {
				// header no longer present in out (shouldn't happen);
				// skip instead of slicing with a negative index
				continue
			}
			// '## '
			// 012
			j := idx + 3
			aph := strings.TrimPrefix(aphs[i], "Aphorism ")
			// NOTE: j+len(header)-3 : effectively removes the eKGWB header
			// NOTE: we're not building back the markdown string, but interpolating:
			out = out[:j] + aph + out[j+len(header)-3:]
		}
	}
	return out
}
// ProcessGlob converts every HTML file matching glob into annotated markdown
// files in outdir (same base name, ".md" extension).
//
// Pipeline per file: raw bytes -> PreCleanupHtml -> goquery ->
// ParseWithGoquery -> Render -> pandoc -> CleanupMd -> AnnotateKGW.
// Panics on any I/O or parse failure.
func ProcessGlob(glob string, outdir string) {
	books := MapHKA()
	// markdown book header: "# [15 = W II 6a. Frühjahr 1888]"
	md_book_rx := regexp.MustCompile(`(?m)^# \[(.+)\]$`)
	// markdown aphorism header: "## eKGWB/NF-1888,15[1]"
	// not:
	// "## eKGWB/NF-1888,15[Titel]"
	md_aphorism_rx := regexp.MustCompile(`(?m)^## eKGWB/.*,[0-9]+\[[0-9]+\]$`)
	files, err := filepath.Glob(glob)
	if err != nil {
		panic(err)
	}
	for _, f := range files {
		dat, err := os.ReadFile(f)
		if err != nil {
			panic(err)
		}
		log.Println("INFO: processing", f)
		dat = PreCleanupHtml(dat)
		doc, err := goquery.NewDocumentFromReader(bytes.NewReader(dat))
		if err != nil {
			panic(err)
		}
		ekgw := ParseWithGoquery(doc)
		md := RunPandoc(Render(ekgw))
		md = CleanupMd(md)
		md = AnnotateKGW(md, books, md_book_rx, md_aphorism_rx)
		mdname := filepath.Join(outdir, strings.TrimSuffix(filepath.Base(f), filepath.Ext(f))+".md")
		// os.WriteFile replaces the previous Create + deferred Close:
		// the defer-in-loop kept every output file open until the function
		// returned, and the Close error was never checked.
		if err := os.WriteFile(mdname, []byte(md), 0666); err != nil {
			panic(err)
		}
		log.Println("INFO: wrote", mdname)
	}
}
// main converts the Nachlass fragments and then the published works into
// markdown under ./output/.
func main() {
	// no timestamp prefix on log lines
	// (equivalent to LstdFlags ^ Ldate ^ Ltime, which is 0)
	log.SetFlags(0)
	ProcessGlob(NF_GLOB, "./output/")
	ProcessGlob(WERKE_GLOB, "./output/")
}