From 194de9b98c39158e4412a6c63a06b763c01db161 Mon Sep 17 00:00:00 2001 From: Ariel Flesler Date: Thu, 6 Jul 2023 18:44:29 -0300 Subject: [PATCH] Document a few details about the final dataset --- README.md | 13 ++++++++++--- docs/index.html | 32 ++++++++++++++++---------------- parse.ts | 10 +++++----- 3 files changed, 31 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index 248d956..8216e03 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ In addition, the final JSON includes several questions generated by GPT-4, they I aimed to keep the original dataset intact, so all modifications are recorded in [inputs/overrides.json](inputs/overrides.json) and even those don't change the original text, just re-arrange it. It still needs some more cleaning up but I won't be doing that in the short-term. -You can view the dataset here: https://flesler.github.io/satoshi-data/ +You can view the dataset as JSON [here](docs/qa.json) and visually here: https://flesler.github.io/satoshi-data/ ## Setup @@ -20,14 +20,21 @@ You can view the dataset here: https://flesler.github.io/satoshi-data/ npm install ``` -## Regenerate the JSON file +### Regenerate the JSON file ``` npm run start ``` -## Preview the JSON file locally +### Preview the JSON file locally ``` npm run serve ``` + +## More about the dataset + +The output is a JSON file, with an array of objects. They are sorted chronologically. Many have a `type` which is one of: +1. gpt-4: Those generated with GPT-4 +1. favorite: The ones I personally liked the most +1. ignore: The ones I personally didn't find useful for my particular needs diff --git a/docs/index.html b/docs/index.html index 8b4381d..22bb741 100644 --- a/docs/index.html +++ b/docs/index.html @@ -1,20 +1,20 @@ - - Q&As of Satoshi Nakamoto - - - + + Q&As of Satoshi Nakamoto + + + @@ -46,5 +46,5 @@ document.body.className = check.checked ? '' : 'hide-ignored' } - + \ No newline at end of file diff --git a/parse.ts b/parse.ts index 42fcbaa..15d7b79 100644 --- a/parse.ts +++ b/parse.ts @@ -48,7 +48,7 @@ const splitEmail = (email: string | undefined) => { .replace(/> >/g, '>>') .split('\n') .map(l => l.trim()) - + const signature = lines.indexOf('---------------------------------------------------------------------') if (signature !== -1) { let len = signature - 1 @@ -114,7 +114,7 @@ const parseEmails = () => { } } } - + return out } @@ -167,7 +167,7 @@ const parsePosts = () => { delete post.satoshi_id continue } - + const [first, ...parts] = overrides[post.url]?.parts || splitPost(post.content) if (first) { // Should be using nested_level for some, but seems like Satoshi replied without nesting correctly (?) @@ -189,14 +189,14 @@ const parsePosts = () => { } } } - + return out } const qas = parsePosts().concat(parseEmails()) .map((qa) => ({ type: overrides[qa.src]?.type, ...qa, - date: new Date(qa.date + ' UTC').toISOString().split('.')[0].replace('T', ' '), + date: new Date(qa.date + ' UTC').toISOString().split('.')[0].replace('T', ' '), })) .sort((a, b) => a.date > b.date ? 1 : a.date < b.date ? -1 : 0) .map((qa, i) => ({ id: i + 1, ...qa }))