From f45f6cc3c4f933dba6e649f49cdb14a40dcf333f Mon Sep 17 00:00:00 2001 From: Dirk Roorda Date: Thu, 9 Dec 2021 16:09:34 +0100 Subject: [PATCH] better metadata --- programs/parallels.ipynb | 493 +++++++++++++++------------------------ programs/parallels.py | 215 ++++++++--------- tf/2021/crossref.tf | 7 +- tf/2021/crossrefLCS.tf | 7 +- tf/2021/crossrefSET.tf | 7 +- yaml/generic.yaml | 3 + yaml/parallels.yaml | 7 + 7 files changed, 316 insertions(+), 423 deletions(-) create mode 100644 yaml/generic.yaml create mode 100644 yaml/parallels.yaml diff --git a/programs/parallels.ipynb b/programs/parallels.ipynb index ec59d84..23ef3fb 100644 --- a/programs/parallels.ipynb +++ b/programs/parallels.ipynb @@ -1,15 +1,5 @@ { "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "lines_to_next_cell": 0 - }, - "outputs": [], - "source": [] - }, { "cell_type": "markdown", "metadata": {}, @@ -140,22 +130,18 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "tags": [] + }, "source": [ "# Caveat\n", "\n", "This notebook makes use of a new feature of text-fabric, first present in 2.3.15.\n", "Make sure to upgrade first.\n", "\n", - "```sudo -H pip3 install --upgrade text-fabric" - ] - }, - { - "cell_type": "markdown", - "id": "aggressive-husband", - "metadata": {}, - "source": [ - "In[1]:" + "```\n", + "sudo -H pip3 install --upgrade text-fabric\n", + "```" ] }, { @@ -171,14 +157,15 @@ "import pickle\n", "import math\n", "import difflib\n", + "import yaml\n", "from difflib import SequenceMatcher\n", "from IPython.display import HTML\n", - "import matplotlib.pyplot as plt" + "import matplotlib.pyplot as plt\n", + "from tf.core.helpers import formatMeta" ] }, { "cell_type": "markdown", - "id": "handled-catalyst", "metadata": {}, "source": [ "pip3 install python-Levenshtein" @@ -186,8 +173,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "fabulous-cookie", + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -196,7 +182,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": { "lines_to_next_cell": 2 }, @@ -208,8 +194,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "african-mortality", + "execution_count": 4, "metadata": { "lines_to_next_cell": 2 }, @@ -222,7 +207,6 @@ }, { "cell_type": "markdown", - "id": "acknowledged-necklace", "metadata": { "lines_to_next_cell": 2 }, @@ -232,8 +216,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "democratic-invite", + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -254,7 +237,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 6, "metadata": { "lines_to_next_cell": 2 }, @@ -267,7 +250,6 @@ }, { "cell_type": "markdown", - "id": "speaking-timber", "metadata": { "lines_to_next_cell": 2 }, @@ -277,7 +259,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 7, "metadata": { "lines_to_next_cell": 2 }, @@ -574,8 +556,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "rubber-electronics", + "execution_count": 8, "metadata": { "lines_to_next_cell": 2 }, @@ -596,7 +577,6 @@ }, { "cell_type": "markdown", - "id": "lyric-seventh", "metadata": { "lines_to_next_cell": 2 }, @@ -606,8 +586,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "romance-polymer", + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -618,8 +597,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "freelance-dream", + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -628,8 +606,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "fundamental-musician", + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -640,7 +617,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 12, "metadata": { "lines_to_next_cell": 2 }, @@ -652,7 +629,6 @@ }, { "cell_type": "markdown", - "id": "architectural-helping", "metadata": { "lines_to_next_cell": 2 }, @@ -662,7 +638,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 13, "metadata": { "lines_to_next_cell": 2 }, @@ -685,7 +661,6 @@ }, { "cell_type": "markdown", - "id": "julian-demonstration", "metadata": { "lines_to_next_cell": 2 }, @@ -695,7 +670,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 14, "metadata": { "lines_to_next_cell": 2 }, @@ -722,7 +697,6 @@ }, { "cell_type": "markdown", - "id": "italian-worcester", "metadata": { "lines_to_next_cell": 2 }, @@ -732,7 +706,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 15, "metadata": { "lines_to_next_cell": 2 }, @@ -744,7 +718,7 @@ "..............................................................................................\n", ". 0.00s Load the existing TF dataset .\n", "..............................................................................................\n", - "This is Text-Fabric 8.5.13\n", + "This is Text-Fabric 9.1.7\n", "Api reference : https://annotation.github.io/text-fabric/tf/cheatsheet.html\n", "\n", "114 features found and 0 ignored\n" @@ -758,7 +732,6 @@ }, { "cell_type": "markdown", - "id": "worthy-alarm", "metadata": { "lines_to_next_cell": 2 }, @@ -768,7 +741,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 16, "metadata": { "lines_to_next_cell": 2 }, @@ -779,7 +752,7 @@ "text": [ " 0.00s loading features ...\n", " | 0.00s Dataset without structure sections in otext:no structure functions in the T-API\n", - " 3.65s All features loaded/computed - for details use loadLog()\n" + " 11s All features loaded/computed - for details use TF.isLoaded()\n" ] }, { @@ -799,7 +772,7 @@ " ('Text', 'text', ('T Text',))]" ] }, - "execution_count": 9, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -832,7 +805,6 @@ }, { "cell_type": "markdown", - "id": "electronic-company", "metadata": { "lines_to_next_cell": 2 }, @@ -842,8 +814,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "compatible-restaurant", + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -856,8 +827,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "thrown-boston", + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -868,8 +838,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "acknowledged-skiing", + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -881,8 +850,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "transsexual-ethiopia", + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -896,8 +864,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "civic-ground", + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -915,7 +882,6 @@ }, { "cell_type": "markdown", - "id": "urban-february", "metadata": {}, "source": [ "note that the TF_TABLE and LOCAL_BASE_COMP are deliberately\n", @@ -927,8 +893,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "seventh-expression", + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ @@ -938,8 +903,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "available-wisdom", + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ @@ -953,8 +917,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "extraordinary-account", + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -968,7 +931,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 25, "metadata": { "lines_to_next_cell": 2 }, @@ -1002,7 +965,6 @@ }, { "cell_type": "markdown", - "id": "smart-whale", "metadata": { "lines_to_next_cell": 2 }, @@ -1012,8 +974,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "advanced-nudist", + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -1044,8 +1005,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "precious-appeal", + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ @@ -1068,8 +1028,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "photographic-virginia", + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ @@ -1103,8 +1062,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "under-reminder", + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ @@ -1171,7 +1129,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 30, "metadata": { "lines_to_next_cell": 2 }, @@ -1193,7 +1151,6 @@ }, { "cell_type": "markdown", - "id": "whole-grill", "metadata": { "lines_to_next_cell": 2 }, @@ -1203,7 +1160,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 31, "metadata": { "lines_to_next_cell": 2 }, @@ -1321,7 +1278,6 @@ }, { "cell_type": "markdown", - "id": "solved-category", "metadata": { "lines_to_next_cell": 2 }, @@ -1331,7 +1287,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 32, "metadata": { "lines_to_next_cell": 2 }, @@ -1396,7 +1352,6 @@ }, { "cell_type": "markdown", - "id": "skilled-climb", "metadata": { "lines_to_next_cell": 2 }, @@ -1406,8 +1361,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "sudden-mobile", + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ @@ -1433,7 +1387,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 34, "metadata": { "lines_to_next_cell": 2 }, @@ -1620,7 +1574,6 @@ }, { "cell_type": "markdown", - "id": "modified-asian", "metadata": { "lines_to_next_cell": 2 }, @@ -1630,8 +1583,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "fiscal-polish", + "execution_count": 35, "metadata": {}, "outputs": [], "source": [ @@ -1648,8 +1600,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "touched-smile", + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ @@ -1677,8 +1628,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "polish-compilation", + "execution_count": 37, "metadata": {}, "outputs": [], "source": [ @@ -1698,8 +1648,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "intermediate-reducing", + "execution_count": 38, "metadata": {}, "outputs": [], "source": [ @@ -1743,7 +1692,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 39, "metadata": { "lines_to_next_cell": 2 }, @@ -1909,7 +1858,6 @@ }, { "cell_type": "markdown", - "id": "nervous-plane", "metadata": { "lines_to_next_cell": 2 }, @@ -1919,8 +1867,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "original-terrorism", + "execution_count": 40, "metadata": {}, "outputs": [], "source": [ @@ -1987,8 +1934,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "equivalent-beach", + "execution_count": 41, "metadata": {}, "outputs": [], "source": [ @@ -2017,8 +1963,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "manufactured-organizer", + "execution_count": 42, "metadata": {}, "outputs": [], "source": [ @@ -2044,7 +1989,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 43, "metadata": { "lines_to_next_cell": 2 }, @@ -2073,7 +2018,6 @@ }, { "cell_type": "markdown", - "id": "transsexual-poker", "metadata": { "lines_to_next_cell": 2 }, @@ -2083,8 +2027,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "rational-brazilian", + "execution_count": 44, "metadata": {}, "outputs": [], "source": [ @@ -2098,8 +2041,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "enhanced-confusion", + "execution_count": 45, "metadata": {}, "outputs": [], "source": [ @@ -2109,8 +2051,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "smoking-short", + "execution_count": 46, "metadata": {}, "outputs": [], "source": [ @@ -2125,8 +2066,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "balanced-slave", + "execution_count": 47, "metadata": {}, "outputs": [], "source": [ @@ -2136,8 +2076,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "former-oakland", + "execution_count": 48, "metadata": {}, "outputs": [], "source": [ @@ -2157,8 +2096,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "imposed-montgomery", + "execution_count": 49, "metadata": {}, "outputs": [], "source": [ @@ -2170,8 +2108,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "bibliographic-reproduction", + "execution_count": 50, "metadata": {}, "outputs": [], "source": [ @@ -2199,8 +2136,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "empirical-forwarding", + "execution_count": 51, "metadata": {}, "outputs": [], "source": [ @@ -2225,8 +2161,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "charged-folks", + "execution_count": 52, "metadata": {}, "outputs": [], "source": [ @@ -2253,8 +2188,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "furnished-times", + "execution_count": 53, "metadata": {}, "outputs": [], "source": [ @@ -2269,8 +2203,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "grand-brick", + "execution_count": 54, "metadata": {}, "outputs": [], "source": [ @@ -2284,8 +2217,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "desperate-quick", + "execution_count": 55, "metadata": {}, "outputs": [], "source": [ @@ -2322,8 +2254,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "raised-freedom", + "execution_count": 56, "metadata": {}, "outputs": [], "source": [ @@ -2352,8 +2283,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "angry-nevada", + "execution_count": 57, "metadata": {}, "outputs": [], "source": [ @@ -2367,8 +2297,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "beneficial-technician", + "execution_count": 58, "metadata": {}, "outputs": [], "source": [ @@ -2392,8 +2321,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "linear-reach", + "execution_count": 59, "metadata": {}, "outputs": [], "source": [ @@ -2423,8 +2351,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "demanding-revelation", + "execution_count": 60, "metadata": {}, "outputs": [], "source": [ @@ -2442,7 +2369,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 61, "metadata": { "lines_to_next_cell": 2 }, @@ -2473,7 +2400,6 @@ }, { "cell_type": "markdown", - "id": "related-imaging", "metadata": { "lines_to_next_cell": 2 }, @@ -2483,7 +2409,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 62, "metadata": { "lines_to_next_cell": 2 }, @@ -2616,7 +2542,6 @@ }, { "cell_type": "markdown", - "id": "chinese-painting", "metadata": { "lines_to_next_cell": 2 }, @@ -2626,8 +2551,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "virgin-rider", + "execution_count": 63, "metadata": {}, "outputs": [], "source": [ @@ -2647,7 +2571,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 64, "metadata": { "lines_to_next_cell": 2 }, @@ -2929,7 +2853,6 @@ }, { "cell_type": "markdown", - "id": "coral-worthy", "metadata": { "lines_to_next_cell": 2 }, @@ -2939,8 +2862,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "preceding-occasions", + "execution_count": 65, "metadata": {}, "outputs": [], "source": [ @@ -2949,8 +2871,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "welcome-flood", + "execution_count": 66, "metadata": {}, "outputs": [], "source": [ @@ -2962,8 +2883,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "optional-authentication", + "execution_count": 67, "metadata": {}, "outputs": [], "source": [ @@ -2978,8 +2898,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "discrete-ribbon", + "execution_count": 68, "metadata": {}, "outputs": [], "source": [ @@ -3003,8 +2922,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "bright-marijuana", + "execution_count": 69, "metadata": {}, "outputs": [], "source": [ @@ -3015,8 +2933,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "blind-termination", + "execution_count": 70, "metadata": {}, "outputs": [], "source": [ @@ -3031,8 +2948,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "coupled-affair", + "execution_count": 71, "metadata": {}, "outputs": [], "source": [ @@ -3055,8 +2971,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "dressed-field", + "execution_count": 72, "metadata": {}, "outputs": [], "source": [ @@ -3074,7 +2989,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 73, "metadata": { "lines_to_next_cell": 2 }, @@ -3170,7 +3085,6 @@ }, { "cell_type": "markdown", - "id": "affected-operator", "metadata": { "lines_to_next_cell": 2 }, @@ -3180,8 +3094,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "sought-heaven", + "execution_count": 74, "metadata": {}, "outputs": [], "source": [ @@ -3193,8 +3106,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "marked-hygiene", + "execution_count": 75, "metadata": {}, "outputs": [], "source": [ @@ -3255,7 +3167,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 76, "metadata": { "lines_to_next_cell": 2 }, @@ -3298,7 +3210,6 @@ }, { "cell_type": "markdown", - "id": "opening-louisiana", "metadata": { "lines_to_next_cell": 2 }, @@ -3308,20 +3219,36 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "ruled-polyester", + "execution_count": 77, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "..............................................................................................\n", + ". 13s CROSSREFS: Fetching crossrefs .\n", + "..............................................................................................\n" + ] + } + ], "source": [ "utils.caption(4, \"CROSSREFS: Fetching crossrefs\")" ] }, { "cell_type": "code", - "execution_count": null, - "id": "inside-visiting", + "execution_count": 78, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "| 13s \tReading existing /Users/werk/github/etcbc/parallels/_temp/parallelTable.tsv\n" + ] + } + ], "source": [ "xTable = os.path.exists(TF_TABLE)\n", "if FORCE_MATRIX:\n", @@ -3341,7 +3268,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 79, "metadata": { "lines_to_next_cell": 2 }, @@ -3350,12 +3277,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "..............................................................................................\n", - ". 27s CROSSREFS: Fetching crossrefs .\n", - "..............................................................................................\n", - "| 27s \tReading existing /Users/dirk/github/etcbc/parallels/_temp/parallelTable.tsv\n", - "| 27s \t\tINFO: 3783 verse nodes have been changed between versions\n", - "| 27s \t\tINFO: We will save and use the recomputed ones\n" + "| 13s \t\tINFO: All verse nodes are the same as in the previous version\n" ] } ], @@ -3368,7 +3290,6 @@ }, { "cell_type": "markdown", - "id": "historical-mileage", "metadata": { "lines_to_next_cell": 2 }, @@ -3378,7 +3299,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 80, "metadata": { "lines_to_next_cell": 2 }, @@ -3387,26 +3308,26 @@ "name": "stdout", "output_type": "stream", "text": [ - "('LCS', 1414375, 1414381, 84, 'Genesis', 1, 13, 'Genesis', 1, 19)\n", - "('LCS', 1414375, 1414385, 89, 'Genesis', 1, 13, 'Genesis', 1, 23)\n", - "('LCS', 1414377, 1414379, 77, 'Genesis', 1, 15, 'Genesis', 1, 17)\n", - "('LCS', 1414381, 1414385, 84, 'Genesis', 1, 19, 'Genesis', 1, 23)\n", - "('LCS', 1414472, 1414475, 79, 'Genesis', 5, 4, 'Genesis', 5, 7)\n", - "('LCS', 1414472, 1414481, 75, 'Genesis', 5, 4, 'Genesis', 5, 13)\n", - "('LCS', 1414472, 1414484, 78, 'Genesis', 5, 4, 'Genesis', 5, 16)\n", - "('LCS', 1414472, 1414487, 86, 'Genesis', 5, 4, 'Genesis', 5, 19)\n", - "('LCS', 1414472, 1414498, 77, 'Genesis', 5, 4, 'Genesis', 5, 30)\n", - "('LCS', 1414472, 1414640, 79, 'Genesis', 5, 4, 'Genesis', 11, 11)\n", - "('SET', 1414479, 1414597, 80, 'Genesis', 5, 11, 'Genesis', 9, 29)\n", - "('SET', 1414484, 1414487, 77, 'Genesis', 5, 16, 'Genesis', 5, 19)\n", - "('SET', 1414599, 1435815, 100, 'Genesis', 10, 2, '1_Chronicles', 1, 5)\n", - "('SET', 1414603, 1435818, 100, 'Genesis', 10, 6, '1_Chronicles', 1, 8)\n", - "('SET', 1414604, 1435819, 100, 'Genesis', 10, 7, '1_Chronicles', 1, 9)\n", - "('SET', 1414605, 1435820, 100, 'Genesis', 10, 8, '1_Chronicles', 1, 10)\n", - "('SET', 1414610, 1435821, 100, 'Genesis', 10, 13, '1_Chronicles', 1, 11)\n", - "('SET', 1414611, 1435822, 100, 'Genesis', 10, 14, '1_Chronicles', 1, 12)\n", - "('SET', 1414612, 1435823, 100, 'Genesis', 10, 15, '1_Chronicles', 1, 13)\n", - "('SET', 1414613, 1414744, 83, 'Genesis', 10, 16, 'Genesis', 15, 21)\n" + "('LCS', 1414401, 1414407, 84, 'Genesis', 1, 13, 'Genesis', 1, 19)\n", + "('LCS', 1414401, 1414411, 89, 'Genesis', 1, 13, 'Genesis', 1, 23)\n", + "('LCS', 1414403, 1414405, 77, 'Genesis', 1, 15, 'Genesis', 1, 17)\n", + "('LCS', 1414407, 1414411, 84, 'Genesis', 1, 19, 'Genesis', 1, 23)\n", + "('LCS', 1414498, 1414501, 79, 'Genesis', 5, 4, 'Genesis', 5, 7)\n", + "('LCS', 1414498, 1414507, 75, 'Genesis', 5, 4, 'Genesis', 5, 13)\n", + "('LCS', 1414498, 1414510, 78, 'Genesis', 5, 4, 'Genesis', 5, 16)\n", + "('LCS', 1414498, 1414513, 86, 'Genesis', 5, 4, 'Genesis', 5, 19)\n", + "('LCS', 1414498, 1414524, 77, 'Genesis', 5, 4, 'Genesis', 5, 30)\n", + "('LCS', 1414498, 1414666, 79, 'Genesis', 5, 4, 'Genesis', 11, 11)\n", + "('SET', 1414505, 1414623, 80, 'Genesis', 5, 11, 'Genesis', 9, 29)\n", + "('SET', 1414510, 1414513, 77, 'Genesis', 5, 16, 'Genesis', 5, 19)\n", + "('SET', 1414625, 1435841, 100, 'Genesis', 10, 2, '1_Chronicles', 1, 5)\n", + "('SET', 1414629, 1435844, 100, 'Genesis', 10, 6, '1_Chronicles', 1, 8)\n", + "('SET', 1414630, 1435845, 100, 'Genesis', 10, 7, '1_Chronicles', 1, 9)\n", + "('SET', 1414631, 1435846, 100, 'Genesis', 10, 8, '1_Chronicles', 1, 10)\n", + "('SET', 1414636, 1435847, 100, 'Genesis', 10, 13, '1_Chronicles', 1, 11)\n", + "('SET', 1414637, 1435848, 100, 'Genesis', 10, 14, '1_Chronicles', 1, 12)\n", + "('SET', 1414638, 1435849, 100, 'Genesis', 10, 15, '1_Chronicles', 1, 13)\n", + "('SET', 1414639, 1414770, 83, 'Genesis', 10, 16, 'Genesis', 15, 21)\n" ] } ], @@ -3416,20 +3337,9 @@ " print(\"\\n\".join(sorted(repr(sim) for sim in similars if sim[0] == \"SET\")[0:10]))" ] }, - { - "cell_type": "markdown", - "id": "conservative-position", - "metadata": { - "lines_to_next_cell": 2 - }, - "source": [ - "In[29]:" - ] - }, { "cell_type": "code", - "execution_count": null, - "id": "saving-brush", + "execution_count": 81, "metadata": {}, "outputs": [], "source": [ @@ -3439,7 +3349,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 82, "metadata": { "lines_to_next_cell": 2 }, @@ -3466,40 +3376,55 @@ ] }, { - "cell_type": "markdown", - "id": "angry-cornell", - "metadata": { - "lines_to_next_cell": 2 - }, + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "..............................................................................................\n", + ". 6m 16s Writing TF parallel features .\n", + "..............................................................................................\n" + ] + } + ], "source": [ - "In[32]:" + "utils.caption(4, \"Writing TF parallel features\")" ] }, { "cell_type": "code", - "execution_count": null, - "id": "encouraging-infrared", + "execution_count": 90, "metadata": {}, "outputs": [], "source": [ - "utils.caption(4, \"Writing TF parallel features\")" + "newFeatureStr = \"crossref crossrefSET crossrefLCS\"\n", + "newFeatures = newFeatureStr.strip().split()" ] }, { "cell_type": "code", - "execution_count": null, - "id": "wound-cooperation", + "execution_count": 91, "metadata": {}, "outputs": [], "source": [ - "newFeatureStr = \"crossref crossrefSET crossrefLCS\"\n", - "newFeatures = newFeatureStr.strip().split()" + "genericMetaPath = f\"{thisRepo}/yaml/generic.yaml\"\n", + "parallelsMetaPath = f\"{thisRepo}/yaml/parallels.yaml\"\n", + "\n", + "with open(genericMetaPath) as fh:\n", + " genericMeta = yaml.load(fh, Loader=yaml.FullLoader)\n", + " genericMeta[\"version\"] = VERSION\n", + "with open(parallelsMetaPath) as fh:\n", + " parallelsMeta = formatMeta(yaml.load(fh, Loader=yaml.FullLoader))\n", + "\n", + "metaData = {\"\": genericMeta, **parallelsMeta}" ] }, { "cell_type": "code", - "execution_count": null, - "id": "intended-monday", + "execution_count": 92, "metadata": {}, "outputs": [], "source": [ @@ -3511,45 +3436,29 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "corresponding-heavy", + "execution_count": 93, "metadata": {}, "outputs": [], "source": [ - "provenance = dict(\n", - " source=\"Parallels Module\",\n", - " coreData=\"BHSA\",\n", - " coreVersion=VERSION,\n", - " author=\"BHSA Data: Constantijn Sikkel; Parallels Notebook: Dirk Roorda, Martijn Naaijer\",\n", - ")\n", - "metaData = {\"\": provenance}\n", "for newFeature in newFeatures:\n", - " metaData[newFeature] = dict(valueType=\"int\", edgeValues=True)" + " metaData[newFeature][\"valueType\"] = \"int\"\n", + " metaData[newFeature][\"edgeValues\"] = True" ] }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 94, "metadata": { "lines_to_next_cell": 2 }, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "..............................................................................................\n", - ". 8m 54s Writing TF parallel features .\n", - "..............................................................................................\n" - ] - }, { "data": { "text/plain": [ "True" ] }, - "execution_count": 32, + "execution_count": 94, "metadata": {}, "output_type": "execute_result" } @@ -3569,7 +3478,6 @@ }, { "cell_type": "markdown", - "id": "sustainable-ratio", "metadata": { "lines_to_next_cell": 2 }, @@ -3580,7 +3488,6 @@ { "cell_type": "code", "execution_count": null, - "id": "spare-script", "metadata": {}, "outputs": [], "source": [ @@ -3590,7 +3497,6 @@ { "cell_type": "code", "execution_count": null, - "id": "simple-church", "metadata": {}, "outputs": [], "source": [ @@ -3617,7 +3523,6 @@ { "cell_type": "code", "execution_count": null, - "id": "incident-photography", "metadata": {}, "outputs": [], "source": [ @@ -3627,7 +3532,6 @@ { "cell_type": "code", "execution_count": null, - "id": "muslim-antigua", "metadata": {}, "outputs": [], "source": [ @@ -3684,7 +3588,6 @@ }, { "cell_type": "markdown", - "id": "specialized-vampire", "metadata": { "lines_to_next_cell": 2 }, @@ -3695,7 +3598,6 @@ { "cell_type": "code", "execution_count": null, - "id": "cleared-render", "metadata": {}, "outputs": [], "source": [ @@ -3716,7 +3618,6 @@ { "cell_type": "code", "execution_count": null, - "id": "boring-rescue", "metadata": {}, "outputs": [], "source": [ @@ -3726,7 +3627,6 @@ { "cell_type": "code", "execution_count": null, - "id": "fifth-writer", "metadata": {}, "outputs": [], "source": [ @@ -3737,7 +3637,6 @@ { "cell_type": "code", "execution_count": null, - "id": "finished-lebanon", "metadata": {}, "outputs": [], "source": [ @@ -3792,7 +3691,6 @@ }, { "cell_type": "markdown", - "id": "italic-childhood", "metadata": { "lines_to_next_cell": 2 }, @@ -3839,7 +3737,6 @@ }, { "cell_type": "markdown", - "id": "checked-washington", "metadata": { "lines_to_next_cell": 2 }, @@ -3877,7 +3774,6 @@ }, { "cell_type": "markdown", - "id": "graduate-tomato", "metadata": { "lines_to_next_cell": 2 }, @@ -3888,7 +3784,6 @@ { "cell_type": "code", "execution_count": null, - "id": "annual-wesley", "metadata": {}, "outputs": [], "source": [ @@ -3962,7 +3857,6 @@ }, { "cell_type": "markdown", - "id": "extreme-analysis", "metadata": { "lines_to_next_cell": 2 }, @@ -3973,7 +3867,6 @@ { "cell_type": "code", "execution_count": null, - "id": "signed-channels", "metadata": {}, "outputs": [], "source": [ @@ -3983,7 +3876,6 @@ { "cell_type": "code", "execution_count": null, - "id": "personalized-delhi", "metadata": {}, "outputs": [], "source": [ @@ -4164,7 +4056,6 @@ }, { "cell_type": "markdown", - "id": "matched-receipt", "metadata": { "lines_to_next_cell": 2 }, @@ -4177,6 +4068,9 @@ "execution_count": 29, "metadata": { "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, "lines_to_next_cell": 2 }, "outputs": [], @@ -4202,7 +4096,6 @@ }, { "cell_type": "markdown", - "id": "arranged-public", "metadata": { "lines_to_next_cell": 2 }, @@ -4213,7 +4106,6 @@ { "cell_type": "code", "execution_count": null, - "id": "neural-robin", "metadata": {}, "outputs": [], "source": [ @@ -4224,7 +4116,6 @@ { "cell_type": "code", "execution_count": null, - "id": "dominican-hierarchy", "metadata": {}, "outputs": [], "source": [ @@ -4235,7 +4126,6 @@ { "cell_type": "code", "execution_count": null, - "id": "hazardous-regression", "metadata": {}, "outputs": [], "source": [ @@ -4251,7 +4141,6 @@ { "cell_type": "code", "execution_count": null, - "id": "little-margin", "metadata": {}, "outputs": [], "source": [ @@ -4262,7 +4151,6 @@ { "cell_type": "code", "execution_count": null, - "id": "portuguese-cross", "metadata": {}, "outputs": [], "source": [ @@ -4272,7 +4160,6 @@ { "cell_type": "code", "execution_count": null, - "id": "introductory-turkish", "metadata": {}, "outputs": [], "source": [ @@ -4299,7 +4186,6 @@ { "cell_type": "code", "execution_count": null, - "id": "hidden-rachel", "metadata": {}, "outputs": [], "source": [ @@ -4317,7 +4203,6 @@ { "cell_type": "code", "execution_count": null, - "id": "danish-artist", "metadata": {}, "outputs": [], "source": [ @@ -4327,7 +4212,6 @@ { "cell_type": "code", "execution_count": null, - "id": "tight-baseball", "metadata": {}, "outputs": [], "source": [ @@ -4377,7 +4261,6 @@ { "cell_type": "code", "execution_count": null, - "id": "southwest-inspector", "metadata": {}, "outputs": [], "source": [ @@ -4433,7 +4316,6 @@ { "cell_type": "code", "execution_count": null, - "id": "wireless-panama", "metadata": {}, "outputs": [], "source": [ @@ -4470,7 +4352,6 @@ { "cell_type": "code", "execution_count": null, - "id": "featured-sydney", "metadata": {}, "outputs": [], "source": [ @@ -4501,7 +4382,6 @@ { "cell_type": "code", "execution_count": null, - "id": "parallel-benchmark", "metadata": {}, "outputs": [], "source": [ @@ -4532,7 +4412,6 @@ { "cell_type": "code", "execution_count": null, - "id": "familiar-agent", "metadata": {}, "outputs": [], "source": [ @@ -4553,7 +4432,6 @@ { "cell_type": "code", "execution_count": null, - "id": "friendly-strap", "metadata": {}, "outputs": [], "source": [ @@ -4563,7 +4441,6 @@ { "cell_type": "code", "execution_count": null, - "id": "discrete-dimension", "metadata": {}, "outputs": [], "source": [ @@ -4666,6 +4543,9 @@ "execution_count": 30, "metadata": { "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, "lines_to_next_cell": 2 }, "outputs": [], @@ -4696,7 +4576,6 @@ }, { "cell_type": "markdown", - "id": "athletic-begin", "metadata": { "lines_to_next_cell": 2 }, @@ -4715,7 +4594,6 @@ }, { "cell_type": "markdown", - "id": "random-tissue", "metadata": {}, "source": [ "do_experiment(False, 'sentence', 'LCS', 60, False)" @@ -4724,7 +4602,6 @@ { "cell_type": "code", "execution_count": null, - "id": "contrary-proportion", "metadata": {}, "outputs": [], "source": [ @@ -4733,7 +4610,6 @@ }, { "cell_type": "markdown", - "id": "hollywood-avatar", "metadata": { "lines_to_next_cell": 2 }, @@ -4747,7 +4623,6 @@ }, { "cell_type": "markdown", - "id": "infinite-tanzania", "metadata": { "lines_to_next_cell": 2 }, @@ -4760,6 +4635,9 @@ "execution_count": null, "metadata": { "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, "lines_to_next_cell": 2 }, "outputs": [], @@ -4782,7 +4660,6 @@ }, { "cell_type": "markdown", - "id": "figured-daily", "metadata": { "lines_to_next_cell": 2 }, @@ -4793,7 +4670,6 @@ { "cell_type": "code", "execution_count": null, - "id": "quarterly-albert", "metadata": {}, "outputs": [], "source": [ @@ -4808,6 +4684,9 @@ "execution_count": null, "metadata": { "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, "lines_to_next_cell": 2 }, "outputs": [], @@ -4826,7 +4705,6 @@ }, { "cell_type": "markdown", - "id": "infinite-panel", "metadata": { "lines_to_next_cell": 2 }, @@ -4837,7 +4715,6 @@ { "cell_type": "code", "execution_count": null, - "id": "generous-jonathan", "metadata": {}, "outputs": [], "source": [ @@ -4852,6 +4729,9 @@ "execution_count": null, "metadata": { "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, "lines_to_next_cell": 2 }, "outputs": [], @@ -4870,7 +4750,6 @@ }, { "cell_type": "markdown", - "id": "delayed-collection", "metadata": {}, "source": [ "In[ ]:" @@ -4879,7 +4758,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -4893,7 +4772,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.5" + "version": "3.10.0" }, "toc": { "nav_menu": {}, diff --git a/programs/parallels.py b/programs/parallels.py index 417d207..a157471 100644 --- a/programs/parallels.py +++ b/programs/parallels.py @@ -1,12 +1,6 @@ #!/usr/bin/env python # coding: utf-8 -# In[ ]: - - - - - #

Table of Contents

#
@@ -120,9 +114,9 @@ # This notebook makes use of a new feature of text-fabric, first present in 2.3.15. # Make sure to upgrade first. # -# ```sudo -H pip3 install --upgrade text-fabric - -# In[1]: +# ``` +# sudo -H pip3 install --upgrade text-fabric +# ``` # In[1]: @@ -134,27 +128,29 @@ import pickle import math import difflib +import yaml from difflib import SequenceMatcher from IPython.display import HTML import matplotlib.pyplot as plt +from tf.core.helpers import formatMeta # pip3 install python-Levenshtein -# In[ ]: +# In[2]: from Levenshtein import ratio -# In[4]: +# In[3]: import utils from tf.fabric import Fabric -# In[ ]: +# In[4]: get_ipython().run_line_magic("load_ext", "autoreload") # noqa F821 @@ -164,7 +160,7 @@ # In[2]: -# In[ ]: +# In[5]: if "SCRIPT" not in locals(): @@ -182,7 +178,7 @@ VERSION = "2021" -# In[2]: +# In[6]: def stop(good=False): @@ -192,7 +188,7 @@ def stop(good=False): # In[3]: -# In[3]: +# In[7]: # run this cell after all other cells @@ -464,7 +460,7 @@ def stop(good=False): # # The rest is code. From here we fire up the engines and start computing. -# In[ ]: +# In[8]: PICKLE_PROTOCOL = 3 @@ -477,7 +473,7 @@ def stop(good=False): # In[5]: -# In[ ]: +# In[9]: repoBase = os.path.expanduser("~/github/etcbc") @@ -485,13 +481,13 @@ def stop(good=False): thisRepo = "{}/{}".format(repoBase, NAME) -# In[ ]: +# In[10]: coreTf = "{}/tf/{}".format(coreRepo, VERSION) -# In[ ]: +# In[11]: allTemp = "{}/_temp".format(thisRepo) @@ -499,7 +495,7 @@ def stop(good=False): thisTempTf = "{}/tf".format(thisTemp) -# In[5]: +# In[12]: thisTf = "{}/tf/{}".format(thisRepo, VERSION) @@ -508,7 +504,7 @@ def stop(good=False): # In[6]: -# In[6]: +# In[13]: notesFile = "crossrefNotes.csv" @@ -523,7 +519,7 @@ def stop(good=False): # In[7]: -# In[7]: +# In[14]: if SCRIPT: @@ -542,7 +538,7 @@ def stop(good=False): # In[8]: -# In[8]: +# In[15]: utils.caption(4, "Load the existing TF dataset") @@ -551,7 +547,7 @@ def stop(good=False): # In[9]: -# In[9]: +# In[16]: api = TF.load( @@ -576,7 +572,7 @@ def stop(good=False): # In[10]: -# In[ ]: +# In[17]: # chunking @@ -586,7 +582,7 @@ def stop(good=False): CHUNK_OBJECTS = ("chapter", "verse", "half_verse", "sentence") -# In[ ]: +# In[18]: # preparing @@ -594,7 +590,7 @@ def stop(good=False): EXCLUDED_PAT = re.compile(EXCLUDED_CONS) -# In[ ]: +# In[19]: # similarity @@ -603,7 +599,7 @@ def stop(good=False): SIMILARITIES = (100, 95, 90, 85, 80, 75, 70, 65, 60, 55, 50, 45, 40, 35, 30) -# In[ ]: +# In[20]: # printing @@ -614,7 +610,7 @@ def stop(good=False): CLIQUES_PER_FILE = 50 -# In[ ]: +# In[21]: # assessing results @@ -635,14 +631,14 @@ def stop(good=False): # Here the results of expensive calculations are stored, # to be used by all versions -# In[ ]: +# In[22]: # crossrefs for TF TF_TABLE = "{}/parallelTable.tsv".format(allTemp) -# In[ ]: +# In[23]: # crossrefs for SHEBANQ @@ -653,7 +649,7 @@ def stop(good=False): CROSSREF_KEYWORD = "crossref" -# In[ ]: +# In[24]: # progress indication @@ -664,7 +660,7 @@ def stop(good=False): CLIQUES_PROGRESS = 1 * KILO -# In[10]: +# In[25]: # locations and hyperlinks @@ -690,7 +686,7 @@ def stop(good=False): # In[11]: -# In[ ]: +# In[26]: def reset_params(): @@ -718,7 +714,7 @@ def reset_params(): meta = collections.OrderedDict() -# In[ ]: +# In[27]: def set_matrix_threshold(sim_m=None, chunk_o=None): @@ -738,7 +734,7 @@ def set_matrix_threshold(sim_m=None, chunk_o=None): MATRIX_THRESHOLD = 60 -# In[ ]: +# In[28]: def do_params_chunk(chunk_f, chunk_i): @@ -769,7 +765,7 @@ def do_params_chunk(chunk_f, chunk_i): return do_chunk -# In[ ]: +# In[29]: def do_params(chunk_f, chunk_i, sim_m, sim_thr): @@ -833,7 +829,7 @@ def do_params(chunk_f, chunk_i, sim_m, sim_thr): return (do_chunk, do_prep, do_sim, do_clique, False) -# In[11]: +# In[30]: reset_params() @@ -847,7 +843,7 @@ def do_params(chunk_f, chunk_i, sim_m, sim_thr): # In[12]: -# In[12]: +# In[31]: def chunking(do_chunk): @@ -957,7 +953,7 @@ def chunking(do_chunk): # In[13]: -# In[13]: +# In[32]: def preparing(do_prepare): @@ -1014,7 +1010,7 @@ def preparing(do_prepare): # In[14]: -# In[ ]: +# In[33]: def similarity_post(): @@ -1037,7 +1033,7 @@ def similarity_post(): ) -# In[14]: +# In[34]: def similarity(do_sim): @@ -1216,7 +1212,7 @@ def similarity(do_sim): # In[15]: -# In[ ]: +# In[35]: def key_chunk(i): @@ -1230,7 +1226,7 @@ def key_chunk(i): ) -# In[ ]: +# In[36]: def meta_clique_pre(): @@ -1255,7 +1251,7 @@ def meta_clique_pre(): meta["# SIMILAR PASSAGES"] = len(passages) -# In[ ]: +# In[37]: def meta_clique_pre2(): @@ -1272,7 +1268,7 @@ def meta_clique_pre2(): ) -# In[ ]: +# In[38]: def meta_clique_post(): @@ -1313,7 +1309,7 @@ def meta_clique_post(): ) -# In[15]: +# In[39]: def cliqueing(do_clique): @@ -1471,7 +1467,7 @@ def cliqueing(do_clique): # In[16]: -# In[ ]: +# In[40]: # clique lists @@ -1535,7 +1531,7 @@ def cliqueing(do_clique): """ -# In[ ]: +# In[41]: # chapter diffs @@ -1561,7 +1557,7 @@ def cliqueing(do_clique): """ -# In[ ]: +# In[42]: # table of experiments @@ -1584,7 +1580,7 @@ def cliqueing(do_clique): """ -# In[16]: +# In[43]: legend = """ @@ -1605,7 +1601,7 @@ def cliqueing(do_clique): # In[17]: -# In[ ]: +# In[44]: def xterse_chunk(i): @@ -1616,14 +1612,14 @@ def xterse_chunk(i): return (book, chapter) -# In[ ]: +# In[45]: def xterse_clique(ii): return tuple(sorted({xterse_chunk(i) for i in ii})) -# In[ ]: +# In[46]: def terse_chunk(i): @@ -1635,14 +1631,14 @@ def terse_chunk(i): return (book, chapter, verse) -# In[ ]: +# In[47]: def terse_clique(ii): return tuple(sorted({terse_chunk(i) for i in ii})) -# In[ ]: +# In[48]: def verse_chunk(i): @@ -1659,7 +1655,7 @@ def verse_chunk(i): return '{}'.format(htext) -# In[ ]: +# In[49]: def verse_clique(ii): @@ -1668,7 +1664,7 @@ def verse_clique(ii): ) -# In[ ]: +# In[50]: def condense(vlabels): @@ -1693,7 +1689,7 @@ def condense(vlabels): return cnd -# In[ ]: +# In[51]: def print_diff(a, b): @@ -1715,7 +1711,7 @@ def print_diff(a, b): return (arep, brep) -# In[ ]: +# In[52]: def print_chunk_fine(prev, text, verse_labels, prevlabels): @@ -1739,7 +1735,7 @@ def print_chunk_fine(prev, text, verse_labels, prevlabels): ) -# In[ ]: +# In[53]: def print_chunk_coarse(text, verse_labels): @@ -1751,7 +1747,7 @@ def print_chunk_coarse(text, verse_labels): ) -# In[ ]: +# In[54]: def print_clique(ii, ncliques): @@ -1762,7 +1758,7 @@ def print_clique(ii, ncliques): ) -# In[ ]: +# In[55]: def print_clique_fine(ii): @@ -1796,7 +1792,7 @@ def print_clique_fine(ii): return '{}
\n'.format("".join(result)) -# In[ ]: +# In[56]: def print_clique_coarse(ii): @@ -1822,7 +1818,7 @@ def print_clique_coarse(ii): return '{}
\n'.format("".join(result)) -# In[ ]: +# In[57]: def index_clique(bnm, n, ii, ncliques): @@ -1833,7 +1829,7 @@ def index_clique(bnm, n, ii, ncliques): ) -# In[ ]: +# In[58]: def index_clique_fine(bnm, n, ii): @@ -1854,7 +1850,7 @@ def index_clique_fine(bnm, n, ii): ) -# In[ ]: +# In[59]: def index_clique_coarse(bnm, n, ii): @@ -1881,7 +1877,7 @@ def index_clique_coarse(bnm, n, ii): ) -# In[ ]: +# In[60]: def lines_chapter(c): @@ -1896,7 +1892,7 @@ def lines_chapter(c): return lines -# In[17]: +# In[61]: def compare_chapters(c1, c2, lb1, lb2): @@ -1919,7 +1915,7 @@ def compare_chapters(c1, c2, lb1, lb2): # In[18]: -# In[18]: +# In[62]: # generate the table of experiments @@ -2044,7 +2040,7 @@ def gen_html(standalone=False): # In[19]: -# In[ ]: +# In[63]: def assess_exp(cf, np, nc, ll): @@ -2061,7 +2057,7 @@ def assess_exp(cf, np, nc, ll): ) -# In[19]: +# In[64]: def printing(): @@ -2335,13 +2331,13 @@ def printing(): # In[20]: -# In[ ]: +# In[65]: outputs = {} -# In[ ]: +# In[66]: def writeoutputs(): @@ -2350,7 +2346,7 @@ def writeoutputs(): pickle.dump(outputs, f, protocol=PICKLE_PROTOCOL) -# In[ ]: +# In[67]: def readoutputs(): @@ -2362,7 +2358,7 @@ def readoutputs(): outputs = pickle.load(f) -# In[ ]: +# In[68]: def do_experiment(chunk_f, chunk_i, sim_m, sim_thr, do_index): @@ -2383,7 +2379,7 @@ def do_experiment(chunk_f, chunk_i, sim_m, sim_thr, do_index): gen_html() -# In[ ]: +# In[69]: def do_only_chunk(chunk_f, chunk_i): @@ -2391,7 +2387,7 @@ def do_only_chunk(chunk_f, chunk_i): chunking(do_chunk) -# In[ ]: +# In[70]: def reset_experiments(): @@ -2403,7 +2399,7 @@ def reset_experiments(): gen_html() -# In[ ]: +# In[71]: def do_all_experiments(no_fixed=False, only_object=None): @@ -2423,7 +2419,7 @@ def do_all_experiments(no_fixed=False, only_object=None): gen_html(standalone=True) -# In[ ]: +# In[72]: def do_all_chunks(no_fixed=False, only_object=None): @@ -2438,7 +2434,7 @@ def do_all_chunks(no_fixed=False, only_object=None): do_only_chunk(chunk_f, chunk_i) -# In[20]: +# In[73]: def show_all_experiments(): @@ -2516,7 +2512,7 @@ def show_all_experiments(): # In[21]: -# In[ ]: +# In[74]: def writeSimTable(similars): @@ -2525,7 +2521,7 @@ def writeSimTable(similars): h.write("{}\n".format("\t".join(str(x) for x in entry))) -# In[ ]: +# In[75]: def readSimTable(): @@ -2583,7 +2579,7 @@ def readSimTable(): return similars -# In[21]: +# In[76]: def makeSimTable(): @@ -2623,13 +2619,13 @@ def makeSimTable(): # In[22]: -# In[ ]: +# In[77]: utils.caption(4, "CROSSREFS: Fetching crossrefs") -# In[ ]: +# In[78]: xTable = os.path.exists(TF_TABLE) @@ -2648,7 +2644,7 @@ def makeSimTable(): utils.caption(0, "\tComputing missing {}".format(TF_TABLE)) -# In[22]: +# In[79]: if FORCE_MATRIX or not xTable: @@ -2659,7 +2655,7 @@ def makeSimTable(): # In[23]: -# In[23]: +# In[80]: if not SCRIPT: @@ -2667,16 +2663,14 @@ def makeSimTable(): print("\n".join(sorted(repr(sim) for sim in similars if sim[0] == "SET")[0:10])) -# In[29]: - -# In[ ]: +# In[81]: crossrefData = {} otherMethod = dict(LCS="SET", SET="LCS") -# In[29]: +# In[82]: for (method, v1, v2, sim, *x) in similars: @@ -2694,22 +2688,35 @@ def makeSimTable(): # We generate the feature `crossref`. # It is an edge feature between verse nodes, with the similarity as weight. -# In[32]: - -# In[ ]: +# In[89]: utils.caption(4, "Writing TF parallel features") -# In[ ]: +# In[90]: newFeatureStr = "crossref crossrefSET crossrefLCS" newFeatures = newFeatureStr.strip().split() -# In[ ]: +# In[91]: + + +genericMetaPath = f"{thisRepo}/yaml/generic.yaml" +parallelsMetaPath = f"{thisRepo}/yaml/parallels.yaml" + +with open(genericMetaPath) as fh: + genericMeta = yaml.load(fh, Loader=yaml.FullLoader) + genericMeta["version"] = VERSION +with open(parallelsMetaPath) as fh: + parallelsMeta = formatMeta(yaml.load(fh, Loader=yaml.FullLoader)) + +metaData = {"": genericMeta, **parallelsMeta} + + +# In[92]: nodeFeatures = dict() @@ -2718,21 +2725,15 @@ def makeSimTable(): edgeFeatures["crossref{}".format(method)] = crossrefData[method] -# In[ ]: +# In[93]: -provenance = dict( - source="Parallels Module", - coreData="BHSA", - coreVersion=VERSION, - author="BHSA Data: Constantijn Sikkel; Parallels Notebook: Dirk Roorda, Martijn Naaijer", -) -metaData = {"": provenance} for newFeature in newFeatures: - metaData[newFeature] = dict(valueType="int", edgeValues=True) + metaData[newFeature]["valueType"] = "int" + metaData[newFeature]["edgeValues"] = True -# In[32]: +# In[94]: TF = Fabric(locations=thisTempTf, silent=True) diff --git a/tf/2021/crossref.tf b/tf/2021/crossref.tf index 1885ad5..2adb0d2 100644 --- a/tf/2021/crossref.tf +++ b/tf/2021/crossref.tf @@ -2,11 +2,12 @@ @edgeValues @author=BHSA Data: Constantijn Sikkel; Parallels Notebook: Dirk Roorda, Martijn Naaijer @coreData=BHSA -@coreVersion=2021 -@source=Parallels Module +@description=🆗 links between similar passages +@provenance=Parallels notebook, see https://github.com/ETCBC/parallels @valueType=int +@version=2021 @writtenBy=Text-Fabric -@dateWritten=2021-11-30T15:36:19Z +@dateWritten=2021-12-09T14:40:46Z 1414401 1414407 84 1414401 1414411 89 diff --git a/tf/2021/crossrefLCS.tf b/tf/2021/crossrefLCS.tf index 990f172..bc953f5 100644 --- a/tf/2021/crossrefLCS.tf +++ b/tf/2021/crossrefLCS.tf @@ -2,11 +2,12 @@ @edgeValues @author=BHSA Data: Constantijn Sikkel; Parallels Notebook: Dirk Roorda, Martijn Naaijer @coreData=BHSA -@coreVersion=2021 -@source=Parallels Module +@description=🆗 links between similar passages, based on LCS method +@provenance=Parallels notebook, see https://github.com/ETCBC/parallels @valueType=int +@version=2021 @writtenBy=Text-Fabric -@dateWritten=2021-11-30T15:36:19Z +@dateWritten=2021-12-09T14:40:46Z 1414401 1414407 84 1414401 1414411 89 diff --git a/tf/2021/crossrefSET.tf b/tf/2021/crossrefSET.tf index bdbf6a3..1d2476e 100644 --- a/tf/2021/crossrefSET.tf +++ b/tf/2021/crossrefSET.tf @@ -2,11 +2,12 @@ @edgeValues @author=BHSA Data: Constantijn Sikkel; Parallels Notebook: Dirk Roorda, Martijn Naaijer @coreData=BHSA -@coreVersion=2021 -@source=Parallels Module +@description=🆗 links between similar passages, based on SET method +@provenance=Parallels notebook, see https://github.com/ETCBC/parallels @valueType=int +@version=2021 @writtenBy=Text-Fabric -@dateWritten=2021-11-30T15:36:19Z +@dateWritten=2021-12-09T14:40:46Z 1414505 1414623 80 1414510 1414513 77 diff --git a/yaml/generic.yaml b/yaml/generic.yaml new file mode 100644 index 0000000..87482aa --- /dev/null +++ b/yaml/generic.yaml @@ -0,0 +1,3 @@ +provenance: Parallels notebook, see https://github.com/ETCBC/parallels +coreData: BHSA +author: "BHSA Data: Constantijn Sikkel; Parallels Notebook: Dirk Roorda, Martijn Naaijer" diff --git a/yaml/parallels.yaml b/yaml/parallels.yaml new file mode 100644 index 0000000..1e6e9a0 --- /dev/null +++ b/yaml/parallels.yaml @@ -0,0 +1,7 @@ +crossref: + desc: 🆗 links between similar passages +crossrefSET: + desc: 🆗 links between similar passages, based on SET method +crossrefLCS: + desc: 🆗 links between similar passages, based on LCS method +