From a229f3ff19693e53b0731e26920ee491e54d918a Mon Sep 17 00:00:00 2001
From: Stijn Peeters <stijn.peeters@uva.nl>
Date: Wed, 1 May 2024 17:05:20 +0200
Subject: [PATCH 1/6] Controls for pseudonymisation

---
 popup/interface.html | 20 ++++++++++++++++++--
 popup/interface.js   | 12 ++++++++++--
 2 files changed, 28 insertions(+), 4 deletions(-)
diff --git a/popup/interface.html b/popup/interface.html
index 0441bd7..fdd8723 100644
--- a/popup/interface.html
+++ b/popup/interface.html
@@ -219,7 +219,7 @@
             margin-right: 0.25em;
         }
 
-        input:not([type=checkbox]):not([type=radio]), button {
+        input:not([type=checkbox]):not([type=radio]), button, select {
             background: var(--neutral-contrast-alt);
             color: var(--accent);
             border: 2px solid var(--neutral-contrast);
@@ -265,11 +265,15 @@
             content: ' \2022';
         }
 
-        .fourcat-url-container, .zeeschuimer-master-switch, .import-container {
+        .fourcat-url-container, .zeeschuimer-master-switch, .import-container, .fourcat-pseudonymisation-container {
             text-align: center;
             margin-bottom: 0.5em;
         }
 
+        .fourcat-pseudonymisation-container select {
+            max-width: 15em;
+        }
+
         #upload-status {
             text-align: center;
         }
@@ -410,6 +414,18 @@ <h2><span>Connect to 4CAT</span></h2>
                       title="The URL of the 4CAT server to upload datasets to. Make sure you're logged in to this URL with this browser.">?</span>
             </label>
         </div>
+        <div class="fourcat-pseudonymisation-container">
+            <label>
+                <span>Pseudonymise data in 4CAT:</span>
+                <select id="fourcat-pseudonymisation">
+                    <option value="anonymise">Anonymise - Replace author information with 'REDACTED'</option>
+                    <option value="pseudonymise">Pseudonymise - Replace author information with hashed values</option>
+                    <option value="none">Leave author information as-is</option>
+                </select>
+                <span class="tooltippable"
+                        title="4CAT can pseudonymise the data after importing it in a number of ways. Note that this happens AFTER uploading and non-pseudonymised data will always be sent to the 4CAT server first.">?</span>
+            </label>
+        </div>
         <p id="upload-status"></p>
     </section>
     <section>
diff --git a/popup/interface.js b/popup/interface.js
index c7180ee..a397c84 100644
--- a/popup/interface.js
+++ b/popup/interface.js
@@ -82,7 +82,7 @@ async function get_4cat_url(e) {
  * @returns {Promise<void>}
  */
 async function set_4cat_url(e) {
-    if(e !== true && !e.target.matches('#fourcat-url')) {
+    if(e !== true && !e.target.matches('#fourcat-url') && !e.target.matches('#fourcat-pseudonymisation')) {
         return;
     }
 
@@ -105,6 +105,8 @@ async function set_4cat_url(e) {
         }
     }
 
+    await background.browser.storage.local.set({'4cat-pseudonymise': document.querySelector('#fourcat-pseudonymisation').value});
+
     have_4cat = (url && url.length > 0);
 }
 
@@ -314,8 +316,10 @@ async function button_handler(event) {
         xhr = new XMLHttpRequest();
         xhr.aborted = false;
         let upload_url = await get_4cat_url();
+        let pseudonymise_bit = document.querySelector('#fourcat-pseudonymisation').value
+        pseudonymise_bit = pseudonymise_bit !== 'none' ? '?pseudonymise=' + pseudonymise_bit : ''
 
-        xhr.open("POST", upload_url + "/api/import-dataset/", true);
+        xhr.open("POST", upload_url + "/api/import-dataset/" + pseudonymise_bit, true);
         xhr.setRequestHeader("X-Zeeschuimer-Platform", platform)
         xhr.onloadstart = function () {
             status.innerText = 'Starting upload...';
@@ -673,5 +677,9 @@ document.addEventListener('DOMContentLoaded', async function () {
     const fourcat_url = await background.browser.storage.local.get('4cat-url');
     document.querySelector('#fourcat-url').value = fourcat_url['4cat-url'] ? fourcat_url['4cat-url'] : '';
 
+    const pseudonymise = await background.browser.storage.local.get('4cat-pseudonymise');
+    console.log(pseudonymise);
+    document.querySelector('#fourcat-pseudonymisation').value = pseudonymise['4cat-pseudonymise'] ? pseudonymise['4cat-pseudonymise'] : 'none';
+
     browser.downloads.onChanged.addListener(downloadListener);
 });
\ No newline at end of file

From 70305bfaaffd829f60d76c594951cb139f633099 Mon Sep 17 00:00:00 2001
From: Stijn Peeters <stijn.peeters@uva.nl>
Date: Thu, 23 May 2024 15:29:17 +0200
Subject: [PATCH 2/6] Clarify tooltip

---
 popup/interface.html | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/popup/interface.html b/popup/interface.html
index 9ee8d06..8f5d842 100644
--- a/popup/interface.html
+++ b/popup/interface.html
@@ -385,7 +385,7 @@
 <article>
     <header>
         <h1>Zeeschuimer</h1>
-        <span class="version"><a href="https://github.com/digitalmethodsinitiative/zeeschuimer/releases/tag/v1.10.1">v1.10.1</a></span>
+        <span class="version"><a href="https://github.com/digitalmethodsinitiative/zeeschuimer/releases/tag/v1.11.0">v1.11.0</a></span>
     </header>
     <section id="status">
         <h2><span>Captured data objects</span></h2>
@@ -423,7 +423,7 @@ <h2><span>Connect to 4CAT</span></h2>
                     <option value="none">Leave author information as-is</option>
                 </select>
                 <span class="tooltippable"
-                        title="4CAT can pseudonymise the data after importing it in a number of ways. Note that this happens AFTER uploading and non-pseudonymised data will always be sent to the 4CAT server first.">?</span>
+                        title="4CAT can pseudonymise the data after importing it in a number of ways. Note that this happens AFTER uploading and non-pseudonymised data will always be sent to the 4CAT server first. 4CAT versions prior to 1.43 do not support this and will require you to manually pseudonymise after uploading.">?</span>
             </label>
         </div>
         <p id="upload-status"></p>

From dfd934946a4a8a18614185fec9330227542f5103 Mon Sep 17 00:00:00 2001
From: Stijn Peeters <stijn.peeters@uva.nl>
Date: Thu, 23 May 2024 15:29:24 +0200
Subject: [PATCH 3/6] Bump version

---
 .zenodo.json  | 2 +-
 manifest.json | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.zenodo.json b/.zenodo.json
index 01a63d4..0589681 100644
--- a/.zenodo.json
+++ b/.zenodo.json
@@ -3,7 +3,7 @@
   "license": "MPL-2.0",
   "title": "Zeeschuimer",
   "upload_type": "software",
-  "version": "v1.10.1",
+  "version": "v1.11.0",
   "keywords": [
     "scraping", "data capture", "4cat", "instagram", "tiktok"
   ],
diff --git a/manifest.json b/manifest.json
index 9478347..85cf353 100644
--- a/manifest.json
+++ b/manifest.json
@@ -3,7 +3,7 @@
   "description": "Collect data while browsing social media platforms and upload it for analysis later",
   "manifest_version": 2,
   "name": "Zeeschuimer",
-  "version": "1.10.1",
+  "version": "1.11.0",
   "homepage_url": "https://github.com/digitalmethodsinitiative/zeeschuimer",
 
   "browser_specific_settings": {

From 73c1f92037ec20a9ee832db9f32636c3c229e20d Mon Sep 17 00:00:00 2001
From: Stijn Peeters <stijn.peeters@uva.nl>
Date: Fri, 23 Aug 2024 17:23:43 +0200
Subject: [PATCH 4/6] Squashed commit of the following:

commit 03fb948fe5ab410c3edccb49e20491df644cac03
Author: Stijn Peeters <stijn.peeters@uva.nl>
Date:   Wed Aug 21 14:12:51 2024 +0200

    Bump version

commit 1fca90410f1060c7797c043e22978b49871a8779
Author: Stijn Peeters <stijn.peeters@uva.nl>
Date:   Wed Aug 21 14:12:01 2024 +0200

    Fix Instagram module

commit 2d2337cdf10f1fae3219671bfbeda006c996fa2c
Author: Stijn Peeters <stijn.peeters@uva.nl>
Date:   Wed Aug 21 14:11:54 2024 +0200

    Fix 9gag module

commit 10c4396dd33b36e7cf4ea69b59b23973ce356c37
Author: Dale Wahl <dalewahl@gmail.com>
Date:   Wed Jun 12 12:03:52 2024 +0200

    linkedin: fix regex; check additional data_bits (if present)
---
 modules/9gag.js      | 17 ++++++++--------
 modules/instagram.js | 46 ++++++++++++++++++--------------------------
 modules/linkedin.js  | 38 ++++++++++++++++++++++++++++++++++--
 3 files changed, 64 insertions(+), 37 deletions(-)

diff --git a/modules/9gag.js b/modules/9gag.js
index 11e0242..eec376a 100644
--- a/modules/9gag.js
+++ b/modules/9gag.js
@@ -22,19 +22,20 @@ zeeschuimer.register_module(
                 return [];
             }
             try {
-                response = JSON.parse(response.split(embedded_sigil_end)[0]);
+                const embedded_json = response.split(embedded_sigil_end)[0];
+                response = JSON.parse(embedded_json);
             } catch (e) {
                 return [];
             }
-        } else {
-            try {
-                data = JSON.parse(response);
-            } catch (SyntaxError) {
-                return [];
-            }
         }
 
-        if(!("data" in data) || typeof data["data"] !== 'object' || !("posts" in data["data"])) {
+        try {
+            data = JSON.parse(response);
+        } catch (e) {
+            return [];
+        }
+
+        if(!data || typeof data["data"] !== 'object' || !("data" in data) || !("posts" in data["data"])) {
             return [];
         }
 
diff --git a/modules/instagram.js b/modules/instagram.js
index 6bf0a1a..b71eb60 100644
--- a/modules/instagram.js
+++ b/modules/instagram.js
@@ -3,25 +3,12 @@ zeeschuimer.register_module(
     'instagram.com',
     function (response, source_platform_url, source_url) {
         let domain = source_platform_url.split("/")[2].toLowerCase().replace(/^www\./, '');
-        let endpoint = source_url.split("/").slice(3).join("/").split("?")[0].split("#")[0].replace(/\/$/, '');
 
         if (!["instagram.com"].includes(domain)) {
+            console.log('ignoring non-instagram url ' + source_url);
             return [];
         }
 
-        /*let whitelisted_endpoints = [
-            "graphql/query", //live-loading @ front page
-            "api/v1/collections/list",
-            "api/v1/feed/user/33646200", //live-loading @ user page
-            "api/v1/tags/blessed/sections", //live-loading @ tag explore page
-            "api/v1/locations/214262158/sections", //live-loading @ location explore page
-            "api/v1/clips/music", //live-loading @ music overview page
-        ]
-
-        if(!whitelisted_endpoints.includes(endpoint)) {
-            return [];
-        }*/
-
         // determine what part of instagram we're working in
         // 'view' unused for now but may have some bearing on how to parse the data
         // in any case
@@ -32,11 +19,13 @@ zeeschuimer.register_module(
             view = "frontpage";
         } else if (["direct", "account", "directory", "lite", "legal"].includes(path[3])) {
             // not post listings but misc instagram views/pages
+            // console.log('ignoring misc url ' + source_url);
             return [];
         } else if (source_url.indexOf('injected_story_units') >= 0) {
             // injected ads (this URL appears on many ad blocklists!)
             // might enable if we decide to also capture ads? but not clear where these actually show up in the
             // interface...
+            // console.log('ignoring ads from ' + source_url);
             return [];
         } else if (path[3] === "explore") {
             // hashtag, location view
@@ -52,14 +41,14 @@ zeeschuimer.register_module(
         if ((source_platform_url.indexOf('reels/audio') >= 0
                 || source_platform_url.indexOf('/explore/') >= 0
             )
-            && source_url.endsWith('graphql')) {
+            && (source_url.endsWith('graphql') || source_url.endsWith('graphql/query'))) {
             // reels audio page f.ex. loads personalised reels in the background (unrelated to the audio) but doesn't
-            // seem to actually use them
+            // seem to actually use them
 
+            // console.log('ignoring pre-cache ' + source_url);
             return [];
         }
 
-
         let datas = [];
         try {
             // if it's JSON already, just parse it
@@ -68,7 +57,7 @@ zeeschuimer.register_module(
             // data can be embedded in the HTML in these JavaScript statements
             // this is mostly used for:
             // - single post pages (e.g. https://www.instagram.com/p/C1hWCZLPQ9T/)
-            //   ✔️ confirmed working as of 2024-apr-19
+            //   ✔️ confirmed working as of 2024-aug-21
 
             let js_prefixes = [
                 "{\"require\":[[\"ScheduledServerJS\",\"handle\",null,[{\"__bbox\":{\"require\":[[\"RelayPrefetchedStreamCache\",\"next\",[],["
@@ -93,7 +82,7 @@ zeeschuimer.register_module(
                         json_bit = json_bit.substring(0, -1);
                     }
 
-                    if (json_bit.indexOf('adp_PolarisDesktopPostPageRelatedMediaGridQueryRelayPreloader') >= 0) {
+                    if (json_bit.indexOf('adp_PolarisDesktopPostPageRelatedMediaGrid') >= 0) {
                         // 'related posts', this is never what we are looking for
                         continue;
                     }
@@ -117,12 +106,14 @@ zeeschuimer.register_module(
             }
 
             if (datas.length === 0) {
+                // console.log('no datas for ' + source_url);
                 return [];
             }
         }
 
         if (datas.length === 1 && 'lightspeed_web_request_for_igd' in datas[0] && source_url.endsWith('graphql')) {
             // this is one of those background requests
+            // console.log('ignoring background request ' + source_url);
             datas = [];
         }
 
@@ -142,11 +133,11 @@ zeeschuimer.register_module(
 
                 // pages not covered:
                 // - explore (e.g. https://www.instagram.com/explore/)
-                //   ❌ as of 2024-feb-20
+                //   ❌ as of 2024-aug-21
                 // - 'tagged' pages for a user (e.g. https://www.instagram.com/steveo/tagged/)
-                //   ❌ as of 2024-feb-20
+                //   ❌ as of 2024-aug-21
                 // - 'reels' user pages (e.g. https://www.instagram.com/ogata.yoshiyuki/reels/)
-                //   ❌ as of 2024-feb-20
+                //   ❌ as of 2024-aug-21
                 // these do not load enough post metadata (e.g. author or caption), so too different from other items
                 // to parse
                 // - suggested posts on user feed
@@ -155,11 +146,11 @@ zeeschuimer.register_module(
                 if (possible_item_lists.includes(property) || property === "items") {
                     // - posts on explore pages for specific tags (e.g. https://www.instagram.com/explore/tags/blessed/)
                     // - posts on explore pages for locations (e.g. https://www.instagram.com/explore/locations/238875664/switzerland/)
-                    //   ✔️ confirmed working as of 2024-feb-20
+                    //   ✔️ confirmed working as of 2024-aug-21
                     // - posts on explore pages for sounds (e.g. https://www.instagram.com/reels/audio/290315579897542/)
-                    //   ✔️ confirmed working as of 2024-feb-20
+                    //   ✔️ confirmed working as of 2024-aug-21
                     // - posts when opened by clicking on them
-                    //   ✔️ confirmed working as of 2024-feb-20
+                    //   ✔️ confirmed working as of 2024-aug-21
                     let items;
                     if (property === "medias" || property === "fill_items") {
                         items = obj[property].map(media => media["media"]);
@@ -167,7 +158,7 @@ zeeschuimer.register_module(
                         items = obj[property].map(media => media["media_or_ad"]);
                     } else if (property === "items" && obj[property].length === obj[property].filter(i => Object.getOwnPropertyNames(i).join('') === 'media').length) {
                         // - posts on explore pages for sounds (e.g. https://www.instagram.com/reels/audio/290315579897542/)
-                        //   ✔️ confirmed working as of 2024-feb-20
+                        //   ✔️ confirmed working as of 2024-aug-21
                         if(property === 'items' && 'design' in obj) {
                             // this is loaded, but never actually displayed...
                             // seems to be a preview of reels for a given tag, but again, not
@@ -211,7 +202,7 @@ zeeschuimer.register_module(
                     }).map(node => node["media"]));
                 } else if (["xdt_api__v1__feed__user_timeline_graphql_connection"].includes(property)) {
                     // - posts on user pages (e.g. https://www.instagram.com/ogata.yoshiyuki/)
-                    //   ✔️ confirmed working as of 2024-feb-20
+                    //   ✔️ confirmed working as of 2024-aug-21
                     edges.push(...obj[property]["edges"].filter(edge => "node" in edge).map(edge => edge["node"]).filter(node => {
                         return node !== null
                             && "id" in node
@@ -236,6 +227,7 @@ zeeschuimer.register_module(
             }
         }
 
+        // console.log('got ' + edges.length + ' via ' + source_url)
         return edges;
     }
 );
\ No newline at end of file
diff --git a/modules/linkedin.js b/modules/linkedin.js
index cf387ad..58ff675 100644
--- a/modules/linkedin.js
+++ b/modules/linkedin.js
@@ -10,45 +10,75 @@ zeeschuimer.register_module(
         // objects embedded in HTML are identified by this bit of text
         let items = [];
         let data = [];
+        let data_type = "";
         try {
             // when dealing with JSON, just parse that JSON and process it
             data.push(JSON.parse(response));
+            data_type = "JSON";
         } catch (e) {
             // data is not JSON, so it's probably HTML
             // HTML has data embedded in <code> tags
             // store these for processing
-            const code_regex = RegExp(/<code>(.[^<]+)<\/code>/g);
+            const code_regex = RegExp(/<code(.[^<]+)<\/code>/g);
+
             for (const code_bit of response.matchAll(code_regex)) {
+                // console.log("Code; checking for JSON");
                 try {
                     // use he to decode from HTML entities (the way the data is embedded)
                     data.push(JSON.parse(he.decode(code_bit)));
+                    data_type = "HTML";
+                    // console.log("Found JSON in code block");
                 } catch (e) {
                 }
             }
         }
 
         const eligible_list_types = ["feedDashMainFeedByMainFeed", "feedDashInterestUpdatesByInterestFeedByKeywords", "feedDashProfileUpdatesByMemberShareFeed"]
+        const uninterseting_list_types = ["*dashMySettings", "messagingDashMessagingSettings", "*searchDashSearchHome", "searchDashTypeaheadByGlobalTypeahead", "messagingDashAffiliatedMailboxesAll", "legoDashPageContentsByPageKeyAndSlotId", "searchDashFilterClustersByFilters"]
         for (const data_bit of data) {
             // now we have the data, try to parse it
             // is this object post data?
             let item_index = [];
+            let location = "";
             if ("data" in data_bit && "included" in data_bit) {
                 // items may be referenced as 'results' for search result pages or 'elements' for the feed
                 let item_key = '';
                 if ("*elements" in data_bit["data"]) {
                     item_index = data_bit["data"]["*elements"];
+                    location = "data.*elements";
                 } else if ("results" in data_bit["data"]) {
                     item_index = data_bit["data"]["results"];
+                    location = "data.results";
                 } else if ("data" in data_bit["data"] && Object.keys(data_bit["data"]["data"]).filter(k => eligible_list_types.includes(k))) {
                     for(const k of eligible_list_types) {
                         if(k in data_bit["data"]["data"]) {
                             item_index = data_bit["data"]["data"][k]["*elements"];
+                            location = `data.data.${k}.*elements`;
                             break;
                         }
                     }
+                    if (location === "") {
+                        // Found nothing eligible
+                        let uninteresting = false;
+                        for (const k of uninterseting_list_types) {
+                            if(k in data_bit["data"]["data"]) {
+                                uninteresting = true;
+                            }
+                        }
+
+                        if (!uninteresting) {
+                            // Possibly interesting data
+                            // console.log("No items found in data_bit:");
+                            // console.log(data_bit);
+                        }
+                        continue;
+                    }
                 } else {
-                    return [];
+                    // console.log("No items found in data:");
+                    // console.log(data_bit);
+                    continue;
                 }
+                //console.log(`Searching items at ${location} from ${data_type} data on ${source_platform_url}`);
 
                 // there is a list of objects, each with an ID
                 // and a separate list of items to display, a list of those IDs
@@ -61,6 +91,7 @@ zeeschuimer.register_module(
 
                 // then we get the objects with the IDs in the item list
                 // and that is our result set!
+                let num_items = 0;
                 for (let object_ref in item_index) {
                     let result = item_index[object_ref];
 
@@ -72,6 +103,7 @@ zeeschuimer.register_module(
                     // we are (for now?) only interested in posts, which are identified in this way
                     if (result.indexOf('urn:li:fs_updateV2:(urn:li:activity:') !== 0
                       && result.indexOf('urn:li:fsd_update:(urn:li:activity:') !== 0) {
+                        // console.log(`Skipping non-post item ${result}`);
                         continue;
                     }
 
@@ -79,7 +111,9 @@ zeeschuimer.register_module(
                     result_object["id"] = result;
 
                     items.push(result_object);
+                    num_items++;
                 }
+                console.log(`Found ${num_items} items in ${location} from ${data_type} data on ${source_platform_url}`);
 
             }
         }

From 4c5847aafff3eecf642b34e136c540d532a1d0b9 Mon Sep 17 00:00:00 2001
From: Stijn Peeters <stijn.peeters@uva.nl>
Date: Mon, 23 Sep 2024 13:50:59 +0200
Subject: [PATCH 5/6] Fix wait on captcha in tests

---
 tests/test.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tests/test.py b/tests/test.py
index a176967..71b8a7d 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -182,6 +182,7 @@
                     captcha_element = driver.find_element(By.CSS_SELECTOR, settings.get("captcha-selector"))
                     if captcha_element.is_displayed():
                         print(colored(f"{indent} :: [⚠️] Captcha detected... Press Enter after you have solved the captcha", "yellow"))
+                        input()
                 except selenium_exceptions.NoSuchElementException:
                     pass
 
@@ -197,7 +198,11 @@
                 # scroll and check if more items are loaded
                 driver.switch_to.window(handles[1])
                 driver.execute_script("window.scrollBy(0, document.querySelector('html').scrollHeight);")
-                time.sleep(settings.get("wait", 5))
+                time.sleep(0.5)
+                driver.execute_script("window.scrollBy(0, document.querySelector('html').scrollHeight);")
+                time.sleep(0.5)
+                driver.execute_script("window.scrollBy(0, document.querySelector('html').scrollHeight);")
+                time.sleep(settings.get("wait", 5) - 1)
 
                 driver.switch_to.window(handles[0])
                 num_after_scroll = int(re.sub("[^0-9]", "", driver.execute_script(

From 6992475f28148992eb6a001d1ac17a298b26bbb6 Mon Sep 17 00:00:00 2001
From: Stijn Peeters <stijn.peeters@uva.nl>
Date: Mon, 23 Sep 2024 15:19:51 +0200
Subject: [PATCH 6/6] Squashed commit of the following:

commit 9b6e26310f83e8f7c3190f6e51b577ee7fcd9371
Author: Stijn Peeters <stijn.peeters@uva.nl>
Date:   Mon Sep 23 15:18:20 2024 +0200

    Fix wait on captcha in tests

commit 0ad0fb6ef8fa7b61f188cec41b623e3d5e971c35
Author: Stijn Peeters <stijn.peeters@uva.nl>
Date:   Mon Sep 23 15:17:23 2024 +0200

    Remove debug log from Instagram module

commit 0400b1af4390abe0cbc253d340d350455fac15b9
Author: Stijn Peeters <stijn.peeters@uva.nl>
Date:   Mon Sep 23 15:17:10 2024 +0200

    Fix LinkedIn search results

commit 03fb948fe5ab410c3edccb49e20491df644cac03
Author: Stijn Peeters <stijn.peeters@uva.nl>
Date:   Wed Aug 21 14:12:51 2024 +0200

    Bump version

commit 1fca90410f1060c7797c043e22978b49871a8779
Author: Stijn Peeters <stijn.peeters@uva.nl>
Date:   Wed Aug 21 14:12:01 2024 +0200

    Fix Instagram module

commit 2d2337cdf10f1fae3219671bfbeda006c996fa2c
Author: Stijn Peeters <stijn.peeters@uva.nl>
Date:   Wed Aug 21 14:11:54 2024 +0200

    Fix 9gag module

commit 10c4396dd33b36e7cf4ea69b59b23973ce356c37
Author: Dale Wahl <dalewahl@gmail.com>
Date:   Wed Jun 12 12:03:52 2024 +0200

    linkedin: fix regex; check additional data_bits (if present)
---
 modules/linkedin.js | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/modules/linkedin.js b/modules/linkedin.js
index 58ff675..320dff9 100644
--- a/modules/linkedin.js
+++ b/modules/linkedin.js
@@ -12,20 +12,24 @@ zeeschuimer.register_module(
         let data = [];
         let data_type = "";
         try {
+            if(response.indexOf('<!DOCTYPE html>') >= 0) {
+                throw new Error();
+            }
             // when dealing with JSON, just parse that JSON and process it
-            data.push(JSON.parse(response));
+            const json_data = JSON.parse(response);
+            data.push(json_data);
             data_type = "JSON";
         } catch (e) {
             // data is not JSON, so it's probably HTML
             // HTML has data embedded in <code> tags
             // store these for processing
-            const code_regex = RegExp(/<code(.[^<]+)<\/code>/g);
+            const code_regex = RegExp(/<code.*>([^<]+)<\/code>/g);
 
             for (const code_bit of response.matchAll(code_regex)) {
                 // console.log("Code; checking for JSON");
                 try {
                     // use he to decode from HTML entities (the way the data is embedded)
-                    data.push(JSON.parse(he.decode(code_bit)));
+                    data.push(JSON.parse(he.decode(code_bit[1].trim())));
                     data_type = "HTML";
                     // console.log("Found JSON in code block");
                 } catch (e) {
@@ -33,8 +37,8 @@ zeeschuimer.register_module(
             }
         }
 
-        const eligible_list_types = ["feedDashMainFeedByMainFeed", "feedDashInterestUpdatesByInterestFeedByKeywords", "feedDashProfileUpdatesByMemberShareFeed"]
-        const uninterseting_list_types = ["*dashMySettings", "messagingDashMessagingSettings", "*searchDashSearchHome", "searchDashTypeaheadByGlobalTypeahead", "messagingDashAffiliatedMailboxesAll", "legoDashPageContentsByPageKeyAndSlotId", "searchDashFilterClustersByFilters"]
+        const eligible_list_types = ["feedDashMainFeedByMainFeed", "feedDashInterestUpdatesByInterestFeedByKeywords", "feedDashProfileUpdatesByMemberShareFeed", "searchDashClustersByAll"]
+        const uninteresting_list_types = ["*dashMySettings", "messagingDashMessagingSettings", "*searchDashSearchHome", "searchDashTypeaheadByGlobalTypeahead", "messagingDashAffiliatedMailboxesAll", "legoDashPageContentsByPageKeyAndSlotId", "searchDashFilterClustersByFilters"]
         for (const data_bit of data) {
             // now we have the data, try to parse it
             // is this object post data?
@@ -52,15 +56,23 @@ zeeschuimer.register_module(
                 } else if ("data" in data_bit["data"] && Object.keys(data_bit["data"]["data"]).filter(k => eligible_list_types.includes(k))) {
                     for(const k of eligible_list_types) {
                         if(k in data_bit["data"]["data"]) {
-                            item_index = data_bit["data"]["data"][k]["*elements"];
-                            location = `data.data.${k}.*elements`;
+                            const elements_key = (data_bit["data"]["data"]['*elements'] !== undefined) ? '*elements' : 'elements';
+                            item_index = data_bit["data"]["data"][k][elements_key];
+                            location = `data.data.${k}.${elements_key}`;
+
+                            if (typeof (item_index) !== 'string' && item_index.length > 0 && item_index[0]['items'] !== undefined) {
+                                // embedded results on search page
+                                item_index = item_index[0]['items'].map(item => {
+                                    return item['item']['searchFeedUpdate']['*update'];
+                                });
+                            }
                             break;
                         }
                     }
                     if (location === "") {
                         // Found nothing eligible
                         let uninteresting = false;
-                        for (const k of uninterseting_list_types) {
+                        for (const k of uninteresting_list_types) {
                             if(k in data_bit["data"]["data"]) {
                                 uninteresting = true;
                             }