From a229f3ff19693e53b0731e26920ee491e54d918a Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Wed, 1 May 2024 17:05:20 +0200 Subject: [PATCH 1/6] Controls for pseudonymisation --- popup/interface.html | 20 ++++++++++++++++++-- popup/interface.js | 12 ++++++++++-- 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/popup/interface.html b/popup/interface.html index 0441bd7..fdd8723 100644 --- a/popup/interface.html +++ b/popup/interface.html @@ -219,7 +219,7 @@ margin-right: 0.25em; } - input:not([type=checkbox]):not([type=radio]), button { + input:not([type=checkbox]):not([type=radio]), button, select { background: var(--neutral-contrast-alt); color: var(--accent); border: 2px solid var(--neutral-contrast); @@ -265,11 +265,15 @@ content: ' \2022'; } - .fourcat-url-container, .zeeschuimer-master-switch, .import-container { + .fourcat-url-container, .zeeschuimer-master-switch, .import-container, .fourcat-pseudonymisation-container { text-align: center; margin-bottom: 0.5em; } + .fourcat-pseudonymisation-container select { + max-width: 15em; + } + #upload-status { text-align: center; } @@ -410,6 +414,18 @@

Connect to 4CAT

title="The URL of the 4CAT server to upload datasets to. Make sure you're logged in to this URL with this browser.">? +
+ +

diff --git a/popup/interface.js b/popup/interface.js index c7180ee..a397c84 100644 --- a/popup/interface.js +++ b/popup/interface.js @@ -82,7 +82,7 @@ async function get_4cat_url(e) { * @returns {Promise} */ async function set_4cat_url(e) { - if(e !== true && !e.target.matches('#fourcat-url')) { + if(e !== true && !e.target.matches('#fourcat-url') && !e.target.matches('#fourcat-pseudonymisation')) { return; } @@ -105,6 +105,8 @@ async function set_4cat_url(e) { } } + await background.browser.storage.local.set({'4cat-pseudonymise': document.querySelector('#fourcat-pseudonymisation').value}); + have_4cat = (url && url.length > 0); } @@ -314,8 +316,10 @@ async function button_handler(event) { xhr = new XMLHttpRequest(); xhr.aborted = false; let upload_url = await get_4cat_url(); + let pseudonymise_bit = document.querySelector('#fourcat-pseudonymisation').value + pseudonymise_bit = pseudonymise_bit !== 'none' ? '?pseudonymise=' + pseudonymise_bit : '' - xhr.open("POST", upload_url + "/api/import-dataset/", true); + xhr.open("POST", upload_url + "/api/import-dataset/" + pseudonymise_bit, true); xhr.setRequestHeader("X-Zeeschuimer-Platform", platform) xhr.onloadstart = function () { status.innerText = 'Starting upload...'; @@ -673,5 +677,9 @@ document.addEventListener('DOMContentLoaded', async function () { const fourcat_url = await background.browser.storage.local.get('4cat-url'); document.querySelector('#fourcat-url').value = fourcat_url['4cat-url'] ? fourcat_url['4cat-url'] : ''; + const pseudonymise = await background.browser.storage.local.get('4cat-pseudonymise'); + console.log(pseudonymise); + document.querySelector('#fourcat-pseudonymisation').value = pseudonymise['4cat-pseudonymise'] ? pseudonymise['4cat-pseudonymise'] : 'none'; + browser.downloads.onChanged.addListener(downloadListener); }); \ No newline at end of file From 70305bfaaffd829f60d76c594951cb139f633099 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Thu, 23 May 2024 15:29:17 +0200 Subject: [PATCH 2/6] Clarify tooltip --- popup/interface.html | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/popup/interface.html b/popup/interface.html index 9ee8d06..8f5d842 100644 --- a/popup/interface.html +++ b/popup/interface.html @@ -385,7 +385,7 @@

Zeeschuimer

- v1.10.1 + v1.11.0

Captured data objects

@@ -423,7 +423,7 @@

Connect to 4CAT

? + title="4CAT can pseudonymise the data after importing it in a number of ways. Note that this happens AFTER uploading and non-pseudonymised data will always be sent to the 4CAT server first. 4CAT versions prior to 1.43 do not support this and will require you to manually pseudonymise after uploading.">?

From dfd934946a4a8a18614185fec9330227542f5103 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Thu, 23 May 2024 15:29:24 +0200 Subject: [PATCH 3/6] Bump version --- .zenodo.json | 2 +- manifest.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.zenodo.json b/.zenodo.json index 01a63d4..0589681 100644 --- a/.zenodo.json +++ b/.zenodo.json @@ -3,7 +3,7 @@ "license": "MPL-2.0", "title": "Zeeschuimer", "upload_type": "software", - "version": "v1.10.1", + "version": "v1.11.0", "keywords": [ "scraping", "data capture", "4cat", "instagram", "tiktok" ], diff --git a/manifest.json b/manifest.json index 9478347..85cf353 100644 --- a/manifest.json +++ b/manifest.json @@ -3,7 +3,7 @@ "description": "Collect data while browsing social media platforms and upload it for analysis later", "manifest_version": 2, "name": "Zeeschuimer", - "version": "1.10.1", + "version": "1.11.0", "homepage_url": "https://github.com/digitalmethodsinitiative/zeeschuimer", "browser_specific_settings": { From 73c1f92037ec20a9ee832db9f32636c3c229e20d Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Fri, 23 Aug 2024 17:23:43 +0200 Subject: [PATCH 4/6] Squashed commit of the following: commit 03fb948fe5ab410c3edccb49e20491df644cac03 Author: Stijn Peeters Date: Wed Aug 21 14:12:51 2024 +0200 Bump version commit 1fca90410f1060c7797c043e22978b49871a8779 Author: Stijn Peeters Date: Wed Aug 21 14:12:01 2024 +0200 Fix Instagram module commit 2d2337cdf10f1fae3219671bfbeda006c996fa2c Author: Stijn Peeters Date: Wed Aug 21 14:11:54 2024 +0200 Fix 9gag module commit 10c4396dd33b36e7cf4ea69b59b23973ce356c37 Author: Dale Wahl Date: Wed Jun 12 12:03:52 2024 +0200 linkedin: fix regex; check additional data_bits (if present) --- modules/9gag.js | 17 ++++++++-------- modules/instagram.js | 46 ++++++++++++++++++-------------------------- modules/linkedin.js | 38 ++++++++++++++++++++++++++++++++++-- 3 files changed, 64 insertions(+), 37 deletions(-) diff --git a/modules/9gag.js b/modules/9gag.js index 11e0242..eec376a 100644 --- a/modules/9gag.js +++ b/modules/9gag.js @@ -22,19 +22,20 @@ zeeschuimer.register_module( return []; } try { - response = JSON.parse(response.split(embedded_sigil_end)[0]); + const embedded_json = response.split(embedded_sigil_end)[0]; + response = JSON.parse(embedded_json); } catch (e) { return []; } - } else { - try { - data = JSON.parse(response); - } catch (SyntaxError) { - return []; - } } - if(!("data" in data) || typeof data["data"] !== 'object' || !("posts" in data["data"])) { + try { + data = JSON.parse(response); + } catch (e) { + return []; + } + + if(!data || typeof data["data"] !== 'object' || !("data" in data) || !("posts" in data["data"])) { return []; } diff --git a/modules/instagram.js b/modules/instagram.js index 6bf0a1a..b71eb60 100644 --- a/modules/instagram.js +++ b/modules/instagram.js @@ -3,25 +3,12 @@ zeeschuimer.register_module( 'instagram.com', function (response, source_platform_url, source_url) { let domain = source_platform_url.split("/")[2].toLowerCase().replace(/^www\./, ''); - let endpoint = source_url.split("/").slice(3).join("/").split("?")[0].split("#")[0].replace(/\/$/, ''); if (!["instagram.com"].includes(domain)) { + console.log('ignoring non-instagram url ' + source_url); return []; } - /*let whitelisted_endpoints = [ - "graphql/query", //live-loading @ front page - "api/v1/collections/list", - "api/v1/feed/user/33646200", //live-loading @ user page - "api/v1/tags/blessed/sections", //live-loading @ tag explore page - "api/v1/locations/214262158/sections", //live-loading @ location explore page - "api/v1/clips/music", //live-loading @ music overview page - ] - - if(!whitelisted_endpoints.includes(endpoint)) { - return []; - }*/ - // determine what part of instagram we're working in // 'view' unused for now but may have some bearing on how to parse the data // in any case @@ -32,11 +19,13 @@ zeeschuimer.register_module( view = "frontpage"; } else if (["direct", "account", "directory", "lite", "legal"].includes(path[3])) { // not post listings but misc instagram views/pages + // console.log('ignoring misc url ' + source_url); return []; } else if (source_url.indexOf('injected_story_units') >= 0) { // injected ads (this URL appears on many ad blocklists!) // might enable if we decide to also capture ads? but not clear where these actually show up in the // interface... + // console.log('ignoring ads from ' + source_url); return []; } else if (path[3] === "explore") { // hashtag, location view @@ -52,14 +41,14 @@ zeeschuimer.register_module( if ((source_platform_url.indexOf('reels/audio') >= 0 || source_platform_url.indexOf('/explore/') >= 0 ) - && source_url.endsWith('graphql')) { + && (source_url.endsWith('graphql') || source_url.endsWith('graphql/query'))) { // reels audio page f.ex. loads personalised reels in the background (unrelated to the audio) but doesn't - // seem to actually use them + // seem to actually use them) + // console.log('ignoring pre-cache ' + source_url); return []; } - let datas = []; try { // if it's JSON already, just parse it @@ -68,7 +57,7 @@ zeeschuimer.register_module( // data can be embedded in the HTML in these JavaScript statements // this is mostly used for: // - single post pages (e.g. https://www.instagram.com/p/C1hWCZLPQ9T/) - // ✔️ confirmed working as of 2024-apr-19 + // ✔️ confirmed working as of 2024-aug-21 let js_prefixes = [ "{\"require\":[[\"ScheduledServerJS\",\"handle\",null,[{\"__bbox\":{\"require\":[[\"RelayPrefetchedStreamCache\",\"next\",[],[" @@ -93,7 +82,7 @@ zeeschuimer.register_module( json_bit = json_bit.substring(0, -1); } - if (json_bit.indexOf('adp_PolarisDesktopPostPageRelatedMediaGridQueryRelayPreloader') >= 0) { + if (json_bit.indexOf('adp_PolarisDesktopPostPageRelatedMediaGrid') >= 0) { // 'related posts', this is never what we are looking for continue; } @@ -117,12 +106,14 @@ zeeschuimer.register_module( } if (datas.length === 0) { + // console.log('no datas for ' + source_url); return []; } } if (datas.length === 1 && 'lightspeed_web_request_for_igd' in datas[0] && source_url.endsWith('graphql')) { // this is one of those background requests + // console.log('ignoring background request ' + source_url); datas = []; } @@ -142,11 +133,11 @@ zeeschuimer.register_module( // pages not covered: // - explore (e.g. https://www.instagram.com/explore/) - // ❌ as of 2024-feb-20 + // ❌ as of 2024-aug-21 // - 'tagged' pages for a user (e.g. https://www.instagram.com/steveo/tagged/) - // ❌ as of 2024-feb-20 + // ❌ as of 2024-aug-21 // - 'reels' user pages (e.g. https://www.instagram.com/ogata.yoshiyuki/reels/) - // ❌ as of 2024-feb-20 + // ❌ as of 2024-aug-21 // these do not load enough post metadata (e.g. author or caption), so too different from other items // to parse // - suggested posts on user feed @@ -155,11 +146,11 @@ zeeschuimer.register_module( if (possible_item_lists.includes(property) || property === "items") { // - posts on explore pages for specific tags (e.g. https://www.instagram.com/explore/tags/blessed/) // - posts on explore pages for locations (e.g. https://www.instagram.com/explore/locations/238875664/switzerland/) - // ✔️ confirmed working as of 2024-feb-20 + // ✔️ confirmed working as of 2024-aug-21 // - posts on explore pages for sounds (e.g. https://www.instagram.com/reels/audio/290315579897542/) - // ✔️ confirmed working as of 2024-feb-20 + // ✔️ confirmed working as of 2024-aug-21 // - posts when opened by clicking on them - // ✔️ confirmed working as of 2024-feb-20 + // ✔️ confirmed working as of 2024-aug-21 let items; if (property === "medias" || property === "fill_items") { items = obj[property].map(media => media["media"]); @@ -167,7 +158,7 @@ zeeschuimer.register_module( items = obj[property].map(media => media["media_or_ad"]); } else if (property === "items" && obj[property].length === obj[property].filter(i => Object.getOwnPropertyNames(i).join('') === 'media').length) { // - posts on explore pages for sounds (e.g. https://www.instagram.com/reels/audio/290315579897542/) - // ✔️ confirmed working as of 2024-feb-20 + // ✔️ confirmed working as of 2024-aug-21 if(property === 'items' && 'design' in obj) { // this is loaded, but never actually displayed... // seems to be a preview of reels for a given tag, but again, not @@ -211,7 +202,7 @@ zeeschuimer.register_module( }).map(node => node["media"])); } else if (["xdt_api__v1__feed__user_timeline_graphql_connection"].includes(property)) { // - posts on user pages (e.g. https://www.instagram.com/ogata.yoshiyuki/) - // ✔️ confirmed working as of 2024-feb-20 + // ✔️ confirmed working as of 2024-aug-21 edges.push(...obj[property]["edges"].filter(edge => "node" in edge).map(edge => edge["node"]).filter(node => { return node !== null && "id" in node @@ -236,6 +227,7 @@ zeeschuimer.register_module( } } + // console.log('got ' + edges.length + ' via ' + source_url) return edges; } ); \ No newline at end of file diff --git a/modules/linkedin.js b/modules/linkedin.js index cf387ad..58ff675 100644 --- a/modules/linkedin.js +++ b/modules/linkedin.js @@ -10,45 +10,75 @@ zeeschuimer.register_module( // objects embedded in HTML are identified by this bit of text let items = []; let data = []; + let data_type = ""; try { // when dealing with JSON, just parse that JSON and process it data.push(JSON.parse(response)); + data_type = "JSON"; } catch (e) { // data is not JSON, so it's probably HTML // HTML has data embedded in tags // store these for processing - const code_regex = RegExp(/(.[^<]+)<\/code>/g); + const code_regex = RegExp(//g); + for (const code_bit of response.matchAll(code_regex)) { + // console.log("Code; checking for JSON"); try { // use he to decode from HTML entities (the way the data is embedded) data.push(JSON.parse(he.decode(code_bit))); + data_type = "HTML"; + // console.log("Found JSON in code block"); } catch (e) { } } } const eligible_list_types = ["feedDashMainFeedByMainFeed", "feedDashInterestUpdatesByInterestFeedByKeywords", "feedDashProfileUpdatesByMemberShareFeed"] + const uninterseting_list_types = ["*dashMySettings", "messagingDashMessagingSettings", "*searchDashSearchHome", "searchDashTypeaheadByGlobalTypeahead", "messagingDashAffiliatedMailboxesAll", "legoDashPageContentsByPageKeyAndSlotId", "searchDashFilterClustersByFilters"] for (const data_bit of data) { // now we have the data, try to parse it // is this object post data? let item_index = []; + let location = ""; if ("data" in data_bit && "included" in data_bit) { // items may be referenced as 'results' for search result pages or 'elements' for the feed let item_key = ''; if ("*elements" in data_bit["data"]) { item_index = data_bit["data"]["*elements"]; + location = "data.*elements"; } else if ("results" in data_bit["data"]) { item_index = data_bit["data"]["results"]; + location = "data.results"; } else if ("data" in data_bit["data"] && Object.keys(data_bit["data"]["data"]).filter(k => eligible_list_types.includes(k))) { for(const k of eligible_list_types) { if(k in data_bit["data"]["data"]) { item_index = data_bit["data"]["data"][k]["*elements"]; + location = `data.data.${k}.*elements`; break; } } + if (location === "") { + // Found nothing eligible + let uninteresting = false; + for (const k of uninterseting_list_types) { + if(k in data_bit["data"]["data"]) { + uninteresting = true; + } + } + + if (!uninteresting) { + // Possibly interesting data + // console.log("No items found in data_bit:"); + // console.log(data_bit); + } + continue; + } } else { - return []; + // console.log("No items found in data:"); + // console.log(data_bit); + continue; } + //console.log(`Searching items at ${location} from ${data_type} data on ${source_platform_url}`); // there is a list of objects, each with an ID // and a separate list of items to display, a list of those IDs @@ -61,6 +91,7 @@ zeeschuimer.register_module( // then we get the objects with the IDs in the item list // and that is our result set! + let num_items = 0; for (let object_ref in item_index) { let result = item_index[object_ref]; @@ -72,6 +103,7 @@ zeeschuimer.register_module( // we are (for now?) only interested in posts, which are identified in this way if (result.indexOf('urn:li:fs_updateV2:(urn:li:activity:') !== 0 && result.indexOf('urn:li:fsd_update:(urn:li:activity:') !== 0) { + // console.log(`Skipping non-post item ${result}`); continue; } @@ -79,7 +111,9 @@ zeeschuimer.register_module( result_object["id"] = result; items.push(result_object); + num_items++; } + console.log(`Found ${num_items} items in ${location} from ${data_type} data on ${source_platform_url}`); } } From 4c5847aafff3eecf642b34e136c540d532a1d0b9 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Mon, 23 Sep 2024 13:50:59 +0200 Subject: [PATCH 5/6] Fix wait on captcha in tests --- tests/test.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/test.py b/tests/test.py index a176967..71b8a7d 100644 --- a/tests/test.py +++ b/tests/test.py @@ -182,6 +182,7 @@ captcha_element = driver.find_element(By.CSS_SELECTOR, settings.get("captcha-selector")) if captcha_element.is_displayed(): print(colored(f"{indent} :: [⚠️] Captcha detected... Press Enter after you have solved the captcha", "yellow")) + input() except selenium_exceptions.NoSuchElementException: pass @@ -197,7 +198,11 @@ # scroll and check if more items are loaded driver.switch_to.window(handles[1]) driver.execute_script("window.scrollBy(0, document.querySelector('html').scrollHeight);") - time.sleep(settings.get("wait", 5)) + time.sleep(0.5) + driver.execute_script("window.scrollBy(0, document.querySelector('html').scrollHeight);") + time.sleep(0.5) + driver.execute_script("window.scrollBy(0, document.querySelector('html').scrollHeight);") + time.sleep(settings.get("wait", 5) - 1) driver.switch_to.window(handles[0]) num_after_scroll = int(re.sub("[^0-9]", "", driver.execute_script( From 6992475f28148992eb6a001d1ac17a298b26bbb6 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Mon, 23 Sep 2024 15:19:51 +0200 Subject: [PATCH 6/6] Squashed commit of the following: commit 9b6e26310f83e8f7c3190f6e51b577ee7fcd9371 Author: Stijn Peeters Date: Mon Sep 23 15:18:20 2024 +0200 Fix wait on captcha in tests commit 0ad0fb6ef8fa7b61f188cec41b623e3d5e971c35 Author: Stijn Peeters Date: Mon Sep 23 15:17:23 2024 +0200 Remove debug log from Instagram module commit 0400b1af4390abe0cbc253d340d350455fac15b9 Author: Stijn Peeters Date: Mon Sep 23 15:17:10 2024 +0200 Fix LinkedIn search results commit 03fb948fe5ab410c3edccb49e20491df644cac03 Author: Stijn Peeters Date: Wed Aug 21 14:12:51 2024 +0200 Bump version commit 1fca90410f1060c7797c043e22978b49871a8779 Author: Stijn Peeters Date: Wed Aug 21 14:12:01 2024 +0200 Fix Instagram module commit 2d2337cdf10f1fae3219671bfbeda006c996fa2c Author: Stijn Peeters Date: Wed Aug 21 14:11:54 2024 +0200 Fix 9gag module commit 10c4396dd33b36e7cf4ea69b59b23973ce356c37 Author: Dale Wahl Date: Wed Jun 12 12:03:52 2024 +0200 linkedin: fix regex; check additional data_bits (if present) --- modules/linkedin.js | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/modules/linkedin.js b/modules/linkedin.js index 58ff675..320dff9 100644 --- a/modules/linkedin.js +++ b/modules/linkedin.js @@ -12,20 +12,24 @@ zeeschuimer.register_module( let data = []; let data_type = ""; try { + if(response.indexOf('') >= 0) { + throw new Error(); + } // when dealing with JSON, just parse that JSON and process it - data.push(JSON.parse(response)); + const json_data = JSON.parse(response); + data.push(json_data); data_type = "JSON"; } catch (e) { // data is not JSON, so it's probably HTML // HTML has data embedded in tags // store these for processing - const code_regex = RegExp(//g); + const code_regex = RegExp(/([^<]+)<\/code>/g); for (const code_bit of response.matchAll(code_regex)) { // console.log("Code; checking for JSON"); try { // use he to decode from HTML entities (the way the data is embedded) - data.push(JSON.parse(he.decode(code_bit))); + data.push(JSON.parse(he.decode(code_bit[1].trim()))); data_type = "HTML"; // console.log("Found JSON in code block"); } catch (e) { @@ -33,8 +37,8 @@ zeeschuimer.register_module( } } - const eligible_list_types = ["feedDashMainFeedByMainFeed", "feedDashInterestUpdatesByInterestFeedByKeywords", "feedDashProfileUpdatesByMemberShareFeed"] - const uninterseting_list_types = ["*dashMySettings", "messagingDashMessagingSettings", "*searchDashSearchHome", "searchDashTypeaheadByGlobalTypeahead", "messagingDashAffiliatedMailboxesAll", "legoDashPageContentsByPageKeyAndSlotId", "searchDashFilterClustersByFilters"] + const eligible_list_types = ["feedDashMainFeedByMainFeed", "feedDashInterestUpdatesByInterestFeedByKeywords", "feedDashProfileUpdatesByMemberShareFeed", "searchDashClustersByAll"] + const uninteresting_list_types = ["*dashMySettings", "messagingDashMessagingSettings", "*searchDashSearchHome", "searchDashTypeaheadByGlobalTypeahead", "messagingDashAffiliatedMailboxesAll", "legoDashPageContentsByPageKeyAndSlotId", "searchDashFilterClustersByFilters"] for (const data_bit of data) { // now we have the data, try to parse it // is this object post data? @@ -52,15 +56,23 @@ zeeschuimer.register_module( } else if ("data" in data_bit["data"] && Object.keys(data_bit["data"]["data"]).filter(k => eligible_list_types.includes(k))) { for(const k of eligible_list_types) { if(k in data_bit["data"]["data"]) { - item_index = data_bit["data"]["data"][k]["*elements"]; - location = `data.data.${k}.*elements`; + const elements_key = (data_bit["data"]["data"]['*elements'] !== undefined) ? '*elements' : 'elements'; + item_index = data_bit["data"]["data"][k][elements_key]; + location = `data.data.${k}.${elements_key}`; + + if (typeof (item_index) !== 'string' && item_index.length > 0 && item_index[0]['items'] !== undefined) { + // embedded results on search page + item_index = item_index[0]['items'].map(item => { + return item['item']['searchFeedUpdate']['*update']; + }); + } break; } } if (location === "") { // Found nothing eligible let uninteresting = false; - for (const k of uninterseting_list_types) { + for (const k of uninteresting_list_types) { if(k in data_bit["data"]["data"]) { uninteresting = true; }