From a7f9a69725bb5bb9fe0a62b5d4ef92bb390e31c4 Mon Sep 17 00:00:00 2001 From: Mohammed Ashour Date: Mon, 2 Nov 2020 01:39:53 +0200 Subject: [PATCH 1/2] code optimizations --- package-lock.json | 137 ---------------------------------------------- package.json | 3 +- scraper.js | 49 ++--------------- 3 files changed, 5 insertions(+), 184 deletions(-) diff --git a/package-lock.json b/package-lock.json index b399af4..70b1c3e 100644 --- a/package-lock.json +++ b/package-lock.json @@ -4,11 +4,6 @@ "lockfileVersion": 1, "requires": true, "dependencies": { - "@types/node": { - "version": "14.14.6", - "resolved": "https://registry.npmjs.org/@types/node/-/node-14.14.6.tgz", - "integrity": "sha512-6QlRuqsQ/Ox/aJEQWBEJG7A9+u7oSYl3mem/K8IzxXG/kAGbV1YPD9Bg9Zw3vyxC/YP+zONKwy8hGkSt1jxFMw==" - }, "accepts": { "version": "1.3.7", "resolved": "https://registry.npmjs.org/accepts/-/accepts-1.3.7.tgz", @@ -87,11 +82,6 @@ "type-is": "~1.6.17" } }, - "boolbase": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz", - "integrity": "sha1-aN/1++YMUes3cl6p4+0xDcwed24=" - }, "bytes": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.1.0.tgz", @@ -102,19 +92,6 @@ "resolved": "https://registry.npmjs.org/caseless/-/caseless-0.12.0.tgz", "integrity": "sha1-G2gcIf+EAzyCZUMJBolCDRhxUdw=" }, - "cheerio": { - "version": "1.0.0-rc.3", - "resolved": "https://registry.npmjs.org/cheerio/-/cheerio-1.0.0-rc.3.tgz", - "integrity": "sha512-0td5ijfUPuubwLUu0OBoe98gZj8C/AA+RW3v67GPlGOrvxWjZmBXiBCRU+I8VEiNyJzjth40POfHiz2RB3gImA==", - "requires": { - "css-select": "~1.2.0", - "dom-serializer": "~0.1.1", - "entities": "~1.1.1", - "htmlparser2": "^3.9.1", - "lodash": "^4.15.0", - "parse5": "^3.0.1" - } - }, "combined-stream": { "version": "1.0.8", "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz", @@ -158,22 +135,6 @@ "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.2.tgz", "integrity": "sha1-tf1UIgqivFq1eqtxQMlAdUUDwac=" }, - "css-select": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/css-select/-/css-select-1.2.0.tgz", - "integrity": "sha1-KzoRBTnFNV8c2NMUYj6HCxIeyFg=", - "requires": { - "boolbase": "~1.0.0", - "css-what": "2.1", - "domutils": "1.5.1", - "nth-check": "~1.0.1" - } - }, - "css-what": { - "version": "2.1.3", - "resolved": "https://registry.npmjs.org/css-what/-/css-what-2.1.3.tgz", - "integrity": "sha512-a+EPoD+uZiNfh+5fxw2nO9QwFa6nJe2Or35fGY6Ipw1R3R4AGz1d1TEZrCegvw2YTmZ0jXirGYlzxxpYSHwpEg==" - }, "dashdash": { "version": "1.14.1", "resolved": "https://registry.npmjs.org/dashdash/-/dashdash-1.14.1.tgz", @@ -205,37 +166,6 @@ "resolved": "https://registry.npmjs.org/destroy/-/destroy-1.0.4.tgz", "integrity": "sha1-l4hXRCxEdJ5CBmE+N5RiBYJqvYA=" }, - "dom-serializer": { - "version": "0.1.1", - "resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-0.1.1.tgz", - "integrity": "sha512-l0IU0pPzLWSHBcieZbpOKgkIn3ts3vAh7ZuFyXNwJxJXk/c4Gwj9xaTJwIDVQCXawWD0qb3IzMGH5rglQaO0XA==", - "requires": { - "domelementtype": "^1.3.0", - "entities": "^1.1.1" - } - }, - "domelementtype": { - "version": "1.3.1", - "resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-1.3.1.tgz", - "integrity": "sha512-BSKB+TSpMpFI/HOxCNr1O8aMOTZ8hT3pM3GQ0w/mWRmkhEDSFJkkyzz4XQsBV44BChwGkrDfMyjVD0eA2aFV3w==" - }, - "domhandler": { - "version": "2.4.2", - "resolved": "https://registry.npmjs.org/domhandler/-/domhandler-2.4.2.tgz", - "integrity": "sha512-JiK04h0Ht5u/80fdLMCEmV4zkNh2BcoMFBmZ/91WtYZ8qVXSKjiw7fXMgFPnHcSZgOo3XdinHvmnDUeMf5R4wA==", - "requires": { - "domelementtype": "1" - } - }, - "domutils": { - "version": "1.5.1", - "resolved": "https://registry.npmjs.org/domutils/-/domutils-1.5.1.tgz", - "integrity": "sha1-3NhIiib1Y9YQeeSMn3t+Mjc2gs8=", - "requires": { - "dom-serializer": "0", - "domelementtype": "1" - } - }, "ecc-jsbn": { "version": "0.1.2", "resolved": "https://registry.npmjs.org/ecc-jsbn/-/ecc-jsbn-0.1.2.tgz", @@ -255,11 +185,6 @@ "resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-1.0.2.tgz", "integrity": "sha1-rT/0yG7C0CkyL1oCw6mmBslbP1k=" }, - "entities": { - "version": "1.1.2", - "resolved": "https://registry.npmjs.org/entities/-/entities-1.1.2.tgz", - "integrity": "sha512-f2LZMYl1Fzu7YSBKg+RoROelpOaNrcGmE9AZubeDfrCEia483oW4MI4VyFd5VNHIgQ/7qm1I0wUHK1eJnn2y2w==" - }, "escape-html": { "version": "1.0.3", "resolved": "https://registry.npmjs.org/escape-html/-/escape-html-1.0.3.tgz", @@ -395,19 +320,6 @@ "har-schema": "^2.0.0" } }, - "htmlparser2": { - "version": "3.10.1", - "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-3.10.1.tgz", - "integrity": "sha512-IgieNijUMbkDovyoKObU1DUhm1iwNYE/fuifEoEHfd1oZKZDaONBSkal7Y01shxsM49R4XaMdGez3WnF9UfiCQ==", - "requires": { - "domelementtype": "^1.3.1", - "domhandler": "^2.3.0", - "domutils": "^1.5.1", - "entities": "^1.1.1", - "inherits": "^2.0.1", - "readable-stream": "^3.1.1" - } - }, "http-errors": { "version": "1.7.2", "resolved": "https://registry.npmjs.org/http-errors/-/http-errors-1.7.2.tgz", @@ -445,11 +357,6 @@ "safer-buffer": ">= 2.1.2 < 3" } }, - "inherits": { - "version": "2.0.4", - "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz", - "integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==" - }, "ipaddr.js": { "version": "1.9.1", "resolved": "https://registry.npmjs.org/ipaddr.js/-/ipaddr.js-1.9.1.tgz", @@ -496,11 +403,6 @@ "verror": "1.10.0" } }, - "lodash": { - "version": "4.17.20", - "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.20.tgz", - "integrity": "sha512-PlhdFcillOINfeV7Ni6oF1TAEayyZBoZ8bcshTHqOYJYlrqzRK5hagpagky5o4HfCzzd1TRkXPMFq6cKk9rGmA==" - }, "media-typer": { "version": "0.3.0", "resolved": "https://registry.npmjs.org/media-typer/-/media-typer-0.3.0.tgz", @@ -544,14 +446,6 @@ "resolved": "https://registry.npmjs.org/negotiator/-/negotiator-0.6.2.tgz", "integrity": "sha512-hZXc7K2e+PgeI1eDBe/10Ard4ekbfrrqG8Ep+8Jmf4JID2bNg7NvCPOZN+kfF574pFQI7mum2AUqDidoKqcTOw==" }, - "nth-check": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/nth-check/-/nth-check-1.0.2.tgz", - "integrity": "sha512-WeBOdju8SnzPN5vTUJYxYUxLeXpCaVP5i5e0LF8fg7WORF2Wd7wFX/pk0tYZk7s8T+J7VLy0Da6J1+wCT0AtHg==", - "requires": { - "boolbase": "~1.0.0" - } - }, "oauth-sign": { "version": "0.9.0", "resolved": "https://registry.npmjs.org/oauth-sign/-/oauth-sign-0.9.0.tgz", @@ -565,14 +459,6 @@ "ee-first": "1.1.1" } }, - "parse5": { - "version": "3.0.3", - "resolved": "https://registry.npmjs.org/parse5/-/parse5-3.0.3.tgz", - "integrity": "sha512-rgO9Zg5LLLkfJF9E6CCmXlSE4UVceloys8JrFqCcHloC3usd/kJCyPDwH2SOlzix2j3xaP9sUX3e8+kvkuleAA==", - "requires": { - "@types/node": "*" - } - }, "parseurl": { "version": "1.3.3", "resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.3.tgz", @@ -628,16 +514,6 @@ "unpipe": "1.0.0" } }, - "readable-stream": { - "version": "3.6.0", - "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.6.0.tgz", - "integrity": "sha512-BViHy7LKeTz4oNnkcLJ+lVSL6vpiFeX6/d3oSH8zCW7UxP2onchk+vTGB143xuFjHS3deTgkKoXXymXqymiIdA==", - "requires": { - "inherits": "^2.0.3", - "string_decoder": "^1.1.1", - "util-deprecate": "^1.0.1" - } - }, "request": { "version": "2.88.2", "resolved": "https://registry.npmjs.org/request/-/request-2.88.2.tgz", @@ -746,14 +622,6 @@ "resolved": "https://registry.npmjs.org/statuses/-/statuses-1.5.0.tgz", "integrity": "sha1-Fhx9rBd2Wf2YEfQ3cfqZOBR4Yow=" }, - "string_decoder": { - "version": "1.3.0", - "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.3.0.tgz", - "integrity": "sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==", - "requires": { - "safe-buffer": "~5.2.0" - } - }, "toidentifier": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/toidentifier/-/toidentifier-1.0.0.tgz", @@ -803,11 +671,6 @@ "punycode": "^2.1.0" } }, - "util-deprecate": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz", - "integrity": "sha1-RQ1Nyfpw3nMnYvvS1KKJgUGaDM8=" - }, "utils-merge": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/utils-merge/-/utils-merge-1.0.1.tgz", diff --git a/package.json b/package.json index e15deac..3940748 100644 --- a/package.json +++ b/package.json @@ -6,7 +6,6 @@ "author": "Herman Fassett", "dependencies": { "express": "latest", - "request": "latest", - "cheerio": "latest" + "request": "latest" } } diff --git a/scraper.js b/scraper.js index 340b952..9d10f7a 100644 --- a/scraper.js +++ b/scraper.js @@ -1,4 +1,3 @@ -const cheerio = require('cheerio'); const request = require('request'); async function youtube(query, page) { @@ -10,15 +9,8 @@ async function youtube(query, page) { request(url, (error, response, html) => { // Check for errors if (!error && response.statusCode === 200) { - const $ = cheerio.load(html); let json = { results: [], version: require('./package.json').version }; - // First attempt to parse old youtube search result style - $(".yt-lockup-dismissable").each((index, vid) => { - json["parser"] = "html_format"; - json.results.push(parseOldFormat($, vid)); - }); - // If that fails, we have to parse new format from json data in html script tag if (!json.results.length) { json["parser"] = "json_format"; @@ -26,13 +18,13 @@ async function youtube(query, page) { // Get script json data from html to parse let data, sectionLists = []; try { - let match = html.match(/ytInitialData"[^{]*(.*);\s*window\["ytInitialPlayerResponse"\]/s); + let match = html.match(/ytInitialData[^{]*(.*);\s*\/\/ scraper_data_end/s); if (match && match.length > 1) { - json["parser"] += ".original"; + json["parser"] += ".scraper_data"; } else { - json["parser"] += ".scraper_data"; - match = html.match(/ytInitialData[^{]*(.*);\s*\/\/ scraper_data_end/s); + json["parser"] += ".original"; + match = html.match(/ytInitialData"[^{]*(.*);\s*window\["ytInitialPlayerResponse"\]/s); } data = JSON.parse(match[1]); json["estimatedResults"] = data.estimatedResults || "0"; @@ -81,39 +73,6 @@ async function youtube(query, page) { }); } -/** - * Parse youtube search results from dom elements - * @param {CheerioStatic} $ - The youtube search results loaded with cheerio - * @param {CheerioElement} vid - The current video being parsed - * @returns object with data to return for this video - */ -function parseOldFormat($, vid) { - // Get video details - let $metainfo = $(vid).find(".yt-lockup-meta-info li"); - let $thumbnail = $(vid).find(".yt-thumb img"); - let video = { - "id": $(vid).parent().data("context-item-id"), - "title": $(vid).find(".yt-lockup-title").children().first().text(), - "url": `https://www.youtube.com${$(vid).find(".yt-lockup-title").children().first().attr("href")}`, - "duration": $(vid).find(".video-time").text().trim() || "Playlist", - "snippet": $(vid).find(".yt-lockup-description").text(), - "upload_date": $metainfo.first().text(), - "thumbnail_src": $thumbnail.data("thumb") || $thumbnail.attr("src"), - "views": $metainfo.last().text() - }; - - // Get user details - let $byline = $(vid).find(".yt-lockup-byline"); - let uploader = { - "username": $byline.text(), - "url": `https://www.youtube.com${$byline.find("a").attr("href")}`, - "verified": !!$byline.find("[title=Verified]").length - }; - - // Return json - return { video: video, uploader: uploader }; -} - /** * Parse a channelRenderer object from youtube search results * @param {object} renderer - The channel renderer From 7f5620c4f110087acc55c942755d33c5fc9e122e Mon Sep 17 00:00:00 2001 From: Mohammed Ashour Date: Mon, 2 Nov 2020 09:19:36 +0200 Subject: [PATCH 2/2] version bump --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index 3940748..c1b56c7 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "youtube-scrape", - "version": "0.1.2", + "version": "0.1.3", "description": "Scrape YouTube searches", "main": "server.js", "author": "Herman Fassett",