Skip to content

Commit

Permalink
Merge pull request #42 from MhmdSalah/code-optimize
Browse files — browse the repository at this point in the history
code optimizations
  • Loading branch information
HermanFassett authored Nov 2, 2020
2 parents 15f5078 + 7f5620c commit d426ace
Show file tree
Hide file tree
Showing 3 changed files with 6 additions and 185 deletions.
137 changes: 0 additions & 137 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 2 additions & 3 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
{
"name": "youtube-scrape",
"version": "0.1.2",
"version": "0.1.3",
"description": "Scrape YouTube searches",
"main": "server.js",
"author": "Herman Fassett",
"dependencies": {
"express": "latest",
"request": "latest",
"cheerio": "latest"
"request": "latest"
}
}
49 changes: 4 additions & 45 deletions scraper.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
const cheerio = require('cheerio');
const request = require('request');

async function youtube(query, page) {
Expand All @@ -10,29 +9,22 @@ async function youtube(query, page) {
request(url, (error, response, html) => {
// Check for errors
if (!error && response.statusCode === 200) {
const $ = cheerio.load(html);
let json = { results: [], version: require('./package.json').version };

// First attempt to parse old youtube search result style
$(".yt-lockup-dismissable").each((index, vid) => {
json["parser"] = "html_format";
json.results.push(parseOldFormat($, vid));
});

// If that fails, we have to parse new format from json data in html script tag
if (!json.results.length) {
json["parser"] = "json_format";

// Get script json data from html to parse
let data, sectionLists = [];
try {
let match = html.match(/ytInitialData"[^{]*(.*);\s*window\["ytInitialPlayerResponse"\]/s);
let match = html.match(/ytInitialData[^{]*(.*);\s*\/\/ scraper_data_end/s);
if (match && match.length > 1) {
json["parser"] += ".original";
json["parser"] += ".scraper_data";
}
else {
json["parser"] += ".scraper_data";
match = html.match(/ytInitialData[^{]*(.*);\s*\/\/ scraper_data_end/s);
json["parser"] += ".original";
match = html.match(/ytInitialData"[^{]*(.*);\s*window\["ytInitialPlayerResponse"\]/s);
}
data = JSON.parse(match[1]);
json["estimatedResults"] = data.estimatedResults || "0";
Expand Down Expand Up @@ -81,39 +73,6 @@ async function youtube(query, page) {
});
}

/**
 * Build one search-result entry from a legacy (HTML-rendered) YouTube result element.
 *
 * All values are read from descendants of `vid` (plus its parent for the id) using
 * the legacy `.yt-lockup-*` class names; nothing is validated, so a missing node
 * yields whatever the cheerio accessor returns for an empty selection.
 *
 * @param {CheerioStatic} $ - The youtube search results page loaded with cheerio
 * @param {CheerioElement} vid - The result element currently being parsed
 * @returns {{video: object, uploader: object}} the video details and the uploader details
 */
function parseOldFormat($, vid) {
    const baseUrl = "https://www.youtube.com";
    const $entry = $(vid);

    // --- Video details ---
    // The first child of the title container carries both the title text and the link.
    const $titleLink = $entry.find(".yt-lockup-title").children().first();
    const $metaItems = $entry.find(".yt-lockup-meta-info li");
    const $thumb = $entry.find(".yt-thumb img");

    // An entry without any video-time text is reported as a playlist.
    const timeText = $entry.find(".video-time").text().trim();

    // Prefer the "thumb" data value; fall back to the plain src attribute.
    // NOTE(review): presumably data-thumb holds a lazily loaded image URL — confirm.
    let thumbnailSrc = $thumb.data("thumb");
    if (!thumbnailSrc) {
        thumbnailSrc = $thumb.attr("src");
    }

    const video = {
        id: $entry.parent().data("context-item-id"),
        title: $titleLink.text(),
        url: `${baseUrl}${$titleLink.attr("href")}`,
        duration: timeText ? timeText : "Playlist",
        snippet: $entry.find(".yt-lockup-description").text(),
        // First meta item is used as the upload date, last one as the view count.
        upload_date: $metaItems.first().text(),
        thumbnail_src: thumbnailSrc,
        views: $metaItems.last().text()
    };

    // --- Uploader details ---
    const $channel = $entry.find(".yt-lockup-byline");
    const uploader = {
        username: $channel.text(),
        url: `${baseUrl}${$channel.find("a").attr("href")}`,
        // Verified when the byline contains an element titled "Verified".
        verified: Boolean($channel.find("[title=Verified]").length)
    };

    return { video, uploader };
}

/**
* Parse a channelRenderer object from youtube search results
* @param {object} renderer - The channel renderer
Expand Down

0 comments on commit d426ace

Please sign in to comment.