Skip to content

Commit 29aff30

Browse files
committed
Refactor analyzer, get most used keywords
1 parent 334f03b commit 29aff30

File tree

5 files changed

+210
-118
lines changed

5 files changed

+210
-118
lines changed

src/analyze.js

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
'use strict';
2+
3+
import * as fsa from 'async-file';
4+
5+
import {
6+
readLines,
7+
handleIndexSourceErrors,
8+
readCached,
9+
figureOutTruncateAndSelector,
10+
cheerioLoad
11+
} from './common';
12+
13+
import {stopWords} from './stopwords';
14+
15+
const URL_LIST = 'archive/index.csv';
16+
const OVERWRITE = true;
17+
18+
/**
 * Strip punctuation and underscores from a string.
 *
 * ASCII word characters, whitespace and the Latin-1 accented letters
 * (À-Ö, Ø-ö, ø-ÿ) are kept. `\w` alone is ASCII-only, so without the extra
 * ranges a word such as "café" would be mangled into "caf" before the word
 * filter in `extractWords` (which accepts those same ranges) sees it.
 *
 * @param {string} input - Text to clean.
 * @returns {string} `input` with every other character removed.
 */
function removePunctuation(input) {
  return input.replace(/[^\w\s\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF]|_/g, '');
}
21+
22+
/**
 * Count the words that occur more than once in a cached document.
 *
 * The markup is loaded through `cheerioLoad`, the elements matched by the
 * `truncate` selector from `figureOutTruncateAndSelector(source)` are removed,
 * and the remaining text is tokenised.
 *
 * @param {*} recv - Markup handed to `cheerioLoad`.
 * @param {Object} source - Source descriptor handed to
 *   `figureOutTruncateAndSelector`.
 * @returns {Promise<Object>} Prototype-less object mapping each lower-cased
 *   word to its number of occurrences. Words seen only once, stop words and
 *   tokens that are not purely alphabetic are left out.
 */
async function extractWords(recv, source) {
  const wordPattern = /^[a-zA-ZÀ-ÖØ-öø-ÿ]+$/;
  const shard = await cheerioLoad(recv);
  const {truncate} = figureOutTruncateAndSelector(source);
  shard(truncate).remove();

  // Split on any run of whitespace: splitting on a single space would fuse
  // words separated by newlines or tabs into one token that then fails the
  // word pattern and silently disappears from the counts.
  const tokens = shard.text().split(/\s+/);

  const words = Object.create(null);
  const seenOnce = new Set();
  for (const token of tokens) {
    const w = removePunctuation(token).toLowerCase();
    if (!wordPattern.test(w) || stopWords.has(w)) {
      continue;
    }
    if (!seenOnce.has(w)) {
      // First sighting: remember it, but only record a count on a repeat.
      seenOnce.add(w);
      continue;
    }
    // The second sighting starts the tally at 2, later ones increment it.
    words[w] = (words[w] || 1) + 1;
  }

  return words;
}
47+
48+
/**
 * Build the analysis descriptor for one source.
 *
 * Paths are derived from `source.slug` under `archive/`. When the cached
 * HTML file exists, its repeated-word counts are stored in `data.words`;
 * otherwise `data` stays empty.
 *
 * @param {Object} source - Source descriptor; `slug` is read here and the
 *   whole object is forwarded to `extractWords`.
 * @returns {Promise<{file: string, data: Object}>} Target file name of the
 *   analysis plus the data gathered so far.
 */
async function read(source) {
  const directory = `archive/${source.slug}`;
  const cachePath = `${directory}/cache.html`;
  const descriptor = {file: `${directory}/analyze.json`, data: {}};

  const hasCache = await fsa.exists(cachePath);
  if (hasCache !== true) {
    return descriptor;
  }

  const html = await readCached(cachePath);
  descriptor.data.words = await extractWords(html, source);

  return descriptor;
}
62+
63+
/**
 * Turn a word-count object into a list ordered by descending count.
 *
 * @param {Object} [subject] - Object whose enumerable keys map to numbers.
 *   A missing value produces an empty list.
 * @returns {Array<Array>} Pairs in the form
 *   [ [ key1, val1 ], [ key2, val2 ], ... ], highest value first.
 */
function sort(subject) {
  const pairs = [];
  for (const name in subject) {
    pairs.push([name, subject[name]]);
  }

  // Most occurrences first, fewest last.
  pairs.sort(([, left], [, right]) => right - left);

  return pairs;
}
75+
76+
/**
 * Pick the most used words of a descriptor and store them as its keywords.
 *
 * At most `maxKeywords` of the top-ranked words are considered, and a word
 * only qualifies when it has been used more than 3 times. The descriptor is
 * mutated in place: `recv.data.keywords` is (re)assigned.
 *
 * @param {{file: string, data: Object}} recv - Descriptor as produced by
 *   `read`; `recv.data.words` may be undefined, in which case no keyword is
 *   selected.
 * @returns {Promise<Object>} The same `recv` object, with `data.keywords` set
 *   to a prototype-less object mapping word to occurrence count.
 */
async function analyze(recv) {
  const maxKeywords = 10;
  const keywords = Object.create(null);
  const ranked = sort(recv.data.words);

  // Only look at the first `maxKeywords` entries. The previous `iter <= max`
  // comparison let an eleventh word through and kept walking the whole list.
  for (const [word, used] of ranked.slice(0, maxKeywords)) {
    if (used > 3) {
      keywords[word] = used;
    }
  }

  recv.data.keywords = keywords;

  return recv;
}
95+
96+
/**
 * Persist a descriptor's data as JSON.
 *
 * The file is written when it does not exist yet, or when it exists and
 * `boolOverwrite` is truthy; otherwise nothing is written.
 *
 * @param {{file: string, data: Object}} descriptor - Destination file name
 *   and the data to serialise (`data` defaults to an empty object).
 * @param {boolean} [boolOverwrite=false] - Whether an existing file may be
 *   replaced.
 * @returns {Promise<{file: string, data: Object}>} The file name and data
 *   that were passed in.
 */
async function write({file, data = {}}, boolOverwrite = false) {
  const destExists = await fsa.exists(file);
  const isMissing = destExists === false;
  const mayReplace = destExists === true && boolOverwrite;

  if (isMissing || mayReplace) {
    const serialized = JSON.stringify(data);
    await fsa.writeTextFile(file, serialized, 'utf8');
  }

  return {file, data};
}
104+
105+
/**
 * Script entry point: every entry yielded by readLines(URL_LIST) gets its
 * own read -> analyze -> write promise chain, with failures routed to
 * handleIndexSourceErrors. The chains are started one after another and are
 * neither awaited nor joined.
 *
 * TODO: this is somewhat of an anti-pattern. Each step should go through
 * Promise.all(...) instead; needs rework.
 */
for (const source of readLines(URL_LIST)) {
  Promise.resolve(source)
    .then(read)
    .then(analyze)
    .then(descriptor => write(descriptor, OVERWRITE))
    .catch(handleIndexSourceErrors);
}

src/archive.js

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@ const [...urls] = readLines(URL_LIST);
99

1010
/**
1111
* Something is going somewhat as an anti-pattern here.
12-
* Gotta wire generator and async/await TODO
12+
* We want Promise.all(...) at each step, and it's not how
13+
* it is as of now. Needs rework here. TODO
1314
*/
1415
Promise.all(urls)
1516
.then(u => fetcher(u))

src/common.js

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import fs from 'fs';
44
import pathutil from 'path';
5+
import cheerio from 'cheerio';
56
import * as fsa from 'async-file';
67
import lines from 'gen-readlines';
78
import slugifier from './normalizer/slugs';
@@ -75,10 +76,35 @@ function readCachedError(errorObj) {
7576
}
7677
}
7778

79+
// TODO: make it possible to run extractLinks, markdownify, ... in parallel.
/**
 * Load markup with cheerio and deliver the result through a promise.
 *
 * @param {*} recv - Markup handed to `cheerio.load`.
 * @param {Object} [configObj={}] - Options forwarded to `cheerio.load`.
 * @returns {Promise<*>} Resolves with whatever `cheerio.load` returns and
 *   rejects if `cheerio.load` throws.
 */
async function cheerioLoad(recv, configObj = {}) {
  // An async function already wraps its return value (or a thrown error) in
  // a promise, so constructing one by hand with `new Promise` is redundant.
  return cheerio.load(recv, configObj);
}
83+
84+
/**
 * Work out the CSS selectors to use for one row of the source .csv file,
 * where a row looks like
 *   http://example.org/a/b.html;selector;truncate
 *
 * - selector: CSS selector of the main content; an empty value falls back to
 *   the whole document body.
 * - truncate: list of CSS selectors to strip off; `script`, `style` and
 *   `noscript` are always appended to it.
 *
 * @param {{selector: string, truncate: string}} sourceArgument - Parsed row.
 * @returns {{selector: string, truncate: string}} Selectors ready for use.
 */
function figureOutTruncateAndSelector(sourceArgument) {
  const {selector: wantedSelector, truncate: wantedTruncate} = sourceArgument;
  const alwaysStripped = 'script,style,noscript';

  // Grab the whole document body unless we know exactly where the main
  // content lives.
  let selector = 'body';
  if (wantedSelector.length !== 0) {
    selector = `${wantedSelector}`;
  }

  // Patterns we never want as part of the archived article come last,
  // after any source-specific ones.
  const truncate = (wantedTruncate.length === 0)
    ? alwaysStripped
    : `${wantedTruncate},${alwaysStripped}`;

  return {selector, truncate};
}
101+
78102
export {
79103
readCached,
80104
readLines,
81105
coroutine,
82106
parseCsvLine,
83-
handleIndexSourceErrors
107+
handleIndexSourceErrors,
108+
figureOutTruncateAndSelector,
109+
cheerioLoad
84110
};

src/stopwords.js

Lines changed: 5 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)