From ba9c132ce3f94a7a4f244e87210b74a8eda786ed Mon Sep 17 00:00:00 2001 From: Vadim Kovalenko Date: Tue, 17 Oct 2023 12:09:59 +0300 Subject: [PATCH 01/10] Deprecate setBaseUrl() and getArticleUrl() in favor of downloader.getArticleUrl() --- src/Downloader.ts | 60 +++---------- src/MediaWiki.ts | 6 +- src/mwoffliner.lib.ts | 1 - src/util/saveArticles.ts | 9 +- test/unit/downloader.test.ts | 14 +-- test/unit/renderers/renderer.builder.test.ts | 2 +- test/unit/saveArticles.test.ts | 85 ++++++++++++++++--- .../unit/treatments/article.treatment.test.ts | 5 +- test/unit/urlRewriting.test.ts | 1 - test/util.ts | 5 ++ 10 files changed, 106 insertions(+), 82 deletions(-) diff --git a/src/Downloader.ts b/src/Downloader.ts index d470d66b..e6bde059 100644 --- a/src/Downloader.ts +++ b/src/Downloader.ts @@ -21,7 +21,6 @@ import S3 from './S3.js' import * as logger from './Logger.js' import MediaWiki, { QueryOpts } from './MediaWiki.js' import ApiURLDirector from './util/builders/url/api.director.js' -import basicURLDirector from './util/builders/url/basic.director.js' import urlHelper from './util/url.helper.js' const imageminOptions = new Map() @@ -170,52 +169,21 @@ class Downloader { } } - public async setBaseUrls(forceRender = null) { - if (!forceRender) { - //* Objects order in array matters! - this.baseUrl = basicURLDirector.buildDownloaderBaseUrl([ - { condition: await MediaWiki.hasWikimediaMobileApi(), value: MediaWiki.WikimediaMobileApiUrl.href }, - { condition: await MediaWiki.hasWikimediaDesktopApi(), value: MediaWiki.WikimediaDesktopApiUrl.href }, - { condition: await MediaWiki.hasVisualEditorApi(), value: MediaWiki.visualEditorApiUrl.href }, - ]) - - //* Objects order in array matters! - this.baseUrlForMainPage = basicURLDirector.buildDownloaderBaseUrl([ - { condition: await MediaWiki.hasWikimediaDesktopApi(), value: MediaWiki.WikimediaDesktopApiUrl.href }, - { condition: await MediaWiki.hasVisualEditorApi(), value: MediaWiki.visualEditorApiUrl.href }, - { condition: await MediaWiki.hasWikimediaMobileApi(), value: MediaWiki.WikimediaMobileApiUrl.href }, - ]) - } else { - switch (forceRender) { - case 'WikimediaDesktop': - if (MediaWiki.hasWikimediaDesktopApi()) { - this.baseUrl = MediaWiki.WikimediaDesktopApiUrl.href - this.baseUrlForMainPage = MediaWiki.WikimediaDesktopApiUrl.href - break - } - break - case 'VisualEditor': - if (MediaWiki.hasVisualEditorApi()) { - this.baseUrl = MediaWiki.visualEditorApiUrl.href - this.baseUrlForMainPage = MediaWiki.visualEditorApiUrl.href - break - } - break - case 'WikimediaMobile': - if (MediaWiki.hasWikimediaMobileApi()) { - this.baseUrl = MediaWiki.WikimediaMobileApiUrl.href - this.baseUrlForMainPage = MediaWiki.WikimediaMobileApiUrl.href - break - } - break - default: - throw new Error('Unable to find specific API end-point to retrieve article HTML') - } + public getArticleUrl(renderer, articleId: string): string { + let articleUrl: string + // Renders and URL builders are independent so there should be a switch/case scenario here + switch (renderer.constructor.name) { + case 'WikimediaDesktopRenderer': + articleUrl = MediaWiki.wikimediaDesktopUrlDirector.buildArticleURL(articleId) + break + case 'VisualEditorRenderer': + articleUrl = MediaWiki.visualEditorURLDirector.buildArticleURL(articleId) + break + case 'WikimediaMobileRenderer': + articleUrl = MediaWiki.wikimediaMobileUrlDirector.buildArticleURL(articleId) + break } - logger.log('Base Url: ', this.baseUrl) - logger.log('Base Url for Main Page: ', this.baseUrlForMainPage) - - if (!this.baseUrl || !this.baseUrlForMainPage) throw new Error('Unable to find appropriate API end-point to retrieve article HTML') + return articleUrl } public removeEtagWeakPrefix(etag: string): string { diff --git a/src/MediaWiki.ts b/src/MediaWiki.ts index 36684b5c..6cd9c768 100644 --- a/src/MediaWiki.ts +++ b/src/MediaWiki.ts @@ -52,9 +52,9 @@ class MediaWiki { #domain: string private apiUrlDirector: ApiURLDirector private baseUrlDirector: BaseURLDirector - private wikimediaDesktopUrlDirector: WikimediaDesktopURLDirector - private wikimediaMobileUrlDirector: WikimediaMobileURLDirector - private visualEditorURLDirector: VisualEditorURLDirector + public wikimediaDesktopUrlDirector: WikimediaDesktopURLDirector + public wikimediaMobileUrlDirector: WikimediaMobileURLDirector + public visualEditorURLDirector: VisualEditorURLDirector public visualEditorApiUrl: URL public actionApiUrl: URL diff --git a/src/mwoffliner.lib.ts b/src/mwoffliner.lib.ts index b0f8a9de..323485bb 100644 --- a/src/mwoffliner.lib.ts +++ b/src/mwoffliner.lib.ts @@ -214,7 +214,6 @@ async function execute(argv: any) { await MediaWiki.hasWikimediaDesktopApi() const hasWikimediaMobileApi = await MediaWiki.hasWikimediaMobileApi() await MediaWiki.hasVisualEditorApi() - await downloader.setBaseUrls(forceRender) RedisStore.setOptions(argv.redis || config.defaults.redisPath) await RedisStore.connect() diff --git a/src/util/saveArticles.ts b/src/util/saveArticles.ts index 847ffae8..f715a526 100644 --- a/src/util/saveArticles.ts +++ b/src/util/saveArticles.ts @@ -131,9 +131,9 @@ async function getAllArticlesToKeep(downloader: Downloader, articleDetailXId: RK const _moduleDependencies = await downloader.getModuleDependencies(articleDetail.title) let rets: any try { - const articleUrl = getArticleUrl(downloader, dump, articleId) const isMainPage = dump.isMainPage(articleId) const renderer = isMainPage ? mainPageRenderer : articlesRenderer + const articleUrl = downloader.getArticleUrl(renderer, articleId) rets = await downloader.getArticle(downloader.webp, _moduleDependencies, articleId, articleDetailXId, renderer, articleUrl, dump, articleDetail, isMainPage) for (const { articleId, html } of rets) { @@ -224,10 +224,6 @@ async function saveArticle( } } -export function getArticleUrl(downloader: Downloader, dump: Dump, articleId: string): string { - return `${dump.isMainPage(articleId) ? downloader.baseUrlForMainPage : downloader.baseUrl}${encodeURIComponent(articleId)}` -} - /* * Fetch Articles */ @@ -293,10 +289,11 @@ export async function saveArticles(zimCreator: ZimCreator, downloader: Downloade let rets: any try { - const articleUrl = getArticleUrl(downloader, dump, articleId) const isMainPage = dump.isMainPage(articleId) const renderer = isMainPage ? mainPageRenderer : articlesRenderer + const articleUrl = downloader.getArticleUrl(renderer, articleId) + rets = await downloader.getArticle(downloader.webp, _moduleDependencies, articleId, articleDetailXId, renderer, articleUrl, dump, articleDetail, isMainPage) for (const { articleId, displayTitle: articleTitle, html: finalHTML, mediaDependencies, moduleDependencies, staticFiles, subtitles } of rets) { diff --git a/test/unit/downloader.test.ts b/test/unit/downloader.test.ts index ed1a64b9..d7d03d86 100644 --- a/test/unit/downloader.test.ts +++ b/test/unit/downloader.test.ts @@ -6,7 +6,6 @@ import Axios from 'axios' import { mwRetToArticleDetail, stripHttpFromUrl, isImageUrl } from '../../src/util/index.js' import S3 from '../../src/S3.js' import { Dump } from '../../src/Dump.js' -import { getArticleUrl } from '../../src/util/saveArticles.js' import { config } from '../../src/config.js' import 'dotenv/config.js' import * as FileType from 'file-type' @@ -18,6 +17,7 @@ import { WikimediaDesktopRenderer } from '../../src/renderers/wikimedia-desktop. import { VisualEditorRenderer } from '../../src/renderers/visual-editor.renderer.js' import { WikimediaMobileRenderer } from '../../src/renderers/wikimedia-mobile.renderer.js' import { RENDERERS_LIST } from '../../src/util/const.js' +import { setupScrapeClasses } from '../util.js' jest.setTimeout(200000) @@ -38,7 +38,6 @@ describe('Downloader class', () => { await MediaWiki.hasWikimediaDesktopApi() await MediaWiki.hasWikimediaMobileApi() await MediaWiki.hasVisualEditorApi() - await downloader.setBaseUrls() }) test('Test Action API version 2 response in comparison with version 1', async () => { @@ -129,15 +128,18 @@ describe('Downloader class', () => { describe('getArticle method', () => { let dump: Dump + let renderer const wikimediaDesktopRenderer = new WikimediaDesktopRenderer() beforeAll(async () => { const mwMetadata = await MediaWiki.getMwMetaData(downloader) dump = new Dump('', {} as any, mwMetadata) + const setupScrapeClass = await setupScrapeClasses() // en wikipedia + renderer = setupScrapeClass.renderer }) test('getArticle of "London" returns one article for WikimediaDesktop render', async () => { const articleId = 'London' - const articleUrl = getArticleUrl(downloader, dump, articleId) + const articleUrl = downloader.getArticleUrl(renderer, articleId) const articleDetail = { title: articleId, thumbnail: { @@ -173,7 +175,7 @@ describe('Downloader class', () => { revisionId: 1168361498, timestamp: '2023-08-02T09:57:11Z', } - const articleUrl = getArticleUrl(downloader, dump, articleDetail.title) + const articleUrl = downloader.getArticleUrl(renderer, articleDetail.title) const PaginatedArticle = await downloader.getArticle( downloader.webp, _moduleDependencies, @@ -190,7 +192,7 @@ describe('Downloader class', () => { test('getArticle response status for non-existent article id is 404 for WikimediaDesktop render', async () => { const articleId = 'NeverExistingArticle' - const articleUrl = getArticleUrl(downloader, dump, articleId) + const articleUrl = downloader.getArticleUrl(renderer, articleId) const articleDetail = { title: articleId, missing: '', @@ -236,7 +238,7 @@ describe('Downloader class', () => { test(`getArticle response status for non-existent article id is 404 for ${renderer} render`, async () => { const articleId = 'NeverExistingArticle' - const articleUrl = getArticleUrl(downloader, dump, articleId) + const articleUrl = downloader.getArticleUrl(dump, articleId) const articleDetail = { title: articleId, missing: '', diff --git a/test/unit/renderers/renderer.builder.test.ts b/test/unit/renderers/renderer.builder.test.ts index 25e6b7db..3bd46ded 100644 --- a/test/unit/renderers/renderer.builder.test.ts +++ b/test/unit/renderers/renderer.builder.test.ts @@ -84,7 +84,7 @@ describe('RendererBuilder', () => { await MediaWiki.hasWikimediaDesktopApi() await MediaWiki.hasWikimediaMobileApi() await MediaWiki.hasVisualEditorApi() - await downloader.setBaseUrls() + await MediaWiki.hasVisualEditorApi() const rendererBuilderOptions = { MediaWiki, diff --git a/test/unit/saveArticles.test.ts b/test/unit/saveArticles.test.ts index 7ccbcb03..7329adfb 100644 --- a/test/unit/saveArticles.test.ts +++ b/test/unit/saveArticles.test.ts @@ -7,7 +7,6 @@ import { saveArticles } from '../../src/util/saveArticles.js' import { ZimArticle } from '@openzim/libzim' import { mwRetToArticleDetail, DELETED_ARTICLE_ERROR } from '../../src/util/index.js' import { jest } from '@jest/globals' -import { getArticleUrl } from '../../src/util/saveArticles.js' import { WikimediaDesktopRenderer } from '../../src/renderers/wikimedia-desktop.renderer.js' import { VisualEditorRenderer } from '../../src/renderers/visual-editor.renderer.js' import { WikimediaMobileRenderer } from '../../src/renderers/wikimedia-mobile.renderer.js' @@ -41,7 +40,8 @@ describe('saveArticles', () => { await MediaWiki.hasWikimediaDesktopApi() await MediaWiki.hasWikimediaMobileApi() await MediaWiki.hasVisualEditorApi() - await downloader.setBaseUrls(renderer) + await MediaWiki.hasVisualEditorApi() + const _articlesDetail = await downloader.getArticleDetailsIds(['London']) const articlesDetail = mwRetToArticleDetail(_articlesDetail) const { articleDetailXId } = RedisStore @@ -71,7 +71,7 @@ describe('saveArticles', () => { expect(addedArticles[0].aid).toEqual('A/London') const articleId = 'non-existent-article' - const articleUrl = getArticleUrl(downloader, dump, articleId) + const articleUrl = downloader.getArticleUrl(renderer, articleId) const articleDetail = { title: 'Non-existent-article', missing: '' } const _moduleDependencies = await downloader.getModuleDependencies(articleDetail.title) @@ -91,9 +91,8 @@ describe('saveArticles', () => { test(`Check nodet article for en.wikipedia.org using ${renderer} renderer`, async () => { const { downloader, dump } = await setupScrapeClasses({ mwUrl: 'https://en.wikipedia.org', format: 'nodet' }) // en wikipedia - await downloader.setBaseUrls(renderer) const articleId = 'Canada' - const articleUrl = getArticleUrl(downloader, dump, articleId) + const articleUrl = downloader.getArticleUrl(rendererInstance, articleId) const _articleDetailsRet = await downloader.getArticleDetailsIds([articleId]) const articlesDetail = mwRetToArticleDetail(_articleDetailsRet) const { articleDetailXId } = RedisStore @@ -122,9 +121,9 @@ describe('saveArticles', () => { test(`Load main page and check that it is without header using ${renderer} renderer`, async () => { const { downloader, dump } = await setupScrapeClasses({ mwUrl: 'https://en.wikivoyage.org' }) // en wikipedia - await downloader.setBaseUrls(renderer) + await downloader.setBaseUrlsDirectors(rendererInstance, rendererInstance) const articleId = 'Main_Page' - const articleUrl = getArticleUrl(downloader, dump, articleId) + const articleUrl = downloader.getArticleUrl(dump, articleId) const _articleDetailsRet = await downloader.getArticleDetailsIds([articleId]) const articlesDetail = mwRetToArticleDetail(_articleDetailsRet) const { articleDetailXId } = RedisStore @@ -147,12 +146,8 @@ describe('saveArticles', () => { }) test(`--customFlavour using ${renderer} renderer`, async () => { - const { MediaWiki, downloader, dump } = await setupScrapeClasses({ format: 'nopic' }) // en wikipedia - await MediaWiki.hasCoordinates(downloader) - await MediaWiki.hasWikimediaDesktopApi() - await MediaWiki.hasWikimediaMobileApi() - await MediaWiki.hasVisualEditorApi() - await downloader.setBaseUrls(renderer) + const { downloader, dump } = await setupScrapeClasses({ format: 'nopic' }) // en wikipedia + await downloader.setBaseUrlsDirectors(rendererInstance, rendererInstance) class CustomFlavour implements CustomProcessor { // eslint-disable-next-line @typescript-eslint/no-unused-vars public async shouldKeepArticle(articleId: string, doc: Document) { @@ -227,7 +222,7 @@ describe('saveArticles', () => { const downloader = classes.downloader await downloader.setCapabilities() - await downloader.setBaseUrls() + await downloader.setBaseUrlsDirectors() const _articleDetailsRet = await downloader.getArticleDetailsIds(['Western_Greenland']) const articlesDetail = mwRetToArticleDetail(_articleDetailsRet) const { articleDetailXId } = redisStore @@ -282,6 +277,68 @@ describe('saveArticles', () => { */ }) + test('--customFlavour', async () => { + const { downloader, dump } = await setupScrapeClasses({ format: 'nopic' }) // en wikipedia + class CustomFlavour implements CustomProcessor { + // eslint-disable-next-line @typescript-eslint/no-unused-vars + public async shouldKeepArticle(articleId: string, doc: Document) { + return articleId !== 'London' + } + public async preProcessArticle(articleId: string, doc: Document) { + if (articleId === 'Paris') { + const h2 = doc.createElement('h2') + h2.textContent = 'INSERTED_BY_PRE_PROCESSOR' + h2.id = 'PRE_PROCESSOR' + doc.body.appendChild(h2) + } + return doc + } + public async postProcessArticle(articleId: string, doc: Document) { + if (articleId === 'Prague') { + const h2 = doc.createElement('h2') + h2.textContent = 'INSERTED_BY_POST_PROCESSOR' + h2.id = 'POST_PROCESSOR' + doc.body.appendChild(h2) + } + return doc + } + } + const customFlavour = new CustomFlavour() + dump.customProcessor = customFlavour + + const _articlesDetail = await downloader.getArticleDetailsIds(['London', 'Paris', 'Prague']) + const articlesDetail = mwRetToArticleDetail(_articlesDetail) + const { articleDetailXId } = RedisStore + await articleDetailXId.flush() + await articleDetailXId.setMany(articlesDetail) + + const writtenArticles: any = {} + await saveArticles( + { + addArticle(article: typeof ZimArticle) { + if (article.mimeType === 'text/html') { + writtenArticles[article.title] = article + } + return Promise.resolve(null) + }, + } as any, + downloader, + dump, + true, + 'WikimediaDesktop', + ) + + const ParisDocument = domino.createDocument(writtenArticles.Paris.bufferData) + const PragueDocument = domino.createDocument(writtenArticles.Prague.bufferData) + + // London was correctly filtered out by customFlavour + expect(writtenArticles.London).toBeUndefined() + // Paris was correctly pre-processed + expect(ParisDocument.querySelector('#PRE_PROCESSOR')).toBeDefined() + // Prague was correctly post-processed + expect(PragueDocument.querySelector('#POST_PROCESSOR')).toBeDefined() + }) + test('Test deleted article rendering (Visual editor renderer)', async () => { const { downloader, dump } = await setupScrapeClasses() // en wikipedia const { articleDetailXId } = RedisStore diff --git a/test/unit/treatments/article.treatment.test.ts b/test/unit/treatments/article.treatment.test.ts index a26076dd..145fec07 100644 --- a/test/unit/treatments/article.treatment.test.ts +++ b/test/unit/treatments/article.treatment.test.ts @@ -6,7 +6,6 @@ import { setupScrapeClasses } from '../../util.js' import { startRedis, stopRedis } from '../bootstrap.js' import { saveArticles } from '../../../src/util/saveArticles.js' import { jest } from '@jest/globals' -import { getArticleUrl } from '../../../src/util/saveArticles.js' import { WikimediaDesktopRenderer } from '../../../src/renderers/wikimedia-desktop.renderer.js' import { WikimediaMobileRenderer } from '../../../src/renderers/wikimedia-mobile.renderer.js' import { VisualEditorRenderer } from '../../../src/renderers/visual-editor.renderer.js' @@ -36,7 +35,6 @@ describe('ArticleTreatment', () => { test(`Article html processing for ${renderer} render`, async () => { const { downloader, dump } = await setupScrapeClasses() // en wikipedia - await downloader.setBaseUrls() const title = 'London' const _articlesDetail = await downloader.getArticleDetailsIds([title]) const articlesDetail = mwRetToArticleDetail(_articlesDetail) @@ -45,9 +43,8 @@ describe('ArticleTreatment', () => { await articleDetailXId.setMany(articlesDetail) const addedArticles: (typeof ZimArticle)[] = [] - const articleId = 'non-existent-article' - const articleUrl = getArticleUrl(downloader, dump, articleId) + const articleUrl = downloader.getArticleUrl(renderer, articleId) const _moduleDependencies = await downloader.getModuleDependencies(title) const articleDetail = { diff --git a/test/unit/urlRewriting.test.ts b/test/unit/urlRewriting.test.ts index 60c384a6..1ba8eab2 100644 --- a/test/unit/urlRewriting.test.ts +++ b/test/unit/urlRewriting.test.ts @@ -140,7 +140,6 @@ describe('Styles', () => { await articleDetailXId.flush() await RedisStore.redirectsXId.flush() const { downloader, dump } = await setupScrapeClasses() // en wikipedia - await downloader.setBaseUrls() await getArticleIds(downloader, '', ['London', 'British_Museum', 'Natural_History_Museum,_London', 'Farnborough/Aldershot_built-up_area']) diff --git a/test/util.ts b/test/util.ts index 7625cb78..28a030fc 100644 --- a/test/util.ts +++ b/test/util.ts @@ -33,6 +33,10 @@ export function makeLink($doc: Document, href: string, rel: string, title: strin export async function setupScrapeClasses({ mwUrl = 'https://en.wikipedia.org', format = '' } = {}) { MediaWiki.base = mwUrl + const renderer = {} + + Object.defineProperty(renderer.constructor, 'name', { value: 'WikimediaDesktopRenderer' }) + const downloader = new Downloader({ uaString: `${config.userAgent} (contact@kiwix.org)`, speed: 1, reqTimeout: 1000 * 60, webp: false, optimisationCacheUrl: '' }) await MediaWiki.getMwMetaData(downloader) @@ -47,6 +51,7 @@ export async function setupScrapeClasses({ mwUrl = 'https://en.wikipedia.org', f MediaWiki, downloader, dump, + renderer, } } From 934253325da2fc7551aaa735bf67c3580c750593 Mon Sep 17 00:00:00 2001 From: Vadim Kovalenko Date: Tue, 17 Oct 2023 16:45:39 +0300 Subject: [PATCH 02/10] Add unit test for donwloader.getArticleUrl(), fix VE url builder --- src/Downloader.ts | 2 -- src/util/builders/url/api.director.ts | 5 +---- test/unit/builders/url/api.director.test.ts | 2 +- test/unit/downloader.test.ts | 1 - 4 files changed, 2 insertions(+), 8 deletions(-) diff --git a/src/Downloader.ts b/src/Downloader.ts index e6bde059..ac896ef1 100644 --- a/src/Downloader.ts +++ b/src/Downloader.ts @@ -79,8 +79,6 @@ export const defaultStreamRequestOptions: AxiosRequestConfig = { class Downloader { public loginCookie = '' public readonly speed: number - public baseUrl: string - public baseUrlForMainPage: string public cssDependenceUrls: KVS = {} public readonly webp: boolean = false public readonly requestTimeout: number diff --git a/src/util/builders/url/api.director.ts b/src/util/builders/url/api.director.ts index 6651d7c6..fabae422 100644 --- a/src/util/builders/url/api.director.ts +++ b/src/util/builders/url/api.director.ts @@ -49,10 +49,7 @@ export default class ApiURLDirector { } buildVisualEditorURL() { - return urlBuilder - .setDomain(this.baseDomain) - .setQueryParams({ action: 'visualeditor', mobileformat: 'html', format: 'json', paction: 'parse', formatversion: '2', page: '' }) - .build(true) + return urlBuilder.setDomain(this.baseDomain).setQueryParams({ action: 'visualeditor', mobileformat: 'html', format: 'json', paction: 'parse', formatversion: '2' }).build(true) } buildArticleApiURL(articleId: string) { diff --git a/test/unit/builders/url/api.director.test.ts b/test/unit/builders/url/api.director.test.ts index 993b9dfa..a45423be 100644 --- a/test/unit/builders/url/api.director.test.ts +++ b/test/unit/builders/url/api.director.test.ts @@ -57,7 +57,7 @@ describe('ApiURLDirector', () => { it('should return base visual editor URL object with default query params', () => { const url = apiUrlDirector.buildVisualEditorURL() - expect(url.href).toBe('https://en.wikipedia.org/w/api.php?action=visualeditor&mobileformat=html&format=json&paction=parse&formatversion=2&page=') + expect(url.href).toBe('https://en.wikipedia.org/w/api.php?action=visualeditor&mobileformat=html&format=json&paction=parse&formatversion=2') }) }) }) diff --git a/test/unit/downloader.test.ts b/test/unit/downloader.test.ts index d7d03d86..7e89a02a 100644 --- a/test/unit/downloader.test.ts +++ b/test/unit/downloader.test.ts @@ -30,7 +30,6 @@ describe('Downloader class', () => { beforeAll(async () => { MediaWiki.base = 'https://en.wikipedia.org' MediaWiki.getCategories = true - downloader = new Downloader({ uaString: `${config.userAgent} (contact@kiwix.org)`, speed: 1, reqTimeout: 1000 * 60, webp: true, optimisationCacheUrl: '' }) await MediaWiki.getMwMetaData(downloader) From b628472c3b238d381d8491998e6d940c642a3717 Mon Sep 17 00:00:00 2001 From: Vadim Kovalenko Date: Tue, 17 Oct 2023 16:48:56 +0300 Subject: [PATCH 03/10] Fix codecov issues in Downloader --- src/Downloader.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Downloader.ts b/src/Downloader.ts index ac896ef1..0cbd1057 100644 --- a/src/Downloader.ts +++ b/src/Downloader.ts @@ -349,7 +349,7 @@ class Downloader { await this.claimRequest() try { - return await new Promise((resolve, reject) => { + return new Promise((resolve, reject) => { const cb = (err: any, val: any) => { if (err) { reject(err) @@ -688,7 +688,7 @@ class Downloader { // Solution to handle aws js sdk v3 from https://github.com/aws/aws-sdk-js-v3/issues/1877 private async streamToBuffer(stream: Readable): Promise { - return await new Promise((resolve, reject) => { + return new Promise((resolve, reject) => { const chunks: Uint8Array[] = [] stream.on('data', (chunk) => chunks.push(chunk)) stream.on('error', reject) From 916e9d31b08b0b00bff6962c0f93ee1e05c7a564 Mon Sep 17 00:00:00 2001 From: Vadim Kovalenko Date: Mon, 23 Oct 2023 13:21:11 +0300 Subject: [PATCH 04/10] Handle url director instances inside Downloader --- src/Downloader.ts | 88 +++++++++++++++---- src/MediaWiki.ts | 29 ++++-- src/mwoffliner.lib.ts | 1 + src/util/builders/url/api.director.ts | 11 +-- src/util/saveArticles.ts | 4 +- test/unit/downloader.test.ts | 23 +++-- test/unit/saveArticles.test.ts | 6 +- .../unit/treatments/article.treatment.test.ts | 3 +- test/unit/urlRewriting.test.ts | 1 + 9 files changed, 122 insertions(+), 44 deletions(-) diff --git a/src/Downloader.ts b/src/Downloader.ts index 0cbd1057..4d57bac7 100644 --- a/src/Downloader.ts +++ b/src/Downloader.ts @@ -20,9 +20,15 @@ import { normalizeMwResponse, DB_ERROR, WEAK_ETAG_REGEX, stripHttpFromUrl, isBit import S3 from './S3.js' import * as logger from './Logger.js' import MediaWiki, { QueryOpts } from './MediaWiki.js' +import { Dump } from './Dump.js' import ApiURLDirector from './util/builders/url/api.director.js' +import basicURLDirector from './util/builders/url/basic.director.js' import urlHelper from './util/url.helper.js' +import WikimediaDesktopURLDirector from './util/builders/url/desktop.director.js' +import WikimediaMobileURLDirector from './util/builders/url/mobile.director.js' +import VisualEditorURLDirector from './util/builders/url/visual-editor.director.js' + const imageminOptions = new Map() imageminOptions.set('default', new Map()) imageminOptions.set('webp', new Map()) @@ -79,6 +85,8 @@ export const defaultStreamRequestOptions: AxiosRequestConfig = { class Downloader { public loginCookie = '' public readonly speed: number + public baseUrl: string + public baseUrlForMainPage: string public cssDependenceUrls: KVS = {} public readonly webp: boolean = false public readonly requestTimeout: number @@ -87,6 +95,8 @@ class Downloader { public streamRequestOptions: AxiosRequestConfig public wikimediaMobileJsDependenciesList: string[] = [] public wikimediaMobileStyleDependenciesList: string[] = [] + public articleUrlDirector: WikimediaDesktopURLDirector | WikimediaMobileURLDirector | VisualEditorURLDirector + public mainPageUrlDirector: WikimediaDesktopURLDirector | WikimediaMobileURLDirector | VisualEditorURLDirector private readonly uaString: string private activeRequests = 0 @@ -167,21 +177,69 @@ class Downloader { } } - public getArticleUrl(renderer, articleId: string): string { - let articleUrl: string - // Renders and URL builders are independent so there should be a switch/case scenario here - switch (renderer.constructor.name) { - case 'WikimediaDesktopRenderer': - articleUrl = MediaWiki.wikimediaDesktopUrlDirector.buildArticleURL(articleId) - break - case 'VisualEditorRenderer': - articleUrl = MediaWiki.visualEditorURLDirector.buildArticleURL(articleId) - break - case 'WikimediaMobileRenderer': - articleUrl = MediaWiki.wikimediaMobileUrlDirector.buildArticleURL(articleId) - break + private getUrlDirector(capabilitiesList): WikimediaDesktopURLDirector | WikimediaMobileURLDirector | VisualEditorURLDirector { + for (const capabilityInfo of capabilitiesList) { + if (capabilityInfo.condition) { + return new capabilityInfo.Director(capabilityInfo.value) + } } - return articleUrl + throw new Error('No suitable URL director found.') + } + + public async setBaseUrlsDirectors(forceRender = null) { + if (!forceRender) { + //* Objects order in array matters! + const articlesCapabilitiesList = [ + { condition: await MediaWiki.hasWikimediaMobileApi(), value: MediaWiki.WikimediaMobileApiUrl.href, Director: WikimediaMobileURLDirector }, + { condition: await MediaWiki.hasWikimediaDesktopApi(), value: MediaWiki.WikimediaDesktopApiUrl.href, Director: WikimediaDesktopURLDirector }, + { condition: await MediaWiki.hasVisualEditorApi(), value: MediaWiki.VisualEditorApiUrl.href, Director: VisualEditorURLDirector }, + ] + + this.baseUrl = basicURLDirector.buildDownloaderBaseUrl(articlesCapabilitiesList) + this.articleUrlDirector = this.getUrlDirector(articlesCapabilitiesList) + + //* Objects order in array matters! + const mainPageCapabilitiesList = [ + { condition: await MediaWiki.hasWikimediaDesktopApi(), value: MediaWiki.WikimediaDesktopApiUrl.href, Director: WikimediaDesktopURLDirector }, + { condition: await MediaWiki.hasVisualEditorApi(), value: MediaWiki.VisualEditorApiUrl.href, Director: VisualEditorURLDirector }, + { condition: await MediaWiki.hasWikimediaMobileApi(), value: MediaWiki.WikimediaMobileApiUrl.href, Director: WikimediaMobileURLDirector }, + ] + this.baseUrlForMainPage = basicURLDirector.buildDownloaderBaseUrl(mainPageCapabilitiesList) + this.mainPageUrlDirector = this.getUrlDirector(mainPageCapabilitiesList) + } else { + switch (forceRender) { + case 'WikimediaDesktop': + if (MediaWiki.hasWikimediaDesktopApi()) { + this.baseUrl = MediaWiki.WikimediaDesktopApiUrl.href + this.baseUrlForMainPage = MediaWiki.WikimediaDesktopApiUrl.href + this.articleUrlDirector = this.mainPageUrlDirector = new WikimediaDesktopURLDirector(MediaWiki.WikimediaDesktopApiUrl.href) + break + } + break + case 'VisualEditor': + if (MediaWiki.hasVisualEditorApi()) { + this.baseUrl = MediaWiki.VisualEditorApiUrl.href + this.baseUrlForMainPage = MediaWiki.VisualEditorApiUrl.href + this.articleUrlDirector = this.mainPageUrlDirector = new VisualEditorURLDirector(MediaWiki.VisualEditorApiUrl.href) + break + } + break + case 'WikimediaMobile': + if (MediaWiki.hasWikimediaMobileApi()) { + this.baseUrl = MediaWiki.WikimediaMobileApiUrl.href + this.baseUrlForMainPage = MediaWiki.WikimediaMobileApiUrl.href + this.articleUrlDirector = this.mainPageUrlDirector = new WikimediaMobileURLDirector(MediaWiki.WikimediaMobileApiUrl.href) + break + } + break + default: + throw new Error('Unable to find specific API end-point to retrieve article HTML') + } + } + } + + public getArticleUrl(dump: Dump, articleId: string): string { + return `${dump.isMainPage(articleId) ? this.mainPageUrlDirector.buildArticleURL(articleId) : this.articleUrlDirector.buildArticleURL(articleId)}` } public removeEtagWeakPrefix(etag: string): string { @@ -300,7 +358,7 @@ class Downloader { articleDetailXId: RKVS, articleRenderer, articleUrl, - dump, + dump: Dump, articleDetail?: ArticleDetail, isMainPage?: boolean, ): Promise { diff --git a/src/MediaWiki.ts b/src/MediaWiki.ts index 6cd9c768..157bf4c8 100644 --- a/src/MediaWiki.ts +++ b/src/MediaWiki.ts @@ -58,12 +58,19 @@ class MediaWiki { public visualEditorApiUrl: URL public actionApiUrl: URL + + public VisualEditorApiUrl: URL + public apiUrl: URL public modulePath: string // only for reading public mobileModulePath: string public webUrl: URL public WikimediaDesktopApiUrl: URL public WikimediaMobileApiUrl: URL + #apiUrlDirector: ApiURLDirector + #wikimediaDesktopUrlDirector: WikimediaDesktopURLDirector + #wikimediaMobileUrlDirector: WikimediaMobileURLDirector + #visualEditorURLDirector: VisualEditorURLDirector #hasWikimediaDesktopApi: boolean | null #hasWikimediaMobileApi: boolean | null #hasVisualEditorApi: boolean | null @@ -163,7 +170,8 @@ class MediaWiki { public async hasWikimediaDesktopApi(): Promise { if (this.#hasWikimediaDesktopApi === null) { - this.#hasWikimediaDesktopApi = await checkApiAvailability(this.wikimediaDesktopUrlDirector.buildArticleURL(this.apiCheckArticleId)) + this.#wikimediaDesktopUrlDirector = new WikimediaDesktopURLDirector(this.WikimediaDesktopApiUrl.href) + this.#hasWikimediaDesktopApi = await checkApiAvailability(this.#wikimediaDesktopUrlDirector.buildArticleURL(this.apiCheckArticleId)) return this.#hasWikimediaDesktopApi } return this.#hasWikimediaDesktopApi @@ -171,7 +179,8 @@ class MediaWiki { public async hasWikimediaMobileApi(): Promise { if (this.#hasWikimediaMobileApi === null) { - this.#hasWikimediaMobileApi = await checkApiAvailability(this.wikimediaMobileUrlDirector.buildArticleURL(this.apiCheckArticleId)) + this.#wikimediaMobileUrlDirector = new WikimediaMobileURLDirector(this.WikimediaMobileApiUrl.href) + this.#hasWikimediaMobileApi = await checkApiAvailability(this.#wikimediaMobileUrlDirector.buildArticleURL(this.apiCheckArticleId)) return this.#hasWikimediaMobileApi } return this.#hasWikimediaMobileApi @@ -179,7 +188,8 @@ class MediaWiki { public async hasVisualEditorApi(): Promise { if (this.#hasVisualEditorApi === null) { - this.#hasVisualEditorApi = await checkApiAvailability(this.visualEditorURLDirector.buildArticleURL(this.apiCheckArticleId)) + this.#visualEditorURLDirector = new VisualEditorURLDirector(this.VisualEditorApiUrl.href) + this.#hasVisualEditorApi = await checkApiAvailability(this.#visualEditorURLDirector.buildArticleURL(this.apiCheckArticleId)) return this.#hasVisualEditorApi } return this.#hasVisualEditorApi @@ -193,7 +203,7 @@ class MediaWiki { rdnamespace: validNamespaceIds, } - const resp = await downloader.getJSON(this.apiUrlDirector.buildQueryURL(reqOpts)) + const resp = await downloader.getJSON(this.#apiUrlDirector.buildQueryURL(reqOpts)) const isCoordinateWarning = JSON.stringify(resp?.warnings?.query ?? '').includes('coordinates') if (isCoordinateWarning) { logger.info('Coordinates not available on this wiki') @@ -218,6 +228,15 @@ class MediaWiki { this.apiUrlDirector = new ApiURLDirector(this.actionApiUrl.href) this.visualEditorApiUrl = this.apiUrlDirector.buildVisualEditorURL() this.visualEditorURLDirector = new VisualEditorURLDirector(this.visualEditorApiUrl.href) + const baseUrlDirector = new BaseURLDirector(this.baseUrl.href) + this.webUrl = baseUrlDirector.buildURL(this.#wikiPath) + this.apiUrl = baseUrlDirector.buildURL(this.#apiActionPath) + this.#apiUrlDirector = new ApiURLDirector(this.apiUrl.href) + this.VisualEditorApiUrl = this.#apiUrlDirector.buildVisualEditorURL() + this.WikimediaDesktopApiUrl = baseUrlDirector.buildWikimediaDesktopApiUrl(this.#restApiPath) + this.WikimediaMobileApiUrl = baseUrlDirector.buildWikimediaMobileApiUrl(this.#restApiPath) + this.modulePath = baseUrlDirector.buildModuleURL(this._modulePathOpt) + this.mobileModulePath = baseUrlDirector.buildMobileModuleURL() } public async login(downloader: Downloader) { @@ -261,7 +280,7 @@ class MediaWiki { } public async getNamespaces(addNamespaces: number[], downloader: Downloader) { - const url = this.apiUrlDirector.buildNamespacesURL() + const url = this.#apiUrlDirector.buildNamespacesURL() const json: any = await downloader.getJSON(url) ;['namespaces', 'namespacealiases'].forEach((type) => { diff --git a/src/mwoffliner.lib.ts b/src/mwoffliner.lib.ts index 323485bb..cb3d663e 100644 --- a/src/mwoffliner.lib.ts +++ b/src/mwoffliner.lib.ts @@ -214,6 +214,7 @@ async function execute(argv: any) { await MediaWiki.hasWikimediaDesktopApi() const hasWikimediaMobileApi = await MediaWiki.hasWikimediaMobileApi() await MediaWiki.hasVisualEditorApi() + await downloader.setBaseUrlsDirectors(forceRender) RedisStore.setOptions(argv.redis || config.defaults.redisPath) await RedisStore.connect() diff --git a/src/util/builders/url/api.director.ts b/src/util/builders/url/api.director.ts index fabae422..b4343019 100644 --- a/src/util/builders/url/api.director.ts +++ b/src/util/builders/url/api.director.ts @@ -53,12 +53,9 @@ export default class ApiURLDirector { } buildArticleApiURL(articleId: string) { - const domain = this.buildBaseArticleURL() - - return urlBuilder.setDomain(domain).setQueryParams({ page: articleId }, '&').build() - } - - private buildBaseArticleURL() { - return urlBuilder.setDomain(this.baseDomain).setQueryParams({ action: 'parse', format: 'json', prop: 'modules|jsconfigvars|headhtml', formatversion: '2' }).build() + return urlBuilder + .setDomain(this.baseDomain) + .setQueryParams({ action: 'parse', format: 'json', prop: 'modules|jsconfigvars|headhtml', formatversion: '2', page: articleId }) + .build() } } diff --git a/src/util/saveArticles.ts b/src/util/saveArticles.ts index f715a526..79ba34ef 100644 --- a/src/util/saveArticles.ts +++ b/src/util/saveArticles.ts @@ -133,7 +133,7 @@ async function getAllArticlesToKeep(downloader: Downloader, articleDetailXId: RK try { const isMainPage = dump.isMainPage(articleId) const renderer = isMainPage ? mainPageRenderer : articlesRenderer - const articleUrl = downloader.getArticleUrl(renderer, articleId) + const articleUrl = downloader.getArticleUrl(dump, articleId) rets = await downloader.getArticle(downloader.webp, _moduleDependencies, articleId, articleDetailXId, renderer, articleUrl, dump, articleDetail, isMainPage) for (const { articleId, html } of rets) { @@ -292,7 +292,7 @@ export async function saveArticles(zimCreator: ZimCreator, downloader: Downloade const isMainPage = dump.isMainPage(articleId) const renderer = isMainPage ? mainPageRenderer : articlesRenderer - const articleUrl = downloader.getArticleUrl(renderer, articleId) + const articleUrl = downloader.getArticleUrl(dump, articleId) rets = await downloader.getArticle(downloader.webp, _moduleDependencies, articleId, articleDetailXId, renderer, articleUrl, dump, articleDetail, isMainPage) diff --git a/test/unit/downloader.test.ts b/test/unit/downloader.test.ts index 7e89a02a..e283ff54 100644 --- a/test/unit/downloader.test.ts +++ b/test/unit/downloader.test.ts @@ -14,10 +14,9 @@ import urlParser from 'url' import { setTimeout } from 'timers/promises' import domino from 'domino' import { WikimediaDesktopRenderer } from '../../src/renderers/wikimedia-desktop.renderer.js' -import { VisualEditorRenderer } from '../../src/renderers/visual-editor.renderer.js' import { WikimediaMobileRenderer } from '../../src/renderers/wikimedia-mobile.renderer.js' +import { VisualEditorRenderer } from '../../src/renderers/visual-editor.renderer.js' import { RENDERERS_LIST } from '../../src/util/const.js' -import { setupScrapeClasses } from '../util.js' jest.setTimeout(200000) @@ -127,18 +126,16 @@ describe('Downloader class', () => { describe('getArticle method', () => { let dump: Dump - let renderer - const wikimediaDesktopRenderer = new WikimediaDesktopRenderer() + const wikimediaMobileRenderer = new WikimediaMobileRenderer() + beforeAll(async () => { const mwMetadata = await MediaWiki.getMwMetaData(downloader) dump = new Dump('', {} as any, mwMetadata) - const setupScrapeClass = await setupScrapeClasses() // en wikipedia - renderer = setupScrapeClass.renderer }) - test('getArticle of "London" returns one article for WikimediaDesktop render', async () => { + test('getArticle of "London" returns one article for wikimediaMobileRenderer render', async () => { const articleId = 'London' - const articleUrl = downloader.getArticleUrl(renderer, articleId) + const articleUrl = downloader.getArticleUrl(dump, articleId) const articleDetail = { title: articleId, thumbnail: { @@ -156,7 +153,7 @@ describe('Downloader class', () => { _moduleDependencies, articleId, RedisStore.articleDetailXId, - wikimediaDesktopRenderer, + wikimediaMobileRenderer, articleUrl, dump, articleDetail, @@ -168,13 +165,15 @@ describe('Downloader class', () => { test('Categories with many subCategories are paginated for WikimediaDesktop render', async () => { const articleId = 'Category:Container_categories' const _moduleDependencies = await downloader.getModuleDependencies(articleId) + const wikimediaDesktopRenderer = new WikimediaDesktopRenderer() const articleDetail = { title: articleId, ns: 14, revisionId: 1168361498, timestamp: '2023-08-02T09:57:11Z', } - const articleUrl = downloader.getArticleUrl(renderer, articleDetail.title) + // Enforce desktop url here as this test desktop API-specific + const articleUrl = `https://en.wikipedia.org/api/rest_v1/page/html/${articleId}` const PaginatedArticle = await downloader.getArticle( downloader.webp, _moduleDependencies, @@ -191,7 +190,7 @@ describe('Downloader class', () => { test('getArticle response status for non-existent article id is 404 for WikimediaDesktop render', async () => { const articleId = 'NeverExistingArticle' - const articleUrl = downloader.getArticleUrl(renderer, articleId) + const articleUrl = downloader.getArticleUrl(dump, articleId) const articleDetail = { title: articleId, missing: '', @@ -203,7 +202,7 @@ describe('Downloader class', () => { _moduleDependencies, 'NeverExistingArticle', RedisStore.articleDetailXId, - wikimediaDesktopRenderer, + wikimediaMobileRenderer, articleUrl, dump, articleDetail, diff --git a/test/unit/saveArticles.test.ts b/test/unit/saveArticles.test.ts index 7329adfb..7f10413d 100644 --- a/test/unit/saveArticles.test.ts +++ b/test/unit/saveArticles.test.ts @@ -71,7 +71,7 @@ describe('saveArticles', () => { expect(addedArticles[0].aid).toEqual('A/London') const articleId = 'non-existent-article' - const articleUrl = downloader.getArticleUrl(renderer, articleId) + const articleUrl = downloader.getArticleUrl(dump, articleId) const articleDetail = { title: 'Non-existent-article', missing: '' } const _moduleDependencies = await downloader.getModuleDependencies(articleDetail.title) @@ -91,8 +91,9 @@ describe('saveArticles', () => { test(`Check nodet article for en.wikipedia.org using ${renderer} renderer`, async () => { const { downloader, dump } = await setupScrapeClasses({ mwUrl: 'https://en.wikipedia.org', format: 'nodet' }) // en wikipedia + await downloader.setBaseUrlsDirectors(renderer) const articleId = 'Canada' - const articleUrl = downloader.getArticleUrl(rendererInstance, articleId) + const articleUrl = downloader.getArticleUrl(dump, articleId) const _articleDetailsRet = await downloader.getArticleDetailsIds([articleId]) const articlesDetail = mwRetToArticleDetail(_articleDetailsRet) const { articleDetailXId } = RedisStore @@ -279,6 +280,7 @@ describe('saveArticles', () => { test('--customFlavour', async () => { const { downloader, dump } = await setupScrapeClasses({ format: 'nopic' }) // en wikipedia + await downloader.setBaseUrlsDirectors() class CustomFlavour implements CustomProcessor { // eslint-disable-next-line @typescript-eslint/no-unused-vars public async shouldKeepArticle(articleId: string, doc: Document) { diff --git a/test/unit/treatments/article.treatment.test.ts b/test/unit/treatments/article.treatment.test.ts index 145fec07..0758d02e 100644 --- a/test/unit/treatments/article.treatment.test.ts +++ b/test/unit/treatments/article.treatment.test.ts @@ -43,8 +43,9 @@ describe('ArticleTreatment', () => { await articleDetailXId.setMany(articlesDetail) const addedArticles: (typeof ZimArticle)[] = [] + const articleId = 'non-existent-article' - const articleUrl = downloader.getArticleUrl(renderer, articleId) + const articleUrl = downloader.getArticleUrl(dump, articleId) const _moduleDependencies = await downloader.getModuleDependencies(title) const articleDetail = { diff --git a/test/unit/urlRewriting.test.ts b/test/unit/urlRewriting.test.ts index 1ba8eab2..9efa2b24 100644 --- a/test/unit/urlRewriting.test.ts +++ b/test/unit/urlRewriting.test.ts @@ -140,6 +140,7 @@ describe('Styles', () => { await articleDetailXId.flush() await RedisStore.redirectsXId.flush() const { downloader, dump } = await setupScrapeClasses() // en wikipedia + await downloader.setBaseUrlsDirectors('WikimediaDesktop') await getArticleIds(downloader, '', ['London', 'British_Museum', 'Natural_History_Museum,_London', 'Farnborough/Aldershot_built-up_area']) From 0f718bceb6640c49f6419bb94f1f0fa12e3d78b0 Mon Sep 17 00:00:00 2001 From: Vadim Kovalenko Date: Thu, 26 Oct 2023 11:11:33 +0300 Subject: [PATCH 05/10] Minor fixes after rebase --- src/config.ts | 2 +- src/renderers/abstractMobile.render.ts | 4 +- test/unit/saveArticles.test.ts | 63 -------------------------- 3 files changed, 3 insertions(+), 66 deletions(-) diff --git a/src/config.ts b/src/config.ts index 1bc3da66..1418c531 100644 --- a/src/config.ts +++ b/src/config.ts @@ -61,7 +61,7 @@ const config = { jsResources: ['../node_modules/details-element-polyfill/dist/details-element-polyfill'], wikimediaMobileCssResources: ['wm_mobile_override_style'], - mwMobileJsResources: ['wm_mobile_override_script'], + wikimediaMobileJsResources: ['wm_mobile_override_script'], // JS/CSS resources to be imported from MediaWiki mw: { diff --git a/src/renderers/abstractMobile.render.ts b/src/renderers/abstractMobile.render.ts index b771bd0b..c470b4d9 100644 --- a/src/renderers/abstractMobile.render.ts +++ b/src/renderers/abstractMobile.render.ts @@ -11,7 +11,7 @@ export abstract class MobileRenderer extends Renderer { public staticFilesListMobile: string[] = [] constructor() { super() - this.staticFilesListMobile = this.staticFilesListCommon.concat(getStaticFiles(config.output.mwMobileJsResources, config.output.wikimediaMobileCssResources)) + this.staticFilesListMobile = this.staticFilesListCommon.concat(getStaticFiles(config.output.wikimediaMobileJsResources, config.output.wikimediaMobileCssResources)) } public filterWikimediaMobileModules(_moduleDependencies) { @@ -45,7 +45,7 @@ export abstract class MobileRenderer extends Renderer { const htmlTemplateString = htmlWikimediaMobileTemplateCode() .replace('__ARTICLE_CANONICAL_LINK__', genCanonicalLink(config, MediaWiki.webUrl.href, articleId)) .replace('__ARTICLE_CONFIGVARS_LIST__', '') - .replace('__JS_SCRIPTS__', this.genWikimediaMobileOverrideScript(config.output.mwMobileJsResources[0])) + .replace('__JS_SCRIPTS__', this.genWikimediaMobileOverrideScript(config.output.wikimediaMobileJsResources[0])) .replace('__CSS_LINKS__', this.genWikimediaMobileOverrideCSSLink(config.output.wikimediaMobileCssResources[0])) .replace( '__ARTICLE_JS_LIST__', diff --git a/test/unit/saveArticles.test.ts b/test/unit/saveArticles.test.ts index 7f10413d..b3add992 100644 --- a/test/unit/saveArticles.test.ts +++ b/test/unit/saveArticles.test.ts @@ -278,69 +278,6 @@ describe('saveArticles', () => { */ }) - test('--customFlavour', async () => { - const { downloader, dump } = await setupScrapeClasses({ format: 'nopic' }) // en wikipedia - await downloader.setBaseUrlsDirectors() - class CustomFlavour implements CustomProcessor { - // eslint-disable-next-line @typescript-eslint/no-unused-vars - public async shouldKeepArticle(articleId: string, doc: Document) { - return articleId !== 'London' - } - public async preProcessArticle(articleId: string, doc: Document) { - if (articleId === 'Paris') { - const h2 = doc.createElement('h2') - h2.textContent = 'INSERTED_BY_PRE_PROCESSOR' - h2.id = 'PRE_PROCESSOR' - doc.body.appendChild(h2) - } - return doc - } - public async postProcessArticle(articleId: string, doc: Document) { - if (articleId === 'Prague') { - const h2 = doc.createElement('h2') - h2.textContent = 'INSERTED_BY_POST_PROCESSOR' - h2.id = 'POST_PROCESSOR' - doc.body.appendChild(h2) - } - return doc - } - } - const customFlavour = new CustomFlavour() - dump.customProcessor = customFlavour - - const _articlesDetail = await downloader.getArticleDetailsIds(['London', 'Paris', 'Prague']) - const articlesDetail = mwRetToArticleDetail(_articlesDetail) - const { articleDetailXId } = RedisStore - await articleDetailXId.flush() - await articleDetailXId.setMany(articlesDetail) - - const writtenArticles: any = {} - await saveArticles( - { - addArticle(article: typeof ZimArticle) { - if (article.mimeType === 'text/html') { - writtenArticles[article.title] = article - } - return Promise.resolve(null) - }, - } as any, - downloader, - dump, - true, - 'WikimediaDesktop', - ) - - const ParisDocument = domino.createDocument(writtenArticles.Paris.bufferData) - const PragueDocument = domino.createDocument(writtenArticles.Prague.bufferData) - - // London was correctly filtered out by customFlavour - expect(writtenArticles.London).toBeUndefined() - // Paris was correctly pre-processed - expect(ParisDocument.querySelector('#PRE_PROCESSOR')).toBeDefined() - // Prague was correctly post-processed - expect(PragueDocument.querySelector('#POST_PROCESSOR')).toBeDefined() - }) - test('Test deleted article rendering (Visual editor renderer)', async () => { const { downloader, dump } = await setupScrapeClasses() // en wikipedia const { articleDetailXId } = RedisStore From 2f5d03fa744365e124a7bb4127cde2ad636328e0 Mon Sep 17 00:00:00 2001 From: Vadim Kovalenko Date: Wed, 25 Oct 2023 11:40:49 +0300 Subject: [PATCH 06/10] Fix codefactor issue in mwoffliner.lib.ts --- src/mwoffliner.lib.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mwoffliner.lib.ts b/src/mwoffliner.lib.ts index cb3d663e..72d09b0d 100644 --- a/src/mwoffliner.lib.ts +++ b/src/mwoffliner.lib.ts @@ -608,7 +608,7 @@ async function execute(argv: any) { } async function fetchArticleDetail(articleId: string) { - return await articleDetailXId.get(articleId) + return articleDetailXId.get(articleId) } async function updateArticleThumbnail(articleDetail: any, articleId: string) { From 0d937d1dd3cdf065001fcef0e78c499a1d1359a0 Mon Sep 17 00:00:00 2001 From: Vadim Kovalenko Date: Thu, 26 Oct 2023 12:29:59 +0300 Subject: [PATCH 07/10] Fix handling articleId for VE --- src/util/builders/url/visual-editor.director.ts | 5 +---- test/unit/treatments/article.treatment.test.ts | 1 + 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/util/builders/url/visual-editor.director.ts b/src/util/builders/url/visual-editor.director.ts index e818d1d3..660f1e1d 100644 --- a/src/util/builders/url/visual-editor.director.ts +++ b/src/util/builders/url/visual-editor.director.ts @@ -11,9 +11,6 @@ export default class VisualEditorURLDirector { } buildArticleURL(articleId: string) { - return urlBuilder - .setDomain(this.baseDomain) - .setQueryParams({ page: encodeURIComponent(articleId) }, '&') - .build() + return urlBuilder.setDomain(this.baseDomain).setQueryParams({ page: articleId }, '&').build() } } diff --git a/test/unit/treatments/article.treatment.test.ts b/test/unit/treatments/article.treatment.test.ts index 0758d02e..eb23a321 100644 --- a/test/unit/treatments/article.treatment.test.ts +++ b/test/unit/treatments/article.treatment.test.ts @@ -36,6 +36,7 @@ describe('ArticleTreatment', () => { test(`Article html processing for ${renderer} render`, async () => { const { downloader, dump } = await setupScrapeClasses() // en wikipedia const title = 'London' + await downloader.setBaseUrlsDirectors(renderer) const _articlesDetail = await downloader.getArticleDetailsIds([title]) const articlesDetail = mwRetToArticleDetail(_articlesDetail) const { articleDetailXId } = RedisStore From 5b169c1180339cd7299cfa21acc40a58295fa230 Mon Sep 17 00:00:00 2001 From: Vadim Kovalenko Date: Thu, 26 Oct 2023 15:24:18 +0300 Subject: [PATCH 08/10] Add 1000 ms delay to tests where muliple renderers are involved --- test/e2e/extra.e2e.test.ts | 2 ++ test/testRenders.ts | 3 +-- test/unit/downloader.test.ts | 2 ++ test/unit/saveArticles.test.ts | 6 +++++- test/unit/treatments/article.treatment.test.ts | 5 ++++- 5 files changed, 14 insertions(+), 4 deletions(-) diff --git a/test/e2e/extra.e2e.test.ts b/test/e2e/extra.e2e.test.ts index 4d8dd12c..24fd5128 100644 --- a/test/e2e/extra.e2e.test.ts +++ b/test/e2e/extra.e2e.test.ts @@ -6,6 +6,7 @@ import { execa } from 'execa' import 'dotenv/config.js' import { jest } from '@jest/globals' import { RENDERERS_LIST } from '../../src/util/const.js' +import { sleep } from '../util.js' jest.setTimeout(20000) @@ -67,6 +68,7 @@ describe('Extra', () => { const redisScan = await execa('redis-cli --scan', { shell: true }) // Redis has been cleared expect(redisScan.stdout).toEqual('') + await sleep(1000) }) } }) diff --git a/test/testRenders.ts b/test/testRenders.ts index 68062359..a5f932a6 100644 --- a/test/testRenders.ts +++ b/test/testRenders.ts @@ -2,7 +2,7 @@ import * as logger from '../src/Logger.js' import * as mwoffliner from '../src/mwoffliner.lib.js' import { execa } from 'execa' import { RENDERERS_LIST } from '../src/util/const.js' -import { zimcheckAvailable, zimdumpAvailable } from './util.js' +import { zimcheckAvailable, zimdumpAvailable, sleep } from './util.js' interface Parameters { mwUrl: string @@ -66,7 +66,6 @@ export async function testRenders(parameters: Parameters, callback, renderersLis logger.error(err.message) return } - } } export async function testAllRenders(parameters: Parameters, callback) { diff --git a/test/unit/downloader.test.ts b/test/unit/downloader.test.ts index e283ff54..41e34f25 100644 --- a/test/unit/downloader.test.ts +++ b/test/unit/downloader.test.ts @@ -17,6 +17,7 @@ import { WikimediaDesktopRenderer } from '../../src/renderers/wikimedia-desktop. import { WikimediaMobileRenderer } from '../../src/renderers/wikimedia-mobile.renderer.js' import { VisualEditorRenderer } from '../../src/renderers/visual-editor.renderer.js' import { RENDERERS_LIST } from '../../src/util/const.js' +import { sleep } from '../util.js' jest.setTimeout(200000) @@ -255,6 +256,7 @@ describe('Downloader class', () => { dump.isMainPage(articleId), ), ).rejects.toThrowError(new Error('Request failed with status code 404')) + await sleep(1000) }) } }) diff --git a/test/unit/saveArticles.test.ts b/test/unit/saveArticles.test.ts index b3add992..407903e1 100644 --- a/test/unit/saveArticles.test.ts +++ b/test/unit/saveArticles.test.ts @@ -2,7 +2,7 @@ import domino from 'domino' import RedisStore from '../../src/RedisStore.js' import { startRedis, stopRedis } from './bootstrap.js' -import { setupScrapeClasses } from '../util.js' +import { setupScrapeClasses, sleep } from '../util.js' import { saveArticles } from '../../src/util/saveArticles.js' import { ZimArticle } from '@openzim/libzim' import { mwRetToArticleDetail, DELETED_ARTICLE_ERROR } from '../../src/util/index.js' @@ -87,6 +87,7 @@ describe('saveArticles', () => { expect(articleDoc.querySelector('meta[name="geo.position"]')?.getAttribute('content')).toEqual('51.50722222;-0.1275') // Check if header exists expect(articleDoc.querySelector('h1.article-header, h1.pcs-edit-section-title')).toBeTruthy() + await sleep(1000) }) test(`Check nodet article for en.wikipedia.org using ${renderer} renderer`, async () => { @@ -118,6 +119,7 @@ describe('saveArticles', () => { const leadSection = sections[0] expect(sections.length).toEqual(1) expect(leadSection.getAttribute('data-mw-section-id')).toEqual('0') + await sleep(1000) }) test(`Load main page and check that it is without header using ${renderer} renderer`, async () => { @@ -144,6 +146,7 @@ describe('saveArticles', () => { ) const articleDoc = domino.createDocument(result[0].html) expect(articleDoc.querySelector('h1.article-header')).toBeFalsy() + await sleep(1000) }) test(`--customFlavour using ${renderer} renderer`, async () => { @@ -207,6 +210,7 @@ describe('saveArticles', () => { expect(ParisDocument.querySelector('#PRE_PROCESSOR')).toBeDefined() // Prague was correctly post-processed expect(PragueDocument.querySelector('#POST_PROCESSOR')).toBeDefined() + await sleep(1000) }) } diff --git a/test/unit/treatments/article.treatment.test.ts b/test/unit/treatments/article.treatment.test.ts index eb23a321..b38a40a7 100644 --- a/test/unit/treatments/article.treatment.test.ts +++ b/test/unit/treatments/article.treatment.test.ts @@ -2,7 +2,7 @@ import domino from 'domino' import RedisStore from '../../../src/RedisStore.js' import { ZimArticle } from '@openzim/libzim' import { mwRetToArticleDetail } from '../../../src/util/mw-api.js' -import { setupScrapeClasses } from '../../util.js' +import { setupScrapeClasses, sleep } from '../../util.js' import { startRedis, stopRedis } from '../bootstrap.js' import { saveArticles } from '../../../src/util/saveArticles.js' import { jest } from '@jest/globals' @@ -74,6 +74,7 @@ describe('ArticleTreatment', () => { downloader, dump, true, + renderer, ) // Successfully scrapped existent articles @@ -90,6 +91,8 @@ describe('ArticleTreatment', () => { expect(articleDoc.querySelector('meta[name="geo.position"]')).toBeDefined() // Geo Position data is correct expect(articleDoc.querySelector('meta[name="geo.position"]')?.getAttribute('content')).toEqual('51.50722222;-0.1275') + + await sleep(1000) }) } }) From d8bf7fcc6ed6c6587ffdc15ebcfca013e41bcc49 Mon Sep 17 00:00:00 2001 From: Vadim Kovalenko Date: Tue, 7 Nov 2023 15:10:29 +0200 Subject: [PATCH 09/10] Deprecate downloader.setBaseUrls() in favor of downloader.setUrlsDirectors() --- src/Downloader.ts | 82 +++++-------------- src/mwoffliner.lib.ts | 1 - src/util/saveArticles.ts | 6 +- test/e2e/extra.e2e.test.ts | 2 - test/testRenders.ts | 2 +- test/unit/downloader.test.ts | 11 ++- test/unit/saveArticles.test.ts | 16 ++-- .../unit/treatments/article.treatment.test.ts | 8 +- test/unit/urlRewriting.test.ts | 1 - 9 files changed, 40 insertions(+), 89 deletions(-) diff --git a/src/Downloader.ts b/src/Downloader.ts index 4d57bac7..30df0b49 100644 --- a/src/Downloader.ts +++ b/src/Downloader.ts @@ -22,7 +22,6 @@ import * as logger from './Logger.js' import MediaWiki, { QueryOpts } from './MediaWiki.js' import { Dump } from './Dump.js' import ApiURLDirector from './util/builders/url/api.director.js' -import basicURLDirector from './util/builders/url/basic.director.js' import urlHelper from './util/url.helper.js' import WikimediaDesktopURLDirector from './util/builders/url/desktop.director.js' @@ -95,8 +94,6 @@ class Downloader { public streamRequestOptions: AxiosRequestConfig public wikimediaMobileJsDependenciesList: string[] = [] public wikimediaMobileStyleDependenciesList: string[] = [] - public articleUrlDirector: WikimediaDesktopURLDirector | WikimediaMobileURLDirector | VisualEditorURLDirector - public mainPageUrlDirector: WikimediaDesktopURLDirector | WikimediaMobileURLDirector | VisualEditorURLDirector private readonly uaString: string private activeRequests = 0 @@ -105,6 +102,8 @@ class Downloader { private readonly optimisationCacheUrl: string private s3: S3 private apiUrlDirector: ApiURLDirector + private articleUrlDirector: WikimediaDesktopURLDirector | WikimediaMobileURLDirector | VisualEditorURLDirector + private mainPageUrlDirector: WikimediaDesktopURLDirector | WikimediaMobileURLDirector | VisualEditorURLDirector constructor({ uaString, speed, reqTimeout, optimisationCacheUrl, s3, webp, backoffOptions }: DownloaderOpts) { this.uaString = uaString @@ -177,69 +176,32 @@ class Downloader { } } - private getUrlDirector(capabilitiesList): WikimediaDesktopURLDirector | WikimediaMobileURLDirector | VisualEditorURLDirector { - for (const capabilityInfo of capabilitiesList) { - if (capabilityInfo.condition) { - return new capabilityInfo.Director(capabilityInfo.value) - } + private getUrlDirector(renderer: object) { + switch (renderer.constructor.name) { + case 'WikimediaDesktopRenderer': + return new WikimediaDesktopURLDirector(MediaWiki.WikimediaDesktopApiUrl.href) + case 'VisualEditorRenderer': + return new VisualEditorURLDirector(MediaWiki.VisualEditorApiUrl.href) + case 'WikimediaMobileRenderer': + return new WikimediaMobileURLDirector(MediaWiki.WikimediaMobileApiUrl.href) } - throw new Error('No suitable URL director found.') } - public async setBaseUrlsDirectors(forceRender = null) { - if (!forceRender) { - //* Objects order in array matters! - const articlesCapabilitiesList = [ - { condition: await MediaWiki.hasWikimediaMobileApi(), value: MediaWiki.WikimediaMobileApiUrl.href, Director: WikimediaMobileURLDirector }, - { condition: await MediaWiki.hasWikimediaDesktopApi(), value: MediaWiki.WikimediaDesktopApiUrl.href, Director: WikimediaDesktopURLDirector }, - { condition: await MediaWiki.hasVisualEditorApi(), value: MediaWiki.VisualEditorApiUrl.href, Director: VisualEditorURLDirector }, - ] - - this.baseUrl = basicURLDirector.buildDownloaderBaseUrl(articlesCapabilitiesList) - this.articleUrlDirector = this.getUrlDirector(articlesCapabilitiesList) - - //* Objects order in array matters! - const mainPageCapabilitiesList = [ - { condition: await MediaWiki.hasWikimediaDesktopApi(), value: MediaWiki.WikimediaDesktopApiUrl.href, Director: WikimediaDesktopURLDirector }, - { condition: await MediaWiki.hasVisualEditorApi(), value: MediaWiki.VisualEditorApiUrl.href, Director: VisualEditorURLDirector }, - { condition: await MediaWiki.hasWikimediaMobileApi(), value: MediaWiki.WikimediaMobileApiUrl.href, Director: WikimediaMobileURLDirector }, - ] - this.baseUrlForMainPage = basicURLDirector.buildDownloaderBaseUrl(mainPageCapabilitiesList) - this.mainPageUrlDirector = this.getUrlDirector(mainPageCapabilitiesList) - } else { - switch (forceRender) { - case 'WikimediaDesktop': - if (MediaWiki.hasWikimediaDesktopApi()) { - this.baseUrl = MediaWiki.WikimediaDesktopApiUrl.href - this.baseUrlForMainPage = MediaWiki.WikimediaDesktopApiUrl.href - this.articleUrlDirector = this.mainPageUrlDirector = new WikimediaDesktopURLDirector(MediaWiki.WikimediaDesktopApiUrl.href) - break - } - break - case 'VisualEditor': - if (MediaWiki.hasVisualEditorApi()) { - this.baseUrl = MediaWiki.VisualEditorApiUrl.href - this.baseUrlForMainPage = MediaWiki.VisualEditorApiUrl.href - this.articleUrlDirector = this.mainPageUrlDirector = new VisualEditorURLDirector(MediaWiki.VisualEditorApiUrl.href) - break - } - break - case 'WikimediaMobile': - if (MediaWiki.hasWikimediaMobileApi()) { - this.baseUrl = MediaWiki.WikimediaMobileApiUrl.href - this.baseUrlForMainPage = MediaWiki.WikimediaMobileApiUrl.href - this.articleUrlDirector = this.mainPageUrlDirector = new WikimediaMobileURLDirector(MediaWiki.WikimediaMobileApiUrl.href) - break - } - break - default: - throw new Error('Unable to find specific API end-point to retrieve article HTML') - } + public async setUrlsDirectors(mainPageRenderer, articlesRenderer): Promise { + if (!this.articleUrlDirector) { + this.articleUrlDirector = this.getUrlDirector(articlesRenderer) + } + if (!this.mainPageUrlDirector) { + this.mainPageUrlDirector = this.getUrlDirector(mainPageRenderer) } } - public getArticleUrl(dump: Dump, articleId: string): string { - return `${dump.isMainPage(articleId) ? this.mainPageUrlDirector.buildArticleURL(articleId) : this.articleUrlDirector.buildArticleURL(articleId)}` + public getArticleUrl(articleId: string): string { + return this.articleUrlDirector.buildArticleURL(articleId) + } + + public getMainPageUrl(articleId: string): string { + return this.mainPageUrlDirector.buildArticleURL(articleId) } public removeEtagWeakPrefix(etag: string): string { diff --git a/src/mwoffliner.lib.ts b/src/mwoffliner.lib.ts index 72d09b0d..f85cef5b 100644 --- a/src/mwoffliner.lib.ts +++ b/src/mwoffliner.lib.ts @@ -214,7 +214,6 @@ async function execute(argv: any) { await MediaWiki.hasWikimediaDesktopApi() const hasWikimediaMobileApi = await MediaWiki.hasWikimediaMobileApi() await MediaWiki.hasVisualEditorApi() - await downloader.setBaseUrlsDirectors(forceRender) RedisStore.setOptions(argv.redis || config.defaults.redisPath) await RedisStore.connect() diff --git a/src/util/saveArticles.ts b/src/util/saveArticles.ts index 79ba34ef..f32577ca 100644 --- a/src/util/saveArticles.ts +++ b/src/util/saveArticles.ts @@ -133,7 +133,7 @@ async function getAllArticlesToKeep(downloader: Downloader, articleDetailXId: RK try { const isMainPage = dump.isMainPage(articleId) const renderer = isMainPage ? mainPageRenderer : articlesRenderer - const articleUrl = downloader.getArticleUrl(dump, articleId) + const articleUrl = isMainPage ? downloader.getMainPageUrl(articleId) : downloader.getArticleUrl(articleId) rets = await downloader.getArticle(downloader.webp, _moduleDependencies, articleId, articleDetailXId, renderer, articleUrl, dump, articleDetail, isMainPage) for (const { articleId, html } of rets) { @@ -254,6 +254,7 @@ export async function saveArticles(zimCreator: ZimCreator, downloader: Downloade renderType: hasWikimediaMobileApi ? 'mobile' : 'auto', }) } + downloader.setUrlsDirectors(mainPageRenderer, articlesRenderer) if (dump.customProcessor?.shouldKeepArticle) { await getAllArticlesToKeep(downloader, articleDetailXId, dump, mainPageRenderer, articlesRenderer) @@ -291,8 +292,7 @@ export async function saveArticles(zimCreator: ZimCreator, downloader: Downloade try { const isMainPage = dump.isMainPage(articleId) const renderer = isMainPage ? mainPageRenderer : articlesRenderer - - const articleUrl = downloader.getArticleUrl(dump, articleId) + const articleUrl = isMainPage ? downloader.getMainPageUrl(articleId) : downloader.getArticleUrl(articleId) rets = await downloader.getArticle(downloader.webp, _moduleDependencies, articleId, articleDetailXId, renderer, articleUrl, dump, articleDetail, isMainPage) diff --git a/test/e2e/extra.e2e.test.ts b/test/e2e/extra.e2e.test.ts index 24fd5128..4d8dd12c 100644 --- a/test/e2e/extra.e2e.test.ts +++ b/test/e2e/extra.e2e.test.ts @@ -6,7 +6,6 @@ import { execa } from 'execa' import 'dotenv/config.js' import { jest } from '@jest/globals' import { RENDERERS_LIST } from '../../src/util/const.js' -import { sleep } from '../util.js' jest.setTimeout(20000) @@ -68,7 +67,6 @@ describe('Extra', () => { const redisScan = await execa('redis-cli --scan', { shell: true }) // Redis has been cleared expect(redisScan.stdout).toEqual('') - await sleep(1000) }) } }) diff --git a/test/testRenders.ts b/test/testRenders.ts index a5f932a6..f0bb23f8 100644 --- a/test/testRenders.ts +++ b/test/testRenders.ts @@ -2,7 +2,7 @@ import * as logger from '../src/Logger.js' import * as mwoffliner from '../src/mwoffliner.lib.js' import { execa } from 'execa' import { RENDERERS_LIST } from '../src/util/const.js' -import { zimcheckAvailable, zimdumpAvailable, sleep } from './util.js' +import { zimcheckAvailable, zimdumpAvailable } from './util.js' interface Parameters { mwUrl: string diff --git a/test/unit/downloader.test.ts b/test/unit/downloader.test.ts index 41e34f25..a1806fd1 100644 --- a/test/unit/downloader.test.ts +++ b/test/unit/downloader.test.ts @@ -17,7 +17,6 @@ import { WikimediaDesktopRenderer } from '../../src/renderers/wikimedia-desktop. import { WikimediaMobileRenderer } from '../../src/renderers/wikimedia-mobile.renderer.js' import { VisualEditorRenderer } from '../../src/renderers/visual-editor.renderer.js' import { RENDERERS_LIST } from '../../src/util/const.js' -import { sleep } from '../util.js' jest.setTimeout(200000) @@ -134,9 +133,10 @@ describe('Downloader class', () => { dump = new Dump('', {} as any, mwMetadata) }) - test('getArticle of "London" returns one article for wikimediaMobileRenderer render', async () => { + test('getArticle of "London" returns one article for WikimediaMobileRenderer render', async () => { const articleId = 'London' - const articleUrl = downloader.getArticleUrl(dump, articleId) + downloader.setUrlsDirectors(wikimediaMobileRenderer, wikimediaMobileRenderer) + const articleUrl = downloader.getArticleUrl(articleId) const articleDetail = { title: articleId, thumbnail: { @@ -191,7 +191,7 @@ describe('Downloader class', () => { test('getArticle response status for non-existent article id is 404 for WikimediaDesktop render', async () => { const articleId = 'NeverExistingArticle' - const articleUrl = downloader.getArticleUrl(dump, articleId) + const articleUrl = downloader.getArticleUrl(articleId) const articleDetail = { title: articleId, missing: '', @@ -237,7 +237,7 @@ describe('Downloader class', () => { test(`getArticle response status for non-existent article id is 404 for ${renderer} render`, async () => { const articleId = 'NeverExistingArticle' - const articleUrl = downloader.getArticleUrl(dump, articleId) + const articleUrl = downloader.getArticleUrl(articleId) const articleDetail = { title: articleId, missing: '', @@ -256,7 +256,6 @@ describe('Downloader class', () => { dump.isMainPage(articleId), ), ).rejects.toThrowError(new Error('Request failed with status code 404')) - await sleep(1000) }) } }) diff --git a/test/unit/saveArticles.test.ts b/test/unit/saveArticles.test.ts index 407903e1..4bce3eb4 100644 --- a/test/unit/saveArticles.test.ts +++ b/test/unit/saveArticles.test.ts @@ -2,7 +2,7 @@ import domino from 'domino' import RedisStore from '../../src/RedisStore.js' import { startRedis, stopRedis } from './bootstrap.js' -import { setupScrapeClasses, sleep } from '../util.js' +import { setupScrapeClasses } from '../util.js' import { saveArticles } from '../../src/util/saveArticles.js' import { ZimArticle } from '@openzim/libzim' import { mwRetToArticleDetail, DELETED_ARTICLE_ERROR } from '../../src/util/index.js' @@ -71,7 +71,7 @@ describe('saveArticles', () => { expect(addedArticles[0].aid).toEqual('A/London') const articleId = 'non-existent-article' - const articleUrl = downloader.getArticleUrl(dump, articleId) + const articleUrl = downloader.getArticleUrl(articleId) const articleDetail = { title: 'Non-existent-article', missing: '' } const _moduleDependencies = await downloader.getModuleDependencies(articleDetail.title) @@ -87,14 +87,13 @@ describe('saveArticles', () => { expect(articleDoc.querySelector('meta[name="geo.position"]')?.getAttribute('content')).toEqual('51.50722222;-0.1275') // Check if header exists expect(articleDoc.querySelector('h1.article-header, h1.pcs-edit-section-title')).toBeTruthy() - await sleep(1000) }) test(`Check nodet article for en.wikipedia.org using ${renderer} renderer`, async () => { const { downloader, dump } = await setupScrapeClasses({ mwUrl: 'https://en.wikipedia.org', format: 'nodet' }) // en wikipedia - await downloader.setBaseUrlsDirectors(renderer) const articleId = 'Canada' - const articleUrl = downloader.getArticleUrl(dump, articleId) + downloader.setUrlsDirectors(rendererInstance, rendererInstance) + const articleUrl = downloader.getArticleUrl(articleId) const _articleDetailsRet = await downloader.getArticleDetailsIds([articleId]) const articlesDetail = mwRetToArticleDetail(_articleDetailsRet) const { articleDetailXId } = RedisStore @@ -119,14 +118,13 @@ describe('saveArticles', () => { const leadSection = sections[0] expect(sections.length).toEqual(1) expect(leadSection.getAttribute('data-mw-section-id')).toEqual('0') - await sleep(1000) }) test(`Load main page and check that it is without header using ${renderer} renderer`, async () => { const { downloader, dump } = await setupScrapeClasses({ mwUrl: 'https://en.wikivoyage.org' }) // en wikipedia await downloader.setBaseUrlsDirectors(rendererInstance, rendererInstance) const articleId = 'Main_Page' - const articleUrl = downloader.getArticleUrl(dump, articleId) + const articleUrl = downloader.getArticleUrl(articleId) const _articleDetailsRet = await downloader.getArticleDetailsIds([articleId]) const articlesDetail = mwRetToArticleDetail(_articleDetailsRet) const { articleDetailXId } = RedisStore @@ -146,7 +144,6 @@ describe('saveArticles', () => { ) const articleDoc = domino.createDocument(result[0].html) expect(articleDoc.querySelector('h1.article-header')).toBeFalsy() - await sleep(1000) }) test(`--customFlavour using ${renderer} renderer`, async () => { @@ -210,7 +207,6 @@ describe('saveArticles', () => { expect(ParisDocument.querySelector('#PRE_PROCESSOR')).toBeDefined() // Prague was correctly post-processed expect(PragueDocument.querySelector('#POST_PROCESSOR')).toBeDefined() - await sleep(1000) }) } @@ -227,7 +223,7 @@ describe('saveArticles', () => { const downloader = classes.downloader await downloader.setCapabilities() - await downloader.setBaseUrlsDirectors() + await downloader.setUrlsDirectors() const _articleDetailsRet = await downloader.getArticleDetailsIds(['Western_Greenland']) const articlesDetail = mwRetToArticleDetail(_articleDetailsRet) const { articleDetailXId } = redisStore diff --git a/test/unit/treatments/article.treatment.test.ts b/test/unit/treatments/article.treatment.test.ts index b38a40a7..28387aba 100644 --- a/test/unit/treatments/article.treatment.test.ts +++ b/test/unit/treatments/article.treatment.test.ts @@ -2,7 +2,7 @@ import domino from 'domino' import RedisStore from '../../../src/RedisStore.js' import { ZimArticle } from '@openzim/libzim' import { mwRetToArticleDetail } from '../../../src/util/mw-api.js' -import { setupScrapeClasses, sleep } from '../../util.js' +import { setupScrapeClasses } from '../../util.js' import { startRedis, stopRedis } from '../bootstrap.js' import { saveArticles } from '../../../src/util/saveArticles.js' import { jest } from '@jest/globals' @@ -36,7 +36,6 @@ describe('ArticleTreatment', () => { test(`Article html processing for ${renderer} render`, async () => { const { downloader, dump } = await setupScrapeClasses() // en wikipedia const title = 'London' - await downloader.setBaseUrlsDirectors(renderer) const _articlesDetail = await downloader.getArticleDetailsIds([title]) const articlesDetail = mwRetToArticleDetail(_articlesDetail) const { articleDetailXId } = RedisStore @@ -46,7 +45,8 @@ describe('ArticleTreatment', () => { const addedArticles: (typeof ZimArticle)[] = [] const articleId = 'non-existent-article' - const articleUrl = downloader.getArticleUrl(dump, articleId) + downloader.setUrlsDirectors(rendererInstance, rendererInstance) + const articleUrl = downloader.getArticleUrl(articleId) const _moduleDependencies = await downloader.getModuleDependencies(title) const articleDetail = { @@ -91,8 +91,6 @@ describe('ArticleTreatment', () => { expect(articleDoc.querySelector('meta[name="geo.position"]')).toBeDefined() // Geo Position data is correct expect(articleDoc.querySelector('meta[name="geo.position"]')?.getAttribute('content')).toEqual('51.50722222;-0.1275') - - await sleep(1000) }) } }) diff --git a/test/unit/urlRewriting.test.ts b/test/unit/urlRewriting.test.ts index 9efa2b24..1ba8eab2 100644 --- a/test/unit/urlRewriting.test.ts +++ b/test/unit/urlRewriting.test.ts @@ -140,7 +140,6 @@ describe('Styles', () => { await articleDetailXId.flush() await RedisStore.redirectsXId.flush() const { downloader, dump } = await setupScrapeClasses() // en wikipedia - await downloader.setBaseUrlsDirectors('WikimediaDesktop') await getArticleIds(downloader, '', ['London', 'British_Museum', 'Natural_History_Museum,_London', 'Farnborough/Aldershot_built-up_area']) From db23e5cd120dc84727b77c9abb8776ab4597a754 Mon Sep 17 00:00:00 2001 From: Vadim Kovalenko Date: Wed, 8 Nov 2023 17:30:56 +0200 Subject: [PATCH 10/10] Refactor MW url builder methods invocation, refactor forceRender test --- src/Downloader.ts | 10 ++-- src/MediaWiki.ts | 92 +++++++++++++++++----------------- test/e2e/forceRender.test.ts | 2 +- test/testRenders.ts | 1 + test/unit/saveArticles.test.ts | 4 +- 5 files changed, 53 insertions(+), 56 deletions(-) diff --git a/src/Downloader.ts b/src/Downloader.ts index 30df0b49..9d09a93e 100644 --- a/src/Downloader.ts +++ b/src/Downloader.ts @@ -84,8 +84,6 @@ export const defaultStreamRequestOptions: AxiosRequestConfig = { class Downloader { public loginCookie = '' public readonly speed: number - public baseUrl: string - public baseUrlForMainPage: string public cssDependenceUrls: KVS = {} public readonly webp: boolean = false public readonly requestTimeout: number @@ -179,15 +177,15 @@ class Downloader { private getUrlDirector(renderer: object) { switch (renderer.constructor.name) { case 'WikimediaDesktopRenderer': - return new WikimediaDesktopURLDirector(MediaWiki.WikimediaDesktopApiUrl.href) + return new WikimediaDesktopURLDirector(MediaWiki.wikimediaDesktopApiUrl.href) case 'VisualEditorRenderer': - return new VisualEditorURLDirector(MediaWiki.VisualEditorApiUrl.href) + return new VisualEditorURLDirector(MediaWiki.visualEditorApiUrl.href) case 'WikimediaMobileRenderer': - return new WikimediaMobileURLDirector(MediaWiki.WikimediaMobileApiUrl.href) + return new WikimediaMobileURLDirector(MediaWiki.wikimediaMobileApiUrl.href) } } - public async setUrlsDirectors(mainPageRenderer, articlesRenderer): Promise { + public setUrlsDirectors(mainPageRenderer, articlesRenderer): void { if (!this.articleUrlDirector) { this.articleUrlDirector = this.getUrlDirector(articlesRenderer) } diff --git a/src/MediaWiki.ts b/src/MediaWiki.ts index 157bf4c8..113b3ff3 100644 --- a/src/MediaWiki.ts +++ b/src/MediaWiki.ts @@ -42,6 +42,7 @@ class MediaWiki { public namespacesToMirror: string[] = [] public apiCheckArticleId: string public queryOpts: QueryOpts + public urlDirector: BaseURLDirector #wikiPath: string #actionApiPath: string @@ -50,22 +51,19 @@ class MediaWiki { #username: string #password: string #domain: string - private apiUrlDirector: ApiURLDirector - private baseUrlDirector: BaseURLDirector + public wikimediaDesktopUrlDirector: WikimediaDesktopURLDirector public wikimediaMobileUrlDirector: WikimediaMobileURLDirector public visualEditorURLDirector: VisualEditorURLDirector public visualEditorApiUrl: URL public actionApiUrl: URL + public webUrl: URL + public wikimediaDesktopApiUrl: URL + public wikimediaMobileApiUrl: URL - public VisualEditorApiUrl: URL - public apiUrl: URL public modulePath: string // only for reading public mobileModulePath: string - public webUrl: URL - public WikimediaDesktopApiUrl: URL - public WikimediaMobileApiUrl: URL #apiUrlDirector: ApiURLDirector #wikimediaDesktopUrlDirector: WikimediaDesktopURLDirector @@ -87,14 +85,16 @@ class MediaWiki { set actionApiPath(value: string) { if (value) { this.#actionApiPath = value - this.initApiURLDirector() + this.actionApiUrl = this.urlDirector.buildURL(this.#actionApiPath) + this.setVisualEditorURL() } } set restApiPath(value: string) { if (value) { this.#restApiPath = value - this.initApiURLDirector() + this.setWikimediaDesktopApiUrl() + this.setWikimediaMobileApiUrl() } } @@ -105,31 +105,33 @@ class MediaWiki { set wikiPath(value: string) { if (value) { this.#wikiPath = value - this.initApiURLDirector() + this.webUrl = this.urlDirector.buildURL(this.#wikiPath) } } set base(value: string) { if (value) { this.baseUrl = basicURLDirector.buildMediawikiBaseURL(value) - this.baseUrlDirector = new BaseURLDirector(this.baseUrl.href) - this.initMWApis() - this.initApiURLDirector() + this.urlDirector = new BaseURLDirector(this.baseUrl.href) + this.webUrl = this.urlDirector.buildURL(this.#wikiPath) + this.actionApiUrl = this.urlDirector.buildURL(this.#actionApiPath) + this.setWikimediaDesktopApiUrl() + this.setWikimediaMobileApiUrl() + this.setVisualEditorURL() + this.setModuleURL() + this.setMobileModuleUrl() } } set modulePathOpt(value: string) { - if (value) { + if (value !== undefined) { this.#modulePathOpt = value - if (this.baseUrlDirector) { - this.modulePath = this.baseUrlDirector.buildModuleURL(this.#modulePathOpt) - } else { - logger.error('Base url director should be specified first') - } - } else { - if (this.baseUrlDirector) { - this.modulePath = this.baseUrlDirector.buildModuleURL(this.#modulePathOpt) - } + } + + if (this.urlDirector) { + this.setModuleURL() + } else if (value) { + logger.error('Base url director should be specified first') } } @@ -170,7 +172,7 @@ class MediaWiki { public async hasWikimediaDesktopApi(): Promise { if (this.#hasWikimediaDesktopApi === null) { - this.#wikimediaDesktopUrlDirector = new WikimediaDesktopURLDirector(this.WikimediaDesktopApiUrl.href) + this.#wikimediaDesktopUrlDirector = new WikimediaDesktopURLDirector(this.wikimediaDesktopApiUrl.href) this.#hasWikimediaDesktopApi = await checkApiAvailability(this.#wikimediaDesktopUrlDirector.buildArticleURL(this.apiCheckArticleId)) return this.#hasWikimediaDesktopApi } @@ -179,7 +181,7 @@ class MediaWiki { public async hasWikimediaMobileApi(): Promise { if (this.#hasWikimediaMobileApi === null) { - this.#wikimediaMobileUrlDirector = new WikimediaMobileURLDirector(this.WikimediaMobileApiUrl.href) + this.#wikimediaMobileUrlDirector = new WikimediaMobileURLDirector(this.wikimediaMobileApiUrl.href) this.#hasWikimediaMobileApi = await checkApiAvailability(this.#wikimediaMobileUrlDirector.buildArticleURL(this.apiCheckArticleId)) return this.#hasWikimediaMobileApi } @@ -188,7 +190,7 @@ class MediaWiki { public async hasVisualEditorApi(): Promise { if (this.#hasVisualEditorApi === null) { - this.#visualEditorURLDirector = new VisualEditorURLDirector(this.VisualEditorApiUrl.href) + this.#visualEditorURLDirector = new VisualEditorURLDirector(this.visualEditorApiUrl.href) this.#hasVisualEditorApi = await checkApiAvailability(this.#visualEditorURLDirector.buildArticleURL(this.apiCheckArticleId)) return this.#hasVisualEditorApi } @@ -214,29 +216,25 @@ class MediaWiki { return this.#hasCoordinates } - private initMWApis() { - this.WikimediaDesktopApiUrl = this.baseUrlDirector.buildWikimediaDesktopApiUrl(this.#restApiPath) - this.WikimediaMobileApiUrl = this.baseUrlDirector.buildWikimediaMobileApiUrl(this.#restApiPath) - this.mobileModulePath = this.baseUrlDirector.buildMobileModuleURL() - this.wikimediaDesktopUrlDirector = new WikimediaDesktopURLDirector(this.WikimediaDesktopApiUrl.href) - this.wikimediaMobileUrlDirector = new WikimediaMobileURLDirector(this.WikimediaMobileApiUrl.href) + private setWikimediaDesktopApiUrl() { + this.wikimediaDesktopApiUrl = this.urlDirector.buildWikimediaDesktopApiUrl(this.#restApiPath) + } + + private setWikimediaMobileApiUrl() { + this.wikimediaMobileApiUrl = this.urlDirector.buildWikimediaMobileApiUrl(this.#restApiPath) + } + + private setVisualEditorURL() { + this.#apiUrlDirector = new ApiURLDirector(this.actionApiUrl.href) + this.visualEditorApiUrl = this.#apiUrlDirector.buildVisualEditorURL() + } + + private setModuleURL() { + this.modulePath = this.urlDirector.buildModuleURL(this.#modulePathOpt) } - private initApiURLDirector() { - this.webUrl = this.baseUrlDirector.buildURL(this.#wikiPath) - this.actionApiUrl = this.baseUrlDirector.buildURL(this.#actionApiPath) - this.apiUrlDirector = new ApiURLDirector(this.actionApiUrl.href) - this.visualEditorApiUrl = this.apiUrlDirector.buildVisualEditorURL() - this.visualEditorURLDirector = new VisualEditorURLDirector(this.visualEditorApiUrl.href) - const baseUrlDirector = new BaseURLDirector(this.baseUrl.href) - this.webUrl = baseUrlDirector.buildURL(this.#wikiPath) - this.apiUrl = baseUrlDirector.buildURL(this.#apiActionPath) - this.#apiUrlDirector = new ApiURLDirector(this.apiUrl.href) - this.VisualEditorApiUrl = this.#apiUrlDirector.buildVisualEditorURL() - this.WikimediaDesktopApiUrl = baseUrlDirector.buildWikimediaDesktopApiUrl(this.#restApiPath) - this.WikimediaMobileApiUrl = baseUrlDirector.buildWikimediaMobileApiUrl(this.#restApiPath) - this.modulePath = baseUrlDirector.buildModuleURL(this._modulePathOpt) - this.mobileModulePath = baseUrlDirector.buildMobileModuleURL() + private setMobileModuleUrl() { + this.mobileModulePath = this.urlDirector.buildMobileModuleURL() } public async login(downloader: Downloader) { diff --git a/test/e2e/forceRender.test.ts b/test/e2e/forceRender.test.ts index ef1d8860..e8a6512d 100644 --- a/test/e2e/forceRender.test.ts +++ b/test/e2e/forceRender.test.ts @@ -53,7 +53,7 @@ describe('forceRender', () => { try { await mwoffliner.execute({ ...parameters, forceRender }) } catch (err) { - expect(err.message).toEqual('Unable to find specific API end-point to retrieve article HTML') + expect(err.message).toEqual('Unknown renderName for specific mode: unknownRenderName') } }) }) diff --git a/test/testRenders.ts b/test/testRenders.ts index f0bb23f8..68062359 100644 --- a/test/testRenders.ts +++ b/test/testRenders.ts @@ -66,6 +66,7 @@ export async function testRenders(parameters: Parameters, callback, renderersLis logger.error(err.message) return } + } } export async function testAllRenders(parameters: Parameters, callback) { diff --git a/test/unit/saveArticles.test.ts b/test/unit/saveArticles.test.ts index 4bce3eb4..ae3baee8 100644 --- a/test/unit/saveArticles.test.ts +++ b/test/unit/saveArticles.test.ts @@ -122,7 +122,7 @@ describe('saveArticles', () => { test(`Load main page and check that it is without header using ${renderer} renderer`, async () => { const { downloader, dump } = await setupScrapeClasses({ mwUrl: 'https://en.wikivoyage.org' }) // en wikipedia - await downloader.setBaseUrlsDirectors(rendererInstance, rendererInstance) + downloader.setUrlsDirectors(rendererInstance, rendererInstance) const articleId = 'Main_Page' const articleUrl = downloader.getArticleUrl(articleId) const _articleDetailsRet = await downloader.getArticleDetailsIds([articleId]) @@ -148,7 +148,7 @@ describe('saveArticles', () => { test(`--customFlavour using ${renderer} renderer`, async () => { const { downloader, dump } = await setupScrapeClasses({ format: 'nopic' }) // en wikipedia - await downloader.setBaseUrlsDirectors(rendererInstance, rendererInstance) + downloader.setUrlsDirectors(rendererInstance, rendererInstance) class CustomFlavour implements CustomProcessor { // eslint-disable-next-line @typescript-eslint/no-unused-vars public async shouldKeepArticle(articleId: string, doc: Document) {