From 132d56111e8e57f311ec0370c7f268821cc3138c Mon Sep 17 00:00:00 2001 From: Vadim Kovalenko Date: Thu, 10 Aug 2023 20:49:35 +0300 Subject: [PATCH] Replace mwCapabilities to Mediawiki class (partial impl) --- src/Downloader.ts | 67 ++------------- src/MediaWiki.ts | 82 ++++++++++++++++--- src/mwoffliner.lib.ts | 4 +- src/util/builders/url/basic.director.ts | 2 + src/util/mw-api.ts | 10 +++ test/unit/downloader.test.ts | 1 - test/unit/mwApi.test.ts | 2 +- test/unit/saveArticles.test.ts | 2 - .../unit/treatments/article.treatment.test.ts | 1 - test/unit/urlRewriting.test.ts | 2 +- test/util.ts | 2 +- 11 files changed, 93 insertions(+), 82 deletions(-) diff --git a/src/Downloader.ts b/src/Downloader.ts index 536794316..a8fe95e1d 100644 --- a/src/Downloader.ts +++ b/src/Downloader.ts @@ -1,4 +1,3 @@ -import * as path from 'path' import deepmerge from 'deepmerge' import * as backoff from 'backoff' import { config } from './config.js' @@ -19,8 +18,6 @@ import S3 from './S3.js' import * as logger from './Logger.js' import MediaWiki from './MediaWiki.js' import ApiURLDirector from './util/builders/url/api.director.js' -import DesktopURLDirector from './util/builders/url/desktop.director.js' -import VisualEditorURLDirector from './util/builders/url/visual-editor.director.js' import basicURLDirector from './util/builders/url/basic.director.js' import urlHelper from './util/url.helper.js' @@ -62,13 +59,6 @@ interface BackoffOptions { backoffHandler: (number: number, delay: number, error?: any) => void } -export interface MWCapabilities { - apiAvailable: boolean - veApiAvailable: boolean - coordinatesAvailable: boolean - desktopRestApiAvailable: boolean -} - export const defaultStreamRequestOptions: AxiosRequestConfig = { headers: { accept: 'application/octet-stream', @@ -103,7 +93,6 @@ class Downloader { private readonly backoffOptions: BackoffOptions private readonly optimisationCacheUrl: string private s3: S3 - public mwCapabilities: MWCapabilities // TODO: move to MW, 
temporary open the property private apiUrlDirector: ApiURLDirector constructor({ mw, uaString, speed, reqTimeout, optimisationCacheUrl, s3, webp, backoffOptions }: DownloaderOpts) { @@ -116,12 +105,6 @@ class Downloader { this.optimisationCacheUrl = optimisationCacheUrl this.webp = webp this.s3 = s3 - this.mwCapabilities = { - apiAvailable: false, - veApiAvailable: false, - coordinatesAvailable: true, - desktopRestApiAvailable: false, - } this.apiUrlDirector = new ApiURLDirector(mw.apiUrl.href) this.backoffOptions = { @@ -187,14 +170,14 @@ class Downloader { public async setBaseUrls() { //* Objects order in array matters! this.baseUrl = basicURLDirector.buildDownloaderBaseUrl([ - { condition: this.mwCapabilities.desktopRestApiAvailable, value: this.mw.desktopRestApiUrl.href }, - { condition: this.mwCapabilities.veApiAvailable, value: this.mw.veApiUrl.href }, + { condition: await this.mw.hasDesktopRestApi(), value: this.mw.desktopRestApiUrl.href }, + { condition: await this.mw.hasVeApi(), value: this.mw.veApiUrl.href }, ]) //* Objects order in array matters! this.baseUrlForMainPage = basicURLDirector.buildDownloaderBaseUrl([ - { condition: this.mwCapabilities.desktopRestApiAvailable, value: this.mw.desktopRestApiUrl.href }, - { condition: this.mwCapabilities.veApiAvailable, value: this.mw.veApiUrl.href }, + { condition: await this.mw.hasDesktopRestApi(), value: this.mw.desktopRestApiUrl.href }, + { condition: await this.mw.hasVeApi(), value: this.mw.veApiUrl.href }, ]) logger.log('Base Url: ', this.baseUrl) @@ -203,40 +186,6 @@ class Downloader { if (!this.baseUrl || !this.baseUrlForMainPage) throw new Error('Unable to find appropriate API end-point to retrieve article HTML') } - public async checkApiAvailabilty(url: string): Promise { - try { - const resp = await axios.get(url, { headers: { cookie: this.loginCookie } }) - // Check for hostname is for domain name in cases of redirects. 
- return resp.status === 200 && !resp.headers['mediawiki-api-error'] && path.dirname(url) === path.dirname(resp.request.res.responseUrl) - } catch (err) { - return false - } - } - - public async checkCapabilities(testArticleId = 'MediaWiki:Sidebar'): Promise { - const desktopUrlDirector = new DesktopURLDirector(this.mw.desktopRestApiUrl.href) - const visualEditorURLDirector = new VisualEditorURLDirector(this.mw.veApiUrl.href) - - // By default check all API's responses and set the capabilities - // accordingly. We need to set a default page (always there because - // installed per default) to request the REST API, otherwise it would - // fail the check. - this.mwCapabilities.desktopRestApiAvailable = await this.checkApiAvailabilty(desktopUrlDirector.buildArticleURL(testArticleId)) - this.mwCapabilities.veApiAvailable = await this.checkApiAvailabilty(visualEditorURLDirector.buildArticleURL(testArticleId)) - this.mwCapabilities.apiAvailable = await this.checkApiAvailabilty(this.mw.apiUrl.href) - - // Coordinate fetching - const reqOpts = this.getArticleQueryOpts() - - const resp = await this.getJSON(this.apiUrlDirector.buildQueryURL(reqOpts)) - - const isCoordinateWarning = resp.warnings && resp.warnings.query && (resp.warnings.query['*'] || '').includes('coordinates') - if (isCoordinateWarning) { - logger.info('Coordinates not available on this wiki') - this.mwCapabilities.coordinatesAvailable = false - } - } - public removeEtagWeakPrefix(etag: string): string { return etag && etag.replace(WEAK_ETAG_REGEX, '') } @@ -253,7 +202,7 @@ class Downloader { const queryOpts: KVS = { ...this.getArticleQueryOpts(shouldGetThumbnail, true), titles: articleIds.join('|'), - ...(this.mwCapabilities.coordinatesAvailable ? { colimit: 'max' } : {}), + ...(this.mw.hasCoordinatesApi ? { colimit: 'max' } : {}), ...(this.mw.getCategories ? 
{ cllimit: 'max', @@ -293,7 +242,7 @@ class Downloader { while (true) { const queryOpts: KVS = { ...this.getArticleQueryOpts(), - ...(this.mwCapabilities.coordinatesAvailable ? { colimit: 'max' } : {}), + ...(this.mw.hasCoordinatesApi ? { colimit: 'max' } : {}), ...(this.mw.getCategories ? { cllimit: 'max', @@ -441,9 +390,7 @@ class Downloader { return { action: 'query', format: 'json', - prop: `redirects|revisions${includePageimages ? '|pageimages' : ''}${this.mwCapabilities.coordinatesAvailable ? '|coordinates' : ''}${ - this.mw.getCategories ? '|categories' : '' - }`, + prop: `redirects|revisions${includePageimages ? '|pageimages' : ''}${this.mw.hasCoordinatesApi() ? '|coordinates' : ''}${this.mw.getCategories ? '|categories' : ''}`, rdlimit: 'max', rdnamespace: validNamespaceIds.join('|'), redirects: redirects ? true : undefined, diff --git a/src/MediaWiki.ts b/src/MediaWiki.ts index f9f582ebe..67325a2bd 100644 --- a/src/MediaWiki.ts +++ b/src/MediaWiki.ts @@ -10,27 +10,65 @@ import semver from 'semver' import basicURLDirector from './util/builders/url/basic.director.js' import BaseURLDirector from './util/builders/url/base.director.js' import ApiURLDirector from './util/builders/url/api.director.js' +import { checkApiAvailabilty } from './util/mw-api.js' class MediaWiki { public metaData: MWMetaData public readonly baseUrl: URL - public readonly modulePath: string - public readonly webUrl: URL - public readonly apiUrl: URL - public readonly veApiUrl: URL - public readonly restApiUrl: URL - public readonly mobileRestApiUrl: URL - public readonly desktopRestApiUrl: URL public readonly getCategories: boolean + public readonly modulePathConfig: string public readonly namespaces: MWNamespaces = {} public readonly namespacesToMirror: string[] = [] private readonly wikiPath: string + private readonly restApiPath: string private readonly username: string private readonly password: string private readonly apiPath: string private readonly domain: string private 
apiUrlDirector: ApiURLDirector + private baseUrlDirector: BaseURLDirector + + public veApiUrl: URL + public restApiUrl: URL + public apiUrl: URL + public modulePath: string + public webUrl: URL + public desktopRestApiUrl: URL + + public hasDesktopRestApi = async function (loginCookie?: string, testArticleId?: string): Promise { + const desktopRestApiAvailable = await checkApiAvailabilty(this.desktopRestApiUrl, loginCookie) + this.hasDesktopRestApi = async function (): Promise { + return desktopRestApiAvailable + } + } + + public hasVeApi = async function (loginCookie?: string, testArticleId?: string): Promise { + const veRestApiAvailable = await checkApiAvailabilty(this.veApiUrl, loginCookie) + this.hasVeApi = async function (): Promise { + return veRestApiAvailable + } + } + + public hasCoordinatesApi = async function (downloader?: Downloader): Promise { + const validNamespaceIds = this.namespacesToMirror.map((ns) => this.namespaces[ns].num) + const reqOpts = { + action: 'query', + format: 'json', + prop: `redirects|revisions${(await this.hasCoordinatesApi()) ? '|coordinates' : ''}${this.getCategories ? '|categories' : ''}`, + rdlimit: 'max', + rdnamespace: validNamespaceIds.join('|'), + } + if (downloader) { + const resp = await downloader.getJSON(this.apiUrlDirector.buildQueryURL(reqOpts)) + const isCoordinateWarning = resp.warnings && resp.warnings.query && (resp.warnings.query['*'] || '').includes('coordinates') + if (isCoordinateWarning) { + logger.info('Coordinates not available on this wiki') + return false + } + } + return true + } constructor(config: MWConfig) { this.domain = config.domain || '' @@ -42,20 +80,30 @@ class MediaWiki { this.apiPath = config.apiPath ?? 'w/api.php' this.wikiPath = config.wikiPath ?? 
DEFAULT_WIKI_PATH + this.restApiPath = config.restApiPath + this.modulePathConfig = config.modulePath const baseUrlDirector = new BaseURLDirector(this.baseUrl.href) this.webUrl = baseUrlDirector.buildURL(this.wikiPath) this.apiUrl = baseUrlDirector.buildURL(this.apiPath) - this.apiUrlDirector = new ApiURLDirector(this.apiUrl.href) - this.veApiUrl = this.apiUrlDirector.buildVisualEditorURL() - this.restApiUrl = baseUrlDirector.buildRestApiURL(config.restApiPath) - this.desktopRestApiUrl = baseUrlDirector.buildDesktopRestApiURL(config.restApiPath) + this.restApiUrl = baseUrlDirector.buildRestApiURL(this.restApiPath) + this.desktopRestApiUrl = baseUrlDirector.buildDesktopRestApiURL(this.restApiPath) - this.modulePath = baseUrlDirector.buildModuleURL(config.modulePath) + this.modulePath = baseUrlDirector.buildModuleURL(this.modulePathConfig) + + /* + this.restApiUrl = this.baseUrlDirector.buildRestApiURL(this.restApiPath) + this.apiUrl = this.baseUrlDirector.buildURL(this.apiPath) + this.modulePath = this.baseUrlDirector.buildModuleURL(this.modulePathConfig) + this.webUrl = this.baseUrlDirector.buildURL(this.wikiPath) + this.desktopRestApiUrl = this.baseUrlDirector.buildDesktopRestApiURL(config.restApiPath) + + this.veApiUrl = this.apiUrlDirector.buildVisualEditorURL() + */ } public async login(downloader: Downloader) { @@ -85,12 +133,13 @@ class MediaWiki { }, method: 'POST', }) - .then((resp) => { + .then(async (resp) => { if (resp.data.login.result !== 'Success') { throw new Error('Login Failed') } downloader.loginCookie = resp.headers['set-cookie'].join(';') + await this.checkCapabilities(resp.headers['set-cookie'].join(';')) }) .catch((err) => { throw err @@ -296,6 +345,13 @@ class MediaWiki { return mwMetaData } + + // Set capability properties, used while mw.login + private async checkCapabilities(loginCookie?: string, testArticleId = 'MediaWiki:Sidebar'): Promise { + await this.hasDesktopRestApi(loginCookie, testArticleId) + await this.hasVeApi(loginCookie, 
testArticleId) + await this.hasCoordinatesApi() + } } export default MediaWiki diff --git a/src/mwoffliner.lib.ts b/src/mwoffliner.lib.ts index 84ee9186c..db008a56f 100644 --- a/src/mwoffliner.lib.ts +++ b/src/mwoffliner.lib.ts @@ -17,6 +17,7 @@ import semver from 'semver' import * as path from 'path' import * as QueryStringParser from 'querystring' import { ZimArticle, ZimCreator } from '@openzim/libzim' +import { checkApiAvailabilty } from './util/mw-api.js' import { MAX_CPU_CORES, @@ -205,12 +206,11 @@ async function execute(argv: any) { if (customMainPage) { mainPage = customMainPage const mainPageUrl = mw.webUrl + encodeURIComponent(mainPage) - if (!(await downloader.checkApiAvailabilty(mainPageUrl))) { + if (!(await checkApiAvailabilty(mainPageUrl))) { throw new Error(`customMainPage doesn't return 200 status code for url ${mainPageUrl}`) } } - await downloader.checkCapabilities(mwMetaData.mainPage) await downloader.setBaseUrls() const redisStore = new RedisStore(argv.redis || config.defaults.redisPath) diff --git a/src/util/builders/url/basic.director.ts b/src/util/builders/url/basic.director.ts index 1248a7e2c..b97dce3f7 100644 --- a/src/util/builders/url/basic.director.ts +++ b/src/util/builders/url/basic.director.ts @@ -10,6 +10,8 @@ class BasicURLDirector { buildDownloaderBaseUrl(conditions: DownloaderBaseUrlConditions): string | undefined { let baseUrl: string + console.log('Args for buildDownloaderBaseUrl ', conditions) + for (const { condition, value } of conditions) { if (condition) { baseUrl = value diff --git a/src/util/mw-api.ts b/src/util/mw-api.ts index 28d32d5d5..a77f5fe7a 100644 --- a/src/util/mw-api.ts +++ b/src/util/mw-api.ts @@ -3,6 +3,7 @@ import deepmerge from 'deepmerge' import * as logger from '../Logger.js' import Downloader from '../Downloader.js' import Timer from './Timer.js' +import axios from 'axios' export async function getArticlesByIds(articleIds: string[], downloader: Downloader, redisStore: RS, log = true): Promise { let from 
= 0 @@ -253,3 +254,12 @@ export function mwRetToArticleDetail(obj: QueryMwRet): KVS { } return ret } + +export async function checkApiAvailabilty(url: string, loginCookie = ''): Promise { + try { + const resp = await axios.get(url, { maxRedirects: 0, headers: { cookie: loginCookie } }) + return resp.status === 200 && !resp.headers['mediawiki-api-error'] + } catch (err) { + return false + } +} diff --git a/test/unit/downloader.test.ts b/test/unit/downloader.test.ts index 8c78a531a..44062f376 100644 --- a/test/unit/downloader.test.ts +++ b/test/unit/downloader.test.ts @@ -33,7 +33,6 @@ describe('Downloader class', () => { downloader = new Downloader({ mw, uaString: `${config.userAgent} (contact@kiwix.org)`, speed: 1, reqTimeout: 1000 * 60, webp: true, optimisationCacheUrl: '' }) await mw.getMwMetaData(downloader) - await downloader.checkCapabilities() await downloader.setBaseUrls() }) diff --git a/test/unit/mwApi.test.ts b/test/unit/mwApi.test.ts index 594db4a25..cb622fbf1 100644 --- a/test/unit/mwApi.test.ts +++ b/test/unit/mwApi.test.ts @@ -25,7 +25,7 @@ describe('mwApi', () => { downloader = new Downloader({ mw, uaString: `${config.userAgent} (contact@kiwix.org)`, speed: 1, reqTimeout: 1000 * 60, webp: false, optimisationCacheUrl: '' }) await mw.getMwMetaData(downloader) - await downloader.checkCapabilities() + // await downloader.checkCapabilities() await mw.getNamespaces([], downloader) }) diff --git a/test/unit/saveArticles.test.ts b/test/unit/saveArticles.test.ts index 26d60490f..a54275106 100644 --- a/test/unit/saveArticles.test.ts +++ b/test/unit/saveArticles.test.ts @@ -17,7 +17,6 @@ describe('saveArticles', () => { test('Article html processing', async () => { const { downloader, mw, dump } = await setupScrapeClasses() // en wikipedia - await downloader.checkCapabilities() await downloader.setBaseUrls() const _articlesDetail = await downloader.getArticleDetailsIds(['London']) const articlesDetail = mwRetToArticleDetail(_articlesDetail) @@ -130,7 +129,6 @@ 
describe('saveArticles', () => { test('--customFlavour', async () => { const { downloader, mw, dump } = await setupScrapeClasses({ format: 'nopic' }) // en wikipedia - await downloader.checkCapabilities() await downloader.setBaseUrls() class CustomFlavour implements CustomProcessor { // eslint-disable-next-line @typescript-eslint/no-unused-vars diff --git a/test/unit/treatments/article.treatment.test.ts b/test/unit/treatments/article.treatment.test.ts index 4233ad715..97c89a9d7 100644 --- a/test/unit/treatments/article.treatment.test.ts +++ b/test/unit/treatments/article.treatment.test.ts @@ -16,7 +16,6 @@ describe('ArticleTreatment', () => { test('Article html processing', async () => { const { downloader, mw, dump } = await setupScrapeClasses() // en wikipedia - await downloader.checkCapabilities() await downloader.setBaseUrls() const _articlesDetail = await downloader.getArticleDetailsIds(['London']) const articlesDetail = mwRetToArticleDetail(_articlesDetail) diff --git a/test/unit/urlRewriting.test.ts b/test/unit/urlRewriting.test.ts index c31e61605..ed2901b6a 100644 --- a/test/unit/urlRewriting.test.ts +++ b/test/unit/urlRewriting.test.ts @@ -138,7 +138,7 @@ describe('Styles', () => { await articleDetailXId.flush() await redisStore.redirectsXId.flush() const { downloader, mw, dump } = await setupScrapeClasses() // en wikipedia - await downloader.checkCapabilities() + // await downloader.checkCapabilities() await downloader.setBaseUrls() await getArticleIds(downloader, redisStore, mw, '', ['London', 'British_Museum', 'Natural_History_Museum,_London', 'Farnborough/Aldershot_built-up_area']) diff --git a/test/util.ts b/test/util.ts index 0a39a5018..66db24459 100644 --- a/test/util.ts +++ b/test/util.ts @@ -38,7 +38,7 @@ export async function setupScrapeClasses({ mwUrl = 'https://en.wikipedia.org', f const downloader = new Downloader({ mw, uaString: `${config.userAgent} (contact@kiwix.org)`, speed: 1, reqTimeout: 1000 * 60, webp: false, optimisationCacheUrl: '' 
}) await mw.getMwMetaData(downloader) - await downloader.checkCapabilities() + // await downloader.checkCapabilities() const dump = new Dump(format, {} as any, mw.metaData)