Skip to content

Commit

Permalink
Merge pull request #1854 from openzim/feature/1830-rearchitecturing/urls-builders
Browse files Browse the repository at this point in the history

New URLs builders for Downloader and Mediawiki classes
  • Loading branch information
kelson42 committed Aug 1, 2023
2 parents e99aa3a + 24b2564 commit 1cd6963
Show file tree
Hide file tree
Showing 28 changed files with 650 additions and 90 deletions.
12 changes: 12 additions & 0 deletions .editorconfig
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# EditorConfig is awesome: https://EditorConfig.org

# top-most EditorConfig file
root = true

[*]
indent_style = space
indent_size = 2
end_of_line = lf
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,5 @@ npm-debug.log
.env
.nyc_output
coverage
mwo-test-*
mwo-test-*
.vscode
5 changes: 3 additions & 2 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -115,8 +115,9 @@ Advices for debugging mwoffliner issues:
output itself. For Wikimedia wikis you can easily generate and
view the output in your browser using the Parsoid REST interface.
Example URLs:
* Mobile (most pages):
https://en.wikivoyage.org/api/rest_v1/page/mobile-sections/Hot_springs
* <del>Mobile (most pages):
https://en.wikivoyage.org/api/rest_v1/page/mobile-sections/Hot_springs</del>
> :warning: **DEPRECATED**: Mobile Content Service endpoints are now deprecated.
* Desktop (main page):
https://es.wikipedia.org/api/rest_v1/page/html/Espa%C3%B1a
3. If the error is with the Parsoid output
Expand Down
73 changes: 44 additions & 29 deletions src/Downloader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ import https from 'https'

import {
normalizeMwResponse,
objToQueryString,
DB_ERROR,
WEAK_ETAG_REGEX,
renderArticle,
Expand All @@ -31,6 +30,10 @@ import S3 from './S3.js'
import { Dump } from './Dump.js'
import * as logger from './Logger.js'
import MediaWiki from './MediaWiki.js'
import ApiURLDirector from './util/builders/url/api.director.js'
import DesktopURLDirector from './util/builders/url/desktop.director.js'
import VisualEditorURLDirector from './util/builders/url/visual-editor.director.js'
import basicURLDirector from './util/builders/url/basic.director.js'

const imageminOptions = new Map()
imageminOptions.set('default', new Map())
Expand Down Expand Up @@ -75,7 +78,6 @@ export interface MWCapabilities {
veApiAvailable: boolean
coordinatesAvailable: boolean
desktopRestApiAvailable: boolean
mobileRestApiAvailable: boolean
}

export const defaultStreamRequestOptions: AxiosRequestConfig = {
Expand All @@ -90,6 +92,9 @@ export const defaultStreamRequestOptions: AxiosRequestConfig = {
method: 'GET',
}

/**
* Common interface to download the content
*/
class Downloader {
public readonly mw: MediaWiki
public loginCookie = ''
Expand All @@ -99,6 +104,9 @@ class Downloader {
public cssDependenceUrls: KVS<boolean> = {}
public readonly webp: boolean = false
public readonly requestTimeout: number
public arrayBufferRequestOptions: AxiosRequestConfig
public jsonRequestOptions: AxiosRequestConfig
public streamRequestOptions: AxiosRequestConfig

private readonly uaString: string
private activeRequests = 0
Expand All @@ -108,9 +116,7 @@ class Downloader {
private readonly optimisationCacheUrl: string
private s3: S3
private mwCapabilities: MWCapabilities // todo move to MW
public arrayBufferRequestOptions: AxiosRequestConfig
public jsonRequestOptions: AxiosRequestConfig
public streamRequestOptions: AxiosRequestConfig
private apiUrlDirector: ApiURLDirector

constructor({ mw, uaString, speed, reqTimeout, optimisationCacheUrl, s3, webp, backoffOptions }: DownloaderOpts) {
this.mw = mw
Expand All @@ -127,8 +133,8 @@ class Downloader {
veApiAvailable: false,
coordinatesAvailable: true,
desktopRestApiAvailable: false,
mobileRestApiAvailable: false,
}
this.apiUrlDirector = new ApiURLDirector(mw.apiUrl.href)

this.backoffOptions = {
strategy: new backoff.ExponentialStrategy(),
Expand Down Expand Up @@ -214,15 +220,17 @@ class Downloader {
}

public async setBaseUrls() {
this.baseUrl = this.mwCapabilities.mobileRestApiAvailable
? this.mw.mobileRestApiUrl.href
: this.mwCapabilities.desktopRestApiAvailable
? this.mw.desktopRestApiUrl.href
: this.mwCapabilities.veApiAvailable
? this.mw.veApiUrl.href
: undefined

this.baseUrlForMainPage = this.mwCapabilities.desktopRestApiAvailable ? this.mw.desktopRestApiUrl.href : this.mwCapabilities.veApiAvailable ? this.mw.veApiUrl.href : undefined
//* Objects order in array matters!
this.baseUrl = basicURLDirector.buildDownloaderBaseUrl([
{ condition: this.mwCapabilities.desktopRestApiAvailable, value: this.mw.desktopRestApiUrl.href },
{ condition: this.mwCapabilities.veApiAvailable, value: this.mw.veApiUrl.href },
])

//* Objects order in array matters!
this.baseUrlForMainPage = basicURLDirector.buildDownloaderBaseUrl([
{ condition: this.mwCapabilities.desktopRestApiAvailable, value: this.mw.desktopRestApiUrl.href },
{ condition: this.mwCapabilities.veApiAvailable, value: this.mw.veApiUrl.href },
])

logger.log('Base Url: ', this.baseUrl)
logger.log('Base Url for Main Page: ', this.baseUrlForMainPage)
Expand All @@ -241,20 +249,22 @@ class Downloader {
}

public async checkCapabilities(testArticleId = 'MediaWiki:Sidebar'): Promise<void> {
const desktopUrlDirector = new DesktopURLDirector(this.mw.desktopRestApiUrl.href)
const visualEditorURLDirector = new VisualEditorURLDirector(this.mw.veApiUrl.href)

// By default check all API's responses and set the capabilities
// accordingly. We need to set a default page (always there because
// installed per default) to request the REST API, otherwise it would
// fail the check.
this.mwCapabilities.mobileRestApiAvailable = await this.checkApiAvailabilty(this.mw.getMobileRestApiArticleUrl(testArticleId))
this.mwCapabilities.desktopRestApiAvailable = await this.checkApiAvailabilty(this.mw.getDesktopRestApiArticleUrl(testArticleId))
this.mwCapabilities.veApiAvailable = await this.checkApiAvailabilty(this.mw.getVeApiArticleUrl(testArticleId))
this.mwCapabilities.desktopRestApiAvailable = await this.checkApiAvailabilty(desktopUrlDirector.buildArticleURL(testArticleId))
this.mwCapabilities.veApiAvailable = await this.checkApiAvailabilty(visualEditorURLDirector.buildArticleURL(testArticleId))
this.mwCapabilities.apiAvailable = await this.checkApiAvailabilty(this.mw.apiUrl.href)

// Coordinate fetching
const reqOpts = objToQueryString({
...this.getArticleQueryOpts(),
})
const resp = await this.getJSON<MwApiResponse>(`${this.mw.apiUrl.href}${reqOpts}`)
const reqOpts = this.getArticleQueryOpts()

const resp = await this.getJSON<MwApiResponse>(this.apiUrlDirector.buildQueryURL(reqOpts))

const isCoordinateWarning = resp.warnings && resp.warnings.query && (resp.warnings.query['*'] || '').includes('coordinates')
if (isCoordinateWarning) {
logger.info('Coordinates not available on this wiki')
Expand All @@ -266,8 +276,8 @@ class Downloader {
return etag && etag.replace(WEAK_ETAG_REGEX, '')
}

public query(query: string): KVS<any> {
return this.getJSON(this.mw.getApiQueryUrl(query))
public query(): KVS<any> {
return this.getJSON(this.apiUrlDirector.buildSiteInfoQueryURL())
}

public async getArticleDetailsIds(articleIds: string[], shouldGetThumbnail = false): Promise<QueryMwRet> {
Expand All @@ -287,9 +297,11 @@ class Downloader {
: {}),
...(continuation || {}),
}
const queryString = objToQueryString(queryOpts)
const reqUrl = this.mw.getApiQueryUrl(queryString)

const reqUrl = this.apiUrlDirector.buildQueryURL(queryOpts)

const resp = await this.getJSON<MwApiResponse>(reqUrl)

Downloader.handleMWWarningsAndErrors(resp)

let processedResponse = resp.query ? normalizeMwResponse(resp.query) : {}
Expand All @@ -312,6 +324,7 @@ class Downloader {
let queryContinuation: QueryContinueOpts
let finalProcessedResp: QueryMwRet
let gCont: string = null

while (true) {
const queryOpts: KVS<any> = {
...this.getArticleQueryOpts(),
Expand All @@ -337,8 +350,7 @@ class Downloader {
queryOpts.rdcontinue = queryContinuation?.redirects?.rdcontinue ?? queryOpts.rdcontinue
}

const queryString = objToQueryString(queryOpts)
const reqUrl = this.mw.getApiQueryUrl(queryString)
const reqUrl = this.apiUrlDirector.buildQueryURL(queryOpts)

const resp = await this.getJSON<MwApiResponse>(reqUrl)
Downloader.handleMWWarningsAndErrors(resp)
Expand Down Expand Up @@ -652,8 +664,11 @@ class Downloader {
}

private async getSubCategories(articleId: string, continueStr = ''): Promise<Array<{ pageid: number; ns: number; title: string }>> {
const { query, continue: cont } = await this.getJSON<any>(this.mw.subCategoriesApiUrl(articleId, continueStr))
const apiUrlDirector = new ApiURLDirector(this.mw.apiUrl.href)

const { query, continue: cont } = await this.getJSON<any>(apiUrlDirector.buildSubCategoriesURL(articleId, continueStr))
const items = query.categorymembers.filter((a: any) => a && a.title)

if (cont && cont.cmcontinue) {
const nextItems = await this.getSubCategories(articleId, cont.cmcontinue)
return items.concat(nextItems)
Expand Down
6 changes: 5 additions & 1 deletion src/Dump.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import * as domino from 'domino'
import * as logger from './Logger.js'
import Downloader from './Downloader.js'
import { getStringsForLang } from './util/index.js'
import WebURLDirector from './util/builders/url/web.director.js'

interface DumpOpts {
tmpDir: string
Expand Down Expand Up @@ -214,7 +215,10 @@ export class Dump {

/* Push Mediawiki:Offline.css (at the end) */
// TODO: Weak URL (might fail in a number of cases where the wiki path is not like on Wikipedia)
const offlineCssUrl = downloader.mw.getWebArticleUrlRaw('Mediawiki:offline.css')
const webUrlDirector = new WebURLDirector(downloader.mw.webUrl.href)

const offlineCssUrl = webUrlDirector.buildArticleRawURL('Mediawiki:offline.css')

if (await downloader.canGetUrl(offlineCssUrl)) {
sheetUrls.push(offlineCssUrl)
}
Expand Down
72 changes: 21 additions & 51 deletions src/MediaWiki.ts
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
import urlParser from 'url'
import * as pathParser from 'path'
import * as logger from './Logger.js'
import * as util from './util/index.js'
import * as domino from 'domino'
import type Downloader from './Downloader.js'
import { ensureTrailingChar, DEFAULT_WIKI_PATH } from './util/index.js'
import { DEFAULT_WIKI_PATH } from './util/index.js'
import axios from 'axios'
import qs from 'querystring'
import semver from 'semver'
import basicURLDirector from './util/builders/url/basic.director.js'
import BaseURLDirector from './util/builders/url/base.director.js'
import ApiURLDirector from './util/builders/url/api.director.js'

class MediaWiki {
public metaData: MWMetaData
Expand All @@ -28,35 +30,37 @@ class MediaWiki {
private readonly password: string
private readonly apiPath: string
private readonly domain: string
private readonly articleApiUrlBase: string
private apiUrlDirector: ApiURLDirector

constructor(config: MWConfig) {
this.domain = config.domain || ''
this.username = config.username
this.password = config.password
this.getCategories = config.getCategories

this.baseUrl = new URL(ensureTrailingChar(config.base, '/'))
this.baseUrl = basicURLDirector.buildMediawikiBaseURL(config.base)

this.apiPath = config.apiPath ?? 'w/api.php'
this.wikiPath = config.wikiPath ?? DEFAULT_WIKI_PATH

this.webUrl = new URL(this.wikiPath, this.baseUrl)
this.apiUrl = new URL(`${this.apiPath}?`, this.baseUrl)
const baseUrlDirector = new BaseURLDirector(this.baseUrl.href)

this.veApiUrl = new URL(`${this.apiUrl.href}action=visualeditor&mobileformat=html&format=json&paction=parse&page=`)
this.webUrl = baseUrlDirector.buildURL(this.wikiPath)
this.apiUrl = baseUrlDirector.buildURL(this.apiPath)

this.restApiUrl = new URL(ensureTrailingChar(new URL(config.restApiPath ?? 'api/rest_v1', this.baseUrl.href).toString(), '/'))
this.mobileRestApiUrl = new URL(ensureTrailingChar(new URL(config.restApiPath ?? 'api/rest_v1/page/mobile-sections', this.baseUrl.href).toString(), '/'))
this.desktopRestApiUrl = new URL(ensureTrailingChar(new URL(config.restApiPath ?? 'api/rest_v1/page/html', this.baseUrl.href).toString(), '/'))
this.apiUrlDirector = new ApiURLDirector(this.apiUrl.href)

this.modulePath = `${urlParser.resolve(this.baseUrl.href, config.modulePath ?? 'w/load.php')}?`
this.articleApiUrlBase = `${this.apiUrl.href}action=parse&format=json&prop=${encodeURI('modules|jsconfigvars|headhtml')}&page=`
this.veApiUrl = this.apiUrlDirector.buildVisualEditorURL()

this.restApiUrl = baseUrlDirector.buildRestApiURL(config.restApiPath)
this.desktopRestApiUrl = baseUrlDirector.buildDesktopRestApiURL(config.restApiPath)

this.modulePath = baseUrlDirector.buildModuleURL(config.modulePath)
}

public async login(downloader: Downloader) {
if (this.username && this.password) {
let url = this.apiUrl.href
let url = this.apiUrl.href + '?'

// Add domain if configured
if (this.domain) {
Expand Down Expand Up @@ -94,43 +98,9 @@ class MediaWiki {
}
}

// In all the url methods below:
// * encodeURIComponent is mandatory for languages with illegal letters for uri (fa.wikipedia.org)
// * encodeURI is mandatory to encode the pipes '|' but the '&' and '=' must not be encoded
public siteInfoUrl() {
return `${this.apiUrl.href}action=query&meta=siteinfo&format=json`
}

public articleApiUrl(articleId: string): string {
return `${this.articleApiUrlBase}${encodeURIComponent(articleId)}`
}

public subCategoriesApiUrl(articleId: string, continueStr = '') {
return `${this.apiUrl.href}action=query&list=categorymembers&cmtype=subcat&cmlimit=max&format=json&cmtitle=${encodeURIComponent(articleId)}&cmcontinue=${continueStr}`
}

public getVeApiArticleUrl(articleId: string): string {
return `${this.veApiUrl.href}${encodeURIComponent(articleId)}`
}

public getDesktopRestApiArticleUrl(articleId: string): string {
return `${this.desktopRestApiUrl.href}${encodeURIComponent(articleId)}`
}

public getMobileRestApiArticleUrl(articleId: string): string {
return `${this.mobileRestApiUrl.href}${encodeURIComponent(articleId)}`
}

public getApiQueryUrl(query = ''): string {
return `${this.apiUrl.href}${query}`
}

public getWebArticleUrlRaw(articleId: string): string {
return `${this.webUrl.href}?title=${encodeURIComponent(articleId)}&action=raw`
}

public async getNamespaces(addNamespaces: number[], downloader: Downloader) {
const url = `${this.apiUrl.href}action=query&meta=siteinfo&siprop=namespaces|namespacealiases&format=json`
const url = this.apiUrlDirector.buildNamespacesURL()

const json: any = await downloader.getJSON(url)
;['namespaces', 'namespacealiases'].forEach((type) => {
const entries = json.query[type]
Expand Down Expand Up @@ -234,8 +204,8 @@ class MediaWiki {

public async getSiteInfo(downloader: Downloader) {
logger.log('Getting site info...')
const query = 'action=query&meta=siteinfo&format=json&siprop=general|namespaces|statistics|variables|category|wikidesc'
const body = await downloader.query(query)
const body = await downloader.query()

const entries = body.query.general

// Checking mediawiki version
Expand Down
Loading

0 comments on commit 1cd6963

Please sign in to comment.