From dc5b53a78025fac278709edc7ec11bae1133b150 Mon Sep 17 00:00:00 2001 From: Emmanuel Engelhart Date: Mon, 15 Jul 2024 20:01:46 +0200 Subject: [PATCH 1/6] Remove temporary workaround for S3 webp mgmt --- src/Downloader.ts | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/src/Downloader.ts b/src/Downloader.ts index d9c147a9..2df865fd 100644 --- a/src/Downloader.ts +++ b/src/Downloader.ts @@ -550,17 +550,6 @@ class Downloader { // See: https://github.com/openzim/mwoffliner/issues/2061 const mwResp = await axios(url, { ...this.arrayBufferRequestOptions, headers: { Referer: 'https://localhost/' } }) - /* TODO: Code to remove in a few months (February 2023). For - some reason, it seems a few pictures have 'image/webp' - mime-type in S3 although they are png, ... This leads to - having the following code not assuming they should be - converted to wepb. To avoid this, as a temporary solution, - such scneario are ignored and mime-type definition relies - only on url and Mediawiki header. */ - if (s3Resp?.Metadata?.contenttype === 'image/webp') { - s3Resp.Metadata.contenttype = undefined - } - // sanitize Content-Type mwResp.headers['content-type'] = getMimeType(url, s3Resp?.Metadata?.contenttype || mwResp.headers['content-type']) From 13b78d7b6023f3f231be57f93597f1a20fa590c8 Mon Sep 17 00:00:00 2001 From: Emmanuel Engelhart Date: Mon, 15 Jul 2024 20:11:45 +0200 Subject: [PATCH 2/6] Better error message around object cache --- src/S3.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/S3.ts b/src/S3.ts index 69e704f4..7e34607f 100644 --- a/src/S3.ts +++ b/src/S3.ts @@ -105,10 +105,10 @@ class S3 { .catch((err: any) => { // For 404 error handle AWS service-specific exception if (err && err.name === 'NoSuchKey') { - logger.log(`Error: The specified key ${key} does not exist.`) + logger.log(`The specified key '${key}' does not exist in the cache.`) resolve(null) } else { - logger.log(`Error while downloading the file ${err}`) + logger.error(`Error (${err}) while downloading the object '${key}' from the cache.`) reject(err) } }) @@ -123,7 +123,7 @@ class S3 { .send(command) .then((val: any) => resolve(val)) .catch((err: any) => { - logger.log('S3 error while uploading file', err) + logger.error('Error while deleting object in the cache', err) reject(err) }) }) From b21e78846f91ab6f050b15e8b30ddd009824ebad Mon Sep 17 00:00:00 2001 From: Emmanuel Engelhart Date: Mon, 15 Jul 2024 20:34:23 +0200 Subject: [PATCH 3/6] Add comments to image download procedure --- src/Downloader.ts | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/src/Downloader.ts b/src/Downloader.ts index 2df865fd..f240bf90 100644 --- a/src/Downloader.ts +++ b/src/Downloader.ts @@ -541,8 +541,16 @@ class Downloader { private async downloadImage(url: string, handler: any) { try { this.s3 + + // Check first if we have an entry in the (object storage) cache for this URL .downloadBlob(stripHttpFromUrl(url), this.webp ? 'webp' : '1') + + // Handling the cache response and act accordingly .then(async (s3Resp) => { + + // 'Versioning' of image is made via HTTP ETag. We should + // check if we have the proper version by requestion proper + // ETag to upstream MediaWiki. if (s3Resp?.Metadata?.etag) { this.arrayBufferRequestOptions.headers['If-None-Match'] = this.removeEtagWeakPrefix(s3Resp.Metadata.etag) } @@ -550,37 +558,42 @@ class Downloader { // See: https://github.com/openzim/mwoffliner/issues/2061 const mwResp = await axios(url, { ...this.arrayBufferRequestOptions, headers: { Referer: 'https://localhost/' } }) - // sanitize Content-Type + // HTTP response content-type can not really be trusted (at least if 304) mwResp.headers['content-type'] = getMimeType(url, s3Resp?.Metadata?.contenttype || mwResp.headers['content-type']) - // Most of the images after uploading once will always have - // 304 status, until modified. - // 304 does not have to answer with content-type, we have to get it - // via S3 metadata or extension + // Most of the images, after been uploaded once to the + // cache, will always have 304 status, until modified. If cache + // is up2date proceed. if (mwResp.status === 304) { // eslint-disable-next-line @typescript-eslint/no-unused-vars const headers = (({ Body, ...o }) => o)(s3Resp) + // If candidate to a WEPB convertion if (isWebpCandidateImageMimeType(this.webp, mwResp.headers['content-type']) && !this.cssDependenceUrls.hasOwnProperty(mwResp.config.url)) { headers.path_postfix = '.webp' headers['content-type'] = 'image/webp' } + + // Proceed with image handler(null, { responseHeaders: headers, content: (await this.streamToBuffer(s3Resp.Body as Readable)) as any, }) + + // Don´t pursue return } - // Compress content + // Compress content in case image blob comes from upstream MediaWiki await this.getCompressedBody(mwResp) - // Check for the etag and upload + // Check for the ETag and upload to cache const etag = this.removeEtagWeakPrefix(mwResp.headers.etag) if (etag) { this.s3.uploadBlob(stripHttpFromUrl(url), mwResp.data, etag, mwResp.headers['content-type'], this.webp ? 'webp' : '1') } + // Proceed with image handler(null, { responseHeaders: mwResp.headers, content: mwResp.data, From a09d7a0900cd2196f6ee565667e7198259777f0a Mon Sep 17 00:00:00 2001 From: Emmanuel Engelhart Date: Mon, 15 Jul 2024 20:37:10 +0200 Subject: [PATCH 4/6] Better object cache error --- src/S3.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/S3.ts b/src/S3.ts index 7e34607f..1c1d8002 100644 --- a/src/S3.ts +++ b/src/S3.ts @@ -81,7 +81,7 @@ class S3 { resolve(response) }) .catch((err: any) => { - logger.log('S3 error while uploading file', err) + logger.error('Cache error while uploading object:', err) reject(err) }) }) From dc6d43dd6d08b1e851a12d6a2f37461af886072e Mon Sep 17 00:00:00 2001 From: Emmanuel Engelhart Date: Mon, 15 Jul 2024 20:42:20 +0200 Subject: [PATCH 5/6] Fix linter issue --- src/Downloader.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Downloader.ts b/src/Downloader.ts index f240bf90..57037293 100644 --- a/src/Downloader.ts +++ b/src/Downloader.ts @@ -547,7 +547,6 @@ class Downloader { // Handling the cache response and act accordingly .then(async (s3Resp) => { - // 'Versioning' of image is made via HTTP ETag. We should // check if we have the proper version by requestion proper // ETag to upstream MediaWiki. From d339e537995680684b963d99c45f75b8dade37dd Mon Sep 17 00:00:00 2001 From: Travis Briggs Date: Mon, 22 Jul 2024 08:16:27 -0700 Subject: [PATCH 6/6] Grammar in comments --- src/Downloader.ts | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/Downloader.ts b/src/Downloader.ts index 57037293..b6ab3c4e 100644 --- a/src/Downloader.ts +++ b/src/Downloader.ts @@ -545,11 +545,11 @@ class Downloader { // Check first if we have an entry in the (object storage) cache for this URL .downloadBlob(stripHttpFromUrl(url), this.webp ? 'webp' : '1') - // Handling the cache response and act accordingly + // Handle the cache response and act accordingly .then(async (s3Resp) => { // 'Versioning' of image is made via HTTP ETag. We should - // check if we have the proper version by requestion proper - // ETag to upstream MediaWiki. + // check if we have the proper version by requesting proper + // ETag from upstream MediaWiki. if (s3Resp?.Metadata?.etag) { this.arrayBufferRequestOptions.headers['If-None-Match'] = this.removeEtagWeakPrefix(s3Resp.Metadata.etag) } @@ -560,14 +560,14 @@ class Downloader { // HTTP response content-type can not really be trusted (at least if 304) mwResp.headers['content-type'] = getMimeType(url, s3Resp?.Metadata?.contenttype || mwResp.headers['content-type']) - // Most of the images, after been uploaded once to the + // Most of the images, after having been uploaded once to the // cache, will always have 304 status, until modified. If cache - // is up2date proceed. + // is up to date, return cached image. if (mwResp.status === 304) { // eslint-disable-next-line @typescript-eslint/no-unused-vars const headers = (({ Body, ...o }) => o)(s3Resp) - // If candidate to a WEPB convertion + // If image is a webp conversion candidate if (isWebpCandidateImageMimeType(this.webp, mwResp.headers['content-type']) && !this.cssDependenceUrls.hasOwnProperty(mwResp.config.url)) { headers.path_postfix = '.webp' headers['content-type'] = 'image/webp' @@ -579,11 +579,10 @@ class Downloader { content: (await this.streamToBuffer(s3Resp.Body as Readable)) as any, }) - // Don´t pursue return } - // Compress content in case image blob comes from upstream MediaWiki + // Compress content because image blob comes from upstream MediaWiki await this.getCompressedBody(mwResp) // Check for the ETag and upload to cache