From 856ef4f00f4c4d51de77bb73875b921afa234012 Mon Sep 17 00:00:00 2001 From: tomoyane Date: Sat, 6 Apr 2024 14:18:22 +0900 Subject: [PATCH] refactor tag rewrite orocessing --- README.md | 72 ++++++++++++++++++++------------ src/config/proxyConfig.js | 9 +++- src/lib/autoOgpExtractor.js | 21 +++++++++- src/lib/autoOgpExtractor.test.js | 16 +++++-- src/lib/htmlParser.js | 44 +++++++++++++++++-- src/lib/htmlParser.test.js | 27 +++++++++--- src/proxy/proxy.js | 63 ++++++++++++++++------------ 7 files changed, 184 insertions(+), 68 deletions(-) diff --git a/README.md b/README.md index 666d89a..b25bbc2 100644 --- a/README.md +++ b/README.md @@ -8,46 +8,53 @@ This proxy does not depend on Cloudflare and launches on express server. `<20.5.1` -If using version 21.x or above, it will generate a deprecation warning indicating the use of the deprecated punycode module. +If using version 21.x or above, it will generate a deprecation warning indicating the use of the +deprecated punycode module. ## Environment variable -| ProxyConfig | Description | Default | -|-------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------| -| PROXY_PORT | Proxy port number | "3456" | -| DOMAIN | Proxy domain for rewrite | "localhost:3456" | -| IS_TLS | Proxy tls(http/https) for rewrite | "false" | -| NOTION_PAGE_ID | Notion public page id | "f1db0cfbe246475784c67f279289abea" | -| CUSTOM_SCRIPT | Custom script | "" | -| CONTENT_CACHE_SEC | Cache time for loaded content (sec) | "300" | -| GOOGLE_FONT | See: `https://developers.google.com/fonts` | "" | -| 
AUTO_SET_OGP | The server automatically extracts Open Graph Protocol (OGP) data from your NotionId upon startup.
When this feature is enabled, the values of `OG_TAG_TITLE` and `OG_TAG_IMAGE_URL` are utilized for automatic configuration.
If you prefer to wait until the OGP tags are fetched automatically, you can use the `/readyz` command.

Requirements
- Headless chrome
- CPU is always allocated
- At least 512MB of memory for better | "false" | -| OG_TAG_TITLE | Title for og tag | "" | -| OG_TAG_DESC | Description for og tag | "" | -| OG_TAG_IMAGE_URL | Image url for og tag | "" | -| OG_TAG_TYPE | Type for og tag | "website" | -| TWITTER_CARD | Twitter card for og tag | "summary_large_image" | - -### Note for AUTO_SET_OGP variable +(※) is required. + +| ProxyConfig | Description | Default | +|--------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------| +| PROXY_PORT (※) | Proxy port number | "3456" | +| DOMAIN (※) | Proxy domain (*Your domain*) for rewrite | "localhost:3456" | +| IS_TLS (※) | Proxy tls(http/https) for rewrite | "false" | +| NOTION_PAGE_ID (※) | Notion public page id | "f1db0cfbe246475784c67f279289abea" | +| CUSTOM_SCRIPT | Custom script | "" | +| CONTENT_CACHE_SEC | Cache time for loaded content (sec) | "300" | +| GOOGLE_FONT | See: https://developers.google.com/fonts | "" | +| AUTO_SET_OG_TAG | The server automatically extracts Open Graph Protocol (OGP) data from your NotionId upon startup.
When this feature is enabled, the values of `OG_TAG_TITLE`, `OG_TAG_IMAGE_URL`, and `ICON_URL` are set automatically from the extracted data.
If you prefer to wait until the OGP tags are fetched automatically, you can use the `/readyz` endpoint.

Requirements
- Headless chrome
- CPU is always allocated
- At least 512MB of memory for better | "false" | +| OG_TAG_TITLE | Title for og tag for rewrite.
If you keep the default value, there is nothing to rewrite, so the original value from the Notion page is used. | "" | +| OG_TAG_DESC | Description for og tag for rewrite.
If you keep the default value, there is nothing to rewrite, so the original value from the Notion page is used. | "" | +| OG_TAG_IMAGE_URL | Image url for og tag for rewrite.
If you keep the default value, there is nothing to rewrite, so the original value from the Notion page is used. | "" | +| OG_TAG_TYPE | Type for og tag for rewrite | "website" | +| ICON_URL | Icon url for rewrite.
If you keep the default value, there is nothing to rewrite, so the original value from the Notion page is used. | "" | +| TWITTER_CARD | Twitter card for og tag for rewrite | "summary_large_image" | + +### Note for AUTO_SET_OG_TAG variable **OgTag setting priority** Environment variables with OG_xxx in prefix are set with the highest priority. -So, if AUTO_SET_OGP is enabled but the OG_xxx environment variable is set, OG_xxx will have priority. +So, if AUTO_SET_OG_TAG is enabled but the OG_xxx environment variable is set, OG_xxx will have +priority. **Headless chrome Requirements** At startup, we are extracting og tags from the NotionId page using Chrome Headless. -So, CPU allocation is necessary. Please be cautious when using request allocation in services like Cloud Functions or Cloud Run or Other. +So, CPU allocation is necessary. Please be cautious when using request allocation in services like +Cloud Functions or Cloud Run or Other. ## Getting started Start proxy for debug on local. ```bash -$ npm ci -$ npm test -$ npm start_proxy +npm ci +npm test +npm run start_proxy + > notion-proxy@1.0.0 start > node src/index.js Proxy listening at localhost:3456, NotionId: f1db0cfbe246475784c67f279289abea @@ -56,8 +63,19 @@ Proxy listening at localhost:3456, NotionId: f1db0cfbe246475784c67f279289abea Start proxy binary. 
```bash -$ npm install -g pkg -$ npm run build -$ ./notion-proxy +npm install -g pkg +npm run build +./notion-proxy + Proxy listening at localhost:3456, NotionId: f1db0cfbe246475784c67f279289abea -``` \ No newline at end of file +``` + +## Proxy example with your domain + +```bash +export DOMAIN="CHANGE IT" && \ +export NOTION_PAGE_ID="CHANGE IT" && \ +export IS_TLS="true" && \ +export AUTO_SET_OG_TAG="true" && \ +npm run start_proxy +``` diff --git a/src/config/proxyConfig.js b/src/config/proxyConfig.js index e23c4ba..db40054 100644 --- a/src/config/proxyConfig.js +++ b/src/config/proxyConfig.js @@ -8,8 +8,9 @@ class ProxyConfig { this.notionPageId = process.env.NOTION_PAGE_ID || 'f1db0cfbe246475784c67f279289abea'; this.customScript = process.env.CUSTOM_SCRIPT || ''; this.contentCacheSec = process.env.CONTENT_CACHE_SEC || '300'; - this.autoSetOgp = process.env.AUTO_SET_OGP || 'false'; - this.autoSetOgp = this.autoSetOgp === 'true'; + this.iconUrl = process.env.ICON_URL || ''; + this.autoSetOgTag = process.env.AUTO_SET_OG_TAG || 'false'; + this.autoSetOgTag = this.autoSetOgTag === 'true'; this.slugToPage = { "": this.notionPageId } @@ -52,6 +53,10 @@ class ProxyConfig { throw new Error("Invalid CONTENT_CACHE_SEC environment. Allow number"); } } + + replaceIconUrl(v) { + this.iconUrl = v; + } } class TwitterTag { diff --git a/src/lib/autoOgpExtractor.js b/src/lib/autoOgpExtractor.js index 62eac30..61f8d30 100644 --- a/src/lib/autoOgpExtractor.js +++ b/src/lib/autoOgpExtractor.js @@ -10,10 +10,11 @@ const {JSDOM} = require("jsdom"); * The image has Chrome installed if the notion proxy is running on a container. See Dockerfile. 
*/ class AutoOgpExtractor { - constructor(notionId, domain, isTls) { + constructor(notionId, domain, isTls, proxyPort) { this.notionId = notionId; this.domain = domain; this.isTls = isTls; + this.proxyPort = proxyPort; } async fetchHtmlAfterExecutedJs() { @@ -24,7 +25,7 @@ class AutoOgpExtractor { args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'] }); const page = await browser.newPage(); - await page.goto(`http://localhost:3456/${this.notionId}`); + await page.goto(`http://localhost:${this.proxyPort}/${this.notionId}`); await page.waitForSelector('.notion-topbar'); const html = await page.content(); await browser.close(); @@ -66,6 +67,22 @@ class AutoOgpExtractor { return `${protocol}://${this.domain}/${uri}`; } + extractIcon(htmlStr) { + if (htmlStr === '' || htmlStr === null) { + return null; + } + const dom = new JSDOM(htmlStr); + const imgElements = dom.window.document.querySelectorAll('img[alt="Page icon"]'); + const srcValues = Array.from(imgElements).map(img => img.getAttribute('src')); + if (!srcValues || srcValues.length === 0) { + return null; + } + + const protocol = this.isTls ? 'https' : 'http'; + let uri = srcValues[0].substring(1);; + return `${protocol}://${this.domain}/${uri}`; + } + extractOgDesc(htmlStr) { if (htmlStr === '' || htmlStr === null) { return null; diff --git a/src/lib/autoOgpExtractor.test.js b/src/lib/autoOgpExtractor.test.js index dc95b17..c90cb59 100644 --- a/src/lib/autoOgpExtractor.test.js +++ b/src/lib/autoOgpExtractor.test.js @@ -17,6 +17,9 @@ const testHtmlStr = `

Hello

+
+ Page icon +
@@ -28,7 +31,7 @@ function getAutoOgpExtractor() { const notionId = 'f1db0cfbe246475784c67f279289abea'; const domain = 'eukarya.io'; const isTls = true; - return new AutoOgpExtractor(notionId, domain, isTls); + return new AutoOgpExtractor(notionId, domain, isTls, '3456'); } test('Extract og title', () => { @@ -40,7 +43,14 @@ test('Extract og title', () => { test('Extract og image', () => { const extractor = getAutoOgpExtractor(); - const imgage = extractor.extractOgImage(testHtmlStr); + const image = extractor.extractOgImage(testHtmlStr); + + expect(image).toBe('https://eukarya.io/image/hello.png'); +}); + +test('Extract icon', () => { + const extractor = getAutoOgpExtractor(); + const icon = extractor.extractIcon(testHtmlStr); - expect(imgage).toBe('https://eukarya.io/image/hello.png'); + expect(icon).toBe('https://eukarya.io/image/icon.png'); }); \ No newline at end of file diff --git a/src/lib/htmlParser.js b/src/lib/htmlParser.js index 56e7b4c..34dcc57 100644 --- a/src/lib/htmlParser.js +++ b/src/lib/htmlParser.js @@ -11,18 +11,20 @@ class HtmlParser { * @param pageUrl ProxyConfig.ogTag.url * @param pageType ProxyConfig.ogTag.pageType * @param twitterCard ProxyConfig.twitterTag.twitterCard + * @param iconUrl ProxyConfig.iconUrl * @param googleFont ProxyConfig.googleFont * @param domain ProxyConfig.domain * @param customScript ProxyConfig.customScript * @param isTls ProxyConfig.isTls * @param stp slug to page record */ - constructor(pageTitle, pageDesc, pageImage, pageUrl, pageType, twitterCard, googleFont, domain, customScript, isTls, stp) { + constructor(pageTitle, pageDesc, pageImage, pageUrl, pageType, twitterCard, iconUrl, googleFont, domain, customScript, isTls, stp) { this.pageTitle = pageTitle; this.pageDescription = pageDesc; this.pageImage = pageImage; this.pageUrl = pageUrl; this.pageType = pageType; + this.iconUrl = iconUrl; this.twitterCard = twitterCard; this.googleFont = googleFont; this.domain = domain; @@ -42,7 +44,7 @@ class HtmlParser { 
parseMeta(element) { try { if (this.pageTitle !== '') { - if (element.getAttribute('property') === 'og:title' || element.getAttribute('name') === 'twitter:title') { + if (element.getAttribute('property') === 'og:title' || element.getAttribute('name') === 'twitter:title' || element.getAttribute('property') === 'og:site_name') { element.setAttribute('content', this.pageTitle); } } @@ -79,6 +81,25 @@ class HtmlParser { } } + parseIcon(element, document) { + if (this.iconUrl !== '') { + element.setAttribute('href', this.iconUrl); + + // og:logo + const headElement = document.querySelector('head'); + const metaElement = document.createElement('meta'); + metaElement.setAttribute('property', 'og:logo'); + metaElement.setAttribute('content', this.iconUrl); + headElement.appendChild(metaElement); + + // apple-touch-icon + const appleTouchIcon = document.querySelector('link[rel="apple-touch-icon"]'); + if (appleTouchIcon) { + appleTouchIcon.setAttribute('href', this.iconUrl); + } + } + } + parseHead(element) { if (this.googleFont !== '') { element.innerHTML += ` 0) { + for (var node of mutation.addedNodes) { + if (node.nodeType === 1 && node.classList.contains('notion-presence-container') && linkElement) { + linkElement.href = '${this.iconUrl}'; + } + } + } + } + } if (redirected) { return; } @@ -189,6 +222,11 @@ class HtmlParser { this.parseMeta(metas[m]) } + const shortcutIcon = document.querySelector('link[rel="shortcut icon"]'); + if (shortcutIcon) { + this.parseIcon(shortcutIcon, document) + } + let head = document.querySelector('head') if (head) { this.parseHead(head) diff --git a/src/lib/htmlParser.test.js b/src/lib/htmlParser.test.js index 80ed3a6..aa69085 100644 --- a/src/lib/htmlParser.test.js +++ b/src/lib/htmlParser.test.js @@ -7,6 +7,7 @@ function getParser() { const desc = 'Test Desc'; const image = 'https://eukarya.io/img/logo.svg'; const url = 'https://eukarya.io'; + const iconUrl = 'https://reearth.io/img/logo.svg'; const type = 'website'; const twitterCard 
= 'summary_large_image'; const googleFont = ''; @@ -21,6 +22,7 @@ function getParser() { url, type, twitterCard, + iconUrl, googleFont, domain, customScript, @@ -32,8 +34,7 @@ test('Parse html for Notion', () => { const parser = getParser(); const element = new JSDOM( ` - - + Notion – The all-in-one workspace for your notes, tasks, wikis, and databases. @@ -45,6 +46,8 @@ test('Parse html for Notion', () => { + +

Hello

@@ -59,14 +62,16 @@ test('Parse html for Notion', () => { Notion – The all-in-one workspace for your notes, tasks, wikis, and databases. - + - + + +

Hello

@@ -97,7 +102,19 @@ test('Parse html for Notion', () => { history.replaceState(history.state, '', '/' + slug); } } - const observer = new MutationObserver(function() { + var linkElement = document.querySelector('link[rel="shortcut icon"]'); + const observer = new MutationObserver(function(mutationsList) { + if ('https://reearth.io/img/logo.svg' !== '') { + for (var mutation of mutationsList) { + if (mutation.type === 'childList' && mutation.addedNodes.length > 0) { + for (var node of mutation.addedNodes) { + if (node.nodeType === 1 && node.classList.contains('notion-presence-container') && linkElement) { + linkElement.href = 'https://reearth.io/img/logo.svg'; + } + } + } + } + } if (redirected) { return; } diff --git a/src/proxy/proxy.js b/src/proxy/proxy.js index 48c705b..406c518 100644 --- a/src/proxy/proxy.js +++ b/src/proxy/proxy.js @@ -14,22 +14,24 @@ class Proxy { * @param config ProxyConfig class */ constructor(config) { - this.initialize(config); + let isReady = !config.autoSetOgTag; + this.initVariable(config, isReady); } /** * Init field variable. 
* * @param config ProxyConfig class - * @param isReloadedVariable Reloaded variable for automatic set OGP + * @param isReady Whether automatic OGP extraction is successful or not */ - initialize(config, isReloadedVariable = false) { + initVariable(config, isReady) { this.proxyConfig = config; this.cacheStore = new ContentCache(config.contentCacheSec); this.autoOgpExtractor = new AutoOgpExtractor( config.notionPageId, config.domain, - config.isTls + config.isTls, + config.proxyPort ); this.htmlParser = new HtmlParser( config.ogTag.title, @@ -38,6 +40,7 @@ class Proxy { config.ogTag.url, config.ogTag.type, config.twitterTag.card, + config.iconUrl, config.googleFont, config.domain, config.customScript, @@ -45,48 +48,56 @@ class Proxy { config.slugToPage ); - if (config.autoSetOgp && isReloadedVariable) { - this.readyz = true; - this.livez = true; - } else if (config.autoSetOgp && !isReloadedVariable) { - this.readyz = false; - this.livez = true; - } else { - this.readyz = true; - this.livez = true; - } + this.readyz = isReady; + this.livez = true; } /** * Reload proxy config if AUTO_SET_OGP enabled. 
+ * Failure safe processing * * @returns {Promise} */ async reloadProxyConfig() { - if (!this.proxyConfig.autoSetOgp) { + if (!this.proxyConfig.autoSetOgTag) { return; } + const html = await this.autoOgpExtractor.fetchHtmlAfterExecutedJs(); const fetchedTitle = this.autoOgpExtractor.extractOgTitle(html); const fetchedImage = this.autoOgpExtractor.extractOgImage(html); + const fetchedIcon = this.autoOgpExtractor.extractIcon(html); + if (fetchedTitle !== null && this.proxyConfig.ogTag.title === '') { - this.proxyConfig.ogTag.replaceTitle(fetchedTitle) - this.proxyConfig.twitterTag.replaceTitle(fetchedTitle) + this.proxyConfig.ogTag.replaceTitle(fetchedTitle); + this.proxyConfig.twitterTag.replaceTitle(fetchedTitle); } if (fetchedImage !== null && this.proxyConfig.ogTag.image === '') { - this.proxyConfig.ogTag.replaceImage(fetchedImage) - this.proxyConfig.twitterTag.replaceImage(fetchedImage) + this.proxyConfig.ogTag.replaceImage(fetchedImage); + this.proxyConfig.twitterTag.replaceImage(fetchedImage); } - if (fetchedTitle === null && fetchedImage === null) { + if (fetchedIcon !== null && this.proxyConfig.iconUrl === '') { + this.proxyConfig.replaceIconUrl(fetchedIcon); + } + + let isReady; + if (fetchedTitle === null && fetchedImage === null && fetchedIcon === null) { console.log('[WARN] Failed to fetch OGP tag automatically'); + isReady = false; } else { const imgMsg = this.proxyConfig.ogTag.image.length > 30 ? `${this.proxyConfig.ogTag.image.substring(0, 30)}...` : this.proxyConfig.ogTag.image; + + const iconMsg = this.proxyConfig.iconUrl.length > 30 ? + `${this.proxyConfig.iconUrl.substring(0, 30)}...` : this.proxyConfig.iconUrl; + console.log('Successful automatic fetched of OGP tag.' 
+ - ` Title: ${this.proxyConfig.ogTag.title}` + - ` Image: ${imgMsg}`); + ` Title: ${this.proxyConfig.ogTag.title},` + + ` Image: ${imgMsg},` + + ` Icon: ${iconMsg}` ); + isReady = true; } - this.initialize(this.proxyConfig, true); + this.initVariable(this.proxyConfig, isReady); } /** @@ -116,8 +127,8 @@ class Proxy { /** * GET readyz - * When the automatic OGP tag extraction process is complete, the proxy will be set to Ready regardless of success or failure. - * The extract process is dependent on the Chrome environment, so NotionProxy is set to Ready regardless. + * When the automatic OGP tag extraction process is successful, the proxy will be set to Ready. + * If failure to fetch OGP tag, Server return 503. * * @param req Request of express * @param res Response of express @@ -125,7 +136,7 @@ class Proxy { */ getReadyZ(req, res) { if (!this.readyz) { - return res.status(503).send('Not ready yet because server is extracting ogp tag automatically'); + return res.status(503).send('Not ready yet'); } return res.status(200).send('OK'); }