A bit more to prevent bots from crawling /wiki/ (#346)
* A bit more to prevent bots from crawling /wiki/

Now the proxy requires a specific header and anything else gets a 404.

* lint
goto-bus-stop authored Dec 28, 2024
1 parent f279e89 commit 64112a2
Showing 2 changed files with 14 additions and 2 deletions.
8 changes: 7 additions & 1 deletion src/app.js
@@ -81,7 +81,7 @@ app.get('/recent', t(async (req, res) => {
 app.get('/robots.txt', (req, res) => {
   // Search engines should not serve the proxied pages as if they are wikipedia
   res.send(`User-agent: *
-Disallow: /wiki/*
+Disallow: /wiki/
 `)
 })
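An aside on the robots.txt tweak: Disallow rules are prefix matches, so `Disallow: /wiki/` already covers every proxied page and the trailing `*` is unnecessary. A quick manual check of the served file, as a sketch only (assuming the proxy is running on a hypothetical local port, in an ES module context):

// Not part of this commit: a quick look at the robots.txt the proxy now serves.
// The base URL is an assumption for local testing.
const robots = await fetch('http://localhost:3000/robots.txt')
console.log(await robots.text())
// Expected output:
// User-agent: *
// Disallow: /wiki/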

@@ -92,6 +92,12 @@ app.use(serveStatic(fileURLToPath(new URL('../public', import.meta.url))))
  */

 app.get('/wiki/:page', t(async (req, res) => {
+  res.header('X-Robots-Tag', 'noindex,nofollow')
+  if (req.headers.authorization !== 'wikibattle.me client') {
+    res.status(404).end()
+    return
+  }
+
   const body = await wiki.get(req.params.page)
   res.end(body.content)
 }))
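To illustrate the behaviour described in the commit message (requests without the expected header get a 404), here is a minimal sketch, not part of the commit, assuming the proxy is running on a hypothetical local base URL and that a page named 'JavaScript' exists:

// Exercising the new header check (ES module context, top-level await).
const base = 'http://localhost:3000' // assumption for local testing

// Without the authorization header the proxy now answers 404.
const blocked = await fetch(`${base}/wiki/JavaScript`)
console.log(blocked.status) // 404

// With the header the Wikipedia page body is proxied as before.
const allowed = await fetch(`${base}/wiki/JavaScript`, {
  headers: { authorization: 'wikibattle.me client' }
})
console.log(allowed.status) // 200
console.log((await allowed.text()).length)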
8 changes: 7 additions & 1 deletion src/client/load-page.js
@@ -1,6 +1,12 @@
 const cache = {}

+const fetchOpts = {
+  headers: {
+    authorization: 'wikibattle.me client'
+  }
+}
+
 export default function load (page, cb) {
-  cache[page] ??= fetch(`./wiki/${page}`).then((response) => response.text())
+  cache[page] ??= fetch(`./wiki/${page}`, fetchOpts).then((response) => response.text())
   cache[page].then((result) => cb(null, result), cb)
 }
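For reference, a hypothetical caller of the updated client helper (not part of the commit): load() caches the fetch promise per page and reports the result through a Node-style callback, so repeat requests for the same page reuse the first fetch.

// Hypothetical usage of the client-side helper; the import path is an assumption.
import load from './load-page.js'

load('JavaScript', (err, html) => {
  if (err) {
    console.error('failed to load page', err)
    return
  }
  // Subsequent calls for 'JavaScript' resolve from the cached promise.
  console.log('page loaded,', html.length, 'characters')
})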
