Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 19 additions & 15 deletions broken-links-script/Extractlinks.js
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ const getMarkdownFiles = (dir) => {
const BASE_URL = "https://cumulocity.com/docs";

const shortcodeMapping = {

"c8y-current-version": "",
"c8y-edge-current-version-alt": "10.18",
"c8y-edge-version-major": "2025",
Expand Down Expand Up @@ -50,6 +49,13 @@ const shortcodeMapping = {
"email-c8y-info": "info@cumulocity.com",
};

/**
 * Reports whether a Markdown file's YAML front matter contains a `_build`
 * mapping with `render: false`.
 *
 * Only a `---`-delimited block that opens the file (optionally preceded by a
 * BOM or whitespace) is treated as front matter, so `---` horizontal rules or
 * YAML samples later in the body are never mistaken for it. `render: false`
 * counts only when it is nested (indented deeper) under the `_build:` key.
 *
 * @param {string} fileContent - Full text of the Markdown file.
 * @returns {boolean} `true` when `_build` → `render: false` is present.
 */
const hasRenderFalse = (fileContent) => {
  if (typeof fileContent !== "string") return false;

  // Anchor to the start of the file: front matter must be the first thing in it.
  const fm = fileContent.match(/^\uFEFF?\s*---[ \t]*\r?\n([\s\S]*?)\r?\n---/);
  if (!fm) return false;

  const lines = fm[1].split(/\r?\n/);
  const indentOf = (line) => line.match(/^[ \t]*/)[0].length;

  // `_build:` may itself be nested (e.g. under `cascade:`), so accept any indent.
  const buildIndex = lines.findIndex((line) => /^[ \t]*_build:[ \t]*(#.*)?$/i.test(line));
  if (buildIndex === -1) return false;
  const buildIndent = indentOf(lines[buildIndex]);

  for (const line of lines.slice(buildIndex + 1)) {
    if (line.trim() === "") continue;
    // A line at the same or a shallower indent ends the `_build` mapping.
    if (indentOf(line) <= buildIndent) break;
    if (/^[ \t]*render:\s*false\b/i.test(line)) return true;
  }
  return false;
};

const resolveHugoShortcode = (link) => {
return link.replace(/\{\{<\s*(.*?)\s*>\}\}/g, (match, shortcode) => {
const resolvedValue = shortcodeMapping[shortcode];
Expand All @@ -63,34 +69,35 @@ const resolveFullUrl = (link, relativePath, fileContent) => {
}

if (link.startsWith("#")) {
const fileDir = path.dirname(relativePath);
const fileDir = path.dirname(relativePath).replaceAll(path.sep, '/');
const fileName = path.basename(relativePath, ".md");
let segments = fileDir.split(path.sep);
let segments = fileDir.split("/").filter(Boolean);
let hasBundle = false;

if (segments.length > 0) {
const lastSegment = segments[segments.length - 1];
if (lastSegment.endsWith("-bundle")) {
if (/-bundle$/.test(lastSegment)) {
segments[segments.length - 1] = lastSegment.replace(/-bundle$/, "");
hasBundle = true;
}
}

// if this file is not rendered, publish from its directory (e.g., /glossary/)
const notRendered = hasRenderFalse(fileContent);

let publishedBasePath = "";
if (hasBundle) {
if (notRendered || hasBundle) {
publishedBasePath = segments.join("/");
} else {
publishedBasePath = fileName === "index" ? fileDir : path.join(fileDir, fileName);
publishedBasePath = fileName === "index" ? fileDir : `${fileDir}/${fileName}`;
}
publishedBasePath = publishedBasePath
.replace(/\\/g, "/")
.replace(/^\/+/, "")
.replace(/\/+$/, "");
return `${BASE_URL.replace(/\/$/, "")}/${publishedBasePath}/#${link.substring(1)}`;
publishedBasePath = publishedBasePath.replace(/^\/+|\/+$/g, "");
const base = `${BASE_URL.replace(/\/$/, "")}/${publishedBasePath ? publishedBasePath + "/" : ""}`;
return `${base}#${link.substring(1)}`;
}

const resolvedLink = resolveHugoShortcode(link);
if (resolvedLink.startsWith("http://") || resolvedLink.startsWith("https://")) {
if (/^https?:\/\//i.test(resolvedLink)) {
return resolvedLink;
}
return `${BASE_URL.replace(/\/$/, "")}/${resolvedLink.replace(/^\//, "")}`;
Expand All @@ -99,8 +106,6 @@ const resolveFullUrl = (link, relativePath, fileContent) => {
(() => {
const projectDir = "../content";
const markdownFiles = getMarkdownFiles(projectDir);

// Use an object to map each unique link to a Set of file paths where it's found.
const linkMap = {};

markdownFiles.forEach((mdFile) => {
Expand All @@ -121,7 +126,6 @@ const resolveFullUrl = (link, relativePath, fileContent) => {
});
});

// Convert the map into an array of objects.
const result = Object.keys(linkMap).map(link => ({
link,
files: Array.from(linkMap[link])
Expand Down
94 changes: 68 additions & 26 deletions broken-links-script/cypress/e2e/link-checker.cy.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,41 +5,52 @@ describe('Link and Routing Validation - Individual URL Checks', () => {
const totalTests = urls.length;


/**
 * Escapes every regular-expression metacharacter in a string so it can be
 * embedded literally inside a RegExp pattern.
 *
 * @param {string} string - Raw text to escape.
 * @returns {string} The text with each metacharacter prefixed by a backslash.
 */
const escRegExp = (string) => {
  const metaCharacters = /[.*+?^${}()|[\]\\]/g;
  return string.replace(metaCharacters, '\\$&');
};
const expectFragmentExists = (doc, fragment) => {
const decodedFragment = decodeURIComponent(fragment);
/**
 * Gathers every value usable as a URL fragment target beneath `root`:
 * the `id` of each element that has one, followed by the `name` of each
 * `<a name>` anchor. Empty or missing values are dropped.
 *
 * @param {{querySelectorAll: Function}} root - Document or element to search.
 * @returns {string[]} Ids first, then anchor names, in query order.
 */
const collectFragments = (root) => {
  const targets = [];
  for (const element of root.querySelectorAll('[id]')) {
    targets.push(element.id);
  }
  for (const anchor of root.querySelectorAll('a[name]')) {
    targets.push(anchor.getAttribute('name'));
  }
  return targets.filter(Boolean);
};

const expectFragmentExists = (htmlContent, fragment) => {
const escFragment = escRegExp(fragment);
const regex = new RegExp(`(id=["']${escFragment}["']|name=["']${escFragment}["'])`);
const exists = regex.test(htmlContent);
expect(exists, `An element with id or name "${fragment}" should exist in HTML`).to.be.true;
let allFragments = collectFragments(doc);
const iframes = doc.querySelectorAll('iframe, frame');
for (const frame of iframes) {
try {
const frameDoc = frame.contentDocument || frame.contentWindow?.document;
if (frameDoc) {
allFragments = allFragments.concat(collectFragments(frameDoc));
}
} catch (e) {
}
}
Comment on lines +17 to +26
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you give me an example of the case you're addressing here? I ran some tests: if a document contains an iframe, and the iframe contains an anchor, you cannot use a URL like /main-document.html#a-name-from-iframe to link to the anchor inside the iframe, i.e. the user won't be scrolled to the right position within the iframe.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The intention here isn’t to support direct navigation to iframe anchors via #fragment, but to verify that referenced anchors (even inside embedded documents) actually exist. We have some pages that load documentation or content in iframes, so this logic helps our validation detect missing anchors there too.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be easier if you provided an example link and use case. Sure, with the above mechanism we can verify that the anchor exists inside the iframe on the target page (if both are served from the same domain). My concern is also the user experience: if a link in our docs refers to a specific anchor inside an iframe on another page (internal or external), the user won't be navigated to the referred section directly, but just to the top of the page.


const exists = allFragments.some(f => f === decodedFragment);

if (!exists) {
cy.log(`Available fragments (including frames):\n${allFragments.join('\n')}`);
}

expect(exists, `An element with id or name = "#${fragment}" should exist in HTML or frames`).to.be.true;
};

/**
 * Asserts through Cypress that the URL contains no literal `(` or `)`.
 *
 * @param {string} url - URL under test.
 */
const expectNoUnencodedParentheses = (url) => {
  const parenthesisPattern = /[()]/;
  const failureMessage = `URL should not contain unencoded parentheses: ${url}`;
  cy.wrap(url).should('not.match', parenthesisPattern, failureMessage);
};

// Note: On GitHub pages, heading IDs are prefixed with "user-content-".
const checkGithubFragment = (fragment) => {
const normalizedFragment = fragment.toLowerCase().replace(/[^\w\-]+/g, '-').replace(/^-+|-+$/g, '');
cy.document().then((doc) => {
const anchorExists = Array.from(doc.querySelectorAll('a'))
.some(a => a.getAttribute('href') === `#${normalizedFragment}`);

expect(anchorExists, `Fragment "#${normalizedFragment}" should exist in href attribute of an <a> tag`).to.be.true;
const allFragments = Array.from(doc.querySelectorAll('a'))
.map(a => a.getAttribute('href'))
.filter(href => href && href.startsWith('#'));
cy.log(`Available fragments on page:\n${allFragments.join('\n')}`);
const ids = Array.from(doc.querySelectorAll('[id]')).map(el => el.id.replace(/^user-content-/, ''));
cy.log(`Available GitHub IDs:\n${ids.join('\n')}`);
const exists = ids.some(id => id === fragment);
expect(exists, `Fragment "#${fragment}" should exist in GitHub page`).to.be.true;
});
};

const checkRegularFragment = (fragment) => {
cy.document().then((doc) => {
const html = doc.documentElement.innerHTML;
expectFragmentExists(html, fragment);
const ids = Array.from(doc.querySelectorAll('[id]')).map(el => el.id);
const names = Array.from(doc.querySelectorAll('a[name]')).map(a => a.getAttribute('name'));
const allFragments = [...ids, ...names];
cy.log(`Available elements on page with ids and names:\n${allFragments.join('\n')}`);
expectFragmentExists(doc, fragment);
});
};

Expand All @@ -62,17 +73,37 @@ describe('Link and Routing Validation - Individual URL Checks', () => {

const hasNonHtmlExtension = nonHtmlExtensions.some(ext => url.endsWith(ext));
const isNonHtmlResource = hasNonHtmlExtension || url.includes('/files/') || url.includes('/downloads/');
const isNpmPackagePage = url.startsWith('https://www.npmjs.com/package/');
if (isNpmPackagePage) {
const m = url.match(/^https:\/\/www\.npmjs\.com\/package\/(@[^/]+\/[^#?]+)/);
const pkg = m ? m[1] : null;
const encodedUrl = pkg ? url.replace(pkg, encodeURIComponent(pkg)) : url;

if (pkg) {
cy.request({
url: `https://registry.npmjs.org/${pkg}`,
failOnStatusCode: false,
headers: { Accept: 'application/vnd.npm.install-v1+json' }
}).then((res) => {
expect(res.status, `npm registry status for ${pkg}`).to.eq(200);
});
}
cy.visit(encodedUrl, { timeout: 50000, failOnStatusCode: false });
cy.url().should('include', '/package/%40');
Comment on lines +78 to +92
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you explain this logic? I see you match only packages with scoped names starting with @ and currently we only have such urls in docs, but I wouldn't make it that specific. Could we assume that anything after https://www.npmjs.com/package/ is a full package name?
I also read about constraints on bots accessing https://www.npmjs.com and that we need to use the registry instead. But then what's the purpose of encoding the URL, visiting it, and checking for %40?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Currently, I am matching only scoped packages (@scope/pkg) since those are the only ones we have in our docs. The registry call is used to confirm that the package actually exists (to avoid npmjs.com’s bot-blocks).
After that, the encoded URL visit (%40) ensures the page resolves correctly for scoped packages, since npmjs.com automatically redirects them to the encoded form

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Currently, I am matching only scoped packages (@scope/pkg) since those are the only ones we have in our docs. The registry call is used to confirm that the package actually exists (to avoid npmjs.com’s bot-blocks).

ok, that's clear

After that, the encoded URL visit (%40) ensures the page resolves correctly for scoped packages, since npmjs.com automatically redirects them to the encoded form

Sorry, didn't get that. If you navigate to the encoded URL, then it's expected that you detect %40 coming from the encoded URL. It does not prove that npmjs.com resolved anything or redirected you.
This check will also pass for a made-up package name:

cy.visit('https://www.npmjs.com/package/%40c8y/made-up-package', { timeout: 50000, failOnStatusCode: false });
cy.url().should('include', `/package/%40`);

Isn't the registry check enough?

completedTests++;
return;
}

Cypress.env('sourceFiles', item.files);
expectNoUnencodedParentheses(url);

if (isNonHtmlResource) {
cy.log(`Validating non-HTML resource: ${url}`);
cy.request({
url: url,
failOnStatusCode: false
}).then((response) => {
expect(response.status).to.be.oneOf([200, 304]);
expect(response.status).to.be.oneOf([200, 201, 202, 203, 204, 301, 302, 304]);

if (url.endsWith('.json')) {
expect(response.body).to.be.an('object');
Expand Down Expand Up @@ -117,12 +148,23 @@ describe('Link and Routing Validation - Individual URL Checks', () => {
checkGithubFragment(fragment);
}
else if (fragment) {
cy.visit(url, { timeout: 20000 });
cy.visit(url, { timeout: 30000, failOnStatusCode: false, headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/122.0 Safari/537.36' }});
checkRegularFragment(fragment);
}
else {
cy.visit(url, { timeout: 20000 });
cy.document().its('body').should('not.be.empty');
cy.request({
url: url,
failOnStatusCode: false
}).then((response) => {
const contentType = response.headers['content-type'] || '';
if (!contentType.includes('text/html')) {
cy.log(`Non-HTML content detected for ${url}, skipping cy.visit()`);
expect(response.status).to.be.oneOf([200, 201, 202, 203, 204, 301, 302, 304]);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we check that response.body is not empty? If we direct user there, there should be some content. Do you have any example of what non-html resource might that be?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For example: https://download.cumulocity.com/Apama/Debian/. Links like this point to file repositories or downloads, not HTML pages, so I only check that they return a valid status and skip checking the body.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This link seems to be special as it actually returns 2 content-type headers, but response.headers only shows you one of them.

2025-10-29_17-35

Anyway, I'd consider checking response.body to not be empty, just as you check detected text/html pages for non-empty HTML body.

Suggested change
expect(response.status).to.be.oneOf([200, 201, 202, 203, 204, 301, 302, 304]);
expect(response.status).to.be.oneOf([200, 201, 202, 203, 204, 301, 302, 304]);
expect(response.body).not.to.be.empty;

} else {
cy.visit(url, { timeout: 20000 });
cy.document().its('body').should('not.be.empty');
}
});
}

completedTests++;
Expand Down
12 changes: 10 additions & 2 deletions broken-links-script/cypress/support/e2e.js
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,17 @@ Cypress.on('uncaught:exception', (err) => {
if (err.message.includes("$(...).tooltip is not a function")) {
return false;
}
if (err.message.includes("No key found. SDK can not be initialized")) {
if (err.message.includes("No key found. SDK can not be initialized")) {
return false;
}
if (err.message.includes("Failed to execute 'getComputedStyle' on 'Window': parameter 1 is not of type 'Element'.")) {
return false;
}
if (err.message.includes("Identifier 'rocket_pairs' has already been declared")) {
return false;
}
if (err.message.includes("Unexpected token 'var'")) {
return false;
}

});