From af019d05cfc716dc5ff5c73ccff5b3405325b3a6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Rame=CC=81?=
Date: Tue, 19 Mar 2024 18:24:45 +0100
Subject: [PATCH] fix(semgrep): if it takes too long we time out the scan

---
 src/semgrep/index.ts | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/src/semgrep/index.ts b/src/semgrep/index.ts
index cab57ff..e9cda9e 100644
--- a/src/semgrep/index.ts
+++ b/src/semgrep/index.ts
@@ -1,5 +1,6 @@
 import * as Sentry from '@sentry/nextjs';
 import { noCase } from 'change-case';
+import { minutesToMilliseconds } from 'date-fns/minutesToMilliseconds';
 import { $ } from 'execa';
 import fsSync from 'fs';
 import fs from 'fs/promises';
@@ -39,7 +40,13 @@ export async function analyzeWithSemgrep(folderPath: string, outputPath: string)
 
   try {
     // `--no-git-ignore` is required since the `.semgrepignore` is not taken into account with absolute paths (ref: https://github.com/semgrep/semgrep/issues/9960)
-    await $`semgrep --metrics=off --no-git-ignore --config ${codeAnalysisRulesPath} ${folderPath} --json -o ${outputPath}`;
+    await $({
+      // When the scan takes too long it's not a temporary issue, it's deterministic for this specific repository, so it's better to skip it
+      // For example with https://forge.aeif.fr/edupyter/EDUPYTER310 there is no big file, but around ~3000 files to analyze,
+      // and the scan seemed stuck forever, until resulting in an OOM, or a JSON file so big that `readFile` was throwing `RangeError: Invalid string length`, which is the maximum length a string can contain
+      // Ref: https://github.com/semgrep/semgrep/issues/9469#issuecomment-2007687541
+      timeout: minutesToMilliseconds(2),
+    })`semgrep --metrics=off --no-git-ignore --config ${codeAnalysisRulesPath} ${folderPath} --json -o ${outputPath}`;
 
     const codeAnalysisDataString = await fs.readFile(outputPath, 'utf-8');
     const codeAnalysisDataObject = JSON.parse(codeAnalysisDataString);
@@ -55,8 +62,16 @@ export async function analyzeWithSemgrep(folderPath: string, outputPath: string)
       }
     }
   } catch (error) {
+    // We allow silencing the error if it's related to the repository itself, so we can still extract valuable information with bibliothecary
     let acceptableError: boolean = false;
-    if (fsSync.existsSync(outputPath)) {
+
+    // Make sure it's a formatted execa error (ref: https://github.com/sindresorhus/execa/issues/909)
+    // Note: for a timeout `error.timedOut` can be true, but sometimes it reports being killed (`error.killed`)... so we just check the `error.failed` flag
+    if (error instanceof Error && !!(error as any).failed && !!(error as any).shortMessage) {
+      console.log(`semgrep analysis skipped since it has reached the defined timeout limit`);
+
+      acceptableError = true;
+    } else if (fsSync.existsSync(outputPath)) {
       console.log(`the details of the semgrep error can be read into ${outputPath}, but pushing it on Sentry too`);
 
       const codeAnalysisDataString = await fs.readFile(outputPath, 'utf-8');
@@ -70,7 +85,6 @@ export async function analyzeWithSemgrep(folderPath: string, outputPath: string)
       })
     ) {
       // This means the project is way too stuffed to be analyzed by semgrep
-      // We silent the error since in any way we could extract valuable information, we consider this as acceptable
       console.log(`semgrep analysis skipped due to the project being too big`);
 
       acceptableError = true;
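-- 
For reference, a minimal standalone sketch of the timeout-and-skip pattern this
patch introduces, assuming execa v8 (whose `$` accepts an options object) and
date-fns v3; the three paths below are placeholders for illustration, not values
from the repository:

import { minutesToMilliseconds } from 'date-fns/minutesToMilliseconds';
import { $ } from 'execa';

// Placeholder inputs, not real paths from the project
const codeAnalysisRulesPath = '/tmp/semgrep-rules.yml';
const folderPath = '/tmp/repository-to-scan';
const outputPath = '/tmp/code-analysis.json';

try {
  // Kill the semgrep process if it runs longer than 2 minutes
  await $({
    timeout: minutesToMilliseconds(2),
  })`semgrep --metrics=off --no-git-ignore --config ${codeAnalysisRulesPath} ${folderPath} --json -o ${outputPath}`;
} catch (error) {
  // execa rejects with an Error exposing `failed`, `shortMessage`, `timedOut`, `killed`...
  // As in the patch, `failed` + `shortMessage` are checked instead of `timedOut`,
  // since a timeout sometimes surfaces as `killed` rather than `timedOut`
  if (error instanceof Error && !!(error as any).failed && !!(error as any).shortMessage) {
    console.log(`semgrep analysis skipped since it has reached the defined timeout limit`);
  } else {
    throw error;
  }
}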