chore(semgrep): allow limiting the per-file memory during the scan
sneko committed Mar 19, 2024
1 parent af019d0 commit d823bc0
Showing 4 changed files with 6 additions and 1 deletion.
1 change: 1 addition & 0 deletions .env.model
@@ -5,6 +5,7 @@ LLM_MANAGER_MOCK=false
LLM_MANAGER_MAXIMUM_API_REQUESTS_PER_SECOND=5
MAINTENANCE_API_KEY=
CHROMIUM_MAXIMUM_CONCURRENCY=1
SEMGREP_PER_FILE_MEMORY_LIMIT_IN_MB=0
NEXT_PUBLIC_CRISP_WEBSITE_ID=
NEXT_PUBLIC_SENTRY_DSN=
NEXT_PUBLIC_MATOMO_URL=
1 change: 1 addition & 0 deletions .env.test
@@ -4,6 +4,7 @@ LLM_MANAGER_MOCK=true
LLM_MANAGER_MAXIMUM_API_REQUESTS_PER_SECOND=5
MAINTENANCE_API_KEY=random
CHROMIUM_MAXIMUM_CONCURRENCY=1
SEMGREP_PER_FILE_MEMORY_LIMIT_IN_MB=0
NEXT_PUBLIC_CRISP_WEBSITE_ID=random-one-since-cannot-work-without-a-remote-crisp-account
NEXT_PUBLIC_SENTRY_DSN=
NEXT_PUBLIC_MATOMO_URL=
1 change: 1 addition & 0 deletions README.md
@@ -196,6 +196,7 @@ For each build and runtime (since they are shared), you should have set some env
- `MISTRAL_API_KEY`: [SECRET] _(you can create an API key from your MistralAI "La plateforme" account)_
- `LLM_MANAGER_MAXIMUM_API_REQUESTS_PER_SECOND`: [TO_DEFINE] _(by default the MistralAI platform has a limit of `5` requests per second, but they may increase this limit on demand. If so, you can raise the rate limit here to parallelize the underlying requests)_
- `CHROMIUM_MAXIMUM_CONCURRENCY`: [TO_DEFINE] _(by default it will be `1`, but analyzing thousands of websites through headless Chromium takes a long time. After some testing we think `4` is fine on Clever Cloud for the `S` plan (and `8` for the `XL` plan when doing a quick test to speed things up); locally it will depend on your hardware. Consider lowering the value if more than 10% of analyses time out)_
- `SEMGREP_PER_FILE_MEMORY_LIMIT_IN_MB`: [TO_DEFINE] _(Semgrep will skip a file and move on to the next one if analyzing it exceeds this limit; we advise setting it to about 50% of the server's memory capacity, as in the example after this list)_
- `NEXT_PUBLIC_APP_BASE_URL`: [TO_DEFINE] _(must be the root URL to access the application, format `https://xxx.yyy.zzz`)_
- `NEXT_PUBLIC_CRISP_WEBSITE_ID`: [TO_DEFINE] _(this ID is defined in your Crisp account and depends on the development or production environment)_
- `NEXT_PUBLIC_SENTRY_DSN`: [SECRET] _(format `https://xxx.yyy.zzz/nn`)_
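
A minimal illustration of the sizing advice for `SEMGREP_PER_FILE_MEMORY_LIMIT_IN_MB`, assuming a hypothetical server with 4 GB of RAM (the value below is only an example, not a project recommendation):

```
# Hypothetical sizing: ~50% of a 4 GB server; 0 means unlimited
SEMGREP_PER_FILE_MEMORY_LIMIT_IN_MB=2048
```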
4 changes: 3 additions & 1 deletion src/semgrep/index.ts
@@ -9,6 +9,7 @@ import path from 'path';
import { SemgrepResultSchema } from '@etabli/src/semgrep';

const __root_dirname = process.cwd();
const semgrepMemoryLimitPerFile = process.env.SEMGREP_PER_FILE_MEMORY_LIMIT_IN_MB ? parseInt(process.env.SEMGREP_PER_FILE_MEMORY_LIMIT_IN_MB, 10) : 0; // Default (0) is unlimited

export interface AnalysisResult {
functions: string[];
@@ -40,13 +41,14 @@ export async function analyzeWithSemgrep(folderPath: string, outputPath: string)

try {
// `--no-git-ignore` is required since the `.semgrepignore` is not taken into account with absolute paths (ref: https://github.com/semgrep/semgrep/issues/9960)
// `--max-memory` is scoped to the analysis of each file: Semgrep skips the file and moves on if the limit is reached. Size it according to the server capacity, but keep in mind that with concurrency we could still exceed the total available memory (if so, put a semaphore around this instruction)
await $({
// When it takes too much time it's not temporary; it's always the case for this specific repository, so it's better to skip it
// For example with https://forge.aeif.fr/edupyter/EDUPYTER310 there is no big file, but there are around ~3000 files to analyze
// and it was stuck seemingly forever, until resulting in an OOM, or a JSON file so big that `readFile` threw `RangeError: Invalid string length`, which is the maximum length a string can contain
// Ref: https://github.com/semgrep/semgrep/issues/9469#issuecomment-2007687541
timeout: minutesToMilliseconds(2),
})`semgrep --metrics=off --no-git-ignore --config ${codeAnalysisRulesPath} ${folderPath} --json -o ${outputPath}`;
})`semgrep --metrics=off --no-git-ignore --max-memory ${semgrepMemoryLimitPerFile} --config ${codeAnalysisRulesPath} ${folderPath} --json -o ${outputPath}`;

const codeAnalysisDataString = await fs.readFile(outputPath, 'utf-8');
const codeAnalysisDataObject = JSON.parse(codeAnalysisDataString);
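
The `--max-memory` comment above suggests putting a semaphore around the call when concurrency could push total memory past the server's capacity. A minimal sketch of that idea, assuming a hand-rolled semaphore (the `Semaphore` class, the concurrency value of `2`, and the `analyzeWithSemgrepGuarded` wrapper are illustrative, not part of the repository):

```ts
// Illustrative sketch: bound how many semgrep processes run at once so that
// concurrency * SEMGREP_PER_FILE_MEMORY_LIMIT_IN_MB stays below total server memory.
class Semaphore {
  private queue: Array<() => void> = [];
  private available: number;

  constructor(count: number) {
    this.available = count;
  }

  async acquire(): Promise<void> {
    if (this.available > 0) {
      this.available -= 1;
      return;
    }
    // No slot free: wait until a release hands one over
    await new Promise<void>((resolve) => this.queue.push(() => resolve()));
  }

  release(): void {
    const next = this.queue.shift();
    if (next) {
      next(); // pass the slot directly to the next waiter
    } else {
      this.available += 1;
    }
  }
}

// Hypothetical cap: 2 concurrent semgrep runs
const semgrepSemaphore = new Semaphore(2);

export async function analyzeWithSemgrepGuarded(folderPath: string, outputPath: string) {
  await semgrepSemaphore.acquire();
  try {
    // `analyzeWithSemgrep` is the function shown in the diff above
    return await analyzeWithSemgrep(folderPath, outputPath);
  } finally {
    semgrepSemaphore.release();
  }
}
```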
