Jackson57279 · Jackson57279 · Aug 12, 2025 · Aug 12, 2025 · coderabbitai · Aug 12, 2025
diff --git a/src/lib/search-service.ts b/src/lib/search-service.ts
@@ -316,9 +316,16 @@
   }
 
   private extractTextContent(html: string): string {
-    return html
-      .replace(/<script[^>]*>.*?<\/script>/gi, '')
-      .replace(/<style[^>]*>.*?<\/style>/gi, '')
+    // Repeatedly remove <script> and <style> tags and their content
+    let sanitized = html;
+    let previous;
-    let previous;
+    let previous: string | null = null;
-    let previous;
+    let previous: string | null = null;
+    do {
+      previous = sanitized;
+      sanitized = sanitized
+        .replace(/<script[^>]*>.*?<\/script>/gis, '')
@@ -316,12 +316,22 @@
  }
  private extractTextContent(html: string): string {
-    return html
-      .replace(/<script[^>]*>.*?<\/script>/gi, '')
-      .replace(/<style[^>]*>.*?<\/style>/gi, '')
-      .replace(/<[^>]*>/g, ' ')
-      .replace(/\s+/g, ' ')
-      .trim();
+    let sanitized = html;
+    let previous;
+    // Remove all <script> tags and their content repeatedly
+    do {
+      previous = sanitized;
+      sanitized = sanitized.replace(/<script[^>]*>.*?<\/script>/gis, '');
+    } while (sanitized !== previous);
+    // Remove all <style> tags and their content repeatedly
+    do {
+      previous = sanitized;
+      sanitized = sanitized.replace(/<style[^>]*>.*?<\/style>/gis, '');
+    } while (sanitized !== previous);
+    // Remove all remaining HTML tags
+    sanitized = sanitized.replace(/<[^>]*>/g, ' ');
+    sanitized = sanitized.replace(/\s+/g, ' ').trim();
+    return sanitized;
  }
 }
@@ -316,12 +316,22 @@
  }

  private extractTextContent(html: string): string {
-    return html
-      .replace(/<script[^>]*>.*?<\/script>/gi, '')
-      .replace(/<style[^>]*>.*?<\/style>/gi, '')
-      .replace(/<[^>]*>/g, ' ')
-      .replace(/\s+/g, ' ')
-      .trim();
+    let sanitized = html;
+    let previous;
+    // Remove all <script> tags and their content repeatedly
+    do {
+      previous = sanitized;
+      sanitized = sanitized.replace(/<script[^>]*>.*?<\/script>/gis, '');
+    } while (sanitized !== previous);
+    // Remove all <style> tags and their content repeatedly
+    do {
+      previous = sanitized;
+      sanitized = sanitized.replace(/<style[^>]*>.*?<\/style>/gis, '');
+    } while (sanitized !== previous);
+    // Remove all remaining HTML tags
+    sanitized = sanitized.replace(/<[^>]*>/g, ' ');
+    sanitized = sanitized.replace(/\s+/g, ' ').trim();
+    return sanitized;
  }
 }

+        .replace(/<style[^>]*>.*?<\/style>/gis, '');
+    } while (sanitized !== previous);
+    return sanitized
-    // Repeatedly remove <script> and <style> tags and their content
-    let sanitized = html;
-    let previous;
-    do {
-      previous = sanitized;
-      sanitized = sanitized
-        .replace(/<script[^>]*>.*?<\/script>/gis, '')
-        .replace(/<style[^>]*>.*?<\/style>/gis, '');
-    } while (sanitized !== previous);
-    return sanitized
+   private extractTextContent(html: string): string {
+     // Prefer DOM parsing for robust removal of script/style content
+     if (typeof DOMParser !== 'undefined') {
+       const doc = new DOMParser().parseFromString(html, 'text/html');
+       doc.querySelectorAll('script, style').forEach(n => n.remove());
+       const text = doc.body?.textContent ?? '';
+       return text.replace(/\s+/g, ' ').trim();
+     }
+     // Fallback to iterative regex removal in non-browser environments
+     let sanitized = html;
+     let previous: string;
+     do {
+       previous = sanitized;
+       sanitized = sanitized
+         .replace(/<script\b[^>]*>[\s\S]*?<\/script\s*>/gi, '')
+         .replace(/<style\b[^>]*>[\s\S]*?<\/style\s*>/gi, '');
+     } while (sanitized !== previous);
+     sanitized = sanitized.replace(/<[^>]*>/g, ' ');
+     return sanitized.replace(/\s+/g, ' ').trim();
+   }
-    // Repeatedly remove <script> and <style> tags and their content
-    let sanitized = html;
-    let previous;
-    do {
-      previous = sanitized;
-      sanitized = sanitized
-        .replace(/<script[^>]*>.*?<\/script>/gis, '')
-        .replace(/<style[^>]*>.*?<\/style>/gis, '');
-    } while (sanitized !== previous);
-    return sanitized
+   private extractTextContent(html: string): string {
+     // Prefer DOM parsing for robust removal of script/style content
+     if (typeof DOMParser !== 'undefined') {
+       const doc = new DOMParser().parseFromString(html, 'text/html');
+       doc.querySelectorAll('script, style').forEach(n => n.remove());
+       const text = doc.body?.textContent ?? '';
+       return text.replace(/\s+/g, ' ').trim();
+     }
+     // Fallback to iterative regex removal in non-browser environments
+     let sanitized = html;
+     let previous: string;
+     do {
+       previous = sanitized;
+       sanitized = sanitized
+         .replace(/<script\b[^>]*>[\s\S]*?<\/script\s*>/gi, '')
+         .replace(/<style\b[^>]*>[\s\S]*?<\/style\s*>/gi, '');
+     } while (sanitized !== previous);
+     sanitized = sanitized.replace(/<[^>]*>/g, ' ');
+     return sanitized.replace(/\s+/g, ' ').trim();
+   }
       .replace(/<[^>]*>/g, ' ')
       .replace(/\s+/g, ' ')
       .trim();