LeonMusCoden · LeonMusCoden · Aug 7, 2025 · Aug 7, 2025
diff --git a/backend/app/services/email_processor.py b/backend/app/services/email_processor.py
@@ -1,10 +1,12 @@
 import email
-import html
 import imaplib
+import quopri
 from email.header import decode_header, make_header
 from email.message import Message
 
-import trafilatura
+import nh3
+from bs4 import BeautifulSoup
+from readability import Document
 from sqlalchemy.orm import Session
 
 from app.core.logging import get_logger
@@ -62,31 +64,94 @@ def _fetch_unread_email_ids(mail: imaplib.IMAP4_SSL) -> list[str]:
 
 
 def _get_email_body(msg: Message) -> str:
-    """Extract body from an email message."""
-    body = ""
+    """Extract the HTML body from an email message, falling back to plain text."""
+    html_body = ""
+    text_body = ""
     for part in msg.walk():
         ctype = part.get_content_type()
         cdispo = str(part.get("Content-Disposition"))
         if "attachment" in cdispo:
             continue
-        if ctype in ["text/plain", "text/html"]:
+
+        if ctype == "text/html":
+            try:
+                payload = part.get_payload(decode=True)
+                charset = part.get_content_charset() or "utf-8"
+                html_body = payload.decode(charset, "ignore")
+            except Exception:
+                pass
+        elif ctype == "text/plain":
             try:
                 payload = part.get_payload(decode=True)
                 charset = part.get_content_charset() or "utf-8"
-                body = payload.decode(charset, "ignore")
+                text_body = payload.decode(charset, "ignore")
             except Exception:
                 pass
-    return html.unescape(body)
+
+    # Prefer HTML body, but fall back to plain text if HTML is empty
+    return html_body or text_body
+
+
+def _extract_and_clean_html(raw_html_content: str) -> dict[str, str]:
+    """Extract the main article from newsletter HTML and sanitize it.
+
+    The input must already be transfer-decoded (get_payload(decode=True) does
+    this); re-running quopri would corrupt literal "=XX" runs in the markup.
+    """
+    if not raw_html_content.strip():  # empty input: nothing to extract
+        return {"title": "Newsletter", "body": ""}
+
+    doc = Document(raw_html_content)
+    extracted_body = doc.summary(html_partial=True)
+    ALLOWED_TAGS = {
+        "p",
+        "strong",
+        "em",
+        "u",
+        "h3",
+        "h4",
+        "ul",
+        "ol",
+        "li",
+        "a",
+        "img",
+        "br",
+        "div",
+        "span",
+        "figure",
+        "figcaption",
+    }
+    ALLOWED_ATTRIBUTES = {
+        "a": {"href", "title"},
+        "img": {"src", "alt", "width", "height"},
+        "*": {"style"},
+    }
+    cleaned_body = nh3.clean(
+        extracted_body, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES
+    )
+
+    title = doc.title()
+    if not title or title in ("no-title", "[no-title]"):
+        # h1/h2 are not in ALLOWED_TAGS, so look in the pre-sanitized markup.
+        soup = BeautifulSoup(extracted_body, "html.parser")
+        first_headline = soup.find(["h1", "h2", "h3"])
+        title = first_headline.get_text(strip=True) if first_headline else "Newsletter"
+
+    return {"title": title, "body": cleaned_body}
 
 
 def _auto_add_newsletter(
-    db: Session, sender: str, msg: Message, settings: Settings
+    db: Session,
+    sender: str,
+    msg: Message,
+    settings: Settings,
 ) -> Newsletter:
     """Automatically add a new newsletter."""
     logger.info(f"Auto-adding new newsletter for sender: {sender}")
     newsletter_name = email.utils.parseaddr(msg["From"])[0] or sender
     new_newsletter_schema = NewsletterCreate(
-        name=newsletter_name, sender_emails=[sender]
+        name=newsletter_name,
+        sender_emails=[sender],
     )
     return create_newsletter(db, new_newsletter_schema)
 
@@ -129,14 +194,15 @@ def _process_single_email(
         return
 
     subject = str(make_header(decode_header(msg["Subject"])))
-    final_body = _get_email_body(msg)
+    body = _get_email_body(msg)
 
     if newsletter.extract_content:
-        extracted_body = trafilatura.extract(final_body)
-        if extracted_body:
-            final_body = extracted_body
+        cleaned_data = _extract_and_clean_html(body)
+        # The subject from the email itself is often better than what readability extracts
+        # so we only override the body.
+        body = cleaned_data["body"]
 
-    entry_schema = EntryCreate(subject=subject, body=final_body, message_id=message_id)
+    entry_schema = EntryCreate(subject=subject, body=body, message_id=message_id)
     new_entry = create_entry(db, entry_schema, newsletter.id)
 
     if not new_entry:

diff --git a/backend/app/tests/test_core.py b/backend/app/tests/test_core.py
@@ -70,7 +70,7 @@ def test_process_emails(mock_imap, db_session: Session):
     mock_mail.search.return_value = ("OK", [b"1"])
 
     # Mock email content
-    mock_msg_bytes = b"From: newsletter@example.com\nSubject: Test Subject\nMessage-ID: <test@test.com>\n\nTest Body"
+    mock_msg_bytes = b"From: newsletter@example.com\nSubject: Test Subject\nMessage-ID: <test@test.com>\n\n<p>Test Body</p>"
     mock_mail.fetch.return_value = ("OK", [(None, mock_msg_bytes)])
 
     process_emails(db_session)
@@ -95,7 +95,7 @@ def test_process_emails(mock_imap, db_session: Session):
     entries = get_entries_by_newsletter(db_session, newsletters[0].id)
     assert len(entries) == 1
     assert entries[0].subject == "Test Subject"
-    assert entries[0].body == "Test Body"
+    assert entries[0].body == "<p>Test Body</p>"
 
 
 @patch("app.core.scheduler.job")

diff --git a/backend/app/tests/test_email_processor.py b/backend/app/tests/test_email_processor.py
@@ -26,6 +26,7 @@ def _setup_test_email_processing(
     msg["From"] = newsletter_create_data.sender_emails[0]
     msg["Subject"] = "Test Email"
     msg["Message-ID"] = "<test-message-id>"
+    msg.set_payload("<html><body><p>Original Body</p></body></html>", "utf-8")
     mock_mail.fetch.return_value = ("OK", [(b"1 (RFC822)", msg.as_bytes())])
 
     return mock_mail, newsletter, settings
@@ -83,13 +84,17 @@ def test_process_single_email_with_global_move_folder(db_session: Session):
     mock_mail.store.assert_any_call("1", "+FLAGS", "\\Deleted")
 
 
-@patch("app.services.email_processor.trafilatura.extract")
+@patch("app.services.email_processor._extract_and_clean_html")
 def test_process_single_email_with_content_extraction(
-    mock_trafilatura, db_session: Session
+    mock_extract_clean,
+    db_session: Session,
 ):
-    """Test that trafilatura is called when extract_content is True."""
+    """Test that the cleaning function is called when extract_content is True."""
     # 1. ARRANGE
-    mock_trafilatura.return_value = "Extracted Body"
+    mock_extract_clean.return_value = {
+        "title": "Extracted Title",
+        "body": "Extracted Body",
+    }
     settings_data = SettingsCreate(
         imap_server="test.com", imap_username="test", imap_password="password"
     )
@@ -108,8 +113,10 @@ def test_process_single_email_with_content_extraction(
         _process_single_email("1", mock_mail, db_session, sender_map, settings)
 
     # 3. ASSERT
-    mock_trafilatura.assert_called_once()
+    mock_extract_clean.assert_called_once()
     # Check that create_entry was called with the extracted body
     mock_create_entry.assert_called_once()
     entry_create_arg = mock_create_entry.call_args[0][1]
     assert entry_create_arg.body == "Extracted Body"
+    # Subject should still come from the email, not the extracted title
+    assert entry_create_arg.subject == "Test Email"
diff --git a/backend/pyproject.toml b/backend/pyproject.toml
@@ -8,17 +8,19 @@ dependencies = [
     "alembic>=1.16.4",
     "apscheduler>=3.11.0",
     "bcrypt>=4.3.0",
+    "beautifulsoup4>=4.13.4",
     "fastapi>=0.116.0",
     "feedgen>=1.0.0",
     "nanoid>=2.0.0",
+    "nh3>=0.3.0",
     "passlib>=1.7.4",
     "pydantic-settings>=2.10.1",
     "pydantic[email]>=2.11.7",
     "python-dotenv>=1.1.1",
     "python-jose[cryptography]>=3.5.0",
     "python-multipart>=0.0.20",
+    "readability-lxml>=0.8.4.1",
     "sqlalchemy>=2.0.41",
-    "trafilatura>=1.10.0",
     "uvicorn>=0.35.0",
 ]