Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 80 additions & 14 deletions backend/app/services/email_processor.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import email
import html
import imaplib
import quopri
from email.header import decode_header, make_header
from email.message import Message

import trafilatura
import nh3
from bs4 import BeautifulSoup
from readability import Document
from sqlalchemy.orm import Session

from app.core.logging import get_logger
Expand Down Expand Up @@ -62,31 +64,94 @@ def _fetch_unread_email_ids(mail: imaplib.IMAP4_SSL) -> list[str]:


def _get_email_body(msg: Message) -> str:
    """Extract the HTML body from an email message, falling back to plain text.

    Every non-attachment ``text/html`` and ``text/plain`` part is decoded
    (transfer encoding first, then charset). If several parts of the same
    type exist, the last one wins.

    Args:
        msg: The parsed email message to read.

    Returns:
        The decoded HTML body if one is present and non-empty, otherwise the
        decoded plain-text body, otherwise an empty string.
    """
    html_body = ""
    text_body = ""

    for part in msg.walk():
        # Attached .html/.txt files are not the message body.
        if "attachment" in str(part.get("Content-Disposition")):
            continue

        ctype = part.get_content_type()
        if ctype not in ("text/html", "text/plain"):
            continue

        payload = part.get_payload(decode=True)
        if not isinstance(payload, bytes):
            # Empty parts yield None rather than bytes; nothing to decode.
            continue

        charset = part.get_content_charset() or "utf-8"
        try:
            decoded = payload.decode(charset, "ignore")
        except LookupError:
            # The declared charset is unknown to Python. Salvage the text as
            # UTF-8 instead of silently dropping the whole body.
            decoded = payload.decode("utf-8", "ignore")

        if ctype == "text/html":
            html_body = decoded
        else:
            text_body = decoded

    # Prefer the HTML body, but fall back to plain text if HTML is empty.
    return html_body or text_body


def _extract_and_clean_html(raw_html_content: str) -> dict[str, str]:
    """Decode, extract, and sanitize newsletter HTML.

    The input is run through a quoted-printable decode pass, reduced to its
    main content with readability, and sanitized with nh3 against a fixed
    allow-list of tags and attributes.

    Args:
        raw_html_content: The HTML (or plain text) body of the email.

    Returns:
        A dict with two keys: ``"title"`` (the document title, else the first
        ``h1``/``h2``/``h3`` headline of the extracted content, else
        ``"Newsletter"``) and ``"body"`` (the sanitized HTML fragment).
    """
    default_title = "Newsletter"

    # Nothing to extract; avoid handing an empty document to the HTML parser.
    if not raw_html_content or not raw_html_content.strip():
        return {"title": default_title, "body": ""}

    # NOTE(review): _get_email_body already transfer-decodes the payload, so
    # this second quoted-printable pass can alter literal "=XX" sequences in
    # already-decoded HTML (e.g. inside URLs). Confirm it is still needed.
    try:
        decoded_bytes = quopri.decodestring(raw_html_content.encode("utf-8"))
        clean_html_str = decoded_bytes.decode("utf-8", "ignore")
    except ValueError:
        # Encoding/decoding failed (e.g. lone surrogates); assume the content
        # is already decoded and use it unchanged.
        clean_html_str = raw_html_content

    doc = Document(clean_html_str)
    extracted_body = doc.summary(html_partial=True)

    ALLOWED_TAGS = {
        "p",
        "strong",
        "em",
        "u",
        "h3",
        "h4",
        "ul",
        "ol",
        "li",
        "a",
        "img",
        "br",
        "div",
        "span",
        "figure",
        "figcaption",
    }
    ALLOWED_ATTRIBUTES = {
        "a": {"href", "title"},
        "img": {"src", "alt", "width", "height"},
        "*": {"style"},
    }
    cleaned_body = nh3.clean(
        extracted_body, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES
    )

    title = doc.title()
    # readability reports a missing <title> as "[no-title]"; the bare
    # "no-title" spelling is kept for backward compatibility.
    if not title or title in ("no-title", "[no-title]"):
        # Search the pre-sanitization extract: h1/h2 are not in ALLOWED_TAGS,
        # so they are no longer present as elements in cleaned_body.
        soup = BeautifulSoup(extracted_body, "html.parser")
        first_headline = soup.find(["h1", "h2", "h3"])
        headline_text = first_headline.get_text(strip=True) if first_headline else ""
        title = headline_text or default_title

    return {"title": title, "body": cleaned_body}


def _auto_add_newsletter(
    db: Session,
    sender: str,
    msg: Message,
    settings: Settings,
) -> Newsletter:
    """Automatically add a new newsletter for a sender.

    The newsletter is named after the display name in the message's ``From``
    header, decoded from RFC 2047 encoded-word form the same way the subject
    is decoded elsewhere in this module. If the header is missing or carries
    no display name, the sender address is used as the name.

    Args:
        db: Database session passed through to ``create_newsletter``.
        sender: Sender email address; becomes the newsletter's only sender
            and the fallback name.
        msg: The email message whose ``From`` header supplies the name.
        settings: Accepted for interface compatibility; not read here.

    Returns:
        Whatever ``create_newsletter`` returns for the new newsletter.
    """
    # Local import keeps this block self-contained without touching the
    # module's import section.
    from email.errors import HeaderParseError

    logger.info(f"Auto-adding new newsletter for sender: {sender}")

    from_header = msg["From"]
    # A missing From header yields None; parse an empty string instead.
    raw_from = str(from_header) if from_header is not None else ""
    raw_name = email.utils.parseaddr(raw_from)[0]

    try:
        # Parse first, then decode: decoding first could introduce characters
        # that confuse address parsing.
        display_name = str(make_header(decode_header(raw_name)))
    except (HeaderParseError, LookupError, ValueError):
        # Malformed encoded-word or unknown charset: keep the raw name.
        display_name = raw_name

    newsletter_name = display_name or sender
    new_newsletter_schema = NewsletterCreate(
        name=newsletter_name,
        sender_emails=[sender],
    )
    return create_newsletter(db, new_newsletter_schema)

Expand Down Expand Up @@ -129,14 +194,15 @@ def _process_single_email(
return

subject = str(make_header(decode_header(msg["Subject"])))
final_body = _get_email_body(msg)
body = _get_email_body(msg)

if newsletter.extract_content:
extracted_body = trafilatura.extract(final_body)
if extracted_body:
final_body = extracted_body
cleaned_data = _extract_and_clean_html(body)
# The subject from the email itself is often better than what readability extracts
# so we only override the body.
body = cleaned_data["body"]

entry_schema = EntryCreate(subject=subject, body=final_body, message_id=message_id)
entry_schema = EntryCreate(subject=subject, body=body, message_id=message_id)
new_entry = create_entry(db, entry_schema, newsletter.id)

if not new_entry:
Expand Down
4 changes: 2 additions & 2 deletions backend/app/tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def test_process_emails(mock_imap, db_session: Session):
mock_mail.search.return_value = ("OK", [b"1"])

# Mock email content
mock_msg_bytes = b"From: newsletter@example.com\nSubject: Test Subject\nMessage-ID: <test@test.com>\n\nTest Body"
mock_msg_bytes = b"From: newsletter@example.com\nSubject: Test Subject\nMessage-ID: <test@test.com>\n\n<p>Test Body</p>"
mock_mail.fetch.return_value = ("OK", [(None, mock_msg_bytes)])

process_emails(db_session)
Expand All @@ -95,7 +95,7 @@ def test_process_emails(mock_imap, db_session: Session):
entries = get_entries_by_newsletter(db_session, newsletters[0].id)
assert len(entries) == 1
assert entries[0].subject == "Test Subject"
assert entries[0].body == "Test Body"
assert entries[0].body == "<p>Test Body</p>"


@patch("app.core.scheduler.job")
Expand Down
17 changes: 12 additions & 5 deletions backend/app/tests/test_email_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ def _setup_test_email_processing(
msg["From"] = newsletter_create_data.sender_emails[0]
msg["Subject"] = "Test Email"
msg["Message-ID"] = "<test-message-id>"
msg.set_payload("<html><body><p>Original Body</p></body></html>", "utf-8")
mock_mail.fetch.return_value = ("OK", [(b"1 (RFC822)", msg.as_bytes())])

return mock_mail, newsletter, settings
Expand Down Expand Up @@ -83,13 +84,17 @@ def test_process_single_email_with_global_move_folder(db_session: Session):
mock_mail.store.assert_any_call("1", "+FLAGS", "\\Deleted")


@patch("app.services.email_processor.trafilatura.extract")
@patch("app.services.email_processor._extract_and_clean_html")
def test_process_single_email_with_content_extraction(
mock_trafilatura, db_session: Session
mock_extract_clean,
db_session: Session,
):
"""Test that trafilatura is called when extract_content is True."""
"""Test that the cleaning function is called when extract_content is True."""
# 1. ARRANGE
mock_trafilatura.return_value = "Extracted Body"
mock_extract_clean.return_value = {
"title": "Extracted Title",
"body": "Extracted Body",
}
settings_data = SettingsCreate(
imap_server="test.com", imap_username="test", imap_password="password"
)
Expand All @@ -108,8 +113,10 @@ def test_process_single_email_with_content_extraction(
_process_single_email("1", mock_mail, db_session, sender_map, settings)

# 3. ASSERT
mock_trafilatura.assert_called_once()
mock_extract_clean.assert_called_once()
# Check that create_entry was called with the extracted body
mock_create_entry.assert_called_once()
entry_create_arg = mock_create_entry.call_args[0][1]
assert entry_create_arg.body == "Extracted Body"
# Subject should still come from the email, not the extracted title
assert entry_create_arg.subject == "Test Email"
4 changes: 3 additions & 1 deletion backend/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,19 @@ dependencies = [
"alembic>=1.16.4",
"apscheduler>=3.11.0",
"bcrypt>=4.3.0",
"beautifulsoup4>=4.13.4",
"fastapi>=0.116.0",
"feedgen>=1.0.0",
"nanoid>=2.0.0",
"nh3>=0.3.0",
"passlib>=1.7.4",
"pydantic-settings>=2.10.1",
"pydantic[email]>=2.11.7",
"python-dotenv>=1.1.1",
"python-jose[cryptography]>=3.5.0",
"python-multipart>=0.0.20",
"readability-lxml>=0.8.4.1",
"sqlalchemy>=2.0.41",
"trafilatura>=1.10.0",
"uvicorn>=0.35.0",
]

Expand Down
Loading