feed refactor and processing (#37)
* begin feed refactor

* ui updates

* keep icon url

* debounce feed invalidation

* sanitize fields

* rm id [skip ci]

* add more fields

* add line [skip ci]

* fetch column

* fix favicon url

* save favicon and render

* more required fields

* rename module

* add saved to stats

* count saved in total

* 404 styles

* improve styles and rm local entry store

* debounce stats invalidation

* rename classes

* render html and display title

* rename query state hook

* scrape wip

* add scraped_at field

* libxml

* sudo

* initial readability port

* add scraping job

* check content_html

* fix check

* trim and retry urls

* ui tweaks

* scrape favicons

* use b64

* readme [skip ci]
zaknesler authored May 26, 2024
1 parent ab370a3 commit ebe8b56
Showing 65 changed files with 3,044 additions and 1,200 deletions.
1,104 changes: 1,037 additions & 67 deletions Cargo.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions Cargo.toml
@@ -22,6 +22,7 @@ thiserror = "1.0"
 tokio = { version = "1.37", features = ["macros", "rt-multi-thread"] }
 tracing = "0.1"
 typeshare = "1.0"
+url = "2.5"
 uuid = { version = "1.8", features = ["v4", "fast-rng", "serde"] }

 [package]
2 changes: 1 addition & 1 deletion README.md
@@ -19,7 +19,7 @@ Demo is available at [blend.zak.fm](https://blend.zak.fm).
 - [x] Fetch metadata and feed entries in background
 - [x] Notifications via websocket
 - [x] Automatic + manual refreshing
-- [ ] Scrape HTML if entries do not contain article content
+- [x] Scrape HTML if entries do not contain article content
 - [ ] Organize feeds into folders
 - [ ] UI options for theme, font, etc.
 - [ ] Keyboard shortcuts
5 changes: 3 additions & 2 deletions crates/blend-db/migrations/20240422000000_feeds.sql
@@ -2,9 +2,10 @@ CREATE TABLE IF NOT EXISTS feeds (
     uuid TEXT PRIMARY KEY NOT NULL,
     id TEXT NOT NULL,
     url_feed TEXT NOT NULL,
-    url_site TEXT,
-    title TEXT,
+    url_site TEXT NOT NULL,
+    title TEXT NOT NULL,
     title_display TEXT,
+    favicon_url TEXT,
     favicon_b64 BLOB,
     published_at DATETIME,
     updated_at DATETIME,
5 changes: 4 additions & 1 deletion crates/blend-db/migrations/20240430000000_entries.sql
@@ -4,12 +4,15 @@ CREATE TABLE IF NOT EXISTS entries (
     id TEXT NOT NULL,
     url TEXT,
     title TEXT,
-    summary TEXT,
+    summary_html TEXT,
     content_html TEXT,
+    content_scraped_html TEXT,
+    media_url TEXT,
     published_at DATETIME,
     updated_at DATETIME,
     read_at DATETIME,
     saved_at DATETIME,
+    scraped_at DATETIME,
     UNIQUE(feed_uuid, id),
     CONSTRAINT fk_feed FOREIGN KEY (feed_uuid) REFERENCES feeds(uuid) ON DELETE CASCADE
 );
15 changes: 0 additions & 15 deletions crates/blend-db/migrations/20240504000000_feeds_stats.sql

This file was deleted.

5 changes: 4 additions & 1 deletion crates/blend-db/src/model/entry.rs
@@ -12,12 +12,15 @@ pub struct Entry {
     pub id: String,
     pub url: String,
     pub title: Option<String>,
-    pub summary: Option<String>,
+    pub summary_html: Option<String>,
     #[sqlx(default)]
     pub content_html: Option<String>,
+    #[sqlx(default)]
+    pub content_scraped_html: Option<String>,
     pub media_url: Option<String>,
     pub published_at: Option<DateTime<Utc>>,
     pub updated_at: Option<DateTime<Utc>>,
     pub read_at: Option<DateTime<Utc>>,
     pub saved_at: Option<DateTime<Utc>>,
+    pub scraped_at: Option<DateTime<Utc>>,
 }
8 changes: 5 additions & 3 deletions crates/blend-db/src/model/feed.rs
@@ -10,10 +10,11 @@ pub struct Feed {
     pub uuid: Uuid,
     pub id: String,
     pub url_feed: String,
-    pub url_site: Option<String>,
-    pub title: Option<String>,
+    pub url_site: String,
+    pub title: String,
     pub title_display: Option<String>,
-    pub favicon_b64: Option<Vec<u8>>,
+    pub favicon_b64: Option<String>,
+    pub favicon_url: Option<String>,
     pub published_at: Option<DateTime<Utc>>,
     pub updated_at: Option<DateTime<Utc>>,
 }
@@ -24,4 +25,5 @@ pub struct FeedStats {
     pub uuid: Uuid,
     pub count_total: u32,
     pub count_unread: u32,
+    pub count_saved: u32,
 }
41 changes: 36 additions & 5 deletions crates/blend-db/src/repo/entry.rs
@@ -14,8 +14,9 @@ pub struct CreateEntryParams {
     pub id: String,
     pub url: Option<String>,
     pub title: Option<String>,
-    pub summary: Option<String>,
+    pub summary_html: Option<String>,
     pub content_html: Option<String>,
+    pub media_url: Option<String>,
     pub published_at: Option<DateTime<Utc>>,
     pub updated_at: Option<DateTime<Utc>>,
 }
@@ -86,7 +87,7 @@ impl EntryRepo {
         let el = filter.sort.query_elements();
         let el_inv = filter.sort.query_elements_inverse();

-        let mut query = QueryBuilder::<Sqlite>::new("SELECT uuid, feed_uuid, id, url, title, summary, published_at, updated_at, read_at FROM entries WHERE 1=1");
+        let mut query = QueryBuilder::<Sqlite>::new("SELECT uuid, feed_uuid, id, url, title, summary_html, media_url, published_at, updated_at, read_at, saved_at, scraped_at FROM entries WHERE 1=1");

         match filter.view {
             View::All => query.push(""),
@@ -146,6 +147,17 @@ impl EntryRepo {
             .map_err(|err| err.into())
     }

+    pub async fn get_entries_to_scrape(
+        &self,
+        feed_uuid: &uuid::Uuid,
+    ) -> DbResult<Vec<model::Entry>> {
+        sqlx::query_as::<_, model::Entry>("SELECT * FROM entries WHERE feed_uuid = ?1 AND content_html IS NULL AND content_scraped_html IS NULL AND scraped_at IS NULL")
+            .bind(feed_uuid)
+            .fetch_all(&self.db)
+            .await
+            .map_err(|err| err.into())
+    }
+
     pub async fn update_entry_as_read(&self, entry_uuid: &uuid::Uuid) -> DbResult<bool> {
         let rows_affected = sqlx::query("UPDATE entries SET read_at = ?1 WHERE uuid = ?2")
             .bind(Utc::now())
@@ -176,15 +188,16 @@ impl EntryRepo {
             return Ok(vec![]);
         }

-        let mut query = QueryBuilder::<Sqlite>::new("INSERT INTO entries (feed_uuid, uuid, id, url, title, summary, content_html, published_at, updated_at) ");
+        let mut query = QueryBuilder::<Sqlite>::new("INSERT INTO entries (feed_uuid, uuid, id, url, title, summary_html, content_html, media_url, published_at, updated_at) ");
         query.push_values(entries.iter(), |mut b, entry| {
             b.push_bind(feed_uuid)
                 .push_bind(uuid::Uuid::new_v4())
                 .push_bind(entry.id.clone())
                 .push_bind(entry.url.clone())
                 .push_bind(entry.title.clone())
-                .push_bind(entry.summary.clone())
+                .push_bind(entry.summary_html.clone())
                 .push_bind(entry.content_html.clone())
+                .push_bind(entry.media_url.clone())
                 .push_bind(entry.published_at)
                 .push_bind(entry.updated_at);
         });
@@ -194,7 +207,7 @@
             DO UPDATE SET
                 url = excluded.url,
                 title = excluded.title,
-                summary = excluded.summary,
+                summary_html = excluded.summary_html,
                 content_html = excluded.content_html,
                 updated_at = excluded.updated_at
             RETURNING uuid
@@ -209,4 +222,22 @@
             .map(|row| row.try_get("uuid").map_err(|err| err.into()))
             .collect::<DbResult<Vec<uuid::Uuid>>>()
     }
+
+    pub async fn update_scraped_entry(
+        &self,
+        entry_uuid: &uuid::Uuid,
+        content_scraped_html: Option<String>,
+    ) -> DbResult<bool> {
+        let rows_affected = sqlx::query(
+            "UPDATE entries SET content_scraped_html = ?1, scraped_at = ?2 WHERE uuid = ?3",
+        )
+        .bind(content_scraped_html)
+        .bind(Utc::now())
+        .bind(entry_uuid)
+        .execute(&self.db)
+        .await?
+        .rows_affected();
+
+        Ok(rows_affected > 0)
+    }
 }
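Taken together, `get_entries_to_scrape` and `update_scraped_entry` are the read and write halves of the new scraping job. A minimal sketch of how a job could drive them, assuming the repo path from this crate's layout; `scrape_article` is a hypothetical stand-in for the readability port, not part of this diff:

```rust
use blend_db::repo::entry::EntryRepo; // module path assumed from the crate layout

// Hypothetical stand-in for the readability port added in this PR;
// returns sanitized article HTML, or None if nothing could be extracted.
async fn scrape_article(url: &str) -> Option<String> {
    let _ = url;
    None
}

// Sketch of the background job: scrape every entry that has no inline
// content and no prior attempt, then record the result per entry.
pub async fn scrape_feed_entries(
    repo: &EntryRepo,
    feed_uuid: &uuid::Uuid,
) -> Result<(), Box<dyn std::error::Error>> {
    for entry in repo.get_entries_to_scrape(feed_uuid).await? {
        let scraped = scrape_article(&entry.url).await;

        // update_scraped_entry stamps scraped_at even when scraping yields
        // nothing, so the WHERE clause above skips the entry next refresh.
        repo.update_scraped_entry(&entry.uuid, scraped).await?;
    }
    Ok(())
}
```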
51 changes: 43 additions & 8 deletions crates/blend-db/src/repo/feed.rs
@@ -7,8 +7,10 @@ pub struct FeedRepo {

 pub struct CreateFeedParams {
     pub id: String,
-    pub title: Option<String>,
-    pub url_feed: Option<String>,
+    pub title: String,
+    pub url_feed: String,
+    pub url_site: String,
+    pub favicon_url: Option<String>,
     pub published_at: Option<DateTime<Utc>>,
     pub updated_at: Option<DateTime<Utc>>,
 }
@@ -26,10 +28,21 @@ impl FeedRepo {
     }

     pub async fn get_stats(&self) -> DbResult<Vec<model::FeedStats>> {
-        sqlx::query_as::<_, model::FeedStats>("SELECT * from feeds_stats")
-            .fetch_all(&self.db)
-            .await
-            .map_err(|err| err.into())
+        sqlx::query_as::<_, model::FeedStats>(
+            r#"
+            SELECT
+                feeds.uuid,
+                COUNT(entries.uuid) as count_total,
+                COUNT(CASE WHEN entries.read_at IS NULL THEN 1 ELSE NULL END) as count_unread,
+                COUNT(CASE WHEN entries.saved_at IS NOT NULL THEN 1 ELSE NULL END) as count_saved
+            FROM feeds
+            INNER JOIN entries ON feeds.uuid = entries.feed_uuid
+            GROUP BY feeds.uuid
+            "#,
+        )
+        .fetch_all(&self.db)
+        .await
+        .map_err(|err| err.into())
     }

     pub async fn get_feed(&self, feed_uuid: uuid::Uuid) -> DbResult<Option<model::Feed>> {
@@ -43,25 +56,47 @@
     pub async fn create_feed(&self, data: CreateFeedParams) -> DbResult<model::Feed> {
         let feed = sqlx::query_as::<_, model::Feed>(
             r#"
-            INSERT INTO feeds (uuid, id, url_feed, title, published_at, updated_at)
-            VALUES (?1, ?2, ?3, ?4, ?5, ?6)
+            INSERT INTO feeds (uuid, id, url_feed, url_site, title, favicon_url, published_at, updated_at)
+            VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8)
             ON CONFLICT (id)
             DO UPDATE SET
                 url_feed = excluded.url_feed,
+                url_site = excluded.url_site,
                 title = excluded.title,
+                favicon_url = excluded.favicon_url,
                 updated_at = excluded.updated_at
             RETURNING *
             "#,
         )
         .bind(uuid::Uuid::new_v4())
         .bind(data.id)
         .bind(data.url_feed)
+        .bind(data.url_site)
         .bind(data.title)
+        .bind(data.favicon_url)
         .bind(data.published_at)
         .bind(data.updated_at)
         .fetch_one(&self.db)
         .await?;

         Ok(feed)
     }
+
+    pub async fn update_favicon(
+        &self,
+        feed_uuid: &uuid::Uuid,
+        favicon_url: String,
+        favicon_base64: Option<String>,
+    ) -> DbResult<bool> {
+        let rows_affected =
+            sqlx::query("UPDATE feeds SET favicon_url = ?1, favicon_b64 = ?2 WHERE uuid = ?3")
+                .bind(favicon_url)
+                .bind(favicon_base64)
+                .bind(feed_uuid)
+                .execute(&self.db)
+                .await?
+                .rows_affected();
+
+        Ok(rows_affected > 0)
+    }
 }
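A sketch of the caller side of `update_favicon` for the "scrape favicons" / "use b64" commits: fetch the icon, base64-encode it, and persist both pieces. Only `update_favicon` comes from this diff; the `base64` crate usage, function name, and error handling are assumptions.

```rust
use base64::{engine::general_purpose::STANDARD, Engine as _};

// Sketch: download a feed's favicon and persist it as a base64 string
// (favicon_b64 is Option<String> on the model, suited to a data: URI).
pub async fn store_favicon(
    repo: &FeedRepo,
    feed_uuid: &uuid::Uuid,
    favicon_url: String,
) -> Result<bool, Box<dyn std::error::Error>> {
    let bytes = reqwest::get(&favicon_url).await?.bytes().await?;
    let encoded = STANDARD.encode(&bytes);
    Ok(repo.update_favicon(feed_uuid, favicon_url, Some(encoded)).await?)
}
```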
7 changes: 7 additions & 0 deletions crates/blend-feed/Cargo.toml
@@ -7,9 +7,16 @@ repository.workspace = true
 license.workspace = true

 [dependencies]
+ammonia = "4.0"
 chrono = { workspace = true, features = ["serde"] }
 feed-rs = { workspace = true }
 futures = { workspace = true }
+html5ever = "0.27"
+lazy_static = "1.4"
+markup5ever_rcdom = "0.3"
+regex = "1.9"
 reqwest = { workspace = true }
 serde = { workspace = true }
+serde_json = { workspace = true }
 thiserror = { workspace = true }
+url = { workspace = true }
6 changes: 6 additions & 0 deletions crates/blend-feed/src/error.rs
@@ -2,12 +2,18 @@ pub(crate) type FeedResult<T> = Result<T, FeedError>;

 #[derive(thiserror::Error, Debug)]
 pub enum FeedError {
+    #[error("invalid url: {0}")]
+    InvalidUrl(String),
+
     #[error(transparent)]
     Io(#[from] std::io::Error),

     #[error(transparent)]
     RequestError(#[from] reqwest::Error),

+    #[error(transparent)]
+    UrlParseError(#[from] url::ParseError),
+
     #[error(transparent)]
     ParseFeedError(#[from] feed_rs::parser::ParseFeedError),
 }
25 changes: 25 additions & 0 deletions crates/blend-feed/src/extract/html.rs
@@ -0,0 +1,25 @@
use ammonia::Builder;
use std::collections::HashSet;

const REMOVE_TAGS: [&str; 1] = ["article"];

// Sanitize HTML input, allowing only safe elements
pub fn extract_html(src: &str) -> String {
    Builder::default().rm_tags(HashSet::from(REMOVE_TAGS)).clean(src).to_string()
}

#[cfg(test)]
mod test {
    use super::*;

    #[test]
    fn it_keeps_only_safe_elements() {
        let src = r#"<article><p>Some body text that we <em>want</em> to keep.</p><p class="read-more">[<a href="https://example.com">Read More</a>]</p><script>alert("gotcha")</script><style>body { display: none }</style></article>"#;

        let parsed = extract_html(src);
        assert_eq!(
            parsed,
            r#"<p>Some body text that we <em>want</em> to keep.</p><p>[<a href="https://example.com" rel="noopener noreferrer">Read More</a>]</p>"#
        );
    }
}
10 changes: 10 additions & 0 deletions crates/blend-feed/src/extract/mod.rs
@@ -0,0 +1,10 @@
mod stylistic;
pub use stylistic::extract_stylistic_html;

mod text;
pub use text::extract_text;

mod html;
pub use html::extract_html;

// TODO: use `.url_relative(UrlRelative::RewriteWithBase(...))` with ammonia and pass in site URL to rewrite relative URLs
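The rewrite that TODO describes could look roughly like this sketch, using ammonia's `UrlRelative::RewriteWithBase` with the base taken from the feed's `url_site`; the function name is illustrative and nothing below is part of this commit:

```rust
use ammonia::{Builder, UrlRelative};
use std::collections::HashSet;
use url::Url;

// Illustrative only: sanitize as extract_html does, but also rewrite
// relative hrefs/srcs against the feed's site URL.
pub fn extract_html_with_base(src: &str, base: &str) -> Result<String, url::ParseError> {
    let base = Url::parse(base)?;
    Ok(Builder::default()
        .rm_tags(HashSet::from(["article"]))
        .url_relative(UrlRelative::RewriteWithBase(base))
        .clean(src)
        .to_string())
}
```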