Use news parser #33

Merged: 11 commits, Dec 13, 2023
2 changes: 1 addition & 1 deletion config/config.exs
@@ -13,7 +13,7 @@ config :crawly,
log_to_file: true,

closespider_timeout: 1,
- concurrent_requests_per_domain: 4,
+ concurrent_requests_per_domain: 8,
middlewares: [
Crawly.Middlewares.DomainFilter,
Crawly.Middlewares.UniqueRequest,
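Note on the concurrency bump: raising concurrent_requests_per_domain from 4 to 8 doubles the number of in-flight requests Crawly keeps per domain, which speeds up the feed crawl but also hits each news site harder. A minimal sketch of the surrounding config for reference; only the keys shown in the diff come from this repo, and closespider_itemcount is an optional Crawly setting added purely for illustration:

    config :crawly,
      log_to_file: true,
      closespider_timeout: 1,            # spider shutdown threshold (see Crawly docs)
      concurrent_requests_per_domain: 8, # parallel requests kept open per domain
      closespider_itemcount: 500         # optional hard cap on scraped items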
17 changes: 11 additions & 6 deletions lib/crawlers/news/spider.ex
@@ -1,3 +1,5 @@
+ alias News.Article
+
defmodule News.Spider do
@moduledoc false
use Crawly.Spider
@@ -19,7 +21,7 @@ defmodule News.Spider do

@impl Crawly.Spider
def parse_item(%{request_url: @index} = response) do
Logger.info("Parsing the feed index #{response.request_url}...")
Logger.info("Parsing the feed index #{response.request_url}")

feed_urls =
response.body
@@ -30,12 +32,12 @@ defmodule News.Spider do
Crawly.Utils.build_absolute_url(url, response.request.url) |> Crawly.Utils.request_from_url()
end)

- %{items: [], requests: feed_urls}
+ %Crawly.ParsedItem{items: [], requests: feed_urls}
end


def parse_item(%{request_url: @feed_prefix <> _} = response) do
Logger.info("Parsing RSS feed #{response.request_url}...")
Logger.info("Parsing RSS feed #{response.request_url}")

article_urls =
response.body
@@ -44,13 +46,16 @@
|> Enum.map(&Floki.text/1)
|> Enum.map(&Crawly.Utils.request_from_url/1)

- %{items: [], requests: article_urls}
+ %Crawly.ParsedItem{items: [], requests: article_urls}
end


def parse_item(%{request_url: @article_prefix <> _} = response) do
Logger.info("Parsing article #{response.request_url}...")
url = response.request_url
Logger.info("Parsing article #{url}")

article = Article.parse_from_html(response.body, url)

%{items: [%{url: response.request_url}], requests: []}
%Crawly.ParsedItem{items: [%{url: url, article: article}], requests: []}
end
end
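Switching the parse_item returns from bare maps to %Crawly.ParsedItem{} structs means a misspelled :items or :requests key now fails at compile time instead of being silently dropped. A minimal sketch of the callback shape this PR settles on, built only from functions that already appear in the diff (the link-collection selector is illustrative):

    @impl Crawly.Spider
    def parse_item(response) do
      {:ok, document} = Floki.parse_document(response.body)

      requests =
        document
        |> Floki.attribute("a", "href")
        |> Enum.map(&Crawly.Utils.build_absolute_url(&1, response.request_url))
        |> Enum.map(&Crawly.Utils.request_from_url/1)

      # Struct keys are checked at compile time, unlike a bare map.
      %Crawly.ParsedItem{items: [%{url: response.request_url}], requests: requests}
    end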
21 changes: 19 additions & 2 deletions lib/news/article.ex
@@ -56,7 +56,16 @@ defmodule News.Article
end


- def parse_from_html(html, uri) do
+ @spec parse_from_html(binary, URI.t | binary) :: %News.Article{
+         citations: list(),
+         date_modified: nil | Date.t(),
+         description: binary,
+         source_name: binary,
+         source_url: <<_::24, _::_*8>>,
+         title: binary
+       }
+ def parse_from_html(html, url) when is_binary(html) and is_binary(url), do: parse_from_html(html, URI.parse(url))
+ def parse_from_html(html, %URI{} = uri) when is_binary(html) do
{:ok, document} = Floki.parse_document(html)

cites = find_citations_in_html(document)
@@ -77,6 +86,7 @@
end


+ @spec find_citations_in_html(Floki.html_tree) :: [binary]
def find_citations_in_html(document) do
cites_from_hrefs =
document
@@ -95,6 +105,12 @@
|> map(fn m -> "C.R.S. #{m}" end)
|> flatten()

+ crs_cites_from_text_3 =
+   Regex.scan(~r/Colo. Rev. Stat. § (\d+-\d+-\d+(?:\.\d+)?)/, html)
+   |> map(&last/1)
+   |> map(fn m -> "C.R.S. #{m}" end)
+   |> flatten()

tx_cites_from_text =
Regex.scan(~r/(Texas \w+ Code Section [\d\w.]+)/, html)
|> flatten()
@@ -103,12 +119,13 @@
|> map(fn m -> String.replace(m, "Transportation ", "Transp. ") end)


- (cites_from_hrefs ++ crs_cites_from_text_1 ++ crs_cites_from_text_2 ++ tx_cites_from_text)
+ (cites_from_hrefs ++ crs_cites_from_text_1 ++ crs_cites_from_text_2 ++ crs_cites_from_text_3 ++ tx_cites_from_text)
|> filter(&is_binary/1)
|> cleanup_list()
end


+ @spec hrefs(Floki.html_tree) :: [binary]
def hrefs(document) do
document
|> Floki.attribute("a", "href")
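The new crs_cites_from_text_3 clause covers the spelled-out "Colo. Rev. Stat. §" form that the two existing C.R.S. patterns miss. A quick sketch of what the scan-and-rename pipeline should produce for the snippet used in the tests (expected output, not captured from a run):

    Regex.scan(~r/Colo. Rev. Stat. § (\d+-\d+-\d+(?:\.\d+)?)/, "under Colo. Rev. Stat. § 24-34-402.7 and")
    #=> [["Colo. Rev. Stat. § 24-34-402.7", "24-34-402.7"]]
    # |> map(&last/1) |> map(&"C.R.S. #{&1}") then yields ["C.R.S. 24-34-402.7"]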
59 changes: 53 additions & 6 deletions lib/news/date_modified.ex
@@ -35,11 +35,24 @@ defmodule News.DateModified do

@spec parse_from_meta_tags(Floki.html_tree) :: Date.t | nil
def parse_from_meta_tags(document) do
- document
- |> Floki.find("meta[property='article:published_time']")
- |> Floki.attribute("content")
- |> List.first()
- |> parse_date_text()
+ meta_date =
+   document
+   |> Floki.find("meta[property='article:published_time']")
+   |> Floki.attribute("content")
+   |> List.first()
+   |> parse_date_text()
+
+ date = if is_nil(meta_date) do
+   document
+   |> Floki.find("time")
+   |> List.first()
+   |> Floki.text()
+   |> parse_date_text()
+ else
+   meta_date
+ end
+
+ date
end


@@ -50,9 +63,43 @@
{:ok, date} <- Date.from_iso8601(match) do
date
else
- _ -> nil
+ _ -> parse_human_date_string(a_string)
end
end

def parse_date_text(_), do: nil

+ @months ~w(January February March April May June July August September October November December)
+
+ @doc """
+ Parses a date string in the format of "January 1, 2020" into a Date.t.
+
+ Examples:
+
+     iex> DateModified.parse_human_date_string("May 26, 1997")
+     ~D[1997-05-26]
+
+     iex> DateModified.parse_human_date_string("January 1, 2020")
+     ~D[2020-01-01]
+
+     iex> DateModified.parse_human_date_string("")
+     nil
+
+     iex> DateModified.parse_human_date_string("January 1")
+     nil
+
+ """
+ @spec parse_human_date_string(binary) :: nil | Date.t
+ def parse_human_date_string(text) when is_binary(text) do
+   with [_, raw_month, raw_day, year] <- Regex.run(~r/^(.+) (.+), (\d\d\d\d)$/, text),
+        day <- String.pad_leading(raw_day, 2, "0"),
+        month_num <- Integer.to_string(Enum.find_index(@months, &(&1 == raw_month)) + 1),
+        month <- String.pad_leading(month_num, 2, "0"),
+        {:ok, date} <- Date.from_iso8601("#{year}-#{month}-#{day}") do
+     date
+   else
+     _ -> nil
+   end
+ end

end
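With this change, parse_date_text/1 falls through to parse_human_date_string/1, so both ISO timestamps and human-readable dates resolve via one entry point. Expected behavior, assuming the elided regex in parse_date_text extracts the leading YYYY-MM-DD as the test cases suggest (sketch, not captured from a run):

    News.DateModified.parse_date_text("2023-09-20T16:20:21-05:00")
    #=> ~D[2023-09-20]
    News.DateModified.parse_date_text("December 5, 2023")
    #=> ~D[2023-12-05]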
4 changes: 2 additions & 2 deletions mix.exs
@@ -28,8 +28,8 @@ defmodule Crawlers.MixProject do
[
# Crawly with deps that it uses.
{:crawly, github: "dogweather/crawly", branch: "error-level-messages"},
{:floki, "~> 0.33.0"},
{:logger_file_backend, "~> 0.0.11"},
{:floki, "~> 0.33.0"},
{:logger_file_backend, github: "dbii/logger_file_backend", branch: "warn-fix"},

{:credo, "> 0.0.0"},
{:dialyxir, "> 1.0.0", runtime: false},
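Swapping the hex package for a git fork changes how the dependency resolves: mix pins the fork to an exact commit in mix.lock, which is visible in the next file's hunk. After editing mix.exs, the lock entry is refreshed with:

    mix deps.get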
2 changes: 1 addition & 1 deletion mix.lock
@@ -22,7 +22,7 @@
"httpoison": {:hex, :httpoison, "1.8.2", "9eb9c63ae289296a544842ef816a85d881d4a31f518a0fec089aaa744beae290", [:mix], [{:hackney, "~> 1.17", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm", "2bb350d26972e30c96e2ca74a1aaf8293d61d0742ff17f01e0279fef11599921"},
"idna": {:hex, :idna, "6.1.1", "8a63070e9f7d0c62eb9d9fcb360a7de382448200fbbd1b106cc96d3d8099df8d", [:rebar3], [{:unicode_util_compat, "~> 0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "92376eb7894412ed19ac475e4a86f7b413c1b9fbb5bd16dccd57934157944cea"},
"jason": {:hex, :jason, "1.4.1", "af1504e35f629ddcdd6addb3513c3853991f694921b1b9368b0bd32beb9f1b63", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "fbb01ecdfd565b56261302f7e1fcc27c4fb8f32d56eab74db621fc154604a7a1"},
"logger_file_backend": {:hex, :logger_file_backend, "0.0.13", "df07b14970e9ac1f57362985d76e6f24e3e1ab05c248055b7d223976881977c2", [:mix], [], "hexpm", "71a453a7e6e899ae4549fb147b1c6621f4233f8f48f58ca10a64ec67b6c50018"},
"logger_file_backend": {:git, "https://github.com/dbii/logger_file_backend.git", "dfd91a414d0971cbeaea159ed0d69012b7ca7abe", [branch: "warn-fix"]},
"metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], [], "hexpm", "69b09adddc4f74a40716ae54d140f93beb0fb8978d8636eaded0c31b6f099f16"},
"mime": {:hex, :mime, "2.0.5", "dc34c8efd439abe6ae0343edbb8556f4d63f178594894720607772a041b04b02", [:mix], [], "hexpm", "da0d64a365c45bc9935cc5c8a7fc5e49a0e0f9932a761c55d6c52b142780a05c"},
"mimerl": {:hex, :mimerl, "1.2.0", "67e2d3f571088d5cfd3e550c383094b47159f3eee8ffa08e64106cdf5e981be3", [:rebar3], [], "hexpm", "f278585650aa581986264638ebf698f8bb19df297f66ad91b18910dfc6e19323"},
19 changes: 17 additions & 2 deletions test/news/article_test.exs
@@ -6,7 +6,7 @@ defmodule News.ArticleTest do
use ExUnit.Case
doctest News.Article

- @test_cases [
+ @file_test_cases [
%{file: "qandasec5.asp", cites: ["CA Educ Code Section 47605", "CA Educ Code Section 47605.6"]},
%{file: "qandasec6.asp", cites: ["CA Educ Code Section 47605"]},
%{file: "Formal Marriage License.html", cites: ["Tex. Fam. Code Section 2.003", "Tex. Fam. Code Section 2.013", "Tex. Fam. Code Section 2.203"]},
@@ -18,12 +18,27 @@
%{file: "colorado-knife-laws.html", cites: ["C.R.S. 18-12-101", "C.R.S. 18-12-102", "C.R.S. 18-12-105", "C.R.S. 18-12-105.5"]}
]

- Enum.each(@test_cases, fn %{file: f, cites: c} ->
+ Enum.each(@file_test_cases, fn %{file: f, cites: c} ->
test "finds the cites in #{f}" do
document = unquote(f) |> Test.fixture_html!
cites = unquote(c)

assert Article.find_citations_in_html(document) == cites
end
end)


+ @snippet_test_cases [
+   %{html: "<html></html>", cites: []},
+   %{html: "<html><p>under Colo. Rev. Stat. § 24-34-402.7 and</p></html>", cites: ["C.R.S. 24-34-402.7"]},
+ ]
+
+ Enum.each(@snippet_test_cases, fn %{html: html, cites: cites} ->
+   test "finds the cites in #{html}" do
+     html = unquote(html) |> Floki.parse_document!
+     cites = unquote(cites)
+
+     assert Article.find_citations_in_html(html) == cites
+   end
+ end)
end
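Because each Enum.each here runs at compile time, every map in @file_test_cases and @snippet_test_cases becomes its own named ExUnit test, with unquote/1 splicing the literal values into the test body. The second snippet case expands to roughly:

    test "finds the cites in <html><p>under Colo. Rev. Stat. § 24-34-402.7 and</p></html>" do
      html = "<html><p>under Colo. Rev. Stat. § 24-34-402.7 and</p></html>" |> Floki.parse_document!
      cites = ["C.R.S. 24-34-402.7"]

      assert Article.find_citations_in_html(html) == cites
    end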
4 changes: 3 additions & 1 deletion test/news/date_modified_test.exs
@@ -15,7 +15,9 @@
%{date: "2020-01-01", html: "<html><script type='application/ld+json'>{\"dateModified\": \"2020-01-01\"}</script></html>"},
%{date: "2023-08-25", html: "<html><script type='application/ld+json'>{\"dateModified\": \"Fri, 2023-08-25 21:16:28\"}</script></html>"},
%{date: "2023-09-20", html: "<html><script type='application/ld+json'>{\"dateModified\": \"2023-09-20T16:20:21-05:00\"}</script></html>"},
%{date: "2020-05-19", html: "<html><head><meta property=\"article:published_time\" content=\"2020-05-19T16:20:34+00:00\"></head></html>"}
%{date: "2020-05-19", html: "<html><head><meta property=\"article:published_time\" content=\"2020-05-19T16:20:34+00:00\"></head></html>"},
# JD Supra
%{date: "2023-12-05", html: "<html><div class=\"tc-ns f7 silver mv2\"><time>December 5, 2023</time></div></html>"},
]

# Create and run a test for each of the @test_cases
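The JD Supra case exercises the new <time> fallback in parse_from_meta_tags/1: the fixture has no article:published_time meta tag, so the lookup drops through to the first <time> element and then to parse_human_date_string/1. Expected behavior (sketch, not captured from a run):

    document = Floki.parse_document!("<html><time>December 5, 2023</time></html>")
    News.DateModified.parse_from_meta_tags(document)
    #=> ~D[2023-12-05]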