Use news parser #33

Merged: 11 commits, Dec 13, 2023
2 changes: 1 addition & 1 deletion config/config.exs
@@ -13,7 +13,7 @@ config :crawly,
log_to_file: true,

closespider_timeout: 1,
- concurrent_requests_per_domain: 4,
+ concurrent_requests_per_domain: 8,
middlewares: [
Crawly.Middlewares.DomainFilter,
Crawly.Middlewares.UniqueRequest,
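Note on the concurrency bump: raising concurrent_requests_per_domain from 4 to 8 doubles the number of in-flight requests Crawly keeps per domain, which speeds up the feed crawl but also hits each news site harder. A minimal sketch of the surrounding config for reference; only the keys shown in the diff come from this repo, and closespider_itemcount is an optional Crawly setting added purely for illustration:

    config :crawly,
      log_to_file: true,
      closespider_timeout: 1,            # spider shutdown threshold (see Crawly docs)
      concurrent_requests_per_domain: 8, # parallel requests kept open per domain
      closespider_itemcount: 500         # optional hard cap on scraped items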
17 changes: 11 additions & 6 deletions lib/crawlers/news/spider.ex
@@ -1,3 +1,5 @@
+ alias News.Article
+
defmodule News.Spider do
@moduledoc false
use Crawly.Spider
@@ -19,7 +21,7 @@ defmodule News.Spider do

@impl Crawly.Spider
def parse_item(%{request_url: @index} = response) do
Logger.info("Parsing the feed index #{response.request_url}...")
Logger.info("Parsing the feed index #{response.request_url}")

feed_urls =
response.body
@@ -30,12 +32,12 @@ defmodule News.Spider do
Crawly.Utils.build_absolute_url(url, response.request.url) |> Crawly.Utils.request_from_url()
end)

- %{items: [], requests: feed_urls}
+ %Crawly.ParsedItem{items: [], requests: feed_urls}
end


def parse_item(%{request_url: @feed_prefix <> _} = response) do
Logger.info("Parsing RSS feed #{response.request_url}...")
Logger.info("Parsing RSS feed #{response.request_url}")

article_urls =
response.body
@@ -44,13 +46,16 @@
|> Enum.map(&Floki.text/1)
|> Enum.map(&Crawly.Utils.request_from_url/1)

- %{items: [], requests: article_urls}
+ %Crawly.ParsedItem{items: [], requests: article_urls}
end


def parse_item(%{request_url: @article_prefix <> _} = response) do
Logger.info("Parsing article #{response.request_url}...")
url = response.request_url
Logger.info("Parsing article #{url}")

article = Article.parse_from_html(response.body, url)

%{items: [%{url: response.request_url}], requests: []}
%Crawly.ParsedItem{items: [%{url: url, article: article}], requests: []}
end
end
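Switching the parse_item returns from bare maps to %Crawly.ParsedItem{} structs means a misspelled :items or :requests key now fails at compile time instead of being silently dropped. A minimal sketch of the callback shape this PR settles on, built only from functions that already appear in the diff (the link-collection selector is illustrative):

    @impl Crawly.Spider
    def parse_item(response) do
      {:ok, document} = Floki.parse_document(response.body)

      requests =
        document
        |> Floki.attribute("a", "href")
        |> Enum.map(&Crawly.Utils.build_absolute_url(&1, response.request_url))
        |> Enum.map(&Crawly.Utils.request_from_url/1)

      # Struct keys are checked at compile time, unlike a bare map.
      %Crawly.ParsedItem{items: [%{url: response.request_url}], requests: requests}
    end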
21 changes: 19 additions & 2 deletions lib/news/article.ex
@@ -56,7 +56,16 @@ defmodule News.Article
end


- def parse_from_html(html, uri) do
+ @spec parse_from_html(binary, URI.t | binary) :: %News.Article{
+         citations: list(),
+         date_modified: nil | Date.t(),
+         description: binary,
+         source_name: binary,
+         source_url: <<_::24, _::_*8>>,
+         title: binary
+       }
+ def parse_from_html(html, url) when is_binary(html) and is_binary(url), do: parse_from_html(html, URI.parse(url))
+ def parse_from_html(html, %URI{} = uri) when is_binary(html) do
{:ok, document} = Floki.parse_document(html)

cites = find_citations_in_html(document)
@@ -77,6 +86,7 @@
end


+ @spec find_citations_in_html(Floki.html_tree) :: [binary]
def find_citations_in_html(document) do
cites_from_hrefs =
document
@@ -95,6 +105,12 @@
|> map(fn m -> "C.R.S. #{m}" end)
|> flatten()

+ crs_cites_from_text_3 =
+   Regex.scan(~r/Colo. Rev. Stat. § (\d+-\d+-\d+(?:\.\d+)?)/, html)
+   |> map(&last/1)
+   |> map(fn m -> "C.R.S. #{m}" end)
+   |> flatten()

tx_cites_from_text =
Regex.scan(~r/(Texas \w+ Code Section [\d\w.]+)/, html)
|> flatten()
@@ -103,12 +119,13 @@
|> map(fn m -> String.replace(m, "Transportation ", "Transp. ") end)


- (cites_from_hrefs ++ crs_cites_from_text_1 ++ crs_cites_from_text_2 ++ tx_cites_from_text)
+ (cites_from_hrefs ++ crs_cites_from_text_1 ++ crs_cites_from_text_2 ++ crs_cites_from_text_3 ++ tx_cites_from_text)
|> filter(&is_binary/1)
|> cleanup_list()
end


+ @spec hrefs(Floki.html_tree) :: [binary]
def hrefs(document) do
document
|> Floki.attribute("a", "href")
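The new crs_cites_from_text_3 clause covers the spelled-out "Colo. Rev. Stat. §" form that the two existing C.R.S. patterns miss. A quick sketch of what the scan-and-rename pipeline should produce for the snippet used in the tests (expected output, not captured from a run):

    Regex.scan(~r/Colo. Rev. Stat. § (\d+-\d+-\d+(?:\.\d+)?)/, "under Colo. Rev. Stat. § 24-34-402.7 and")
    #=> [["Colo. Rev. Stat. § 24-34-402.7", "24-34-402.7"]]
    # |> map(&last/1) |> map(&"C.R.S. #{&1}") then yields ["C.R.S. 24-34-402.7"]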
59 changes: 53 additions & 6 deletions lib/news/date_modified.ex
@@ -35,11 +35,24 @@ defmodule News.DateModified do

@spec parse_from_meta_tags(Floki.html_tree) :: Date.t | nil
def parse_from_meta_tags(document) do
- document
- |> Floki.find("meta[property='article:published_time']")
- |> Floki.attribute("content")
- |> List.first()
- |> parse_date_text()
+ meta_date =
+   document
+   |> Floki.find("meta[property='article:published_time']")
+   |> Floki.attribute("content")
+   |> List.first()
+   |> parse_date_text()
+
+ date = if is_nil(meta_date) do
+   document
+   |> Floki.find("time")
+   |> List.first()
+   |> Floki.text()
+   |> parse_date_text()
+ else
+   meta_date
+ end
+
+ date
end


@@ -50,9 +63,43 @@
{:ok, date} <- Date.from_iso8601(match) do
date
else
- _ -> nil
+ _ -> parse_human_date_string(a_string)
end
end

def parse_date_text(_), do: nil

+ @months ~w(January February March April May June July August September October November December)
+
+ @doc """
+ Parses a date string in the format of "January 1, 2020" into a Date.t.
+
+ Examples:
+
+     iex> DateModified.parse_human_date_string("May 26, 1997")
+     ~D[1997-05-26]
+
+     iex> DateModified.parse_human_date_string("January 1, 2020")
+     ~D[2020-01-01]
+
+     iex> DateModified.parse_human_date_string("")
+     nil
+
+     iex> DateModified.parse_human_date_string("January 1")
+     nil
+
+ """
+ @spec parse_human_date_string(binary) :: nil | Date.t
+ def parse_human_date_string(text) when is_binary(text) do
+   with [_, raw_month, raw_day, year] <- Regex.run(~r/^(.+) (.+), (\d\d\d\d)$/, text),
+        day <- String.pad_leading(raw_day, 2, "0"),
+        month_num <- Integer.to_string(Enum.find_index(@months, &(&1 == raw_month)) + 1),
+        month <- String.pad_leading(month_num, 2, "0"),
+        {:ok, date} <- Date.from_iso8601("#{year}-#{month}-#{day}") do
+     date
+   else
+     _ -> nil
+   end
+ end

end
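With this change, parse_date_text/1 falls through to parse_human_date_string/1, so both ISO timestamps and human-readable dates resolve via one entry point. Expected behavior, assuming the elided regex in parse_date_text extracts the leading YYYY-MM-DD as the test cases suggest (sketch, not captured from a run):

    News.DateModified.parse_date_text("2023-09-20T16:20:21-05:00")
    #=> ~D[2023-09-20]
    News.DateModified.parse_date_text("December 5, 2023")
    #=> ~D[2023-12-05]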
4 changes: 2 additions & 2 deletions mix.exs
@@ -28,8 +28,8 @@ defmodule Crawlers.MixProject do
[
# Crawly with deps that it uses.
{:crawly, github: "dogweather/crawly", branch: "error-level-messages"},
{:floki, "~> 0.33.0"},
{:logger_file_backend, "~> 0.0.11"},
{:floki, "~> 0.33.0"},
{:logger_file_backend, github: "dbii/logger_file_backend", branch: "warn-fix"},

{:credo, "> 0.0.0"},
{:dialyxir, "> 1.0.0", runtime: false},
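Swapping the hex package for a git fork changes how the dependency resolves: mix pins the fork to an exact commit in mix.lock, which is visible in the next file's hunk. After editing mix.exs, the lock entry is refreshed with:

    mix deps.get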
2 changes: 1 addition & 1 deletion mix.lock
@@ -22,7 +22,7 @@
"httpoison": {:hex, :httpoison, "1.8.2", "9eb9c63ae289296a544842ef816a85d881d4a31f518a0fec089aaa744beae290", [:mix], [{:hackney, "~> 1.17", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm", "2bb350d26972e30c96e2ca74a1aaf8293d61d0742ff17f01e0279fef11599921"},
"idna": {:hex, :idna, "6.1.1", "8a63070e9f7d0c62eb9d9fcb360a7de382448200fbbd1b106cc96d3d8099df8d", [:rebar3], [{:unicode_util_compat, "~> 0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "92376eb7894412ed19ac475e4a86f7b413c1b9fbb5bd16dccd57934157944cea"},
"jason": {:hex, :jason, "1.4.1", "af1504e35f629ddcdd6addb3513c3853991f694921b1b9368b0bd32beb9f1b63", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "fbb01ecdfd565b56261302f7e1fcc27c4fb8f32d56eab74db621fc154604a7a1"},
"logger_file_backend": {:hex, :logger_file_backend, "0.0.13", "df07b14970e9ac1f57362985d76e6f24e3e1ab05c248055b7d223976881977c2", [:mix], [], "hexpm", "71a453a7e6e899ae4549fb147b1c6621f4233f8f48f58ca10a64ec67b6c50018"},
"logger_file_backend": {:git, "https://github.com/dbii/logger_file_backend.git", "dfd91a414d0971cbeaea159ed0d69012b7ca7abe", [branch: "warn-fix"]},
"metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], [], "hexpm", "69b09adddc4f74a40716ae54d140f93beb0fb8978d8636eaded0c31b6f099f16"},
"mime": {:hex, :mime, "2.0.5", "dc34c8efd439abe6ae0343edbb8556f4d63f178594894720607772a041b04b02", [:mix], [], "hexpm", "da0d64a365c45bc9935cc5c8a7fc5e49a0e0f9932a761c55d6c52b142780a05c"},
"mimerl": {:hex, :mimerl, "1.2.0", "67e2d3f571088d5cfd3e550c383094b47159f3eee8ffa08e64106cdf5e981be3", [:rebar3], [], "hexpm", "f278585650aa581986264638ebf698f8bb19df297f66ad91b18910dfc6e19323"},
19 changes: 17 additions & 2 deletions test/news/article_test.exs
@@ -6,7 +6,7 @@ defmodule News.ArticleTest do
use ExUnit.Case
doctest News.Article

- @test_cases [
+ @file_test_cases [
%{file: "qandasec5.asp", cites: ["CA Educ Code Section 47605", "CA Educ Code Section 47605.6"]},
%{file: "qandasec6.asp", cites: ["CA Educ Code Section 47605"]},
%{file: "Formal Marriage License.html", cites: ["Tex. Fam. Code Section 2.003", "Tex. Fam. Code Section 2.013", "Tex. Fam. Code Section 2.203"]},
@@ -18,12 +18,27 @@
%{file: "colorado-knife-laws.html", cites: ["C.R.S. 18-12-101", "C.R.S. 18-12-102", "C.R.S. 18-12-105", "C.R.S. 18-12-105.5"]}
]

- Enum.each(@test_cases, fn %{file: f, cites: c} ->
+ Enum.each(@file_test_cases, fn %{file: f, cites: c} ->
test "finds the cites in #{f}" do
document = unquote(f) |> Test.fixture_html!
cites = unquote(c)

assert Article.find_citations_in_html(document) == cites
end
end)


+ @snippet_test_cases [
+   %{html: "<html></html>", cites: []},
+   %{html: "<html><p>under Colo. Rev. Stat. § 24-34-402.7 and</p></html>", cites: ["C.R.S. 24-34-402.7"]},
+ ]
+
+ Enum.each(@snippet_test_cases, fn %{html: html, cites: cites} ->
+   test "finds the cites in #{html}" do
+     html = unquote(html) |> Floki.parse_document!
+     cites = unquote(cites)
+
+     assert Article.find_citations_in_html(html) == cites
+   end
+ end)
end
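Because each Enum.each here runs at compile time, every map in @file_test_cases and @snippet_test_cases becomes its own named ExUnit test, with unquote/1 splicing the literal values into the test body. The second snippet case expands to roughly:

    test "finds the cites in <html><p>under Colo. Rev. Stat. § 24-34-402.7 and</p></html>" do
      html = "<html><p>under Colo. Rev. Stat. § 24-34-402.7 and</p></html>" |> Floki.parse_document!
      cites = ["C.R.S. 24-34-402.7"]

      assert Article.find_citations_in_html(html) == cites
    end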
4 changes: 3 additions & 1 deletion test/news/date_modified_test.exs
@@ -15,7 +15,9 @@
%{date: "2020-01-01", html: "<html><script type='application/ld+json'>{\"dateModified\": \"2020-01-01\"}</script></html>"},
%{date: "2023-08-25", html: "<html><script type='application/ld+json'>{\"dateModified\": \"Fri, 2023-08-25 21:16:28\"}</script></html>"},
%{date: "2023-09-20", html: "<html><script type='application/ld+json'>{\"dateModified\": \"2023-09-20T16:20:21-05:00\"}</script></html>"},
%{date: "2020-05-19", html: "<html><head><meta property=\"article:published_time\" content=\"2020-05-19T16:20:34+00:00\"></head></html>"}
%{date: "2020-05-19", html: "<html><head><meta property=\"article:published_time\" content=\"2020-05-19T16:20:34+00:00\"></head></html>"},
# JD Supra
%{date: "2023-12-05", html: "<html><div class=\"tc-ns f7 silver mv2\"><time>December 5, 2023</time></div></html>"},
]

# Create and run a test for each of the @test_cases
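The JD Supra case exercises the new <time> fallback in parse_from_meta_tags/1: the fixture has no article:published_time meta tag, so the lookup drops through to the first <time> element and then to parse_human_date_string/1. Expected behavior (sketch, not captured from a run):

    document = Floki.parse_document!("<html><time>December 5, 2023</time></html>")
    News.DateModified.parse_from_meta_tags(document)
    #=> ~D[2023-12-05]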