-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathCompareBills.R
179 lines (151 loc) · 7.14 KB
/
CompareBills.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
################################################################################
# CompareBills.R: Translate two bills introduced in the US Senate or House, and
# available in pdf format at the US government web site
# https://www.govinfo.gov/app/details/, into a 'tidy' format with each para-
# graph of each bill in its own row, accompanied by its full outline tag.
# Extract keywords from each paragraph of each bill by listing all words and
# deleting the words found in a list of common words. Use the keywords to
# match each paragraph in the first bill with paragraphs of the second bill
# sharing a 'significant' number of keywords. Prepare a table showing the
# matched paragraphs and the common keywords between them; output this table
# to a .csv file which can be input to a spreadsheet program or other
# analysis tool. The constant values in the function areRelated() can be
# tweaked to tune the identification of related paragraphs, or this algorithm
# can be replaced with a better one.
#
# The design of this script makes no assumption that the two bills have any
# common origin, although it would be quite successful in identifying the
# text in common between two such bills (while also spewing a lot of false
# positives). The intended function is very different from the text comparison
# capabilities of word processors or source code 'diff' utilities, as well as
# from software for detecting plagiarism (turnitin).
#
# Copyright (c) 2021 Orebed Analytics LLC under MIT License; see LICENSE.txt.
#
# Data files produced by this software are released under Creative Commons
# license (see https://creativecommons.org/licenses/by/4.0/legalcode) to the
# extent that they are not already in the public domain.
library(tidyverse)
library(stringi)
library(pdftools)
library(tidytext)
library(tokenizers)
library(cleanNLP)
library(quanteda)
library(glue)
source("./digestBill.R", local = TRUE)
source("./billOutline.R", local = TRUE)
# Start-up banner: says what the script does, what it reads and what it
# writes.  All pieces are emitted by one cat() call; sep = "" keeps cat() from
# inserting its default space between them, so the text is printed verbatim.
cat(
  "CompareBills.R -- compare two bills of US federal legislation\n",
  "Input: name and filename of bills to be compared\n",
  "Output: .csv (tab-separated) text file showing all paragraphs of the \n",
  " first bill together with similar paragraphs of the second.\n",
  " The output file lends itself to being read into a spreadsheet\n",
  " program (MS Excel or equivalent) for formatting and further\n",
  " analysis.\n",
  "Input is read from file comparebills.txt (if it exists) or console.\n\n",
  sep = ""
)
# Collect the name and pdf filename of each of the two bills.
#
# The four answers (name 1, filename 1, name 2, filename 2 -- one per line)
# are read from comparebills.txt when that file exists.  Otherwise they are
# read from the console and then saved to comparebills.txt, so that a later
# run picks them up from the file.
bill1 <- list(name = "", filename = "")
bill2 <- list(name = "", filename = "")

# Decide once where the answers come from, so that opening the connection and
# the matching close/save further down can never disagree.
answersSaved <- file.exists("comparebills.txt")
if (answersSaved) {
  connex <- file("comparebills.txt", "rt")
} else {
  connex <- stdin()
}

# Show `prompt`, read one line from `connex` and return it.
# Stops when no line is available: readLines() then returns character(0),
# and file.exists(character(0)) is logical(0), which stopifnot() would accept
# silently and leave a broken bill description behind.
askFor <- function(prompt) {
  cat(prompt)
  answer <- readLines(connex, 1)
  if (length(answer) != 1) {
    stop("Input ended before all four answers were read.", call. = FALSE)
  }
  answer
}

# Stop with a readable message when a bill's pdf file does not exist.
requireBillFile <- function(path) {
  if (!file.exists(path)) {
    stop("Bill file not found: ", path, call. = FALSE)
  }
  invisible(path)
}

# The first file is checked before the second bill is asked for, so a typo is
# reported immediately.  `finally` closes the saved-answers file even when one
# of the checks stops the script; the console is never closed.
tryCatch({
  bill1$name <- askFor("First bill name (typically Axxx or Sxxx): ")
  bill1$filename <- askFor(
    "First bill filename (a pdf file, typically Axxxx.pdf etc.): ")
  requireBillFile(bill1$filename)
  bill2$name <- askFor("Second bill name (typically Axxx or Sxxx): ")
  bill2$filename <- askFor(
    "Second bill filename (a pdf file, typically Axxxx.pdf etc.): ")
  requireBillFile(bill2$filename)
}, finally = {
  if (answersSaved) {
    close(connex)
  }
})

# Answers typed at the console are saved for the next run.
if (!answersSaved) {
  connex <- file("comparebills.txt", "wt")
  writeLines(c(bill1$name, bill1$filename,
               bill2$name, bill2$filename),
             connex)
  close(connex)
}
# Digest one bill and collapse the result to one row per Item.
#
# bill   list with elements `filename` and `name`, as filled in above.
#
# For every Item the result keeps the first Outline_Tag (as Outline), the
# first and last Page_Line joined by " - " (as Page_Lines), all Text pieces
# glued together and trimmed, and the first Quoted value.  Runs of whitespace
# in Text are then squeezed to a single space and rows without an Outline are
# dropped.
# NOTE(review): the columns Item, Outline_Tag, Page_Line, Text and Quoted are
# assumed to be what digestBill() (sourced from digestBill.R) returns --
# confirm there.
summarizeBillByItem <- function(bill) {
  perItem <- digestBill(".", bill$filename, bill$name) %>%
    group_by(Item) %>%
    summarize(Outline = first(Outline_Tag),
              Page_Lines = str_c(first(Page_Line), " - ", last(Page_Line)),
              Text = trimws(glue_collapse(Text)),
              Quoted = first(Quoted))
  perItem %>%
    mutate(Text = str_replace_all(Text, "[[:space:]]+", " ")) %>%
    filter(!is.na(Outline))
}

# Both bills are digested first and only then handed to processOutline()
# (sourced from billOutline.R), in the same order as before.
bill1.df <- summarizeBillByItem(bill1)
bill2.df <- summarizeBillByItem(bill2)
bill1.df <- processOutline(bill1.df)
bill2.df <- processOutline(bill2.df)
# Decide whether two paragraphs share enough keywords to count as related.
#
# keys1, keys2   keyword vectors of the two paragraphs being compared.
# rexp           exponent applied to each keyword count before the counts are
#                summed and passed through log2(); larger values raise the
#                threshold for long paragraphs.  Default 2.2 (1.4 and 2.0
#                were earlier candidate values).
# maxThreshold   upper cap on the threshold.  Default 4.
#
# Returns a single TRUE/FALSE.  The threshold is the smallest of
#   floor(log2(length(keys1)^rexp + length(keys2)^rexp)),
#   length(keys1), length(keys2) and maxThreshold,
# and the paragraphs are related when the number of elements of keys1 that
# also occur in keys2 is strictly greater than it.  Because the threshold is
# never below min(length(keys1), length(keys2)) unless the log term or the
# cap is smaller, very short keyword lists cannot match with the defaults.
# An empty keyword vector on either side is never related.
areRelated <- function(keys1, keys2, rexp = 2.2, maxThreshold = 4) {
  n1 <- length(keys1)
  n2 <- length(keys2)
  # Guard first: this also avoids evaluating log2(0) when both are empty.
  if (n1 == 0 || n2 == 0) {
    return(FALSE)
  }
  threshold <- min(floor(log2(n1^rexp + n2^rexp)), n1, n2, maxThreshold)
  sum(keys1 %in% keys2) > threshold
}
# Compare every paragraph of the first bill with every paragraph of the
# second bill.
#
# relatedMatrix  logical matrix with one ROW per paragraph of the second bill
#                and one COLUMN per paragraph of the first bill; TRUE where
#                areRelated() accepts the pair.
# relatedList    one element per paragraph of the first bill: the row
#                positions in bill2.df of its related paragraphs (possibly
#                none).  Always a list.
# relatedPairs   tibble with one row per related pair, columns bill1Item and
#                bill2Item, ordered by first-bill paragraph and then by
#                second-bill paragraph.
#
# vapply() plus an explicit matrix() keeps the shape fixed: sapply()/mapply()
# would silently return a plain vector when the second bill has a single
# paragraph, or a matrix instead of a list when every paragraph happens to
# have the same number of matches.
nBill1 <- length(bill1.df$Keywords)
nBill2 <- length(bill2.df$Keywords)
relatedMatrix <- matrix(
  vapply(
    bill1.df$Keywords,
    function(x) {
      vapply(bill2.df$Keywords,
             function(y) isTRUE(areRelated(x, y)),
             logical(1),
             USE.NAMES = FALSE)
    },
    logical(nBill2),
    USE.NAMES = FALSE
  ),
  nrow = nBill2,
  ncol = nBill1
)
relatedList <- lapply(seq_len(nBill1),
                      function(k) which(relatedMatrix[, k]))

# which(arr.ind = TRUE) lists the TRUE cells column by column, i.e. grouped by
# first-bill paragraph; its first column is the row (second-bill position) and
# its second column is the column (first-bill position).
# The pairs store Item values rather than positions because the joins that
# consume relatedPairs key on Item.  The two coincide when Item simply numbers
# the rows 1..n -- presumably the usual case; confirm against
# processOutline().
relatedHits <- which(relatedMatrix, arr.ind = TRUE)
relatedPairs <- tibble(bill1Item = bill1.df$Item[relatedHits[, 2]],
                       bill2Item = bill2.df$Item[relatedHits[, 1]])
# Build the comparison table.
#
# Every paragraph of the first bill is kept (left joins): it appears once per
# related paragraph of the second bill, or once with NA second-bill columns
# when relatedPairs holds no match for it.  Columns present in both bills get
# the suffixes .x (first bill) and .y (second bill).
bill1_bill2 <- bill1.df %>%
  left_join(relatedPairs, by = c("Item" = "bill1Item")) %>%
  left_join(bill2.df, by = c("bill2Item" = "Item")) %>%
  # Per row: walk the first bill's keywords, keeping each one that also occurs
  # in the second bill's keywords and replacing the others by "".
  mutate(Common = mapply(function(x, y)
  { sapply(x, function(z)
  { if(z %in% y) { z } else { "" } }) }, Keywords.x, Keywords.y)) %>%
  # Join the names of the surviving entries with ", ".
  # NOTE(review): this relies on sapply() labelling its result -- with the
  # keyword strings themselves when Keywords.x is an unnamed character vector,
  # otherwise with whatever names the vector already carries.  Confirm which
  # applies against the code that builds Keywords.
  mutate(Common_Keywords = sapply(Common, function(x)
  { glue_collapse(names(x[which(x != "")]), sep = ", ") } )) %>%
  # Drop the intermediate per-keyword column and start Common afresh as NA.
  select(-Common) %>%
  mutate(Common = NA)

# Copy the joined keywords into Common, leaving NA where a row has none.
# Common_Keywords may be a list or a plain vector depending on how sapply()
# simplified it, hence the [[i]] access.  seq_along() (rather than
# seq(1, length(...))) keeps the loop from running with i = 1, 0 on a table
# that has no rows.
for (i in seq_along(bill1_bill2$Common)) {
  if (length(bill1_bill2$Common_Keywords[[i]]) > 0) {
    bill1_bill2$Common[i] <- bill1_bill2$Common_Keywords[[i]]
  } else {
    bill1_bill2$Common[i] <- NA
  }
}
# Return "" when a paragraph's text equals the text of the row above it,
# otherwise the text itself.  `previous` is NA for the first row.
blankRepeatedText <- function(current, previous) {
  if (!is.na(previous) && current == previous) {
    ""
  } else {
    current
  }
}

# Return "" for a missing value, otherwise the value itself.
blankIfMissing <- function(value) {
  if (is.na(value)) {
    ""
  } else {
    value
  }
}

# Reduce the comparison table to the five columns that are written out.  A
# first-bill paragraph matched several times occupies consecutive rows; its
# text is shown on the first of them only.  Finally every NA becomes "".
bill1_bill2_trimmed <- bill1_bill2 %>%
  select(Tag.x, Text.x, Common, Tag.y, Text.y) %>%
  mutate(Text.x = mapply(blankRepeatedText, Text.x, lag(Text.x))) %>%
  mutate_all(function(column) sapply(column, blankIfMissing))

# Relabel the columns for the reader, one step at a time because each step
# looks at the names produced by the one before it:
#   1. "<col>.x" becomes "<first bill name> <col>"
#   2. "<col>.y" becomes "<second bill name> <col>"
#   3. names ending in " Tag" have " Tag" replaced by " Outline"
outputNames <- colnames(bill1_bill2_trimmed)

fromBill1 <- str_ends(outputNames, fixed(".x"))
outputNames[fromBill1] <-
  str_c(bill1$name, " ", str_remove(outputNames[fromBill1], fixed(".x")))

fromBill2 <- str_ends(outputNames, fixed(".y"))
outputNames[fromBill2] <-
  str_c(bill2$name, " ", str_remove(outputNames[fromBill2], fixed(".y")))

isTagColumn <- str_ends(outputNames, fixed(" Tag"))
outputNames[isTagColumn] <-
  str_replace(outputNames[isTagColumn], fixed(" Tag"), " Outline")

colnames(bill1_bill2_trimmed) <- outputNames

# Tab-separated output named "<first bill name>_<second bill name>.csv".
write_delim(bill1_bill2_trimmed,
            str_c(bill1$name, "_", bill2$name, ".csv"),
            delim = "\t")