Skip to content

Commit 9d4ed54

Browse files
committed
Adding an Rscript to create and verify checksums
1 parent d5cb6d9 commit 9d4ed54

File tree

2 files changed

+102
-8
lines changed

2 files changed

+102
-8
lines changed

checksums.R

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
# Reference: the equivalent download-and-verify checksum step as written in Stata
2+
3+
# if $redownload == 1 {
4+
# copy "https://datahub.io/core/country-codes/r/country-codes.csv" "data/raw/country-codes.csv", replace
5+
# // create checksum of file
6+
# // Aug 2023 version: 2295658388
7+
# global countrycksum 2295658388
8+
# checksum "data/raw/country-codes.csv", save
9+
# assert $countrycksum == r(checksum)
10+
# // This will fail if the files are not identical
11+
# // Provide a verbose message if we get past this point
12+
# disp in green "Country codes file downloaded successfully"
13+
# }
14+
15+
library(dplyr)
16+
library(openssl)
17+
library(tools)
18+
19+
#' Compute the SHA-256 digest of a file as a hexadecimal string.
#'
#' @param filepath Path of the file to hash (a single string).
#' @return A length-one plain character vector holding the hex digest, or
#'   `NA_character_` (after a warning) when the file does not exist or cannot
#'   be hashed.
calculate_sha256 <- function(filepath) {
  # Guard up front: creating a connection for a missing path and then failing
  # to open it would leave an unused connection behind, and the caller would
  # get NA with no hint as to why.
  if (length(filepath) != 1L || is.na(filepath) || !file.exists(filepath)) {
    warning(
      "Cannot hash '", toString(filepath), "': file does not exist",
      call. = FALSE
    )
    return(NA_character_)
  }
  tryCatch({
    hash <- sha256(file(filepath))
    # Collapse the raw digest bytes into one hex string. Dropping the class
    # first yields a plain character value, the same type as the
    # NA_character_ returned on failure.
    paste(as.character(unclass(hash)), collapse = "")
  }, error = function(e) {
    # Keep the best-effort contract (NA on failure) but say what went wrong.
    warning(
      "Failed to hash '", filepath, "': ", conditionMessage(e),
      call. = FALSE
    )
    NA_character_
  })
}
29+
30+
#' Check a file against its recorded SHA-256 digest.
#'
#' @param filepath Path of the file to check.
#' @param expected_hash The recorded hex digest to compare against.
#' @return `TRUE` when the file's digest matches `expected_hash`; `FALSE`
#'   otherwise, including when either digest is missing. Never `NA`.
verify_checksum <- function(filepath, expected_hash) {
  # A missing recorded hash can never verify; without this guard the `==`
  # below would return NA and poison downstream sums.
  if (length(expected_hash) != 1L || is.na(expected_hash)) {
    return(FALSE)
  }
  calculated_hash <- calculate_sha256(filepath)
  if (is.na(calculated_hash)) {
    return(FALSE)
  }
  # Hex digests are case-insensitive; also ignore stray whitespace picked up
  # from hand-edited metadata.
  normalise <- function(x) tolower(trimws(as.character(x)))
  isTRUE(normalise(calculated_hash) == normalise(expected_hash))
}
38+
39+
# Run-control flags ----

reprocess <- TRUE  # gates the (placeholder) reprocessing step at the end
generate <- TRUE   # gates regeneration of the checksum metadata

# File paths ----
# Generalized; these could live in an externally sourced file that is treated
# as confidential.
# The data.path.<suffix> names are looked up dynamically via
# get(paste0("data.path.", path)) when verifying, so the suffixes must match
# the values used in the metadata `path` column.

data.path <- "data/"
data.path.external <- file.path(data.path, "external")
data.path.registry <- file.path(data.path, "registry")
data.path.metadata <- file.path(data.path, "metadata")
50+
51+
52+
# Regenerate the checksum metadata: hash every file listed in consistency.csv
# and write the digests back to the same file.
if (generate) {
  metadata <- read.csv(file.path(data.path.metadata, "consistency.csv"))
  # Add SHA256 hash column
  metadata <- metadata %>%
    rowwise() %>%
    mutate(
      # `path` holds a suffix such as "external"; resolve it to the matching
      # data.path.<suffix> variable, the same way the verification step does.
      # A bare get(path) would look for a variable literally named "external".
      sha256sum = calculate_sha256(
        file.path(get(paste0("data.path.", path)), filename)
      ),
      asofdate = date()
    ) %>%
    ungroup() %>%
    select(filename, path, sha256sum, asofdate)
  # NOTE(review): if the metadata lists consistency.csv itself, its digest is
  # computed here before the file is rewritten below, so that row cannot match
  # when the file is verified afterwards.
  # Write the updated metadata back to a CSV file
  write.csv(
    metadata,
    file.path(data.path.metadata, "consistency.csv"),
    row.names = FALSE
  )
}
64+
65+
# Check the checksums before proceeding ----

# Verify every file listed in the metadata against its recorded digest.
metadata <- read.csv(file.path(data.path.metadata, "consistency.csv"))
metadata <- metadata %>%
  rowwise() %>%
  mutate(
    # isTRUE() pins the result to TRUE/FALSE so a missing recorded hash
    # (NA comparison) counts as a failure instead of turning the sums into NA.
    checksum_verified = isTRUE(
      verify_checksum(
        file.path(get(paste0("data.path.", path)), filename),
        sha256sum
      )
    )
  ) %>%
  ungroup()

# Display per-file results
message("Verification results for files")
print(metadata %>% select(filename, sha256sum, checksum_verified))

# Count of verified and failed checksums
summary <- metadata %>%
  summarise(
    total_files = n(),
    verified = sum(checksum_verified),
    failed = sum(!checksum_verified)
  )
print(summary)

# Signal failures explicitly rather than relying on someone reading the
# printed table.
failed_files <- metadata$filename[!metadata$checksum_verified]
if (length(failed_files) > 0) {
  warning(
    "Checksum verification failed for: ",
    paste(failed_files, collapse = ", "),
    call. = FALSE
  )
}

if (reprocess) {
  # Do stuff here
}
92+
93+
94+

data/metadata/consistency.csv

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1-
filename,path,checksum,date
2-
anes_timeseries_2020_stata_20220210_hill_roberts_subset.csv,external,,2024-09-11
3-
China2021Data.csv,external,,2024-09-11
4-
LucidPlotText.csv,external,,2024-09-11
5-
LucidWideFile.csv,external,,2024-09-11
6-
Pre2020Surveys_Pooled.csv,external,,2024-09-11
7-
trials.Rds,registry,,2024-09-11
8-
consistency.csv,metadata,,2024-09-11
1+
"filename","path","sha256sum","asofdate"
2+
"anes_timeseries_2020_stata_20220210_hill_roberts_subset.csv","external","75d050817b922b02b444b58c279f5d770f01376321d894c80b76364063248b63","Fri Sep 13 07:53:19 2024"
3+
"China2021Data.csv","external","2ce75c757229ddb631cadccc9c4063537de313d3a754fed13f93054656261bba","Fri Sep 13 07:53:19 2024"
4+
"LucidPlotText.csv","external","992daee8392ebeee510290a7b5895d53028c358b892a9d7c10a6391b5d979194","Fri Sep 13 07:53:19 2024"
5+
"LucidWideFile.csv","external","767eb7b05fc1910603780680476cf32fee845728b4c7346e9f0ada6ed0c84f04","Fri Sep 13 07:53:19 2024"
6+
"Pre2020Surveys_Pooled.csv","external","17b6cc33b8bdc3938aab5b59de0dc4fa2b724d1ad997060385466d0899dae82e","Fri Sep 13 07:53:19 2024"
7+
"trials.Rds","registry","07e0a48dc2bb5896fd89540915f431eb6e1b36b374c848c1edfbabee8d5e6ad7","Fri Sep 13 07:53:19 2024"
8+
"consistency.csv","metadata","2732f4a0fa6a580fc601266ed34e76d6b276a256a09e95232022b7b53f4a9025","Fri Sep 13 07:53:19 2024"

0 commit comments

Comments
 (0)