|
| 1 | +# From Stata |
| 2 | + |
| 3 | +# if $redownload == 1 { |
| 4 | +# copy "https://datahub.io/core/country-codes/r/country-codes.csv" "data/raw/country-codes.csv", replace |
| 5 | +# // create checksum of file |
| 6 | +# // Aug 2023 version: 2295658388 |
| 7 | +# global countrycksum 2295658388 |
| 8 | +# checksum "data/raw/country-codes.csv", save |
| 9 | +# assert $countrycksum == r(checksum) |
| 10 | +# // This will fail if the files are not identical |
| 11 | +# // Provide a verbose message if we get past this point |
| 12 | +# disp in green "Country codes file downloaded successfully" |
| 13 | +# } |
| 14 | + |
| 15 | +library(dplyr) |
| 16 | +library(openssl) |
| 17 | +library(tools) |
| 18 | + |
# Compute the SHA-256 digest of a file as a single hexadecimal string.
#
# Args:
#   filepath: path of the file to hash (a single character string).
#
# Returns:
#   The digest as a character scalar, or NA_character_ when the file does
#   not exist or cannot be hashed. In both failure cases a warning naming
#   the file is raised, so a failure is visible rather than silently
#   collapsing into NA.
calculate_sha256 <- function(filepath) {
  # Reject unusable paths up front instead of creating a connection for a
  # file that is not there.
  if (!is.character(filepath) || length(filepath) != 1L ||
      is.na(filepath) || !file.exists(filepath)) {
    warning("Cannot hash missing file: ", format(filepath), call. = FALSE)
    return(NA_character_)
  }
  tryCatch({
    hash <- sha256(file(filepath))
    # Convert the raw digest to a single hexadecimal string
    as.character(hash, sep = "")
  }, error = function(e) {
    # Keep the best-effort NA result, but report why hashing failed.
    warning("Failed to hash ", filepath, ": ", conditionMessage(e),
            call. = FALSE)
    NA_character_
  })
}
| 29 | + |
# Check whether a file's SHA-256 digest matches an expected value.
#
# Args:
#   filepath: path of the file to check.
#   expected_hash: expected digest as a hexadecimal string.
#
# Returns:
#   TRUE when the digest could be computed and equals expected_hash
#   (ignoring letter case and surrounding whitespace); FALSE otherwise,
#   including when the computed or the expected hash is NA.
verify_checksum <- function(filepath, expected_hash) {
  calculated_hash <- calculate_sha256(filepath)
  if (is.na(calculated_hash)) {
    return(FALSE)
  }
  # Normalise the stored value so case or stray whitespace introduced by
  # editing the metadata file does not cause a spurious mismatch.
  expected_hash <- tolower(trimws(as.character(expected_hash)))
  matches <- tolower(calculated_hash) == expected_hash
  # A missing expected hash compares as NA; report it as a failure so that
  # callers can safely sum() the result.
  !is.na(matches) & matches
}
| 38 | + |
# Run-control switches ----
# `generate` gates the step that recomputes the hashes and rewrites the
# metadata file; `reprocess` gates the processing step at the end.
reprocess <- TRUE
generate <- TRUE

# Directory layout ----
# Generalized file paths. These could live in an externally sourced file
# that is treated as confidential.
data.path <- "data/"
data.path.external <- file.path(data.path, "external")
data.path.registry <- file.path(data.path, "registry")
data.path.metadata <- file.path(data.path, "metadata")
| 50 | + |
| 51 | + |
# (Re)generate the recorded checksums: hash every file listed in the
# metadata and write the result back to the same CSV file.
if (generate) {
  metadata_file <- file.path(data.path.metadata, "consistency.csv")
  metadata <- read.csv(metadata_file)
  # Take the timestamp once so every row of a run carries the same value
  # instead of one evaluated separately per row.
  generated_on <- date()
  # Add SHA256 hash column
  metadata <- metadata %>%
    rowwise() %>%
    mutate(
      # The `path` column is written back unchanged, so the directory
      # variable must be resolved exactly as the verification step does:
      # "data.path." followed by the value of `path`.
      sha256sum = calculate_sha256(
        file.path(get(paste0("data.path.", path)), filename)
      ),
      asofdate = generated_on
    ) %>%
    ungroup() %>%
    select(filename, path, sha256sum, asofdate)
  # Write the updated metadata back to a CSV file
  write.csv(metadata, metadata_file, row.names = FALSE)
}
| 64 | + |
# Check the checksums before proceeding: every file listed in the metadata
# is hashed and compared with its recorded SHA-256 value, and the script
# stops if any file fails.

# Verify all checksums in the metadata
metadata <- read.csv(file.path(data.path.metadata, "consistency.csv"))
metadata <- metadata %>%
  rowwise() %>%
  mutate(
    # isTRUE() turns an NA comparison (e.g. an empty recorded hash) into a
    # plain failure so the counts below are never NA.
    checksum_verified = isTRUE(
      verify_checksum(
        file.path(get(paste0("data.path.", path)), filename),
        sha256sum
      )
    )
  )

# Display results
message("Verification results for files")
print(metadata %>% select(filename, sha256sum, checksum_verified))

# Count of verified and failed checksums
summary <- metadata %>%
  ungroup() %>%
  summarise(
    total_files = n(),
    verified = sum(checksum_verified),
    failed = sum(!checksum_verified)
  )
print(summary)

# Enforce the check: the results above are informational only unless the
# run is halted when a file does not match its recorded hash.
if (summary$failed > 0) {
  stop(
    summary$failed, " of ", summary$total_files,
    " file(s) failed checksum verification",
    call. = FALSE
  )
}
| 88 | + |
# Placeholder for the processing step; runs only when `reprocess` is TRUE.
if (reprocess) {
  # Do stuff here
}
| 92 | + |
| 93 | + |
| 94 | + |
0 commit comments