-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathquality_checks.sh
executable file
·104 lines (86 loc) · 4.8 KB
/
quality_checks.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#!/bin/bash
#Quality checks to review the data.json file for cdc.gov
#depends on jq1.5+
#There are two CDC catalogs: the proper one, https://data.cdc.gov/data.json, and
#https://www.cdc.gov/wcms/opendata/data.json, which nobody knows the author of but
#which HHS harvests anyway. The WCMS file is downloaded on every run instead of being
#checked in (I think because its history does not matter much and it is not a file I work on).
#--fail makes curl return an error on an HTTP error status rather than saving the
#server's error page as if it were the catalog; --show-error keeps the reason visible
#even though the progress meter is silenced.
if ! curl --fail --silent --show-error --location \
    --output data/wcms.opendata.data.json \
    https://www.cdc.gov/wcms/opendata/data.json; then
  echo "warning: could not download https://www.cdc.gov/wcms/opendata/data.json; WCMS results below may be stale or missing" >&2
fi
#HHS escapes all their slashes and this is bad for comparing identifiers since CDC uses URIs with https:// addresses, so I need to replace \/ with /
#The previous version ran sed inside backticks, where one level of backslashes is
#consumed before the command runs: sed received s/\//\//g, which replaces "/" with "/"
#and therefore changed nothing. It also expanded the file contents unquoted, so
#whitespace inside JSON strings was collapsed and any "*" was open to glob expansion.
#NOTE(review): the rest of this script reads data/hhs.gov.data.json, not
#data/hhs.data.json or the cleaned copy written here - confirm which file is intended.

# Filter: copies stdin to stdout with every JSON-escaped "\/" turned into "/".
unescape_slashes() {
  sed 's|\\/|/|g'
}

#sed on OSX adds a trailing newline; command substitution strips it and
#printf '%s' writes the text back without adding one.
printf '%s' "$(unescape_slashes < data/hhs.data.json)" > data/hhs.data-clean.json
#Dataset counts per catalog: the length of the dataset array, followed by wc of the
#title list (lines, words, characters) as a cross-check.
main_catalog=data/cdc.gov.data.json
wcms_catalog=data/wcms.opendata.data.json
hhs_catalog=data/hhs.gov.data.json

printf '%s\n' 'https://data.cdc.gov/data.json aka "Main" has this many datasets'
jq '.dataset | length' "$main_catalog"
jq '.dataset[].title' "$main_catalog" | wc

printf '%s\n' 'https://www.cdc.gov/wcms/opendata/data.json aka "WCMS" has this many datasets'
jq '.dataset | length' "$wcms_catalog"
jq '.dataset[].title' "$wcms_catalog" | wc

#HHS carries datasets from many publishers; only the two CDC publisher names are counted.
printf '%s\n' 'https://www.healthdata.gov/data.json aka "HHS" has this many datasets from CDC'
jq '.dataset[] | select((.publisher.name=="Centers for Disease Control and Prevention") or (.publisher.name=="Centers for Disease Control and Prevention, Department of Health & Human Services")).title' "$hhs_catalog" | wc

#Distinct bureauCode values, one compact JSON value per line.
printf '\n%s\n' 'Main has these unique bureauCodes'
jq -c '.dataset[].bureauCode' "$main_catalog" | sort | uniq

printf '\n%s\n' ' WCMS has these unique bureauCodes'
jq -c '.dataset[].bureauCode' "$wcms_catalog" | sort | uniq
#######################################
# Report landing pages that appear in one catalog but not in another.
# Arguments:
#   $1 - catalog file whose landing pages are checked
#   $2 - catalog file the landing pages are looked up in
#   $3 - optional jq expression selecting which datasets of $1 to check
#        (defaults to every dataset: .dataset[])
# Outputs:
#   For each landing page of $1 that is absent from $2: "NO MATCH", the landing
#   page, and the identifier(s) of the datasets in $1 that carry it. Finally the
#   number of matches and of non-matches (0 when there are none).
# Returns:
#   1 if either catalog file is unreadable, so that a missing file is not
#   reported as every page failing to match.
#######################################
report_unmatched_landing_pages() {
  local source_file=$1
  local lookup_file=$2
  local dataset_selector="${3:-.dataset[]}"
  local landingPage match file
  # Start the counters at 0 so an empty category prints "0", not an empty string.
  local matched=0 unmatched=0

  for file in "$source_file" "$lookup_file"; do
    if [[ ! -r "$file" ]]; then
      printf 'cannot read %s; skipping comparison\n' "$file" >&2
      return 1
    fi
  done

  while read -r landingPage; do
    # The landing page is handed to jq as a variable (--arg), not spliced into the
    # program text, so quotes or backslashes in a URL cannot break the filter.
    match=$(jq --arg page "$landingPage" \
      '.dataset[] | select(.landingPage == $page).landingPage' "$lookup_file")
    if [[ -n "$match" ]]; then
      matched=$((matched + 1))
    else
      unmatched=$((unmatched + 1))
      echo "NO MATCH"
      printf '%s\n' "$landingPage"
      echo "Also, here's the identifier, so you can check to see if it's in HHS"
      jq --arg page "$landingPage" \
        '.dataset[] | select(.landingPage == $page).identifier' "$source_file"
    fi
  done < <(jq -r "${dataset_selector} | .landingPage" "$source_file")

  echo "$matched" "matches"
  echo "$unmatched" "no matches"
}

echo -e "\nLoop through all the titles in WCMS them and see if any of them are also in Main, printing out landing pages that don't match. This is weird and should be investigated."
report_unmatched_landing_pages data/wcms.opendata.data.json data/cdc.gov.data.json

echo -e "\nLoop through all the landingPages in Main them and see if any of them are missing in HHS, printing out landing pages that don't match. This is weird and should be investigated."
report_unmatched_landing_pages data/cdc.gov.data.json data/hhs.gov.data.json

echo -e "\nLoop through all the landingPages in HHS where CDC is the publisher see if any of them are missing in Main, printing out landing pages that don't match. This is weird and should be investigated."
#Only the HHS datasets published under one of the two CDC publisher names are checked.
report_unmatched_landing_pages data/hhs.gov.data.json data/cdc.gov.data.json \
  '.dataset[] | select((.publisher.name=="Centers for Disease Control and Prevention") or (.publisher.name=="Centers for Disease Control and Prevention, Department of Health & Human Services"))'
#Notes: as of 5/6, healthdata.gov shows 506 datasets for CDC. These JSONPath queries against its data.json return:
#$.dataset[?(@.publisher.name=="Centers for Disease Control and Prevention")] returns 458
#$.dataset[?(@.publisher.name=="Centers for Disease Control and Prevention, Department of Health & Human Services")].title returns 46
#All CDC datasets should have a bureauCode of 009:20 and a programCode of 009:020,
#but $.dataset[?(@.bureauCode=="009:20")].title returns only 204