Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,30 @@ config_run.json
config_system.json
config_setup.R
*.stamp
dimagi-data-platform-R.Rproj
output_dag.csv
all_monthly.csv
tula_data.csv
attrition_data.csv
attrition_study.csv
nusers_domain.csv
nusers_lifetime_stats.csv
pathfinder_nusers.csv
sample_monthly.csv
dup_pk.csv
mobile_users.csv
web_users1.csv
blog_data_11_13_14.csv
config_active_real.json
aggregate_monthly.csv
blog_data.csv
image.csv
training_set_results.csv
Rplots.pdf
blog_active.csv
can_use_true.csv
blog_keep.csv
exclude_domains.csv
output_dag_all.csv
config_all_latest.json
config_run_test_false.json
24 changes: 10 additions & 14 deletions analysis_scripts/raw_data/db_table_import.R
Original file line number Diff line number Diff line change
Expand Up @@ -26,32 +26,28 @@ form_table <- tbl(db, "form")
#NOTE(review): get_data_source() is defined outside this section; the trailing
#1000 presumably caps the number of rows pulled - confirm against its definition
form_table <- get_data_source(db, "form", 1000) #limited number of forms
#collect() brings the result into a dataframe
form_table <- collect(form_table)

formdef <- get_data_source(db, "formdef", 1000)
formdef <- collect(formdef)

#NOTE(review): visit is assigned three times in a row. Only the last assignment
#(get_data_source) is in effect afterwards, and that result is not passed to
#collect() here - confirm which of these lines is meant to stay
visit <- tbl(db, "visit")
visit <- collect(visit)
visit <- get_data_source(db, "visit", 1000)
#Tables referenced through tbl() with no collect() call in this section
app <- tbl(db, "application")
device_log <- tbl(db, "device_log")
domain <- tbl(db, "domain")
users <- tbl(db, "users")

#NOTE(review): repeats the users assignment a few lines above - one is redundant
users <- tbl(db, "users")
#Get table for user_type (mobile, web, superuser, etc.)
#This function has been defined in data_sources.R
user_type <- get_user_type_table(db)

#Get visit_detail data source
#First run functions in data_sources.R
visit_detail <- get_visit_detail(db, 1000)
#------------------------------------------------------------------------#
#Calculations on DB tables
#------------------------------------------------------------------------#

#Use collect() to bring these results into a dataframe
# Count forms per app_id
#NOTE(review): %.% looks like dplyr's early chaining operator, which later dplyr
#releases replaced with %>% - confirm the installed dplyr version still has it
sum_forms <-
form_table %.%
group_by(app_id) %.%
summarise(form_total = count(form_id))

#Quick interactive inspection of the summary
summary(sum_forms)
dim(sum_forms)
#NOTE(review): summarise() above only names a form_total column; no column
#called select is created in this section - confirm what $select should show
head(sum_forms$select)
#Get case tables
cases <- tbl(db, "cases")
cases <- collect(cases)



45 changes: 45 additions & 0 deletions analysis_scripts/raw_data/pull_visit_offset.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#Pull visit times or form times from the main visit or form table using an offset parameter (per Yedi).
#This ensures that the visit/form date/time stamps are pulled correctly without dropping the time component.

#Number of rows in visit table through June 2015 = 7,890,421
#Time component drops with offset > 0 and limit approximately > 10,000
#test <- visit_pull_times(db)

#Form table has 12,787,766 rows through June 2015

#Function to pull visit times from the visit table using limit/offset paging.
#
#The query joins form to visit on form.visit_id = visit.id and groups by the
#selected visit columns, so each visit that has at least one form row comes
#back once with its id, time_start and time_end.
#
#Arguments:
#  db     - database handle; only its `con` element is used.
#  limit  - maximum number of rows to return. Anything that is not a single
#           value > 0 (including the default -1) means "no limit".
#  offset - number of rows to skip. Only applied when it is > 0 AND a limit
#           is in effect; otherwise it is ignored.
#Returns whatever do_query() returns for the assembled query.
visit_pull_times <- function (db, limit=-1, offset=-1) {
  con <- db$con

  #Render numbers as plain digits. paste0() on a double such as 100000 would
  #otherwise put "1e+05" into the SQL text.
  as_sql_number <- function(x) format(x, scientific = FALSE, trim = TRUE)

  #order by visit.id gives the rows one fixed order, so that successive
  #limit/offset pages neither overlap nor skip rows. Without an ordering the
  #database is free to return a different subset for each page.
  query <- paste(
    'select visit.id, visit.time_start, visit.time_end',
    'from form, visit',
    'where form.visit_id = visit.id',
    'group by visit.id, visit.time_start, visit.time_end',
    'order by visit.id',
    sep = '\n'
  )

  #isTRUE() keeps NA or zero-length arguments from breaking the if() tests
  with_limit <- isTRUE(limit > 0)
  with_offset <- isTRUE(offset > 0)

  if (with_limit) {
    query <- paste0(query, ' limit ', as_sql_number(limit))
  }
  #Scalar && (not elementwise &) is the right operator inside if()
  if (with_limit && with_offset) {
    query <- paste0(query, ' offset ', as_sql_number(offset))
  }

  do_query(con, query)
}

#Pull visit table time stamps in pages of 10,000 rows and stack them into
#visit_times. The leading -1 makes the first call run without an offset, so
#the first page starts at the first row.
offset_vec <- c(-1, seq(10000, 7890000, by = 10000))

#Collect every page in a list and bind once at the end. Growing visit_times
#with rbind() inside a loop re-copies all rows pulled so far on each pass.
visit_batches <- lapply(offset_vec, function(batch_offset) {
  visit_pull_times(db, limit = 10000, offset = batch_offset)
})
visit_times <- do.call(rbind, visit_batches)
Loading