Skip to content

Commit

Permalink
Merge branch 'keyword_searches'
Browse files Browse the repository at this point in the history
  • Loading branch information
hhay1981 committed Mar 15, 2024
2 parents 7642708 + 0a9ba3a commit 70d74e3
Show file tree
Hide file tree
Showing 21 changed files with 2,989 additions and 97 deletions.
4 changes: 4 additions & 0 deletions kw_req.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
openpyxl==3.1.2
pandas==2.0.2
python-docx==0.8.11
slate3k==0.5.3
193 changes: 193 additions & 0 deletions scripts/2024_hdrive_category_report_ministries.Rmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
---
author: "Copyright: Optimization Team 2024"
date: "Compiled on `r format(Sys.time(), '%B %d, %Y')`"
# `params` has to be declared here: the subtitle below evaluates
# `params$collected`, and without this declaration rendering stops with
# "object 'params' not found". Keep the value in step with `collected` in the
# "date variables for plots" chunk, or override it via
# rmarkdown::render(params = list(collected = ...)).
params:
  collected: "2024-02-15"
output:
  prettydoc::html_pretty:
    theme: architect
title: |
  ![](GFX_OptimizationLogo-Icon_v2.png){width=350px} <br>
  NRM H Drive Category Reports - Split by Ministry
subtitle: "Data collected on `r params$collected`"
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, fig.width = 9, fig.height = 7, fig.path = 'figure/')
options(readr.show_col_types = FALSE)
library(data.table)
library(DT)
library(dbplyr)
library(dplyr)
library(forcats)
library(extrafont)
library(glue)
library(here)
library(knitr)
library(stringr)
library(tidyverse)
library(rmarkdown)
library(gridExtra)
library(shadowtext)
library(scales)
library(easyr)
library(ggplot2)
library(ggforce)
library(ggsci)
library(ggtext)
library(hrbrthemes)
#font_import()
#loadfonts(device = "win")
#theme_set(theme_bw())
here::here()
```

```{r, date variables for plots, include = FALSE}
# Reporting period labels; both end up in the name of the exported PDF.
fiscal <- "FY23-24"
quarter <- "Q4"
# Date the summary reports were collected; formatted into the plot caption.
collected <- as.Date("2024-02-15")
# Caption text passed to labs(caption = ...) by the chart function.
cap <- glue('Source: OCIO Summary Report from {format(collected, "%B %d, %Y")}')
# Base name (no extension) of the PDF that collects the ministry plots.
pdfname <- sprintf('Ministry_HDrive_Plots_%s_%s', quarter, fiscal)
```

```{r load enhanced H drive data, echo = FALSE}
# Locate every "<ACRONYM>-U FileTypeCategory Summary Report.csv" file in the
# project's source/ folder.
# `pattern` is a regular expression (not a glob), so the dot is escaped and the
# match is anchored at the end of the file name.
report_suffix <- "-U FileTypeCategory Summary Report\\.csv$"
hdrive.list <- list.files(here("source"), pattern = report_suffix, full.names = TRUE)
# Fail with a clear message instead of an obscure relocate() error further down.
if (length(hdrive.list) == 0) {
  stop(
    "No '-U FileTypeCategory Summary Report.csv' files found in ",
    here("source"),
    call. = FALSE
  )
}
# get ministry acronyms from file names (file name minus the report suffix)
ministry.short <- sub(report_suffix, "", basename(hdrive.list))
# correct outdated ministry acronyms
ministry.short <- gsub("LWRS", "WLRS", ministry.short, fixed = TRUE)
# read each report and tag its rows with the ministry acronym
hdf.list <- lapply(hdrive.list, fread, stringsAsFactors = FALSE)
hdf.list <- Map(cbind, hdf.list, Ministry = ministry.short)
hdf <- do.call(rbind, hdf.list)
# create dataframe of enhanced H drive data; the Map columns are moved to sit
# right after the Images columns so that the positional renaming in the next
# chunk lines up with the order of its category names
h_raw_data <- data.frame(hdf) %>%
  relocate(Ministry, .before = AppData.Size.GB) %>%
  relocate(Map.Size.GB, .after = Images.File.Count) %>%
  relocate(Map.File.Count, .after = Map.Size.GB)
#h_raw_data
```

```{r create list of dataframes, echo = FALSE}
# Ministries to report on and the file type categories, in the column order
# produced by the load chunk.
ministries <- c("AF", "BCWS", "EMLI", "ENV", "FOR", "IRR", "WLRS")
categories <- c(
  "AppData", "Archive", "Audio", "Backups", "CAD", "Database", "Disk Images",
  "Documents", "Email", "Empty Extension", "Encase", "Executables", "Images",
  "Map", "P2P", "Source Code", "System", "Temporary", "Video", "Web Page"
)
# Drop the user and file-count columns, then rename the remaining columns
# positionally: Ministry first, followed by one size column per category.
h_raw_data <- h_raw_data %>%
  select(-all_of("User"), -contains("Count")) %>%
  set_names(nm = c("Ministry", categories))
# One single-row data frame of per-category totals for each ministry.
# Values are rounded to one decimal before summing; NA sizes are ignored so a
# single missing value does not turn a whole category total into NA.
min_df <- lapply(ministries, function(ministry) {
  h_raw_data %>%
    filter(Ministry == ministry) %>%
    mutate(across(where(is.double), ~ round(.x, 1))) %>%
    summarise(across(where(is.numeric), ~ sum(.x, na.rm = TRUE)))
})
names(min_df) <- ministries
# Expose each table under its acronym (AF, BCWS, ...) in the global
# environment; invisible() keeps the returned environment out of the report.
invisible(list2env(min_df, envir = .GlobalEnv))
```

```{r loop to factor each dataframe in list, echo = FALSE}
#' Reshape one table of category totals into a long two-column data frame.
#'
#' The values are looked up by name from the global `categories` vector, so the
#' function follows that vector instead of repeating every category by hand.
#'
#' @param x A data frame (or list) holding one element per name in `categories`.
#' @return A data.frame with `File_Category` (factor, levels in `categories`
#'   order) and `Size_GB` (the matching values of `x`, in the same order).
min_fct_df <- function(x) {
  data.frame(
    File_Category = factor(categories, levels = categories),
    Size_GB = unlist(x[categories], use.names = FALSE)
  )
}
# Turn each ministry's totals into a long table and re-level the category
# factor by ascending size, which fixes the order of the categories on the
# chart axis. lapply() keeps the list names and does not leave a loop variable
# called `df` behind in the global environment.
min_df <- lapply(min_df, function(totals) {
  long <- min_fct_df(totals)
  long$File_Category <- factor(categories, levels = categories[order(long$Size_GB)])
  long
})
# Refresh the per-ministry globals (AF, BCWS, ...) with the reshaped tables;
# invisible() keeps the returned environment out of the rendered report.
invisible(list2env(min_df, envir = .GlobalEnv))
```

```{r function to create horizontal bar charts, echo = FALSE}
#' Horizontal bar chart of storage consumption by file type category.
#'
#' @param df Data frame with a `File_Category` column (drawn on the y axis) and
#'   a numeric `Size_GB` column (drawn on the x axis).
#' @param x_max Upper limit of the size axis. If the data contain a larger
#'   value the limit is widened to the next `x_step` multiple plus half a step,
#'   so the scale never drops a bar.
#' @param x_step Distance between axis breaks (and vertical grid lines).
#' @param label_threshold Bars smaller than this get a dark label at the end of
#'   the bar; bars at or above it get a light label at the left, on the bar.
#' @param title Plot title. When `NULL`, the expression passed as `df` is used:
#'   `ministry_h_chart(AF)` is titled "AF H Drive Analysis".
#' @return The ggplot object (returned visibly). The caption is read from the
#'   global `cap`.
ministry_h_chart <- function(df, x_max = 10500, x_step = 1000,
                             label_threshold = 500, title = NULL) {
  if (is.null(title)) {
    title <- paste0(deparse(substitute(df)), " H Drive Analysis")
  }

  # Values outside the scale limits are removed by ggplot, so make sure the
  # axis always covers the largest bar.
  largest <- max(df$Size_GB, 0, na.rm = TRUE)
  if (largest > x_max) {
    x_max <- ceiling(largest / x_step) * x_step + x_step / 2
  }

  small_bars <- df[which(df$Size_GB < label_threshold), , drop = FALSE]
  large_bars <- df[which(df$Size_GB >= label_threshold), , drop = FALSE]

  ggplot(df) +
    geom_col(aes(Size_GB, File_Category), fill = "#234075", width = 0.7) +
    scale_x_continuous(
      limits = c(0, x_max),
      breaks = seq(0, x_max, by = x_step),
      expand = c(0, 0), # The horizontal axis does not extend to either side
      position = "top" # Labels are located on the top
    ) +
    # The vertical axis only extends upwards
    scale_y_discrete(expand = expansion(add = c(0, 0.5))) +
    # Small bars: label placed just past the end of the bar
    geom_shadowtext(
      data = small_bars,
      aes(Size_GB, y = File_Category, label = Size_GB),
      hjust = -0.1,
      nudge_x = 0.3,
      colour = "#234075",
      bg.colour = "#F2F2F2",
      bg.r = 0.2,
      family = "BC Sans",
      size = 2.5
    ) +
    # Large bars: light label anchored at the left edge, on top of the bar
    geom_text(
      data = large_bars,
      aes(0, y = File_Category, label = Size_GB),
      hjust = -0.1,
      nudge_x = 0.3,
      colour = "#F2F2F2",
      family = "BC Sans",
      size = 2.5
    ) +
    labs(
      x = "Size (GB)",
      y = "File Type Category",
      title = title,
      subtitle = "Storage Consumption by File Type Category",
      caption = cap
    ) +
    theme(
      # Light grey panel background
      panel.background = element_rect(fill = "#F2F2F2"),
      panel.grid.minor = element_blank(),
      # Colour and width of the vertical grid lines.
      # NOTE(review): ggplot2 >= 3.4.0 prefers `linewidth` over `size` for
      # lines; kept as `size` so older installations keep working.
      panel.grid.major.x = element_line(color = "#FCBA19", size = 0.3),
      # The y axis title is hidden; the x axis title is shown
      axis.title.y = element_blank(),
      axis.title.x = element_text(family = "BC Sans", hjust = 0, size = 8),
      # Only the left line of the vertical axis is drawn
      axis.line.y.left = element_line(color = "#313132"),
      # customize labels for the vertical & horizontal axis
      axis.text.y = element_text(family = "BC Sans", hjust = 0, size = 8.5),
      axis.text.x = element_text(family = "BC Sans", hjust = 0, size = 7),
      plot.title = element_text(family = "BC Sans", face = "bold", hjust = 0, size = 14),
      plot.subtitle = element_text(family = "BC Sans", size = 12),
      plot.caption = element_text(family = "BC Sans", color = "#606060", size = 10),
      plot.margin = margin(0.5, 1.8, 0.5, 0.5, "cm")
    )
}
```

```{R run new function, echo = FALSE}
# Build one chart per ministry. Each table is passed by its bare global name
# because ministry_h_chart() takes its default title from the argument
# expression.
chart_AF_hdrive <- ministry_h_chart(AF)
chart_BCWS_hdrive <- ministry_h_chart(BCWS)
chart_EMLI_hdrive <- ministry_h_chart(EMLI)
chart_ENV_hdrive <- ministry_h_chart(ENV)
chart_FOR_hdrive <- ministry_h_chart(FOR)
chart_IRR_hdrive <- ministry_h_chart(IRR)
chart_WLRS_hdrive <- ministry_h_chart(WLRS)
# Collect the charts in page order.
plotlist <- list(
  chart_AF_hdrive,
  chart_BCWS_hdrive,
  chart_EMLI_hdrive,
  chart_ENV_hdrive,
  chart_FOR_hdrive,
  chart_IRR_hdrive,
  chart_WLRS_hdrive
)
# Write every chart to one multi-page PDF in the figure/ folder.
pdf(here('figure',glue(pdfname,".pdf")))
for (chart in plotlist) {
  print(chart)
}
dev.off()
```
187 changes: 187 additions & 0 deletions scripts/2024_hdrive_category_report_total.Rmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
---
author: "Copyright: Optimization Team 2023"
date: "Compiled on `r format(Sys.time(), '%B %d, %Y')`"
# `params` has to be declared here: the subtitle below evaluates
# `params$collected`, and without this declaration rendering stops with
# "object 'params' not found". Keep the value in step with `collected` in the
# "date variables for plots" chunk, or override it via
# rmarkdown::render(params = list(collected = ...)).
params:
  collected: "2024-02-15"
output:
  prettydoc::html_pretty:
    theme: architect
title: |
  ![](GFX_OptimizationLogo-Icon_v2.png){width=350px} <br>
  NRM H Drive Consumption by File Type Category
subtitle: "Data collected on `r params$collected`"
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo=TRUE, fig.width=9, fig.height=7, fig.path='figure/')
options(readr.show_col_types = FALSE)
library(data.table)
library(DT)
library(dbplyr)
library(dplyr)
library(forcats)
library(glue)
library(here)
library(htmltools)
library(knitr)
library(stringr)
library(tidyverse)
library(rmarkdown)
library(grid)
library(shadowtext)
library(scales)
library(easyr)
library(extrafont)
library(ggsci)
library(ggplot2)
#font_import()
#loadfonts(device = "win")
#theme_set(theme_bw())
here::here()
```

```{r, date variables for plots}
# Reporting period labels; both end up in the name of the exported PDF.
fiscal <- "FY23-24"
quarter <- "Q4"
# Date the summary reports were collected; formatted into the plot caption.
collected <- as.Date("2024-02-15")
# Caption text used for the chart's caption.
cap <- glue('Source: OCIO Summary Report from {format(collected, "%B %d, %Y")}')
```

```{r BCGOV colour palette, include = FALSE}
# BC Gov colour palette: nine hex colour codes, kept in this fixed order.
bcgov_col <- c(
  "#234075", "#e3a82b", "#313132",
  "#65799e", "#FCBA19", "#606060",
  "#003366", "#38598A", "#1A5A96"
)
```

```{r load nrm_h_data, include = FALSE}
#h_raw_data <- fread((here("source", glue("2023-04_NRM_Enhanced_HDrive_Usage.csv", stringsAsFactors = FALSE))))
```

```{r load enhanced H drive data, include = FALSE}
# Locate every "<ACRONYM>-U FileTypeCategory Summary Report.csv" file in the
# project's source/ folder.
# `pattern` is a regular expression (not a glob), so the dot is escaped and the
# match is anchored at the end of the file name.
report_suffix <- "-U FileTypeCategory Summary Report\\.csv$"
hdrive.list <- list.files(here("source"), pattern = report_suffix, full.names = TRUE)
# Fail with a clear message instead of an obscure relocate() error further down.
if (length(hdrive.list) == 0) {
  stop(
    "No '-U FileTypeCategory Summary Report.csv' files found in ",
    here("source"),
    call. = FALSE
  )
}
# get ministry acronyms from file names (file name minus the report suffix)
ministry.short <- sub(report_suffix, "", basename(hdrive.list))
# correct outdated ministry acronyms
ministry.short <- gsub("LWRS", "WLRS", ministry.short, fixed = TRUE)
# read each report and tag its rows with the ministry acronym
hdf.list <- lapply(hdrive.list, fread, stringsAsFactors = FALSE)
hdf.list <- Map(cbind, hdf.list, Ministry = ministry.short)
hdf <- do.call(rbind, hdf.list)
# create dataframe of enhanced H drive data, with Ministry moved in front of
# the first category column
h_raw_data <- data.frame(hdf)
h_raw_data <- h_raw_data %>%
  relocate(Ministry, .before = AppData.Size.GB)
```

```{r create nrm_h_data table, include = FALSE}
# Round every double column to one decimal place. across() replaces the
# superseded mutate_if() and yields the same columns.
h_info <- h_raw_data %>%
  mutate(across(where(is.double), ~ round(.x, 1)))
head(h_info)
```

```{r sum up NRM GB columns, include = FALSE}
# Collapse all rows into one row of column totals for every numeric column,
# ignoring NA values. across() replaces the superseded summarise_if() and keeps
# the original column names.
min_h_catsums <- h_info %>%
  summarise(across(where(is.numeric), ~ sum(.x, na.rm = TRUE)))
head(min_h_catsums)
```

```{r category names for plots, include = FALSE}
# File type category labels, in the order the chart data frame is assembled.
categories <- c(
  "AppData", "Archive", "Audio", "Backups", "CAD",
  "Database", "Disk Images", "Documents", "Email", "Empty Extension",
  "Encase", "Executables", "Images", "Map", "P2P",
  "Source Code", "System", "Temporary", "Video", "Web Page"
)
```

```{r create nrm_h_dataframe for NRM charts, include = FALSE}
# Column holding each category's total: spaces in the label become dots and
# ".Size.GB" is appended (e.g. "Disk Images" -> "Disk.Images.Size.GB").
# Deriving the names from `categories` keeps the two lists from drifting apart.
size_cols <- paste0(gsub(" ", ".", categories, fixed = TRUE), ".Size.GB")
size_gb <- unlist(min_h_catsums[size_cols], use.names = FALSE)
# Long table for the charts. The factor levels are ordered by ascending size,
# which fixes the order of the categories on the chart axis.
nrm_h_data <- data.frame(
  Size_GB = size_gb,
  File_Category = factor(categories, levels = categories[order(size_gb)]),
  y = seq_along(categories) * 0.9
)
```

```{r step 1 Basic barchart, include = FALSE}
# Step 1: start from the data, then add one horizontal bar per category.
# The aesthetics stay on the bar layer only, so later layers do not inherit
# them.
total_chart <- ggplot(data = nrm_h_data)
total_chart <- total_chart +
  geom_col(
    mapping = aes(x = Size_GB, y = File_Category),
    fill = "#234075",
    width = 0.7
  )
total_chart
```

```{r step 2 Customize layout, include = FALSE}
# Step 2a: axes. The size axis sits on top, starts at zero with no padding,
# and has a break every 2000 GB; the category axis is padded upwards only.
total_chart <- total_chart +
  scale_x_continuous(
    position = "top",
    limits = c(0, 23500),
    breaks = seq(0, 23000, by = 2000),
    expand = c(0, 0)
  ) +
  scale_y_discrete(expand = expansion(add = c(0, 0.5)))
# Step 2b: theme.
total_chart <- total_chart +
  theme(
    # Panel: light grey fill, no minor grid, thin vertical grid lines
    panel.background = element_rect(fill = "#F2F2F2"),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_line(color = "#FCBA19", size = 0.3),
    # Axis titles are hidden at this stage
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    # Only the left line of the vertical axis is drawn
    axis.line.y.left = element_line(color = "#313132"),
    # Left-aligned tick labels on both axes
    axis.text.x = element_text(family = "BC Sans", hjust = 0, size = 7),
    axis.text.y = element_text(family = "BC Sans", hjust = 0, size = 8.5)
  )
total_chart
```

```{r step 3 Add labels, include = FALSE}
# Step 3a: categories below 1200 GB get a dark label just past the end of the
# bar, with a halo in the panel colour.
total_chart <- total_chart +
  geom_shadowtext(
    data = nrm_h_data[which(nrm_h_data$Size_GB < 1200), ],
    mapping = aes(x = Size_GB, y = File_Category, label = Size_GB),
    family = "BC Sans",
    size = 3,
    colour = "#234075",
    bg.colour = "#F2F2F2",
    bg.r = 0.2,
    hjust = -0.1,
    nudge_x = 0.3
  )
# Step 3b: categories of 1200 GB or more get a light label anchored at the
# left edge, on top of the bar.
total_chart <- total_chart +
  geom_text(
    data = nrm_h_data[which(nrm_h_data$Size_GB >= 1200), ],
    mapping = aes(x = 0, y = File_Category, label = Size_GB),
    family = "BC Sans",
    size = 3,
    colour = "#F2F2F2",
    hjust = -0.1,
    nudge_x = 0.3
  )
total_chart
```

```{r step 4 Add annotations and final tweaks, include = FALSE}
# Step 4: titles, caption and text styling. The chart is finished before the
# PDF device is opened, so a failure while building it cannot leave a device
# open.
total_chart <- total_chart +
  labs(
    x = "Size (GB)",
    y = "File Type Category",
    title = "H Drive Content by File Type Category",
    subtitle = "NRM Storage Consumption Analysis",
    caption = cap
  ) +
  theme(
    axis.title.x = element_text(family = "BC Sans", hjust = 0, size = 8),
    # scalar hjust: element_text() does not officially support vector input
    plot.title = element_text(family = "BC Sans", face = "bold", hjust = 0, size = 14),
    plot.subtitle = element_text(family = "BC Sans", size = 12),
    plot.caption = element_text(family = "BC Sans", color = "#606060", size = 10),
    plot.margin = margin(0.5, 1.8, 0.5, 0.5, "cm")
  )
# Export exactly one page. Previously the chart was auto-printed and then
# print()ed again while the PDF device was open, producing a duplicate page.
pdf(here('figure', glue('total_nrm_hdrive_catplot_', quarter, '_', fiscal, '.pdf')))
tryCatch(
  print(total_chart),
  # always close the device, even if drawing fails
  finally = dev.off()
)
```
Loading

0 comments on commit 70d74e3

Please sign in to comment.