Skip to content

Commit

Permalink
adding some functionalities and fennica
Browse files Browse the repository at this point in the history
  • Loading branch information
ake123 committed Oct 11, 2024
1 parent 16e417a commit a8d721c
Show file tree
Hide file tree
Showing 15 changed files with 474 additions and 75 deletions.
8 changes: 6 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,16 @@ Imports:
ggplot2,
purrr,
tibble,
curl
curl,
rlang,
stringr,
tidyr,
wordcloud2
Suggests:
testthat (>= 3.0.0),
rmarkdown,
knitr
URL: https://github.com/rOpenGov/finna
URL: https://ropengov.github.io/finna/
BugReports: https://github.com/rOpenGov/finna/issues
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.3.2
Expand Down
9 changes: 9 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,21 @@ export(refine_metadata)
export(search_finna)
export(visualize_author_distribution)
export(visualize_format_distribution)
export(visualize_format_distribution_pie)
export(visualize_format_library_correlation)
export(visualize_library_distribution)
export(visualize_subject_distribution)
export(visualize_title_year_heatmap)
export(visualize_top_20_titles)
export(visualize_word_cloud)
export(visualize_year_distribution)
export(visualize_year_distribution_line)
import(dplyr)
import(ggplot2)
import(rlang)
import(stringr)
import(tidyr)
import(wordcloud2)
importFrom(curl,curl_download)
importFrom(httr,GET)
importFrom(httr,content)
Expand Down
20 changes: 19 additions & 1 deletion R/search_finna.R
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,9 @@ search_finna <- function(query = NULL,#lookfor
page <- 1
records_per_page <- 100 # Fetch 100 records per page for efficiency

# Initialize resultCount
result_count <- 0

while (total_fetched < limit) {
# Calculate the remaining number of records to fetch
remaining_to_fetch <- min(records_per_page, limit - total_fetched)
Expand All @@ -66,7 +69,7 @@ search_finna <- function(query = NULL,#lookfor
lookfor = query,
type = type,
`field[]` = fields,
`filter[]` = filters,
# `filter[]` = filters,
`facet[]` = facets,
`facetFilter[]` = facetFilters,
sort = sort,
Expand All @@ -76,6 +79,13 @@ search_finna <- function(query = NULL,#lookfor
prettyPrint = prettyPrint
)

# Add filters to the query parameters (handle each filter as filter[])
if (!is.null(filters)) {
for (i in seq_along(filters)) {
query_params[[paste0('filter[', i, ']')]] <- filters[i]
}
}

# Execute the GET request and handle potential errors
response <- tryCatch(
httr::GET(base_url, query = query_params),
Expand All @@ -90,6 +100,11 @@ search_finna <- function(query = NULL,#lookfor
# Parse the JSON content of the response
search_results <- httr::content(response, "parsed")

# Extract resultCount only from the first page
if (page == 1) {
result_count <- search_results$resultCount
}

# Extract and structure relevant data from the search results
records <- search_results$records
if (is.null(records) || length(records) == 0) {
Expand Down Expand Up @@ -163,5 +178,8 @@ search_finna <- function(query = NULL,#lookfor
# Attach the language attribute to the tibble
attr(tibble_results, "language") <- lng
#cat("Data retrieved from Finna API (https://www.finna.fi) - metadata licensed under CC0.\n")
#return(tibble_results)
attr(tibble_results, "result_count") <- result_count
return(tibble_results)

}
191 changes: 189 additions & 2 deletions R/visualizeFinna.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,41 @@ visualize_year_distribution <- function(year_data) {
)
}

#' Visualize Year Distribution (Line Plot)
#'
#' Counts the records per year and draws the counts as a line plot.
#' `Year` is coerced with `as.numeric()`; rows whose year becomes `NA`
#' after the coercion are dropped before counting.
#'
#' @param metadata A tibble containing refined Finna metadata, with a "Year" column.
#' @return A ggplot2 object representing the line plot.
#' @import ggplot2
#' @import dplyr
#' @export
#' @examples
#' library(finna)
#' sibelius_data <- search_finna("sibelius")
#' refined_data <- refine_metadata(sibelius_data)
#' visualize_year_distribution_line(refined_data)
visualize_year_distribution_line <- function(metadata) {
  # Keep only rows with a usable numeric year
  metadata <- metadata %>%
    mutate(Year = as.numeric(Year)) %>%
    filter(!is.na(Year))

  # One row per year with its record count in `n`. No `sort = TRUE`:
  # ordering by frequency has no effect on a line drawn along the x axis.
  year_distribution <- metadata %>%
    count(Year)

  # `linewidth` replaces `size` for line geoms (ggplot2 >= 3.4.0); passing
  # `size` to geom_line() triggers a deprecation warning on current versions.
  ggplot(year_distribution, aes(x = Year, y = n)) +
    geom_line(color = "steelblue", linewidth = 1) +
    theme_minimal() +
    labs(
      title = "Yearly Distribution of Records",
      x = "Year",
      y = "Number of Records"
    )
}

#' Visualize Top-20 Titles by Count
#'
#' Creates a bar plot showing the top-20 most frequent titles and their counts.
Expand All @@ -31,16 +66,20 @@ visualize_year_distribution <- function(year_data) {
#' @return A ggplot2 object showing the bar plot of the top-20 titles.
#' @import ggplot2
#' @import dplyr
#' @import stringr
#' @export
#' @examples
#' library(finna)
#' sibelius_data <- search_finna("sibelius")
#' refined_data <- refine_metadata(sibelius_data)
#' visualize_top_20_titles(refined_data)
visualize_top_20_titles <- function(metadata) {
# Convert titles to lowercase for case-insensitive comparison
# Clean the titles: Convert to lowercase, remove punctuation, and trim whitespace
metadata <- metadata %>%
mutate(Title = tolower(Title))
mutate(Title = tolower(Title), # Convert to lowercase
Title = stringr::str_trim(Title), # Trim leading and trailing whitespace
Title = stringr::str_squish(Title), # Remove extra spaces between words
Title = stringr::str_replace_all(Title, "[[:punct:]]", "")) # Remove punctuation

# Calculate the top-20 titles
top_titles <- metadata %>%
Expand All @@ -59,6 +98,52 @@ visualize_top_20_titles <- function(metadata) {
)
}

#' Visualize Heatmap of Titles by Year
#'
#' Creates a heatmap showing the most frequent titles and their occurrence
#' over time. Titles are lower-cased and trimmed before counting, rows with a
#' missing title or a year that is `NA` after `as.numeric()` are dropped, and
#' only the 20 titles with the highest total count are plotted.
#'
#' @param metadata A tibble containing refined Finna metadata, with "Title" and "Year" columns.
#' @return A ggplot2 object showing the heatmap of title frequency by year.
#' @import ggplot2
#' @import dplyr
#' @export
#' @examples
#' library(finna)
#' sibelius_data <- search_finna("sibelius")
#' refined_data <- refine_metadata(sibelius_data)
#' visualize_title_year_heatmap(refined_data)
visualize_title_year_heatmap <- function(metadata) {
  # Normalise titles and years, then count records per (Title, Year) pair
  title_year_counts <- metadata %>%
    mutate(
      Title = tolower(Title),
      Title = stringr::str_trim(Title),
      Year = as.numeric(Year)
    ) %>%
    filter(!is.na(Year), !is.na(Title)) %>%
    count(Title, Year)

  # Pick exactly 20 titles by total count. top_n() is superseded and keeps
  # every tie at the cutoff, which can put far more than 20 titles on the
  # y axis when many titles share the same count.
  top_titles <- title_year_counts %>%
    group_by(Title) %>%
    summarise(total = sum(n), .groups = "drop") %>%
    slice_max(total, n = 20, with_ties = FALSE) %>%
    pull(Title)

  # Restrict the counts to the selected titles
  metadata_filtered <- title_year_counts %>%
    filter(Title %in% top_titles)

  # Plot the heatmap
  ggplot(metadata_filtered, aes(x = Year, y = reorder(Title, n), fill = n)) +
    geom_tile() +
    scale_fill_gradient(low = "white", high = "steelblue") +
    theme_minimal() +
    labs(
      title = "Heatmap of Top 20 Titles by Year",
      x = "Year",
      y = "Title",
      fill = "Count"
    )
}


#' Visualize Distribution by Formats
#'
Expand Down Expand Up @@ -95,6 +180,41 @@ visualize_format_distribution <- function(metadata) {
)
}

#' Visualize Correlation Between Formats and Libraries
#'
#' Draws a heatmap of how many records exist for each combination of format
#' and library. Both columns are lower-cased before counting, and
#' combinations where either value is missing are left out. The fill colour
#' encodes the record count of each combination.
#'
#' @param metadata A tibble containing refined Finna metadata, with "Formats" and "Library" columns.
#' @return A ggplot2 object showing the heatmap of format-library counts.
#' @import ggplot2
#' @import dplyr
#' @export
#' @examples
#' library(finna)
#' sibelius_data <- search_finna("sibelius")
#' refined_data <- refine_metadata(sibelius_data)
#' visualize_format_library_correlation(refined_data)
visualize_format_library_correlation <- function(metadata) {
  # Case-fold both grouping columns so spelling variants are merged
  lowered <- mutate(
    metadata,
    Formats = tolower(Formats),
    Library = tolower(Library)
  )

  # Tally each (format, library) pair, then discard pairs with a missing side
  pair_counts <- count(lowered, Formats, Library)
  pair_counts <- filter(pair_counts, !(is.na(Formats) | is.na(Library)))

  # Plot pieces that do not depend on the data
  rotated_x_labels <- theme(axis.text.x = element_text(angle = 45, hjust = 1))
  heatmap_labels <- labs(
    title = "Correlation Between Formats and Libraries",
    x = "Format",
    y = "Library",
    fill = "Count"
  )

  ggplot(pair_counts, aes(x = Formats, y = Library, fill = n)) +
    geom_tile() +
    scale_fill_gradient(low = "white", high = "darkorange") +
    theme_minimal() +
    rotated_x_labels +
    heatmap_labels
}

#' Visualize Distribution by Libraries
#'
Expand Down Expand Up @@ -234,5 +354,72 @@ visualize_subject_distribution <- function(metadata) {
)
}

#' Visualize Word Cloud of Titles or Subjects
#'
#' Creates a word cloud showing the frequency of words in the chosen column.
#' The text is lower-cased, punctuation is removed, and the text is split
#' into words on any run of whitespace. Missing values and words of a single
#' character are dropped before counting.
#'
#' @param metadata A tibble containing refined Finna metadata, with "Title" or "Subjects" column.
#' @param column The column to visualize as a word cloud, given as a quoted
#'   string or a bare column name (e.g., "Title" or "Subjects").
#' @return A word cloud plot of the most frequent words.
#' @import wordcloud2
#' @import dplyr
#' @import tidyr
#' @import rlang
#' @export
#' @examples
#' library(finna)
#' music_data <- search_finna("music")
#' refined_data <- refine_metadata(music_data)
#' visualize_word_cloud(refined_data, "Title")
visualize_word_cloud <- function(metadata, column = "Title") {
  # Turn the supplied column name (string or bare name) into a symbol
  column <- rlang::ensym(column)

  words <- metadata %>%
    select(!!column) %>%
    # Lower-case, strip punctuation and collapse whitespace so that
    # "Sibelius," and "sibelius" are counted as the same word.
    mutate(
      !!column := stringr::str_squish(
        stringr::str_replace_all(tolower(!!column), "[[:punct:]]", "")
      )
    ) %>%
    # Split on any whitespace run, not just a single space character
    tidyr::separate_rows(!!column, sep = "\\s+") %>%
    # Remove NA and short words before counting
    filter(!is.na(!!column), nchar(!!column) > 1) %>%
    count(!!column, sort = TRUE)

  # wordcloud2 reads the word from the first column and its frequency
  # from the second
  wordcloud2::wordcloud2(data = words)
}



#' Visualize Format Distribution as Pie Chart
#'
#' Draws a pie chart in which each slice is one format and the slice size is
#' the number of records with that format. Format names are lower-cased
#' before counting.
#'
#' @param metadata A tibble containing refined Finna metadata, with a "Formats" column.
#' @return A ggplot2 object showing the pie chart of format distribution.
#' @import ggplot2
#' @import dplyr
#' @export
#' @examples
#' library(finna)
#' sibelius_data <- search_finna("sibelius")
#' refined_data <- refine_metadata(sibelius_data)
#' visualize_format_distribution_pie(refined_data)
visualize_format_distribution_pie <- function(metadata) {
  # Case-fold the formats, then tally them with the most frequent first
  lowered <- mutate(metadata, Formats = tolower(Formats))
  format_counts <- count(lowered, Formats, sort = TRUE)

  pie_labels <- labs(
    title = "Distribution of Formats",
    fill = "Formats"
  )

  # A single stacked bar of full width ...
  stacked_bar <- ggplot(format_counts, aes(x = "", y = n, fill = Formats)) +
    geom_bar(stat = "identity", width = 1)

  # ... wrapped around the y axis becomes the pie; theme_void() removes the
  # background and axes.
  stacked_bar +
    coord_polar("y") +
    theme_void() +
    pie_labels
}




52 changes: 0 additions & 52 deletions custom.css

This file was deleted.

File renamed without changes.
Loading

0 comments on commit a8d721c

Please sign in to comment.