Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update Google Custom Search (GCS) and add 2024Q4 report #148

Merged
merged 35 commits into from
Jan 9, 2025
Merged
Changes from 1 commit
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
6cfacd2
add/update processed gcs data
TimidRobot Dec 13, 2024
f05b713
add totals_by_country and totals_by_langauage
TimidRobot Dec 13, 2024
3f5a441
improve argument and error handling
TimidRobot Dec 13, 2024
525aa37
improve argument and error handling
TimidRobot Dec 13, 2024
62e6f9e
improve argument and error handling
TimidRobot Dec 13, 2024
b1a0fba
use supplied logger and improve info message
TimidRobot Dec 13, 2024
f314773
refactor to include updates to flow (--enable-save, --enable-git), hi…
TimidRobot Dec 13, 2024
0713b37
add plot_totals_by_product
TimidRobot Dec 18, 2024
ba7093f
improve Google Custom Search (GCS) fetch accuracy with quotes and lin…
TimidRobot Dec 18, 2024
ce68a8a
update data following accuracy improvements
TimidRobot Dec 19, 2024
2c0f4fa
Merge branch 'improve-gcs-accuracy' into moar-gcs
TimidRobot Dec 19, 2024
a73aa36
Refactor for clarity and to be more "pythonic"
TimidRobot Dec 19, 2024
955a29e
separate caption text and entry text
TimidRobot Dec 19, 2024
24741cf
add GCS intro and references
TimidRobot Dec 19, 2024
99bc996
rename processed data sets. add current, old, retired. rmove top 25
TimidRobot Dec 21, 2024
713b0c4
update reporting plot styles and add gcs current, old, retired
TimidRobot Dec 21, 2024
c2e9a4a
re-enable plots
TimidRobot Dec 21, 2024
2c61cf5
update processed data
TimidRobot Dec 23, 2024
00c8814
refactor GCS report
TimidRobot Dec 23, 2024
0d5fffd
Merge branch 'main' into moar-gcs
TimidRobot Dec 23, 2024
f9bf303
update processed data with more accurate gcs data
TimidRobot Dec 23, 2024
a117c6a
fix spelling mistake
TimidRobot Dec 23, 2024
711ecb6
improve naming and add plots
TimidRobot Dec 23, 2024
a734985
Merge branch 'main' into moar-gcs
TimidRobot Jan 6, 2025
5a1e162
add and fix support for specifying quarter
TimidRobot Jan 6, 2025
52e1a02
rename report to be more generic
TimidRobot Jan 6, 2025
ae0433f
add support for specifying quarter and usage section
TimidRobot Jan 6, 2025
c7a763d
rename functions for easier sorting
TimidRobot Jan 6, 2025
5ed2a7d
sort functions
TimidRobot Jan 6, 2025
ee3f5f8
move plotting code to shared library
TimidRobot Jan 6, 2025
000b7a3
remove extra space
TimidRobot Jan 6, 2025
c757504
add 2024Q4 report
TimidRobot Jan 6, 2025
0647329
Impove terms and wording
TimidRobot Jan 9, 2025
dbf9ef1
provide more context for Approved for Free Cultural Works
TimidRobot Jan 9, 2025
f8b3cca
fix data paths
TimidRobot Jan 9, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
add GCS intro and references
TimidRobot committed Dec 19, 2024
commit 24741cf72ef52b1d25513b6cf84deb2ec33d20ba
101 changes: 67 additions & 34 deletions scripts/3-report/gcs_report.py
Original file line number Diff line number Diff line change
@@ -30,6 +30,7 @@

# Constants
QUARTER = os.path.basename(PATHS["data_quarter"])
SECTION = "Google Custom Search (GCS)"


def parse_arguments():
@@ -61,34 +62,73 @@ def parse_arguments():
args = parser.parse_args()
if not args.enable_save and args.enable_git:
parser.error("--enable-git requires --enable-save")
args.logger = LOGGER
args.paths = PATHS
return args


def gcs_intro(args):
    """
    Write Google Custom Search (GCS) introduction.

    Reads the processed totals-by-product CSV, sums its "Count" column, and
    passes an "Overview" entry containing that grand total to
    shared.update_readme. No image path or caption is supplied (the two None
    arguments; their meaning is inferred from the plotting functions' calls
    to shared.update_readme -- confirm against shared).

    args: namespace returned by parse_arguments(); forwarded unchanged to
        shared.update_readme.
    """
    # Log this function's own summary (previously logged the docstring of
    # plot_totals_by_product by mistake)
    LOGGER.info(gcs_intro.__doc__.strip().splitlines()[0])
    file_path = shared.path_join(
        PATHS["data_2-process"], "gcs_totals_by_product.csv"
    )
    # Show the data file relative to the repository root
    LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
    data = pd.read_csv(file_path)
    total_count = data["Count"].sum()
    shared.update_readme(
        args,
        SECTION,
        "Overview",
        None,
        None,
        "Google Custom Search (GCS) data uses the `totalResults` returned by"
        " API for search queries of the legal tool URLs (quoted and using"
        " `linkSite` for accuracy), country codes, and language codes.\n"
        "\n"
        f"**The results show there are a total of {total_count:,d}"
        " online documents in the commons--documents that are licensed or put"
        " in the public domain using a Creative Commons (CC) legal tool.**\n"
        "\n"
        "Thank you Google for providing the Programmable Search Engine: Custom"
        " Search JSON API!\n",
    )


def plot_top_25_tools(args):
"""
Create a bar chart for the top 25 legal tools
"""
LOGGER.info(plot_totals_by_product.__doc__.strip())
file_path = shared.path_join(
PATHS["data_2-process"], "gcs_top_25_tools.csv"
)
LOGGER.info("Create a bar chart for the top 25 legal tools")
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
data = pd.read_csv(file_path)

plt.figure(figsize=(10, 10))
ax = sns.barplot(data, x="Count", y="CC legal tool")
y_column = "CC legal tool"
ax = sns.barplot(
data,
x="Count",
y=y_column,
hue=y_column,
palette="pastel",
legend=False,
)
for index, row in data.iterrows():
ax.annotate(
f"{row['Count']:,d}",
(4, index),
(4 + 80, index),
xycoords=("axes points", "data"),
color="white",
fontsize="x-small",
horizontalalignment="left",
color="black",
fontsize="small",
horizontalalignment="right",
verticalalignment="center",
)
plt.title(f"Top 25 legal tools ({args.quarter})")
plt.xlabel("Number of references")
plt.xlabel("Number of works")
plt.ylabel("Creative Commons (CC) legal tool")

# Use the millions formatter for x-axis
@@ -113,13 +153,11 @@ def millions_formatter(x, pos):
plt.savefig(image_path)

shared.update_readme(
PATHS,
image_path,
"Google Custom Search",
"Bar chart showing the top 25 legal tools based on the count of"
" search results for each legal tool's URL.",
"Top 25 legal tools",
args,
SECTION,
"Top 25 legal tools",
image_path,
"Bar chart showing the top 25 individual legal tools.",
)

LOGGER.info("Visualization by license type created.")
@@ -129,10 +167,10 @@ def plot_totals_by_product(args):
"""
Create a bar chart of the totals by product
"""
LOGGER.info(plot_totals_by_product.__doc__.strip())
file_path = shared.path_join(
PATHS["data_2-process"], "gcs_totals_by_product.csv"
)
LOGGER.info(__doc__)
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
data = pd.read_csv(file_path)

@@ -152,14 +190,14 @@ def plot_totals_by_product(args):
(0 + 80, index),
xycoords=("axes points", "data"),
color="black",
fontsize="x-small",
fontsize="small",
horizontalalignment="right",
verticalalignment="center",
)
plt.title(f"Totals by product ({args.quarter})")
plt.ylabel("Creative Commons (CC) legal tool product")
plt.xscale("log")
plt.xlabel("Number of references")
plt.xlabel("Number of works")

# Use the millions formatter for x-axis
def millions_formatter(x, pos):
@@ -185,15 +223,12 @@ def millions_formatter(x, pos):
plt.savefig(image_path)

shared.update_readme(
PATHS,
args,
SECTION,
"Totals by product",
image_path,
"Google Custom Search",
"Bar chart showing how many documents there are for each Creative"
" Commons (CC) legal tool. **There are a total of"
f" {data['Count'].sum():,d} documents that are either CC licensed"
" or put in the public domain using a CC legal tool.**",
"Totals by product",
args,
" Commons (CC) legal tool product.",
)

LOGGER.info("Visualization by license type created.")
@@ -234,7 +269,7 @@ def millions_formatter(x, pos):
# plt.xticks(rotation=45)
#
# # Add value numbers to the top of each bar
# for p in ax.patches:
# for p in ax.patches:
# ax.annotate(
# format(p.get_height(), ",.0f"),
# (p.get_x() + p.get_width() / 2.0, p.get_height()),
@@ -265,12 +300,11 @@ def millions_formatter(x, pos):
# plt.show()
#
# shared.update_readme(
# PATHS,
# args,
# SECTION,
# "Country Report",
# image_path,
# "Google Custom Search",
# "Number of Google Webpages Licensed by Country",
# "Country Report",
# args,
# )
#
# LOGGER.info("Visualization by country created.")
@@ -343,25 +377,24 @@ def millions_formatter(x, pos):
# plt.show()
#
# shared.update_readme(
# PATHS,
# args,
# SECTION,
# "Language Report",
# image_path,
# "Google Custom Search",
# "Number of Google Webpages Licensed by Language",
# "Language Report",
# args,
# )
#
# LOGGER.info("Visualization by language created.")


def main():
args = parse_arguments()
args.logger = LOGGER
shared.log_paths(LOGGER, PATHS)
shared.git_fetch_and_merge(args, PATHS["repo"])

plot_top_25_tools(args)
gcs_intro(args)
plot_totals_by_product(args)
plot_top_25_tools(args)
# plot_by_country(data, args)
# plot_by_language(data, args)

135 changes: 135 additions & 0 deletions scripts/3-report/references.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
#!/usr/bin/env python
"""
Add project references.
"""
# Standard library
import argparse
import os
import sys
import textwrap
import traceback

# Third-party
from pygments import highlight
from pygments.formatters import TerminalFormatter
from pygments.lexers import PythonTracebackLexer

# Add parent directory so shared can be imported
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))

# First-party/Local
import shared # noqa: E402

# Setup
# shared.setup(__file__) supplies the logger and the paths mapping that the
# rest of this script reads from
LOGGER, PATHS = shared.setup(__file__)

# Constants
# Final path component of PATHS["data_quarter"]; used as the default for
# --quarter and in the git commit message
QUARTER = os.path.basename(PATHS["data_quarter"])
# Section name passed to shared.update_readme for every entry written here
SECTION = "References"


def parse_arguments():
    """
    Build the command-line parser, parse the arguments, and return them.

    The returned namespace carries the parsed options (quarter, show_plots,
    enable_save, enable_git) plus two attached attributes: logger (LOGGER)
    and paths (PATHS). The parser exits with an error if --enable-git is
    given without --enable-save.
    """
    LOGGER.info("Parsing command-line arguments")
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--quarter",
        default=QUARTER,
        help="Data quarter in format YYYYQx, e.g., 2024Q2",
    )

    # The remaining options are plain on/off switches
    switches = (
        ("--show-plots", "Show generated plots (in addition to saving them)"),
        ("--enable-save", "Enable saving results"),
        (
            "--enable-git",
            "Enable git actions (fetch, merge, add, commit, and push)",
        ),
    )
    for flag, description in switches:
        parser.add_argument(flag, action="store_true", help=description)

    args = parser.parse_args()
    if args.enable_git and not args.enable_save:
        parser.error("--enable-git requires --enable-save")

    # Attach shared objects so helpers can reach them through args
    args.logger = LOGGER
    args.paths = PATHS
    return args


def data_locations(args):
    """
    Write the "Data locations" entry of the References section.

    Passes shared.update_readme a text-only entry (the image path and caption
    arguments are None) that names the source repository and links to the
    fetched, processed, and report data directories.

    args: namespace returned by parse_arguments(); forwarded unchanged to
        shared.update_readme.
    """
    shared.update_readme(
        args,
        SECTION,
        "Data locations",
        None,
        None,
        "This report was generated as part of:\n"
        "\n"
        "**[creativecommons/quantifying][repo]:** *quantify the size and"
        " diversity of the commons--the collection of works that are openly"
        " licensed or in the public domain*\n"
        "\nThe data used to generate this report is available in that"
        " repository at the following locations:\n"
        "\n"
        " | Resource | Location |\n"
        " | --------------- | -------- |\n"
        " | Fetched data: | [`../1-fetch/`](../1-fetch) |\n"
        " | Processed data: | [`../2-process/`](../2-process) |\n"
        # Report phase is numbered 3 (1-fetch, 2-process, 3-report)
        " | Report data: | [`../3-report/`](../3-report) |\n"
        "\n"
        "[repo]: https://github.com/creativecommons/quantifying\n",
    )


def main():
    """
    Run the References report from start to finish.

    Parses arguments, logs the paths, calls shared.git_fetch_and_merge,
    writes the data locations entry, then calls shared.git_add_and_commit
    and shared.git_push_changes.
    """
    args = parse_arguments()
    shared.log_paths(LOGGER, PATHS)
    repo_path = PATHS["repo"]
    shared.git_fetch_and_merge(args, repo_path)

    data_locations(args)

    # git_add_and_commit returns the args namespace, which is then handed to
    # git_push_changes
    commit_message = f"Add and commit References for {QUARTER}"
    args = shared.git_add_and_commit(
        args, repo_path, PATHS["data_quarter"], commit_message
    )
    shared.git_push_changes(args, repo_path)


if __name__ == "__main__":
    # Script entry point: run main() and map each failure mode to a log
    # message and a process exit code
    try:
        main()
    except shared.QuantifyingException as e:
        # Exit code 0 is logged at info level; anything else is an error
        if e.exit_code != 0:
            LOGGER.error(e.message)
        else:
            LOGGER.info(e.message)
        sys.exit(e.exit_code)
    except SystemExit as e:
        # Re-raise the exit with the same code, logging only non-zero codes
        if e.code != 0:
            LOGGER.error(f"System exit with code: {e.code}")
        sys.exit(e.code)
    except KeyboardInterrupt:
        LOGGER.info("(130) Halted via KeyboardInterrupt.")
        sys.exit(130)
    except Exception:
        # Highlight the traceback for terminal output, then indent it so it
        # sits under the log message
        highlighted = highlight(
            traceback.format_exc(),
            PythonTracebackLexer(),
            TerminalFormatter(),
        )
        traceback_formatted = textwrap.indent(highlighted, " ")
        LOGGER.critical(f"(1) Unhandled exception:\n{traceback_formatted}")
        sys.exit(1)