add GCS intro and references

creativecommons · TimidRobot · Jan 9, 2025 · Dec 13, 2024 · Dec 13, 2024 · Dec 13, 2024
commit 24741cf72ef52b1d25513b6cf84deb2ec33d20ba
@@ -30,6 +30,7 @@
 
 # Constants
 QUARTER = os.path.basename(PATHS["data_quarter"])
+SECTION = "Google Custom Search (GCS)"
 
 
 def parse_arguments():
@@ -61,34 +62,73 @@ def parse_arguments():
     args = parser.parse_args()
     if not args.enable_save and args.enable_git:
         parser.error("--enable-git requires --enable-save")
+    args.logger = LOGGER
+    args.paths = PATHS
     return args
 
 
+def gcs_intro(args):
+    """
+    Write Google Custom Search (GCS) introduction.
+    """
+    LOGGER.info(gcs_intro.__doc__.strip())  # log this function's own summary
+    file_path = shared.path_join(
+        PATHS["data_2-process"], "gcs_totals_by_product.csv"
+    )
+    LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
+    data = pd.read_csv(file_path)  # only the "Count" column is used below
+    shared.update_readme(
+        args,
+        SECTION,
+        "Overview",
+        None,  # image path slot (cf. plot_* calls): the overview has no chart
+        None,  # image description slot (cf. plot_* calls): likewise unused
+        "Google Custom Search (GCS) data uses the `totalResults` returned by"
+        " API for search queries of the legal tool URLs (quoted and using"
+        " `linkSite` for accuracy), country codes, and language codes.\n"
+        "\n"
+        f"**The results show there are a total of {data['Count'].sum():,d}"
+        " online documents in the commons--documents that are licensed or put"
+        " in the public domain using a Creative Commons (CC) legal tool.**\n"
+        "\n"
+        "Thank you Google for providing the Programmable Search Engine: Custom"
+        " Search JSON API!\n",
+    )
+
+
 def plot_top_25_tools(args):
     """
     Create a bar chart for the top 25 legal tools
     """
+    LOGGER.info(plot_top_25_tools.__doc__.strip())
     file_path = shared.path_join(
         PATHS["data_2-process"], "gcs_top_25_tools.csv"
     )
-    LOGGER.info("Create a bar chart for the top 25 legal tools")
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     data = pd.read_csv(file_path)
 
     plt.figure(figsize=(10, 10))
-    ax = sns.barplot(data, x="Count", y="CC legal tool")
+    y_column = "CC legal tool"
+    ax = sns.barplot(
+        data,
+        x="Count",
+        y=y_column,
+        hue=y_column,  # hue mirrors y so the palette colors each bar
+        palette="pastel",
+        legend=False,  # hue duplicates the y labels; a legend adds nothing
+    )
     for index, row in data.iterrows():
         ax.annotate(
             f"{row['Count']:,d}",
-            (4, index),
+            (4 + 80, index),  # x in axes points, y in data coords (bar row)
             xycoords=("axes points", "data"),
-            color="white",
-            fontsize="x-small",
-            horizontalalignment="left",
+            color="black",
+            fontsize="small",
+            horizontalalignment="right",
             verticalalignment="center",
         )
     plt.title(f"Top 25 legal tools ({args.quarter})")
-    plt.xlabel("Number of references")
+    plt.xlabel("Number of works")
     plt.ylabel("Creative Commons (CC) legal tool")
 
     # Use the millions formatter for x-axis
@@ -113,13 +153,11 @@ def millions_formatter(x, pos):
         plt.savefig(image_path)
 
     shared.update_readme(
-        PATHS,
-        image_path,
-        "Google Custom Search",
-        "Bar chart showing the top 25 legal tools based on the count of"
-        " search results for each legal tool's URL.",
-        "Top 25 legal tools",
         args,
+        SECTION,
+        "Top 25 legal tools",
+        image_path,
+        "Bar chart showing the top 25 individual legal tools.",
     )
 
     LOGGER.info("Visualization by license type created.")
@@ -129,10 +167,10 @@ def plot_totals_by_product(args):
     """
     Create a bar chart of the totals by product
     """
+    LOGGER.info(plot_totals_by_product.__doc__.strip())
     file_path = shared.path_join(
         PATHS["data_2-process"], "gcs_totals_by_product.csv"
     )
-    LOGGER.info(__doc__)
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     data = pd.read_csv(file_path)
 
@@ -152,14 +190,14 @@ def plot_totals_by_product(args):
             (0 + 80, index),
             xycoords=("axes points", "data"),
             color="black",
-            fontsize="x-small",
+            fontsize="small",
             horizontalalignment="right",
             verticalalignment="center",
         )
     plt.title(f"Totals by product ({args.quarter})")
     plt.ylabel("Creative Commons (CC) legal tool product")
     plt.xscale("log")
-    plt.xlabel("Number of references")
+    plt.xlabel("Number of works")
 
     # Use the millions formatter for x-axis
     def millions_formatter(x, pos):
@@ -185,15 +223,12 @@ def millions_formatter(x, pos):
         plt.savefig(image_path)
 
     shared.update_readme(
-        PATHS,
+        args,
+        SECTION,
+        "Totals by product",
         image_path,
-        "Google Custom Search",
         "Bar chart showing how many documents there are for each Creative"
-        " Commons (CC) legal tool. **There are a total of"
-        f" {data['Count'].sum():,d} documents that are either CC licensed"
-        " or put in the public domain using a CC legal tool.**",
-        "Totals by product",
-        args,
+        " Commons (CC) legal tool product.",
     )
 
     LOGGER.info("Visualization by license type created.")
@@ -234,7 +269,7 @@ def millions_formatter(x, pos):
 #    plt.xticks(rotation=45)
 #
 #    # Add value numbers to the top of each bar
-#    for p in ax.patches:
+#    for p in ax.patches:
 #        ax.annotate(
 #            format(p.get_height(), ",.0f"),
 #            (p.get_x() + p.get_width() / 2.0, p.get_height()),
@@ -265,12 +300,11 @@ def millions_formatter(x, pos):
 #        plt.show()
 #
 #    shared.update_readme(
-#        PATHS,
+#        args,
+#        SECTION,
+#        "Country Report",
 #        image_path,
-#        "Google Custom Search",
 #        "Number of Google Webpages Licensed by Country",
-#        "Country Report",
-#        args,
 #    )
 #
 #    LOGGER.info("Visualization by country created.")
@@ -343,25 +377,24 @@ def millions_formatter(x, pos):
 #        plt.show()
 #
 #    shared.update_readme(
-#        PATHS,
+#        args,
+#        SECTION,
+#        "Language Report",
 #        image_path,
-#        "Google Custom Search",
 #        "Number of Google Webpages Licensed by Language",
-#        "Language Report",
-#        args,
 #    )
 #
 #    LOGGER.info("Visualization by language created.")
 
 
 def main():
     args = parse_arguments()
-    args.logger = LOGGER
     shared.log_paths(LOGGER, PATHS)
     shared.git_fetch_and_merge(args, PATHS["repo"])
 
-    plot_top_25_tools(args)
+    gcs_intro(args)
     plot_totals_by_product(args)
+    plot_top_25_tools(args)
     # plot_by_country(data, args)
     # plot_by_language(data, args)
 

diff --git a/scripts/3-report/references.py b/scripts/3-report/references.py
@@ -0,0 +1,135 @@
+#!/usr/bin/env python
+"""
+Add project references.
+"""
+# Standard library
+import argparse
+import os
+import sys
+import textwrap
+import traceback
+
+# Third-party
+from pygments import highlight
+from pygments.formatters import TerminalFormatter
+from pygments.lexers import PythonTracebackLexer
+
+# Add parent directory so shared can be imported
+sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
+
+# First-party/Local
+import shared  # noqa: E402
+
+# Setup: shared.setup() supplies this script's logger and paths mapping
+LOGGER, PATHS = shared.setup(__file__)
+
+# Constants
+QUARTER = os.path.basename(PATHS["data_quarter"])  # --quarter default
+SECTION = "References"  # section argument passed to shared.update_readme
+
+
+def parse_arguments():
+    """
+    Build the argument parser, parse the command line, and return the
+    resulting namespace with LOGGER and PATHS attached to it.
+    """
+    LOGGER.info("Parsing command-line arguments")
+    cli = argparse.ArgumentParser(description=__doc__)
+    # --quarter defaults to QUARTER (basename of PATHS["data_quarter"])
+    cli.add_argument(
+        "--quarter",
+        help="Data quarter in format YYYYQx, e.g., 2024Q2",
+        default=QUARTER,
+    )
+    # Boolean switches as (flag, help text), registered in listed order
+    switches = (
+        ("--show-plots", "Show generated plots (in addition to saving them)"),
+        ("--enable-save", "Enable saving results"),
+        (
+            "--enable-git",
+            "Enable git actions (fetch, merge, add, commit, and push)",
+        ),
+    )
+    for flag, help_text in switches:
+        cli.add_argument(flag, action="store_true", help=help_text)
+    options = cli.parse_args()
+    # --enable-git is only valid together with --enable-save
+    if options.enable_git and not options.enable_save:
+        cli.error("--enable-git requires --enable-save")
+    # Attach module-level logger and paths so callees can reach them via args
+    options.logger = LOGGER
+    options.paths = PATHS
+    return options
+
+
+def data_locations(args):
+    """
+    Write the "Data locations" entry of the References section.
+    """
+    shared.update_readme(
+        args,
+        SECTION,
+        "Data locations",
+        None,  # fourth argument unused here: this entry is text only
+        None,  # fifth argument unused here: this entry is text only
+        "This report was generated as part of:\n"
+        "\n"
+        "**[creativecommons/quantifying][repo]:** *quantify the size and"
+        " diversity of the commons--the collection of works that are openly"
+        " licensed or in the public domain*\n"
+        "\nThe data used to generate this report is available in that"
+        " repository at the following locations:\n"
+        "\n"
+        " | Resource        | Location |\n"
+        " | --------------- | -------- |\n"
+        " | Fetched data:   | [`../1-fetch/`](../1-fetch) |\n"
+        " | Processed data: | [`../2-process/`](../2-process) |\n"
+        " | Report data:    | [`../3-report/`](../3-report) |\n"
+        "\n"
+        "[repo]: https://github.com/creativecommons/quantifying\n",
+    )
+
+
+def main():
+    """Fetch/merge, write the References section, then add/commit and push."""
+    args = parse_arguments()
+    repo_path = PATHS["repo"]
+    shared.log_paths(LOGGER, PATHS)
+    shared.git_fetch_and_merge(args, repo_path)
+    # Write the README entry before the add/commit step picks up changes
+    data_locations(args)
+    commit_message = f"Add and commit References for {QUARTER}"
+    # git_add_and_commit's return value replaces args for the push step
+    args = shared.git_add_and_commit(
+        args, repo_path, PATHS["data_quarter"], commit_message
+    )
+    shared.git_push_changes(args, repo_path)
+
+
+if __name__ == "__main__":
+    try:
+        main()
+    except shared.QuantifyingException as err:
+        # Exit code 0 is logged at info level; any other code as an error
+        report = LOGGER.info if err.exit_code == 0 else LOGGER.error
+        report(err.message)
+        sys.exit(err.exit_code)
+    except SystemExit as err:
+        # Pass the exit through (e.g. from argparse), logging non-zero codes
+        if err.code != 0:
+            LOGGER.error(f"System exit with code: {err.code}")
+        sys.exit(err.code)
+    except KeyboardInterrupt:
+        # 130 is the conventional exit status for SIGINT (128 + 2)
+        LOGGER.info("(130) Halted via KeyboardInterrupt.")
+        sys.exit(130)
+    except Exception:
+        # Highlight the traceback for the terminal, then indent it for the log
+        colorized = highlight(
+            traceback.format_exc(),
+            PythonTracebackLexer(),
+            TerminalFormatter(),
+        )
+        indented = textwrap.indent(colorized, "    ")
+        LOGGER.critical(f"(1) Unhandled exception:\n{indented}")
+        sys.exit(1)