diff --git a/data/2024Q4/2-process/gcs_product_totals.csv b/data/2024Q4/2-process/gcs_product_totals.csv new file mode 100644 index 00000000..05a94ed3 --- /dev/null +++ b/data/2024Q4/2-process/gcs_product_totals.csv @@ -0,0 +1,8 @@ +"CC legal tool product","Count" +"Licenses version 4.0","1132000000" +"Licenses version 3.0","15289017400" +"Licenses version 2.x","18343329641" +"Licenses version 1.0","1918709000" +"CC0 1.0","30500000" +"Public Domain Mark 1.0","8180000" +"Certification 1.0 US","47000000" diff --git a/data/2024Q4/2-process/gcs_status_combined_totals.csv b/data/2024Q4/2-process/gcs_status_combined_totals.csv new file mode 100644 index 00000000..93b26144 --- /dev/null +++ b/data/2024Q4/2-process/gcs_status_combined_totals.csv @@ -0,0 +1,4 @@ +"CC legal tool","Count" +"Latest","1170680000" +"Prior","33776380800" +"Retired","1821675241" diff --git a/data/2024Q4/2-process/gcs_status_latest_totals.csv b/data/2024Q4/2-process/gcs_status_latest_totals.csv new file mode 100644 index 00000000..8b825cde --- /dev/null +++ b/data/2024Q4/2-process/gcs_status_latest_totals.csv @@ -0,0 +1,9 @@ +"CC legal tool","Count" +"CC BY 4.0","324000000" +"CC BY-NC 4.0","132000000" +"CC BY-NC-ND 4.0","132000000" +"CC BY-NC-SA 4.0","58000000" +"CC BY-ND 4.0","322000000" +"CC BY-SA 4.0","164000000" +"PDM 1.0","8180000" +"CC0 1.0","30500000" diff --git a/data/2024Q4/2-process/gcs_status_prior_totals.csv b/data/2024Q4/2-process/gcs_status_prior_totals.csv new file mode 100644 index 00000000..b453fc3d --- /dev/null +++ b/data/2024Q4/2-process/gcs_status_prior_totals.csv @@ -0,0 +1,7 @@ +"CC legal tool","Count" +"CC BY","13232393000" +"CC BY-NC","3478532000" +"CC BY-NC-ND","2239689000" +"CC BY-NC-SA","1549258800" +"CC BY-ND","6724112000" +"CC BY-SA","6552396000" diff --git a/data/2024Q4/2-process/gcs_status_retired_totals.csv b/data/2024Q4/2-process/gcs_status_retired_totals.csv new file mode 100644 index 00000000..42d21b2b --- /dev/null +++ 
b/data/2024Q4/2-process/gcs_status_retired_totals.csv @@ -0,0 +1,11 @@ +"CC legal tool","Count" +"CC DEVNATIONS","241" +"CC NC","111130000" +"CC NC-SA","42045000" +"CC ND","533000000" +"CC ND-NC","105730000" +"CC SA","240070000" +"CC NC-SAMPLING+","50000000" +"CC SAMPLING","284800000" +"CC SAMPLING+","407900000" +"CC PUBLICDOMAIN","47000000" diff --git a/data/2024Q4/2-process/gcs_top_25_tools.csv b/data/2024Q4/2-process/gcs_top_25_tools.csv deleted file mode 100644 index d5263121..00000000 --- a/data/2024Q4/2-process/gcs_top_25_tools.csv +++ /dev/null @@ -1,26 +0,0 @@ -"CC legal tool","Count" -"CC BY 3.0 AT","793000000" -"CC BY 2.0 AT","774000000" -"CC BY 2.0 BE","774000000" -"CC BY 2.0 IT","767000000" -"CC BY 3.0 IT","762000000" -"CC BY 3.0 FR","732000000" -"CC BY 3.0 NO","669000000" -"CC BY 3.0 US","661000000" -"CC BY 3.0 DE","579000000" -"CC BY 2.0 DE","553000000" -"CC BY 4.0","414000000" -"CC BY 3.0 IE","361000000" -"CC BY-SA 3.0 AT","344000000" -"CC BY-SA 2.0 BE","335000000" -"CC BY-SA 2.0 AT","335000000" -"CC BY-SA 2.0 IT","335000000" -"CC BY-SA 3.0 IT","333000000" -"CC BY 2.0 UK","331000000" -"CC BY-SA 3.0 FR","325000000" -"CC BY-SA 3.0 NO","307000000" -"CC BY-SA 3.0 US","293000000" -"CC BY-SA 3.0 DE","279000000" -"CC BY-ND 3.0 AT","279000000" -"CC BY-SA 2.0 DE","275000000" -"CC BY-ND 3.0 TH","274000000" diff --git a/data/2024Q4/2-process/gcs_totals_by_country.csv b/data/2024Q4/2-process/gcs_totals_by_country.csv new file mode 100644 index 00000000..4fae0d34 --- /dev/null +++ b/data/2024Q4/2-process/gcs_totals_by_country.csv @@ -0,0 +1,243 @@ +"Country","Count" +"Serbia and Montenegro","1189020000" +"United States","1001950000" +"Germany","12031900" +"United Kingdom","5348200" +"Brazil","1225402" +"France","776590" +"Spain","540230" +"Korea, Republic of","523913" +"Switzerland","518770" +"Canada","505720" +"Australia","501160" +"Netherlands","500610" +"Poland","467420" +"Indonesia","464766" +"Italy","337715" +"Japan","329490" +"Argentina","305668" 
+"Ireland","298550" +"Colombia","285080" +"Iran, Islamic Republic of","268490" +"Russian Federation","264007" +"India","252241" +"Belgium","236430" +"China","231860" +"Turkey","205764" +"Finland","186944" +"Sweden","176891" +"Mexico","174174" +"Denmark","171947" +"Slovenia","156250" +"Singapore","136608" +"South Africa","124099" +"Hong Kong","121683" +"Peru","98460" +"Norway","89557" +"Costa Rica","83621" +"Czech Republic","80608" +"Ecuador","78348" +"Ukraine","76967" +"Portugal","76889" +"Aruba","74692" +"Austria","73839" +"New Zealand","64452" +"Croatia (Hrvatska)","64051" +"Lithuania","52709" +"Greece","43780" +"Nicaragua","43722" +"Chile","43448" +"Malaysia","39233" +"Hungary","36620" +"Saudi Arabia","34136" +"Cuba","32785" +"Nigeria","30231" +"Romania","27824" +"Venezuela","26313" +"Israel","24304" +"Philippines","22414" +"Slovakia","21442" +"Bulgaria","18844" +"Pakistan","17099" +"Thailand","16742" +"Uzbekistan","15893" +"Luxembourg","15621" +"Taiwan, Province of China","15481" +"Iraq","15225" +"Qatar","12249" +"Moldova, Republic of","12082" +"Panama","11434" +"Iceland","11139" +"Egypt","9909" +"United Arab Emirates","9088" +"Ghana","8721" +"Somalia","8677" +"Azerbaijan","7437" +"Lebanon","7328" +"Kenya","7227" +"Armenia","6735" +"Vietnam","6483" +"El Salvador","5558" +"Bolivia","5198" +"Paraguay","5145" +"Algeria","5066" +"Nepal","4955" +"Latvia","4942" +"Cyprus","4922" +"Estonia","4781" +"Rwanda","3727" +"Virgin Islands, U.S.","3640" +"Madagascar","3566" +"Uganda","3555" +"Kazakhstan","3166" +"Sri Lanka","3143" +"Uruguay","2984" +"Congo, the Democratic Republic of the","2553" +"Bosnia and Herzegovina","2512" +"Ethiopia","2488" +"Malta","2465" +"Virgin Islands, British","2284" +"Jordan","2199" +"Tanzania, United Republic of","2130" +"Bangladesh","2017" +"Syrian Arab Republic","1926" +"Yemen","1758" +"Zimbabwe","1734" +"Libyan Arab Jamahiriya","1707" +"Macedonia, the Former Yugosalv Republic of","1699" +"Georgia","1541" +"Maldives","1426" +"Oman","1412" 
+"Saint Lucia","1392" +"Tonga","1381" +"Morocco","1202" +"Guatemala","1108" +"Dominican Republic","1071" +"Albania","1033" +"Bhutan","958" +"Tunisia","878" +"Cambodia","850" +"Namibia","816" +"Macao","682" +"Mozambique","600" +"Angola","553" +"Brunei Darussalam","544" +"Puerto Rico","456" +"Belarus","418" +"Palestinian Territory","411" +"Botswana","402" +"Malawi","388" +"Fiji","339" +"Sierra Leone","311" +"Zambia","310" +"Samoa","298" +"Burkina Faso","270" +"Faroe Islands","254" +"Afghanistan","252" +"Jamaica","249" +"Haiti","239" +"Kyrgyzstan","229" +"Bahrain","199" +"Myanmar","184" +"Trinidad and Tobago","168" +"Honduras","165" +"Tajikistan","149" +"New Caledonia","139" +"Mongolia","132" +"Liechtenstein","131" +"Mauritius","118" +"Benin","117" +"Senegal","112" +"Congo","112" +"Saint Helena","106" +"Lesotho","102" +"Saint Pierre and Miquelon","101" +"Sao Tome and Principe","97" +"Cameroon","91" +"Reunion","90" +"Holy See (Vatican City State)","75" +"Seychelles","70" +"Papua New Guinea","63" +"Cape Verde","59" +"Grenada","57" +"Chad","54" +"Kuwait","50" +"Greenland","47" +"Niger","43" +"Cote D'ivoire","42" +"Gambia","39" +"Barbados","36" +"Antarctica","36" +"Sudan","31" +"Lao People's Democratic Republic","27" +"Guyana","27" +"Belize","27" +"Monaco","26" +"Wallis and Futuna","24" +"Vanuatu","23" +"Togo","22" +"Christmas Island","21" +"Bermuda","21" +"Gibraltar","17" +"Saint Vincent and the Grenadines","16" +"San Marino","16" +"Antigua and Barbuda","15" +"Burundi","15" +"American Samoa","15" +"Andorra","14" +"Micronesia, Federated States of","13" +"Bahamas","13" +"Cayman Islands","13" +"Solomon Islands","13" +"Suriname","12" +"Norfolk Island","12" +"Mali","10" +"Guinea","9" +"Tuvalu","9" +"Eritrea","9" +"Niue","9" +"South Georgia and the South Sandwich Islands","8" +"Djibouti","8" +"Turkmenistan","7" +"Mauritania","7" +"Saint Kitts and Nevis","7" +"Falkland Islands (Malvinas)","6" +"Northern Mariana Islands","5" +"Swaziland","5" +"Nauru","5" +"Turks and Caicos 
Islands","5" +"Guinea-Bissau","5" +"Cook Islands","4" +"Equatorial Guinea","4" +"Palau","4" +"Anguilla","3" +"Liberia","3" +"Kiribati","2" +"Mayotte","1" +"Comoros","1" +"British Indian Ocean Territory","0" +"Western Sahara","0" +"Bouvet Island","0" +"Yugoslavia","0" +"United States Minor Outlying Islands","0" +"Netherlands Antilles","0" +"Tokelau","0" +"Central African Republic","0" +"Korea, Democratic People's Republic of","0" +"Heard Island and Mcdonald Islands","0" +"Guam","0" +"Guadeloupe","0" +"Pitcairn","0" +"Gabon","0" +"French Southern Territories","0" +"French Polynesia","0" +"French Guiana","0" +"France, Metropolitan","0" +"Martinique","0" +"Montserrat","0" +"European Union","0" +"East Timor","0" +"Dominica","0" +"Svalbard and Jan Mayen","0" +"Cocos (Keeling) Islands","0" +"Marshall Islands","0" diff --git a/data/2024Q4/2-process/gcs_totals_by_free_cultural.csv b/data/2024Q4/2-process/gcs_totals_by_free_cultural.csv index f787b4fa..20535c91 100644 --- a/data/2024Q4/2-process/gcs_totals_by_free_cultural.csv +++ b/data/2024Q4/2-process/gcs_totals_by_free_cultural.csv @@ -1,3 +1,3 @@ "Category","Count" -"Approved for Free Cultural Works","26263961000" -"Limited uses","18561939832" +"Approved for Free Cultural Works","21006439000" +"Limited use","15762297041" diff --git a/data/2024Q4/2-process/gcs_totals_by_language.csv b/data/2024Q4/2-process/gcs_totals_by_language.csv new file mode 100644 index 00000000..fa14e8f5 --- /dev/null +++ b/data/2024Q4/2-process/gcs_totals_by_language.csv @@ -0,0 +1,36 @@ +"Language","Count" +"English","933130000" +"Spanish","13907400" +"German","6040400" +"Portuguese","5156500" +"Indonesian","3293060" +"French","2127700" +"Italian","639900" +"Turkish","630040" +"Russian","628430" +"Polish","578500" +"Dutch","554300" +"Japanese","470320" +"Slovenian","376700" +"Chinese (Simplified)","356600" +"Korean","250660" +"Czech","236380" +"Swedish","180930" +"Serbian","162510" +"Romanian","159590" +"Croatian","155980" +"Catalan","151730" 
+"Norwegian","140300" +"Finnish","120640" +"Greek","110960" +"Hungarian","93330" +"Danish","72619" +"Arabic","70040" +"Chinese (Traditional)","69810" +"Lithuanian","61210" +"Slovak","58650" +"Latvian","46466" +"Hebrew","40703" +"Icelandic","27515" +"Bulgarian","24191" +"Estonian","20960" diff --git a/data/2024Q4/2-process/gcs_totals_by_product.csv b/data/2024Q4/2-process/gcs_totals_by_product.csv deleted file mode 100644 index 7de4eb3e..00000000 --- a/data/2024Q4/2-process/gcs_totals_by_product.csv +++ /dev/null @@ -1,8 +0,0 @@ -"CC legal tool product","Count" -"Licenses version 4.0","1018900000" -"Licenses version 3.0","20628432000" -"Licenses version 2.x","20875323832" -"Licenses version 1.0","2207545000" -"CC0 1.0","29700000" -"Public Domain Mark 1.0","13100000" -"Certification 1.0 US","52900000" diff --git a/data/2024Q4/2-process/gcs_totals_by_restrictions.csv b/data/2024Q4/2-process/gcs_totals_by_restrictions.csv index fd9c9d28..c046d4e6 100644 --- a/data/2024Q4/2-process/gcs_totals_by_restrictions.csv +++ b/data/2024Q4/2-process/gcs_totals_by_restrictions.csv @@ -1,5 +1,5 @@ "Category","Count" -"level 0","95700000" -"level 1","26168261000" -"level 2","7524932400" -"level 3","11037007432" +"level 0 - unrestricted","85680000" +"level 1 - few restrictions","20920759000" +"level 2 - some restrictions","5655765800" +"level 3 - many restrictions","10106531241" diff --git a/data/2024Q4/2-process/gcs_totals_by_unit.csv b/data/2024Q4/2-process/gcs_totals_by_unit.csv deleted file mode 100644 index ca9e6eb7..00000000 --- a/data/2024Q4/2-process/gcs_totals_by_unit.csv +++ /dev/null @@ -1,19 +0,0 @@ -"Legal Tool Unit","Count" -"by","17312085000" -"by-sa","8115676000" -"by-nd","6830620000" -"by-nc","4772599000" -"by-nc-nd","3381697000" -"by-nc-sa","2243522400" -"nd","628900000" -"sampling+","454000000" -"sampling","315200000" -"sa","286500000" -"nc","139386000" -"nd-nc","138490000" -"nc-sampling+","57300000" -"nc-sa","54225000" -"certification","52900000" -"cc0","29700000" 
-"mark","13100000" -"devnations","432" diff --git a/data/2024Q4/3-report/gcs_countries_highest_usage_latest_tools.png b/data/2024Q4/3-report/gcs_countries_highest_usage_latest_tools.png new file mode 100644 index 00000000..5ddd44ea Binary files /dev/null and b/data/2024Q4/3-report/gcs_countries_highest_usage_latest_tools.png differ diff --git a/data/2024Q4/3-report/gcs_free_culture.png b/data/2024Q4/3-report/gcs_free_culture.png new file mode 100644 index 00000000..5e70d928 Binary files /dev/null and b/data/2024Q4/3-report/gcs_free_culture.png differ diff --git a/data/2024Q4/3-report/gcs_languages_highest_usage_latest_tools.png b/data/2024Q4/3-report/gcs_languages_highest_usage_latest_tools.png new file mode 100644 index 00000000..bc714d3b Binary files /dev/null and b/data/2024Q4/3-report/gcs_languages_highest_usage_latest_tools.png differ diff --git a/data/2024Q4/3-report/gcs_product_totals.png b/data/2024Q4/3-report/gcs_product_totals.png new file mode 100644 index 00000000..c3f0c7c4 Binary files /dev/null and b/data/2024Q4/3-report/gcs_product_totals.png differ diff --git a/data/2024Q4/3-report/gcs_status_latest_tools.png b/data/2024Q4/3-report/gcs_status_latest_tools.png new file mode 100644 index 00000000..164fa414 Binary files /dev/null and b/data/2024Q4/3-report/gcs_status_latest_tools.png differ diff --git a/data/2024Q4/3-report/gcs_status_prior_tools.png b/data/2024Q4/3-report/gcs_status_prior_tools.png new file mode 100644 index 00000000..cd878d56 Binary files /dev/null and b/data/2024Q4/3-report/gcs_status_prior_tools.png differ diff --git a/data/2024Q4/3-report/gcs_status_retired_tools.png b/data/2024Q4/3-report/gcs_status_retired_tools.png new file mode 100644 index 00000000..6f4667b8 Binary files /dev/null and b/data/2024Q4/3-report/gcs_status_retired_tools.png differ diff --git a/data/2024Q4/3-report/gcs_tool_status.png b/data/2024Q4/3-report/gcs_tool_status.png new file mode 100644 index 00000000..4502fa40 Binary files /dev/null and 
b/data/2024Q4/3-report/gcs_tool_status.png differ diff --git a/data/2024Q4/3-report/github_repo_content_licenses.png b/data/2024Q4/3-report/github_repo_content_licenses.png new file mode 100644 index 00000000..dda9e91e Binary files /dev/null and b/data/2024Q4/3-report/github_repo_content_licenses.png differ diff --git a/data/2024Q4/README.md b/data/2024Q4/README.md new file mode 100644 index 00000000..b2f4c1df --- /dev/null +++ b/data/2024Q4/README.md @@ -0,0 +1,174 @@ +# Quantifying the Commons 2024Q4 + + + + +## Google Custom Search (GCS) + + + + +### Overview + +Google Custom Search (GCS) data uses the `totalResults` returned by the API for search queries of the legal tool URLs (quoted and using `linkSite` for accuracy), country codes, and language codes. + +**The results indicate there are a total of 36,768,736,041 online works in the commons--documents that are licensed or put in the public domain using a Creative Commons (CC) legal tool.** + +Thank you Google for providing the Programmable Search Engine: Custom Search JSON API! + + + + + + + +### Product totals and percentages + +![Plots showing Creative Commons (CC) legal tool product totals and percentages.](3-report/gcs_product_totals.png) + +Plots showing Creative Commons (CC) legal tool product totals and percentages. + + + + + + +### CC legal tools status + +![Plots showing Creative Commons (CC) legal tool status totals and percentages.](3-report/gcs_tool_status.png) + +Plots showing Creative Commons (CC) legal tool status totals and percentages. + + + + + + +### Latest CC legal tools + +![Plots showing latest Creative Commons (CC) legal tool totals and percentages.](3-report/gcs_status_latest_tools.png) + +Plots showing latest Creative Commons (CC) legal tool totals and percentages.
+ + + + + + +### Prior CC legal tools + +![Plots showing prior Creative Commons (CC) legal tool totals and percentages.](3-report/gcs_status_prior_tools.png) + +Plots showing prior Creative Commons (CC) legal tool totals and percentages. + +The unit names have been normalized (~~`CC BY-ND-NC`~~ => `CC BY-NC-ND`). + + + + + + +### Retired CC legal tools + +![Plots showing retired Creative Commons (CC) legal tools total and percentages.](3-report/gcs_status_retired_tools.png) + +Plots showing retired Creative Commons (CC) legal tools total and percentages. + +For more information on retired legal tools, see [Retired Legal Tools - Creative Commons](https://creativecommons.org/retiredlicenses/). + + + + + + +### Countries with highest usage of latest tools + +![Plots showing countries with the highest usage of the latest Creative Commons (CC) legal tools.](3-report/gcs_countries_highest_usage_latest_tools.png) + +Plots showing countries with the highest usage of the latest Creative Commons (CC) legal tools. + +The latest tools include Licenses version 4.0 (CC BY 4.0, CC BY-NC 4.0, CC BY-NC-ND 4.0, CC BY-NC-SA 4.0, CC BY-ND 4.0, CC BY-SA 4.0), CC0 1.0, and the Public Domain Mark (PDM 1.0). + +The complete data set indicates there are a total of 2,220,330,077 online works using a latest CC legal tool. + + + + + + +### Languages with highest usage of latest tools + +![Plots showing languages with the highest usage of the latest Creative Commons (CC) legal tools.](3-report/gcs_languages_highest_usage_latest_tools.png) + +Plots showing languages with the highest usage of the latest Creative Commons (CC) legal tools. + +The latest tools include Licenses version 4.0 (CC BY 4.0, CC BY-NC 4.0, CC BY-NC-ND 4.0, CC BY-NC-SA 4.0, CC BY-ND 4.0, CC BY-SA 4.0), CC0 1.0, and the Public Domain Mark (PDM 1.0). + +The complete data set indicates there are a total of 970,145,024 online works using a latest CC legal tool.
+ + + + + + +### Approved for Free Cultural Works + +![Plots showing Approved for Free Cultural Works legal tool usage.](3-report/gcs_free_culture.png) + +Plots showing Approved for Free Cultural Works legal tool usage. + +[Understanding Free Cultural Works - Creative Commons](https://creativecommons.org/public-domain/freeworks/): + +> Using [the Freedom Defined definition of a "Free Cultural Work"], material licensed under CC BY or BY-SA is a free cultural work. (So is anything in the worldwide public domain marked with CC0 or the Public Domain Mark.) CC’s other licenses– BY-NC, BY-ND, BY-NC-SA, and BY-NC-ND–only allow more limited uses, and material under these licenses is not considered a free cultural work. + + + + + + + + + +## Notes + + + + +### Data locations + +This report was generated as part of: + +**[creativecommons/quantifying][repo]:** *quantify the size and diversity of the commons--the collection of works that are openly licensed or in the public domain* + +The data used to generate this report is available in that repository at the following locations: + + | Resource | Location | + | --------------- | -------- | + | Fetched data: | [`1-fetch/`](1-fetch) | + | Processed data: | [`2-process/`](2-process) | + | Report data: | [`3-report/`](3-report) | + +[repo]: https://github.com/creativecommons/quantifying + + + + + + + +### Usage + +The Creative Commons (CC) icons and logos are for use under the Creative Commons Trademark Policy (see [Policies - Creative Commons][ccpolicies]). **They *aren't* licensed under a Creative Commons license** (also see [Could I use a CC license to share my logo or trademark? - Frequently Asked Questions - Creative Commons][tmfaq]). + +[![CC0 1.0 Universal (CC0 1.0) Public Domain Dedication button][cc-zero-png]][cc-zero] +Otherwise, this report (including the plot images) is dedicated to the public domain under the [CC0 1.0 Universal (CC0 1.0) Public Domain Dedication][cc-zero].
+ +[ccpolicies]: https://creativecommons.org/policies +[tmfaq]: https://creativecommons.org/faq/#could-i-use-a-cc-license-to-share-my-logo-or-trademark +[cc-zero-png]: https://licensebuttons.net/l/zero/1.0/88x31.png "CC0 1.0 Universal (CC0 1.0) Public Domain Dedication button" +[cc-zero]: https://creativecommons.org/publicdomain/zero/1.0/ "Creative Commons — CC0 1.0 Universal" + + + + + diff --git a/scripts/1-fetch/deviantart_fetch.py b/scripts/1-fetch/deviantart_fetch.py index 35195c3a..3ab92fdc 100755 --- a/scripts/1-fetch/deviantart_fetch.py +++ b/scripts/1-fetch/deviantart_fetch.py @@ -243,7 +243,7 @@ def main(): return # Log the paths being used - shared.log_paths(LOGGER, PATHS) + shared.paths_log(LOGGER, PATHS) # Create data directory for this phase os.makedirs(PATHS["data_phase"], exist_ok=True) diff --git a/scripts/1-fetch/gcs_fetch.py b/scripts/1-fetch/gcs_fetch.py index 031cfd59..6533deb9 100755 --- a/scripts/1-fetch/gcs_fetch.py +++ b/scripts/1-fetch/gcs_fetch.py @@ -82,7 +82,10 @@ def parse_arguments(): action="store_true", help="Development mode: avoid hitting API (generate fake data)", ) - return parser.parse_args() + args = parser.parse_args() + if not args.enable_save and args.enable_git: + parser.error("--enable-git requires --enable-save") + return args def get_search_service(): @@ -268,7 +271,7 @@ def query_gcs(args, service, last_completed_plan_index, plan): def main(): args = parse_arguments() - shared.log_paths(LOGGER, PATHS) + shared.paths_log(LOGGER, PATHS) service = get_search_service() shared.git_fetch_and_merge(args, PATHS["repo"]) initialize_all_data_files(args) @@ -297,7 +300,8 @@ def main(): LOGGER.error(e.message) sys.exit(e.exit_code) except SystemExit as e: - LOGGER.error(f"System exit with code: {e.code}") + if e.code != 0: + LOGGER.error(f"System exit with code: {e.code}") sys.exit(e.code) except KeyboardInterrupt: LOGGER.info("(130) Halted via KeyboardInterrupt.") diff --git a/scripts/1-fetch/github_fetch.py 
b/scripts/1-fetch/github_fetch.py index bd5c9737..fc1412e6 100755 --- a/scripts/1-fetch/github_fetch.py +++ b/scripts/1-fetch/github_fetch.py @@ -71,7 +71,10 @@ def parse_arguments(): action="store_true", help="Enable git actions (fetch, merge, add, commit, and push)", ) - return parser.parse_args() + args = parser.parse_args() + if not args.enable_save and args.enable_git: + parser.error("--enable-git requires --enable-save") + return args def check_for_completion(): @@ -166,7 +169,7 @@ def query_github(args, session): def main(): args = parse_arguments() - shared.log_paths(LOGGER, PATHS) + shared.paths_log(LOGGER, PATHS) check_for_completion() session = get_requests_session() tool_data = query_github(args, session) @@ -190,7 +193,8 @@ def main(): LOGGER.error(e.message) sys.exit(e.exit_code) except SystemExit as e: - LOGGER.error(f"System exit with code: {e.code}") + if e.code != 0: + LOGGER.error(f"System exit with code: {e.code}") sys.exit(e.code) except KeyboardInterrupt: LOGGER.info("(130) Halted via KeyboardInterrupt.") diff --git a/scripts/1-fetch/internetarchive_fetch.py b/scripts/1-fetch/internetarchive_fetch.py index ca6d340c..92b3e591 100755 --- a/scripts/1-fetch/internetarchive_fetch.py +++ b/scripts/1-fetch/internetarchive_fetch.py @@ -202,7 +202,7 @@ def main(): return # Log the paths being used - shared.log_paths(LOGGER, PATHS) + shared.paths_log(LOGGER, PATHS) # Create data directory for this phase os.makedirs(PATHS["data_phase"], exist_ok=True) diff --git a/scripts/1-fetch/metmuseum_fetch.py b/scripts/1-fetch/metmuseum_fetch.py index 80e25d58..6c378c35 100755 --- a/scripts/1-fetch/metmuseum_fetch.py +++ b/scripts/1-fetch/metmuseum_fetch.py @@ -171,7 +171,7 @@ def main(): return # Log the paths being used - shared.log_paths(LOGGER, PATHS) + shared.paths_log(LOGGER, PATHS) # Create data directory for this phase os.makedirs(PATHS["data_phase"], exist_ok=True) diff --git a/scripts/1-fetch/vimeo_fetch.py b/scripts/1-fetch/vimeo_fetch.py index 
124655ff..d307e62a 100755 --- a/scripts/1-fetch/vimeo_fetch.py +++ b/scripts/1-fetch/vimeo_fetch.py @@ -213,7 +213,7 @@ def main(): return # Log the paths being used - shared.log_paths(LOGGER, PATHS) + shared.paths_log(LOGGER, PATHS) # Create data directory for this phase os.makedirs(PATHS["data_phase"], exist_ok=True) diff --git a/scripts/1-fetch/wikicommons_fetch.py b/scripts/1-fetch/wikicommons_fetch.py index 8467d8b6..ed5ea026 100755 --- a/scripts/1-fetch/wikicommons_fetch.py +++ b/scripts/1-fetch/wikicommons_fetch.py @@ -254,7 +254,7 @@ def main(): return # Log the paths being used - shared.log_paths(LOGGER, PATHS) + shared.paths_log(LOGGER, PATHS) # Create data directory for this phase os.makedirs(PATHS["data_phase"], exist_ok=True) diff --git a/scripts/1-fetch/wikipedia_fetch.py b/scripts/1-fetch/wikipedia_fetch.py index 5a32d5af..57a04663 100755 --- a/scripts/1-fetch/wikipedia_fetch.py +++ b/scripts/1-fetch/wikipedia_fetch.py @@ -198,7 +198,7 @@ def main(): return # Log the paths being used - shared.log_paths(LOGGER, PATHS) + shared.paths_log(LOGGER, PATHS) # Create data directory for this phase os.makedirs(PATHS["data_phase"], exist_ok=True) diff --git a/scripts/1-fetch/youtube_fetch.py b/scripts/1-fetch/youtube_fetch.py index b75660d3..7689ab9a 100755 --- a/scripts/1-fetch/youtube_fetch.py +++ b/scripts/1-fetch/youtube_fetch.py @@ -236,7 +236,7 @@ def main(): return # Log the paths being used - shared.log_paths(LOGGER, PATHS) + shared.paths_log(LOGGER, PATHS) # Create data directory for this phase os.makedirs(PATHS["data_phase"], exist_ok=True) diff --git a/scripts/2-process/gcs_process.py b/scripts/2-process/gcs_process.py index 9396a995..c5d354b7 100755 --- a/scripts/2-process/gcs_process.py +++ b/scripts/2-process/gcs_process.py @@ -1,7 +1,6 @@ #!/usr/bin/env python """ -This file is dedicated to processing Google Custom Search data -for analysis and comparison between quarters. +Process Google Custom Search (GCS) data. 
""" # Standard library import argparse @@ -27,13 +26,6 @@ LOGGER, PATHS = shared.setup(__file__) # Constants -FILE1_COUNT = shared.path_join(PATHS["data_1-fetch"], "gcs_1_count.csv") -FILE2_LANGUAGE = shared.path_join( - PATHS["data_1-fetch"], "gcs_2_count_by_language.csv" -) -FILE3_COUNTRY = shared.path_join( - PATHS["data_1-fetch"], "gcs_3_count_by_country.csv" -) QUARTER = os.path.basename(PATHS["data_quarter"]) @@ -43,155 +35,31 @@ def parse_arguments(): """ LOGGER.info("Parsing command-line options") parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--quarter", + default=QUARTER, + help=f"Data quarter in format YYYYQx (default: {QUARTER})", + ) parser.add_argument( "--enable-save", action="store_true", - help="Enable saving results", + help="Enable saving results (default: False)", ) parser.add_argument( "--enable-git", action="store_true", - help="Enable git actions (fetch, merge, add, commit, and push)", + help="Enable git actions such as fetch, merge, add, commit, and push" + " (default: False)", ) - return parser.parse_args() - - -# def load_quarter_data(quarter): -# """ -# Load data for a specific quarter. -# """ -# file_path = os.path.join(PATHS["data"], f"{quarter}", -# "1-fetch", "gcs_fetched.csv") -# if not os.path.exists(file_path): -# LOGGER.error(f"Data file for quarter {quarter} not found.") -# return None -# return pd.read_csv(file_path) - - -# def compare_data(current_quarter, previous_quarter): -# """ -# Compare data between two quarters. 
-# """ -# current_data = load_quarter_data(current_quarter) -# previous_data = load_quarter_data(previous_quarter) - -# if current_data is None or previous_data is None: -# return - -# # Process the data to compare by country -# compare_by_country(current_data, previous_data, -# current_quarter, previous_quarter) - -# # Process the data to compare by license -# compare_by_license(current_data, previous_data, -# current_quarter, previous_quarter) - -# # Process the data to compare by language -# compare_by_language(current_data, previous_data, -# current_quarter, previous_quarter) - - -# def compare_by_country(current_data, previous_data, -# current_quarter, previous_quarter): -# """ -# Compare the number of webpages licensed by country between two quarters. -# """ -# LOGGER.info(f"Comparing data by country between -# {current_quarter} and {previous_quarter}.") - -# # Get the list of country columns dynamically -# columns = [col.strip() for col in current_data.columns.tolist()] -# start_index = columns.index("United States") -# end_index = columns.index("Japan") + 1 - -# countries = columns[start_index:end_index] - -# current_country_data = current_data[countries].sum() -# previous_country_data = previous_data[countries].sum() - -# comparison = pd.DataFrame({ -# 'Country': countries, -# f'{current_quarter}': current_country_data.values, -# f'{previous_quarter}': previous_country_data.values, -# 'Difference': current_country_data.values -# - previous_country_data.values -# }) - -# LOGGER.info(f"Country comparison:\n{comparison}") - -# # Visualization code to be added here - - -# def compare_by_license(current_data, previous_data, -# current_quarter, previous_quarter): -# """ -# Compare the number of webpages licensed by license type -# between two quarters. 
-# """ -# LOGGER.info(f"Comparing data by license type -# between {current_quarter} and {previous_quarter}.") - -# current_license_data = -# current_data.groupby('LICENSE TYPE').sum().sum(axis=1) -# previous_license_data = -# previous_data.groupby('LICENSE TYPE').sum().sum(axis=1) - -# comparison = pd.DataFrame({ -# 'License Type': current_license_data.index, -# f'{current_quarter}': current_license_data.values, -# f'{previous_quarter}': previous_license_data.values, -# 'Difference': current_license_data.values -# - previous_license_data.values -# }) - -# LOGGER.info(f"License type comparison:\n{comparison}") - -# # Visualization code to be added here - - -# def compare_by_language(current_data, previous_data, -# current_quarter, previous_quarter): -# """ -# Compare the number of webpages licensed by language between two quarters. -# """ -# LOGGER.info(f"Comparing data by language between -# {current_quarter} and {previous_quarter}.") - -# # Get the list of language columns dynamically -# columns = [col.strip() for col in current_data.columns.tolist()] -# start_index = columns.index("English") -# languages = columns[start_index:] - -# current_language_data = current_data[languages].sum() -# previous_language_data = previous_data[languages].sum() - -# comparison = pd.DataFrame({ -# 'Language': languages, -# f'{current_quarter}': current_language_data.values, -# f'{previous_quarter}': previous_language_data.values, -# 'Difference': current_language_data.values -# - previous_language_data.values -# }) - -# LOGGER.info(f"Language comparison:\n{comparison}") - - -# def parse_arguments(): -# """ -# Parses command-line arguments, returns parsed arguments. 
-# """ -# LOGGER.info("Parsing command-line arguments") -# parser = argparse.ArgumentParser( -# description="Google Custom Search Comparison Report") -# parser.add_argument( -# "--current_quarter", type=str, required=True, -# help="Current quarter for comparison (e.g., 2024Q3)" -# ) -# parser.add_argument( -# "--previous_quarter", type=str, required=True, -# help="Previous quarter for comparison (e.g., 2024Q2)" -# ) -# return parser.parse_args() + args = parser.parse_args() + if not args.enable_save and args.enable_git: + parser.error("--enable-git requires --enable-save") + if args.quarter != QUARTER: + global PATHS + PATHS = shared.paths_update(LOGGER, PATHS, QUARTER, args.quarter) + args.logger = LOGGER + args.paths = PATHS + return args def data_to_csv(args, data, file_path): @@ -204,21 +72,11 @@ def data_to_csv(args, data, file_path): ) -def process_top_25_tools(args, count_data): - LOGGER.info("Processing top 25 tools") - data = count_data.sort_values("COUNT", ascending=False) - data.reset_index(drop=True, inplace=True) - data = data.iloc[:25] - data.rename( - columns={"TOOL_IDENTIFIER": "CC legal tool", "COUNT": "Count"}, - inplace=True, - ) - file_path = shared.path_join(PATHS["data_phase"], "gcs_top_25_tools.csv") - data_to_csv(args, data, file_path) - - -def process_totals_by_product(args, count_data): - LOGGER.info("Processing totals by product") +def process_product_totals(args, count_data): + """ + Processing count data: totals by product + """ + LOGGER.info(process_product_totals.__doc__.strip()) data = { "Licenses version 4.0": 0, "Licenses version 3.0": 0, @@ -252,46 +110,100 @@ def process_totals_by_product(args, count_data): data = pd.DataFrame( data.items(), columns=["CC legal tool product", "Count"] ) - file_path = shared.path_join( - PATHS["data_phase"], "gcs_totals_by_product.csv" - ) + file_path = shared.path_join(PATHS["data_phase"], "gcs_product_totals.csv") data_to_csv(args, data, file_path) -def process_totals_by_unit(args, count_data): - 
LOGGER.info("Processing totals by unit") - data = {} +def process_latest_prior_retired_totals(args, count_data): + """ + Process count data: totals by unit in three categories: latest, prior, + and retired + """ + LOGGER.info(process_latest_prior_retired_totals.__doc__.strip()) + # https://creativecommons.org/retiredlicenses/ + retired = [ + # DevNations, + "CC DEVNATIONS ", + # NoDerivs + "CC ND ", + # NoDerivs-NonCommercial + "CC ND-NC ", + # NonCommercial + "CC NC ", + # NonCommercial-Sampling+ + "CC NC-SAMPLING+", + # NonCommercial-ShareAlike + "CC NC-SA ", + # Public Domain Dedication and Certification + "CC PUBLICDOMAIN", + # Sampling + "CC SAMPLING ", + # Sampling+ + "CC SAMPLING+ ", + # ShareAlike + "CC SA ", + ] + data = {"latest": {}, "prior": {}, "retired": {}} + status = {"Latest": 0, "Prior": 0, "Retired": 0} for row in count_data.itertuples(index=False): tool = row[0] count = row[1] - if tool.startswith("PDM"): - key = "mark" - elif "CC0" in tool: - key = "cc0" - elif "PUBLICDOMAIN" in tool: - key = "certification" - else: - parts = tool.split() - key = parts[1].lower() - if key == "by-nd-nc": - key = "by-nc-nd" - if key not in data.keys(): - data[key] = count + tool_begin = False + for version in ["1.0", "2.0", "2.1", "2.5", "3.0", "4.0"]: + if version in tool: + separator = tool.index(version) + # everything before version (including space) + tool_begin = tool[:separator] + if not tool_begin: + tool_begin = tool + # Latest + if ( + ("BY" in tool and "4.0" in tool) + or tool.startswith("CC0") + or tool.startswith("PDM") + ): + try: + data["latest"][tool] += count + except KeyError: + data["latest"][tool] = count + status["Latest"] += count + # Prior + elif "BY" in tool and tool_begin not in retired: + if "ND-NC" in tool_begin: + tool_begin = tool_begin.replace("ND-NC", "NC-ND") + try: + data["prior"][tool_begin.strip()] += count + except KeyError: + data["prior"][tool_begin.strip()] = count + status["Prior"] += count + # Retired else: - data[key] += 
count - - data = pd.DataFrame(data.items(), columns=["Legal Tool Unit", "Count"]) - data.sort_values("Count", ascending=False, inplace=True) - data.reset_index(drop=True, inplace=True) - file_path = shared.path_join(PATHS["data_phase"], "gcs_totals_by_unit.csv") - data_to_csv(args, data, file_path) + try: + data["retired"][tool_begin.strip()] += count + except KeyError: + data["retired"][tool_begin.strip()] = count + status["Retired"] += count + data["combined"] = status + + for key, value_data in data.items(): + dataframe = pd.DataFrame( + value_data.items(), columns=["CC legal tool", "Count"] + ) + file_path = shared.path_join( + PATHS["data_phase"], f"gcs_status_{key}_totals.csv" + ) + data_to_csv(args, dataframe, file_path) def process_totals_by_free_cultural(args, count_data): - LOGGER.info("Processing totals by Approved for Free Cultural Works") + """ + Processing count data: totals by Approved for Free Cultural Works + """ + # https://creativecommons.org/public-domain/freeworks/ + LOGGER.info(process_totals_by_free_cultural.__doc__.strip()) data = { "Approved for Free Cultural Works": 0, - "Limited uses": 0, + "Limited use": 0, } for row in count_data.itertuples(index=False): tool = row[0] @@ -304,7 +216,7 @@ def process_totals_by_free_cultural(args, count_data): if unit in ["by-sa", "by", "sa", "sampling+"]: key = "Approved for Free Cultural Works" else: - key = "Limited uses" + key = "Limited use" data[key] += count data = pd.DataFrame(data.items(), columns=["Category", "Count"]) @@ -317,22 +229,30 @@ def process_totals_by_free_cultural(args, count_data): def process_totals_by_restrictions(args, count_data): - LOGGER.info("Processing totals by restriction") - data = {"level 0": 0, "level 1": 0, "level 2": 0, "level 3": 0} + """ + Processing count data: totals by restriction + """ + LOGGER.info(process_totals_by_restrictions.__doc__.strip()) + data = { + "level 0 - unrestricted": 0, + "level 1 - few restrictions": 0, + "level 2 - some restrictions": 0, + 
"level 3 - many restrictions": 0, + } for row in count_data.itertuples(index=False): tool = row[0] count = row[1] if tool.startswith("PDM") or "CC0" in tool or "PUBLICDOMAIN" in tool: - key = "level 0" + key = "level 0 - unrestricted" else: parts = tool.split() unit = parts[1].lower() if unit in ["by-sa", "by", "sa", "sampling+"]: - key = "level 1" + key = "level 1 - few restrictions" elif unit in ["by-nc", "by-nc-sa", "sampling", "nc", "nc-sa"]: - key = "level 2" + key = "level 2 - some restrictions" else: - key = "level 3" + key = "level 3 - many restrictions" data[key] += count data = pd.DataFrame(data.items(), columns=["Category", "Count"]) @@ -342,28 +262,80 @@ def process_totals_by_restrictions(args, count_data): data_to_csv(args, data, file_path) +def process_totals_by_language(args, data): + """ + Processing language data: totals by language + """ + LOGGER.info(process_totals_by_language.__doc__.strip()) + data = data.groupby(["LANGUAGE"], as_index=False)["COUNT"].sum() + data = data.sort_values("COUNT", ascending=False) + data.reset_index(drop=True, inplace=True) + data.rename( + columns={ + "LANGUAGE": "Language", + "COUNT": "Count", + }, + inplace=True, + ) + file_path = shared.path_join( + PATHS["data_phase"], "gcs_totals_by_language.csv" + ) + data_to_csv(args, data, file_path) + + +def process_totals_by_country(args, data): + """ + Processing country data: totals by country + """ + LOGGER.info(process_totals_by_country.__doc__.strip()) + data = data.groupby(["COUNTRY"], as_index=False)["COUNT"].sum() + data = data.sort_values("COUNT", ascending=False) + data.reset_index(drop=True, inplace=True) + data.rename( + columns={ + "COUNTRY": "Country", + "COUNT": "Count", + }, + inplace=True, + ) + file_path = shared.path_join( + PATHS["data_phase"], "gcs_totals_by_country.csv" + ) + data_to_csv(args, data, file_path) + + def main(): args = parse_arguments() - shared.log_paths(LOGGER, PATHS) + shared.paths_log(LOGGER, PATHS) shared.git_fetch_and_merge(args, 
PATHS["repo"]) # Count data - count_data = pd.read_csv(FILE1_COUNT, usecols=["TOOL_IDENTIFIER", "COUNT"]) - process_top_25_tools(args, count_data) - process_totals_by_product(args, count_data) - process_totals_by_unit(args, count_data) + file1_count = shared.path_join(PATHS["data_1-fetch"], "gcs_1_count.csv") + count_data = pd.read_csv(file1_count, usecols=["TOOL_IDENTIFIER", "COUNT"]) + process_product_totals(args, count_data) + process_latest_prior_retired_totals(args, count_data) process_totals_by_free_cultural(args, count_data) process_totals_by_restrictions(args, count_data) - # # Langauge data - # langauge_data = pd.read_csv( - # FILE2_LANGUAGE, usecols=["TOOL_IDENTIFIER", "LANGUAGE", "COUNT"] - # ) + # Langauge data + file2_language = shared.path_join( + PATHS["data_1-fetch"], "gcs_2_count_by_language.csv" + ) + language_data = pd.read_csv( + file2_language, usecols=["TOOL_IDENTIFIER", "LANGUAGE", "COUNT"] + ) + process_totals_by_language(args, language_data) - # # Country data - # country_data = pd.read_csv( - # FILE3_COUNTRY, usecols=["TOOL_IDENTIFIER", "COUNTRY", "COUNT"] - # ) + # Country data + file3_country = shared.path_join( + PATHS["data_1-fetch"], "gcs_3_count_by_country.csv" + ) + country_data = pd.read_csv( + file3_country, usecols=["TOOL_IDENTIFIER", "COUNTRY", "COUNT"] + ) + process_totals_by_country(args, country_data) + + # TODO: compare with previous quarter, previous year args = shared.git_add_and_commit( args, @@ -384,8 +356,9 @@ def main(): LOGGER.error(e.message) sys.exit(e.exit_code) except SystemExit as e: - LOGGER.error(f"System exit with code: {e.exit_code}") - sys.exit(e.exit_code) + if e.code != 0: + LOGGER.error(f"System exit with code: {e.code}") + sys.exit(e.code) except KeyboardInterrupt: LOGGER.info("(130) Halted via KeyboardInterrupt.") sys.exit(130) diff --git a/scripts/3-report/gcs_report.py b/scripts/3-report/gcs_report.py index 5f4c05d3..105313fa 100755 --- a/scripts/3-report/gcs_report.py +++ 
b/scripts/3-report/gcs_report.py @@ -1,341 +1,503 @@ #!/usr/bin/env python """ This file is dedicated to visualizing and analyzing the data collected -from Google Custom Search. +from Google Custom Search (GCS). """ # Standard library import argparse import os import sys +import textwrap import traceback -from datetime import datetime, timezone # Third-party -import matplotlib.pyplot as plt -import matplotlib.ticker as ticker import pandas as pd -import seaborn as sns -from pandas import PeriodIndex +from pygments import highlight +from pygments.formatters import TerminalFormatter +from pygments.lexers import PythonTracebackLexer # Add parent directory so shared can be imported sys.path.append(os.path.join(os.path.dirname(__file__), "..")) # First-party/Local +import plot # noqa: E402 import shared # noqa: E402 # Setup LOGGER, PATHS = shared.setup(__file__) +# Constants +QUARTER = os.path.basename(PATHS["data_quarter"]) +SECTION = "Google Custom Search (GCS)" + def parse_arguments(): """ Parses command-line arguments, returns parsed arguments. 
""" LOGGER.info("Parsing command-line arguments") - - # Taken from shared module, fix later - datetime_today = datetime.now(timezone.utc) - quarter = PeriodIndex([datetime_today.date()], freq="Q")[0] - - parser = argparse.ArgumentParser(description="Google Custom Search Report") + parser = argparse.ArgumentParser(description=__doc__) parser.add_argument( "--quarter", - "-q", - type=str, - default=f"{quarter}", - help="Data quarter in format YYYYQx, e.g., 2024Q2", + default=QUARTER, + help=f"Data quarter in format YYYYQx (default: {QUARTER})", ) parser.add_argument( - "--skip-commit", + "--show-plots", action="store_true", - help="Don't git commit changes (also skips git push changes)", + help="Show generated plots (default: False)", ) parser.add_argument( - "--skip-push", + "--enable-save", action="store_true", - help="Don't git push changes", + help="Enable saving results (default: False)", ) parser.add_argument( - "--show-plots", + "--enable-git", action="store_true", - help="Show generated plots (in addition to saving them)", + help="Enable git actions such as fetch, merge, add, commit, and push" + " (default: False)", ) args = parser.parse_args() - if args.skip_commit: - args.skip_push = True + if not args.enable_save and args.enable_git: + parser.error("--enable-git requires --enable-save") + if args.quarter != QUARTER: + global PATHS + PATHS = shared.paths_update(LOGGER, PATHS, QUARTER, args.quarter) + args.logger = LOGGER + args.paths = PATHS return args -def load_data(args): +def gcs_intro(args): """ - Load the collected data from the CSV file. + Write Google Custom Search (GCS) introduction. 
""" - selected_quarter = args.quarter - - file_path = os.path.join( - PATHS["data"], f"{selected_quarter}", "1-fetch", "gcs_fetched.csv" + LOGGER.info(gcs_intro.__doc__.strip()) + file_path = shared.path_join( + PATHS["data_2-process"], + "gcs_product_totals.csv", + ) + LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + name_label = "CC legal tool product" + data = pd.read_csv(file_path, index_col=name_label) + total_count = f"{data['Count'].sum():,d}" + shared.update_readme( + args, + SECTION, + "Overview", + None, + None, + "Google Custom Search (GCS) data uses the `totalResults` returned by" + " API for search queries of the legal tool URLs (quoted and using" + " `linkSite` for accuracy), countries codes, and language codes.\n" + "\n" + f"**The results indicate there are a total of {total_count} online" + " works in the commons--documents that are licensed or put in the" + " public domain using a Creative Commons (CC) legal tool.**\n" + "\n" + "Thank you Google for providing the Programable Search Engine: Custom" + " Search JSON API!\n", ) - - if not os.path.exists(file_path): - LOGGER.error(f"Data file not found: {file_path}") - return pd.DataFrame() - - data = pd.read_csv(file_path) - LOGGER.info(f"Data loaded from {file_path}") - return data -def visualize_by_country(data, args): +def plot_products(args): """ - Create a bar chart for the number of webpages licensed by country. + Create plots for CC legal tool product totals and percentages """ - LOGGER.info( - "Creating a bar chart for the number of webpages licensed by country." 
+ LOGGER.info(plot_products.__doc__.strip()) + file_path = shared.path_join( + PATHS["data_2-process"], "gcs_product_totals.csv" + ) + LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + name_label = "CC legal tool product" + data = pd.read_csv(file_path, index_col=name_label) + data = data[::-1] # reverse order + + title = "Products totals and percentages" + plt = plot.combined_plot( + args=args, + data=data, + title=title, + name_label=name_label, + data_label="Count", + bar_xscale="log", + bar_ylabel=name_label, ) - selected_quarter = args.quarter + image_path = shared.path_join( + PATHS["data_phase"], "gcs_product_totals.png" + ) + LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") - # Get the list of country columns dynamically - columns = [col.strip() for col in data.columns.tolist()] + if args.enable_save: + # Create the directory if it does not exist + os.makedirs(PATHS["data_phase"], exist_ok=True) + plt.savefig(image_path) - start_index = columns.index("United States") - end_index = columns.index("Japan") + 1 + shared.update_readme( + args, + SECTION, + title, + image_path, + "Plots showing Creative Commons (CC) legal tool product totals and" + " percentages.", + ) - countries = columns[start_index:end_index] - data.columns = data.columns.str.strip() +def plot_tool_status(args): + """ + Create plots for the CC legal tool status totals and percentages + """ + LOGGER.info(plot_tool_status.__doc__.strip()) + file_path = shared.path_join( + PATHS["data_2-process"], + "gcs_status_combined_totals.csv", + ) + LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + name_label = "CC legal tool" + data = pd.read_csv(file_path, index_col=name_label) + data.sort_values(name_label, ascending=False, inplace=True) + + title = "CC legal tools status" + plt = plot.combined_plot( + args=args, + data=data, + title=title, + name_label=name_label, + data_label="Count", + bar_xscale="log", + bar_ylabel="CC legal tool status", + ) 
- LOGGER.info(f"Cleaned Columns: {data.columns.tolist()}") + image_path = shared.path_join(PATHS["data_phase"], "gcs_tool_status.png") + LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") - # Aggregate the data by summing the counts for each country - country_data = data[countries].sum() + if args.enable_save: + # Create the directory if it does not exist + os.makedirs(PATHS["data_phase"], exist_ok=True) + plt.savefig(image_path) - plt.figure(figsize=(12, 8)) - ax = sns.barplot(x=country_data.index, y=country_data.values) - plt.title( - f"Number of Google Webpages Licensed by Country ({selected_quarter})" + shared.update_readme( + args, + SECTION, + title, + image_path, + "Plots showing Creative Commons (CC) legal tool status totals and" + " percentages.", ) - plt.xlabel("Country") - plt.ylabel("Number of Webpages") - plt.xticks(rotation=45) - # Add value numbers to the top of each bar - for p in ax.patches: - ax.annotate( - format(p.get_height(), ",.0f"), - (p.get_x() + p.get_width() / 2.0, p.get_height()), - ha="center", - va="center", - xytext=(0, 9), - textcoords="offset points", - ) - # Format the y-axis to display numbers without scientific notation - ax.get_yaxis().get_major_formatter().set_scientific(False) - ax.get_yaxis().set_major_formatter( - plt.FuncFormatter(lambda x, loc: "{:,}".format(int(x))) +def plot_latest_tools(args): + """ + Create plots for latest CC legal tool totals and percentages + """ + LOGGER.info(plot_latest_tools.__doc__.strip()) + file_path = shared.path_join( + PATHS["data_2-process"], + "gcs_status_latest_totals.csv", ) - - output_directory = os.path.join( - PATHS["data"], f"{selected_quarter}", "3-report" + LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + name_label = "CC legal tool" + data = pd.read_csv(file_path, index_col=name_label) + data.sort_values(name_label, ascending=False, inplace=True) + + title = "Latest CC legal tools" + plt = plot.combined_plot( + args=args, + data=data, + 
title=title, + name_label=name_label, + data_label="Count", ) - LOGGER.info(f"Output directory: {output_directory}") - - # Create the directory if it does not exist - os.makedirs(output_directory, exist_ok=True) - image_path = os.path.join(output_directory, "gcs_country_report.png") - plt.savefig(image_path) + image_path = shared.path_join( + PATHS["data_phase"], "gcs_status_latest_tools.png" + ) + LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") - if args.show_plots: - plt.show() + if args.enable_save: + # Create the directory if it does not exist + os.makedirs(PATHS["data_phase"], exist_ok=True) + plt.savefig(image_path) shared.update_readme( - PATHS, - image_path, - "Google Custom Search", - "Number of Google Webpages Licensed by Country", - "Country Report", args, + SECTION, + title, + image_path, + "Plots showing latest Creative Commons (CC) legal tool totals and" + " percentages.", ) - LOGGER.info("Visualization by country created.") - -def visualize_by_license_type(data, args): +def plot_prior_tools(args): """ - Create a bar chart for the number of webpages licensed by license type + Create plots for prior CC legal tool totals and percentages """ - LOGGER.info( - "Creating a bar chart for the number of " - "webpages licensed by license type." 
+ LOGGER.info(plot_prior_tools.__doc__.strip()) + file_path = shared.path_join( + PATHS["data_2-process"], "gcs_status_prior_totals.csv" ) - - selected_quarter = args.quarter - - # Strip any leading/trailing spaces from the columns - data.columns = data.columns.str.strip() - - # Sum the values across all columns except the first one ('LICENSE TYPE') - license_data = data.set_index("LICENSE TYPE").sum(axis=1) - - plt.figure(figsize=(12, 8)) - ax = sns.barplot(x=license_data.index, y=license_data.values) - plt.title( - f"Number of Webpages Licensed by License Type ({selected_quarter})" + LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + name_label = "CC legal tool" + data = pd.read_csv(file_path, index_col=name_label) + data.sort_values(name_label, ascending=False, inplace=True) + + title = "Prior CC legal tools" + plt = plot.combined_plot( + args=args, + data=data, + title=title, + name_label=name_label, + data_label="Count", ) - plt.xlabel("License Type") - plt.ylabel("Number of Webpages") - plt.xticks(rotation=45, ha="right") - # Use shorter X axis labels - ax.set_xticklabels( - [ - "CC BY 2.5" if "by/2.5" in label else label - for label in license_data.index - ] + image_path = shared.path_join( + PATHS["data_phase"], "gcs_status_prior_tools.png" ) + LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") - # Use the millions formatter for y-axis - def millions_formatter(x, pos): - "The two args are the value and tick position" - return f"{x * 1e-6:.1f}M" + if args.enable_save: + # Create the directory if it does not exist + os.makedirs(PATHS["data_phase"], exist_ok=True) + plt.savefig(image_path) - ax.yaxis.set_major_formatter(ticker.FuncFormatter(millions_formatter)) - - plt.tight_layout() - - output_directory = os.path.join( - PATHS["data"], f"{selected_quarter}", "3-report" + shared.update_readme( + args, + SECTION, + title, + image_path, + "Plots showing prior Creative Commons (CC) legal tool totals and" + " percentages.", + 
"The unit names have been normalized (~~`CC BY-ND-NC`~~ =>" + " `CC BY-NC-ND`).", ) - LOGGER.info(f"Output directory: {output_directory}") - # Create the directory if it does not exist - os.makedirs(output_directory, exist_ok=True) - image_path = os.path.join(output_directory, "gcs_licensetype_report.png") +def plot_retired_tools(args): + """ + Create plots for retired CC legal tool totals and percentages + """ + LOGGER.info(plot_retired_tools.__doc__.strip()) + file_path = shared.path_join( + PATHS["data_2-process"], + "gcs_status_retired_totals.csv", + ) + LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + name_label = "CC legal tool" + data = pd.read_csv(file_path, index_col=name_label) + data.sort_values(name_label, ascending=False, inplace=True) + + title = "Retired CC legal tools" + plt = plot.combined_plot( + args=args, + data=data, + title=title, + name_label=name_label, + data_label="Count", + bar_xscale="log", + ) - plt.savefig(image_path) + image_path = shared.path_join( + PATHS["data_phase"], "gcs_status_retired_tools.png" + ) + LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") - if args.show_plots: - plt.show() + if args.enable_save: + # Create the directory if it does not exist + os.makedirs(PATHS["data_phase"], exist_ok=True) + plt.savefig(image_path) shared.update_readme( - PATHS, - image_path, - "Google Custom Search", - "Number of Webpages Licensed by License Type", - "License Type Report", args, + SECTION, + title, + image_path, + "Plots showing retired Creative Commons (CC) legal tools total and" + " percentages.", + "For more information on retired legal tools, see [Retired Legal Tools" + " - Creative Commons](https://creativecommons.org/retiredlicenses/).", ) - LOGGER.info("Visualization by license type created.") - -def visualize_by_language(data, args): +def plot_countries_highest_usage(args): """ - Create a bar chart for the number of webpages licensed by language. 
+ Create plots for the countries with highest usage of latest tools """ - LOGGER.info( - "Creating a bar chart for the number of webpages licensed by language." + LOGGER.info(plot_countries_highest_usage.__doc__.strip()) + file_path = shared.path_join( + PATHS["data_2-process"], "gcs_totals_by_country.csv" + ) + LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + name_label = "Country" + data_label = "Count" + data = pd.read_csv(file_path, index_col=name_label) + total_count = f"{data['Count'].sum():,d}" + data.sort_values(data_label, ascending=False, inplace=True) + data = data[:10] # limit to highest 10 + data = data[::-1] # reverse order + + title = "Countries with highest usage of latest tools" + plt = plot.combined_plot( + args=args, + data=data, + title=title, + name_label=name_label, + data_label=data_label, + bar_xscale="log", ) - selected_quarter = args.quarter + image_path = shared.path_join( + PATHS["data_phase"], "gcs_countries_highest_usage_latest_tools.png" + ) + LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") - # Get the list of country columns dynamically - columns = [col.strip() for col in data.columns.tolist()] + if args.enable_save: + # Create the directory if it does not exist + os.makedirs(PATHS["data_phase"], exist_ok=True) + plt.savefig(image_path) - start_index = columns.index("English") - end_index = columns.index("Indonesian") + 1 + shared.update_readme( + args, + SECTION, + title, + image_path, + "Plots showing countries with the highest useage of the latest" + " Creative Commons (CC) legal tools.", + "The latest tools include Licenses version 4.0 (CC BY 4.0, CC BY-NC" + " 4.0, CC BY-NC-ND 4.0, CC BY-NC-SA 4.0, CC-BY-ND 4.0, CC BY-SA 4.0)," + " CC0 1.0, and the Public Domain Mark (PDM 1.0).\n" + "\n" + f"The complete data set indicates there are a total of {total_count}" + " online works using a latest CC legal tool.", + ) - languages = columns[start_index:end_index] - data.columns = 
data.columns.str.strip() +def plot_languages_highest_usage(args): + """ + Create plots for the languages with highest usage of latest tools + """ + LOGGER.info(plot_languages_highest_usage.__doc__.strip()) + file_path = shared.path_join( + PATHS["data_2-process"], "gcs_totals_by_language.csv" + ) + LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + name_label = "Language" + data_label = "Count" + data = pd.read_csv(file_path, index_col=name_label) + total_count = f"{data['Count'].sum():,d}" + data.sort_values(data_label, ascending=False, inplace=True) + data = data[:10] # limit to highest 10 + data = data[::-1] # reverse order + + title = "Languages with highest usage of latest tools" + plt = plot.combined_plot( + args=args, + data=data, + title=title, + name_label=name_label, + data_label=data_label, + bar_xscale="log", + ) - LOGGER.info(f"Cleaned Columns: {data.columns.tolist()}") + image_path = shared.path_join( + PATHS["data_phase"], "gcs_languages_highest_usage_latest_tools.png" + ) + LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") - # Aggregate the data by summing the counts for each country - language_data = data[languages].sum() + if args.enable_save: + # Create the directory if it does not exist + os.makedirs(PATHS["data_phase"], exist_ok=True) + plt.savefig(image_path) - plt.figure(figsize=(12, 8)) - ax = sns.barplot(x=language_data.index, y=language_data.values) - plt.title( - f"Number of Google Webpages Licensed by Language ({selected_quarter})" + shared.update_readme( + args, + SECTION, + title, + image_path, + "Plots showing languages with the highest useage of the latest" + " Creative Commons (CC) legal tools.", + "The latest tools include Licenses version 4.0 (CC BY 4.0, CC BY-NC" + " 4.0, CC BY-NC-ND 4.0, CC BY-NC-SA 4.0, CC-BY-ND 4.0, CC BY-SA 4.0)," + " CC0 1.0, and the Public Domain Mark (PDM 1.0).\n" + "\n" + f"The complete data set indicates there are a total of {total_count}" + " online works using a 
latest CC legal tool.", ) - plt.xlabel("Language") - plt.ylabel("Number of Webpages") - plt.xticks(rotation=45) - # Add value numbers to the top of each bar - for p in ax.patches: - ax.annotate( - format(p.get_height(), ",.0f"), - (p.get_x() + p.get_width() / 2.0, p.get_height()), - ha="center", - va="center", - xytext=(0, 9), - textcoords="offset points", - ) - # Format the y-axis to display numbers without scientific notation - ax.get_yaxis().get_major_formatter().set_scientific(False) - ax.get_yaxis().set_major_formatter( - plt.FuncFormatter(lambda x, loc: "{:,}".format(int(x))) +def plot_free_culture(args): + """ + Create plots for the languages with highest usage of latest tools + """ + LOGGER.info(plot_free_culture.__doc__.strip()) + file_path = shared.path_join( + PATHS["data_2-process"], + "gcs_totals_by_free_cultural.csv", ) - - output_directory = os.path.join( - PATHS["data"], f"{selected_quarter}", "3-report" + LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + name_label = "Category" + data_label = "Count" + data = pd.read_csv(file_path, index_col=name_label) + + title = "Approved for Free Cultural Works" + plt = plot.combined_plot( + args=args, + data=data, + title=title, + name_label=name_label, + data_label=data_label, ) - LOGGER.info(f"Output directory: {output_directory}") + image_path = shared.path_join(PATHS["data_phase"], "gcs_free_culture.png") + LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") - # Create the directory if it does not exist - os.makedirs(output_directory, exist_ok=True) - image_path = os.path.join(output_directory, "gcs_language_report.png") - plt.savefig(image_path) - - if args.show_plots: - plt.show() + if args.enable_save: + # Create the directory if it does not exist + os.makedirs(PATHS["data_phase"], exist_ok=True) + plt.savefig(image_path) shared.update_readme( - PATHS, - image_path, - "Google Custom Search", - "Number of Google Webpages Licensed by Language", - "Language Report", 
args, + SECTION, + title, + image_path, + "Plots showing Approved for Free Cultural Works legal tool usage.", + "[Understanding Free Cultural Works - Creative" + " Commons](https://creativecommons.org/public-domain/freeworks/):\n" + "\n" + '> Using [the Freedom Defined definition of a "Free Cultural Work"],' + " material licensed under CC BY or BY-SA is a free cultural work. (So" + " is anything in the worldwide public domain marked with CC0 or the" + " Public Domain Mark.) CC’s other licenses– BY-NC, BY-ND, BY-NC-SA," + " and BY-NC-ND–only allow more limited uses, and material under these" + " licenses is not considered a free cultural work.", ) - LOGGER.info("Visualization by language created.") - def main(): - - # Fetch and merge changes - shared.fetch_and_merge(PATHS["repo"]) - args = parse_arguments() - - data = load_data(args) - if data.empty: - return - - current_directory = os.getcwd() - LOGGER.info(f"Current working directory: {current_directory}") - - visualize_by_country(data, args) - visualize_by_license_type(data, args) - visualize_by_language(data, args) - - # Add and commit changes - if not args.skip_commit: - shared.add_and_commit( - PATHS["repo"], PATHS["data_quarter"], "Add and commit new reports" - ) - - # Push changes - if not args.skip_push: - shared.push_changes(PATHS["repo"]) + shared.paths_log(LOGGER, PATHS) + shared.git_fetch_and_merge(args, PATHS["repo"]) + + gcs_intro(args) + plot_products(args) + plot_tool_status(args) + plot_latest_tools(args) + plot_prior_tools(args) + plot_retired_tools(args) + plot_countries_highest_usage(args) + plot_languages_highest_usage(args) + plot_free_culture(args) + + args = shared.git_add_and_commit( + args, + PATHS["repo"], + PATHS["data_quarter"], + f"Add and commit Google Custom Search (GCS) reports for {QUARTER}", + ) + shared.git_push_changes(args, PATHS["repo"]) if __name__ == "__main__": @@ -348,11 +510,20 @@ def main(): LOGGER.error(e.message) sys.exit(e.exit_code) except SystemExit as e: - 
LOGGER.error(f"System exit with code: {e.code}") + if e.code != 0: + LOGGER.error(f"System exit with code: {e.code}") sys.exit(e.code) except KeyboardInterrupt: LOGGER.info("(130) Halted via KeyboardInterrupt.") sys.exit(130) except Exception: - LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}") + traceback_formatted = textwrap.indent( + highlight( + traceback.format_exc(), + PythonTracebackLexer(), + TerminalFormatter(), + ), + " ", + ) + LOGGER.critical(f"(1) Unhandled exception:\n{traceback_formatted}") sys.exit(1) diff --git a/scripts/3-report/notes.py b/scripts/3-report/notes.py new file mode 100755 index 00000000..ccefd058 --- /dev/null +++ b/scripts/3-report/notes.py @@ -0,0 +1,173 @@ +#!/usr/bin/env python +""" +Add project references. +""" +# Standard library +import argparse +import os +import sys +import textwrap +import traceback + +# Third-party +from pygments import highlight +from pygments.formatters import TerminalFormatter +from pygments.lexers import PythonTracebackLexer + +# Add parent directory so shared can be imported +sys.path.append(os.path.join(os.path.dirname(__file__), "..")) + +# First-party/Local +import shared # noqa: E402 + +# Setup +LOGGER, PATHS = shared.setup(__file__) + +# Constants +QUARTER = os.path.basename(PATHS["data_quarter"]) +SECTION = "Notes" + + +def parse_arguments(): + """ + Parses command-line arguments, returns parsed arguments. 
+ """ + LOGGER.info("Parsing command-line arguments") + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--quarter", + default=QUARTER, + help=f"Data quarter in format YYYYQx (default: {QUARTER})", + ) + parser.add_argument( + "--show-plots", + action="store_true", + help="Show generated plots (default: False)", + ) + parser.add_argument( + "--enable-save", + action="store_true", + help="Enable saving results (default: False)", + ) + parser.add_argument( + "--enable-git", + action="store_true", + help="Enable git actions such as fetch, merge, add, commit, and push" + " (default: False)", + ) + args = parser.parse_args() + if not args.enable_save and args.enable_git: + parser.error("--enable-git requires --enable-save") + if args.quarter != QUARTER: + global PATHS + PATHS = shared.paths_update(LOGGER, PATHS, QUARTER, args.quarter) + args.logger = LOGGER + args.paths = PATHS + return args + + +def data_locations(args): + """ + Write References + """ + shared.update_readme( + args, + SECTION, + "Data locations", + None, + None, + "This report was generated as part of:\n" + "\n" + "**[creativecommons/quantifying][repo]:** *quantify the size and" + " diversity of the commons--the collection of works that are openly" + " licensed or in the public domain*\n" + "\nThe data used to generate this report is available in that" + " repository at the following locations:\n" + "\n" + " | Resource | Location |\n" + " | --------------- | -------- |\n" + " | Fetched data: | [`1-fetch/`](1-fetch) |\n" + " | Processed data: | [`2-process/`](2-process) |\n" + " | Report data: | [`3-report/`](3-report) |\n" + "\n" + "[repo]: https://github.com/creativecommons/quantifying\n", + ) + + +def usage(args): + """ + Write copyright + """ + shared.update_readme( + args, + SECTION, + "Usage", + None, + None, + "The Creative Commons (CC) icons and logos are for use under the" + " Creative Commons Trademark Policy (see [Policies - Creative" + " Commons][ccpolicies]). 
**They *aren't* licensed under a Creative" + " Commons license** (also see [Could I use a CC license to share my" + " logo or trademark? - Frequently Asked Questions - Creative" + " Commons][tmfaq]).\n" + "\n" + "[![CC0 1.0 Universal (CC0 1.0) Public Domain Dedication" + "button][cc-zero-png]][cc-zero]\n" + "Otherwise, this report (including the plot images) is dedicated to" + " the public domain under the [CC0 1.0 Universal (CC0 1.0) Public" + " Domain Dedication][cc-zero].\n" + "\n" + "[ccpolicies]: https://creativecommons.org/policies\n" + "[tmfaq]: https://creativecommons.org/faq/" + "#could-i-use-a-cc-license-to-share-my-logo-or-trademark\n" + "[cc-zero-png]: https://licensebuttons.net/l/zero/1.0/88x31.png" + ' "CC0 1.0 Universal (CC0 1.0) Public Domain Dedication button"\n' + "[cc-zero]: https://creativecommons.org/publicdomain/zero/1.0/" + ' "Creative Commons — CC0 1.0 Universal"', + ) + + +def main(): + args = parse_arguments() + shared.paths_log(LOGGER, PATHS) + shared.git_fetch_and_merge(args, PATHS["repo"]) + + data_locations(args) + usage(args) + + args = shared.git_add_and_commit( + args, + PATHS["repo"], + PATHS["data_quarter"], + f"Add and commit References for {QUARTER}", + ) + shared.git_push_changes(args, PATHS["repo"]) + + +if __name__ == "__main__": + try: + main() + except shared.QuantifyingException as e: + if e.exit_code == 0: + LOGGER.info(e.message) + else: + LOGGER.error(e.message) + sys.exit(e.exit_code) + except SystemExit as e: + if e.code != 0: + LOGGER.error(f"System exit with code: {e.code}") + sys.exit(e.code) + except KeyboardInterrupt: + LOGGER.info("(130) Halted via KeyboardInterrupt.") + sys.exit(130) + except Exception: + traceback_formatted = textwrap.indent( + highlight( + traceback.format_exc(), + PythonTracebackLexer(), + TerminalFormatter(), + ), + " ", + ) + LOGGER.critical(f"(1) Unhandled exception:\n{traceback_formatted}") + sys.exit(1) diff --git a/scripts/plot.py b/scripts/plot.py new file mode 100644 index 
00000000..c7f47366 --- /dev/null +++ b/scripts/plot.py @@ -0,0 +1,144 @@ +# Standard library +import os +import sys + +# Third-party +import matplotlib.pyplot as plt +import matplotlib.ticker as ticker +from matplotlib import colormaps + +# Add parent directory so shared can be imported +sys.path.append(os.path.dirname(__file__)) + +# First-party/Local +import shared # noqa: E402 + + +def annotate_ylabels(ax, data, data_label, colors): + i = 0 + c = 0 + ytick = ax.yaxis.get_major_ticks(numticks=1)[0] + # defaults: ytick.major.size + ytick.major.pad + indent = -1 * (ytick.get_tick_padding() + ytick.get_pad()) + for index, row in data.iterrows(): + if c > len(colors): + c = 0 + + # annotate totals + ax.annotate( + f" {row[data_label]:>15,d}", + (indent, i - 0.1), + xycoords=("axes points", "data"), + color=colors[c], + fontsize="x-small", + horizontalalignment="right", + verticalalignment="top", + ) + + # annotate percentages + percent = row[data_label] / data[data_label].sum() * 100 + if percent < 0.1: + percent = "< .1%" + else: + percent = f"{percent:4.1f}%" + ax.annotate( + percent, + (1.02, i), + xycoords=("axes fraction", "data"), + backgroundcolor=colors[c], + color="white", + fontsize="x-small", + horizontalalignment="left", + verticalalignment="center", + ) + + i += 1 + c += 1 + return ax + + +def combined_plot( + args, data, title, name_label, data_label, bar_xscale=None, bar_ylabel=None +): + if len(data) > 10: + raise shared.QuantifyingException( + "the combined_plot() function is limited to a maximum of 10 data" + " points" + ) + + plt.rcParams.update({"font.family": "monospace", "figure.dpi": 300}) + + height = 1 + len(data) * 0.5 + if height < 2.5: + height = 2.5 + + fig, (ax1, ax2) = plt.subplots( + 1, 2, figsize=(8, height), width_ratios=(2, 1), layout="constrained" + ) + colors = colormaps["tab10"].colors + + # 1st axes: horizontal barplot of counts + # pad tick labels to make room for annotation + tick_labels = [] + for index, row in 
data.iterrows(): + count = f"{row[data_label]:,d}" + tick_labels.append(f"{index}\n{' ' * len(count)}") + if bar_xscale == "log": + log = True + else: + bar_xscale = "linear" + log = False + ax1.barh(y=tick_labels, width=data[data_label], color=colors, log=log) + ax1.tick_params(axis="x", which="major", labelrotation=45) + ax1.set_xlabel("Number of works") + ax1.xaxis.set_major_formatter(ticker.FuncFormatter(number_formatter)) + if bar_ylabel is not None: + ax1.set_ylabel(bar_ylabel) + else: + ax1.set_ylabel(name_label) + ax1 = annotate_ylabels(ax1, data, data_label, colors) + + # 2nd axes: pie chart of percentages + data.plot.pie( + ax=ax2, + y=data_label, + colors=colors, + labels=None, + legend=False, + radius=1.25, + ) + ax2.set_title("Percent") + ax2.set_ylabel(None) + + # plot + plt.suptitle(title) + plt.annotate( + f"Creative Commons (CC)\nbar x scale: {bar_xscale}, data from" + f" {args.quarter}", + (0.95, 5), + xycoords=("figure fraction", "figure points"), + color="gray", + fontsize="x-small", + horizontalalignment="right", + ) + + if args.show_plots: + plt.show() + + return plt + + +def number_formatter(x, pos): + """ + Use the millions formatter for x-axis + + The two args are the value (x) and tick position (pos) + """ + if x >= 1e9: + return f"{x * 1e-9:,.0f}B" + elif x >= 1e6: + return f"{x * 1e-6:,.0f}M" + elif x >= 1e3: + return f"{x * 1e-3:,.0f}K" + else: + return f"{x:,.0f}" diff --git a/scripts/shared.py b/scripts/shared.py index dc682669..83a53b53 100644 --- a/scripts/shared.py +++ b/scripts/shared.py @@ -1,6 +1,4 @@ # Standard library -# import argparse -# Standard library import logging import os from datetime import datetime, timezone @@ -17,53 +15,6 @@ def __init__(self, message, exit_code=None): super().__init__(self.message) -def setup(current_file): - # Set up logging - logging.basicConfig( - level=logging.INFO, - format="%(asctime)s - %(levelname)s - %(module)s - %(message)s", - ) - logger = logging.getLogger(__name__) - - # Datetime - 
datetime_today = datetime.now(timezone.utc) - quarter = PeriodIndex([datetime_today.date()], freq="Q")[0] - - # Paths - paths = {} - paths["repo"] = os.path.dirname(path_join(__file__, "..")) - paths["dotenv"] = path_join(paths["repo"], ".env") - paths["data"] = os.path.dirname( - os.path.abspath(os.path.realpath(current_file)) - ) - current_phase = os.path.basename( - os.path.dirname(os.path.abspath(os.path.realpath(current_file))) - ) - paths["data"] = path_join(paths["repo"], "data") - data_quarter = path_join(paths["data"], f"{quarter}") - for phase in ["1-fetch", "2-process", "3-report"]: - paths[f"data_{phase}"] = path_join(data_quarter, phase) - paths["data_phase"] = path_join(data_quarter, current_phase) - - paths["data_quarter"] = data_quarter - - return logger, paths - - -def log_paths(logger, paths): - paths_list = [] - repo_path = paths["repo"] - for label, path in paths.items(): - label = f"{label}:" - if label == "repo:": - paths_list.append(f"\n{' ' * 4}{label} {path}") - else: - path_new = path.replace(repo_path, ".") - paths_list.append(f"\n{' ' * 8}{label:<15} {path_new}") - paths_list = "".join(paths_list) - logger.info(f"PATHS:{paths_list}") - - def git_fetch_and_merge(args, repo_path, branch=None): if not args.enable_git: return @@ -135,24 +86,102 @@ def path_join(*paths): return os.path.abspath(os.path.realpath(os.path.join(*paths))) +def paths_log(logger, paths): + paths_list = [] + repo_path = paths["repo"] + for label, path in paths.items(): + label = f"{label}:" + if label == "repo:": + paths_list.append(f"\n{' ' * 4}{label} {path}") + else: + path_new = path.replace(repo_path, ".") + paths_list.append(f"\n{' ' * 8}{label:<15} {path_new}") + paths_list = "".join(paths_list) + logger.info(f"PATHS:{paths_list}") + + +def paths_update(logger, paths, old_quarter, new_quarter): + logger.info(f"Updating paths: replacing {old_quarter} with {new_quarter}") + for label in [ + "data_1-fetch", + "data_2-process", + "data_3-report", + "data_phase", + 
"data_quarter", + ]: + paths[label] = paths[label].replace(old_quarter, new_quarter) + return paths + + +def setup(current_file): + # Set up logging + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(module)s - %(message)s", + ) + logger = logging.getLogger(__name__) + + # Datetime + datetime_today = datetime.now(timezone.utc) + quarter = PeriodIndex([datetime_today.date()], freq="Q")[0] + + # Paths + paths = {} + paths["repo"] = os.path.dirname(path_join(__file__, "..")) + paths["dotenv"] = path_join(paths["repo"], ".env") + paths["data"] = os.path.dirname( + os.path.abspath(os.path.realpath(current_file)) + ) + current_phase = os.path.basename( + os.path.dirname(os.path.abspath(os.path.realpath(current_file))) + ) + paths["data"] = path_join(paths["repo"], "data") + data_quarter = path_join(paths["data"], f"{quarter}") + for phase in ["1-fetch", "2-process", "3-report"]: + paths[f"data_{phase}"] = path_join(data_quarter, phase) + paths["data_phase"] = path_join(data_quarter, current_phase) + + paths["data_quarter"] = data_quarter + + return logger, paths + + def update_readme( - paths, image_path, data_source, description, section_title, args + args, + section_title, + entry_title, + image_path, + image_caption, + entry_text=None, ): """ Update the README.md file with the generated images and descriptions. 
""" + if not args.enable_save: + return + if image_path and not image_caption: + raise QuantifyingException( + "The update_readme function requires an image caption if an image" + " path is provided" + ) + if not image_path and image_caption: + raise QuantifyingException( + "The update_readme function requires an image path if an image" + " caption is provided" + ) + + logger = args.logger + paths = args.paths + readme_path = path_join(paths["data"], args.quarter, "README.md") # Define section markers for each data source - section_marker_start = f"" - section_marker_end = f"" + section_start_line = f"\n" + section_end_line = f"\n" - # Define specific section markers for each report type - specific_section_start = f"" - specific_section_end = f"" - - # Convert image path to a relative path - rel_image_path = os.path.relpath(image_path, os.path.dirname(readme_path)) + # Define entry markers for each plot (optional) and description + entry_start_line = f"\n" + entry_end_line = f"\n" if os.path.exists(readme_path): with open(readme_path, "r") as f: @@ -160,104 +189,90 @@ def update_readme( else: lines = [] - # Ensure the title is at the top - title_line = f"# {args.quarter} Quantifying the Commons\n" - + title_line = f"# Quantifying the Commons {args.quarter}\n" if not lines or lines[0].strip() != title_line.strip(): - # Add title if not present or incorrect - lines = [title_line] + lines - - # Locate or create the data source section - section_start = section_end = None - for i, line in enumerate(lines): - if section_marker_start in line: - section_start = i - if section_marker_end in line: - section_end = i - - if section_start is None or section_end is None: - # If the data source section is not present, add it + # Add the title if it is not present or is incorrect + lines.insert(0, title_line) + lines.insert(1, "\n") + + # We only need to know the position of the end to append new entries + if section_start_line in lines: + # Locate the data source section if it 
is already present + section_end_index = lines.index(section_end_line) + else: + # Add the data source section if it is absent lines.extend( [ - f"## Data Source: {data_source}\n", - f"{section_marker_start}\n", - f"{section_marker_end}\n", + f"{section_start_line}", + "\n", + "\n", + f"## {section_title}\n", + "\n", + "\n", + f"{section_end_line}", + "\n", ] ) - section_start = len(lines) - 2 - section_end = len(lines) - 1 - - # Locate or create the specific section within the data source section - specific_start = specific_end = None - for i in range(section_start, section_end): - if specific_section_start in lines[i]: - specific_start = i - if specific_section_end in lines[i]: - specific_end = i - - # Prepare the new content for this specific section - new_content = [ - f"{specific_section_start}\n", - f"### {section_title}\n", - f"![{description}]({rel_image_path})\n", - f"{description}\n", - f"{specific_section_end}\n", + section_end_index = lines.index(section_end_line) + + # Locate the entry if it is already present + if entry_start_line in lines: + entry_start_index = lines.index(entry_start_line) + entry_end_index = lines.index(entry_end_line) + # Include any trailing empty/whitespace-only lines + while not lines[entry_end_index + 1].strip(): + entry_end_index += 1 + # Initalize variables of entry is not present + else: + entry_start_index = None + entry_end_index = None + + # Create entry markdown content + if image_path: + relative_image_path = os.path.relpath( + image_path, os.path.dirname(readme_path) + ) + image = f"\n![{image_caption}]({relative_image_path})\n" + else: + image = "" + if entry_text and image_caption: + text = f"\n{image_caption}\n\n{entry_text}\n" + elif entry_text: + text = f"\n{entry_text}\n" + elif image_caption: + text = f"\n{image_caption}\n" + else: + text = "" + entry_lines = [ + f"{entry_start_line}", + "\n", + f"### {entry_title}\n", + image, + text, + "\n", + f"{entry_end_line}", + "\n", + "\n", ] - # Replace or add the 
specific section content - if specific_start is not None and specific_end is not None: - # Replace the content between the specific markers + if entry_start_index is None: + # Add entry to end of section lines = ( - lines[:specific_start] - + new_content - + lines[specific_end + 1 :] # noqa: E203 + lines[:section_end_index] + entry_lines + lines[section_end_index:] ) else: - # Add new specific section before the end of the data source section - lines = lines[:section_end] + new_content + lines[section_end:] + # Replace entry + lines = ( + lines[:entry_start_index] + + entry_lines + + lines[entry_end_index + 1 :] # noqa: E203 + ) # Write back to the README.md file with open(readme_path, "w") as f: f.writelines(lines) - logging.info( - f"Updated {readme_path} with new image and" - f"description for {section_title}." + logger.info(f"README path: {readme_path.replace(paths['repo'], '.')}") + logger.info( + f"Updated README with new image and description for {entry_title}." ) - - -# def main(): -# parser = argparse.ArgumentParser(description="Git operations script") -# parser.add_argument( -# "--operation", -# type=str, -# required=True, -# help="Operation to perform: fetch_and_merge, add_and_commit, push", -# ) -# parser.add_argument("--message", type=str, help="Commit message") -# parser.add_argument( -# "--branch", -# type=str, -# default="refine-automation", -# help="Branch to fetch and merge from", -# ) -# args = parser.parse_args() - -# repo_path = os.getcwd() # Assuming the script runs in repo root - -# if args.operation == "fetch_and_merge": -# fetch_and_merge(repo_path, args.branch) -# elif args.operation == "add_and_commit": -# if not args.message: -# raise ValueError( -# "Commit message is required for add_and_commit operation" -# ) -# add_and_commit(repo_path, args.message) -# elif args.operation == "push": -# push_changes(repo_path) -# else: -# raise ValueError("Unsupported operation") - - -# if __name__ == "__main__": -# main()