From ac798aaf30ccac41860edf0df45e9caedaabbdcb Mon Sep 17 00:00:00 2001
From: Alex Skrenchuk
Date: Wed, 13 Sep 2023 11:18:55 -0700
Subject: [PATCH 01/12] use assert_operator instead of assert

minitest style guide adherence. encountered an intermittent unit test
failure, so assert_operator will provide better failure feedback than
assert
---
 test/test_scheduler.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_scheduler.rb b/test/test_scheduler.rb
index bac2f842..58808ea5 100644
--- a/test/test_scheduler.rb
+++ b/test/test_scheduler.rb
@@ -39,7 +39,7 @@ def test_scheduler
     sleep(5)
     finished_array = listen_string.split("\n")
 
-    assert finished_array.length >= 4
+    assert_operator 4, :<=, finished_array.length
 
     assert job1_thread.alive?
     job1_thread.kill

From 88e8399f990adc240d37a9e4196a5207720fe011 Mon Sep 17 00:00:00 2001
From: mdorf
Date: Wed, 1 Nov 2023 11:08:58 -0700
Subject: [PATCH 02/12] fixed ncbo_ontology_archive_old_submissions error output
---
 Gemfile.lock                              |  4 ++--
 bin/ncbo_ontology_archive_old_submissions | 20 ++++++++++++--------
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/Gemfile.lock b/Gemfile.lock
index 92164456..825ea3a5 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -1,6 +1,6 @@
 GIT
   remote: https://github.com/ncbo/goo.git
-  revision: 911d71aefe433314d11398445e3856fca503b9c1
+  revision: 6db93bb3d5095a5fe0d017e572c5a04caa34ebc6
   branch: develop
   specs:
     goo (0.0.2)
@@ -15,7 +15,7 @@ GIT
 
 GIT
   remote: https://github.com/ncbo/ncbo_annotator.git
-  revision: 3ae6bfb56dc59a670b5bc1a513ff4929f8cf3756
+  revision: 067104ae94c0e9d058cfbf419364fbf03f34de43
   branch: develop
   specs:
     ncbo_annotator (0.0.1)

diff --git a/bin/ncbo_ontology_archive_old_submissions b/bin/ncbo_ontology_archive_old_submissions
index 535c129e..1b2268a5 100755
--- a/bin/ncbo_ontology_archive_old_submissions
+++ b/bin/ncbo_ontology_archive_old_submissions
@@ -119,13 +119,17 @@ onts.each do |ont|
   end
 end
 
-msg = JSON.pretty_generate(bad_submissions)
 puts
-puts msg
-logger.error(msg)
-
-msg = "Number of errored submissions: #{bad_submissions.length}"
-puts msg
-logger.error(msg)
-
+if bad_submissions.empty?
+ msg = "No errored submissions found" + puts msg + logger.info(msg) +else + msg = JSON.pretty_generate(bad_submissions) + puts msg + logger.error(msg) + msg = "Number of errored submissions: #{bad_submissions.length}" + puts msg + logger.error(msg) +end \ No newline at end of file From 7429289a0cf9d48c43191ed3bcaa9ce82a20f6d4 Mon Sep 17 00:00:00 2001 From: mdorf Date: Wed, 1 Nov 2023 20:16:56 -0700 Subject: [PATCH 03/12] Gemfile.lock update --- Gemfile.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index e7a782db..9e47a23c 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -26,7 +26,7 @@ GIT GIT remote: https://github.com/ncbo/ontologies_linked_data.git - revision: e33a0e451f8a8226d98291168e45b46d7065e670 + revision: ff10e5ff4103431da1aec3cbbaebc57547c0035c branch: develop specs: ontologies_linked_data (0.0.1) @@ -119,9 +119,9 @@ GEM google-cloud-env (1.6.0) faraday (>= 0.17.3, < 3.0) google-cloud-errors (1.3.1) - google-protobuf (3.24.4) - google-protobuf (3.24.4-x86_64-darwin) - google-protobuf (3.24.4-x86_64-linux) + google-protobuf (3.25.0) + google-protobuf (3.25.0-x86_64-darwin) + google-protobuf (3.25.0-x86_64-linux) googleapis-common-protos (1.4.0) google-protobuf (~> 3.14) googleapis-common-protos-types (~> 1.2) From bb93561f78522cf6b289afc81b3bf86cdbbb8cfc Mon Sep 17 00:00:00 2001 From: Alex Skrenchuk Date: Tue, 7 Nov 2023 21:31:46 -0800 Subject: [PATCH 04/12] Gemfile update --- Gemfile.lock | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index e7a782db..fab55057 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -26,7 +26,7 @@ GIT GIT remote: https://github.com/ncbo/ontologies_linked_data.git - revision: e33a0e451f8a8226d98291168e45b46d7065e670 + revision: 9487c7f73e68abab097af523d42c1d2e106e614b branch: develop specs: ontologies_linked_data (0.0.1) @@ -76,7 +76,7 @@ GEM multi_json (~> 1.0) addressable (2.8.5) public_suffix (>= 2.0.2, < 6.0) - base64 (0.1.1) + base64 (0.2.0) bcrypt (3.1.19) builder (3.2.4) coderay (1.1.3) @@ -119,14 +119,14 @@ GEM google-cloud-env (1.6.0) faraday (>= 0.17.3, < 3.0) google-cloud-errors (1.3.1) - google-protobuf (3.24.4) - google-protobuf (3.24.4-x86_64-darwin) - google-protobuf (3.24.4-x86_64-linux) + google-protobuf (3.25.0) + google-protobuf (3.25.0-x86_64-darwin) + google-protobuf (3.25.0-x86_64-linux) googleapis-common-protos (1.4.0) google-protobuf (~> 3.14) googleapis-common-protos-types (~> 1.2) grpc (~> 1.27) - googleapis-common-protos-types (1.9.0) + googleapis-common-protos-types (1.10.0) google-protobuf (~> 3.18) googleauth (1.8.1) faraday (>= 0.17.3, < 3.a) @@ -154,8 +154,8 @@ GEM jwt (2.7.1) launchy (2.5.2) addressable (~> 2.8) - libxml-ruby (4.1.1) - logger (1.5.3) + libxml-ruby (4.1.2) + logger (1.6.0) macaddr (1.7.2) systemu (~> 2.6.5) mail (2.6.6) From e8fa020e75bda0ea228089523e26e065d228eb3b Mon Sep 17 00:00:00 2001 From: mdorf Date: Tue, 14 Nov 2023 14:13:06 -0800 Subject: [PATCH 05/12] fixes to the analytics script and a new script to generate UA analytics for documentation --- Gemfile | 6 ++ Gemfile.lock | 35 ++++++-- bin/generate_ua_analytics_file.rb | 126 ++++++++++++++++++++++++++++ lib/ncbo_cron/ontology_analytics.rb | 49 ++++++----- 4 files changed, 190 insertions(+), 26 deletions(-) create mode 100755 bin/generate_ua_analytics_file.rb diff --git a/Gemfile b/Gemfile index a2c93e43..ea60eb54 100644 --- a/Gemfile +++ b/Gemfile @@ -3,6 +3,12 @@ source 'https://rubygems.org' gemspec gem 'ffi' + +# This is 
needed temporarily to pull the Google Universal Analytics (UA) +# data and store it in a file. See (bin/generate_ua_analytics_file.rb) +# The ability to pull this data from Google will cease on July 1, 2024 +gem "google-apis-analytics_v3" + gem 'google-analytics-data' gem 'mail', '2.6.6' gem 'multi_json' diff --git a/Gemfile.lock b/Gemfile.lock index 9e47a23c..de996c17 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -15,7 +15,7 @@ GIT GIT remote: https://github.com/ncbo/ncbo_annotator.git - revision: 067104ae94c0e9d058cfbf419364fbf03f34de43 + revision: ebbb7a3c28ecde49c261290bec34ab082490a271 branch: develop specs: ncbo_annotator (0.0.1) @@ -26,7 +26,7 @@ GIT GIT remote: https://github.com/ncbo/ontologies_linked_data.git - revision: ff10e5ff4103431da1aec3cbbaebc57547c0035c + revision: 5600020a8017cb4901e719f577032b0be6a14949 branch: develop specs: ontologies_linked_data (0.0.1) @@ -76,7 +76,7 @@ GEM multi_json (~> 1.0) addressable (2.8.5) public_suffix (>= 2.0.2, < 6.0) - base64 (0.1.1) + base64 (0.2.0) bcrypt (3.1.19) builder (3.2.4) coderay (1.1.3) @@ -84,6 +84,7 @@ GEM connection_pool (2.4.1) cube-ruby (0.0.3) dante (0.2.0) + declarative (0.0.20) docile (1.4.0) domain_name (0.5.20190701) unf (>= 0.0.5, < 1.0.0) @@ -113,6 +114,17 @@ GEM google-analytics-data-v1beta (0.9.0) gapic-common (>= 0.20.0, < 2.a) google-cloud-errors (~> 1.0) + google-apis-analytics_v3 (0.13.0) + google-apis-core (>= 0.11.0, < 2.a) + google-apis-core (0.11.2) + addressable (~> 2.5, >= 2.5.1) + googleauth (>= 0.16.2, < 2.a) + httpclient (>= 2.8.1, < 3.a) + mini_mime (~> 1.0) + representable (~> 3.0) + retriable (>= 2.0, < 4.a) + rexml + webrick google-cloud-core (1.6.0) google-cloud-env (~> 1.0) google-cloud-errors (~> 1.0) @@ -126,7 +138,7 @@ GEM google-protobuf (~> 3.14) googleapis-common-protos-types (~> 1.2) grpc (~> 1.27) - googleapis-common-protos-types (1.9.0) + googleapis-common-protos-types (1.10.0) google-protobuf (~> 3.18) googleauth (1.8.1) faraday (>= 0.17.3, < 3.a) @@ -147,6 +159,7 @@ GEM http-accept (1.7.0) http-cookie (1.0.5) domain_name (~> 0.5) + httpclient (2.8.3) i18n (0.9.5) concurrent-ruby (~> 1.0) json (2.6.3) @@ -154,8 +167,8 @@ GEM jwt (2.7.1) launchy (2.5.2) addressable (~> 2.8) - libxml-ruby (4.1.1) - logger (1.5.3) + libxml-ruby (4.1.2) + logger (1.6.0) macaddr (1.7.2) systemu (~> 2.6.5) mail (2.6.6) @@ -164,6 +177,7 @@ GEM mime-types (3.5.1) mime-types-data (~> 3.2015) mime-types-data (3.2023.1003) + mini_mime (1.1.5) minitest (4.7.5) mlanett-redis-lock (0.2.7) redis @@ -191,11 +205,16 @@ GEM redis-client (>= 0.17.0) redis-client (0.18.0) connection_pool + representable (3.2.0) + declarative (< 0.1.0) + trailblazer-option (>= 0.1.1, < 0.2.0) + uber (< 0.2.0) rest-client (2.1.0) http-accept (>= 1.7.0, < 2.0) http-cookie (>= 1.0.2, < 2.0) mime-types (>= 1.16, < 4.0) netrc (~> 0.8) + retriable (3.1.2) rexml (3.2.6) rsolr (2.5.0) builder (>= 2.1.2) @@ -224,13 +243,16 @@ GEM systemu (2.6.5) test-unit-minitest (0.9.1) minitest (~> 4.7) + trailblazer-option (0.1.2) tzinfo (2.0.6) concurrent-ruby (~> 1.0) + uber (0.1.0) unf (0.1.4) unf_ext unf_ext (0.0.8.2) uuid (2.3.9) macaddr (~> 1.0) + webrick (1.8.1) PLATFORMS ruby @@ -244,6 +266,7 @@ DEPENDENCIES ffi goo! 
google-analytics-data + google-apis-analytics_v3 mail (= 2.6.6) minitest (< 5.0) multi_json diff --git a/bin/generate_ua_analytics_file.rb b/bin/generate_ua_analytics_file.rb new file mode 100755 index 00000000..0a432a92 --- /dev/null +++ b/bin/generate_ua_analytics_file.rb @@ -0,0 +1,126 @@ +require 'logger' +require 'google/apis/analytics_v3' +require 'google/api_client/auth/key_utils' + +module NcboCron + module Models + + class OntologyAnalyticsUA + + def initialize(logger) + @logger = logger + end + + def run + redis = Redis.new(:host => NcboCron.settings.redis_host, :port => NcboCron.settings.redis_port) + ontology_analytics = fetch_ontology_analytics + File.open(NcboCron.settings.analytics_path_to_ua_data_file, 'w') do |f| + f.write(ontology_analytics.to_json) + end + end + + def fetch_ontology_analytics + google_client = authenticate_google + aggregated_results = Hash.new + start_year = Date.parse(NcboCron.settings.analytics_start_date).year || 2013 + ont_acronyms = LinkedData::Models::Ontology.where.include(:acronym).all.map {|o| o.acronym} + # ont_acronyms = ["NCIT", "ONTOMA", "CMPO", "AEO", "SNOMEDCT"] + filter_str = (NcboCron.settings.analytics_filter_str.nil? || NcboCron.settings.analytics_filter_str.empty?) ? "" : ";#{NcboCron.settings.analytics_filter_str}" + + ont_acronyms.each do |acronym| + max_results = 10000 + num_results = 10000 + start_index = 1 + results = nil + + loop do + results = google_client.get_ga_data( + ids = NcboCron.settings.analytics_profile_id, + start_date = NcboCron.settings.analytics_start_date, + end_date = Date.today.to_s, + metrics = 'ga:pageviews', + { + dimensions: 'ga:pagePath,ga:year,ga:month', + filters: "ga:pagePath=~^(\\/ontologies\\/#{acronym})(\\/?\\?{0}|\\/?\\?{1}.*)$#{filter_str}", + start_index: start_index, + max_results: max_results + } + ) + results.rows ||= [] + start_index += max_results + num_results = results.rows.length + @logger.info "Acronym: #{acronym}, Results: #{num_results}, Start Index: #{start_index}" + @logger.flush + + results.rows.each do |row| + if aggregated_results.has_key?(acronym) + # year + if aggregated_results[acronym].has_key?(row[1].to_i) + # month + if aggregated_results[acronym][row[1].to_i].has_key?(row[2].to_i) + aggregated_results[acronym][row[1].to_i][row[2].to_i] += row[3].to_i + else + aggregated_results[acronym][row[1].to_i][row[2].to_i] = row[3].to_i + end + else + aggregated_results[acronym][row[1].to_i] = Hash.new + aggregated_results[acronym][row[1].to_i][row[2].to_i] = row[3].to_i + end + else + aggregated_results[acronym] = Hash.new + aggregated_results[acronym][row[1].to_i] = Hash.new + aggregated_results[acronym][row[1].to_i][row[2].to_i] = row[3].to_i + end + end + + if num_results < max_results + # fill up non existent years + (start_year..Date.today.year).each do |y| + aggregated_results[acronym] = Hash.new if aggregated_results[acronym].nil? + aggregated_results[acronym][y] = Hash.new unless aggregated_results[acronym].has_key?(y) + end + # fill up non existent months with zeros + (1..12).each { |n| aggregated_results[acronym].values.each { |v| v[n] = 0 unless v.has_key?(n) } } + break + end + end + end + + @logger.info "Completed Universal Analytics pull..." 
+        @logger.flush
+
+        aggregated_results
+      end
+
+      def authenticate_google
+        Google::Apis::ClientOptions.default.application_name = NcboCron.settings.analytics_app_name
+        Google::Apis::ClientOptions.default.application_version = NcboCron.settings.analytics_app_version
+        # enable google api call retries in order to
+        # mitigate analytics processing failure due to occasional google api timeouts and other outages
+        Google::Apis::RequestOptions.default.retries = 5
+        # uncomment to enable logging for debugging purposes
+        # Google::Apis.logger.level = Logger::DEBUG
+        # Google::Apis.logger = @logger
+        client = Google::Apis::AnalyticsV3::AnalyticsService.new
+        key = Google::APIClient::KeyUtils::load_from_pkcs12(NcboCron.settings.analytics_path_to_ua_key_file, 'notasecret')
+        client.authorization = Signet::OAuth2::Client.new(
+          :token_credential_uri => 'https://accounts.google.com/o/oauth2/token',
+          :audience => 'https://accounts.google.com/o/oauth2/token',
+          :scope => 'https://www.googleapis.com/auth/analytics.readonly',
+          :issuer => NcboCron.settings.analytics_service_account_email_address,
+          :signing_key => key
+        ).tap { |auth| auth.fetch_access_token! }
+        client
+      end
+    end
+  end
+end
+
+require 'ontologies_linked_data'
+require 'goo'
+require 'ncbo_annotator'
+require 'ncbo_cron/config'
+require_relative '../config/config'
+ontology_analytics_log_path = File.join("logs", "ontology-analytics-ua.log")
+ontology_analytics_logger = Logger.new(ontology_analytics_log_path)
+NcboCron::Models::OntologyAnalyticsUA.new(ontology_analytics_logger).run
diff --git a/lib/ncbo_cron/ontology_analytics.rb b/lib/ncbo_cron/ontology_analytics.rb
index 3a91b813..c5a4de00 100644
--- a/lib/ncbo_cron/ontology_analytics.rb
+++ b/lib/ncbo_cron/ontology_analytics.rb
@@ -38,7 +38,6 @@ def fetch_ontology_analytics
       @logger.flush
       ont_acronyms = LinkedData::Models::Ontology.where.include(:acronym).all.map {|o| o.acronym}
       # ont_acronyms = ["NCIT", "SNOMEDCT", "MEDDRA"]
-      @logger.info "Authenticating with the Google Analytics Endpoint..."
      @logger.flush
       google_client = authenticate_google
 
@@ -137,39 +136,49 @@ def fetch_ontology_analytics
             break if num_results < max_results
           end # loop
         end # ont_acronyms
-        @logger.info "Refresh complete, merging GA4 and UA data..."
-        @logger.flush
-        full_data = merge_ga4_ua_data(aggregated_results)
-        @logger.info "Merged"
+        @logger.info "Refresh complete"
         @logger.flush
+        full_data = merge_and_fill_missing_data(aggregated_results)
       end # Benchmark.realtime
       @logger.info "Completed Google Analytics refresh in #{(time/60).round(1)} minutes."
@logger.flush full_data end - def merge_ga4_ua_data(ga4_data) - ua_data_file = File.read(NcboCron.settings.analytics_path_to_ua_data_file) - ua_data = JSON.parse(ua_data_file) - ua_ga4_intersecting_year = Date.parse(GA4_START_DATE).year.to_s - ua_ga4_intersecting_month = Date.parse(GA4_START_DATE).month.to_s - - # add up hits for June of 2023 (the only intersecting month between UA and GA4) - ua_data.each do |acronym, _| - if ga4_data.has_key?(acronym) - if ga4_data[acronym][ua_ga4_intersecting_year].has_key?(ua_ga4_intersecting_month) - ua_data[acronym][ua_ga4_intersecting_year][ua_ga4_intersecting_month] += - ga4_data[acronym][ua_ga4_intersecting_year][ua_ga4_intersecting_month] - # delete data for June of 2023 from ga4_data to avoid overwriting when merging - ga4_data[acronym][ua_ga4_intersecting_year].delete(ua_ga4_intersecting_month) + def merge_and_fill_missing_data(ga4_data) + ua_data = {} + + if File.exists?(NcboCron.settings.analytics_path_to_ua_data_file) && + !File.zero?(NcboCron.settings.analytics_path_to_ua_data_file) + @logger.info "Merging GA4 and UA data..." + @logger.flush + ua_data_file = File.read(NcboCron.settings.analytics_path_to_ua_data_file) + ua_data = JSON.parse(ua_data_file) + ua_ga4_intersecting_year = Date.parse(GA4_START_DATE).year.to_s + ua_ga4_intersecting_month = Date.parse(GA4_START_DATE).month.to_s + + # add up hits for June of 2023 (the only intersecting month between UA and GA4) + ua_data.each do |acronym, _| + if ga4_data.has_key?(acronym) + if ga4_data[acronym][ua_ga4_intersecting_year].has_key?(ua_ga4_intersecting_month) + ua_data[acronym][ua_ga4_intersecting_year][ua_ga4_intersecting_month] += + ga4_data[acronym][ua_ga4_intersecting_year][ua_ga4_intersecting_month] + # delete data for June of 2023 from ga4_data to avoid overwriting when merging + ga4_data[acronym][ua_ga4_intersecting_year].delete(ua_ga4_intersecting_month) + end end end end + # merge ua and ga4 data merged_data = ua_data.deep_merge(ga4_data) # fill missing years and months + @logger.info "Filling in missing years data..." + @logger.flush fill_missing_data(merged_data) # sort acronyms, years and months + @logger.info "Sorting final data..." 
+ @logger.flush sort_ga_data(merged_data) end @@ -221,4 +230,4 @@ def deep_merge(second) # # ontology_analytics_logger = Logger.new(ontology_analytics_log_path) # ontology_analytics_logger = Logger.new(STDOUT) # NcboCron::Models::OntologyAnalytics.new(ontology_analytics_logger).run -# # ./bin/ncbo_cron --disable-processing true --disable-pull true --disable-flush true --disable-warmq true --disable-ontologies-report true --disable-mapping-counts true --disable-spam-deletion true --ontology-analytics '14 * * * *' +# ./bin/ncbo_cron --disable-processing true --disable-pull true --disable-flush true --disable-warmq true --disable-ontologies-report true --disable-mapping-counts true --disable-spam-deletion true --ontology-analytics '14 * * * *' From c2a72dbc223cd003c0cbc96f3fb2d910b7b0f57a Mon Sep 17 00:00:00 2001 From: mdorf Date: Sun, 10 Dec 2023 09:23:35 -0800 Subject: [PATCH 06/12] Gemfile.lock update --- Gemfile.lock | 72 ++++++++++++++++++++++++++++------------------------ 1 file changed, 39 insertions(+), 33 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index 74482dd3..8ca29047 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -15,7 +15,7 @@ GIT GIT remote: https://github.com/ncbo/ncbo_annotator.git - revision: ebbb7a3c28ecde49c261290bec34ab082490a271 + revision: 067104ae94c0e9d058cfbf419364fbf03f34de43 branch: develop specs: ncbo_annotator (0.0.1) @@ -26,7 +26,7 @@ GIT GIT remote: https://github.com/ncbo/ontologies_linked_data.git - revision: 5600020a8017cb4901e719f577032b0be6a14949 + revision: 9487c7f73e68abab097af523d42c1d2e106e614b branch: develop specs: ontologies_linked_data (0.0.1) @@ -71,13 +71,17 @@ PATH GEM remote: https://rubygems.org/ specs: - activesupport (3.2.22.5) - i18n (~> 0.6, >= 0.6.4) - multi_json (~> 1.0) - addressable (2.8.5) + activesupport (4.0.13) + i18n (~> 0.6, >= 0.6.9) + minitest (~> 4.2) + multi_json (~> 1.3) + thread_safe (~> 0.1) + tzinfo (~> 0.3.37) + addressable (2.8.6) public_suffix (>= 2.0.2, < 6.0) base64 (0.2.0) - bcrypt (3.1.19) + bcrypt (3.1.20) + bigdecimal (3.1.4) builder (3.2.4) coderay (1.1.3) concurrent-ruby (1.2.2) @@ -91,7 +95,7 @@ GEM htmlentities (~> 4.3.3) launchy (~> 2.1) mail (~> 2.6) - faraday (2.7.11) + faraday (2.7.12) base64 faraday-net_http (>= 2.0, < 3.1) ruby2_keywords (>= 0.0.4) @@ -124,35 +128,36 @@ GEM retriable (>= 2.0, < 4.a) rexml webrick - google-cloud-core (1.6.0) - google-cloud-env (~> 1.0) + google-cloud-core (1.6.1) + google-cloud-env (>= 1.0, < 3.a) google-cloud-errors (~> 1.0) - google-cloud-env (1.6.0) - faraday (>= 0.17.3, < 3.0) + google-cloud-env (2.0.1) + faraday (>= 1.0, < 3.a) google-cloud-errors (1.3.1) - google-protobuf (3.25.0) - google-protobuf (3.25.0-x86_64-darwin) - google-protobuf (3.25.0-x86_64-linux) + google-protobuf (3.25.1) + google-protobuf (3.25.1-x86_64-darwin) + google-protobuf (3.25.1-x86_64-linux) googleapis-common-protos (1.4.0) google-protobuf (~> 3.14) googleapis-common-protos-types (~> 1.2) grpc (~> 1.27) - googleapis-common-protos-types (1.10.0) + googleapis-common-protos-types (1.11.0) google-protobuf (~> 3.18) - googleauth (1.8.1) - faraday (>= 0.17.3, < 3.a) + googleauth (1.9.0) + faraday (>= 1.0, < 3.a) + google-cloud-env (~> 2.0, >= 2.0.1) jwt (>= 1.4, < 3.0) multi_json (~> 1.11) os (>= 0.9, < 2.0) signet (>= 0.16, < 2.a) - grpc (1.59.2) - google-protobuf (~> 3.24) + grpc (1.60.0) + google-protobuf (~> 3.25) googleapis-common-protos-types (~> 1.0) - grpc (1.59.2-x86_64-darwin) - google-protobuf (~> 3.24) + grpc (1.60.0-x86_64-darwin) + google-protobuf (~> 3.25) 
googleapis-common-protos-types (~> 1.0) - grpc (1.59.2-x86_64-linux) - google-protobuf (~> 3.24) + grpc (1.60.0-x86_64-linux) + google-protobuf (~> 3.25) googleapis-common-protos-types (~> 1.0) htmlentities (4.3.4) http-accept (1.7.0) @@ -161,8 +166,8 @@ GEM httpclient (2.8.3) i18n (0.9.5) concurrent-ruby (~> 1.0) - json (2.6.3) - json_pure (2.6.3) + json (2.7.1) + json_pure (2.7.1) jwt (2.7.1) launchy (2.5.2) addressable (~> 2.8) @@ -175,7 +180,7 @@ GEM method_source (1.0.0) mime-types (3.5.1) mime-types-data (~> 3.2015) - mime-types-data (3.2023.1003) + mime-types-data (3.2023.1205) mini_mime (1.1.5) minitest (4.7.5) mlanett-redis-lock (0.2.7) @@ -183,7 +188,8 @@ GEM multi_json (1.15.0) net-http-persistent (2.9.4) netrc (0.11.0) - oj (3.16.1) + oj (3.16.2) + bigdecimal (~> 3.1) omni_logger (0.1.4) logger os (1.1.4) @@ -193,7 +199,7 @@ GEM pry (0.14.2) coderay (~> 1.1) method_source (~> 1.0) - public_suffix (5.0.3) + public_suffix (5.0.4) rack (3.0.8) rack-test (2.1.0) rack (>= 1.3) @@ -202,7 +208,7 @@ GEM addressable (>= 2.2) redis (5.0.8) redis-client (>= 0.17.0) - redis-client (0.18.0) + redis-client (0.19.0) connection_pool representable (3.2.0) declarative (< 0.1.0) @@ -242,9 +248,9 @@ GEM systemu (2.6.5) test-unit-minitest (0.9.1) minitest (~> 4.7) + thread_safe (0.3.6) trailblazer-option (0.1.2) - tzinfo (2.0.6) - concurrent-ruby (~> 1.0) + tzinfo (0.3.62) uber (0.1.0) uuid (2.3.9) macaddr (~> 1.0) @@ -283,4 +289,4 @@ DEPENDENCIES test-unit-minitest BUNDLED WITH - 2.3.15 + 2.4.22 From ab43cdc82e49e145a759269c7b74d8d61b213176 Mon Sep 17 00:00:00 2001 From: mdorf Date: Sun, 10 Dec 2023 12:18:32 -0800 Subject: [PATCH 07/12] Gemfile.lock update --- Gemfile.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Gemfile.lock b/Gemfile.lock index 8ca29047..511c10a4 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -26,7 +26,7 @@ GIT GIT remote: https://github.com/ncbo/ontologies_linked_data.git - revision: 9487c7f73e68abab097af523d42c1d2e106e614b + revision: 809c54f56b1a4d30d8c2d49e9c005f07c2d6c596 branch: develop specs: ontologies_linked_data (0.0.1) From a659415dd42c0593394aa26982751eb88ad828c7 Mon Sep 17 00:00:00 2001 From: mdorf Date: Sat, 16 Dec 2023 17:39:14 -0800 Subject: [PATCH 08/12] implemented the first pass at bmir-radx/radx-project#37 --- Gemfile.lock | 57 +++--- bin/ncbo_ontology_pull | 2 +- lib/ncbo_cron/ontology_helper.rb | 185 ++++++++++++++++++++ lib/ncbo_cron/ontology_pull.rb | 147 +--------------- lib/ncbo_cron/ontology_rank.rb | 7 +- lib/ncbo_cron/ontology_submission_parser.rb | 51 +++--- test/test_case.rb | 6 +- test/test_ontology_pull.rb | 31 +++- 8 files changed, 277 insertions(+), 209 deletions(-) create mode 100644 lib/ncbo_cron/ontology_helper.rb diff --git a/Gemfile.lock b/Gemfile.lock index 74482dd3..ed375af7 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -15,7 +15,7 @@ GIT GIT remote: https://github.com/ncbo/ncbo_annotator.git - revision: ebbb7a3c28ecde49c261290bec34ab082490a271 + revision: d7ee80860a0eab9293af81083a0700d099c50263 branch: develop specs: ncbo_annotator (0.0.1) @@ -26,7 +26,7 @@ GIT GIT remote: https://github.com/ncbo/ontologies_linked_data.git - revision: 5600020a8017cb4901e719f577032b0be6a14949 + revision: 9487c7f73e68abab097af523d42c1d2e106e614b branch: develop specs: ontologies_linked_data (0.0.1) @@ -74,10 +74,11 @@ GEM activesupport (3.2.22.5) i18n (~> 0.6, >= 0.6.4) multi_json (~> 1.0) - addressable (2.8.5) + addressable (2.8.6) public_suffix (>= 2.0.2, < 6.0) base64 (0.2.0) - bcrypt (3.1.19) + bcrypt (3.1.20) + bigdecimal 
(3.1.4) builder (3.2.4) coderay (1.1.3) concurrent-ruby (1.2.2) @@ -91,7 +92,7 @@ GEM htmlentities (~> 4.3.3) launchy (~> 2.1) mail (~> 2.6) - faraday (2.7.11) + faraday (2.7.12) base64 faraday-net_http (>= 2.0, < 3.1) ruby2_keywords (>= 0.0.4) @@ -124,35 +125,36 @@ GEM retriable (>= 2.0, < 4.a) rexml webrick - google-cloud-core (1.6.0) - google-cloud-env (~> 1.0) + google-cloud-core (1.6.1) + google-cloud-env (>= 1.0, < 3.a) google-cloud-errors (~> 1.0) - google-cloud-env (1.6.0) - faraday (>= 0.17.3, < 3.0) + google-cloud-env (2.1.0) + faraday (>= 1.0, < 3.a) google-cloud-errors (1.3.1) - google-protobuf (3.25.0) - google-protobuf (3.25.0-x86_64-darwin) - google-protobuf (3.25.0-x86_64-linux) + google-protobuf (3.25.1) + google-protobuf (3.25.1-x86_64-darwin) + google-protobuf (3.25.1-x86_64-linux) googleapis-common-protos (1.4.0) google-protobuf (~> 3.14) googleapis-common-protos-types (~> 1.2) grpc (~> 1.27) - googleapis-common-protos-types (1.10.0) + googleapis-common-protos-types (1.11.0) google-protobuf (~> 3.18) - googleauth (1.8.1) - faraday (>= 0.17.3, < 3.a) + googleauth (1.9.1) + faraday (>= 1.0, < 3.a) + google-cloud-env (~> 2.1) jwt (>= 1.4, < 3.0) multi_json (~> 1.11) os (>= 0.9, < 2.0) signet (>= 0.16, < 2.a) - grpc (1.59.2) - google-protobuf (~> 3.24) + grpc (1.60.0) + google-protobuf (~> 3.25) googleapis-common-protos-types (~> 1.0) - grpc (1.59.2-x86_64-darwin) - google-protobuf (~> 3.24) + grpc (1.60.0-x86_64-darwin) + google-protobuf (~> 3.25) googleapis-common-protos-types (~> 1.0) - grpc (1.59.2-x86_64-linux) - google-protobuf (~> 3.24) + grpc (1.60.0-x86_64-linux) + google-protobuf (~> 3.25) googleapis-common-protos-types (~> 1.0) htmlentities (4.3.4) http-accept (1.7.0) @@ -161,8 +163,8 @@ GEM httpclient (2.8.3) i18n (0.9.5) concurrent-ruby (~> 1.0) - json (2.6.3) - json_pure (2.6.3) + json (2.7.1) + json_pure (2.7.1) jwt (2.7.1) launchy (2.5.2) addressable (~> 2.8) @@ -175,7 +177,7 @@ GEM method_source (1.0.0) mime-types (3.5.1) mime-types-data (~> 3.2015) - mime-types-data (3.2023.1003) + mime-types-data (3.2023.1205) mini_mime (1.1.5) minitest (4.7.5) mlanett-redis-lock (0.2.7) @@ -183,7 +185,8 @@ GEM multi_json (1.15.0) net-http-persistent (2.9.4) netrc (0.11.0) - oj (3.16.1) + oj (3.16.3) + bigdecimal (>= 3.0) omni_logger (0.1.4) logger os (1.1.4) @@ -193,7 +196,7 @@ GEM pry (0.14.2) coderay (~> 1.1) method_source (~> 1.0) - public_suffix (5.0.3) + public_suffix (5.0.4) rack (3.0.8) rack-test (2.1.0) rack (>= 1.3) @@ -202,7 +205,7 @@ GEM addressable (>= 2.2) redis (5.0.8) redis-client (>= 0.17.0) - redis-client (0.18.0) + redis-client (0.19.0) connection_pool representable (3.2.0) declarative (< 0.1.0) diff --git a/bin/ncbo_ontology_pull b/bin/ncbo_ontology_pull index a017e4d7..be3e08de 100755 --- a/bin/ncbo_ontology_pull +++ b/bin/ncbo_ontology_pull @@ -32,7 +32,7 @@ logger = Logger.new($stdout) logger.info "Starting ncbo pull"; logger.flush puller = NcboCron::Models::OntologyPull.new begin - puller.do_ontology_pull(ontology_acronym, logger: logger , enable_pull_umls:true ) + puller.do_ontology_pull(ontology_acronym, logger: logger, enable_pull_umls: true) rescue StandardError => e logger.error e.message logger.flush diff --git a/lib/ncbo_cron/ontology_helper.rb b/lib/ncbo_cron/ontology_helper.rb new file mode 100644 index 00000000..42534768 --- /dev/null +++ b/lib/ncbo_cron/ontology_helper.rb @@ -0,0 +1,185 @@ +require 'logger' + +module NcboCron + module Helpers + module OntologyHelper + + REDIS_SUBMISSION_ID_PREFIX = "sub:" + PROCESS_QUEUE_HOLDER = 
"parseQueue" + PROCESS_ACTIONS = { + :process_rdf => true, + :generate_labels => true, + :index_search => true, + :index_properties => true, + :run_metrics => true, + :process_annotator => true, + :diff => true, + :remote_pull => false + } + + class RemoteFileException < StandardError + attr_reader :submission + + def initialize(submission) + super + @submission = submission + end + end + + def self.do_ontology_pull(ontology_acronym, enable_pull_umls = false, umls_download_url = '', logger = nil, + add_to_queue = true) + logger ||= Logger.new($stdout) + ont = LinkedData::Models::Ontology.find(ontology_acronym).include(:acronym).first + new_submission = nil + raise StandardError, "Ontology #{ontology_acronym} not found" if ont.nil? + + last = ont.latest_submission(status: :any) + raise StandardError, "No submission found for #{ontology_acronym}" if last.nil? + + last.bring(:hasOntologyLanguage) if last.bring?(:hasOntologyLanguage) + if !enable_pull_umls && last.hasOntologyLanguage.umls? + raise StandardError, "Pull umls not enabled" + end + + last.bring(:pullLocation) if last.bring?(:pullLocation) + raise StandardError, "#{ontology_acronym} has no pullLocation" if last.pullLocation.nil? + + last.bring(:uploadFilePath) if last.bring?(:uploadFilePath) + + if last.hasOntologyLanguage.umls? && umls_download_url && !umls_download_url.empty? + last.pullLocation = RDF::URI.new(umls_download_url + last.pullLocation.split("/")[-1]) + logger.info("Using alternative download for umls #{last.pullLocation.to_s}") + logger.flush + end + + if last.remote_file_exists?(last.pullLocation.to_s) + logger.info "Checking download for #{ont.acronym}" + logger.info "Location: #{last.pullLocation.to_s}"; logger.flush + file, filename = last.download_ontology_file + file, md5local, md5remote, new_file_exists = self.new_file_exists?(file, last) + + if new_file_exists + logger.info "New file found for #{ont.acronym}\nold: #{md5local}\nnew: #{md5remote}" + logger.flush() + new_submission = self.create_submission(ont, last, file, filename, logger, add_to_queue) + else + logger.info "There is no new file found for #{ont.acronym}" + logger.flush() + end + + file.close + new_submission + else + raise self::RemoteFileException.new(last) + end + end + + def self.create_submission(ont, sub, file, filename, logger = nil, add_to_queue = true, new_version = nil, + new_released = nil) + logger ||= Kernel.const_defined?("LOGGER") ? Kernel.const_get("LOGGER") : Logger.new(STDOUT) + new_sub = LinkedData::Models::OntologySubmission.new + + sub.bring_remaining + sub.loaded_attributes.each do |attr| + new_sub.send("#{attr}=", sub.send(attr)) + end + + submission_id = ont.next_submission_id() + new_sub.submissionId = submission_id + file_location = LinkedData::Models::OntologySubmission.copy_file_repository(ont.acronym, submission_id, file, filename) + new_sub.uploadFilePath = file_location + + unless new_version.nil? + new_sub.version = new_version + end + + if new_released.nil? 
+ new_sub.released = DateTime.now + else + new_sub.released = DateTime.parse(new_released) + end + new_sub.submissionStatus = nil + new_sub.creationDate = nil + new_sub.missingImports = nil + new_sub.metrics = nil + full_file_path = File.expand_path(file_location) + + # check if OWLAPI is able to parse the file before creating a new submission + owlapi = LinkedData::Parser::OWLAPICommand.new( + full_file_path, + File.expand_path(new_sub.data_folder.to_s), + logger: logger) + owlapi.disable_reasoner + parsable = true + + begin + owlapi.parse + rescue Exception => e + logger.error("The new file for ontology #{ont.acronym}, submission id: #{submission_id} did not clear OWLAPI: #{e.class}: #{e.message}\n#{e.backtrace.join("\n\t")}") + logger.error("A new submission has NOT been created.") + logger.flush + parsable = false + end + + if parsable + if new_sub.valid? + new_sub.save() + + if add_to_queue + self.queue_submission(new_sub, { all: true }) + logger.info("OntologyPull created a new submission (#{submission_id}) for ontology #{ont.acronym}") + end + else + logger.error("Unable to create a new submission for ontology #{ont.acronym} with id #{submission_id}: #{new_sub.errors}") + logger.flush + end + else + # delete the bad file + File.delete full_file_path if File.exist? full_file_path + end + new_sub + end + + def self.queue_submission(submission, actions={:all => true}) + redis = Redis.new(:host => NcboCron.settings.redis_host, :port => NcboCron.settings.redis_port) + + if actions[:all] + actions = PROCESS_ACTIONS.dup + else + actions.delete_if {|k, v| !PROCESS_ACTIONS.has_key?(k)} + end + actionStr = MultiJson.dump(actions) + redis.hset(PROCESS_QUEUE_HOLDER, get_prefixed_id(submission.id), actionStr) unless actions.empty? + end + + def self.get_prefixed_id(id) + "#{REDIS_SUBMISSION_ID_PREFIX}#{id}" + end + + def self.last_fragment_of_uri(uri) + uri.to_s.split("/")[-1] + end + + def self.acronym_from_submission_id(submissionID) + submissionID.to_s.split("/")[-3] + end + + def self.new_file_exists?(file, last) + file = File.open(file.path, "rb") + remote_contents = file.read + md5remote = Digest::MD5.hexdigest(remote_contents) + + if last.uploadFilePath && File.exist?(last.uploadFilePath) + file_contents = open(last.uploadFilePath) { |f| f.read } + md5local = Digest::MD5.hexdigest(file_contents) + new_file_exists = (not md5remote.eql?(md5local)) + else + # There is no existing file, so let's create a submission with the downloaded one + new_file_exists = true + end + return file, md5local, md5remote, new_file_exists + end + + end + end +end \ No newline at end of file diff --git a/lib/ncbo_cron/ontology_pull.rb b/lib/ncbo_cron/ontology_pull.rb index 7aa9fc23..c554c95e 100644 --- a/lib/ncbo_cron/ontology_pull.rb +++ b/lib/ncbo_cron/ontology_pull.rb @@ -1,22 +1,11 @@ -require 'open-uri' require 'logger' -require_relative 'ontology_submission_parser' +require_relative 'ontology_helper' module NcboCron module Models class OntologyPull - class RemoteFileException < StandardError - attr_reader :submission - - def initialize(submission) - super - @submission = submission - end - end - - def do_remote_ontology_pull(options = {}) logger = options[:logger] || Logger.new($stdout) logger.info "UMLS auto-pull #{options[:enable_pull_umls] == true}" @@ -33,11 +22,11 @@ def do_remote_ontology_pull(options = {}) ontologies.each do |ont| begin begin - new_submissions << self.do_ontology_pull(ont.acronym, + new_submissions << NcboCron::Helpers::OntologyHelper.do_ontology_pull(ont.acronym, enable_pull_umls: 
enable_pull_umls, umls_download_url: umls_download_url, - logger: logger) - rescue RemoteFileException => error + logger: logger, add_to_queue: true) + rescue NcboCron::Helpers::OntologyHelper::RemoteFileException => error logger.info "RemoteFileException: No submission file at pull location #{error.submission.pullLocation.to_s} for ontology #{ont.acronym}." logger.flush LinkedData::Utils::Notifications.remote_ontology_pull(error.submission) @@ -58,136 +47,8 @@ def do_remote_ontology_pull(options = {}) new_submissions end - def do_ontology_pull(ontology_acronym, enable_pull_umls: false, umls_download_url: '', logger: nil) - ont = LinkedData::Models::Ontology.find(ontology_acronym).include(:acronym).first - new_submission = nil - raise StandardError, "Ontology #{ontology_acronym} not found" if ont.nil? - - last = ont.latest_submission(status: :any) - raise StandardError, "No submission found for #{ontology_acronym}" if last.nil? - - last.bring(:hasOntologyLanguage) if last.bring?(:hasOntologyLanguage) - if !enable_pull_umls && last.hasOntologyLanguage.umls? - raise StandardError, "Pull umls not enabled" - end - - last.bring(:pullLocation) if last.bring?(:pullLocation) - raise StandardError, "#{ontology_acronym} has no pullLocation" if last.pullLocation.nil? - - last.bring(:uploadFilePath) if last.bring?(:uploadFilePath) - - if last.hasOntologyLanguage.umls? && umls_download_url - last.pullLocation = RDF::URI.new(umls_download_url + last.pullLocation.split("/")[-1]) - logger.info("Using alternative download for umls #{last.pullLocation.to_s}") - logger.flush - end - - if last.remote_file_exists?(last.pullLocation.to_s) - logger.info "Checking download for #{ont.acronym}" - logger.info "Location: #{last.pullLocation.to_s}"; logger.flush - file, filename = last.download_ontology_file - file, md5local, md5remote, new_file_exists = new_file_exists?(file, last) - - if new_file_exists - logger.info "New file found for #{ont.acronym}\nold: #{md5local}\nnew: #{md5remote}" - logger.flush() - new_submission = create_submission(ont, last, file, filename, logger) - else - logger.info "There is no new file found for #{ont.acronym}" - logger.flush() - end - - file.close - new_submission - else - raise RemoteFileException.new(last) - end - end - - def create_submission(ont, sub, file, filename, logger = nil, - add_to_pull = true, new_version = nil, new_released = nil) - logger ||= Kernel.const_defined?("LOGGER") ? Kernel.const_get("LOGGER") : Logger.new(STDOUT) - new_sub = LinkedData::Models::OntologySubmission.new - - sub.bring_remaining - sub.loaded_attributes.each do |attr| - new_sub.send("#{attr}=", sub.send(attr)) - end - - submission_id = ont.next_submission_id() - new_sub.submissionId = submission_id - file_location = LinkedData::Models::OntologySubmission.copy_file_repository(ont.acronym, submission_id, file, filename) - new_sub.uploadFilePath = file_location - unless new_version.nil? - new_sub.version = new_version - end - if new_released.nil? 
- new_sub.released = DateTime.now - else - new_sub.released = DateTime.parse(new_released) - end - new_sub.submissionStatus = nil - new_sub.creationDate = nil - new_sub.missingImports = nil - new_sub.metrics = nil - full_file_path = File.expand_path(file_location) - - # check if OWLAPI is able to parse the file before creating a new submission - owlapi = LinkedData::Parser::OWLAPICommand.new( - full_file_path, - File.expand_path(new_sub.data_folder.to_s), - logger: logger) - owlapi.disable_reasoner - parsable = true - - begin - owlapi.parse - rescue Exception => e - logger.error("The new file for ontology #{ont.acronym}, submission id: #{submission_id} did not clear OWLAPI: #{e.class}: #{e.message}\n#{e.backtrace.join("\n\t")}") - logger.error("A new submission has NOT been created.") - logger.flush - parsable = false - end - - if parsable - if new_sub.valid? - new_sub.save() - - if add_to_pull - submission_queue = NcboCron::Models::OntologySubmissionParser.new - submission_queue.queue_submission(new_sub, { all: true }) - logger.info("OntologyPull created a new submission (#{submission_id}) for ontology #{ont.acronym}") - end - else - logger.error("Unable to create a new submission in OntologyPull: #{new_sub.errors}") - logger.flush - end - else - # delete the bad file - File.delete full_file_path if File.exist? full_file_path - end - new_sub - end - - private - def new_file_exists?(file, last) - file = File.open(file.path, "rb") - remote_contents = file.read - md5remote = Digest::MD5.hexdigest(remote_contents) - - if last.uploadFilePath && File.exist?(last.uploadFilePath) - file_contents = open(last.uploadFilePath) { |f| f.read } - md5local = Digest::MD5.hexdigest(file_contents) - new_file_exists = (not md5remote.eql?(md5local)) - else - # There is no existing file, so let's create a submission with the downloaded one - new_file_exists = true - end - return file, md5local, md5remote, new_file_exists - end - def redis_goo Redis.new(host: LinkedData.settings.goo_redis_host, port: LinkedData.settings.goo_redis_port, timeout: 30) end diff --git a/lib/ncbo_cron/ontology_rank.rb b/lib/ncbo_cron/ontology_rank.rb index b60c2740..64de8844 100644 --- a/lib/ncbo_cron/ontology_rank.rb +++ b/lib/ncbo_cron/ontology_rank.rb @@ -1,5 +1,6 @@ require 'logger' require 'benchmark' +require_relative 'ontology_helper' module NcboCron module Models @@ -66,7 +67,7 @@ def umls_scores(ontologies) ontologies.each do |ont| if ont.group && !ont.group.empty? - umls_gr = ont.group.select {|gr| acronym_from_id(gr.id.to_s).include?('UMLS')} + umls_gr = ont.group.select {|gr| NcboCron::Helpers::OntologyHelper.last_fragment_of_uri(gr.id.to_s).include?('UMLS')} scores[ont.acronym] = umls_gr.empty? ? 
0 : 1 else scores[ont.acronym] = 0 @@ -75,10 +76,6 @@ def umls_scores(ontologies) scores end - def acronym_from_id(id) - id.to_s.split("/")[-1] - end - def normalize(x, xmin, xmax, ymin, ymax) xrange = xmax - xmin yrange = ymax - ymin diff --git a/lib/ncbo_cron/ontology_submission_parser.rb b/lib/ncbo_cron/ontology_submission_parser.rb index 34c53930..f493eced 100644 --- a/lib/ncbo_cron/ontology_submission_parser.rb +++ b/lib/ncbo_cron/ontology_submission_parser.rb @@ -1,39 +1,22 @@ require 'multi_json' +require_relative 'ontology_helper' module NcboCron module Models class OntologySubmissionParser - QUEUE_HOLDER = "parseQueue" - IDPREFIX = "sub:" - - ACTIONS = { - :process_rdf => true, - :generate_labels => true, - :index_search => true, - :index_properties => true, - :run_metrics => true, - :process_annotator => true, - :diff => true - } + QUEUE_HOLDER = NcboCron::Helpers::OntologyHelper::PROCESS_QUEUE_HOLDER + ACTIONS = NcboCron::Helpers::OntologyHelper::PROCESS_ACTIONS def initialize() end - def queue_submission(submission, actions={:all => true}) - redis = Redis.new(:host => NcboCron.settings.redis_host, :port => NcboCron.settings.redis_port) - - if actions[:all] - actions = ACTIONS.dup - else - actions.delete_if {|k, v| !ACTIONS.has_key?(k)} - end - actionStr = MultiJson.dump(actions) - redis.hset(QUEUE_HOLDER, get_prefixed_id(submission.id), actionStr) unless actions.empty? + def queue_submission(submission, actions={ :all => true }) + NcboCron::Helpers::OntologyHelper.queue_submission(submission, actions) end - def process_queue_submissions(options = {}) + def process_queue_submissions(options={}) logger = options[:logger] logger ||= Kernel.const_defined?("LOGGER") ? Kernel.const_get("LOGGER") : Logger.new(STDOUT) redis = Redis.new(:host => NcboCron.settings.redis_host, :port => NcboCron.settings.redis_port) @@ -44,6 +27,18 @@ def process_queue_submissions(options = {}) realKey = process_data[:key] key = process_data[:redis_key] redis.hdel(QUEUE_HOLDER, key) + + # if :remote_pull is one of the actions, pull the ontology and halt if no new submission is found + if actions.key?(:remote_pull) && actions[:remote_pull] + acronym = NcboCron::Helpers::OntologyHelper.acronym_from_submission_id(realKey) + new_submission = NcboCron::Helpers::OntologyHelper.do_ontology_pull(acronym, enable_pull_umls: false, + umls_download_url: '', logger: logger, + add_to_queue: false) + return unless new_submission + realKey = new_submission.id.to_s + actions.delete(:remote_pull) + end + begin process_submission(logger, realKey, actions) rescue Exception => e @@ -56,7 +51,7 @@ def process_queue_submissions(options = {}) def queued_items(redis, logger=nil) logger ||= Kernel.const_defined?("LOGGER") ? 
Kernel.const_get("LOGGER") : Logger.new(STDOUT) all = redis.hgetall(QUEUE_HOLDER) - prefix_remove = Regexp.new(/^#{IDPREFIX}/) + prefix_remove = Regexp.new(/^#{NcboCron::Helpers::OntologyHelper::REDIS_SUBMISSION_ID_PREFIX}/) items = [] all.each do |key, val| begin @@ -76,10 +71,6 @@ def queued_items(redis, logger=nil) items end - def get_prefixed_id(id) - "#{IDPREFIX}#{id}" - end - def zombie_classes_graphs query = "SELECT DISTINCT ?g WHERE { GRAPH ?g { ?s ?p ?o }}" class_graphs = [] @@ -191,6 +182,10 @@ def process_submission(logger, submission_id, actions=ACTIONS) end end + def get_prefixed_id(id) + NcboCron::Helpers::OntologyHelper.get_prefixed_id(id) + end + private def archive_old_submissions(logger, sub) diff --git a/test/test_case.rb b/test/test_case.rb index 5f164ecd..75bb0454 100644 --- a/test/test_case.rb +++ b/test/test_case.rb @@ -56,7 +56,7 @@ def count_pattern(pattern) return 0 end - def backend_4s_delete + def backend_triplestore_delete raise StandardError, 'Too many triples in KB, does not seem right to run tests' unless count_pattern('?s ?p ?o') < 400000 @@ -89,7 +89,7 @@ def _run_suites(suites, type) end def _run_suite(suite, type) - backend_4s_delete + backend_triplestore_delete suite.before_suite if suite.respond_to?(:before_suite) super(suite, type) rescue Exception => e @@ -98,7 +98,7 @@ def _run_suite(suite, type) puts 'Traced from:' raise e ensure - backend_4s_delete + backend_triplestore_delete suite.after_suite if suite.respond_to?(:after_suite) end end diff --git a/test/test_ontology_pull.rb b/test/test_ontology_pull.rb index 74923677..ca3c6130 100644 --- a/test/test_ontology_pull.rb +++ b/test/test_ontology_pull.rb @@ -76,6 +76,32 @@ def test_remote_ontology_pull assert_equal 2, ont.submissions.length end + def test_remote_pull_parsing_action + ontologies = init_ontologies(1, process_submissions: true) + ont = LinkedData::Models::Ontology.find(ontologies[0].id).first + ont.bring(:submissions) if ont.bring?(:submissions) + assert_equal 1, ont.submissions.length + + # add this ontology to submission queue with :remote_pull action enabled + parser = NcboCron::Models::OntologySubmissionParser.new + actions = NcboCron::Models::OntologySubmissionParser::ACTIONS.dup + actions[:remote_pull] = true + parser.queue_submission(ont.submissions[0], actions) + parser.process_queue_submissions + + # make sure there are now 2 submissions present + ont = LinkedData::Models::Ontology.find(ontologies[0].id).first + ont.bring(:submissions) if ont.bring?(:submissions) + assert_equal 2, ont.submissions.length + + # verify that no new submission is created when the file has not changed + parser.queue_submission(ont.submissions[0], actions) + parser.process_queue_submissions + ont = LinkedData::Models::Ontology.find(ontologies[0].id).first + ont.bring(:submissions) if ont.bring?(:submissions) + assert_equal 2, ont.submissions.length + end + def test_pull_error_notification server_port = Random.rand(55000..65535) @@ -164,8 +190,9 @@ def test_no_pull_location private - def init_ontologies(submission_count) - ont_count, acronyms, ontologies = LinkedData::SampleData::Ontology.create_ontologies_and_submissions(ont_count: 1, submission_count: submission_count, process_submission: false) + def init_ontologies(submission_count, process_submissions = false) + ont_count, acronyms, ontologies = LinkedData::SampleData::Ontology.create_ontologies_and_submissions( + ont_count: 1, submission_count: submission_count, process_submission: process_submissions) ontologies[0].bring(:submissions) if 
ontologies[0].bring?(:submissions)
     ontologies[0].submissions.each do |sub|
       sub.bring_remaining()

From 23316314217ec2e6b5cba8d66f3b8a491f53da4c Mon Sep 17 00:00:00 2001
From: mdorf
Date: Sat, 16 Dec 2023 17:54:36 -0800
Subject: [PATCH 09/12] implemented the first pass at bmir-radx/radx-project#37
---
 lib/ncbo_cron/ontology_submission_parser.rb | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/ncbo_cron/ontology_submission_parser.rb b/lib/ncbo_cron/ontology_submission_parser.rb
index f493eced..8d33f89d 100644
--- a/lib/ncbo_cron/ontology_submission_parser.rb
+++ b/lib/ncbo_cron/ontology_submission_parser.rb
@@ -29,6 +29,8 @@ def process_queue_submissions(options={})
         redis.hdel(QUEUE_HOLDER, key)
 
         # if :remote_pull is one of the actions, pull the ontology and halt if no new submission is found
+        # if a new submission is found, replace the submission ID with the new one and proceed with
+        # processing the remaining actions on the new submission
         if actions.key?(:remote_pull) && actions[:remote_pull]
           acronym = NcboCron::Helpers::OntologyHelper.acronym_from_submission_id(realKey)
           new_submission = NcboCron::Helpers::OntologyHelper.do_ontology_pull(acronym, enable_pull_umls: false,

From 07107b1aa08e41bb4ce8de311149355de70738e5 Mon Sep 17 00:00:00 2001
From: Alex Skrenchuk
Date: Tue, 19 Dec 2023 15:08:28 -0800
Subject: [PATCH 10/12] set bundler version to be compatible with ruby 2.7 + AG v8
---
 Dockerfile         |  4 ++++
 docker-compose.yml | 15 ++++++---------
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index dfc03492..73e1379c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -14,6 +14,10 @@ COPY Gemfile* *.gemspec /srv/ontoportal/ncbo_cron/
 
 WORKDIR /srv/ontoportal/ncbo_cron
 
+# set rubygems and bundler to the last version supported by ruby 2.7
+# remove version after ruby v3 upgrade
+RUN gem update --system '3.4.22'
+RUN gem install bundler -v '2.4.22'
 RUN gem update --system
 RUN gem install bundler
 ENV BUNDLE_PATH=/srv/ontoportal/bundle
diff --git a/docker-compose.yml b/docker-compose.yml
index 0045ce12..5f4e9307 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -4,7 +4,7 @@ x-app: &app
     args:
       RUBY_VERSION: '2.7'
   # Increase the version number in the image tag every time Dockerfile or its arguments is changed
-  image: ncbo_cron:0.0.1
+  image: ncbo_cron:0.0.2
   environment: &env
     BUNDLE_PATH: /srv/ontoportal/bundle
     # default bundle config resolves to /usr/local/bundle/config inside of the container
@@ -19,12 +19,10 @@ x-app: &app
     SOLR_TERM_SEARCH_URL: http://solr-ut:8983/solr/term_search_core1
     SOLR_PROP_SEARCH_URL: http://solr-ut:8983/solr/prop_search_core1
     MGREP_HOST: mgrep-ut
-    MGREP_PORT: 55555
+    MGREP_PORT: 55556
   stdin_open: true
   tty: true
   command: "bundle exec rackup -o 0.0.0.0 --port 9393"
-  ports:
-    - 9393:9393
   volumes:
     # bundle volume for hosting gems installed by bundle; it helps in local development with gem udpates
     - bundle:/srv/ontoportal/bundle
@@ -104,18 +102,17 @@ services:
       retries: 5
 
   mgrep-ut:
-    image: ontoportal/mgrep:0.0.1
+    image: ontoportal/mgrep:0.0.2
     platform: linux/amd64
     healthcheck:
-      test: ["CMD", "nc", "-z", "-v", "localhost", "55555"]
+      test: ["CMD", "nc", "-z", "-v", "localhost", "55556"]
       start_period: 3s
       interval: 10s
       timeout: 5s
       retries: 5
 
   agraph-ut:
-    #image: franzinc/agraph:v7.3.1
-    image: ontoportal/agraph:v7.3.1-patch1
+    image: franzinc/agraph:v8.0.0
     platform: linux/amd64
     environment:
       - AGRAPH_SUPER_USER=test
@@ -131,7 +128,7 @@ services:
       ; tail -f /agraph/data/agraph.log"
     healthcheck:
       test: ["CMD-SHELL", "agtool 
storage-report bioportal_test || exit 1"] - start_period: 10s + start_period: 20s interval: 60s timeout: 5s retries: 3 From a733888915a6c188e636ff605027490c381d72a0 Mon Sep 17 00:00:00 2001 From: mdorf Date: Wed, 10 Jan 2024 11:58:31 -0800 Subject: [PATCH 11/12] Gemfile.lock update --- Gemfile.lock | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index ed8c1956..bc2ac427 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -26,7 +26,7 @@ GIT GIT remote: https://github.com/ncbo/ontologies_linked_data.git - revision: 9487c7f73e68abab097af523d42c1d2e106e614b + revision: e716a6d410883a8e59121e7d09d2c80a6f6ab1fc branch: develop specs: ontologies_linked_data (0.0.1) @@ -87,12 +87,12 @@ GEM dante (0.2.0) declarative (0.0.20) docile (1.4.0) - domain_name (0.6.20231109) + domain_name (0.6.20240107) email_spec (2.1.1) htmlentities (~> 4.3.3) launchy (~> 2.1) mail (~> 2.6) - faraday (2.7.12) + faraday (2.8.1) base64 faraday-net_http (>= 2.0, < 3.1) ruby2_keywords (>= 0.0.4) @@ -131,9 +131,9 @@ GEM google-cloud-env (2.1.0) faraday (>= 1.0, < 3.a) google-cloud-errors (1.3.1) - google-protobuf (3.25.1) - google-protobuf (3.25.1-x86_64-darwin) - google-protobuf (3.25.1-x86_64-linux) + google-protobuf (3.25.2) + google-protobuf (3.25.2-x86_64-darwin) + google-protobuf (3.25.2-x86_64-linux) googleapis-common-protos (1.4.0) google-protobuf (~> 3.14) googleapis-common-protos-types (~> 1.2) @@ -168,14 +168,14 @@ GEM jwt (2.7.1) launchy (2.5.2) addressable (~> 2.8) - libxml-ruby (4.1.2) + libxml-ruby (5.0.2) logger (1.6.0) macaddr (1.7.2) systemu (~> 2.6.5) mail (2.6.6) mime-types (>= 1.16, < 4) method_source (1.0.0) - mime-types (3.5.1) + mime-types (3.5.2) mime-types-data (~> 3.2015) mime-types-data (3.2023.1205) mini_mime (1.1.5) @@ -205,7 +205,7 @@ GEM addressable (>= 2.2) redis (5.0.8) redis-client (>= 0.17.0) - redis-client (0.19.0) + redis-client (0.19.1) connection_pool representable (3.2.0) declarative (< 0.1.0) From 7066b67955c127e1b9374c2f31b60c6468015bf1 Mon Sep 17 00:00:00 2001 From: Alex Skrenchuk Date: Fri, 12 Jan 2024 13:48:30 -0800 Subject: [PATCH 12/12] Gemfile.lock update --- Gemfile.lock | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index bc2ac427..fe81009c 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,6 +1,6 @@ GIT remote: https://github.com/ncbo/goo.git - revision: 6db93bb3d5095a5fe0d017e572c5a04caa34ebc6 + revision: db2b330fb6c5fd4ea9ee17d5b58ca997f304a340 branch: develop specs: goo (0.0.2) @@ -15,7 +15,7 @@ GIT GIT remote: https://github.com/ncbo/ncbo_annotator.git - revision: d7ee80860a0eab9293af81083a0700d099c50263 + revision: 9a037b955b1d6c3c9955250d7499afdd10d4bbd3 branch: develop specs: ncbo_annotator (0.0.1) @@ -26,7 +26,7 @@ GIT GIT remote: https://github.com/ncbo/ontologies_linked_data.git - revision: e716a6d410883a8e59121e7d09d2c80a6f6ab1fc + revision: 4b6b7f42882b9ad815ff5e90c354212aca085fda branch: develop specs: ontologies_linked_data (0.0.1) @@ -111,8 +111,8 @@ GEM google-analytics-data (0.4.0) google-analytics-data-v1beta (>= 0.7, < 2.a) google-cloud-core (~> 1.6) - google-analytics-data-v1beta (0.10.0) - gapic-common (>= 0.20.0, < 2.a) + google-analytics-data-v1beta (0.11.1) + gapic-common (>= 0.21.1, < 2.a) google-cloud-errors (~> 1.0) google-apis-analytics_v3 (0.13.0) google-apis-core (>= 0.11.0, < 2.a) @@ -286,4 +286,4 @@ DEPENDENCIES test-unit-minitest BUNDLED WITH - 2.3.15 + 2.3.22