From fc3608d69ac0e827b302cdd0a1d74f8d673be3b9 Mon Sep 17 00:00:00 2001
From: Nat
Date: Wed, 12 Feb 2025 21:42:27 +0800
Subject: [PATCH] FEATURE: Translate all new posts automatically

Adds two site settings:

* 'automatic_translation_target_languages': the locales that posts and
  topics are automatically translated into.
* 'automatic_translation_backfill_maximum_translations_per_hour': the
  hourly budget for the scheduled backfill job; 0 disables backfilling.

Jobs::TranslateTranslatable translates a single post or topic into every
target locale and is enqueued on post_process_cooked, topic_created and
topic_edited.

Jobs::AutomaticTranslationBackfill runs every 5 minutes behind a Redis
lock and translates posts and topics that are still missing one of the
target locales.

Translator services now store the translation under the requested target
locale instead of always using I18n.locale.
---
 app/jobs/regular/translate_translatable.rb    |  27 +++
 .../automatic_translation_backfill.rb         | 104 ++++++++++++
 app/services/discourse_translator/amazon.rb   |   2 +-
 app/services/discourse_translator/base.rb     |   6 +-
 .../discourse_translator/discourse_ai.rb      |   2 +-
 app/services/discourse_translator/google.rb   |   2 +-
 .../discourse_translator/libre_translate.rb   |   2 +-
 .../discourse_translator/microsoft.rb         |   2 +-
 app/services/discourse_translator/yandex.rb   |   2 +-
 .../extend-for-translate-button.js            |  19 +-
 config/locales/server.en.yml                  |   1 +
 config/settings.yml                           |   9 +
 .../translatable_languages_setting.rb         |  11 ++
 plugin.rb                                     |  13 ++
 .../automatic_translation_backfill_spec.rb    | 161 ++++++++++++++++++
 15 files changed, 343 insertions(+), 20 deletions(-)
 create mode 100644 app/jobs/regular/translate_translatable.rb
 create mode 100644 app/jobs/scheduled/automatic_translation_backfill.rb
 create mode 100644 lib/discourse_translator/translatable_languages_setting.rb
 create mode 100644 spec/jobs/automatic_translation_backfill_spec.rb

diff --git a/app/jobs/regular/translate_translatable.rb b/app/jobs/regular/translate_translatable.rb
new file mode 100644
index 0000000..e0c1008
--- /dev/null
+++ b/app/jobs/regular/translate_translatable.rb
@@ -0,0 +1,27 @@
+# frozen_string_literal: true
+
+module Jobs
+  class TranslateTranslatable < ::Jobs::Base
+    def execute(args)
+      return unless SiteSetting.translator_enabled
+      return if SiteSetting.automatic_translation_target_languages.blank?
+
+      translatable = args[:type].constantize.find_by(id: args[:translatable_id])
+      return if translatable.blank?
+
+      target_locales = SiteSetting.automatic_translation_target_languages.split("|")
+      target_locales.each do |target_locale|
+        # 1. no special retry, job will be automatically retried with backoff
+        # 2. translate function will handle cases where translation is not needed or not possible
+        "DiscourseTranslator::#{SiteSetting.translator}".constantize.translate(
+          translatable,
+          target_locale.to_sym,
+        )
+      end
+
+      topic_id = translatable.is_a?(Post) ? translatable.topic.id : translatable.id
+      post_id = translatable.is_a?(Post) ? translatable.id : 1
+      MessageBus.publish("/topic/#{topic_id}", type: :revised, id: post_id)
+    end
+  end
+end
diff --git a/app/jobs/scheduled/automatic_translation_backfill.rb b/app/jobs/scheduled/automatic_translation_backfill.rb
new file mode 100644
index 0000000..0103d23
--- /dev/null
+++ b/app/jobs/scheduled/automatic_translation_backfill.rb
@@ -0,0 +1,104 @@
+# frozen_string_literal: true
+
+module Jobs
+  class AutomaticTranslationBackfill < ::Jobs::Scheduled
+    every 5.minutes
+
+    BACKFILL_LOCK_KEY = "discourse_translator_backfill_lock"
+
+    def execute(args = nil)
+      return unless SiteSetting.translator_enabled
+      return unless should_backfill?
+      return unless secure_backfill_lock
+
+      begin
+        process_batch
+      ensure
+        Discourse.redis.del(BACKFILL_LOCK_KEY)
+      end
+    end
+
+    def fetch_untranslated_model_ids(model = Post, limit = 100, target_locales = backfill_locales)
+      m = model.name.downcase
+      DB.query_single(<<~SQL, target_locales: target_locales, limit: limit)
+        SELECT m.id
+        FROM #{m}s m
+        LEFT JOIN discourse_translator_#{m}_locales dl ON dl.#{m}_id = m.id
+        LEFT JOIN LATERAL (
+          SELECT array_agg(DISTINCT locale)::text[] as locales
+          FROM discourse_translator_#{m}_translations dt
+          WHERE dt.#{m}_id = m.id
+        ) translations ON true
+        WHERE NOT (
+          ARRAY[:target_locales]::text[] <@
+          (COALESCE(
+            array_cat(
+              ARRAY[COALESCE(dl.detected_locale, '')]::text[],
+              COALESCE(translations.locales, ARRAY[]::text[])
+            ),
+            ARRAY[]::text[]
+          ))
+        )
+        ORDER BY m.id DESC
+        LIMIT :limit
+      SQL
+    end
+
+    private
+
+    def should_backfill?
+      return false if SiteSetting.automatic_translation_target_languages.blank?
+      return false if SiteSetting.automatic_translation_backfill_maximum_translations_per_hour == 0
+      true
+    end
+
+    def secure_backfill_lock
+      Discourse.redis.set(BACKFILL_LOCK_KEY, "1", ex: 5.minutes.to_i, nx: true)
+    end
+
+    def translations_per_run
+      [
+        (SiteSetting.automatic_translation_backfill_maximum_translations_per_hour / 12) /
+          backfill_locales.size,
+        1,
+      ].max
+    end
+
+    def backfill_locales
+      @backfill_locales ||= SiteSetting.automatic_translation_target_languages.split("|")
+    end
+
+    def translator
+      @translator_klass ||= "DiscourseTranslator::#{SiteSetting.translator}".constantize
+    end
+
+    def translate_records(type, record_ids)
+      record_ids.each do |id|
+        record = type.find(id)
+        backfill_locales.each do |target_locale|
+          begin
+            translator.translate(record, target_locale.to_sym)
+          rescue => e
+            # continue with other locales even if one fails
+            Rails.logger.warn(
+              "Failed to machine-translate #{type.name}##{id} to #{target_locale}: #{e.message}\n#{e.backtrace.join("\n")}",
+            )
+            next
+          end
+        end
+      end
+    end
+
+    def process_batch
+      models_translated = [Post, Topic].size
+      translations_to_run = [translations_per_run / models_translated, 1].max
+      topic_ids = fetch_untranslated_model_ids(Topic, translations_to_run)
+      translations_to_run = translations_per_run if topic_ids.empty?
+      post_ids = fetch_untranslated_model_ids(Post, translations_to_run)
+      return if topic_ids.empty? && post_ids.empty?
+
+      translate_records(Topic, topic_ids)
+      translate_records(Post, post_ids)
+    end
+  end
+end
diff --git a/app/services/discourse_translator/amazon.rb b/app/services/discourse_translator/amazon.rb
index 314ec9b..3f3f3d9 100644
--- a/app/services/discourse_translator/amazon.rb
+++ b/app/services/discourse_translator/amazon.rb
@@ -126,7 +126,7 @@ def self.detect!(topic_or_post)
 
     def self.translate!(translatable, target_locale_sym = I18n.locale)
       detected_lang = detect(translatable)
-      save_translation(translatable) do
+      save_translation(translatable, target_locale_sym) do
         begin
           client.translate_text(
             {
diff --git a/app/services/discourse_translator/base.rb b/app/services/discourse_translator/base.rb
index 44909fd..8a1eaff 100644
--- a/app/services/discourse_translator/base.rb
+++ b/app/services/discourse_translator/base.rb
@@ -48,7 +48,7 @@ def self.translate(translatable, target_locale_sym = I18n.locale)
                 ),
               )
       end
-      [detected_lang, translate!(translatable)]
+      [detected_lang, translate!(translatable, target_locale_sym)]
     end
 
     # Subclasses must implement this method to translate the text of a post or topic
@@ -77,9 +77,9 @@ def self.access_token
       raise "Not Implemented"
     end
 
-    def self.save_translation(translatable)
+    def self.save_translation(translatable, target_locale_sym = I18n.locale)
       translation = yield
-      translatable.set_translation(I18n.locale, translation)
+      translatable.set_translation(target_locale_sym, translation)
       translation
     end
 
diff --git a/app/services/discourse_translator/discourse_ai.rb b/app/services/discourse_translator/discourse_ai.rb
index b057e9b..fc7de8b 100644
--- a/app/services/discourse_translator/discourse_ai.rb
+++ b/app/services/discourse_translator/discourse_ai.rb
@@ -21,7 +21,7 @@ def self.detect!(topic_or_post)
 
     def self.translate!(translatable, target_locale_sym = I18n.locale)
       return unless required_settings_enabled
-      save_translation(translatable) do
+      save_translation(translatable, target_locale_sym) do
         ::DiscourseAi::Translator.new(
           text_for_translation(translatable),
           target_locale_sym,
diff --git a/app/services/discourse_translator/google.rb b/app/services/discourse_translator/google.rb
index 4d40948..19f2ad0 100644
--- a/app/services/discourse_translator/google.rb
+++ b/app/services/discourse_translator/google.rb
@@ -91,7 +91,7 @@ def self.translate_supported?(source, target)
 
     def self.translate!(translatable, target_locale_sym = I18n.locale)
       detected_locale = detect(translatable)
-      save_translation(translatable) do
+      save_translation(translatable, target_locale_sym) do
         res =
           result(
             TRANSLATE_URI,
diff --git a/app/services/discourse_translator/libre_translate.rb b/app/services/discourse_translator/libre_translate.rb
index 0e641f2..4ee81fa 100644
--- a/app/services/discourse_translator/libre_translate.rb
+++ b/app/services/discourse_translator/libre_translate.rb
@@ -98,7 +98,7 @@ def self.translate_supported?(source, target)
 
     def self.translate!(translatable, target_locale_sym = I18n.locale)
       detected_lang = detect(translatable)
-      save_translation(translatable) do
+      save_translation(translatable, target_locale_sym) do
         res =
           result(
             translate_uri,
diff --git a/app/services/discourse_translator/microsoft.rb b/app/services/discourse_translator/microsoft.rb
index e70295b..2f75f2a 100644
--- a/app/services/discourse_translator/microsoft.rb
+++ b/app/services/discourse_translator/microsoft.rb
@@ -166,7 +166,7 @@ def self.translate!(translatable, target_locale_sym = I18n.locale)
       locale =
         SUPPORTED_LANG_MAPPING[target_locale_sym] || (raise I18n.t("translator.not_supported"))
 
-      save_translation(translatable) do
+      save_translation(translatable, target_locale_sym) do
         query = default_query.merge("from" => detected_lang, "to" => locale, "textType" => "html")
 
         body = [{ "Text" => text_for_translation(translatable) }].to_json
diff --git a/app/services/discourse_translator/yandex.rb b/app/services/discourse_translator/yandex.rb
index 2ea91b9..7cf7d65 100644
--- a/app/services/discourse_translator/yandex.rb
+++ b/app/services/discourse_translator/yandex.rb
@@ -137,7 +137,7 @@ def self.translate!(translatable, target_locale_sym = I18n.locale)
       locale =
         SUPPORTED_LANG_MAPPING[target_locale_sym] || (raise I18n.t("translator.not_supported"))
 
-      save_translation(translatable) do
+      save_translation(translatable, target_locale_sym) do
         query =
           default_query.merge(
             "lang" => "#{detected_lang}-#{locale}",
diff --git a/assets/javascripts/discourse/initializers/extend-for-translate-button.js b/assets/javascripts/discourse/initializers/extend-for-translate-button.js
index 518377b..edf8872 100644
--- a/assets/javascripts/discourse/initializers/extend-for-translate-button.js
+++ b/assets/javascripts/discourse/initializers/extend-for-translate-button.js
@@ -29,17 +29,6 @@ function initializeTranslation(api) {
     (currentUser || siteSettings.experimental_anon_language_switcher)
   ) {
     api.renderInOutlet("topic-navigation", ShowOriginalContent);
-    api.decorateCookedElement((cookedElement, helper) => {
-      if (helper) {
-        const translatedCooked = helper.getModel().get("translated_cooked");
-        if (translatedCooked) {
-          cookedElement.innerHTML = translatedCooked;
-        } else {
-          // this experimental feature does not yet support
-          // translating individual untranslated posts
-        }
-      }
-    });
 
     api.registerModelTransformer("topic", (topics) => {
       topics.forEach((topic) => {
@@ -48,6 +37,14 @@ function initializeTranslation(api) {
         }
       });
     });
+
+    api.registerModelTransformer("post", (posts) => {
+      posts.forEach((post) => {
+        if (post.translated_cooked) {
+          post.set("cooked", post.translated_cooked);
+        }
+      });
+    });
   }
 
   if (!siteSettings.experimental_topic_translation) {
diff --git a/config/locales/server.en.yml b/config/locales/server.en.yml
index e423052..56cc754 100644
--- a/config/locales/server.en.yml
+++ b/config/locales/server.en.yml
@@ -18,6 +18,7 @@ en:
     restrict_translation_by_group: "Only allowed groups can translate"
     restrict_translation_by_poster_group: "Only allow translation of posts made by users in allowed groups. If empty, allow translations of posts from all users."
     experimental_anon_language_switcher: "Enable experimental language switcher for anonymous users. This will allow anonymous users to switch between translated versions of Discourse and user-contributed content in topics."
+    automatic_translation_target_languages: "Translate posts to languages"
     errors:
       set_locale_cookie_requirements: "The experimental language switcher for anonymous users requires the `set locale from cookie` site setting to be enabled."
     experimental_topic_translation: "Enable experimental topic translation feature. This replaces existing post in-line translation with a button that allows users to translate the entire topic."
diff --git a/config/settings.yml b/config/settings.yml
index bb64319..70cb14a 100644
--- a/config/settings.yml
+++ b/config/settings.yml
@@ -109,3 +109,12 @@ discourse_translator:
   experimental_topic_translation:
     default: false
     client: true
+  automatic_translation_target_languages:
+    default: ""
+    type: list
+    list_type: named
+    choices: "DiscourseTranslator::TranslatableLanguagesSetting.values"
+    allow_any: false
+  automatic_translation_backfill_maximum_translations_per_hour:
+    default: 0
+    client: false
diff --git a/lib/discourse_translator/translatable_languages_setting.rb b/lib/discourse_translator/translatable_languages_setting.rb
new file mode 100644
index 0000000..804a1b0
--- /dev/null
+++ b/lib/discourse_translator/translatable_languages_setting.rb
@@ -0,0 +1,11 @@
+# frozen_string_literal: true
+
+module DiscourseTranslator
+  class TranslatableLanguagesSetting < LocaleSiteSetting
+    def self.printable_values
+      values.map { |v| v[:value] }
+    end
+
+    @lock = Mutex.new
+  end
+end
diff --git a/plugin.rb b/plugin.rb
index 9fead2d..501670f 100644
--- a/plugin.rb
+++ b/plugin.rb
@@ -38,6 +38,19 @@ module ::DiscourseTranslator
     end
   end
 
+  on(:post_process_cooked) do |_, post|
+    next if SiteSetting.automatic_translation_target_languages.blank?
+    Jobs.enqueue(:translate_translatable, type: Post, translatable_id: post.id)
+  end
+
+  on(:topic_created) do |topic|
+    Jobs.enqueue(:translate_translatable, type: Topic, translatable_id: topic.id)
+  end
+
+  on(:topic_edited) do |topic|
+    Jobs.enqueue(:translate_translatable, type: Topic, translatable_id: topic.id)
+  end
+
   add_to_serializer :post, :can_translate do
     scope.can_translate?(object)
   end
diff --git a/spec/jobs/automatic_translation_backfill_spec.rb b/spec/jobs/automatic_translation_backfill_spec.rb
new file mode 100644
index 0000000..0d421ff
--- /dev/null
+++ b/spec/jobs/automatic_translation_backfill_spec.rb
@@ -0,0 +1,161 @@
+# frozen_string_literal: true
+
+describe Jobs::AutomaticTranslationBackfill do
+  before do
+    SiteSetting.translator_enabled = true
+    SiteSetting.translator = "Google"
+    SiteSetting.translator_google_api_key = "api_key"
+  end
+
+  def expect_google_check_language
+    Excon
+      .expects(:post)
+      .with(DiscourseTranslator::Google::SUPPORT_URI, anything, anything)
+      .returns(
+        Struct.new(:status, :body).new(
+          200,
+          %{ { "data": { "languages": [ { "language": "es" }, { "language": "de" }] } } },
+        ),
+      )
+      .at_least_once
+  end
+
+  def expect_google_detect(locale)
+    Excon
+      .expects(:post)
+      .with(DiscourseTranslator::Google::DETECT_URI, anything, anything)
+      .returns(
+        Struct.new(:status, :body).new(
+          200,
+          %{ { "data": { "detections": [ [ { "language": "#{locale}" } ] ] } } },
+        ),
+      )
+      .once
+  end
+
+  def expect_google_translate(text)
+    Excon
+      .expects(:post)
+      .with(DiscourseTranslator::Google::TRANSLATE_URI, body: anything, headers: anything)
+      .returns(
+        Struct.new(:status, :body).new(
+          200,
+          %{ { "data": { "translations": [ { "translatedText": "#{text}" } ] } } },
+        ),
+      )
+  end
+
+  describe "backfilling" do
+    it "does not backfill if translator is disabled" do
+      SiteSetting.translator_enabled = false
+      expect_any_instance_of(Jobs::AutomaticTranslationBackfill).not_to receive(:process_batch)
+      described_class.new.execute
+    end
+
+    it "does not backfill if backfill languages are not set" do
+      SiteSetting.automatic_translation_target_languages = ""
+      expect_any_instance_of(Jobs::AutomaticTranslationBackfill).not_to receive(:process_batch)
+      described_class.new.execute
+    end
+
+    it "does not backfill if backfill limit is set to 0" do
+      SiteSetting.automatic_translation_target_languages = "de"
+      SiteSetting.automatic_translation_backfill_maximum_translations_per_hour = 0
+      expect_any_instance_of(Jobs::AutomaticTranslationBackfill).not_to receive(:process_batch)
+      described_class.new.execute
+    end
+
+    it "does not backfill if backfill lock is not secure" do
+      SiteSetting.automatic_translation_target_languages = "de"
+      SiteSetting.automatic_translation_backfill_maximum_translations_per_hour = 1
+      Discourse.redis.set("discourse_translator_backfill_lock", "1")
+      expect_any_instance_of(Jobs::AutomaticTranslationBackfill).not_to receive(:translate_records)
+      described_class.new.execute
+    end
+
+    describe "with two locales ['de', 'es']" do
+      before do
+        SiteSetting.automatic_translation_target_languages = "de|es"
+        SiteSetting.automatic_translation_backfill_maximum_translations_per_hour = 10
+        expect_google_check_language
+      end
+
+      it "backfills if topic is not in target languages" do
+        expect_google_detect("de")
+        expect_google_translate("hola")
+        topic = Fabricate(:topic)
+
+        described_class.new.execute
+
+        expect(topic.translations.pluck(:locale, :translation)).to eq([%w[es hola]])
+      end
+
+      it "backfills both topics and posts" do
+        post = Fabricate(:post)
+        topic = post.topic
+
+        topic.set_detected_locale("de")
+        post.set_detected_locale("es")
+
+        expect_google_translate("hallo")
+        expect_google_translate("hola")
+
+        described_class.new.execute
+
+        expect(topic.translations.pluck(:locale, :translation)).to eq([%w[es hola]])
+        expect(post.translations.pluck(:locale, :translation)).to eq([%w[de hallo]])
+      end
+    end
+  end
+
+  describe ".fetch_untranslated_model_ids" do
+    fab!(:posts_1) { Fabricate.times(2, :post) }
+    fab!(:post_1) { Fabricate(:post) }
+    fab!(:post_2) { Fabricate(:post) }
+    fab!(:post_3) { Fabricate(:post) }
+    fab!(:posts_2) { Fabricate.times(2, :post) }
+    fab!(:post_4) { Fabricate(:post) }
+    fab!(:post_5) { Fabricate(:post) }
+    fab!(:post_6) { Fabricate(:post) }
+    fab!(:post_7) { Fabricate(:post) }
+    fab!(:posts_3) { Fabricate.times(2, :post) }
+
+    before do
+=begin
+This is the scenario we are testing for:
+  | Post ID | detected_locale | translations | selected? | Why? |
+  |---------|-----------------|--------------|-----------|------|
+  | 1       | en              | none         | YES       | source not de/es, needs both translations
+  | 2       | es              | none         | YES       | source is es, but missing de translation
+  | 3       | null            | es           | YES       | missing de translation
+  | 4       | null            | de, es       | NO        | has both de and es translations
+  | 5       | de              | es           | NO        | source is de and has es translation
+  | 6       | de              | de           | YES       | both source and translation is de, missing es translation
+  | 7       | de              | ja           | YES       | source is de, missing es translation
+=end
+
+      [posts_1, posts_2, posts_3].flatten.each do |post|
+        post.set_translation("es", "hola")
+        post.set_translation("de", "hallo")
+      end
+
+      post_1.set_detected_locale("en")
+      post_2.set_detected_locale("es")
+      post_5.set_detected_locale("de")
+      post_6.set_detected_locale("de")
+      post_7.set_detected_locale("de")
+
+      post_3.set_translation("es", "hola")
+      post_4.set_translation("de", "hallo")
+      post_4.set_translation("es", "hola")
+      post_5.set_translation("es", "hola")
+      post_6.set_translation("de", "hallo")
+      post_7.set_translation("ja", "こんにちは")
+    end
+
+    it "returns correct post ids needing translation in descending id" do
+      result = described_class.new.fetch_untranslated_model_ids(Post, 50, %w[de es])
+      expect(result).to include(post_7.id, post_6.id, post_3.id, post_2.id, post_1.id)
+    end
+  end
+end