diff --git a/src/SIL.Machine.AspNetCore/Services/NmtPreprocessBuildJob.cs b/src/SIL.Machine.AspNetCore/Services/NmtPreprocessBuildJob.cs index c03307776..540a49438 100644 --- a/src/SIL.Machine.AspNetCore/Services/NmtPreprocessBuildJob.cs +++ b/src/SIL.Machine.AspNetCore/Services/NmtPreprocessBuildJob.cs @@ -1,9 +1,13 @@ -namespace SIL.Machine.AspNetCore.Services; +using Google.Protobuf; +using MongoDB.Bson.IO; + +namespace SIL.Machine.AspNetCore.Services; public class NmtPreprocessBuildJob : HangfireBuildJob> { private readonly ISharedFileService _sharedFileService; private readonly ICorpusService _corpusService; + private readonly ILanguageTagService _languageTagService; public NmtPreprocessBuildJob( IPlatformService platformService, @@ -12,12 +16,14 @@ public NmtPreprocessBuildJob( ILogger logger, IBuildJobService buildJobService, ISharedFileService sharedFileService, - ICorpusService corpusService + ICorpusService corpusService, + ILanguageTagService languageTagService ) : base(platformService, engines, lockFactory, buildJobService, logger) { _sharedFileService = sharedFileService; _corpusService = corpusService; + _languageTagService = languageTagService; } protected override async Task DoWorkAsync( @@ -29,7 +35,27 @@ protected override async Task DoWorkAsync( CancellationToken cancellationToken ) { - await WriteDataFilesAsync(buildId, data, buildOptions, cancellationToken); + IDictionary counts = await WriteDataFilesAsync(buildId, data, buildOptions, cancellationToken); + + // Log summary of build data + JsonObject _buildPreprocessSummary = + new() { { "Event", "BuildPreprocess" }, { "EngineId", engineId }, { "BuildId", buildId } }; + foreach (KeyValuePair kvp in counts) + { + _buildPreprocessSummary.Add(kvp.Key, kvp.Value); + } + TranslationEngine? engine = await Engines.GetAsync(e => e.EngineId == engineId, cancellationToken); + if (engine is null) + throw new OperationCanceledException($"Engine {engineId} does not exist. Build canceled."); + _buildPreprocessSummary.Add( + "SourceLanguageResolved", + _languageTagService.ConvertToFlores200Code(engine.SourceLanguage) + ); + _buildPreprocessSummary.Add( + "TargetLanguageResolved", + _languageTagService.ConvertToFlores200Code(engine.TargetLanguage) + ); + Logger.LogInformation("{summary}", _buildPreprocessSummary.ToJsonString()); await using (await @lock.WriterLockAsync(cancellationToken: cancellationToken)) { @@ -47,7 +73,7 @@ CancellationToken cancellationToken } } - private async Task WriteDataFilesAsync( + private async Task> WriteDataFilesAsync( string buildId, IReadOnlyList corpora, string? buildOptions, @@ -66,7 +92,10 @@ await _sharedFileService.OpenWriteAsync($"builds/{buildId}/train.src.txt", cance await _sharedFileService.OpenWriteAsync($"builds/{buildId}/train.trg.txt", cancellationToken) ); - int corpusSize = 0; + Dictionary counts = new(); + counts["CorpusSize"] = 0; + counts["NumTrainRows"] = 0; + counts["NumPretranslateRows"] = 0; async IAsyncEnumerable ProcessRowsAsync() { foreach (Corpus corpus in corpora) @@ -106,6 +135,7 @@ async IAsyncEnumerable ProcessRowsAsync() { await sourceTrainWriter.WriteAsync($"{row.SourceText}\n"); await targetTrainWriter.WriteAsync($"{row.TargetText}\n"); + counts["NumTrainRows"] += 1; } if ( (corpus.PretranslateAll || corpus.PretranslateTextIds.Contains(row.TextId)) @@ -137,6 +167,7 @@ async IAsyncEnumerable ProcessRowsAsync() { refs = row.TargetRefs; } + counts["NumPretranslateRows"] += 1; yield return new Pretranslation { CorpusId = corpus.Id, @@ -146,7 +177,7 @@ async IAsyncEnumerable ProcessRowsAsync() }; } if (!row.IsEmpty) - corpusSize++; + counts["CorpusSize"]++; } } } @@ -162,7 +193,8 @@ await JsonSerializer.SerializeAsync( new JsonSerializerOptions { WriteIndented = true, PropertyNamingPolicy = JsonNamingPolicy.CamelCase }, cancellationToken: cancellationToken ); - return corpusSize; + + return counts; } protected override async Task CleanupAsync( diff --git a/tests/SIL.Machine.AspNetCore.Tests/Services/NmtEngineServiceTests.cs b/tests/SIL.Machine.AspNetCore.Tests/Services/NmtEngineServiceTests.cs index b066fddbb..52cb1f672 100644 --- a/tests/SIL.Machine.AspNetCore.Tests/Services/NmtEngineServiceTests.cs +++ b/tests/SIL.Machine.AspNetCore.Tests/Services/NmtEngineServiceTests.cs @@ -270,7 +270,8 @@ public override object ActivateJob(Type jobType) Substitute.For>(), _env.BuildJobService, _env.SharedFileService, - Substitute.For() + Substitute.For(), + new LanguageTagService() ); } if (jobType == typeof(NmtPostprocessBuildJob)) diff --git a/tests/SIL.Machine.AspNetCore.Tests/Services/NmtPreprocessBuildJobTests.cs b/tests/SIL.Machine.AspNetCore.Tests/Services/NmtPreprocessBuildJobTests.cs index 2b31b2e2e..3e01dccf1 100644 --- a/tests/SIL.Machine.AspNetCore.Tests/Services/NmtPreprocessBuildJobTests.cs +++ b/tests/SIL.Machine.AspNetCore.Tests/Services/NmtPreprocessBuildJobTests.cs @@ -218,7 +218,8 @@ public TestEnvironment() Logger, BuildJobService, SharedFileService, - CorpusService + CorpusService, + new LanguageTagService() ); } }