From 9ec1e2a60c3430383ccba8456634968c9e0dc53a Mon Sep 17 00:00:00 2001 From: John Lambert Date: Thu, 4 Jan 2024 15:55:10 -0500 Subject: [PATCH] Log beginning of build log that can be built into metrics. --- .../Services/NmtPreprocessBuildJob.cs | 24 ++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/src/SIL.Machine.AspNetCore/Services/NmtPreprocessBuildJob.cs b/src/SIL.Machine.AspNetCore/Services/NmtPreprocessBuildJob.cs index c03307776..9433b030c 100644 --- a/src/SIL.Machine.AspNetCore/Services/NmtPreprocessBuildJob.cs +++ b/src/SIL.Machine.AspNetCore/Services/NmtPreprocessBuildJob.cs @@ -1,9 +1,13 @@ -namespace SIL.Machine.AspNetCore.Services; +using Google.Protobuf; +using MongoDB.Bson.IO; + +namespace SIL.Machine.AspNetCore.Services; public class NmtPreprocessBuildJob : HangfireBuildJob> { private readonly ISharedFileService _sharedFileService; private readonly ICorpusService _corpusService; + private readonly JsonObject _buildPreprocessSummary; public NmtPreprocessBuildJob( IPlatformService platformService, @@ -18,6 +22,7 @@ ICorpusService corpusService { _sharedFileService = sharedFileService; _corpusService = corpusService; + _buildPreprocessSummary = new JsonObject(); } protected override async Task DoWorkAsync( @@ -29,8 +34,17 @@ protected override async Task DoWorkAsync( CancellationToken cancellationToken ) { + _buildPreprocessSummary.Add("event", "build_preprocess"); + _buildPreprocessSummary.Add("type", "nmt"); + _buildPreprocessSummary.Add("engine_id", engineId); + _buildPreprocessSummary.Add("build_id", buildId); + _buildPreprocessSummary.Add("build_options", JsonNode.Parse(buildOptions!)); + _buildPreprocessSummary.Add("corpora", JsonNode.Parse(JsonSerializer.Serialize(data))); + await WriteDataFilesAsync(buildId, data, buildOptions, cancellationToken); + Logger.LogInformation(_buildPreprocessSummary.ToJsonString()); + await using (await @lock.WriterLockAsync(cancellationToken: cancellationToken)) { bool canceling = !await BuildJobService.StartBuildJobAsync( @@ -67,6 +81,8 @@ await _sharedFileService.OpenWriteAsync($"builds/{buildId}/train.trg.txt", cance ); int corpusSize = 0; + var numTrainRows = 0; + var numPretranslateRows = 0; async IAsyncEnumerable ProcessRowsAsync() { foreach (Corpus corpus in corpora) @@ -106,6 +122,7 @@ async IAsyncEnumerable ProcessRowsAsync() { await sourceTrainWriter.WriteAsync($"{row.SourceText}\n"); await targetTrainWriter.WriteAsync($"{row.TargetText}\n"); + numTrainRows += 1; } if ( (corpus.PretranslateAll || corpus.PretranslateTextIds.Contains(row.TextId)) @@ -137,6 +154,7 @@ async IAsyncEnumerable ProcessRowsAsync() { refs = row.TargetRefs; } + numPretranslateRows += 1; yield return new Pretranslation { CorpusId = corpus.Id, @@ -162,6 +180,10 @@ await JsonSerializer.SerializeAsync( new JsonSerializerOptions { WriteIndented = true, PropertyNamingPolicy = JsonNamingPolicy.CamelCase }, cancellationToken: cancellationToken ); + + _buildPreprocessSummary.Add("num_train_rows", numTrainRows); + _buildPreprocessSummary.Add("num_pretranslate_rows", numPretranslateRows); + return corpusSize; }