Skip to content

Commit

Permalink
Log the beginning of a build log that can be built into metrics.
Browse files: browse the repository at this point in the history
  • Loading branch information
johnml1135 committed Jan 4, 2024
1 parent 05c2ee7 commit 9ec1e2a
Showing 1 changed file with 23 additions and 1 deletion.
24 changes: 23 additions & 1 deletion src/SIL.Machine.AspNetCore/Services/NmtPreprocessBuildJob.cs
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
namespace SIL.Machine.AspNetCore.Services;
using Google.Protobuf;
using MongoDB.Bson.IO;

namespace SIL.Machine.AspNetCore.Services;

public class NmtPreprocessBuildJob : HangfireBuildJob<IReadOnlyList<Corpus>>
{
private readonly ISharedFileService _sharedFileService;
private readonly ICorpusService _corpusService;
private readonly JsonObject _buildPreprocessSummary;

public NmtPreprocessBuildJob(
IPlatformService platformService,
Expand All @@ -18,6 +22,7 @@ ICorpusService corpusService
{
_sharedFileService = sharedFileService;
_corpusService = corpusService;
_buildPreprocessSummary = new JsonObject();
}

protected override async Task DoWorkAsync(
Expand All @@ -29,8 +34,17 @@ protected override async Task DoWorkAsync(
CancellationToken cancellationToken
)
{
_buildPreprocessSummary.Add("event", "build_preprocess");
_buildPreprocessSummary.Add("type", "nmt");
_buildPreprocessSummary.Add("engine_id", engineId);
_buildPreprocessSummary.Add("build_id", buildId);
_buildPreprocessSummary.Add("build_options", JsonNode.Parse(buildOptions!));
_buildPreprocessSummary.Add("corpora", JsonNode.Parse(JsonSerializer.Serialize(data)));

await WriteDataFilesAsync(buildId, data, buildOptions, cancellationToken);

Logger.LogInformation(_buildPreprocessSummary.ToJsonString());

await using (await @lock.WriterLockAsync(cancellationToken: cancellationToken))
{
bool canceling = !await BuildJobService.StartBuildJobAsync(
Expand Down Expand Up @@ -67,6 +81,8 @@ await _sharedFileService.OpenWriteAsync($"builds/{buildId}/train.trg.txt", cance
);

int corpusSize = 0;
var numTrainRows = 0;
var numPretranslateRows = 0;
async IAsyncEnumerable<Pretranslation> ProcessRowsAsync()
{
foreach (Corpus corpus in corpora)
Expand Down Expand Up @@ -106,6 +122,7 @@ async IAsyncEnumerable<Pretranslation> ProcessRowsAsync()
{
await sourceTrainWriter.WriteAsync($"{row.SourceText}\n");
await targetTrainWriter.WriteAsync($"{row.TargetText}\n");
numTrainRows += 1;
}
if (
(corpus.PretranslateAll || corpus.PretranslateTextIds.Contains(row.TextId))
Expand Down Expand Up @@ -137,6 +154,7 @@ async IAsyncEnumerable<Pretranslation> ProcessRowsAsync()
{
refs = row.TargetRefs;
}
numPretranslateRows += 1;
yield return new Pretranslation
{
CorpusId = corpus.Id,
Expand All @@ -162,6 +180,10 @@ await JsonSerializer.SerializeAsync(
new JsonSerializerOptions { WriteIndented = true, PropertyNamingPolicy = JsonNamingPolicy.CamelCase },
cancellationToken: cancellationToken
);

_buildPreprocessSummary.Add("num_train_rows", numTrainRows);
_buildPreprocessSummary.Add("num_pretranslate_rows", numPretranslateRows);

return corpusSize;
}

Expand Down

0 comments on commit 9ec1e2a

Please sign in to comment.