Skip to content

Commit

Permalink
Extract language tags from FLEx projects and store in DB metadata (#952)
Browse files Browse the repository at this point in the history
Extract language tags from FLEx projects and store them in the
FlexProjectMetadata table just like lex entry counts. Currently
we do not attempt to extract lang tags from WeSay projects.

Currently the language tags are extracted, but not analysed to try
to determine the primary language. Instead, the vernacular and analysis
tags are placed directly in the UI. The `isActive` flag means that the
tag appeared in the "Current" list, while the `isDefault` flag means
that the tag was the first one in the "Current" list. There should be
only one tag with `isDefault`.

* Add command to extract LangProject GUID as well

While we're adding hg runner commands, we can easily add one to extract
the GUID from the LangProject element, which may allow identifying
projects which started out as copies of each other even though their
Mercurial repo histories have no common commits.

* Add LangProjectId column to FlexProjectMetadata

Also include a DB migration and GQL schema update

* Make writing system ordering match what FLEx does

The order in which FLEx returns writing systems is current first, then
all non-current (if any) at the end. We want what we store in the
project metadata to match that ordering.

* Add UI for writing systems on the project page

* Add button for admins to refresh language list

* Add project controller actions to update missing LangProjectId and WritingSystems

---------

Co-authored-by: Kevin Hahn <kevin_hahn@sil.org>
  • Loading branch information
rmunn and hahn-kev authored Jul 23, 2024
1 parent 6d6e539 commit 3b5c99a
Show file tree
Hide file tree
Showing 22 changed files with 3,274 additions and 14 deletions.
70 changes: 66 additions & 4 deletions backend/LexBoxApi/Controllers/ProjectController.cs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ public class ProjectController(
ISchedulerFactory scheduler)
: ControllerBase
{

[HttpPost("refreshProjectLastChanged")]
public async Task<ActionResult> RefreshProjectLastChanged(string projectCode)
{
Expand Down Expand Up @@ -55,6 +54,7 @@ public async Task<ActionResult> UpdateAllRepoCommitDates(bool onlyUnknown)
{
project.LastCommit = await hgService.GetLastCommitTimeFromHg(project.Code);
}

await lexBoxDbContext.SaveChangesAsync();

return Ok();
Expand All @@ -73,14 +73,18 @@ public async Task<ActionResult<Project>> UpdateProjectType(Guid id)
project.Type = await hgService.DetermineProjectType(project.Code);
await lexBoxDbContext.SaveChangesAsync();
}

return project;
}

[HttpPost("setProjectType")]
[AdminRequired]
public async Task<ActionResult> SetProjectType(string projectCode, ProjectType projectType, bool overrideKnown = false)
public async Task<ActionResult> SetProjectType(string projectCode,
ProjectType projectType,
bool overrideKnown = false)
{
await lexBoxDbContext.Projects.Where(p => p.Code == projectCode && (p.Type == ProjectType.Unknown || overrideKnown))
await lexBoxDbContext.Projects
.Where(p => p.Code == projectCode && (p.Type == ProjectType.Unknown || overrideKnown))
.ExecuteUpdateAsync(u => u.SetProperty(p => p.Type, projectType));
return Ok();
}
Expand All @@ -107,7 +111,9 @@ public async Task<ActionResult<ProjectType>> DetermineProjectType(Guid id)
[ProducesResponseType(StatusCodes.Status200OK)]
[ProducesResponseType(StatusCodes.Status404NotFound)]
[AdminRequired]
public async Task<ActionResult<Dictionary<string, ProjectType>>> UpdateProjectTypesForUnknownProjects(int limit = 50, int offset = 0)
public async Task<ActionResult<Dictionary<string, ProjectType>>> UpdateProjectTypesForUnknownProjects(int limit =
50,
int offset = 0)
{
var projects = lexBoxDbContext.Projects
.Where(p => p.Type == ProjectType.Unknown)
Expand All @@ -121,6 +127,7 @@ public async Task<ActionResult<Dictionary<string, ProjectType>>> UpdateProjectTy
project.Type = await hgService.DetermineProjectType(project.Code);
result.Add(project.Code, project.Type);
}

await lexBoxDbContext.SaveChangesAsync();
return result;
}
Expand Down Expand Up @@ -176,6 +183,7 @@ public async Task<ActionResult<Project>> DeleteProject(Guid id)
}

/// <summary>
/// Record with a single string property, <c>Response</c>.
/// NOTE(review): presumably carries the text output of an hg command; confirm against its usages.
/// </summary>
public record HgCommandResponse(string Response);

[HttpGet("hgVerify/{code}")]
[AdminRequired]
[ProducesResponseType(StatusCodes.Status200OK)]
Expand All @@ -191,6 +199,7 @@ public async Task HgVerify(string code)
await Response.CompleteAsync();
return;
}

var result = await hgService.VerifyRepo(code, HttpContext.RequestAborted);
await StreamHttpResponse(result);
}
Expand All @@ -210,6 +219,7 @@ public async Task HgRecover(string code)
await Response.CompleteAsync();
return;
}

var result = await hgService.ExecuteHgRecover(code, HttpContext.RequestAborted);
await StreamHttpResponse(result);
}
Expand All @@ -232,6 +242,58 @@ public async Task<ActionResult<int>> UpdateLexEntryCount(string code)
return result is null ? NotFound() : result;
}

/// <summary>
/// Resolves the project ID for the given project code and asks the project service
/// to update that project's language tags.
/// </summary>
/// <param name="code">Project code taken from the route.</param>
[HttpPost("updateLanguageList/{code}")]
[ProducesResponseType(StatusCodes.Status200OK)]
[ProducesResponseType(StatusCodes.Status404NotFound)]
[ProducesDefaultResponseType]
public async Task UpdateLanguageList(string code)
{
    await projectService.UpdateProjectLangTags(await projectService.LookupProjectId(code));
}

/// <summary>
/// Fetches writing systems from hg for up to <paramref name="limit"/> FLEx projects that have a
/// last commit but no stored writing systems, saves them, and returns the codes of the projects visited.
/// </summary>
/// <param name="limit">Maximum number of projects to process in one call.</param>
[HttpPost("updateMissingLanguageList")]
public async Task<ActionResult<string[]>> UpdateMissingLanguageList(int limit = 10)
{
    var candidates = lexBoxDbContext.Projects
        .Include(p => p.FlexProjectMetadata)
        .Where(p => p.Type == ProjectType.FLEx
                    && p.LastCommit != null
                    && p.FlexProjectMetadata!.WritingSystems == null)
        .Take(limit)
        .AsAsyncEnumerable();

    var visitedCodes = new List<string>(limit);
    await foreach (var flexProject in candidates)
    {
        visitedCodes.Add(flexProject.Code);
        // Projects without a metadata row get one created on the fly.
        var metadata = flexProject.FlexProjectMetadata ??= new FlexProjectMetadata();
        metadata.WritingSystems = await hgService.GetProjectWritingSystems(flexProject.Code);
    }

    // All changes are persisted in a single save once every project has been visited.
    await lexBoxDbContext.SaveChangesAsync();
    return Ok(visitedCodes);
}

/// <summary>
/// Fetches the LangProject GUID from hg for up to <paramref name="limit"/> FLEx projects that have a
/// last commit but no stored LangProjectId, saves them, and returns the codes of the projects visited.
/// </summary>
/// <param name="limit">Maximum number of projects to process in one call.</param>
[HttpPost("updateMissingLangProjectId")]
public async Task<ActionResult<string[]>> UpdateMissingLangProjectId(int limit = 10)
{
    var candidates = lexBoxDbContext.Projects
        .Include(p => p.FlexProjectMetadata)
        .Where(p => p.Type == ProjectType.FLEx
                    && p.LastCommit != null
                    && p.FlexProjectMetadata!.LangProjectId == null)
        .Take(limit)
        .AsAsyncEnumerable();

    var visitedCodes = new List<string>(limit);
    await foreach (var flexProject in candidates)
    {
        visitedCodes.Add(flexProject.Code);
        // Projects without a metadata row get one created on the fly.
        var metadata = flexProject.FlexProjectMetadata ??= new FlexProjectMetadata();
        metadata.LangProjectId = await hgService.GetProjectIdOfFlexProject(flexProject.Code);
    }

    // All changes are persisted in a single save once every project has been visited.
    await lexBoxDbContext.SaveChangesAsync();
    return Ok(visitedCodes);
}

[HttpPost("queueUpdateProjectMetadataTask")]
public async Task<ActionResult> QueueUpdateProjectMetadataTask(string projectCode)
{
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
using HotChocolate.Data.Sorting;
using LexCore.Entities;

namespace LexBoxApi.GraphQL.CustomTypes;

/// <summary>
/// Sort input type for <see cref="FlexProjectMetadata"/> that excludes the
/// WritingSystems field from the generated sort input.
/// </summary>
public class FlexProjectMetadataGqlSortConfiguration : SortInputType<FlexProjectMetadata>
{
    protected override void Configure(ISortInputTypeDescriptor<FlexProjectMetadata> descriptor)
        => descriptor.Field(metadata => metadata.WritingSystems).Ignore();
}
54 changes: 53 additions & 1 deletion backend/LexBoxApi/Services/HgService.cs
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,58 @@ await Task.Run(() =>
});
}

/// <summary>
/// Runs the "flexwritingsystems" hg command for the project and wraps its output in a LangTags root element.
/// Returns either an empty string (when the command produced no output), or XML (in string form) with a root
/// LangTags element containing five child elements: AnalysisWss, CurAnalysisWss, VernWss, CurVernWss, and CurPronunWss.
/// Each child element will contain a single `<Uni>` element whose text content is a list of tags separated by spaces.
/// </summary>
private async Task<string> GetLangTagsAsXml(ProjectCode code, CancellationToken token = default)
{
    var response = await ExecuteHgCommandServerCommand(code, "flexwritingsystems", token);
    var body = await response.ReadAsStringAsync(token);
    return string.IsNullOrEmpty(body)
        ? string.Empty
        : $"<LangTags>{body}</LangTags>";
}

/// <summary>
/// Reads the text of the `Uni` element nested inside the child element named <paramref name="tagName"/>
/// and splits it on whitespace into individual tags. Returns an empty array when either element is missing.
/// </summary>
private string[] GetWsList(System.Xml.XmlElement root, string tagName)
{
    var uniElement = root[tagName]?["Uni"];
    var tagList = uniElement?.InnerText ?? "";
    // A null separator array makes Split break on any whitespace; the typed local lets the
    // compiler pick the char[] overload instead of the string one.
    char[]? anyWhitespace = null;
    return tagList.Split(anyWhitespace, StringSplitOptions.RemoveEmptyEntries);
}

/// <summary>
/// Builds the vernacular and analysis writing system lists for a project from the language tag XML.
/// Returns null when no XML was produced or the parsed document has no root element.
/// </summary>
public async Task<ProjectWritingSystems?> GetProjectWritingSystems(ProjectCode code, CancellationToken token = default)
{
    var xml = await GetLangTagsAsXml(code, token);
    if (string.IsNullOrEmpty(xml))
    {
        return null;
    }

    var document = new System.Xml.XmlDocument();
    document.LoadXml(xml);
    if (document.DocumentElement is not { } langTags)
    {
        return null;
    }

    return new ProjectWritingSystems
    {
        VernacularWss = OrderLikeFlex(GetWsList(langTags, "CurVernWss"), GetWsList(langTags, "VernWss")),
        AnalysisWss = OrderLikeFlex(GetWsList(langTags, "CurAnalysisWss"), GetWsList(langTags, "AnalysisWss"))
    };

    // Ordering is important here to match how FLEx handles things: all *current* writing systems first,
    // then all *non-current*. Only the first current tag is flagged as the default.
    static List<FLExWsId> OrderLikeFlex(string[] current, string[] all)
    {
        var currentTags = current.ToHashSet();
        var ordered = new List<FLExWsId>();
        for (var i = 0; i < current.Length; i++)
        {
            ordered.Add(new FLExWsId { Tag = current[i], IsActive = true, IsDefault = i == 0 });
        }

        foreach (var tag in all)
        {
            if (!currentTags.Contains(tag))
            {
                ordered.Add(new FLExWsId { Tag = tag, IsActive = false, IsDefault = false });
            }
        }

        return ordered;
    }
}

/// <summary>
/// Runs the "flexprojectid" hg command for the project and parses its output as a GUID.
/// Returns null when the output is not a valid GUID.
/// </summary>
public async Task<Guid?> GetProjectIdOfFlexProject(ProjectCode code, CancellationToken token = default)
{
    var response = await ExecuteHgCommandServerCommand(code, "flexprojectid", token);
    var body = await response.ReadAsStringAsync(token);
    return Guid.TryParse(body, out var langProjectId) ? langProjectId : (Guid?)null;
}

public Task RevertRepo(ProjectCode code, string revHash)
{
Expand Down Expand Up @@ -254,11 +306,11 @@ public async Task<Changeset[]> GetChangesets(ProjectCode projectCode)
return logResponse?.Changesets ?? Array.Empty<Changeset>();
}


/// <summary>
/// Runs the "verify" hg command for the project and returns the resulting HTTP content.
/// </summary>
public Task<HttpContent> VerifyRepo(ProjectCode code, CancellationToken token)
    => ExecuteHgCommandServerCommand(code, "verify", token);

public async Task<HttpContent> ExecuteHgRecover(ProjectCode code, CancellationToken token)
{
var response = await ExecuteHgCommandServerCommand(code, "recover", token);
Expand Down
23 changes: 23 additions & 0 deletions backend/LexBoxApi/Services/ProjectService.cs
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,29 @@ public async Task<Guid> CreateProject(CreateProjectInput input)
return projectId;
}

/// <summary>
/// Re-reads the writing systems of a FLEx project from hg and stores them in its metadata.
/// Does nothing when the project is missing, is not a FLEx project, or no writing systems could be read.
/// </summary>
public async Task UpdateProjectLangTags(Guid projectId)
{
    var project = await dbContext.Projects.FindAsync(projectId);
    if (project is not { Type: ProjectType.FLEx })
    {
        return;
    }

    await dbContext.Entry(project).Reference(p => p.FlexProjectMetadata).LoadAsync();

    var writingSystems = await hgService.GetProjectWritingSystems(project.Code);
    if (writingSystems is null)
    {
        return;
    }

    // Create the metadata row on first use, then attach the freshly read writing systems.
    var metadata = project.FlexProjectMetadata ??= new FlexProjectMetadata();
    metadata.WritingSystems = writingSystems;
    await dbContext.SaveChangesAsync();
}

/// <summary>
/// Re-reads the LangProject GUID of a FLEx project from hg and stores it in the project's metadata.
/// Does nothing when the project is missing, is not a FLEx project, or no GUID could be read.
/// </summary>
/// <param name="projectId">ID of the LexBox project (not the LangProject GUID).</param>
public async Task UpdateProjectLangProjectId(Guid projectId)
{
    var project = await dbContext.Projects.FindAsync(projectId);
    if (project is null || project.Type != ProjectType.FLEx) return;
    await dbContext.Entry(project).Reference(p => p.FlexProjectMetadata).LoadAsync();
    var langProjGuid = await hgService.GetProjectIdOfFlexProject(project.Code);
    // GetProjectIdOfFlexProject returns null when the hg output is not a parseable GUID. In that case
    // keep whatever is already stored instead of overwriting it with null (and avoid creating an empty
    // metadata row), mirroring how UpdateProjectLangTags treats a null result.
    if (langProjGuid is null) return;
    project.FlexProjectMetadata ??= new FlexProjectMetadata();
    project.FlexProjectMetadata.LangProjectId = langProjGuid;
    await dbContext.SaveChangesAsync();
}

public async Task<Guid> CreateDraftProject(CreateProjectInput input)
{
// No need for a transaction if we're just saving a single item
Expand Down
18 changes: 18 additions & 0 deletions backend/LexCore/Entities/FlexProjectMetadata.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,22 @@ public class FlexProjectMetadata
{
public Guid ProjectId { get; set; }
public int? LexEntryCount { get; set; }
/// <summary>
/// GUID from the LangProject element, which is not the same as the ID of the LexBox project
/// </summary>
public Guid? LangProjectId { get; set; }
public ProjectWritingSystems? WritingSystems { get; set; }
}

/// <summary>
/// The vernacular and analysis writing systems extracted from a FLEx project.
/// </summary>
public class ProjectWritingSystems
{
    /// <summary>Writing system tags from the project's vernacular lists.</summary>
    public required List<FLExWsId> VernacularWss { get; set; } = [];
    /// <summary>Writing system tags from the project's analysis lists.</summary>
    public required List<FLExWsId> AnalysisWss { get; set; } = [];
}

/// <summary>
/// A single writing system tag together with flags describing its place in the project's "Current" list.
/// </summary>
public class FLExWsId
{
    /// <summary>The writing system tag text.</summary>
    public required string Tag { get; set; }
    /// <summary>True when the tag appeared in the "Current" list.</summary>
    public bool IsActive { get; set; }
    /// <summary>True when the tag was the first one in the "Current" list; there should be only one such tag per list.</summary>
    public bool IsDefault { get; set; }
}
2 changes: 2 additions & 0 deletions backend/LexCore/ServiceInterfaces/IHgService.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ public interface IHgService
Task<ProjectType> DetermineProjectType(ProjectCode projectCode);
Task DeleteRepo(ProjectCode code);
Task SoftDeleteRepo(ProjectCode code, string deletedRepoSuffix);
Task<ProjectWritingSystems?> GetProjectWritingSystems(ProjectCode code, CancellationToken token = default);
Task<Guid?> GetProjectIdOfFlexProject(ProjectCode code, CancellationToken token = default);
BackupExecutor? BackupRepo(ProjectCode code);
Task ResetRepo(ProjectCode code);
Task FinishReset(ProjectCode code, Stream zipFile);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,11 @@ public class FlexProjectMetadataEntityConfiguration: IEntityTypeConfiguration<Fl
/// <summary>
/// Configures the FlexProjectMetadata entity: ProjectId is the key, and the owned WritingSystems
/// value (with its two owned collections) is mapped via ToJson.
/// </summary>
public virtual void Configure(EntityTypeBuilder<FlexProjectMetadata> builder)
{
    builder.HasKey(metadata => metadata.ProjectId);
    builder.OwnsOne(metadata => metadata.WritingSystems, writingSystems =>
    {
        writingSystems.ToJson();
        writingSystems.OwnsMany(ws => ws.AnalysisWss);
        writingSystems.OwnsMany(ws => ws.VernacularWss);
    });
}
}
Loading

0 comments on commit 3b5c99a

Please sign in to comment.