Skip to content

Commit

Permalink
improve import performance (#953)
Browse files Browse the repository at this point in the history
* batch import entries as a single commit. Import Semantic domains, Parts of Speech.

* pull common code between FwDataMiniLcmHub.cs and CrdtMiniLcmApiHub.cs into MiniLcmApiHubBase.cs

* fix the writing system Copy function so that it preserves Order.

* enable LcmCrdt.Tests to log to debug with sensitive data from EF

* change home buttons into proper links

* configure EF to ignore Entry.Senses and Sense.ExampleSentences

* search filters crdt entries by gloss

* correct headword code to trim and switch to lexeme form when citation form is empty

* make GetPartsOfSpeech and GetSemanticDomains required methods on ILexboxApi

* delete crdt db if project creation throws exception

* setup bulk import of semantic domains to speed up importing large projects

* add some logging to give import feedback when doing bulk import

* display 2 units of precision in import complete log

* add explanation of workaround for Senses property in Entry
  • Loading branch information
hahn-kev authored Jul 17, 2024
1 parent 4594041 commit 6737890
Show file tree
Hide file tree
Showing 20 changed files with 382 additions and 241 deletions.
13 changes: 12 additions & 1 deletion backend/FwLite/FwDataMiniLcmBridge/Api/FwDataMiniLcmApi.cs
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ internal int GetWritingSystemHandle(WritingSystemId ws, WritingSystemType? type
return lcmWs.Handle;
}


internal CoreWritingSystemDefinition? GetLcmWritingSystem(WritingSystemId ws, WritingSystemType? type = null)
{
if (ws == "default")
Expand Down Expand Up @@ -156,6 +157,11 @@ public async IAsyncEnumerable<PartOfSpeech> GetPartsOfSpeech()
}
}

public async Task CreatePartOfSpeech(PartOfSpeech partOfSpeech)

Check warning on line 160 in backend/FwLite/FwDataMiniLcmBridge/Api/FwDataMiniLcmApi.cs

View workflow job for this annotation

GitHub Actions / Build FW Lite

This async method lacks 'await' operators and will run synchronously. Consider using the 'await' operator to await non-blocking API calls, or 'await Task.Run(...)' to do CPU-bound work on a background thread.

Check warning on line 160 in backend/FwLite/FwDataMiniLcmBridge/Api/FwDataMiniLcmApi.cs

View workflow job for this annotation

GitHub Actions / Build FW Lite

This async method lacks 'await' operators and will run synchronously. Consider using the 'await' operator to await non-blocking API calls, or 'await Task.Run(...)' to do CPU-bound work on a background thread.

Check warning on line 160 in backend/FwLite/FwDataMiniLcmBridge/Api/FwDataMiniLcmApi.cs

View workflow job for this annotation

GitHub Actions / Build API / publish-api

This async method lacks 'await' operators and will run synchronously. Consider using the 'await' operator to await non-blocking API calls, or 'await Task.Run(...)' to do CPU-bound work on a background thread.
{
throw new NotImplementedException();
}

public async IAsyncEnumerable<SemanticDomain> GetSemanticDomains()

Check warning on line 165 in backend/FwLite/FwDataMiniLcmBridge/Api/FwDataMiniLcmApi.cs

View workflow job for this annotation

GitHub Actions / Build FW Lite

This async method lacks 'await' operators and will run synchronously. Consider using the 'await' operator to await non-blocking API calls, or 'await Task.Run(...)' to do CPU-bound work on a background thread.

Check warning on line 165 in backend/FwLite/FwDataMiniLcmBridge/Api/FwDataMiniLcmApi.cs

View workflow job for this annotation

GitHub Actions / Build FW Lite

This async method lacks 'await' operators and will run synchronously. Consider using the 'await' operator to await non-blocking API calls, or 'await Task.Run(...)' to do CPU-bound work on a background thread.

Check warning on line 165 in backend/FwLite/FwDataMiniLcmBridge/Api/FwDataMiniLcmApi.cs

View workflow job for this annotation

GitHub Actions / Build API / publish-api

This async method lacks 'await' operators and will run synchronously. Consider using the 'await' operator to await non-blocking API calls, or 'await Task.Run(...)' to do CPU-bound work on a background thread.
{
foreach (var semanticDomain in SemanticDomainRepository.AllInstances().OrderBy(p => p.Name.BestAnalysisAlternative.Text))
Expand All @@ -164,11 +170,16 @@ public async IAsyncEnumerable<SemanticDomain> GetSemanticDomains()
{
Id = semanticDomain.Guid,
Name = FromLcmMultiString(semanticDomain.Name),
Code = semanticDomain.OcmCodes
Code = semanticDomain.OcmCodes ?? ""
};
}
}

public async Task CreateSemanticDomain(SemanticDomain semanticDomain)

Check warning on line 178 in backend/FwLite/FwDataMiniLcmBridge/Api/FwDataMiniLcmApi.cs

View workflow job for this annotation

GitHub Actions / Build FW Lite

This async method lacks 'await' operators and will run synchronously. Consider using the 'await' operator to await non-blocking API calls, or 'await Task.Run(...)' to do CPU-bound work on a background thread.

Check warning on line 178 in backend/FwLite/FwDataMiniLcmBridge/Api/FwDataMiniLcmApi.cs

View workflow job for this annotation

GitHub Actions / Build FW Lite

This async method lacks 'await' operators and will run synchronously. Consider using the 'await' operator to await non-blocking API calls, or 'await Task.Run(...)' to do CPU-bound work on a background thread.

Check warning on line 178 in backend/FwLite/FwDataMiniLcmBridge/Api/FwDataMiniLcmApi.cs

View workflow job for this annotation

GitHub Actions / Build API / publish-api

This async method lacks 'await' operators and will run synchronously. Consider using the 'await' operator to await non-blocking API calls, or 'await Task.Run(...)' to do CPU-bound work on a background thread.
{
throw new NotImplementedException();
}

internal ICmSemanticDomain GetLcmSemanticDomain(Guid semanticDomainId)
{
return SemanticDomainRepository.GetObject(semanticDomainId);
Expand Down
7 changes: 4 additions & 3 deletions backend/FwLite/LcmCrdt.Tests/Changes/JsonPatchChangeTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
using LcmCrdt.Changes;
using LcmCrdt.Objects;
using SystemTextJsonPatch;
using SystemTextJsonPatch.Operations;

namespace LcmCrdt.Tests.Changes;

Expand All @@ -14,7 +15,7 @@ public void NewChangeAction_ThrowsForRemoveAtIndex()
var act = () => new JsonPatchChange<Entry>(Guid.NewGuid(),
patch =>
{
patch.Remove(entry => entry.Senses, 1);
patch.Operations.Add(new Operation<Entry>("remove", "/senses/1", null, null));
});
act.Should().Throw<NotSupportedException>();
}
Expand All @@ -23,7 +24,7 @@ public void NewChangeAction_ThrowsForRemoveAtIndex()
public void NewChangeDirect_ThrowsForRemoveAtIndex()
{
var patch = new JsonPatchDocument<Entry>();
patch.Remove(entry => entry.Senses, 1);
patch.Operations.Add(new Operation<Entry>("remove", "/senses/1", null, null));
var act = () => new JsonPatchChange<Entry>(Guid.NewGuid(), patch);
act.Should().Throw<NotSupportedException>();
}
Expand All @@ -32,7 +33,7 @@ public void NewChangeDirect_ThrowsForRemoveAtIndex()
public void NewChangeIPatchDoc_ThrowsForRemoveAtIndex()
{
var patch = new JsonPatchDocument<Entry>();
patch.Remove(entry => entry.Senses, 1);
patch.Operations.Add(new Operation<Entry>("remove", "/senses/1", null, null));
var act = () => new JsonPatchChange<Entry>(Guid.NewGuid(), patch, JsonSerializerOptions.Default);
act.Should().Throw<NotSupportedException>();
}
Expand Down
1 change: 1 addition & 0 deletions backend/FwLite/LcmCrdt.Tests/LcmCrdt.Tests.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

<ItemGroup>
<PackageReference Include="coverlet.collector" Version="6.0.0"/>
<PackageReference Include="Microsoft.Extensions.Logging.Debug" Version="8.0.0" />
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.8.0"/>
<PackageReference Include="FluentAssertions" Version="6.12.0"/>
<PackageReference Include="xunit" Version="2.5.3"/>
Expand Down
10 changes: 10 additions & 0 deletions backend/FwLite/LcmCrdt.Tests/LexboxApiTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
using Microsoft.EntityFrameworkCore;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.DependencyInjection.Extensions;
using Microsoft.Extensions.Logging;
using MiniLcm;
using Entry = MiniLcm.Entry;
using ExampleSentence = MiniLcm.ExampleSentence;
Expand All @@ -26,6 +27,7 @@ public BasicApiTests()
{
var services = new ServiceCollection()
.AddLcmCrdtClient()
.AddLogging(builder => builder.AddDebug())
.RemoveAll(typeof(ProjectContext))
.AddSingleton<ProjectContext>(new MockProjectContext(new CrdtProject("sena-3", ":memory:")))
.BuildServiceProvider();
Expand Down Expand Up @@ -146,6 +148,14 @@ public async Task GetWritingSystems()
writingSystems.Analysis.Should().NotBeEmpty();
}

/// <summary>
/// After creating an additional vernacular writing system, no two vernacular
/// writing systems returned by the data model may share the same Order value.
/// </summary>
[Fact]
public async Task CreatingMultipleWritingSystems_DoesNotHaveDuplicateOrders()
{
    // Arrange / Act: add one vernacular writing system through the API.
    var addedWritingSystem = new WritingSystem
    {
        Id = "test-2",
        Name = "test",
        Abbreviation = "test",
        Font = "Arial",
        Exemplars = new[] { "test" }
    };
    await _api.CreateWritingSystem(WritingSystemType.Vernacular, addedWritingSystem);

    // Read back every vernacular writing system currently known to the data model.
    var vernacular = await DataModel.GetLatestObjects<Objects.WritingSystem>()
        .Where(ws => ws.Type == WritingSystemType.Vernacular)
        .ToArrayAsync();

    // Assert: each Order value is used by at most one writing system.
    var groupedByOrder = vernacular.GroupBy(ws => ws.Order);
    groupedByOrder.Should().NotContain(g => g.Count() > 1);
}

[Fact]
public async Task GetEntriesByExemplar()
{
Expand Down
16 changes: 8 additions & 8 deletions backend/FwLite/LcmCrdt.Tests/SerializationTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,14 @@ public void CanSerializeEntry()
LexemeForm = { Values = { { "en", "test" } } },
CitationForm = { Values = { { "en", "test" } } },
Senses =
{
[
new Sense
{
Id = Guid.NewGuid(),
EntryId = entryId,
Gloss = { Values = { { "en", "test" } } }
}
}
]
};
var act = () => JsonSerializer.Serialize(entry);
var json = act.Should().NotThrow().Subject;
Expand All @@ -44,7 +44,7 @@ public void CanDeserializeEntry()
LexemeForm = { Values = { { "en", "test" } } },
CitationForm = { Values = { { "en", "test" } } },
Senses =
{
[
new Sense
{
Id = senseId,
Expand All @@ -60,7 +60,7 @@ public void CanDeserializeEntry()
}
}
}
}
]
};
var json = JsonSerializer.Serialize(entry);
var act = () => JsonSerializer.Deserialize<Entry>(json);
Expand Down Expand Up @@ -94,29 +94,29 @@ public void EqualityTest()
LexemeForm = { Values = { { "en", "test" } } },
CitationForm = { Values = { { "en", "test" } } },
Senses =
{
[
new Sense
{
Id = senseId,
EntryId = entryId,
Gloss = { Values = { { "en", "test" } } }
}
}
]
};
var entryCopy = new Entry()
{
Id = entryId,
LexemeForm = { Values = { { "en", "test" } } },
CitationForm = { Values = { { "en", "test" } } },
Senses =
{
[
new Sense
{
Id = senseId,
EntryId = entryId,
Gloss = { Values = { { "en", "test" } } }
}
}
]
};
entry.Should().BeEquivalentTo(entryCopy);
}
Expand Down
52 changes: 50 additions & 2 deletions backend/FwLite/LcmCrdt/CrdtLexboxApi.cs
Original file line number Diff line number Diff line change
Expand Up @@ -73,11 +73,26 @@ public IAsyncEnumerable<PartOfSpeech> GetPartsOfSpeech()
return PartsOfSpeech.AsAsyncEnumerable();
}

/// <summary>
/// Adds a single <see cref="CreatePartOfSpeechChange"/> for the given part of speech
/// to the data model on behalf of this client.
/// </summary>
/// <param name="partOfSpeech">Supplies the id and name used to build the change.</param>
public async Task CreatePartOfSpeech(PartOfSpeech partOfSpeech)
{
    var createChange = new CreatePartOfSpeechChange(partOfSpeech.Id, partOfSpeech.Name, false);
    await dataModel.AddChange(ClientId, createChange);
}

/// <summary>
/// Exposes the <c>SemanticDomains</c> query as an asynchronous stream.
/// </summary>
public IAsyncEnumerable<MiniLcm.SemanticDomain> GetSemanticDomains() =>
    SemanticDomains.AsAsyncEnumerable();

/// <summary>
/// Adds a single <see cref="CreateSemanticDomainChange"/> for the given semantic domain
/// to the data model on behalf of this client.
/// </summary>
/// <param name="semanticDomain">Supplies the id, name and code used to build the change.</param>
public async Task CreateSemanticDomain(MiniLcm.SemanticDomain semanticDomain)
{
    var createChange = new CreateSemanticDomainChange(
        semanticDomain.Id,
        semanticDomain.Name,
        semanticDomain.Code);
    await dataModel.AddChange(ClientId, createChange);
}

/// <summary>
/// Turns every supplied semantic domain into a <see cref="CreateSemanticDomainChange"/>
/// and hands the whole sequence to the data model in one <c>AddChanges</c> call.
/// </summary>
/// <param name="semanticDomains">Semantic domains to import; enumerated lazily.</param>
public async Task BulkImportSemanticDomains(IEnumerable<MiniLcm.SemanticDomain> semanticDomains)
{
    var createChanges =
        from domain in semanticDomains
        select new CreateSemanticDomainChange(domain.Id, domain.Name, domain.Code);
    await dataModel.AddChanges(ClientId, createChanges);
}

public IAsyncEnumerable<MiniLcm.Entry> GetEntries(QueryOptions? options = null)
{
return GetEntriesAsyncEnum(predicate: null, options);
Expand All @@ -86,7 +101,12 @@ public IAsyncEnumerable<PartOfSpeech> GetPartsOfSpeech()
public IAsyncEnumerable<MiniLcm.Entry> SearchEntries(string? query, QueryOptions? options = null)
{
if (string.IsNullOrEmpty(query)) return GetEntriesAsyncEnum(null, options);
return GetEntriesAsyncEnum(e => e.LexemeForm.SearchValue(query) || e.CitationForm.SearchValue(query), options);

return GetEntriesAsyncEnum(e => e.LexemeForm.SearchValue(query)
|| e.CitationForm.SearchValue(query)
|| e.Senses.Any(s => s.Gloss.SearchValue(query))

, options);
}

private async IAsyncEnumerable<MiniLcm.Entry> GetEntriesAsyncEnum(
Expand Down Expand Up @@ -188,6 +208,35 @@ await dataModel.AddChanges(ClientId,
], deferCommit: true);
}

/// <summary>
/// Imports a stream of entries by converting each entry (with its senses and example
/// sentences) into changes and submitting them in a single <c>AddChanges</c> call.
/// </summary>
/// <param name="entries">Asynchronous stream of entries to import.</param>
/// <remarks>
/// The stream is consumed with <c>await foreach</c> rather than
/// <c>ToBlockingEnumerable()</c>, so the calling thread is not blocked while waiting
/// for the producer to yield the next entry.
/// </remarks>
public async Task BulkCreateEntries(IAsyncEnumerable<MiniLcm.Entry> entries)
{
    // Load the lookup tables once so each sense can be resolved without a query per entry.
    var semanticDomains = await SemanticDomains.ToDictionaryAsync(sd => sd.Id, sd => sd);
    var partsOfSpeech = await PartsOfSpeech.ToDictionaryAsync(p => p.Id, p => p);

    // Drain the async stream without blocking, collecting every change for one batch submit.
    var changes = new List<IChange>();
    await foreach (var entry in entries)
    {
        changes.AddRange(CreateEntryChanges(entry, semanticDomains, partsOfSpeech));
    }

    await dataModel.AddChanges(ClientId, changes);
}

/// <summary>
/// Lazily produces the changes needed to create one entry: the entry itself, then for
/// each sense a sense change followed by one change per example sentence.
/// </summary>
/// <param name="entry">Entry to convert; its senses are updated in place while enumerating.</param>
/// <param name="semanticDomains">Known semantic domains keyed by id.</param>
/// <param name="partsOfSpeech">Known parts of speech keyed by id.</param>
/// <returns>The changes in creation order (entry, sense, that sense's examples, next sense, ...).</returns>
private IEnumerable<IChange> CreateEntryChanges(MiniLcm.Entry entry, Dictionary<Guid, SemanticDomain> semanticDomains, Dictionary<Guid, Objects.PartOfSpeech> partsOfSpeech)
{
    yield return new CreateEntryChange(entry);

    foreach (var currentSense in entry.Senses)
    {
        // Keep only the semantic domains that exist in the lookup, replacing each
        // with the instance stored there; unknown ids are dropped.
        var knownDomains = new List<MiniLcm.SemanticDomain>();
        foreach (var referenced in currentSense.SemanticDomains)
        {
            if (semanticDomains.TryGetValue(referenced.Id, out var match) && match is not null)
            {
                knownDomains.Add(match);
            }
        }
        currentSense.SemanticDomains = knownDomains;

        // When the sense points at a known part of speech, copy its id and English name.
        if (currentSense.PartOfSpeechId is { } posId && partsOfSpeech.TryGetValue(posId, out var pos))
        {
            currentSense.PartOfSpeechId = pos.Id;
            currentSense.PartOfSpeech = pos.Name["en"] ?? string.Empty;
        }

        yield return new CreateSenseChange(currentSense, entry.Id);

        foreach (var example in currentSense.ExampleSentences)
        {
            yield return new CreateExampleSentenceChange(example, currentSense.Id);
        }
    }
}

public async Task<MiniLcm.Entry> CreateEntry(MiniLcm.Entry entry)
{
await dataModel.AddChanges(ClientId,
Expand Down Expand Up @@ -226,7 +275,6 @@ private async IAsyncEnumerable<IChange> CreateSenseChanges(Guid entryId, MiniLcm
sense.PartOfSpeech = partOfSpeech?.Name["en"] ?? string.Empty;
}


yield return new CreateSenseChange(sense, entryId);
foreach (var change in sense.ExampleSentences.Select(sentence =>
new CreateExampleSentenceChange(sentence, sense.Id)))
Expand Down
8 changes: 7 additions & 1 deletion backend/FwLite/LcmCrdt/LcmCrdtKernel.cs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@

namespace LcmCrdt;


public static class LcmCrdtKernel
{
public static IServiceCollection AddLcmCrdtClient(this IServiceCollection services)
Expand All @@ -40,6 +39,9 @@ private static void ConfigureDbOptions(IServiceProvider provider, DbContextOptio
{
var projectContext = provider.GetRequiredService<ProjectContext>();
if (projectContext.Project is null) throw new NullReferenceException("Project is null");
#if DEBUG
builder.EnableSensitiveDataLogging();
#endif
builder.UseSqlite($"Data Source={projectContext.Project.DbPath}")
.UseLinqToDB(optionsBuilder =>
{
Expand All @@ -48,6 +50,8 @@ private static void ConfigureDbOptions(IServiceProvider provider, DbContextOptio
nameof(Commit.HybridDateTime) + "." + nameof(HybridDateTime.DateTime)))
.HasAttribute<Commit>(new ColumnAttribute(nameof(HybridDateTime.Counter),
nameof(Commit.HybridDateTime) + "." + nameof(HybridDateTime.Counter)))
.Entity<Entry>().Property(e => e.Id)
.Association(e => (e.Senses as IEnumerable<Sense>)!, e => e.Id, s => s.EntryId)
.Build();
mappingSchema.SetConvertExpression((MiniLcm.WritingSystemId id) =>
new DataParameter { Value = id.Code, DataType = DataType.Text });
Expand All @@ -64,13 +68,15 @@ private static void ConfigureCrdt(CrdtConfig config)
config.ObjectTypeListBuilder
.Add<Entry>(builder =>
{
builder.Ignore(e => e.Senses);
// builder.OwnsOne(e => e.Note, n => n.ToJson());
// builder.OwnsOne(e => e.LexemeForm, n => n.ToJson());
// builder.OwnsOne(e => e.CitationForm, n => n.ToJson());
// builder.OwnsOne(e => e.LiteralMeaning, n => n.ToJson());
})
.Add<Sense>(builder =>
{
builder.Ignore(s => s.ExampleSentences);
builder.HasOne<Entry>()
.WithMany()
.HasForeignKey(sense => sense.EntryId);
Expand Down
21 changes: 18 additions & 3 deletions backend/FwLite/LcmCrdt/Objects/Entry.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using System.Linq.Expressions;
using System.Text.Json.Serialization;
using Crdt;
using Crdt.Entities;
using LinqToDB;
Expand All @@ -8,7 +9,6 @@ namespace LcmCrdt.Objects;

public class Entry : MiniLcm.Entry, IObjectBase<Entry>
{

Guid IObjectBase.Id
{
get => Id;
Expand All @@ -17,17 +17,32 @@ Guid IObjectBase.Id

public DateTimeOffset? DeletedAt { get; set; }

/// <summary>
/// Workaround: queries need to reference an entry's senses, and those must be CRDT
/// <see cref="Sense"/> objects, whereas the entry itself only stores them as MiniLcm senses.
/// Reading this property therefore converts each stored sense back to a CRDT sense
/// (senses that already are CRDT senses are returned unchanged).
/// Note: <c>[JsonIgnore]</c> applies to this property only; the Senses property in the
/// base class is still serialized.
/// </summary>
[JsonIgnore]
public new IReadOnlyList<Sense> Senses
{
    get => [..base.Senses.Select(stored => stored is Sense crdtSense ? crdtSense : Sense.FromMiniLcm(stored, Id))];
    set => base.Senses = [..value];
}


[ExpressionMethod(nameof(HeadwordExpression))]
public string Headword(WritingSystemId ws)
{
var word = CitationForm[ws];
if (string.IsNullOrEmpty(word)) word = LexemeForm[ws];
return word;
return word.Trim();
}

protected static Expression<Func<Entry, WritingSystemId, string?>> HeadwordExpression() =>
(e, ws) => Json.Value(e.CitationForm, ms => ms[ws]) ?? Json.Value(e.LexemeForm, ms => ms[ws]);
(e, ws) => (string.IsNullOrEmpty(Json.Value(e.CitationForm, ms => ms[ws])) ? Json.Value(e.LexemeForm, ms => ms[ws]) : Json.Value(e.CitationForm, ms => ms[ws]))!.Trim();

public Guid[] GetReferences()
{
Expand Down
14 changes: 14 additions & 0 deletions backend/FwLite/LcmCrdt/Objects/Sense.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,20 @@ namespace LcmCrdt.Objects;

public class Sense : MiniLcm.Sense, IObjectBase<Sense>
{
/// <summary>
/// Builds a CRDT <see cref="Sense"/> from a MiniLcm sense and the id of its owning entry.
/// </summary>
/// <param name="sense">Source sense; its property values are assigned as-is (nothing is cloned here).</param>
/// <param name="entryId">Id stored in <c>EntryId</c> on the new sense.</param>
/// <returns>A new <see cref="Sense"/> carrying the source's values plus <paramref name="entryId"/>.</returns>
public static Sense FromMiniLcm(MiniLcm.Sense sense, Guid entryId) => new()
{
    Id = sense.Id,
    Definition = sense.Definition,
    Gloss = sense.Gloss,
    PartOfSpeech = sense.PartOfSpeech,
    PartOfSpeechId = sense.PartOfSpeechId,
    SemanticDomains = sense.SemanticDomains,
    ExampleSentences = sense.ExampleSentences,
    EntryId = entryId
};
public static IEnumerable<IChange> ChangesFromJsonPatch(Sense sense, JsonPatchDocument<MiniLcm.Sense> patch)
{
foreach (var rewriteChange in patch.RewriteChanges(s => s.PartOfSpeechId,
Expand Down
3 changes: 2 additions & 1 deletion backend/FwLite/LcmCrdt/Objects/WritingSystem.cs
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ public IObjectBase Copy()
Font = Font,
Exemplars = Exemplars,
DeletedAt = DeletedAt,
Type = Type
Type = Type,
Order = Order
};
}

Expand Down
Loading

0 comments on commit 6737890

Please sign in to comment.