Skip to content

Commit 62e2c19

Browse files
hahn-kev and myieye authored
use custom collation for sorting LcmCrdt data (#1291)
* define custom collations per writing system based on wsId and use for sorting * use a span-based comparison overload for custom collation to avoid allocating strings * convert headword to lowercase when sorting in mongo --------- Co-authored-by: Tim Haasdyk <tim_haasdyk@sil.org>
1 parent 94ac2a7 commit 62e2c19

File tree

9 files changed

+293
-6
lines changed

9 files changed

+293
-6
lines changed
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
using FwDataMiniLcmBridge.Tests.Fixtures;
2+
3+
namespace FwDataMiniLcmBridge.Tests.MiniLcmTests;
4+
5+
/// <summary>
/// Runs the shared <see cref="SortingTestsBase"/> tests against an API created by
/// <see cref="ProjectLoaderFixture"/>.
/// </summary>
[Collection(ProjectLoaderFixture.Name)]
public class SortingTests(ProjectLoaderFixture fixture) : SortingTestsBase
{
    /// <summary>
    /// Creates the API under test from a new "sorting-test" project.
    /// </summary>
    /// <returns>A completed task holding the new project API.</returns>
    protected override Task<IMiniLcmApi> NewApi()
    {
        // NOTE(review): the two "en" arguments are presumably the vernacular and analysis
        // writing systems of the new project - confirm against NewProjectApi.
        IMiniLcmApi projectApi = fixture.NewProjectApi("sorting-test", "en", "en");
        return Task.FromResult(projectApi);
    }
}
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
namespace LcmCrdt.Tests.MiniLcmTests;
2+
3+
/// <summary>
/// Runs the shared <see cref="SortingTestsBase"/> tests against the API exposed by a
/// <see cref="MiniLcmApiFixture"/> owned by this test class.
/// </summary>
public class SortingTests : SortingTestsBase
{
    private readonly MiniLcmApiFixture _fixture = new();

    /// <summary>
    /// Initializes the fixture and hands back its API.
    /// </summary>
    protected override async Task<IMiniLcmApi> NewApi()
    {
        await _fixture.InitializeAsync();
        return _fixture.Api;
    }

    /// <summary>
    /// Disposes the base test state first, then the fixture created by this class.
    /// </summary>
    public override async Task DisposeAsync()
    {
        await base.DisposeAsync();
        await _fixture.DisposeAsync();
    }
}

backend/FwLite/LcmCrdt/CrdtMiniLcmApi.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -233,15 +233,15 @@ private async IAsyncEnumerable<Entry> GetEntries(
233233
queryable = queryable.WhereExemplar(ws.Value, options.Exemplar.Value);
234234
}
235235

236-
var sortWs = (await GetWritingSystem(options.Order.WritingSystem, WritingSystemType.Vernacular))?.WsId;
236+
var sortWs = (await GetWritingSystem(options.Order.WritingSystem, WritingSystemType.Vernacular));
237237
if (sortWs is null)
238238
throw new NullReferenceException($"sort writing system {options.Order.WritingSystem} not found");
239239
queryable = queryable
240240
.LoadWith(e => e.Senses).ThenLoad(s => s.ExampleSentences)
241241
.LoadWith(e => e.ComplexForms)
242242
.LoadWith(e => e.Components)
243243
.AsQueryable()
244-
.OrderBy(e => e.Headword(sortWs.Value))
244+
.OrderBy(e => e.Headword(sortWs.WsId).CollateUnicode(sortWs))
245245
.ThenBy(e => e.Id)
246246
.Skip(options.Offset)
247247
.Take(options.Count);
Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
using System.Data;
2+
using System.Data.Common;
3+
using System.Globalization;
4+
using System.Text;
5+
using Microsoft.Data.Sqlite;
6+
using Microsoft.EntityFrameworkCore;
7+
using Microsoft.EntityFrameworkCore.Diagnostics;
8+
using Microsoft.Extensions.Caching.Memory;
9+
using Microsoft.Extensions.Logging;
10+
11+
namespace LcmCrdt.Data;
12+
13+
/// <summary>
/// EF Core interceptor that registers custom SQLite collations on a connection:
/// one collation per writing system (named by <c>SqlSortingExtensions.CollationName</c>)
/// and the general purpose <c>SqlSortingExtensions.CollateUnicodeNoCase</c> collation.
/// Collations are registered when a connection is opened and again for every writing system
/// that is added or modified in a save.
/// </summary>
public class SetupCollationInterceptor(IMemoryCache cache, ILogger<SetupCollationInterceptor> logger) : IDbConnectionInterceptor, ISaveChangesInterceptor
{
    /// <summary>
    /// Largest decoded string (in chars) that the collation callback decodes into a stack buffer.
    /// Longer values use a pooled array so an arbitrarily long value can not overflow the stack.
    /// </summary>
    private const int MaxStackAllocChars = 256;

    /// <summary>Sliding lifetime of the cached writing system list.</summary>
    private static readonly TimeSpan WritingSystemsCacheLifetime = TimeSpan.FromMinutes(30);

    /// <summary>
    /// Returns the writing systems for the given connection, using the memory cache when possible.
    /// </summary>
    /// <param name="dbContext">Context used to query the writing systems on a cache miss.</param>
    /// <param name="connection">Connection whose connection string identifies the cache entry.</param>
    /// <returns>
    /// The writing systems, or an empty array when they could not be loaded.
    /// A failed load is not cached, so the next call queries the database again.
    /// </returns>
    private WritingSystem[] GetWritingSystems(LcmCrdtDbContext dbContext, DbConnection connection)
    {
        //todo this needs to be invalidated when the writing systems change
        var key = CacheKey(connection);
        if (cache.TryGetValue(key, out WritingSystem[]? cached) && cached is not null)
        {
            return cached;
        }

        WritingSystem[] loaded;
        try
        {
            loaded = dbContext.WritingSystems.ToArray();
        }
        catch (SqliteException e)
        {
            // NOTE(review): presumably this happens when the schema has not been created yet - confirm.
            // The empty result is deliberately not cached, otherwise no per writing system
            // collation would be registered on new connections until the entry expires.
            logger.LogDebug(e, "Unable to load writing systems, no writing system collations will be registered");
            return [];
        }

        cache.Set(key, loaded, new MemoryCacheEntryOptions { SlidingExpiration = WritingSystemsCacheLifetime });
        return loaded;
    }

    /// <summary>Builds the cache key for the writing systems of a connection.</summary>
    private static string CacheKey(DbConnection connection)
    {
        return $"writingSystems|{connection.ConnectionString}";
    }

    /// <summary>Removes the cached writing systems of the given connection.</summary>
    private void InvalidateWritingSystemsCache(DbConnection connection)
    {
        cache.Remove(CacheKey(connection));
    }

    /// <summary>
    /// Registers the writing system collations and the general use collation on a newly opened connection.
    /// </summary>
    /// <exception cref="InvalidOperationException">The event data carries no context.</exception>
    public void ConnectionOpened(DbConnection connection, ConnectionEndEventData eventData)
    {
        var context = (LcmCrdtDbContext?)eventData.Context;
        if (context is null) throw new InvalidOperationException("context is null");
        var sqliteConnection = (SqliteConnection)connection;
        SetupCollations(sqliteConnection, GetWritingSystems(context, connection));

        //setup general use collation
        sqliteConnection.CreateCollation(SqlSortingExtensions.CollateUnicodeNoCase,
            CultureInfo.CurrentCulture.CompareInfo,
            (compareInfo, x, y) => compareInfo.Compare(x, y, CompareOptions.IgnoreCase));
    }

    /// <summary>
    /// Async counterpart of <see cref="ConnectionOpened"/>; the work is done synchronously.
    /// </summary>
    public Task ConnectionOpenedAsync(DbConnection connection,
        ConnectionEndEventData eventData,
        CancellationToken cancellationToken = default)
    {
        ConnectionOpened(connection, eventData);
        return Task.CompletedTask;
    }

    /// <summary>Refreshes collations for changed writing systems before the save proceeds.</summary>
    public InterceptionResult<int> SavingChanges(DbContextEventData eventData, InterceptionResult<int> result)
    {
        UpdateCollationsOnSave(eventData.Context);
        return result;
    }

    /// <summary>
    /// Async counterpart of <see cref="SavingChanges"/>; the work is done synchronously.
    /// </summary>
    public ValueTask<InterceptionResult<int>> SavingChangesAsync(DbContextEventData eventData,
        InterceptionResult<int> result,
        CancellationToken cancellationToken = default)
    {
        UpdateCollationsOnSave(eventData.Context);
        return ValueTask.FromResult(result);
    }

    /// <summary>
    /// Registers a collation for every tracked writing system that is added or modified,
    /// and invalidates the cached writing system list when at least one was found.
    /// </summary>
    private void UpdateCollationsOnSave(DbContext? dbContext)
    {
        if (dbContext is null) return;
        var connection = (SqliteConnection)dbContext.Database.GetDbConnection();
        bool updateWs = false;
        foreach (var entityEntry in dbContext.ChangeTracker.Entries<WritingSystem>())
        {
            if (entityEntry.State is EntityState.Added or EntityState.Modified)
            {
                var writingSystem = entityEntry.Entity;
                // NOTE(review): SetupCollation throws InvalidOperationException when the connection
                // is not open - confirm that saves touching writing systems always run on an open connection.
                SetupCollation(connection, writingSystem);
                updateWs = true;
            }
        }

        if (updateWs)
        {
            InvalidateWritingSystemsCache(connection);
        }
    }

    /// <summary>Registers a collation for each of the given writing systems.</summary>
    private void SetupCollations(SqliteConnection connection, WritingSystem[] writingSystems)
    {
        foreach (var writingSystem in writingSystems)
        {
            SetupCollation(connection, writingSystem);
        }
    }

    /// <summary>
    /// Registers the case-insensitive collation of a single writing system, based on the culture
    /// created from its WsId code. Falls back to the invariant culture (and logs an error)
    /// when no culture can be created for that code.
    /// </summary>
    private void SetupCollation(SqliteConnection connection, WritingSystem writingSystem)
    {
        CompareInfo compareInfo;
        try
        {
            //todo use ICU/SLDR instead
            compareInfo = CultureInfo.CreateSpecificCulture(writingSystem.WsId.Code).CompareInfo;
        }
        catch (Exception e)
        {
            logger.LogError(e, "Failed to create compare info for '{WritingSystemId}'", writingSystem.WsId);
            compareInfo = CultureInfo.InvariantCulture.CompareInfo;
        }

        //todo use custom comparison based on the writing system
        CreateSpanCollation(connection, SqlSortingExtensions.CollationName(writingSystem),
            compareInfo,
            static (compareInfo, x, y) => compareInfo.Compare(x, y, CompareOptions.IgnoreCase));
    }

    //this is a premature optimization, but it avoids creating strings for each comparison and instead uses spans which avoids allocations
    //if the new comparison function does not support spans then we can use SqliteConnection.CreateCollation instead which works with strings
    /// <summary>
    /// Registers a collation whose comparison receives the two values as char spans
    /// decoded from the UTF-8 bytes handed over by SQLite.
    /// </summary>
    /// <param name="connection">Open connection to register the collation on.</param>
    /// <param name="name">Name of the collation.</param>
    /// <param name="state">State object passed to every <paramref name="compare"/> call.</param>
    /// <param name="compare">Comparison returning the usual negative/zero/positive result.</param>
    /// <exception cref="InvalidOperationException">The connection is not open.</exception>
    private void CreateSpanCollation<T>(SqliteConnection connection,
        string name, T state,
        Func<T, ReadOnlySpan<char>, ReadOnlySpan<char>, int> compare)
    {
        if (connection.State != ConnectionState.Open)
            throw new InvalidOperationException("Unable to create custom collation Connection must be open.");
        var rc = SQLitePCL.raw.sqlite3__create_collation_utf8(connection.Handle,
            name,
            Tuple.Create(state, compare),
            static (s, x, y) =>
            {
                var (collationState, comparer) = (Tuple<T, Func<T, ReadOnlySpan<char>, ReadOnlySpan<char>, int>>) s;
                var xCount = Encoding.UTF8.GetCharCount(x);
                var yCount = Encoding.UTF8.GetCharCount(y);

                // Only small values are decoded on the stack; the size comes from the data being
                // compared, so an unbounded stackalloc could overflow the stack on a long value.
                var pool = System.Buffers.ArrayPool<char>.Shared;
                char[]? xRented = null;
                char[]? yRented = null;
                try
                {
                    if (xCount > MaxStackAllocChars)
                    {
                        xRented = pool.Rent(xCount);
                    }

                    if (yCount > MaxStackAllocChars)
                    {
                        yRented = pool.Rent(yCount);
                    }

                    // rented arrays may be longer than requested, so they are sliced to the exact length
                    Span<char> xSpan = xRented is null ? stackalloc char[xCount] : xRented.AsSpan(0, xCount);
                    Span<char> ySpan = yRented is null ? stackalloc char[yCount] : yRented.AsSpan(0, yCount);
                    Encoding.UTF8.GetChars(x, xSpan);
                    Encoding.UTF8.GetChars(y, ySpan);

                    return comparer(collationState, xSpan, ySpan);
                }
                finally
                {
                    if (xRented is not null)
                    {
                        pool.Return(xRented);
                    }

                    if (yRented is not null)
                    {
                        pool.Return(yRented);
                    }
                }
            });
        SqliteException.ThrowExceptionForRC(rc, connection.Handle);
    }
}
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
using System.Data.SQLite;
2+
using System.Linq.Expressions;
3+
using LinqToDB;
4+
using SIL.WritingSystems;
5+
6+
namespace LcmCrdt.Data;
7+
8+
/// <summary>
/// Helpers for applying the custom collations in LinqToDB queries.
/// </summary>
public static class SqlSortingExtensions
{
    /// <summary>Name of the general purpose collation.</summary>
    public const string CollateUnicodeNoCase = "NOCASE_UNICODE";

    /// <summary>
    /// Query-only marker: in a LinqToDB query it is replaced by the expression returned from
    /// <see cref="CollateUnicodeExpression"/>. Calling it directly always throws.
    /// </summary>
    /// <exception cref="InvalidOperationException">Always, when invoked outside of a query.</exception>
    [ExpressionMethod(nameof(CollateUnicodeExpression))]
    internal static string CollateUnicode(this string value, WritingSystem ws) =>
        //could optionally just return the value here, but it would work differently than sql
        throw new InvalidOperationException("CollateUnicode is a LinqToDB only API.");

    /// <summary>
    /// Expression substituted for <see cref="CollateUnicode"/>: applies the collation
    /// named by <see cref="CollationName"/> for the writing system to the value.
    /// </summary>
    private static Expression<Func<string, WritingSystem, string>> CollateUnicodeExpression() =>
        //todo maybe in the future we use a custom collation based on the writing system
        (text, writingSystem) => text.Collate(CollationName(writingSystem));

    /// <summary>Builds the collation name for a writing system from its WsId.</summary>
    internal static string CollationName(WritingSystem ws) =>
        //don't use ':' in the name, it won't work
        $"NOCASE_WS_{ws.WsId}";
}

backend/FwLite/LcmCrdt/LcmCrdtDbContext.cs

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,25 @@
1-
using System.Text.Json;
1+
using System.Data.Common;
2+
using System.Text.Json;
3+
using LcmCrdt.Data;
4+
using Microsoft.Data.Sqlite;
25
using SIL.Harmony;
36
using SIL.Harmony.Db;
47
using Microsoft.EntityFrameworkCore;
8+
using Microsoft.EntityFrameworkCore.Diagnostics;
59
using Microsoft.EntityFrameworkCore.Storage.ValueConversion;
610
using Microsoft.Extensions.Options;
711

812
namespace LcmCrdt;
913

10-
public class LcmCrdtDbContext(DbContextOptions<LcmCrdtDbContext> dbContextOptions, IOptions<CrdtConfig> options): DbContext(dbContextOptions), ICrdtDbContext
14+
public class LcmCrdtDbContext(DbContextOptions<LcmCrdtDbContext> dbContextOptions, IOptions<CrdtConfig> options, SetupCollationInterceptor setupCollationInterceptor)
15+
: DbContext(dbContextOptions), ICrdtDbContext
1116
{
1217
public DbSet<ProjectData> ProjectData => Set<ProjectData>();
18+
public IQueryable<WritingSystem> WritingSystems => Set<WritingSystem>().AsNoTracking();
19+
protected override void OnConfiguring(DbContextOptionsBuilder optionsBuilder)
20+
{
21+
optionsBuilder.AddInterceptors(setupCollationInterceptor);
22+
}
1323

1424
protected override void OnModelCreating(ModelBuilder modelBuilder)
1525
{

backend/FwLite/LcmCrdt/LcmCrdtKernel.cs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
using SIL.Harmony.Changes;
66
using LcmCrdt.Changes;
77
using LcmCrdt.Changes.Entries;
8+
using LcmCrdt.Data;
89
using LcmCrdt.Objects;
910
using LcmCrdt.RemoteSync;
1011
using LinqToDB;
@@ -28,6 +29,7 @@ public static IServiceCollection AddLcmCrdtClient(this IServiceCollection servic
2829
{
2930
LinqToDBForEFTools.Initialize();
3031
services.AddMemoryCache();
32+
services.AddSingleton<SetupCollationInterceptor>();
3133
services.AddDbContext<LcmCrdtDbContext>(ConfigureDbOptions);
3234
services.AddOptions<LcmCrdtConfig>().BindConfiguration("LcmCrdt");
3335

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
namespace MiniLcm.Tests;
2+
3+
/// <summary>
/// Shared sorting tests: entries are created in random order and are expected to be
/// returned by <c>Api.GetEntries()</c> in the order listed by each test case.
/// </summary>
public abstract class SortingTestsBase : MiniLcmTestBase
{
    /// <summary>
    /// Runs the base initialization, then creates an analysis writing system ("en")
    /// followed by a vernacular writing system ("en-US").
    /// </summary>
    public override async Task InitializeAsync()
    {
        await base.InitializeAsync();
        await Api.CreateWritingSystem(WritingSystemType.Analysis,
            EnglishWritingSystem(WritingSystemType.Analysis, "en"));
        await Api.CreateWritingSystem(WritingSystemType.Vernacular,
            EnglishWritingSystem(WritingSystemType.Vernacular, "en-US"));
    }

    /// <summary>Builds an "English" writing system of the given type with a fresh id.</summary>
    private static WritingSystem EnglishWritingSystem(WritingSystemType type, string wsId) => new()
    {
        Id = Guid.NewGuid(),
        Type = type,
        WsId = wsId,
        Name = "English",
        Abbreviation = "En",
        Font = "Arial",
        Exemplars = []
    };

    /// <summary>Creates an entry whose "en" lexeme form is the given headword.</summary>
    private Task CreateEntry(string headword) =>
        Api.CreateEntry(new() { LexemeForm = { { "en", headword } }, });


    // ReSharper disable InconsistentNaming
    const string Ru_A= "\u0410";
    const string Ru_a = "\u0430";
    const string Ru_Б= "\u0411";
    const string Ru_б = "\u0431";
    const string Ru_В= "\u0412";
    const string Ru_в = "\u0432";
    // ReSharper restore InconsistentNaming

    /// <summary>
    /// Creates one entry per headword in shuffled order and asserts that the headwords
    /// of the returned entries match the order given in <paramref name="headwords"/>.
    /// </summary>
    /// <param name="headwords">Comma separated headwords in their expected sort order.</param>
    [Theory]
    [InlineData("aa,ab,ac")]
    [InlineData("aa,Ab,ac")]
    [InlineData($"{Ru_a}{Ru_a},{Ru_a}{Ru_б},{Ru_a}{Ru_в}")]
    [InlineData($"{Ru_a}{Ru_a},{Ru_A}{Ru_б},{Ru_a}{Ru_в}")]
    public async Task EntriesAreSorted(string headwords)
    {
        string[] expectedOrder = headwords.Split(',');

        // insert in random order so the result can not depend on creation order
        var shuffled = expectedOrder.OrderBy(_ => Random.Shared.Next());
        foreach (var word in shuffled)
        {
            await CreateEntry(word);
        }

        var actualOrder = await Api.GetEntries().Select(entry => entry.Headword()).ToArrayAsync();
        actualOrder.Should().Equal(expectedOrder);
    }
}

backend/LfClassicData/LfClassicMiniLcmApi.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -206,7 +206,7 @@ private async IAsyncEnumerable<Entry> Query(QueryOptions? options = null, string
206206
new BsonDocument("$ne", new BsonArray { new BsonDocument("$trim", new BsonDocument("input", $"$citationForm.{sortWs}.value")), "" }),
207207
})
208208
},
209-
{ "then", $"$citationForm.{sortWs}.value" },
209+
{ "then", new BsonDocument("$toLower", $"$citationForm.{sortWs}.value") },
210210
{ "else", new BsonDocument("$cond", new BsonDocument
211211
{
212212
{ "if", new BsonDocument("$and", new BsonArray
@@ -216,7 +216,7 @@ private async IAsyncEnumerable<Entry> Query(QueryOptions? options = null, string
216216
new BsonDocument("$ne", new BsonArray { new BsonDocument("$trim", new BsonDocument("input", $"$lexeme.{sortWs}.value")), "" }),
217217
})
218218
},
219-
{ "then", $"$lexeme.{sortWs}.value" },
219+
{ "then", new BsonDocument("$toLower", $"$lexeme.{sortWs}.value") },
220220
{ "else", "" }
221221
})
222222
}

0 commit comments

Comments
 (0)