From 0864fc60aa070fb91f6736647a330b979da7abc0 Mon Sep 17 00:00:00 2001 From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com> Date: Mon, 9 Dec 2024 15:54:47 +0100 Subject: [PATCH] :sparkles: add support for remote resource fetching (#281) --- docs/code_samples/carte_vitale_v1.txt | 23 ----- docs/getting_started.md | 16 ++++ src/Mindee/Input/UrlInputSource.cs | 48 ++++++++++- src/Mindee/Pdf/PdfCompressor.cs | 1 + .../Product/Fr/CarteVitale/CarteVitaleV1.cs | 20 ----- .../Fr/CarteVitale/CarteVitaleV1Document.cs | 55 ------------ .../Input/UrlInputSourceTest.cs | 21 +++++ .../Input/UrlInputFileTest.cs | 86 +++++++++++++++++++ .../Fr/CarteVitale/CarteVitaleV1Test.cs | 36 -------- 9 files changed, 171 insertions(+), 135 deletions(-) delete mode 100644 docs/code_samples/carte_vitale_v1.txt delete mode 100644 src/Mindee/Product/Fr/CarteVitale/CarteVitaleV1.cs delete mode 100644 src/Mindee/Product/Fr/CarteVitale/CarteVitaleV1Document.cs create mode 100644 tests/Mindee.IntegrationTests/Input/UrlInputSourceTest.cs delete mode 100644 tests/Mindee.UnitTests/Product/Fr/CarteVitale/CarteVitaleV1Test.cs diff --git a/docs/code_samples/carte_vitale_v1.txt b/docs/code_samples/carte_vitale_v1.txt deleted file mode 100644 index 5959cca0..00000000 --- a/docs/code_samples/carte_vitale_v1.txt +++ /dev/null @@ -1,23 +0,0 @@ -using Mindee; -using Mindee.Input; -using Mindee.Product.Fr.CarteVitale; - -string apiKey = "my-api-key"; -string filePath = "/path/to/the/file.ext"; - -// Construct a new client -MindeeClient mindeeClient = new MindeeClient(apiKey); - -// Load an input source as a path string -// Other input types can be used, as mentioned in the docs -var inputSource = new LocalInputSource(filePath); - -// Call the API and parse the input -var response = await mindeeClient - .ParseAsync(inputSource); - -// Print a summary of all the predictions -System.Console.WriteLine(response.Document.ToString()); - -// Print only the document-level predictions -// 
System.Console.WriteLine(response.Document.Inference.Prediction.ToString()); diff --git a/docs/getting_started.md b/docs/getting_started.md index cab5781a..3c1c6da9 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -116,6 +116,7 @@ There are a few different ways of loading a document file, depending on your use * [Path](#path) * [File Object](#stream-object) * [Bytes](#bytes) +* [Remote File](#remote-file) ### Path Load from a file directly from disk. Requires an absolute path, as a string. @@ -155,6 +156,21 @@ string fileName = "myfile.pdf"; var inputSource = new LocalInputSource(myFileInBytes, fileName); ``` +### Remote File + +You can pass a URL to the server through the UrlInputSource class: + +```csharp +var remoteInput = new UrlInputSource("https://www.example.com/some/file.ext"); +``` + +If your file is hidden behind a redirection, you can load your file locally instead: + +```csharp +var remoteInput = new UrlInputSource("https://www.example.com/some/file.ext"); +var localInput = await remoteInput.AsLocalInputSource(); +``` + ## Parsing a file To send a file to the API, we need to specify how to process the document. This will determine which API endpoint is used and how the API return will be handled internally by the library. 
diff --git a/src/Mindee/Input/UrlInputSource.cs b/src/Mindee/Input/UrlInputSource.cs index 59bbe8f5..e930dc6e 100644 --- a/src/Mindee/Input/UrlInputSource.cs +++ b/src/Mindee/Input/UrlInputSource.cs @@ -1,7 +1,9 @@ using System; using System.IO; -using System.Linq; +using System.Threading.Tasks; using Mindee.Exceptions; +using RestSharp; +using RestSharp.Authenticators; namespace Mindee.Input { @@ -44,14 +46,58 @@ private void IsUriValid() { throw new MindeeInputException("Local files are not supported, use `LocalInputSource` instead."); } + if (!FileUrl.IsAbsoluteUri) { throw new MindeeInputException("The URI must be absolute."); } + if (FileUrl.Scheme != "https") { throw new MindeeInputException("Only the HTTPS scheme is supported."); } } + + /// + /// Downloads the file from the url, and returns a LocalInputSource wrapper object for it. + /// + /// A LocalInputSource. + /// Throws if the file can't be accessed or downloaded. + public async Task AsLocalInputSource( + string filename = null, + string username = null, + string password = null, + string token = null, + int maxRedirects = 3, + IRestClient restClient = null) + { + filename ??= Path.GetFileName(FileUrl.LocalPath); + if (filename == "" || !Path.HasExtension(filename)) + { + throw new MindeeInputException("Filename must end with an extension."); + } + + var options = new RestClientOptions(FileUrl) { FollowRedirects = true, MaxRedirects = maxRedirects }; + + if (!string.IsNullOrEmpty(token)) + { + options.Authenticator = new JwtAuthenticator(token); + } + else if (!string.IsNullOrEmpty(username) && !string.IsNullOrEmpty(password)) + { + options.Authenticator = new HttpBasicAuthenticator(username, password); + } + + restClient ??= new RestClient(options); + var request = new RestRequest(FileUrl); + var response = await restClient.ExecuteAsync(request); + + // Note: response.IsSuccessful can't be mocked as easily, so this is a better solution at the moment. 
+ if (response.IsSuccessStatusCode) + { + return new LocalInputSource(fileBytes: response.RawBytes, filename: filename); + } + throw new MindeeInputException($"Failed to download file: {response.ErrorMessage}"); + } } } diff --git a/src/Mindee/Pdf/PdfCompressor.cs b/src/Mindee/Pdf/PdfCompressor.cs index bddcc27b..1c0a3aaa 100644 --- a/src/Mindee/Pdf/PdfCompressor.cs +++ b/src/Mindee/Pdf/PdfCompressor.cs @@ -30,6 +30,7 @@ public static byte[] CompressPdf(byte[] pdfData, int imageQuality = 85, bool for Console.WriteLine( "MINDEE WARNING: Found text inside of the provided PDF file. Compression operation aborted since disableSourceText is set to 'true'."); Console.ResetColor(); + Console.Write(""); // Flush buffer color from stdout. Come on .NET... I shouldn't have to do this. return pdfData; } diff --git a/src/Mindee/Product/Fr/CarteVitale/CarteVitaleV1.cs b/src/Mindee/Product/Fr/CarteVitale/CarteVitaleV1.cs deleted file mode 100644 index 7e6c70ef..00000000 --- a/src/Mindee/Product/Fr/CarteVitale/CarteVitaleV1.cs +++ /dev/null @@ -1,20 +0,0 @@ -using System.Text.Json.Serialization; -using Mindee.Http; -using Mindee.Parsing.Common; - -namespace Mindee.Product.Fr.CarteVitale -{ - /// - /// Carte Vitale API version 1 inference prediction. - /// - [Endpoint("carte_vitale", "1")] - public sealed class CarteVitaleV1 : Inference - { - /// - /// The pages and the associated values which were detected on the document. 
- /// - [JsonPropertyName("pages")] - [JsonConverter(typeof(PagesJsonConverter))] - public override Pages Pages { get; set; } - } -} diff --git a/src/Mindee/Product/Fr/CarteVitale/CarteVitaleV1Document.cs b/src/Mindee/Product/Fr/CarteVitale/CarteVitaleV1Document.cs deleted file mode 100644 index cc680bd5..00000000 --- a/src/Mindee/Product/Fr/CarteVitale/CarteVitaleV1Document.cs +++ /dev/null @@ -1,55 +0,0 @@ -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Text.Json.Serialization; -using Mindee.Parsing; -using Mindee.Parsing.Standard; - -namespace Mindee.Product.Fr.CarteVitale -{ - /// - /// Carte Vitale API version 1.1 document data. - /// - public class CarteVitaleV1Document : IPrediction - { - /// - /// The given name(s) of the card holder. - /// - [JsonPropertyName("given_names")] - public IList GivenNames { get; set; } = new List(); - - /// - /// The date the card was issued. - /// - [JsonPropertyName("issuance_date")] - public DateField IssuanceDate { get; set; } - - /// - /// The Social Security Number (Numéro de Sécurité Sociale) of the card holder - /// - [JsonPropertyName("social_security")] - public StringField SocialSecurity { get; set; } - - /// - /// The surname of the card holder. - /// - [JsonPropertyName("surname")] - public StringField Surname { get; set; } - - /// - /// A prettier representation of the current model values. 
- /// - public override string ToString() - { - string givenNames = string.Join( - "\n " + string.Concat(Enumerable.Repeat(" ", 15)), - GivenNames.Select(item => item)); - StringBuilder result = new StringBuilder(); - result.Append($":Given Name(s): {givenNames}\n"); - result.Append($":Surname: {Surname}\n"); - result.Append($":Social Security Number: {SocialSecurity}\n"); - result.Append($":Issuance Date: {IssuanceDate}\n"); - return SummaryHelper.Clean(result.ToString()); - } - } -} diff --git a/tests/Mindee.IntegrationTests/Input/UrlInputSourceTest.cs b/tests/Mindee.IntegrationTests/Input/UrlInputSourceTest.cs new file mode 100644 index 00000000..1e2ea051 --- /dev/null +++ b/tests/Mindee.IntegrationTests/Input/UrlInputSourceTest.cs @@ -0,0 +1,21 @@ +using Mindee.Input; +using Mindee.Product.Invoice; + +namespace Mindee.IntegrationTests.Input +{ + [Trait("Category", "URL loading")] + public class UrlInputSourceTest + { + [Fact] + public async Task GivenARemoteFile_MustRetrieveResponse() + { + var apiKey = Environment.GetEnvironmentVariable("Mindee__ApiKey"); + var client = TestingUtilities.GetOrGenerateMindeeClient(apiKey); + var remoteInput = new UrlInputSource("https://github.com/mindee/client-lib-test-data/blob/main/products/invoice_splitter/invoice_5p.pdf?raw=true"); + var localInput = await remoteInput.AsLocalInputSource(); + Assert.Equal("invoice_5p.pdf", localInput.Filename); + var result = await client.ParseAsync(localInput); + Assert.Equal(5, result.Document.NPages); + } + } +} diff --git a/tests/Mindee.UnitTests/Input/UrlInputFileTest.cs b/tests/Mindee.UnitTests/Input/UrlInputFileTest.cs index f91b4d84..5a39f0ee 100644 --- a/tests/Mindee.UnitTests/Input/UrlInputFileTest.cs +++ b/tests/Mindee.UnitTests/Input/UrlInputFileTest.cs @@ -1,11 +1,16 @@ +using System.Net; using Mindee.Exceptions; using Mindee.Input; +using Moq; +using RestSharp; namespace Mindee.UnitTests.Input { [Trait("Category", "URL loading")] public class UrlInputSourceTest { + private 
readonly Mock _mockRestClient = new(); + [Fact] public void Can_Load_Type_String() { @@ -34,5 +39,86 @@ public void DoesNot_Load_InvalidUrl() Assert.Throws( () => new UrlInputSource("invalid-url")); } + [Fact] + public async Task AsLocalInputSource_SuccessfulDownload() + { + _mockRestClient + .Setup(x => x.ExecuteAsync(It.IsAny(), It.IsAny())) + .ReturnsAsync(new RestResponse + { + StatusCode = HttpStatusCode.OK, + RawBytes = [1, 2, 3, 4, 5], + IsSuccessStatusCode = true + }); + + var urlInputSource = new UrlInputSource("https://example.com/file.pdf"); + var result = await urlInputSource.AsLocalInputSource(restClient: _mockRestClient.Object); + + Assert.IsType(result); + Assert.Equal("file.pdf", result.Filename); + Assert.Equal(5, result.FileBytes.Length); + } + + [Fact] + public async Task AsLocalInputSource_FailedDownload() + { + _mockRestClient + .Setup(x => x.ExecuteAsync(It.IsAny(), It.IsAny())) + .ReturnsAsync(new RestResponse + { + StatusCode = HttpStatusCode.NotFound, + ErrorMessage = "File not found", + IsSuccessStatusCode = false + }); + + var urlInputSource = new UrlInputSource("https://example.com/nonexistent.pdf"); + await Assert.ThrowsAsync( + () => urlInputSource.AsLocalInputSource(restClient: _mockRestClient.Object)); + } + + [Fact] + public async Task AsLocalInputSource_WithCustomFilename() + { + _mockRestClient + .Setup(x => x.ExecuteAsync(It.IsAny(), It.IsAny())) + .ReturnsAsync(new RestResponse + { + StatusCode = HttpStatusCode.OK, + RawBytes = [1, 2, 3, 4, 5], + IsSuccessStatusCode = true + }); + + var urlInputSource = new UrlInputSource("https://example.com/file.pdf"); + var result = await urlInputSource.AsLocalInputSource("custom.pdf", restClient: _mockRestClient.Object); + + Assert.IsType(result); + Assert.Equal("custom.pdf", result.Filename); + } + + [Fact] + public async Task AsLocalInputSource_WithAuthentication() + { + _mockRestClient + .Setup(x => x.ExecuteAsync(It.IsAny(), It.IsAny())) + .ReturnsAsync(new RestResponse + { + StatusCode 
= HttpStatusCode.OK, + RawBytes = [1, 2, 3, 4, 5], + IsSuccessStatusCode = true + }); + + var urlInputSource = new UrlInputSource("https://example.com/file.pdf"); + var result = await urlInputSource.AsLocalInputSource(username: "user", password: "pass", restClient: _mockRestClient.Object); + + Assert.IsType(result); + Assert.Equal("file.pdf", result.Filename); + } + + [Fact] + public async Task AsLocalInputSource_InvalidFilename() + { + var urlInputSource = new UrlInputSource("https://example.com/file.pdf"); + await Assert.ThrowsAsync(() => urlInputSource.AsLocalInputSource("invalid")); + } } } diff --git a/tests/Mindee.UnitTests/Product/Fr/CarteVitale/CarteVitaleV1Test.cs b/tests/Mindee.UnitTests/Product/Fr/CarteVitale/CarteVitaleV1Test.cs deleted file mode 100644 index 960fb4d7..00000000 --- a/tests/Mindee.UnitTests/Product/Fr/CarteVitale/CarteVitaleV1Test.cs +++ /dev/null @@ -1,36 +0,0 @@ -using Mindee.Parsing.Common; -using Mindee.Product.Fr.CarteVitale; - -namespace Mindee.UnitTests.Product.Fr.CarteVitale -{ - [Trait("Category", "CarteVitaleV1")] - public class CarteVitaleV1Test - { - [Fact] - public async Task Predict_CheckEmpty() - { - var response = await GetPrediction("empty"); - var docPrediction = response.Document.Inference.Prediction; - Assert.Empty(docPrediction.GivenNames); - Assert.Null(docPrediction.Surname.Value); - Assert.Null(docPrediction.SocialSecurity.Value); - Assert.Null(docPrediction.IssuanceDate.Value); - } - - [Fact] - public async Task Predict_CheckSummary() - { - var response = await GetPrediction("complete"); - var expected = File.ReadAllText("Resources/products/carte_vitale/response_v1/summary_full.rst"); - Assert.Equal(expected, response.Document.ToString()); - } - - private static async Task> GetPrediction(string name) - { - string fileName = $"Resources/products/carte_vitale/response_v1/{name}.json"; - var mindeeAPi = UnitTestBase.GetMindeeApi(fileName); - return await mindeeAPi.PredictPostAsync( - 
UnitTestBase.GetFakePredictParameter()); - } - } -}