diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 4028725..848abea 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -22,6 +22,12 @@ jobs: uses: actions/checkout@v3 with: lfs: true + - name: Set up dotnet + uses: actions/setup-dotnet@v3 + with: + dotnet-version: "8.x" + - name: Unit tests + run: dotnet test tests/Unit.Tests/Unit.Tests.csproj - name: Set up QEMU uses: docker/setup-qemu-action@v2 - name: Set up Docker Buildx @@ -32,14 +38,10 @@ jobs: context: ./src/Pdf2Html load: true tags: ${{ env.TEST_TAG }} - - name: Set up dotnet - uses: actions/setup-dotnet@v3 - with: - dotnet-version: "8.x" - name: E2E tests run: | docker run --rm --detach -p 8080:8080 --name pdf2html ${{ env.TEST_TAG }} - dotnet test --filter "FullyQualifiedName=E2E.Tests" + dotnet test tests/E2E.Tests/E2E.Tests.csproj docker stop pdf2html - if: github.ref_name == 'main' || github.ref_type == 'tag' name: Login to Docker Hub diff --git a/CHANGELOG.md b/CHANGELOG.md index 6e7236d..35314d0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,10 @@ # Changelog -## develop +## 0.2.0 * Update to .net 8. * Switch base images to Ubuntu Noble (24.04 LTS). +* Add optional overrides for command-line arguments passed to `pdf2htmlEX`. * Patch and build `pdf2htmlEX` as part of this build process to use `libopenjp` instead of `libjpeg` for JPEG-2000 support. * All patches are in this source tree, and are applied to directly to the source of the upstream tag during build. * Patch issue with non-breaking spaces in `pdf2HTMLEX`. diff --git a/README.md b/README.md index ce73989..8206a7b 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,28 @@ This project is a lightweight HTTP(S) interface to the [pdf2htmlex library](https://pdf2htmlex.github.io/pdf2htmlEX/). 
+## Running via Docker + +```bash +docker run -p 8080 corefiling/pdf2html:$version +``` + +### Overriding `pdf2htmlEX` options + +The command line arguments passed into `pdf2htmlEX` can be overridden by passing in environment variables prefixed by `ConversionOptions__`, e.g.: + +```bash +docker run -p 8080 -e ConversionOptions__BgFormat=png -e ConversionOptions__OptimizeText=true corefiling/pdf2html:$version +``` + +The names of these setting keys are converted to lower-kebab-case arguments, and the values are converted to strings as needed - in the above example, the arguments are converted to `--bg-format=png --optimize-text=1`. + +The full list of arguments can be found by running `pdf2htmlEX`: + +```bash +docker run corefiling/pdf2html:$version pdf2htmlEX --help +``` + ## Licensing Since pdf2htmlex is licensed under the GPL, this project is too (see the LICENSE.TXT file). diff --git a/pdf2html.sln b/pdf2html.sln index b32ebec..e8b5c91 100644 --- a/pdf2html.sln +++ b/pdf2html.sln @@ -11,6 +11,8 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "tests", "tests", "{C361585C EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "E2E.Tests", "tests\E2E.Tests\E2E.Tests.csproj", "{9CAB9112-6B91-4615-A34A-B3C66FD3FAE1}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Unit.Tests", "tests\Unit.Tests\Unit.Tests.csproj", "{1174DAD4-FE63-4CF6-9F23-3B9FB6BA409A}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -28,9 +30,14 @@ Global {9CAB9112-6B91-4615-A34A-B3C66FD3FAE1}.Debug|Any CPU.Build.0 = Debug|Any CPU {9CAB9112-6B91-4615-A34A-B3C66FD3FAE1}.Release|Any CPU.ActiveCfg = Release|Any CPU {9CAB9112-6B91-4615-A34A-B3C66FD3FAE1}.Release|Any CPU.Build.0 = Release|Any CPU + {1174DAD4-FE63-4CF6-9F23-3B9FB6BA409A}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {1174DAD4-FE63-4CF6-9F23-3B9FB6BA409A}.Debug|Any CPU.Build.0 = Debug|Any CPU + {1174DAD4-FE63-4CF6-9F23-3B9FB6BA409A}.Release|Any 
CPU.ActiveCfg = Release|Any CPU + {1174DAD4-FE63-4CF6-9F23-3B9FB6BA409A}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(NestedProjects) = preSolution {D3B9B4F8-F097-4F12-AB86-72CAE0B4577C} = {ABE1E425-AA84-46A5-98EA-9B6D622EF8A5} {9CAB9112-6B91-4615-A34A-B3C66FD3FAE1} = {C361585C-8D3B-4CA0-A0BF-DB74DDB00EBE} + {1174DAD4-FE63-4CF6-9F23-3B9FB6BA409A} = {C361585C-8D3B-4CA0-A0BF-DB74DDB00EBE} EndGlobalSection EndGlobal diff --git a/pdf2html.sln.DotSettings.user b/pdf2html.sln.DotSettings.user deleted file mode 100644 index 1aa543e..0000000 --- a/pdf2html.sln.DotSettings.user +++ /dev/null @@ -1,6 +0,0 @@ - - <SessionState ContinuousTestingMode="0" IsActive="True" Name="Test1" xmlns="urn:schemas-jetbrains-com:jetbrains-ut-session"> - <TestAncestor> - <TestId>NUnit3x::9CAB9112-6B91-4615-A34A-B3C66FD3FAE1::net7.0::E2E.Tests.ConvertPdfTest</TestId> - </TestAncestor> -</SessionState> \ No newline at end of file diff --git a/pdf2html.slnx b/pdf2html.slnx new file mode 100644 index 0000000..aef1df5 --- /dev/null +++ b/pdf2html.slnx @@ -0,0 +1,5 @@ + + + + + \ No newline at end of file diff --git a/src/Pdf2Html/AssemblyInfo.cs b/src/Pdf2Html/AssemblyInfo.cs new file mode 100644 index 0000000..762e061 --- /dev/null +++ b/src/Pdf2Html/AssemblyInfo.cs @@ -0,0 +1,3 @@ +using System.Runtime.CompilerServices; + +[assembly: InternalsVisibleTo("Unit.Tests")] diff --git a/src/Pdf2Html/Controllers/RootController.cs b/src/Pdf2Html/Controllers/RootController.cs index 9c0ec6a..e8bf414 100644 --- a/src/Pdf2Html/Controllers/RootController.cs +++ b/src/Pdf2Html/Controllers/RootController.cs @@ -1,21 +1,17 @@ using System.Diagnostics; using System.Net.Mime; using System.Reflection; + using Microsoft.AspNetCore.Mvc; +using Pdf2Html.Settings; + namespace Pdf2Html.Controllers; [ApiController] [Route("/")] -public class RootController : ControllerBase +public class RootController(ILogger logger, ConversionOptions conversionOptions) : ControllerBase { - private readonly 
ILogger _logger; - - public RootController(ILogger logger) - { - _logger = logger; - } - [HttpGet] public ActionResult Get() { @@ -38,19 +34,19 @@ public async Task Post() await using (var tempFileStream = System.IO.File.Open(inputFile, FileMode.Truncate)) { await Request.Body.CopyToAsync(tempFileStream); - _logger.LogInformation($"Copied {FormatToMb(new FileInfo(inputFile).Length)} to {inputFile}"); + logger.LogInformation($"Copied {FormatToMb(new FileInfo(inputFile).Length)} to {inputFile}"); } - _logger.LogInformation("Starting conversion..."); + logger.LogInformation("Starting conversion..."); var (success, logs) = await ConvertAsync(inputFile, outputFile); if (!success) { - _logger.LogError("Conversion failed"); + logger.LogError("Conversion failed"); return StatusCode(StatusCodes.Status500InternalServerError, new { pdf2htmlEX = new { logs } }); } - _logger.LogInformation($"Conversion completed ({FormatToMb(new FileInfo(outputFile).Length)})"); + logger.LogInformation($"Conversion completed ({FormatToMb(new FileInfo(outputFile).Length)})"); return File(await System.IO.File.ReadAllBytesAsync(outputFile), MediaTypeNames.Text.Html); } finally @@ -63,11 +59,10 @@ public async Task Post() private async Task<(bool Success, ICollection logs)> ConvertAsync(string inputFile, string outputFile) { using var p = new Process(); - const string conversionOptions = "--embed-javascript=0 --process-outline=0 --printing=0 --bg-format=svg --svg-node-count-limit=100 --decompose-ligature 1 --tounicode 1"; p.StartInfo = new ProcessStartInfo { FileName = "pdf2htmlEX", - Arguments = $"{conversionOptions} --dest-dir={Path.GetDirectoryName(outputFile)} {inputFile} {Path.GetFileName(outputFile)}", + Arguments = $"{conversionOptions.CommandLineArguments} --dest-dir={Path.GetDirectoryName(outputFile)} {inputFile} {Path.GetFileName(outputFile)}", CreateNoWindow = true, RedirectStandardOutput = true, RedirectStandardError = true @@ -83,7 +78,7 @@ void AddLog(string? 
log) } logs.Add(log); - _logger.LogInformation(log); + logger.LogInformation(log); } p.OutputDataReceived += (_, e) => AddLog(e.Data); @@ -97,8 +92,5 @@ void AddLog(string? log) return (p.ExitCode == 0, logs); } - private static string FormatToMb(long bytesLength) - { - return (bytesLength / 1024.0 / 1024.0).ToString("0.00 MB"); - } + private static string FormatToMb(long bytesLength) => (bytesLength / 1024.0 / 1024.0).ToString("0.00 MB"); } diff --git a/src/Pdf2Html/Program.cs b/src/Pdf2Html/Program.cs index 389ac5f..7cec636 100644 --- a/src/Pdf2Html/Program.cs +++ b/src/Pdf2Html/Program.cs @@ -1,10 +1,20 @@ +using Pdf2Html.Settings; + +using System.Diagnostics; +using System.Reflection; + var builder = WebApplication.CreateBuilder(args); builder.Logging.ClearProviders(); builder.Logging.AddConsole(); // Add services to the container. builder.Services.AddControllers(); +builder.Services.AddSingleton(); var app = builder.Build(); +var versionInfo = FileVersionInfo.GetVersionInfo(Assembly.GetExecutingAssembly().Location); +app.Logger.LogInformation($"Starting {versionInfo.ProductName} {versionInfo.ProductVersion}"); +app.Logger.LogInformation($"Using pdf2htmlEX command line arguments: {app.Services.GetService()!.CommandLineArguments}"); + app.MapControllers(); app.Run(); diff --git a/src/Pdf2Html/Settings/ConversionOptions.cs b/src/Pdf2Html/Settings/ConversionOptions.cs new file mode 100644 index 0000000..9fc1005 --- /dev/null +++ b/src/Pdf2Html/Settings/ConversionOptions.cs @@ -0,0 +1,16 @@ +using System.Text.RegularExpressions; + +namespace Pdf2Html.Settings; + +public class ConversionOptions(IConfiguration configuration) +{ + public string CommandLineArguments { get; } = ToCommandLineArguments(configuration.GetSection("ConversionOptions").AsEnumerable()); + + internal static string ToCommandLineArguments(IEnumerable> options) => + string.Join(' ', options.Where(kvp => kvp.Value != null).Select(kvp => $"--{ToKebabCase(kvp.Key.Replace("ConversionOptions:", 
""))}={ValueToString(kvp.Value!)}")); + + private static string ValueToString(string value) => bool.TryParse(value, out var boolValue) ? (boolValue ? "1" : "0") : value; + + private static string ToKebabCase(string value) => + Regex.Replace(value, "(? + { + { "ConversionOptions:FooBar", "true" }, + { "ConversionOptions:BazBlort", "FALSE" }, + { "ConversionOptions:Hello", "World!" }, + { "ConversionOptions:FizzBuzz", "5" }, + }; + var result = ConversionOptions.ToCommandLineArguments(input); + Assert.That(result, Is.EqualTo("--foo-bar=1 --baz-blort=0 --hello=World! --fizz-buzz=5")); + } +} diff --git a/tests/Unit.Tests/Unit.Tests.csproj b/tests/Unit.Tests/Unit.Tests.csproj new file mode 100644 index 0000000..9b33bde --- /dev/null +++ b/tests/Unit.Tests/Unit.Tests.csproj @@ -0,0 +1,20 @@ + + + + net8.0 + enable + enable + + false + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/Unit.Tests/Usings.cs b/tests/Unit.Tests/Usings.cs new file mode 100644 index 0000000..cefced4 --- /dev/null +++ b/tests/Unit.Tests/Usings.cs @@ -0,0 +1 @@ +global using NUnit.Framework; \ No newline at end of file