diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index 4028725..848abea 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -22,6 +22,12 @@ jobs:
uses: actions/checkout@v3
with:
lfs: true
+ - name: Set up dotnet
+ uses: actions/setup-dotnet@v3
+ with:
+ dotnet-version: "8.x"
+ - name: Unit tests
+ run: dotnet test tests/Unit.Tests/Unit.Tests.csproj
- name: Set up QEMU
uses: docker/setup-qemu-action@v2
- name: Set up Docker Buildx
@@ -32,14 +38,10 @@ jobs:
context: ./src/Pdf2Html
load: true
tags: ${{ env.TEST_TAG }}
- - name: Set up dotnet
- uses: actions/setup-dotnet@v3
- with:
- dotnet-version: "8.x"
- name: E2E tests
run: |
docker run --rm --detach -p 8080:8080 --name pdf2html ${{ env.TEST_TAG }}
- dotnet test --filter "FullyQualifiedName=E2E.Tests"
+ dotnet test tests/E2E.Tests/E2E.Tests.csproj
docker stop pdf2html
- if: github.ref_name == 'main' || github.ref_type == 'tag'
name: Login to Docker Hub
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6e7236d..35314d0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,9 +1,10 @@
# Changelog
-## develop
+## 0.2.0
* Update to .net 8.
* Switch base images to Ubuntu Noble (24.04 LTS).
+* Add optional overrides for command-line arguments passed to `pdf2htmlEX`.
* Patch and build `pdf2htmlEX` as part of this build process to use `libopenjp` instead of `libjpeg` for JPEG-2000 support.
* All patches are in this source tree, and are applied to directly to the source of the upstream tag during build.
* Patch issue with non-breaking spaces in `pdf2HTMLEX`.
diff --git a/README.md b/README.md
index ce73989..8206a7b 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,28 @@
This project is a lightweight HTTP(S) interface to the [pdf2htmlex library](https://pdf2htmlex.github.io/pdf2htmlEX/).
+## Running via Docker
+
+```bash
+docker run -p 8080 corefiling/pdf2html:$version
+```
+
+### Overriding `pdf2htmlEX` options
+
+The command-line arguments passed to `pdf2htmlEX` can be overridden by setting environment variables prefixed with `ConversionOptions__`, e.g.:
+
+```bash
+docker run -p 8080 -e ConversionOptions__BgFormat=png -e ConversionOptions__OptimizeText=true corefiling/pdf2html:$version
+```
+
+The names of these setting keys are converted to lower-kebab-case arguments, and the values are converted to strings as needed (boolean values become `1` or `0`) - in the above example, the arguments are converted to `--bg-format=png --optimize-text=1`.
+
+The full list of arguments can be found by running `pdf2htmlEX`:
+
+```bash
+docker run corefiling/pdf2html:$version pdf2htmlEX --help
+```
+
## Licensing
Since pdf2htmlex is licensed under the GPL, this project is too (see the LICENSE.TXT file).
diff --git a/pdf2html.sln b/pdf2html.sln
index b32ebec..e8b5c91 100644
--- a/pdf2html.sln
+++ b/pdf2html.sln
@@ -11,6 +11,8 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "tests", "tests", "{C361585C
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "E2E.Tests", "tests\E2E.Tests\E2E.Tests.csproj", "{9CAB9112-6B91-4615-A34A-B3C66FD3FAE1}"
EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Unit.Tests", "tests\Unit.Tests\Unit.Tests.csproj", "{1174DAD4-FE63-4CF6-9F23-3B9FB6BA409A}"
+EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
@@ -28,9 +30,14 @@ Global
{9CAB9112-6B91-4615-A34A-B3C66FD3FAE1}.Debug|Any CPU.Build.0 = Debug|Any CPU
{9CAB9112-6B91-4615-A34A-B3C66FD3FAE1}.Release|Any CPU.ActiveCfg = Release|Any CPU
{9CAB9112-6B91-4615-A34A-B3C66FD3FAE1}.Release|Any CPU.Build.0 = Release|Any CPU
+ {1174DAD4-FE63-4CF6-9F23-3B9FB6BA409A}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {1174DAD4-FE63-4CF6-9F23-3B9FB6BA409A}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {1174DAD4-FE63-4CF6-9F23-3B9FB6BA409A}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {1174DAD4-FE63-4CF6-9F23-3B9FB6BA409A}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(NestedProjects) = preSolution
{D3B9B4F8-F097-4F12-AB86-72CAE0B4577C} = {ABE1E425-AA84-46A5-98EA-9B6D622EF8A5}
{9CAB9112-6B91-4615-A34A-B3C66FD3FAE1} = {C361585C-8D3B-4CA0-A0BF-DB74DDB00EBE}
+ {1174DAD4-FE63-4CF6-9F23-3B9FB6BA409A} = {C361585C-8D3B-4CA0-A0BF-DB74DDB00EBE}
EndGlobalSection
EndGlobal
diff --git a/pdf2html.sln.DotSettings.user b/pdf2html.sln.DotSettings.user
deleted file mode 100644
index 1aa543e..0000000
--- a/pdf2html.sln.DotSettings.user
+++ /dev/null
@@ -1,6 +0,0 @@
-
- <SessionState ContinuousTestingMode="0" IsActive="True" Name="Test1" xmlns="urn:schemas-jetbrains-com:jetbrains-ut-session">
- <TestAncestor>
- <TestId>NUnit3x::9CAB9112-6B91-4615-A34A-B3C66FD3FAE1::net7.0::E2E.Tests.ConvertPdfTest</TestId>
- </TestAncestor>
-</SessionState>
\ No newline at end of file
diff --git a/pdf2html.slnx b/pdf2html.slnx
new file mode 100644
index 0000000..aef1df5
--- /dev/null
+++ b/pdf2html.slnx
@@ -0,0 +1,5 @@
+
+
+
+
+
\ No newline at end of file
diff --git a/src/Pdf2Html/AssemblyInfo.cs b/src/Pdf2Html/AssemblyInfo.cs
new file mode 100644
index 0000000..762e061
--- /dev/null
+++ b/src/Pdf2Html/AssemblyInfo.cs
@@ -0,0 +1,3 @@
+using System.Runtime.CompilerServices;
+
+[assembly: InternalsVisibleTo("Unit.Tests")]
diff --git a/src/Pdf2Html/Controllers/RootController.cs b/src/Pdf2Html/Controllers/RootController.cs
index 9c0ec6a..e8bf414 100644
--- a/src/Pdf2Html/Controllers/RootController.cs
+++ b/src/Pdf2Html/Controllers/RootController.cs
@@ -1,21 +1,17 @@
using System.Diagnostics;
using System.Net.Mime;
using System.Reflection;
+
using Microsoft.AspNetCore.Mvc;
+using Pdf2Html.Settings;
+
namespace Pdf2Html.Controllers;
[ApiController]
[Route("/")]
-public class RootController : ControllerBase
+public class RootController(ILogger logger, ConversionOptions conversionOptions) : ControllerBase
{
- private readonly ILogger _logger;
-
- public RootController(ILogger logger)
- {
- _logger = logger;
- }
-
[HttpGet]
public ActionResult Get()
{
@@ -38,19 +34,19 @@ public async Task Post()
await using (var tempFileStream = System.IO.File.Open(inputFile, FileMode.Truncate))
{
await Request.Body.CopyToAsync(tempFileStream);
- _logger.LogInformation($"Copied {FormatToMb(new FileInfo(inputFile).Length)} to {inputFile}");
+ logger.LogInformation($"Copied {FormatToMb(new FileInfo(inputFile).Length)} to {inputFile}");
}
- _logger.LogInformation("Starting conversion...");
+ logger.LogInformation("Starting conversion...");
var (success, logs) = await ConvertAsync(inputFile, outputFile);
if (!success)
{
- _logger.LogError("Conversion failed");
+ logger.LogError("Conversion failed");
return StatusCode(StatusCodes.Status500InternalServerError, new { pdf2htmlEX = new { logs } });
}
- _logger.LogInformation($"Conversion completed ({FormatToMb(new FileInfo(outputFile).Length)})");
+ logger.LogInformation($"Conversion completed ({FormatToMb(new FileInfo(outputFile).Length)})");
return File(await System.IO.File.ReadAllBytesAsync(outputFile), MediaTypeNames.Text.Html);
}
finally
@@ -63,11 +59,10 @@ public async Task Post()
private async Task<(bool Success, ICollection logs)> ConvertAsync(string inputFile, string outputFile)
{
using var p = new Process();
- const string conversionOptions = "--embed-javascript=0 --process-outline=0 --printing=0 --bg-format=svg --svg-node-count-limit=100 --decompose-ligature 1 --tounicode 1";
p.StartInfo = new ProcessStartInfo
{
FileName = "pdf2htmlEX",
- Arguments = $"{conversionOptions} --dest-dir={Path.GetDirectoryName(outputFile)} {inputFile} {Path.GetFileName(outputFile)}",
+ Arguments = $"{conversionOptions.CommandLineArguments} --dest-dir={Path.GetDirectoryName(outputFile)} {inputFile} {Path.GetFileName(outputFile)}",
CreateNoWindow = true,
RedirectStandardOutput = true,
RedirectStandardError = true
@@ -83,7 +78,7 @@ void AddLog(string? log)
}
logs.Add(log);
- _logger.LogInformation(log);
+ logger.LogInformation(log);
}
p.OutputDataReceived += (_, e) => AddLog(e.Data);
@@ -97,8 +92,5 @@ void AddLog(string? log)
return (p.ExitCode == 0, logs);
}
- private static string FormatToMb(long bytesLength)
- {
- return (bytesLength / 1024.0 / 1024.0).ToString("0.00 MB");
- }
+ private static string FormatToMb(long bytesLength) => (bytesLength / 1024.0 / 1024.0).ToString("0.00 MB");
}
diff --git a/src/Pdf2Html/Program.cs b/src/Pdf2Html/Program.cs
index 389ac5f..7cec636 100644
--- a/src/Pdf2Html/Program.cs
+++ b/src/Pdf2Html/Program.cs
@@ -1,10 +1,20 @@
+using Pdf2Html.Settings;
+
+using System.Diagnostics;
+using System.Reflection;
+
var builder = WebApplication.CreateBuilder(args);
builder.Logging.ClearProviders();
builder.Logging.AddConsole();
// Add services to the container.
builder.Services.AddControllers();
+builder.Services.AddSingleton();
var app = builder.Build();
+var versionInfo = FileVersionInfo.GetVersionInfo(Assembly.GetExecutingAssembly().Location);
+app.Logger.LogInformation($"Starting {versionInfo.ProductName} {versionInfo.ProductVersion}");
+app.Logger.LogInformation($"Using pdf2htmlEX command line arguments: {app.Services.GetService()!.CommandLineArguments}");
+
app.MapControllers();
app.Run();
diff --git a/src/Pdf2Html/Settings/ConversionOptions.cs b/src/Pdf2Html/Settings/ConversionOptions.cs
new file mode 100644
index 0000000..9fc1005
--- /dev/null
+++ b/src/Pdf2Html/Settings/ConversionOptions.cs
@@ -0,0 +1,16 @@
+using System.Text.RegularExpressions;
+
+namespace Pdf2Html.Settings;
+
+public class ConversionOptions(IConfiguration configuration)
+{
+ public string CommandLineArguments { get; } = ToCommandLineArguments(configuration.GetSection("ConversionOptions").AsEnumerable());
+
+ internal static string ToCommandLineArguments(IEnumerable> options) =>
+ string.Join(' ', options.Where(kvp => kvp.Value != null).Select(kvp => $"--{ToKebabCase(kvp.Key.Replace("ConversionOptions:", ""))}={ValueToString(kvp.Value!)}"));
+
+ private static string ValueToString(string value) => bool.TryParse(value, out var boolValue) ? (boolValue ? "1" : "0") : value;
+
+ private static string ToKebabCase(string value) =>
+ Regex.Replace(value, "(?
+ {
+ { "ConversionOptions:FooBar", "true" },
+ { "ConversionOptions:BazBlort", "FALSE" },
+ { "ConversionOptions:Hello", "World!" },
+ { "ConversionOptions:FizzBuzz", "5" },
+ };
+ var result = ConversionOptions.ToCommandLineArguments(input);
+ Assert.That(result, Is.EqualTo("--foo-bar=1 --baz-blort=0 --hello=World! --fizz-buzz=5"));
+ }
+}
diff --git a/tests/Unit.Tests/Unit.Tests.csproj b/tests/Unit.Tests/Unit.Tests.csproj
new file mode 100644
index 0000000..9b33bde
--- /dev/null
+++ b/tests/Unit.Tests/Unit.Tests.csproj
@@ -0,0 +1,20 @@
+
+
+
+ net8.0
+ enable
+ enable
+
+ false
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/tests/Unit.Tests/Usings.cs b/tests/Unit.Tests/Usings.cs
new file mode 100644
index 0000000..cefced4
--- /dev/null
+++ b/tests/Unit.Tests/Usings.cs
@@ -0,0 +1 @@
+global using NUnit.Framework;
\ No newline at end of file