Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 7 additions & 5 deletions .github/workflows/docker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,12 @@ jobs:
uses: actions/checkout@v3
with:
lfs: true
- name: Set up dotnet
uses: actions/setup-dotnet@v3
with:
dotnet-version: "8.x"
- name: Unit tests
run: dotnet test tests/Unit.Tests/Unit.Tests.csproj
- name: Set up QEMU
uses: docker/setup-qemu-action@v2
- name: Set up Docker Buildx
Expand All @@ -32,14 +38,10 @@ jobs:
context: ./src/Pdf2Html
load: true
tags: ${{ env.TEST_TAG }}
- name: Set up dotnet
uses: actions/setup-dotnet@v3
with:
dotnet-version: "8.x"
- name: E2E tests
run: |
docker run --rm --detach -p 8080:8080 --name pdf2html ${{ env.TEST_TAG }}
dotnet test --filter "FullyQualifiedName=E2E.Tests"
dotnet test tests/E2E.Tests/E2E.Tests.csproj
docker stop pdf2html
- if: github.ref_name == 'main' || github.ref_type == 'tag'
name: Login to Docker Hub
Expand Down
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
# Changelog

## develop
## 0.2.0

* Update to .net 8.
* Switch base images to Ubuntu Noble (24.04 LTS).
* Add optional overrides for command-line arguments passed to `pdf2htmlEX`.
* Patch and build `pdf2htmlEX` as part of this build process to use `libopenjp` instead of `libjpeg` for JPEG-2000 support.
* All patches are in this source tree, and are applied directly to the source of the upstream tag during build.
* Patch issue with non-breaking spaces in `pdf2htmlEX`.
Expand Down
22 changes: 22 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,28 @@

This project is a lightweight HTTP(S) interface to the [pdf2htmlex library](https://pdf2htmlex.github.io/pdf2htmlEX/).

## Running via Docker

```bash
docker run -p 8080 corefiling/pdf2html:$version
```

### Overriding `pdf2htmlEX` options

The command-line arguments passed to `pdf2htmlEX` can be overridden by setting environment variables prefixed with `ConversionOptions__`, e.g.:

```bash
docker run -p 8080 -e ConversionOptions__BgFormat=png -e ConversionOptions__OptimizeText=true corefiling/pdf2html:$version
```

The names of these setting keys are converted to lower-kebab-case arguments, and the values are converted to strings as needed (booleans become `1` or `0`) - in the above example, the arguments are converted to `--bg-format=png --optimize-text=1`.

The full list of arguments can be found by running `pdf2htmlEX`:

```bash
docker run corefiling/pdf2html:$version pdf2htmlEX --help
```

## Licensing

Since pdf2htmlex is licensed under the GPL, this project is too (see the LICENSE.TXT file).
Expand Down
7 changes: 7 additions & 0 deletions pdf2html.sln
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "tests", "tests", "{C361585C
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "E2E.Tests", "tests\E2E.Tests\E2E.Tests.csproj", "{9CAB9112-6B91-4615-A34A-B3C66FD3FAE1}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Unit.Tests", "tests\Unit.Tests\Unit.Tests.csproj", "{1174DAD4-FE63-4CF6-9F23-3B9FB6BA409A}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Expand All @@ -28,9 +30,14 @@ Global
{9CAB9112-6B91-4615-A34A-B3C66FD3FAE1}.Debug|Any CPU.Build.0 = Debug|Any CPU
{9CAB9112-6B91-4615-A34A-B3C66FD3FAE1}.Release|Any CPU.ActiveCfg = Release|Any CPU
{9CAB9112-6B91-4615-A34A-B3C66FD3FAE1}.Release|Any CPU.Build.0 = Release|Any CPU
{1174DAD4-FE63-4CF6-9F23-3B9FB6BA409A}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{1174DAD4-FE63-4CF6-9F23-3B9FB6BA409A}.Debug|Any CPU.Build.0 = Debug|Any CPU
{1174DAD4-FE63-4CF6-9F23-3B9FB6BA409A}.Release|Any CPU.ActiveCfg = Release|Any CPU
{1174DAD4-FE63-4CF6-9F23-3B9FB6BA409A}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(NestedProjects) = preSolution
{D3B9B4F8-F097-4F12-AB86-72CAE0B4577C} = {ABE1E425-AA84-46A5-98EA-9B6D622EF8A5}
{9CAB9112-6B91-4615-A34A-B3C66FD3FAE1} = {C361585C-8D3B-4CA0-A0BF-DB74DDB00EBE}
{1174DAD4-FE63-4CF6-9F23-3B9FB6BA409A} = {C361585C-8D3B-4CA0-A0BF-DB74DDB00EBE}
EndGlobalSection
EndGlobal
6 changes: 0 additions & 6 deletions pdf2html.sln.DotSettings.user

This file was deleted.

5 changes: 5 additions & 0 deletions pdf2html.slnx
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
<Solution>
<Project Path="src/Pdf2Html/Pdf2Html.csproj" Type="Classic C#" />
<Project Path="tests/E2E.Tests/E2E.Tests.csproj" Type="Classic C#" />
<Project Path="tests/Unit.Tests/Unit.Tests.csproj" Type="Classic C#" />
</Solution>
3 changes: 3 additions & 0 deletions src/Pdf2Html/AssemblyInfo.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
using System.Runtime.CompilerServices;

[assembly: InternalsVisibleTo("Unit.Tests")]
30 changes: 11 additions & 19 deletions src/Pdf2Html/Controllers/RootController.cs
Original file line number Diff line number Diff line change
@@ -1,21 +1,17 @@
using System.Diagnostics;
using System.Net.Mime;
using System.Reflection;

using Microsoft.AspNetCore.Mvc;

using Pdf2Html.Settings;

namespace Pdf2Html.Controllers;

[ApiController]
[Route("/")]
public class RootController : ControllerBase
public class RootController(ILogger<RootController> logger, ConversionOptions conversionOptions) : ControllerBase
{
private readonly ILogger<RootController> _logger;

public RootController(ILogger<RootController> logger)
{
_logger = logger;
}

[HttpGet]
public ActionResult Get()
{
Expand All @@ -38,19 +34,19 @@ public async Task<ActionResult> Post()
await using (var tempFileStream = System.IO.File.Open(inputFile, FileMode.Truncate))
{
await Request.Body.CopyToAsync(tempFileStream);
_logger.LogInformation($"Copied {FormatToMb(new FileInfo(inputFile).Length)} to {inputFile}");
logger.LogInformation($"Copied {FormatToMb(new FileInfo(inputFile).Length)} to {inputFile}");
}

_logger.LogInformation("Starting conversion...");
logger.LogInformation("Starting conversion...");
var (success, logs) = await ConvertAsync(inputFile, outputFile);

if (!success)
{
_logger.LogError("Conversion failed");
logger.LogError("Conversion failed");
return StatusCode(StatusCodes.Status500InternalServerError, new { pdf2htmlEX = new { logs } });
}

_logger.LogInformation($"Conversion completed ({FormatToMb(new FileInfo(outputFile).Length)})");
logger.LogInformation($"Conversion completed ({FormatToMb(new FileInfo(outputFile).Length)})");
return File(await System.IO.File.ReadAllBytesAsync(outputFile), MediaTypeNames.Text.Html);
}
finally
Expand All @@ -63,11 +59,10 @@ public async Task<ActionResult> Post()
private async Task<(bool Success, ICollection<string> logs)> ConvertAsync(string inputFile, string outputFile)
{
using var p = new Process();
const string conversionOptions = "--embed-javascript=0 --process-outline=0 --printing=0 --bg-format=svg --svg-node-count-limit=100 --decompose-ligature 1 --tounicode 1";
p.StartInfo = new ProcessStartInfo
{
FileName = "pdf2htmlEX",
Arguments = $"{conversionOptions} --dest-dir={Path.GetDirectoryName(outputFile)} {inputFile} {Path.GetFileName(outputFile)}",
Arguments = $"{conversionOptions.CommandLineArguments} --dest-dir={Path.GetDirectoryName(outputFile)} {inputFile} {Path.GetFileName(outputFile)}",
CreateNoWindow = true,
RedirectStandardOutput = true,
RedirectStandardError = true
Expand All @@ -83,7 +78,7 @@ void AddLog(string? log)
}

logs.Add(log);
_logger.LogInformation(log);
logger.LogInformation(log);
}

p.OutputDataReceived += (_, e) => AddLog(e.Data);
Expand All @@ -97,8 +92,5 @@ void AddLog(string? log)
return (p.ExitCode == 0, logs);
}

private static string FormatToMb(long bytesLength)
{
return (bytesLength / 1024.0 / 1024.0).ToString("0.00 MB");
}
private static string FormatToMb(long bytesLength) => (bytesLength / 1024.0 / 1024.0).ToString("0.00 MB");
}
10 changes: 10 additions & 0 deletions src/Pdf2Html/Program.cs
Original file line number Diff line number Diff line change
@@ -1,10 +1,20 @@
using Pdf2Html.Settings;

using System.Diagnostics;
using System.Reflection;

// Application entry point: configures logging and services, logs startup
// information, then runs the web host.
var builder = WebApplication.CreateBuilder(args);

// Remove all registered logging providers and log to the console only.
builder.Logging.ClearProviders();
builder.Logging.AddConsole();

// Add services to the container.
builder.Services.AddControllers();
builder.Services.AddSingleton<ConversionOptions>();

var app = builder.Build();

// Log the product name/version read from the executing assembly's file metadata.
var versionInfo = FileVersionInfo.GetVersionInfo(Assembly.GetExecutingAssembly().Location);
app.Logger.LogInformation(
    "Starting {ProductName} {ProductVersion}",
    versionInfo.ProductName,
    versionInfo.ProductVersion);

// GetRequiredService throws a descriptive InvalidOperationException if the
// registration is missing, rather than returning null and relying on `!`.
var conversionOptions = app.Services.GetRequiredService<ConversionOptions>();
app.Logger.LogInformation(
    "Using pdf2htmlEX command line arguments: {CommandLineArguments}",
    conversionOptions.CommandLineArguments);

app.MapControllers();
app.Run();
16 changes: 16 additions & 0 deletions src/Pdf2Html/Settings/ConversionOptions.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
using System.Text.RegularExpressions;

namespace Pdf2Html.Settings;

/// <summary>
/// Builds the command-line arguments passed to <c>pdf2htmlEX</c> from the
/// <c>ConversionOptions</c> configuration section.
/// </summary>
/// <param name="configuration">Application configuration containing the <c>ConversionOptions</c> section.</param>
public class ConversionOptions(IConfiguration configuration)
{
    private const string SectionName = "ConversionOptions";
    private const string SectionPrefix = SectionName + ":";

    // Matches the start of each word after the first in a PascalCase key:
    // an upper-case letter followed by a lower-case one, or an upper-case
    // letter/digit preceded by a lower-case letter. Never matches at index 0.
    private static readonly Regex WordBoundary =
        new("(?<!^)([A-Z][a-z]|(?<=[a-z])[A-Z0-9])", RegexOptions.Compiled);

    /// <summary>
    /// The space-separated <c>--key=value</c> arguments derived from configuration,
    /// computed once when the instance is constructed.
    /// </summary>
    public string CommandLineArguments { get; } =
        ToCommandLineArguments(configuration.GetSection(SectionName).AsEnumerable());

    /// <summary>
    /// Converts configuration entries into a single argument string.
    /// Entries with a <c>null</c> value are skipped; each remaining entry becomes
    /// <c>--kebab-case-key=value</c>, joined by single spaces in input order.
    /// </summary>
    /// <param name="options">Key/value pairs whose keys may carry the <c>ConversionOptions:</c> prefix.</param>
    /// <returns>The joined argument string (empty when no entry has a value).</returns>
    internal static string ToCommandLineArguments(IEnumerable<KeyValuePair<string, string?>> options) =>
        string.Join(
            ' ',
            options
                .Where(kvp => kvp.Value is not null)
                .Select(kvp => $"--{ToKebabCase(StripSectionPrefix(kvp.Key))}={ValueToString(kvp.Value!)}"));

    // Configuration keys are case-insensitive, so the prefix is removed regardless of casing.
    private static string StripSectionPrefix(string key) =>
        key.Replace(SectionPrefix, "", StringComparison.OrdinalIgnoreCase);

    // Boolean-like values ("true"/"FALSE"/...) are rendered as 1/0; anything else is passed through unchanged.
    private static string ValueToString(string value) =>
        bool.TryParse(value, out var boolValue) ? (boolValue ? "1" : "0") : value;

    // Lower-cases with the invariant culture so the resulting flag name does not
    // depend on the host locale (e.g. Turkish 'I' -> dotless 'ı' under ToLower()).
    private static string ToKebabCase(string value) =>
        WordBoundary.Replace(value, "-$1").Trim().ToLowerInvariant();
}
9 changes: 9 additions & 0 deletions src/Pdf2Html/appsettings.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,13 @@
{
"ConversionOptions": {
"EmbedJavascript": false,
"ProcessOutline": false,
"Printing": false,
"BgFormat": "svg",
"SvgNodeCountLimit": 100,
"DecomposeLigature": true,
"Tounicode": true
},
"Logging": {
"LogLevel": {
"Default": "Information",
Expand Down
20 changes: 20 additions & 0 deletions tests/Unit.Tests/ConversionOptionsTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
using Pdf2Html.Settings;

namespace Unit.Tests;

/// <summary>
/// Unit tests for <see cref="ConversionOptions"/>.
/// </summary>
public class ConversionOptionsTests
{
    /// <summary>
    /// Verifies that keys are converted to lower-kebab-case, boolean-like values
    /// become 1/0, other values pass through, and null-valued entries are skipped.
    /// </summary>
    [Test]
    public void TestToCommandLineArguments()
    {
        // An array is used rather than a Dictionary: the assertion depends on the
        // order of the arguments, and Dictionary enumeration order is unspecified.
        var input = new KeyValuePair<string, string?>[]
        {
            // Entries with a null value must not produce an argument.
            new("ConversionOptions", null),
            new("ConversionOptions:FooBar", "true"),
            new("ConversionOptions:BazBlort", "FALSE"),
            new("ConversionOptions:Hello", "World!"),
            new("ConversionOptions:FizzBuzz", "5"),
        };

        var result = ConversionOptions.ToCommandLineArguments(input);

        Assert.That(result, Is.EqualTo("--foo-bar=1 --baz-blort=0 --hello=World! --fizz-buzz=5"));
    }
}
20 changes: 20 additions & 0 deletions tests/Unit.Tests/Unit.Tests.csproj
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<TargetFramework>net8.0</TargetFramework>
<Nullable>enable</Nullable>
<ImplicitUsings>enable</ImplicitUsings>

<IsPackable>false</IsPackable>
</PropertyGroup>

<ItemGroup>
<ProjectReference Include="../../src/Pdf2Html/Pdf2Html.csproj" />

<PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.3.2" />
<PackageReference Include="NUnit" Version="3.13.3" />
<PackageReference Include="NUnit3TestAdapter" Version="4.2.1" />
<PackageReference Include="NUnit.Analyzers" Version="3.3.0" />
</ItemGroup>

</Project>
1 change: 1 addition & 0 deletions tests/Unit.Tests/Usings.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
global using NUnit.Framework;
Loading