Skip to content

Commit dd01764

Browse files
author
Ethan Bishop
committed
Allow the options provided to pdf2htmlEX to be configured at deployment time
Can be overridden using the `ConversionOptions__*` variables.
1 parent f75560c commit dd01764

9 files changed

Lines changed: 101 additions & 24 deletions

File tree

.github/workflows/docker.yml

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,12 @@ jobs:
2222
uses: actions/checkout@v3
2323
with:
2424
lfs: true
25+
- name: Set up dotnet
26+
uses: actions/setup-dotnet@v3
27+
with:
28+
dotnet-version: "8.x"
29+
- name: Unit tests
30+
# Use the "contains" operator (~): "=" is an exact match, and no test's fully
# qualified name is exactly "Unit.Tests", so "=" would select zero tests.
run: dotnet test --filter "FullyQualifiedName~Unit.Tests"
2531
- name: Set up QEMU
2632
uses: docker/setup-qemu-action@v2
2733
- name: Set up Docker Buildx
@@ -32,10 +38,6 @@ jobs:
3238
context: ./src/Pdf2Html
3339
load: true
3440
tags: ${{ env.TEST_TAG }}
35-
- name: Set up dotnet
36-
uses: actions/setup-dotnet@v3
37-
with:
38-
dotnet-version: "8.x"
3941
- name: E2E tests
4042
run: |
4143
docker run --rm --detach -p 8080:8080 --name pdf2html ${{ env.TEST_TAG }}

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
# Changelog
22

3-
## develop
3+
## 0.2.0
44

55
* Update to .net 8.
66
* Switch base images to Ubuntu Noble (24.04 LTS).
7+
* Add optional overrides for command-line arguments passed to `pdf2htmlEX`.
78
* Patch and build `pdf2htmlEX` as part of this build process to use `libopenjp` instead of `libjpeg` for JPEG-2000 support.
89
* All patches are in this source tree, and are applied directly to the source of the upstream tag during build.
910
* Patch issue with non-breaking spaces in `pdf2htmlEX`.

README.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,28 @@
22

33
This project is a lightweight HTTP(S) interface to the [pdf2htmlex library](https://pdf2htmlex.github.io/pdf2htmlEX/).
44

5+
## Running via Docker
6+
7+
```bash
8+
docker run -p 8080 corefiling/pdf2html:$version
9+
```
10+
11+
### Overriding `pdf2htmlEX` options
12+
13+
The command line arguments passed into `pdf2htmlEX` can be overridden by passing in environment variables prefixed by `ConversionOptions__`, e.g.:
14+
15+
```bash
16+
docker run -p 8080 -e ConversionOptions__BgFormat=png -e ConversionOptions__OptimizeText=true corefiling/pdf2html:$version
17+
```
18+
19+
The names of these setting keys are converted to lower-kebab-case arguments, and the values are converted to strings as needed (booleans become `1` or `0`) - in the above example, the arguments are converted to `--bg-format=png --optimize-text=1`.
20+
21+
The full list of arguments can be found by running `pdf2htmlEX`:
22+
23+
```bash
24+
docker run corefiling/pdf2html:$version pdf2htmlEX --help
25+
```
26+
527
## Licensing
628

729
Since pdf2htmlex is licensed under the GPL, this project is too (see the LICENSE.TXT file).

src/Pdf2Html/AssemblyInfo.cs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
using System.Runtime.CompilerServices;
2+
3+
[assembly: InternalsVisibleTo("Unit.Tests")]

src/Pdf2Html/Controllers/RootController.cs

Lines changed: 18 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,15 @@
11
using System.Diagnostics;
22
using System.Net.Mime;
33
using System.Reflection;
4+
using System.Text.RegularExpressions;
45
using Microsoft.AspNetCore.Mvc;
56

67
namespace Pdf2Html.Controllers;
78

89
[ApiController]
910
[Route("/")]
10-
public class RootController : ControllerBase
11+
public class RootController(ILogger<RootController> logger, IConfiguration configuration) : ControllerBase
1112
{
12-
private readonly ILogger<RootController> _logger;
13-
14-
public RootController(ILogger<RootController> logger)
15-
{
16-
_logger = logger;
17-
}
18-
1913
[HttpGet]
2014
public ActionResult Get()
2115
{
@@ -38,19 +32,19 @@ public async Task<ActionResult> Post()
3832
await using (var tempFileStream = System.IO.File.Open(inputFile, FileMode.Truncate))
3933
{
4034
await Request.Body.CopyToAsync(tempFileStream);
41-
_logger.LogInformation($"Copied {FormatToMb(new FileInfo(inputFile).Length)} to {inputFile}");
35+
logger.LogInformation($"Copied {FormatToMb(new FileInfo(inputFile).Length)} to {inputFile}");
4236
}
4337

44-
_logger.LogInformation("Starting conversion...");
38+
logger.LogInformation("Starting conversion...");
4539
var (success, logs) = await ConvertAsync(inputFile, outputFile);
4640

4741
if (!success)
4842
{
49-
_logger.LogError("Conversion failed");
43+
logger.LogError("Conversion failed");
5044
return StatusCode(StatusCodes.Status500InternalServerError, new { pdf2htmlEX = new { logs } });
5145
}
5246

53-
_logger.LogInformation($"Conversion completed ({FormatToMb(new FileInfo(outputFile).Length)})");
47+
logger.LogInformation($"Conversion completed ({FormatToMb(new FileInfo(outputFile).Length)})");
5448
return File(await System.IO.File.ReadAllBytesAsync(outputFile), MediaTypeNames.Text.Html);
5549
}
5650
finally
@@ -63,11 +57,11 @@ public async Task<ActionResult> Post()
6357
private async Task<(bool Success, ICollection<string> logs)> ConvertAsync(string inputFile, string outputFile)
6458
{
6559
using var p = new Process();
66-
const string conversionOptions = "--embed-javascript=0 --process-outline=0 --printing=0 --bg-format=svg --svg-node-count-limit=100 --decompose-ligature 1 --tounicode 1";
60+
string options = ToCommandLineArguments(configuration.GetSection("ConversionOptions").AsEnumerable());
6761
p.StartInfo = new ProcessStartInfo
6862
{
6963
FileName = "pdf2htmlEX",
70-
Arguments = $"{conversionOptions} --dest-dir={Path.GetDirectoryName(outputFile)} {inputFile} {Path.GetFileName(outputFile)}",
64+
Arguments = $"{options} --dest-dir={Path.GetDirectoryName(outputFile)} {inputFile} {Path.GetFileName(outputFile)}",
7165
CreateNoWindow = true,
7266
RedirectStandardOutput = true,
7367
RedirectStandardError = true
@@ -83,7 +77,7 @@ void AddLog(string? log)
8377
}
8478

8579
logs.Add(log);
86-
_logger.LogInformation(log);
80+
logger.LogInformation(log);
8781
}
8882

8983
p.OutputDataReceived += (_, e) => AddLog(e.Data);
@@ -97,8 +91,13 @@ void AddLog(string? log)
9791
return (p.ExitCode == 0, logs);
9892
}
9993

100-
private static string FormatToMb(long bytesLength)
101-
{
102-
return (bytesLength / 1024.0 / 1024.0).ToString("0.00 MB");
103-
}
94+
/// <summary>
/// Builds a space-separated list of <c>--name=value</c> command-line arguments
/// from a set of configuration entries.
/// </summary>
/// <param name="options">
/// Configuration key/value pairs. Entries whose value is <c>null</c> are skipped.
/// </param>
/// <returns>
/// One <c>--name=value</c> token per non-null entry, joined by single spaces;
/// an empty string when there are no such entries.
/// </returns>
internal static string ToCommandLineArguments(IEnumerable<KeyValuePair<string, string?>> options)
{
    var arguments = new List<string>();

    foreach (var (key, value) in options)
    {
        if (value == null)
        {
            continue;
        }

        // Drop the configuration section prefix so only the option name remains.
        var name = ToKebabCase(key.Replace("ConversionOptions:", ""));
        arguments.Add($"--{name}={ValueToString(value)}");
    }

    return string.Join(' ', arguments);
}
96+
97+
/// <summary>
/// Converts a configuration value to its command-line form: values that parse
/// as a boolean become <c>"1"</c> or <c>"0"</c>; anything else is returned unchanged.
/// </summary>
/// <param name="value">The raw configuration value.</param>
/// <returns>The string to place after <c>=</c> in the argument.</returns>
private static string ValueToString(string value)
{
    if (!bool.TryParse(value, out var parsed))
    {
        return value;
    }

    return parsed ? "1" : "0";
}
98+
99+
/// <summary>
/// Converts a PascalCase identifier to lower-kebab-case, e.g. <c>BgFormat</c>
/// becomes <c>bg-format</c>.
/// </summary>
/// <param name="value">The identifier to convert.</param>
/// <returns>The trimmed, lower-cased identifier with hyphens inserted at word boundaries.</returns>
private static string ToKebabCase(string value) =>
    // A hyphen is inserted before an upper-case letter followed by a lower-case one,
    // or before an upper-case letter/digit that follows a lower-case letter - never at
    // the start. Lower-casing is culture-invariant so the switch name does not depend
    // on the host locale (e.g. Turkish would otherwise map 'I' to dotless 'ı').
    Regex.Replace(value, "(?<!^)([A-Z][a-z]|(?<=[a-z])[A-Z0-9])", "-$1", RegexOptions.Compiled).Trim().ToLowerInvariant();
101+
102+
/// <summary>
/// Formats a byte count as megabytes with two decimal places and an "MB" suffix.
/// </summary>
/// <param name="bytesLength">The size in bytes.</param>
/// <returns>The size divided by 1024 twice, formatted with the pattern <c>0.00 MB</c>.</returns>
private static string FormatToMb(long bytesLength)
{
    double megabytes = bytesLength / 1024.0 / 1024.0;
    return megabytes.ToString("0.00 MB");
}
104103
}

src/Pdf2Html/appsettings.json

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,13 @@
11
{
2+
"ConversionOptions": {
3+
"EmbedJavascript": false,
4+
"ProcessOutline": false,
5+
"Printing": false,
6+
"BgFormat": "svg",
7+
"SvgNodeCountLimit": 100,
8+
"DecomposeLigature": true,
9+
"Tounicode": true
10+
},
211
"Logging": {
312
"LogLevel": {
413
"Default": "Information",
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
using Pdf2Html.Controllers;
2+
3+
namespace Unit.Tests;
4+
5+
/// <summary>
/// Unit tests for the static helpers exposed by <see cref="RootController"/>.
/// </summary>
public class RootControllerTest
{
    /// <summary>
    /// Checks that configuration entries are rendered as kebab-case
    /// <c>--name=value</c> arguments, with boolean values mapped to 1/0 and
    /// null-valued entries omitted.
    /// </summary>
    [Test]
    public void TestToCommandLineArguments()
    {
        // An array rather than a Dictionary: the assertion depends on argument order,
        // and Dictionary enumeration order is not guaranteed.
        var input = new KeyValuePair<string, string?>[]
        {
            // Null-valued entry; it must not produce an argument.
            new("ConversionOptions", null),
            new("ConversionOptions:FooBar", "true"),
            new("ConversionOptions:BazBlort", "FALSE"),
            new("ConversionOptions:Hello", "World!"),
            new("ConversionOptions:FizzBuzz", "5"),
        };

        var result = RootController.ToCommandLineArguments(input);

        Assert.That(result, Is.EqualTo("--foo-bar=1 --baz-blort=0 --hello=World! --fizz-buzz=5"));
    }
}

tests/Unit.Tests/Unit.Tests.csproj

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
<Project Sdk="Microsoft.NET.Sdk">
2+
3+
<PropertyGroup>
4+
<TargetFramework>net8.0</TargetFramework>
5+
<Nullable>enable</Nullable>
6+
<ImplicitUsings>enable</ImplicitUsings>
7+
8+
<IsPackable>false</IsPackable>
9+
</PropertyGroup>
10+
11+
<ItemGroup>
12+
<ProjectReference Include="../../src/Pdf2Html/Pdf2Html.csproj" />
13+
14+
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.3.2" />
15+
<PackageReference Include="NUnit" Version="3.13.3" />
16+
<PackageReference Include="NUnit3TestAdapter" Version="4.2.1" />
17+
<PackageReference Include="NUnit.Analyzers" Version="3.3.0" />
18+
</ItemGroup>
19+
20+
</Project>

tests/Unit.Tests/Usings.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
global using NUnit.Framework;

0 commit comments

Comments
 (0)