Skip to content

Commit f87d55e

Browse files
Merge pull request #6 from CoreFiling/feature/configurable-cli-options
Allow the options provided to pdf2htmlEX to be configured at deployment time
2 parents f75560c + c82ebb0 commit f87d55e

14 files changed

Lines changed: 133 additions & 31 deletions

File tree

.github/workflows/docker.yml

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,12 @@ jobs:
2222
uses: actions/checkout@v3
2323
with:
2424
lfs: true
25+
- name: Set up dotnet
26+
uses: actions/setup-dotnet@v3
27+
with:
28+
dotnet-version: "8.x"
29+
- name: Unit tests
30+
run: dotnet test tests/Unit.Tests/Unit.Tests.csproj
2531
- name: Set up QEMU
2632
uses: docker/setup-qemu-action@v2
2733
- name: Set up Docker Buildx
@@ -32,14 +38,10 @@ jobs:
3238
context: ./src/Pdf2Html
3339
load: true
3440
tags: ${{ env.TEST_TAG }}
35-
- name: Set up dotnet
36-
uses: actions/setup-dotnet@v3
37-
with:
38-
dotnet-version: "8.x"
3941
- name: E2E tests
4042
run: |
4143
docker run --rm --detach -p 8080:8080 --name pdf2html ${{ env.TEST_TAG }}
42-
dotnet test --filter "FullyQualifiedName=E2E.Tests"
44+
dotnet test tests/E2E.Tests/E2E.Tests.csproj
4345
docker stop pdf2html
4446
- if: github.ref_name == 'main' || github.ref_type == 'tag'
4547
name: Login to Docker Hub

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
# Changelog
22

3-
## develop
3+
## 0.2.0
44

55
* Update to .net 8.
66
* Switch base images to Ubuntu Noble (24.04 LTS).
7+
* Add optional overrides for command-line arguments passed to `pdf2htmlEX`.
78
* Patch and build `pdf2htmlEX` as part of this build process to use `libopenjp` instead of `libjpeg` for JPEG-2000 support.
89
* All patches are in this source tree, and are applied directly to the source of the upstream tag during build.
910
* Patch issue with non-breaking spaces in `pdf2htmlEX`.

README.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,28 @@
22

33
This project is a lightweight HTTP(S) interface to the [pdf2htmlex library](https://pdf2htmlex.github.io/pdf2htmlEX/).
44

5+
## Running via Docker
6+
7+
```bash
8+
docker run -p 8080 corefiling/pdf2html:$version
9+
```
10+
11+
### Overriding `pdf2htmlEX` options
12+
13+
The command line arguments passed into `pdf2htmlEX` can be overridden by passing in environment variables prefixed by `ConversionOptions__`, e.g.:
14+
15+
```bash
16+
docker run -p 8080 -e ConversionOptions__BgFormat=png -e ConversionOptions__OptimizeText=true corefiling/pdf2html:$version
17+
```
18+
19+
The names of these setting keys are converted to lower-kebab-case arguments, and the values are converted to strings as needed (boolean `true`/`false` become `1`/`0`) - in the above example, the arguments are converted to `--bg-format=png --optimize-text=1`.
20+
21+
The full list of arguments can be found by running `pdf2htmlEX`:
22+
23+
```bash
24+
docker run corefiling/pdf2html:$version pdf2htmlEX --help
25+
```
26+
527
## Licensing
628

729
Since pdf2htmlex is licensed under the GPL, this project is too (see the LICENSE.TXT file).

pdf2html.sln

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "tests", "tests", "{C361585C
1111
EndProject
1212
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "E2E.Tests", "tests\E2E.Tests\E2E.Tests.csproj", "{9CAB9112-6B91-4615-A34A-B3C66FD3FAE1}"
1313
EndProject
14+
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Unit.Tests", "tests\Unit.Tests\Unit.Tests.csproj", "{1174DAD4-FE63-4CF6-9F23-3B9FB6BA409A}"
15+
EndProject
1416
Global
1517
GlobalSection(SolutionConfigurationPlatforms) = preSolution
1618
Debug|Any CPU = Debug|Any CPU
@@ -28,9 +30,14 @@ Global
2830
{9CAB9112-6B91-4615-A34A-B3C66FD3FAE1}.Debug|Any CPU.Build.0 = Debug|Any CPU
2931
{9CAB9112-6B91-4615-A34A-B3C66FD3FAE1}.Release|Any CPU.ActiveCfg = Release|Any CPU
3032
{9CAB9112-6B91-4615-A34A-B3C66FD3FAE1}.Release|Any CPU.Build.0 = Release|Any CPU
33+
{1174DAD4-FE63-4CF6-9F23-3B9FB6BA409A}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
34+
{1174DAD4-FE63-4CF6-9F23-3B9FB6BA409A}.Debug|Any CPU.Build.0 = Debug|Any CPU
35+
{1174DAD4-FE63-4CF6-9F23-3B9FB6BA409A}.Release|Any CPU.ActiveCfg = Release|Any CPU
36+
{1174DAD4-FE63-4CF6-9F23-3B9FB6BA409A}.Release|Any CPU.Build.0 = Release|Any CPU
3137
EndGlobalSection
3238
GlobalSection(NestedProjects) = preSolution
3339
{D3B9B4F8-F097-4F12-AB86-72CAE0B4577C} = {ABE1E425-AA84-46A5-98EA-9B6D622EF8A5}
3440
{9CAB9112-6B91-4615-A34A-B3C66FD3FAE1} = {C361585C-8D3B-4CA0-A0BF-DB74DDB00EBE}
41+
{1174DAD4-FE63-4CF6-9F23-3B9FB6BA409A} = {C361585C-8D3B-4CA0-A0BF-DB74DDB00EBE}
3542
EndGlobalSection
3643
EndGlobal

pdf2html.sln.DotSettings.user

Lines changed: 0 additions & 6 deletions
This file was deleted.

pdf2html.slnx

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
<Solution>
2+
<Project Path="src/Pdf2Html/Pdf2Html.csproj" Type="Classic C#" />
3+
<Project Path="tests/E2E.Tests/E2E.Tests.csproj" Type="Classic C#" />
4+
<Project Path="tests/Unit.Tests/Unit.Tests.csproj" Type="Classic C#" />
5+
</Solution>

src/Pdf2Html/AssemblyInfo.cs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
using System.Runtime.CompilerServices;
2+
3+
[assembly: InternalsVisibleTo("Unit.Tests")]

src/Pdf2Html/Controllers/RootController.cs

Lines changed: 11 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,17 @@
11
using System.Diagnostics;
22
using System.Net.Mime;
33
using System.Reflection;
4+
45
using Microsoft.AspNetCore.Mvc;
56

7+
using Pdf2Html.Settings;
8+
69
namespace Pdf2Html.Controllers;
710

811
[ApiController]
912
[Route("/")]
10-
public class RootController : ControllerBase
13+
public class RootController(ILogger<RootController> logger, ConversionOptions conversionOptions) : ControllerBase
1114
{
12-
private readonly ILogger<RootController> _logger;
13-
14-
public RootController(ILogger<RootController> logger)
15-
{
16-
_logger = logger;
17-
}
18-
1915
[HttpGet]
2016
public ActionResult Get()
2117
{
@@ -38,19 +34,19 @@ public async Task<ActionResult> Post()
3834
await using (var tempFileStream = System.IO.File.Open(inputFile, FileMode.Truncate))
3935
{
4036
await Request.Body.CopyToAsync(tempFileStream);
41-
_logger.LogInformation($"Copied {FormatToMb(new FileInfo(inputFile).Length)} to {inputFile}");
37+
logger.LogInformation($"Copied {FormatToMb(new FileInfo(inputFile).Length)} to {inputFile}");
4238
}
4339

44-
_logger.LogInformation("Starting conversion...");
40+
logger.LogInformation("Starting conversion...");
4541
var (success, logs) = await ConvertAsync(inputFile, outputFile);
4642

4743
if (!success)
4844
{
49-
_logger.LogError("Conversion failed");
45+
logger.LogError("Conversion failed");
5046
return StatusCode(StatusCodes.Status500InternalServerError, new { pdf2htmlEX = new { logs } });
5147
}
5248

53-
_logger.LogInformation($"Conversion completed ({FormatToMb(new FileInfo(outputFile).Length)})");
49+
logger.LogInformation($"Conversion completed ({FormatToMb(new FileInfo(outputFile).Length)})");
5450
return File(await System.IO.File.ReadAllBytesAsync(outputFile), MediaTypeNames.Text.Html);
5551
}
5652
finally
@@ -63,11 +59,10 @@ public async Task<ActionResult> Post()
6359
private async Task<(bool Success, ICollection<string> logs)> ConvertAsync(string inputFile, string outputFile)
6460
{
6561
using var p = new Process();
66-
const string conversionOptions = "--embed-javascript=0 --process-outline=0 --printing=0 --bg-format=svg --svg-node-count-limit=100 --decompose-ligature 1 --tounicode 1";
6762
p.StartInfo = new ProcessStartInfo
6863
{
6964
FileName = "pdf2htmlEX",
70-
Arguments = $"{conversionOptions} --dest-dir={Path.GetDirectoryName(outputFile)} {inputFile} {Path.GetFileName(outputFile)}",
65+
Arguments = $"{conversionOptions.CommandLineArguments} --dest-dir={Path.GetDirectoryName(outputFile)} {inputFile} {Path.GetFileName(outputFile)}",
7166
CreateNoWindow = true,
7267
RedirectStandardOutput = true,
7368
RedirectStandardError = true
@@ -83,7 +78,7 @@ void AddLog(string? log)
8378
}
8479

8580
logs.Add(log);
86-
_logger.LogInformation(log);
81+
logger.LogInformation(log);
8782
}
8883

8984
p.OutputDataReceived += (_, e) => AddLog(e.Data);
@@ -97,8 +92,5 @@ void AddLog(string? log)
9792
return (p.ExitCode == 0, logs);
9893
}
9994

100-
private static string FormatToMb(long bytesLength)
101-
{
102-
return (bytesLength / 1024.0 / 1024.0).ToString("0.00 MB");
103-
}
95+
private static string FormatToMb(long bytesLength) => (bytesLength / 1024.0 / 1024.0).ToString("0.00 MB");
10496
}

src/Pdf2Html/Program.cs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,20 @@
1+
using Pdf2Html.Settings;
2+
3+
using System.Diagnostics;
4+
using System.Reflection;
5+
16
var builder = WebApplication.CreateBuilder(args);
27
builder.Logging.ClearProviders();
38
builder.Logging.AddConsole();
49

510
// Add services to the container.
611
builder.Services.AddControllers();
12+
builder.Services.AddSingleton<ConversionOptions>();
713

814
var app = builder.Build();
15+
var versionInfo = FileVersionInfo.GetVersionInfo(Assembly.GetExecutingAssembly().Location);
16+
app.Logger.LogInformation($"Starting {versionInfo.ProductName} {versionInfo.ProductVersion}");
17+
app.Logger.LogInformation($"Using pdf2htmlEX command line arguments: {app.Services.GetService<ConversionOptions>()!.CommandLineArguments}");
18+
919
app.MapControllers();
1020
app.Run();
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
using System.Text.RegularExpressions;
2+
3+
namespace Pdf2Html.Settings;
4+
5+
public class ConversionOptions(IConfiguration configuration)
6+
{
7+
public string CommandLineArguments { get; } = ToCommandLineArguments(configuration.GetSection("ConversionOptions").AsEnumerable());
8+
9+
internal static string ToCommandLineArguments(IEnumerable<KeyValuePair<string, string?>> options) =>
10+
string.Join(' ', options.Where(kvp => kvp.Value != null).Select(kvp => $"--{ToKebabCase(kvp.Key.Replace("ConversionOptions:", ""))}={ValueToString(kvp.Value!)}"));
11+
12+
private static string ValueToString(string value) => bool.TryParse(value, out var boolValue) ? (boolValue ? "1" : "0") : value;
13+
14+
private static string ToKebabCase(string value) =>
15+
Regex.Replace(value, "(?<!^)([A-Z][a-z]|(?<=[a-z])[A-Z0-9])", "-$1", RegexOptions.Compiled).Trim().ToLower();
16+
}

0 commit comments

Comments
 (0)