Skip to content

Commit dea212f

Browse files
committed
Fix channel order handling in BufferedImageUtil.toBchwInput
Originally, when I was writing this code, I was under the assumption that `Raster` is a pretty low-level interface. So I blindly assumed that if pixel values are in BGR order in the data buffer, then calling `Raster.getSample` with band 0 would return the blue value. But that is not the case. The bands are RGB-ordered, and the sample model does the index conversion for us. So we were passing the wrong values, which caused the red and blue values to be swapped... Since most tests involved black text on white backgrounds, there are not a lot of meaningful changes in the results. In some cases the result is a bit better, in some it is a bit worse. Additionally, added a basic test to cover this in the future. This issue is present only in the Java version of the library.
1 parent 9d0d67b commit dea212f

39 files changed

Lines changed: 193 additions & 87 deletions

pdfocr-onnxtr/src/main/java/com/itextpdf/pdfocr/onnxtr/util/BufferedImageUtil.java

Lines changed: 50 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ This file is part of the iText (R) project.
2828
import com.itextpdf.pdfocr.onnxtr.FloatBufferMdArray;
2929
import com.itextpdf.pdfocr.onnxtr.OnnxInputProperties;
3030
import com.itextpdf.pdfocr.onnxtr.exceptions.PdfOcrOnnxTrExceptionMessageConstant;
31+
3132
import org.bytedeco.javacpp.indexer.FloatIndexer;
3233
import org.bytedeco.javacpp.indexer.UByteIndexer;
3334
import org.bytedeco.opencv.global.opencv_imgproc;
@@ -39,7 +40,7 @@ This file is part of the iText (R) project.
3940
import java.awt.Graphics2D;
4041
import java.awt.RenderingHints;
4142
import java.awt.image.BufferedImage;
42-
import java.awt.image.WritableRaster;
43+
import java.awt.image.Raster;
4344
import java.nio.ByteBuffer;
4445
import java.nio.ByteOrder;
4546
import java.nio.FloatBuffer;
@@ -51,6 +52,25 @@ This file is part of the iText (R) project.
5152
* Additional algorithms for working with {@link BufferedImage}.
5253
*/
5354
public final class BufferedImageUtil {
55+
/**
56+
* Band index to retrieve a red channel sample from a Raster. Band order
57+
* does not depend on the image type, so it is the same for all RGB
58+
* variants: RGB, ARGB, BGR, ABGR.
59+
*/
60+
private static final int BAND_RED = 0;
61+
/**
62+
* Band index to retrieve a green channel sample from a Raster. Band order
63+
* does not depend on the image type, so it is the same for all RGB
64+
* variants: RGB, ARGB, BGR, ABGR.
65+
*/
66+
private static final int BAND_GREEN = 1;
67+
/**
68+
* Band index to retrieve a blue channel sample from a Raster. Band order
69+
* does not depend on the image type, so it is the same for all RGB
70+
* variants: RGB, ARGB, BGR, ABGR.
71+
*/
72+
private static final int BAND_BLUE = 2;
73+
5474
private BufferedImageUtil() {
5575
}
5676

@@ -99,27 +119,7 @@ public static FloatBufferMdArray toBchwInput(Collection<BufferedImage> images, O
99119
properties.getHeight(),
100120
properties.useSymmetricPad()
101121
);
102-
assert resizedImage.getType() == BufferedImage.TYPE_3BYTE_BGR;
103-
// Doing normalization at the same time as we fill the buffer
104-
final WritableRaster raster = resizedImage.getRaster();
105-
for (int y = 0; y < resizedImage.getHeight(); ++y) {
106-
for (int x = 0; x < resizedImage.getWidth(); ++x) {
107-
final float r = raster.getSample(x, y, 2) / 255F;
108-
inputData.put((r - properties.getRedMean()) / properties.getRedStd());
109-
}
110-
}
111-
for (int y = 0; y < resizedImage.getHeight(); ++y) {
112-
for (int x = 0; x < resizedImage.getWidth(); ++x) {
113-
final float g = raster.getSample(x, y, 1) / 255F;
114-
inputData.put((g - properties.getGreenMean()) / properties.getGreenStd());
115-
}
116-
}
117-
for (int y = 0; y < resizedImage.getHeight(); ++y) {
118-
for (int x = 0; x < resizedImage.getWidth(); ++x) {
119-
final float b = raster.getSample(x, y, 0) / 255F;
120-
inputData.put((b - properties.getBlueMean()) / properties.getBlueStd());
121-
}
122-
}
122+
putRgbImageWithNormalization(inputData, resizedImage, properties);
123123
}
124124
inputData.rewind();
125125
return new FloatBufferMdArray(inputData, inputShape);
@@ -289,6 +289,34 @@ private static BufferedImage resize(BufferedImage image, int width, int height,
289289
return result;
290290
}
291291

292+
private static void putRgbImageWithNormalization(
293+
FloatBuffer outputBuffer,
294+
BufferedImage image,
295+
OnnxInputProperties props
296+
) {
297+
assert image.getType() == BufferedImage.TYPE_3BYTE_BGR;
298+
299+
putImageBandWithNormalization(outputBuffer, image, BAND_RED, props.getRedMean(), props.getRedStd());
300+
putImageBandWithNormalization(outputBuffer, image, BAND_GREEN, props.getGreenMean(), props.getGreenStd());
301+
putImageBandWithNormalization(outputBuffer, image, BAND_BLUE, props.getBlueMean(), props.getBlueStd());
302+
}
303+
304+
private static void putImageBandWithNormalization(
305+
FloatBuffer outputBuffer,
306+
BufferedImage image,
307+
int band,
308+
double mean,
309+
double std
310+
) {
311+
final Raster raster = image.getRaster();
312+
for (int y = 0; y < raster.getHeight(); ++y) {
313+
for (int x = 0; x < raster.getWidth(); ++x) {
314+
final double v = raster.getSample(x, y, band) / 255.0;
315+
outputBuffer.put((float) ((v - mean) / std));
316+
}
317+
}
318+
}
319+
292320
private static Mat calculateBoxTransformationMat(Point[] box, float boxWidth, float boxHeight) {
293321
try (final Mat srcPoints = new Mat(3, 2, CvType.CV_32F);
294322
final Mat dstPoints = new Mat(3, 2, CvType.CV_32F);

pdfocr-onnxtr/src/test/java/com/itextpdf/pdfocr/onnxtr/OnnxDoImageOcrLanguagesTest.java

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ public void arabic1DoImageOcrTest() throws Exception {
8080
OnnxTrOcrEngine ocrEngine = new OnnxTrOcrEngine(detectionPredictor, recognitionPredictor);
8181

8282
String textFromImage = OnnxTestUtils.getTextFromImage(imageFile, ocrEngine);
83-
Assertions.assertEquals("13\n-\n4\n6\nSta:as)\n9\n4at\n-\nlive,\nlaugh,\nlove\n", textFromImage);
83+
Assertions.assertEquals("13\n-\nA\n6\nSta:as)\n9\n4tj\n-\nlive,\nlaugh,\nlove\n", textFromImage);
8484

8585
ocrEngine.close();
8686
}
@@ -110,7 +110,7 @@ public void bengaliDoImageOcrTest() throws Exception {
110110
OnnxTrOcrEngine ocrEngine = new OnnxTrOcrEngine(detectionPredictor, recognitionPredictor);
111111

112112
String textFromImage = OnnxTestUtils.getTextFromImage(imageFile, ocrEngine);
113-
Assertions.assertEquals("3(5T\n*T(3T\n", textFromImage);
113+
Assertions.assertEquals("3(5\nT(3T\n", textFromImage);
114114

115115
ocrEngine.close();
116116
}
@@ -200,7 +200,7 @@ public void greekDoImageOcrTest() throws Exception {
200200
OnnxTrOcrEngine ocrEngine = new OnnxTrOcrEngine(detectionPredictor, recognitionPredictor);
201201

202202
String textFromImage = OnnxTestUtils.getTextFromImage(imageFile, ocrEngine);
203-
Assertions.assertEquals(")\nP\n-\ny\n-\nE\nC\nN\nC\nM\n-\nA\nC\nI\nI\nA\n$\n/\n7156W5\n$\nxaboluxns\n2\n2\n14\nCTOS02u27\n2\n9\n-\nEXX2MG10\n$\ndycGuxns.\n", textFromImage);
203+
Assertions.assertEquals("0)\nP\n-\nV\n-\nE\nO\nN\n-\nM\nC\nA\nC)\nI\nI\nA\n$\n/\n7156W5\n$\nxabouxns\n2\n2\n7\nCTOS02u275\n2\n2\n/\nEXX2MG109\n$\ndycGuxns.\n", textFromImage);
204204

205205
ocrEngine.close();
206206
}
@@ -215,7 +215,7 @@ public void hindi1DoImageOcrTest() throws Exception {
215215
OnnxTrOcrEngine ocrEngine = new OnnxTrOcrEngine(detectionPredictor, recognitionPredictor);
216216

217217
String textFromImage = OnnxTestUtils.getTextFromImage(imageFile, ocrEngine);
218-
Assertions.assertEquals("o\n-\nG\ntT\ndeass\n", textFromImage);
218+
Assertions.assertEquals("o\n-\nG\ntT\ndes\n", textFromImage);
219219

220220
ocrEngine.close();
221221
}
@@ -230,7 +230,7 @@ public void hindi2DoImageOcrTest() throws Exception {
230230
OnnxTrOcrEngine ocrEngine = new OnnxTrOcrEngine(detectionPredictor, recognitionPredictor);
231231

232232
String textFromImage =OnnxTestUtils.getTextFromImage(imageFile, ocrEngine);
233-
Assertions.assertEquals("dloich\nSloial\nHindi\n", textFromImage);
233+
Assertions.assertEquals("dloich\nSlaiai\nHindi\n", textFromImage);
234234

235235
ocrEngine.close();
236236
}
@@ -260,7 +260,7 @@ public void japaneseDoImageOcrTest() throws Exception {
260260
OnnxTrOcrEngine ocrEngine = new OnnxTrOcrEngine(detectionPredictor, recognitionPredictor);
261261

262262
String textFromImage = OnnxTestUtils.getTextFromImage(imageFile, ocrEngine);
263-
Assertions.assertEquals("B\n*\n&\n-\n-\nE\nD\nX\n*\n-\n", textFromImage);
263+
Assertions.assertEquals("B\n*\naa\n-\n-\na\nK\n*\n-\n", textFromImage);
264264

265265
ocrEngine.close();
266266
}
@@ -310,7 +310,7 @@ public void thai1DoImageOcrTest() throws Exception {
310310
OnnxTrOcrEngine ocrEngine = new OnnxTrOcrEngine(detectionPredictor, recognitionPredictor);
311311
String textFromImage = OnnxTestUtils.getTextFromImage(imageFile, ocrEngine);
312312
Assertions.assertEquals(
313-
"3581991\n1\n19n8\nA\nA\nI\na\n&\n1\n19008791914497907597\n15790707047005\n19n8\n",
313+
"3581981\n1\n19n8\nA\nA\nI\na\n&\n\n19008791914497907597\n15790707047005\n19n8\n",
314314
textFromImage, textFromImage);
315315

316316
ocrEngine.close();
@@ -326,10 +326,10 @@ public void thai2DoImageOcrTest() throws Exception {
326326
OnnxTrOcrEngine ocrEngine = new OnnxTrOcrEngine(detectionPredictor, recognitionPredictor);
327327

328328
String textFromImage = OnnxTestUtils.getTextFromImage(imageFile, ocrEngine);
329-
Assertions.assertTrue(textFromImage.contains("\nGNwwInygEnAnUONEDNMENEnMEVouDoruRE\n"));
330-
Assertions.assertTrue(textFromImage.contains("\nWsruilunaMASIwOyuEAL\n"));
331-
Assertions.assertTrue(textFromImage.contains("\nwialwwnnaurw)\n"));
332-
Assertions.assertTrue(textFromImage.contains("\nLmniloumwnounguwyuEREngenananuidwwéryenshuluwenennl\n"));
329+
Assertions.assertTrue(textFromImage.contains("\nGNuwInygEMEMAnUONEDNMENAVouDaruRE\n"));
330+
Assertions.assertTrue(textFromImage.contains("\nWsruilunaMASIwEyuEAL\n"));
331+
Assertions.assertTrue(textFromImage.contains("\nMielwynanaur\n"));
332+
Assertions.assertTrue(textFromImage.contains("\nlnilounnenourwdryeuREnAOnaNADuiluwdrysnrailununeinnl\n"));
333333

334334
ocrEngine.close();
335335
}

pdfocr-onnxtr/src/test/java/com/itextpdf/pdfocr/onnxtr/OnnxDoImageOcrRotatedTest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ public void rotatedColorsMix2DoImageOcrTest() {
125125
File imageFile = new File(src);
126126

127127
String textFromImage = OnnxTestUtils.getTextFromImage(imageFile, OCR_ENGINE);
128-
Assertions.assertEquals("does\nthis\nwork?\nshould\n&%!Housten\nwe\nproblem.\nhave\nnot\nydpAl,-68/9SPEZL\na\n", textFromImage);
128+
Assertions.assertEquals("does\nthis\nwork?\n123456789-FIdpt\nshould\n&%!Housten\nwe\nhave\na\nproblem.\nnot\n", textFromImage);
129129
}
130130

131131
@Test

pdfocr-onnxtr/src/test/java/com/itextpdf/pdfocr/onnxtr/OnnxDoImageOcrTest.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ public void basicDoImageOcrTest() {
6969

7070
String textFromImage = OnnxTestUtils.getTextFromImage(imageFile, OCR_ENGINE);
7171
Assertions.assertEquals("Does\nthis\nOCR\nthing\nreally\nwork?\nHi\nHow\nabout\na\nbigger\nfont?\n" +
72-
"123456789\n13\nWhat\nabowt\ntiy\nfont?\n", textFromImage);
72+
"123456789\n123\nWhat\nabowt\ntris\nfont?\n", textFromImage);
7373
}
7474

7575
@Test
@@ -149,14 +149,14 @@ public void scannedDoImageOcrTest() {
149149
File imageFile = new File(src);
150150

151151
String textFromImage = OnnxTestUtils.getTextFromImage(imageFile, OCR_ENGINE);
152-
Assertions.assertEquals("\n1Y\nSI\nENSAYARA\nCOMO\nACTUAR?\nTanto\npeor,\nlo\nmejor\nes\ndescansar\n" +
152+
Assertions.assertEquals("-\nAY\nSI\nENSAYARA\nCOMO\nACTUAR?\nTanto\npeor,\nlo\nmejor\nes\ndescansar\n" +
153153
"y\nno\npensar\nla\nfiesta,\nsi\nse\npuede.\nNo\nhay\nnada\nmas\ndesalentador\nver\nen\nlas\n" +
154154
"fiestas\na\njovenes\ncon\ncara\nde\nlastima\ny\niluslonadas\ny\nque\nse\nhan\npasado\ntodo\nel\n" +
155155
"dia\ntratando\nhallar\nlo\nmejor\ny\nla\nmas\natractiva\nmanera\nde\npres\ntarse\nen\npublico.\n" +
156156
"Hay\nque\nactuar\ncon\ncalma\ny\nno\ncansaremos\nde\nrepetirlo,\nLo\nmas\nimportante\nes\nsaber\n" +
157157
"que\nse\nva\na\nponer\ny\ntener\ntodo\na\nmano,\nSi\nintenta\nprobar\nun\nnuevo\nlapiz\nlabial\n" +
158-
"para\nla\na\nsion,\nasegurese\nque\narmonice\ncon\nel\nvestido\nque\nlle\nrà.\nTambién\nel\n" +
159-
"maquillaje\nde\nlos\nojos\ndebe\narmonil\ncon\nel\nconjunto,\n", textFromImage);
158+
"para\nla\na\nsion,\nasegurese\nque\narmonice\ncon\nel\n-\nvestido\nque\nlle\nrà.\nTambién\nel\n" +
159+
"maquillaje\nde\nlos\nojos\ndebe\narmoni\ncon\nel\nconjunto.\n", textFromImage);
160160
}
161161

162162
@Test

pdfocr-onnxtr/src/test/java/com/itextpdf/pdfocr/onnxtr/OnnxTRIntegrationTest.java

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -220,7 +220,7 @@ public void multipageTiffTest() throws IOException, InterruptedException {
220220

221221
extractionStrategy = OnnxTestUtils.extractTextFromLayer(pdfDocument, 7, "Text1");
222222
// Model glitch
223-
Assertions.assertEquals("Multipage\nTIFF\nExample\nPage\n/", extractionStrategy.getResultantText());
223+
Assertions.assertEquals("Multipage\nTIFF\nExample\nPage /", extractionStrategy.getResultantText());
224224

225225
extractionStrategy = OnnxTestUtils.extractTextFromLayer(pdfDocument, 9, "Text1");
226226
Assertions.assertEquals("Multipage\nTIFF\nExample\nPage 9", extractionStrategy.getResultantText());
@@ -239,7 +239,8 @@ public void scannedTest() throws IOException, InterruptedException {
239239
try (PdfDocument pdfDocument = new PdfDocument(new PdfReader(dest))) {
240240
ExtractionStrategy extractionStrategy = OnnxTestUtils.extractTextFromLayer(pdfDocument, 1, "Text1");
241241
Assertions.assertEquals(DeviceCmyk.MAGENTA, extractionStrategy.getFillColor());
242-
Assertions.assertEquals("1Y SI ENSAYARA COMO ACTUAR?\n" +
242+
Assertions.assertEquals("-\n" +
243+
"AY SI ENSAYARA COMO ACTUAR?\n" +
243244
"Tanto peor, lo mejor es descansar y no pensar\n" +
244245
"la fiesta, si se puede. No hay nada mas desalentador\n" +
245246
"ver en las fiestas a jovenes con cara de lastima y\n" +
@@ -250,8 +251,9 @@ public void scannedTest() throws IOException, InterruptedException {
250251
"que se va a poner y tener todo a mano,\n" +
251252
"Si intenta probar un nuevo lapiz labial para la a\n" +
252253
"sion, asegurese que armonice con el vestido que lle\n" +
253-
"rà. También el maquillaje de los ojos debe armonil\n" +
254-
"con el conjunto,", extractionStrategy.getResultantText());
254+
"-\n" +
255+
"rà. También el maquillaje de los ojos debe armoni\n" +
256+
"con el conjunto.", extractionStrategy.getResultantText());
255257
}
256258
}
257259

pdfocr-onnxtr/src/test/java/com/itextpdf/pdfocr/onnxtr/OnnxTRRotationIntegrationTest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,7 @@ public void rotatedColorsMix2Test() throws IOException, InterruptedException {
194194
try (PdfDocument pdfDocument = new PdfDocument(new PdfReader(dest2))) {
195195
ExtractionStrategy extractionStrategy = OnnxTestUtils.extractTextFromLayer(pdfDocument, 1, "Text1");
196196
Assertions.assertEquals(DeviceCmyk.MAGENTA, extractionStrategy.getFillColor());
197-
Assertions.assertEquals("a\ndoes\nthis\nwork?\nshould\nwe\n&%!Housten\nproblem.\nhave\nnot\nydpAl,-68/9SPEZL",
197+
Assertions.assertEquals("does\nthis\nwork?\n123456789-FIdpt\nshould\nwe\n&%!Housten\nproblem.\na\nhave\nnot",
198198
extractionStrategy.getResultantText());
199199
}
200200
}
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
/*
2+
This file is part of the iText (R) project.
3+
Copyright (c) 1998-2025 Apryse Group NV
4+
Authors: Apryse Software.
5+
6+
This program is offered under a commercial and under the AGPL license.
7+
For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below.
8+
9+
AGPL licensing:
10+
This program is free software: you can redistribute it and/or modify
11+
it under the terms of the GNU Affero General Public License as published by
12+
the Free Software Foundation, either version 3 of the License, or
13+
(at your option) any later version.
14+
15+
This program is distributed in the hope that it will be useful,
16+
but WITHOUT ANY WARRANTY; without even the implied warranty of
17+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18+
GNU Affero General Public License for more details.
19+
20+
You should have received a copy of the GNU Affero General Public License
21+
along with this program. If not, see <https://www.gnu.org/licenses/>.
22+
*/
23+
package com.itextpdf.pdfocr.onnxtr.util;
24+
25+
import com.itextpdf.pdfocr.onnxtr.FloatBufferMdArray;
26+
import com.itextpdf.pdfocr.onnxtr.OnnxInputProperties;
27+
import com.itextpdf.test.ExtendedITextTest;
28+
29+
import java.awt.image.BufferedImage;
30+
import java.awt.image.WritableRaster;
31+
import java.util.Arrays;
32+
import java.util.Collection;
33+
import java.util.List;
34+
import org.junit.jupiter.api.Assertions;
35+
import org.junit.jupiter.api.Tag;
36+
import org.junit.jupiter.api.Test;
37+
38+
@Tag("UnitTest")
39+
public class BufferedImageUtilTest extends ExtendedITextTest {
40+
@Test
41+
public void toBchwInputRgbBasicTest() {
42+
final long[] expectedShape = new long[]{2, 3, 1, 2};
43+
final float[] expectedData = new float[]{
44+
0.616073F, -0.211813F,
45+
-2.294872F, 0.148567F,
46+
-0.015602F, 0.549441F,
47+
48+
-0.308642F, 0.199710F,
49+
-0.228507F, 0.978130F,
50+
0.815096F, 0.532574F,
51+
};
52+
final List<BufferedImage> images = Arrays.asList(
53+
newRgbImage(2, 1, new int[]{
54+
0xBF2220, 0x14C4A6,
55+
}),
56+
newRgbImage(2, 1, new int[]{
57+
0x00ABE5, 0x69FBA2,
58+
})
59+
);
60+
final OnnxInputProperties props = new OnnxInputProperties(
61+
new float[]{0.25F, 0.73F, 0.14F},
62+
new float[]{0.81F, 0.26F, 0.93F},
63+
new long[]{4, 3, 1, 2},
64+
false
65+
);
66+
toBchwInputBasicTest(expectedShape, expectedData, images, props);
67+
}
68+
69+
private static void toBchwInputBasicTest(
70+
long[] expectedShape,
71+
float[] expectedData,
72+
Collection<BufferedImage> images,
73+
OnnxInputProperties props
74+
) {
75+
final FloatBufferMdArray result = BufferedImageUtil.toBchwInput(images, props);
76+
Assertions.assertArrayEquals(expectedShape, result.getShape());
77+
final float[] actualData = new float[12];
78+
result.getData().get(actualData);
79+
Assertions.assertArrayEquals(expectedData, actualData, 1E-6F);
80+
}
81+
82+
private static BufferedImage newRgbImage(int width, int height, int[] pixels) {
83+
final BufferedImage img = new BufferedImage(width, height, BufferedImage.TYPE_INT_RGB);
84+
final WritableRaster raster = img.getRaster();
85+
raster.setDataElements(0, 0, width, height, pixels);
86+
return img;
87+
}
88+
}
Binary file not shown.
Binary file not shown.
Binary file not shown.

0 commit comments

Comments
 (0)