Skip to content

Commit 6cb706a

Browse files
author
Grok Compression
committed
compositing: reduce RSS by only allocating strip buffer for composite
1 parent 985d0b0 commit 6cb706a

2 files changed

Lines changed: 80 additions & 8 deletions

File tree

src/lib/core/codestream/decompress/CodeStreamDecompress.cpp

Lines changed: 79 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -255,20 +255,65 @@ bool CodeStreamDecompress::decompress(grk_plugin_tile* tile)
255255
unreducedRegionY1);
256256
uint32_t tileGlobalYBegin = ceildivpow2<uint32_t>(unreducedTileYBegin, reduce);
257257
uint32_t tileGlobalYEnd = ceildivpow2<uint32_t>(unreducedTileYEnd, reduce);
258-
uint32_t yBegin = std::max(tileGlobalYBegin, regionY0) - regionY0;
259-
uint32_t yEnd = std::min(tileGlobalYEnd, regionY1) - regionY0;
258+
// Strip-relative y coordinates: the strip buffer starts at offset 0
259+
// for the current tile row
260+
uint32_t yBegin = 0;
261+
uint32_t yEnd =
262+
std::min(tileGlobalYEnd, regionY1) - std::max(tileGlobalYBegin, regionY0);
260263
if(yEnd <= yBegin)
261264
return;
262-
// buffer this band and drain in order
265+
266+
uint16_t tileX0 = tileIndexBegin % numTileCols;
267+
uint16_t numSlatedCols =
268+
(uint16_t)tilesToDecompress_.getSlatedTileRect().width();
269+
270+
// All compositing, band writing, and strip advancing must be serialized
271+
// to prevent races on the shared strip buffer.
263272
std::lock_guard<std::mutex> lock(bandOrderMutex_);
264-
pendingBands_[tileY] = {yBegin, yEnd};
273+
pendingBands_[tileY] = {yBegin, yEnd, tileX0, numSlatedCols};
265274
while(pendingBands_.count(nextBandTileY_))
266275
{
267276
auto& band = pendingBands_[nextBandTileY_];
277+
278+
// Composite all tiles in this row into the strip buffer
279+
for(uint16_t col = 0; col < band.numCols; col++)
280+
{
281+
uint16_t tileIndex = nextBandTileY_ * numTileCols + (band.tileX0 + col);
282+
auto cacheEntry = tileCache_->get(tileIndex);
283+
if(!cacheEntry || !cacheEntry->processor)
284+
continue;
285+
auto tileImage = cacheEntry->processor->getImage();
286+
if(tileImage)
287+
{
288+
if(!scratchImage_->composite(tileImage))
289+
success_ = false;
290+
}
291+
}
292+
268293
if(!ioBandCallback_(band.yBegin, band.yEnd, scratchImage_.get(), ioBandUserData_))
269294
success_ = false;
270295
pendingBands_.erase(nextBandTileY_);
271-
nextBandTileY_++;
296+
297+
// Advance strip buffer for the next tile row
298+
uint16_t nextTileY = nextBandTileY_ + 1;
299+
uint32_t nextUnreducedY0 =
300+
unreducedTy0 + (uint32_t)nextTileY * unreducedTileHeight;
301+
if(nextUnreducedY0 < unreducedRegionY1)
302+
{
303+
uint32_t nextUnreducedY1 =
304+
std::min(nextUnreducedY0 + unreducedTileHeight, unreducedRegionY1);
305+
for(uint16_t i = 0; i < scratchImage_->numcomps; i++)
306+
{
307+
auto comp = scratchImage_->comps + i;
308+
comp->y0 = ceildivpow2<uint32_t>(
309+
ceildiv<uint32_t>(nextUnreducedY0, comp->dy), reduce);
310+
uint32_t compY1 = ceildivpow2<uint32_t>(
311+
ceildiv<uint32_t>(nextUnreducedY1, comp->dy), reduce);
312+
comp->h = compY1 - comp->y0;
313+
}
314+
}
315+
316+
nextBandTileY_ = nextTileY;
272317
}
273318
},
274319
nullptr, tilesToDecompress_.getSlatedTileRect());
@@ -1078,11 +1123,19 @@ std::function<void()> CodeStreamDecompress::postMultiTile(ITileProcessor* tilePr
10781123
tileProcessor->post_decompressT2T1(scratchImage_.get());
10791124
tileProcessor->setBestEffortDecompressed();
10801125
numTilesDecompressed_++;
1126+
1127+
// Release throttle early: decompression is done, free the slot for other tiles.
1128+
// This prevents deadlock when strip-based band callback blocks tiles from future rows.
1129+
releaseThrottle();
1130+
10811131
auto tileImage = tileProcessor->getImage();
10821132
if(!cp_.codingParams_.dec_.skipAllocateComposite_ && scratchImage_->has_multiple_tiles &&
10831133
tileImage)
10841134
{
1085-
success_ = scratchImage_->composite(tileImage);
1135+
// When using strip-based band callback, skip composite here;
1136+
// it will be done in the row callback after all tiles in the row are complete.
1137+
if(!ioBandCallback_)
1138+
success_ = scratchImage_->composite(tileImage);
10861139
}
10871140
// complete tile
10881141
auto tileIndex = tileProcessor->getIndex();
@@ -1094,8 +1147,6 @@ std::function<void()> CodeStreamDecompress::postMultiTile(ITileProcessor* tilePr
10941147
tileCompletion_->complete(tileIndex);
10951148
else
10961149
tileProcessor->release();
1097-
1098-
releaseThrottle();
10991150
};
11001151
}
11011152

@@ -1661,6 +1712,26 @@ bool CodeStreamDecompress::activateScratch(bool singleTile, GrkImage* scratch)
16611712
if(singleTile || !headerImage_->has_multiple_tiles)
16621713
return true;
16631714

1715+
// When band callback is active, allocate only a strip buffer (one tile-row height)
1716+
// instead of the full composite image. This dramatically reduces peak memory for large images.
1717+
if(ioBandCallback_)
1718+
{
1719+
uint8_t reduce = cp_.codingParams_.dec_.reduce_;
1720+
auto slatedRect = tilesToDecompress_.getSlatedTileRect();
1721+
uint32_t unreducedTileY0 = cp_.ty0_ + (uint32_t)slatedRect.y0 * cp_.t_height_;
1722+
uint32_t unreducedTileY1 =
1723+
std::min(unreducedTileY0 + cp_.t_height_, (uint32_t)scratch->y1);
1724+
for(uint16_t i = 0; i < scratch->numcomps; i++)
1725+
{
1726+
auto comp = scratch->comps + i;
1727+
comp->y0 = ceildivpow2<uint32_t>(ceildiv<uint32_t>(unreducedTileY0, comp->dy), reduce);
1728+
uint32_t compY1 =
1729+
ceildivpow2<uint32_t>(ceildiv<uint32_t>(unreducedTileY1, comp->dy), reduce);
1730+
comp->h = compY1 - comp->y0;
1731+
}
1732+
return scratch->allocCompositeData();
1733+
}
1734+
16641735
return cp_.codingParams_.dec_.skipAllocateComposite_ || scratch->allocCompositeData();
16651736
}
16661737

src/lib/core/codestream/decompress/CodeStreamDecompress.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -416,6 +416,7 @@ class CodeStreamDecompress final : public CodeStream, public IDecompressor
416416
struct PendingBand_
417417
{
418418
uint32_t yBegin, yEnd;
419+
uint16_t tileX0, numCols;
419420
};
420421
std::map<uint16_t, PendingBand_> pendingBands_;
421422

0 commit comments

Comments
 (0)