@@ -255,20 +255,65 @@ bool CodeStreamDecompress::decompress(grk_plugin_tile* tile)
255255 unreducedRegionY1);
256256 uint32_t tileGlobalYBegin = ceildivpow2<uint32_t >(unreducedTileYBegin, reduce);
257257 uint32_t tileGlobalYEnd = ceildivpow2<uint32_t >(unreducedTileYEnd, reduce);
258- uint32_t yBegin = std::max (tileGlobalYBegin, regionY0) - regionY0;
259- uint32_t yEnd = std::min (tileGlobalYEnd, regionY1) - regionY0;
258+ // Strip-relative y coordinates: the strip buffer starts at offset 0
259+ // for the current tile row
260+ uint32_t yBegin = 0 ;
261+ uint32_t yEnd =
262+ std::min (tileGlobalYEnd, regionY1) - std::max (tileGlobalYBegin, regionY0);
260263 if (yEnd <= yBegin)
261264 return ;
262- // buffer this band and drain in order
265+
266+ uint16_t tileX0 = tileIndexBegin % numTileCols;
267+ uint16_t numSlatedCols =
268+ (uint16_t )tilesToDecompress_.getSlatedTileRect ().width ();
269+
270+ // All compositing, band writing, and strip advancing must be serialized
271+ // to prevent races on the shared strip buffer.
263272 std::lock_guard<std::mutex> lock (bandOrderMutex_);
264- pendingBands_[tileY] = {yBegin, yEnd};
273+ pendingBands_[tileY] = {yBegin, yEnd, tileX0, numSlatedCols };
265274 while (pendingBands_.count (nextBandTileY_))
266275 {
267276 auto & band = pendingBands_[nextBandTileY_];
277+
278+ // Composite all tiles in this row into the strip buffer
279+ for (uint16_t col = 0 ; col < band.numCols ; col++)
280+ {
281+ uint16_t tileIndex = nextBandTileY_ * numTileCols + (band.tileX0 + col);
282+ auto cacheEntry = tileCache_->get (tileIndex);
283+ if (!cacheEntry || !cacheEntry->processor )
284+ continue ;
285+ auto tileImage = cacheEntry->processor ->getImage ();
286+ if (tileImage)
287+ {
288+ if (!scratchImage_->composite (tileImage))
289+ success_ = false ;
290+ }
291+ }
292+
268293 if (!ioBandCallback_ (band.yBegin , band.yEnd , scratchImage_.get (), ioBandUserData_))
269294 success_ = false ;
270295 pendingBands_.erase (nextBandTileY_);
271- nextBandTileY_++;
296+
297+ // Advance strip buffer for the next tile row
298+ uint16_t nextTileY = nextBandTileY_ + 1 ;
299+ uint32_t nextUnreducedY0 =
300+ unreducedTy0 + (uint32_t )nextTileY * unreducedTileHeight;
301+ if (nextUnreducedY0 < unreducedRegionY1)
302+ {
303+ uint32_t nextUnreducedY1 =
304+ std::min (nextUnreducedY0 + unreducedTileHeight, unreducedRegionY1);
305+ for (uint16_t i = 0 ; i < scratchImage_->numcomps ; i++)
306+ {
307+ auto comp = scratchImage_->comps + i;
308+ comp->y0 = ceildivpow2<uint32_t >(
309+ ceildiv<uint32_t >(nextUnreducedY0, comp->dy ), reduce);
310+ uint32_t compY1 = ceildivpow2<uint32_t >(
311+ ceildiv<uint32_t >(nextUnreducedY1, comp->dy ), reduce);
312+ comp->h = compY1 - comp->y0 ;
313+ }
314+ }
315+
316+ nextBandTileY_ = nextTileY;
272317 }
273318 },
274319 nullptr , tilesToDecompress_.getSlatedTileRect ());
@@ -1078,11 +1123,19 @@ std::function<void()> CodeStreamDecompress::postMultiTile(ITileProcessor* tilePr
10781123 tileProcessor->post_decompressT2T1 (scratchImage_.get ());
10791124 tileProcessor->setBestEffortDecompressed ();
10801125 numTilesDecompressed_++;
1126+
1127+ // Release throttle early: decompression is done, free the slot for other tiles.
1128+ // This prevents deadlock when strip-based band callback blocks tiles from future rows.
1129+ releaseThrottle ();
1130+
10811131 auto tileImage = tileProcessor->getImage ();
10821132 if (!cp_.codingParams_ .dec_ .skipAllocateComposite_ && scratchImage_->has_multiple_tiles &&
10831133 tileImage)
10841134 {
1085- success_ = scratchImage_->composite (tileImage);
1135+ // When using strip-based band callback, skip composite here;
1136+ // it will be done in the row callback after all tiles in the row are complete.
1137+ if (!ioBandCallback_)
1138+ success_ = scratchImage_->composite (tileImage);
10861139 }
10871140 // complete tile
10881141 auto tileIndex = tileProcessor->getIndex ();
@@ -1094,8 +1147,6 @@ std::function<void()> CodeStreamDecompress::postMultiTile(ITileProcessor* tilePr
10941147 tileCompletion_->complete (tileIndex);
10951148 else
10961149 tileProcessor->release ();
1097-
1098- releaseThrottle ();
10991150 };
11001151}
11011152
@@ -1661,6 +1712,26 @@ bool CodeStreamDecompress::activateScratch(bool singleTile, GrkImage* scratch)
16611712 if (singleTile || !headerImage_->has_multiple_tiles )
16621713 return true ;
16631714
1715+ // When band callback is active, allocate only a strip buffer (one tile-row height)
1716+ // instead of the full composite image. This dramatically reduces peak memory for large images.
1717+ if (ioBandCallback_)
1718+ {
1719+ uint8_t reduce = cp_.codingParams_ .dec_ .reduce_ ;
1720+ auto slatedRect = tilesToDecompress_.getSlatedTileRect ();
1721+ uint32_t unreducedTileY0 = cp_.ty0_ + (uint32_t )slatedRect.y0 * cp_.t_height_ ;
1722+ uint32_t unreducedTileY1 =
1723+ std::min (unreducedTileY0 + cp_.t_height_ , (uint32_t )scratch->y1 );
1724+ for (uint16_t i = 0 ; i < scratch->numcomps ; i++)
1725+ {
1726+ auto comp = scratch->comps + i;
1727+ comp->y0 = ceildivpow2<uint32_t >(ceildiv<uint32_t >(unreducedTileY0, comp->dy ), reduce);
1728+ uint32_t compY1 =
1729+ ceildivpow2<uint32_t >(ceildiv<uint32_t >(unreducedTileY1, comp->dy ), reduce);
1730+ comp->h = compY1 - comp->y0 ;
1731+ }
1732+ return scratch->allocCompositeData ();
1733+ }
1734+
16641735 return cp_.codingParams_ .dec_ .skipAllocateComposite_ || scratch->allocCompositeData ();
16651736}
16661737
0 commit comments