From af2e0c4d48f9822554c7daa35811febb0ff7ebeb Mon Sep 17 00:00:00 2001
From: Symmetricity <184246+Symmetricity@users.noreply.github.com>
Date: Mon, 25 May 2026 13:57:26 +0200
Subject: [PATCH 01/21] Cache recent sorted node chunks

SortedNodeStore currently keeps only the most recently decoded compressed chunk per worker. Austria profiling showed about 277M node lookups and 90M compressed chunk decodes, with a one-entry cache hit rate around 67%.

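Keep the four most recently used decoded chunks in fixed-size per-thread arrays, evicting the least recently used slot on a miss. The cache stays bounded (about 8 KiB of decoded coordinates per worker) and allocation-free in the lookup path, while raising the measured Austria hit rate from about 67% to about 85% and cutting chunk decodes from about 90M to about 40M.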

Validation: test_sorted_node_store passes; Liechtenstein MBTiles match semantically against the PR-stack baseline; Austria timing improved by about 2-3% wall time and about 4% user CPU with flat heap/RSS in heaptrack and Massif checks.

Co-authored-by: Codex <noreply@openai.com>
---
 src/sorted_node_store.cpp | 66 +++++++++++++++++++++++++++++----------
 1 file changed, 50 insertions(+), 16 deletions(-)
diff --git a/src/sorted_node_store.cpp b/src/sorted_node_store.cpp
index 6ea6e8af..1b9c452e 100644
--- a/src/sorted_node_store.cpp
+++ b/src/sorted_node_store.cpp
@@ -4,6 +4,7 @@
 #include <string>
 #include <map>
 #include <bitset>
+#include <array>
 #include "sorted_node_store.h"
 #include "external/libpopcnt.h"
 #include "external/streamvbyte.h"
@@ -14,15 +15,26 @@ namespace SortedNodeStoreTypes {
 	const uint16_t ChunkSize = 256;
 	const uint16_t ChunkAlignment = 16;
 	const uint32_t ChunkCompressed = 1 << 31;
+	const uint8_t ChunkCacheSize = 4;
+
+	struct CachedChunk {
+		CachedChunk(): id(-1) {}
+
+		int64_t id;
+		std::array<int32_t, ChunkSize> lons;
+		std::array<int32_t, ChunkSize> latps;
+	};
 
 	struct ThreadStorage {
 		ThreadStorage():
 			collectingOrphans(true),
 			groupStart(-1),
 			localNodes(nullptr),
-			cachedChunk(-1),
 			arenaSpace(0),
-			arenaPtr(nullptr) {}
+			arenaPtr(nullptr) {
+				for (uint8_t i = 0; i < ChunkCacheSize; ++i)
+					cacheOrder[i] = i;
+			}
 		// When SortedNodeStore first starts, it's not confident that it has seen an
 		// entire segment, so it's in "collecting orphans" mode. Once it crosses a
 		// threshold of 64K elements, it ceases to be in this mode.
@@ -33,9 +45,8 @@ namespace SortedNodeStoreTypes {
 		uint64_t groupStart = -1;
 		std::vector<NodeStore::element_t>* localNodes = nullptr;
 
-		int64_t cachedChunk = -1;
-		std::vector<int32_t> cacheChunkLons;
-		std::vector<int32_t> cacheChunkLatps;
+		std::array<CachedChunk, ChunkCacheSize> cacheChunks;
+		std::array<uint8_t, ChunkCacheSize> cacheOrder;
 
 		uint32_t arenaSpace = 0;
 		char* arenaPtr = nullptr;
@@ -53,6 +64,13 @@ namespace SortedNodeStoreTypes {
 		auto& rv = threadStorage.back();
 		return rv.second;
 	}
+
+	void promoteCacheSlot(ThreadStorage& tls, size_t orderIndex) {
+		const uint8_t slot = tls.cacheOrder[orderIndex];
+		for (size_t i = orderIndex; i > 0; --i)
+			tls.cacheOrder[i] = tls.cacheOrder[i - 1];
+		tls.cacheOrder[0] = slot;
+	}
 }
 
 using namespace SortedNodeStoreTypes;
@@ -166,25 +184,40 @@ LatpLon SortedNodeStore::at(const NodeID id) const {
 
 		const size_t neededChunk = groupIndex * ChunkSize + chunk;
 
-		// Really naive caching strategy - just cache the last-used chunk.
-		// Probably good enough?
+		// Keep a few recently decoded chunks per worker. Way geometry tends to
+		// revisit adjacent chunks, but not always the immediately preceding one.
 		ThreadStorage& tls = s(this);
-		if (tls.cachedChunk != neededChunk) {
-			tls.cachedChunk = neededChunk;
-			tls.cacheChunkLons.reserve(256);
-			tls.cacheChunkLatps.reserve(256);
+		size_t orderIndex = ChunkCacheSize;
+		for (size_t i = 0; i < ChunkCacheSize; ++i) {
+			if (tls.cacheChunks[tls.cacheOrder[i]].id == neededChunk) {
+				orderIndex = i;
+				break;
+			}
+		}
+
+		size_t cacheSlot;
+		if (orderIndex == ChunkCacheSize) {
+			cacheSlot = tls.cacheOrder[ChunkCacheSize - 1];
+			CachedChunk& cached = tls.cacheChunks[cacheSlot];
+			cached.id = neededChunk;
 
 			uint8_t* latpData = ptr->data;
 			uint8_t* lonData = ptr->data + latpSize;
 			uint32_t recovdata[256] = {0};
 
 			streamvbyte_decode(latpData, recovdata, n);
-			tls.cacheChunkLatps[0] = ptr->firstLatp;
-			zigzag_delta_decode(recovdata, &tls.cacheChunkLatps[1], n, tls.cacheChunkLatps[0]);
+			cached.latps[0] = ptr->firstLatp;
+			zigzag_delta_decode(recovdata, &cached.latps[1], n, cached.latps[0]);
 
 			streamvbyte_decode(lonData, recovdata, n);
-			tls.cacheChunkLons[0] = ptr->firstLon;
-			zigzag_delta_decode(recovdata, &tls.cacheChunkLons[1], n, tls.cacheChunkLons[0]);
+			cached.lons[0] = ptr->firstLon;
+			zigzag_delta_decode(recovdata, &cached.lons[1], n, cached.lons[0]);
+
+			promoteCacheSlot(tls, ChunkCacheSize - 1);
+		} else {
+			cacheSlot = tls.cacheOrder[orderIndex];
+			if (orderIndex != 0)
+				promoteCacheSlot(tls, orderIndex);
 		}
 
 		size_t nodeOffset = 0;
@@ -195,7 +228,8 @@ LatpLon SortedNodeStore::at(const NodeID id) const {
 		if (!(ptr->nodeMask[nodeMaskByte] & (1 << nodeMaskBit)))
 			throw std::out_of_range("SortedNodeStore: node " + std::to_string(id) + " missing, no node");
 
-		return { tls.cacheChunkLatps[nodeOffset], tls.cacheChunkLons[nodeOffset] };
+		const CachedChunk& cached = tls.cacheChunks[cacheSlot];
+		return { cached.latps[nodeOffset], cached.lons[nodeOffset] };
 	}
 
 	UncompressedChunkInfo* ptr = (UncompressedChunkInfo*)basePtr;

From e74e3a96c0011539ea6b132f1322a64350d7be7b Mon Sep 17 00:00:00 2001
From: Symmetricity <184246+Symmetricity@users.noreply.github.com>
Date: Mon, 25 May 2026 14:54:23 +0200
Subject: [PATCH 02/21] Reuse fast clip scratch ring

fast_clip constructs a result ring for each clip edge and for each polygon ring. Heaptrack showed this path producing hundreds of thousands of allocations on the Liechtenstein fixture after the sorted-node cache candidate reduced node decode cost.

Keep one scratch ring for the Sutherland-Hodgman edge passes and reuse it across a polygon's outer and inner rings. This preserves the public fast_clip wrapper and generated tile semantics while removing repeated vector growth in the clipping path.

Liechtenstein output matched semantically against the node-cache baseline. On Austria with --threads 8, wall time was effectively flat to slightly lower and system time decreased. On the small Liechtenstein fixture, heaptrack allocation calls fell by about 758k and the Massif peak total dropped by about 64 MB.

Co-authored-by: Codex <noreply@openai.com>
---
 src/geom.cpp | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/src/geom.cpp b/src/geom.cpp
index 69633372..b19a9c65 100644
--- a/src/geom.cpp
+++ b/src/geom.cpp
@@ -197,10 +197,11 @@ char bit_code(Point const &p, Box const &bbox) {
 }
 
 // Sutherland-Hodgeman polygon clipping algorithm
-void fast_clip(Ring &points, Box const &bbox) {
+void fast_clip(Ring &points, Box const &bbox, Ring &result) {
 	// clip against each side of the clip rectangle
 	for (char edge = 1; edge <= 8; edge *= 2) {
-		Ring result;
+		result.clear();
+		result.reserve(points.size() + 4);
 		Point prev = points[points.size() - 1];
 		bool prevInside = (bit_code(prev, bbox) & edge)==0;
 
@@ -214,20 +215,26 @@ void fast_clip(Ring &points, Box const &bbox) {
 			prev = p;
 			prevInside = inside;
 		}
-		points = std::move(result);
+		points.swap(result);
 		if (points.size()==0) break;
 	}
 }
 
+void fast_clip(Ring &points, Box const &bbox) {
+	Ring result;
+	fast_clip(points, bbox, result);
+}
+
 // Wrappers for polygon/multipolygon
 void fast_clip(Polygon &polygon, Box const &bbox) {
-	fast_clip(polygon.outer(), bbox);
+	Ring result;
+	fast_clip(polygon.outer(), bbox, result);
 	if (polygon.outer().empty()) {
 		polygon.inners().resize(0);
 		return;
 	}
 	for (auto &inner: polygon.inners()) {
-		fast_clip(inner, bbox);
+		fast_clip(inner, bbox, result);
 	}
 	polygon.inners().erase(std::remove_if(
 		polygon.inners().begin(), polygon.inners().end(), 

From c97f16f6d910b0f202d8c98bc1555b1dc95f3764 Mon Sep 17 00:00:00 2001
From: Symmetricity <184246+Symmetricity@users.noreply.github.com>
Date: Mon, 25 May 2026 15:18:15 +0200
Subject: [PATCH 03/21] Move scaled geometry rings

Build scaled polygon rings directly in TileBbox instead of copying through intermediate vector and Ring objects. This keeps the existing scale/backtrack behavior but avoids polygon-by-value iteration and moves freshly scaled rings into the destination geometry.

The Liechtenstein fixture is semantically unchanged, while heaptrack on the same fixture reports about 488k fewer allocation calls. Austria timing is neutral to slightly faster with no stable RSS regression.

Co-authored-by: Codex <noreply@openai.com>
---
 include/coordinates_geom.h |  2 +-
 src/coordinates_geom.cpp   | 27 +++++++++++----------------
 2 files changed, 12 insertions(+), 17 deletions(-)

diff --git a/include/coordinates_geom.h b/include/coordinates_geom.h
index 279de459..7dacbfaf 100644
--- a/include/coordinates_geom.h
+++ b/include/coordinates_geom.h
@@ -23,7 +23,7 @@ class TileBbox {
 	TileBbox(TileCoordinates i, uint z, bool h, bool e);
 
 	std::pair<int,int> scaleLatpLon(double latp, double lon) const;
-	std::vector<Point> scaleRing(Ring const &src) const;
+	Ring scaleRing(Ring const &src) const;
 	MultiPolygon scaleGeometry(MultiPolygon const &src) const;
 	std::pair<double, double> floorLatpLon(double latp, double lon) const;
 
diff --git a/src/coordinates_geom.cpp b/src/coordinates_geom.cpp
index 0a5b2410..020216f8 100644
--- a/src/coordinates_geom.cpp
+++ b/src/coordinates_geom.cpp
@@ -33,10 +33,10 @@ pair<int,int> TileBbox::scaleLatpLon(double latp, double lon) const {
 
 // Scaling with naive self-intersection check - if we've added the new point
 // within the last 5 points, then backtrack to the last time we added it
-std::vector<Point> TileBbox::scaleRing(Ring const &src) const {
-	std::vector<Point> points;
+Ring TileBbox::scaleRing(Ring const &src) const {
+	Ring points;
 	points.reserve(src.size());
-	for(auto &i: src) {
+	for(auto const &i: src) {
 		auto scaled = scaleLatpLon(i.y(), i.x()); // -> .first is x, .second is y
 		bool found = false;
 		for (size_t j=1; j<5; j++) {
@@ -53,30 +53,25 @@ std::vector<Point> TileBbox::scaleRing(Ring const &src) const {
 
 MultiPolygon TileBbox::scaleGeometry(MultiPolygon const &src) const {
 	MultiPolygon dst;
-	for(auto poly: src) {
+	dst.reserve(src.size());
+	for(auto const &poly: src) {
 		Polygon p;
 
 		// Copy the outer ring
-		std::vector<Point> points = scaleRing(poly.outer());
+		Ring points = scaleRing(poly.outer());
 		if (points.size()<4) continue;
-		Ring outer;
-		geom::append(outer,points);
-		geom::append(p,outer);
+		p.outer() = std::move(points);
 
 		// Copy the inner rings
-		int num_rings = 0;
-		for(auto &r: poly.inners()) {
+		p.inners().reserve(poly.inners().size());
+		for(auto const &r: poly.inners()) {
 			points = scaleRing(r);
 			if (points.size()<4) continue;
-			Ring inner;
-			geom::append(inner,points);
-			num_rings++;
-			geom::interior_rings(p).resize(num_rings);
-			geom::append(p, inner, num_rings-1);
+			p.inners().push_back(std::move(points));
 		}
 
 		// Add to multipolygon
-		dst.push_back(p);
+		dst.push_back(std::move(p));
 	}
 	return dst;
 }

From 48dd40eeacee022437257a8dc938b14d0085dadc Mon Sep 17 00:00:00 2001
From: Symmetricity <184246+Symmetricity@users.noreply.github.com>
Date: Mon, 25 May 2026 15:54:20 +0200
Subject: [PATCH 04/21] Reserve Visvalingam output storage

Reserve the custom simplification heap and output containers before appending points, rings, and polygons. Also read triangle-area points by const reference instead of copying them.

The Liechtenstein fixture is semantically unchanged. On the Austria fixture this is timing-neutral to slightly faster, and heaptrack on Liechtenstein reports about 65k fewer allocation calls with no stable heap or RSS regression.

Co-authored-by: Codex <noreply@openai.com>
---
 src/visvalingam.cpp | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/src/visvalingam.cpp b/src/visvalingam.cpp
index 7ae43e8f..1b534b93 100644
--- a/src/visvalingam.cpp
+++ b/src/visvalingam.cpp
@@ -49,6 +49,10 @@ struct visItem {
 struct minHeap {
 	std::vector<visItem *> h;
 
+	void Reserve(size_t size) {
+		h.reserve(size);
+	}
+
 	void Push(visItem *item) {
 		item->index = h.size();
 		h.push_back(item);
@@ -142,9 +146,9 @@ struct minHeap {
 
 template<typename GeometryType>
 static double doubleTriangleArea(GeometryType const &ls, int start, int i1, int i2, int i3) {
-	Point a = ls[i1 + start];
-	Point b = ls[i2 + start];
-	Point c = ls[i3 + start];
+	Point const &a = ls[i1 + start];
+	Point const &b = ls[i2 + start];
+	Point const &c = ls[i3 + start];
 
 	return std::abs((b.x() - a.x()) * (c.y() - a.y()) - (b.y() - a.y()) * (c.x() - a.x()));
 }
@@ -158,6 +162,7 @@ GeometryType visvalingam(const GeometryType &ls, double threshold, size_t retain
 
 	// build the initial minheap linked list.
 	minHeap heap;
+	heap.Reserve(end - start);
 
 	visItem linkedListStart;
 	linkedListStart.area = INFINITY;
@@ -232,6 +237,7 @@ GeometryType visvalingam(const GeometryType &ls, double threshold, size_t retain
 	}
 
 	GeometryType output;
+	output.reserve(end - start - removed);
 	visItem *item = &linkedListStart;
 	while (item != NULL) {
 		output.emplace_back(ls[item->pointIndex + start]);
@@ -250,6 +256,7 @@ Polygon simplifyVis(const Polygon &p, double max_distance) {
 	Polygon output;
 	double threshold = max_distance * max_distance * 4;
 	output.outer() = visvalingam(p.outer(), threshold, 4);
+	output.inners().reserve(p.inners().size());
 	for (const auto &ring : p.inners()) {
 		output.inners().emplace_back(visvalingam(ring, threshold, 4));
 	}
@@ -257,6 +264,7 @@ Polygon simplifyVis(const Polygon &p, double max_distance) {
 }
 MultiPolygon simplifyVis(const MultiPolygon &mp, double max_distance) { 
 	MultiPolygon output;
+	output.reserve(mp.size());
 	for (const auto &p : mp) {
 		output.emplace_back(simplifyVis(p, max_distance));
 	}

From 9f00d9218f28d689f6d13d44977b237ad8c94297 Mon Sep 17 00:00:00 2001
From: Symmetricity <184246+Symmetricity@users.noreply.github.com>
Date: Mon, 25 May 2026 16:15:40 +0200
Subject: [PATCH 05/21] Avoid geometry copies while indexing

Iterate multilinestring and multipolygon members by const reference when calculating tile coverage for addGeometryToIndex. The loops only read the geometry before calling insertIntermediateTiles, so copying each Linestring or Polygon is unnecessary.

The Liechtenstein fixture is semantically unchanged. On the Austria fixture the combined forward and reverse timing is slightly faster, with lower system time and no stable RSS regression.

Co-authored-by: Codex <noreply@openai.com>
---
 src/tile_data.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/tile_data.cpp b/src/tile_data.cpp
index 10c2f11b..d6fc7461 100644
--- a/src/tile_data.cpp
+++ b/src/tile_data.cpp
@@ -477,7 +477,7 @@ void TileDataSource::addGeometryToIndex(
 	const std::vector<OutputObject>& outputs,
 	const uint64_t id
 ) {
-	for (Linestring ls : geom) {
+	for (const auto& ls : geom) {
 		unordered_set<TileCoordinates> tileSet;
 		insertIntermediateTiles(ls, indexZoom, tileSet);
 		for (auto it = tileSet.begin(); it != tileSet.end(); ++it) {
@@ -496,7 +496,7 @@ void TileDataSource::addGeometryToIndex(
 ) {
 	unordered_set<TileCoordinates> tileSet;
 	bool singleOuter = geom.size()==1;
-	for (Polygon poly : geom) {
+	for (const auto& poly : geom) {
 		unordered_set<TileCoordinates> tileSetTmp;
 		insertIntermediateTiles(poly.outer(), indexZoom, tileSetTmp);
 		fillCoveredTiles(tileSetTmp);

From 110bc800223425a7d7b58ab14742ca8484f8993a Mon Sep 17 00:00:00 2001
From: Symmetricity <184246+Symmetricity@users.noreply.github.com>
Date: Mon, 25 May 2026 16:30:37 +0200
Subject: [PATCH 06/21] Use segments for line tile intersections

Avoid constructing a temporary two-point Linestring for each segment clipping check. Both line geometry build paths only need a single segment-vs-box intersection test, so use boost::geometry::model::segment<Point> directly.

The Liechtenstein fixture is semantically unchanged. Heaptrack on Liechtenstein reports about 500k fewer allocation calls and about 500k fewer temporary allocations; Austria timing is effectively neutral with no stable RSS regression.

Co-authored-by: Codex <noreply@openai.com>
---
 src/osm_mem_tiles.cpp | 3 ++-
 src/tile_data.cpp     | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/osm_mem_tiles.cpp b/src/osm_mem_tiles.cpp
index bc038767..ae7bd5c0 100644
--- a/src/osm_mem_tiles.cpp
+++ b/src/osm_mem_tiles.cpp
@@ -62,7 +62,8 @@ Geometry OsmMemTiles::buildWayGeometry(
 		geom::append(current_ls, ls[0]);
 
 		for(size_t i = 1; i < ls.size(); ++i) {
-			if(!geom::intersects(Linestring({ ls[i-1], ls[i] }), bbox.clippingBox)) {
+			boost::geometry::model::segment<Point> segment(ls[i-1], ls[i]);
+			if(!geom::intersects(segment, bbox.clippingBox)) {
 				if(current_ls.size() > 1)
 					out.push_back(std::move(current_ls));
 				current_ls.clear();
diff --git a/src/tile_data.cpp b/src/tile_data.cpp
index d6fc7461..a9087ffc 100644
--- a/src/tile_data.cpp
+++ b/src/tile_data.cpp
@@ -231,7 +231,8 @@ Geometry TileDataSource::buildWayGeometry(OutputGeometryType const geomType,
 			geom::append(current_ls, ls[0]);
 
 			for(size_t i = 1; i < ls.size(); ++i) {
-				if(!geom::intersects(Linestring({ ls[i-1], ls[i] }), bbox.clippingBox)) {
+				boost::geometry::model::segment<Point> segment(ls[i-1], ls[i]);
+				if(!geom::intersects(segment, bbox.clippingBox)) {
 					if(current_ls.size() > 1)
 						out.push_back(std::move(current_ls));
 					current_ls.clear();

From 13053c4cdfd10b48ce0a18ee70e203ec7af10c44 Mon Sep 17 00:00:00 2001
From: Symmetricity <184246+Symmetricity@users.noreply.github.com>
Date: Mon, 25 May 2026 16:59:31 +0200
Subject: [PATCH 07/21] Reserve OSMStore geometry output

llListPolygon and llListLinestring know the way-node range length before fillPoints appends converted coordinates. Reserving the local output containers avoids repeated vector growth without changing conversion, integrity handling, or geometry correction.

Liechtenstein semantic comparison against the submitted-stack baseline reported changed_tiles 0. Austria timing/RSS was effectively neutral after forward and reverse alternating runs; heaptrack on Liechtenstein showed about 205k fewer allocation calls, with Massif peak flat within measurement noise.

Co-authored-by: Codex <noreply@openai.com>
---
 include/osm_store.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/osm_store.h b/include/osm_store.h
index 171e9386..a60830fb 100644
--- a/include/osm_store.h
+++ b/include/osm_store.h
@@ -332,6 +332,7 @@ class OSMStore
 	template<class WayIt>
 	Polygon llListPolygon(WayIt begin, WayIt end) const {
 		Polygon poly;
+		poly.outer().reserve(end - begin);
 		fillPoints(poly.outer(), begin, end);
 		boost::geometry::correct(poly);
 		return poly;
@@ -341,6 +342,7 @@ class OSMStore
 	template<class WayIt>
 	Linestring llListLinestring(WayIt begin, WayIt end) const {
 		Linestring ls;
+		ls.reserve(end - begin);
 		fillPoints(ls, begin, end);
 		return ls;
 	}

From 84b078d936d8b2d7f51f7641a0424dbf14dd518d Mon Sep 17 00:00:00 2001
From: Symmetricity <184246+Symmetricity@users.noreply.github.com>
Date: Mon, 25 May 2026 17:25:15 +0200
Subject: [PATCH 08/21] Reserve way geometry buffers

SortedWayStore::at decodes the full way-node list before building the returned LatpLon vector, and OsmMemTiles::populateLinestring receives that full vector before appending points. Reserve both local output containers to avoid repeated vector growth without changing decoding, node lookup, conversion, or cache behavior.

Liechtenstein semantic comparison against the previous accepted stack reported changed_tiles 0. Austria forward and reverse timing was consistently faster; heaptrack on Liechtenstein showed about 234k fewer allocation calls with temporary allocations and Massif peak effectively flat.

Co-authored-by: Codex <noreply@openai.com>
---
 src/osm_mem_tiles.cpp    | 1 +
 src/sorted_way_store.cpp | 1 +
 2 files changed, 2 insertions(+)

diff --git a/src/osm_mem_tiles.cpp b/src/osm_mem_tiles.cpp
index ae7bd5c0..cd9912f0 100644
--- a/src/osm_mem_tiles.cpp
+++ b/src/osm_mem_tiles.cpp
@@ -85,6 +85,7 @@ Geometry OsmMemTiles::buildWayGeometry(
 
 void OsmMemTiles::populateLinestring(Linestring& ls, NodeID objectID) const {
 	std::vector<LatpLon> nodes = wayStore.at(OSM_ID(objectID));
+	ls.reserve(nodes.size());
 
 	for (const LatpLon& node : nodes) {
 		boost::geometry::range::push_back(ls, boost::geometry::make<Point>(node.lon/10000000.0, node.latp/10000000.0));
diff --git a/src/sorted_way_store.cpp b/src/sorted_way_store.cpp
index 05785dc4..0e47150e 100644
--- a/src/sorted_way_store.cpp
+++ b/src/sorted_way_store.cpp
@@ -194,6 +194,7 @@ std::vector<LatpLon> SortedWayStore::at(WayID id) const {
 
 	std::vector<NodeID> nodes = SortedWayStore::decodeWay(wayPtr->flags, wayPtr->data);
 	std::vector<LatpLon> rv;
+	rv.reserve(nodes.size());
 	for (const NodeID& node : nodes)
 		rv.push_back(nodeStore.at(node));
 	return rv;

From c51fec1468998c5646104c910d18db8502bac797 Mon Sep 17 00:00:00 2001
From: Symmetricity <184246+Symmetricity@users.noreply.github.com>
Date: Mon, 25 May 2026 17:51:59 +0200
Subject: [PATCH 09/21] Pre-size populated multipolygons

TileDataSource::populateMultiPolygon copies a complete mmap-backed multipolygon into a normal MultiPolygon. Size the destination polygon list, each polygon's inner-ring list, and each ring from the source before assigning points, mirroring the explicit per-ring copy already used by storeMultiPolygon and avoiding the incremental container growth of a single generic boost::geometry::assign call.

Liechtenstein semantic comparison against the previous accepted stack reported changed_tiles 0. Austria timing was effectively neutral; heaptrack on Liechtenstein showed about 116k fewer allocation calls with temporary allocations flat and no stable RSS or Massif peak regression.

Co-authored-by: Codex <noreply@openai.com>
---
 src/tile_data.cpp | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/tile_data.cpp b/src/tile_data.cpp
index a9087ffc..472ab3ed 100644
--- a/src/tile_data.cpp
+++ b/src/tile_data.cpp
@@ -594,5 +594,15 @@ NodeID TileDataSource::storeMultiLinestring(const MultiLinestring& src) {
 
 void TileDataSource::populateMultiPolygon(MultiPolygon& dst, NodeID objectID) {
 	const auto &input = retrieveMultiPolygon(objectID);
-	boost::geometry::assign(dst, input);
+	dst.resize(input.size());
+	for(std::size_t i = 0; i < input.size(); ++i) {
+		dst[i].outer().resize(input[i].outer().size());
+		boost::geometry::assign(dst[i].outer(), input[i].outer());
+
+		dst[i].inners().resize(input[i].inners().size());
+		for(std::size_t j = 0; j < input[i].inners().size(); ++j) {
+			dst[i].inners()[j].resize(input[i].inners()[j].size());
+			boost::geometry::assign(dst[i].inners()[j], input[i].inners()[j]);
+		}
+	}
 }

From 4240e68a625dbe32d5c69d9784a4183563ce013e Mon Sep 17 00:00:00 2001
From: Symmetricity <184246+Symmetricity@users.noreply.github.com>
Date: Mon, 25 May 2026 18:18:38 +0200
Subject: [PATCH 10/21] Reserve closed-way polygon construction

Closed-way polygon construction reads the cached linestring without mutating it, so avoid copying that linestring in the Lua Layer path. Reserve polygon outer-ring storage where the source size is known and move temporary polygons into their destination containers.

Liechtenstein semantic comparison against the previous accepted stack reported changed_tiles 0. Austria forward and reverse timing/RSS were slightly positive, heaptrack on Liechtenstein showed about 256k fewer allocation calls, and perf showed lower instructions, cycles, and cache misses.

Co-authored-by: Codex <noreply@openai.com>
---
 src/osm_lua_processing.cpp | 10 +++++++---
 src/osm_mem_tiles.cpp      |  5 ++++-
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/src/osm_lua_processing.cpp b/src/osm_lua_processing.cpp
index bc5271a4..bcd88732 100644
--- a/src/osm_lua_processing.cpp
+++ b/src/osm_lua_processing.cpp
@@ -652,10 +652,12 @@ void OsmLuaProcessing::Layer(const string &layerName, bool area) {
 			}
 			else if (isWay) {
 				//Is there a more efficient way to do this?
-				Linestring ls = linestringCached();
+				const Linestring &ls = linestringCached();
 				Polygon p;
+				p.outer().reserve(ls.size());
 				geom::assign_points(p, ls);
-				mp.push_back(p);
+				mp.reserve(1);
+				mp.push_back(std::move(p));
 
 				auto correctionResult = CorrectGeometry(mp);
 				if(correctionResult == CorrectGeometryResult::Invalid) return;
@@ -872,8 +874,10 @@ Point OsmLuaProcessing::calculateCentroid(CentroidAlgorithm algorithm) {
 				geom::centroid(ls, centroid);
 			}
 		} else {
+			const Linestring &ls = linestringCached();
 			Polygon p;
-			geom::assign_points(p, linestringCached());
+			p.outer().reserve(ls.size());
+			geom::assign_points(p, ls);
 
 			if (algorithm == CentroidAlgorithm::Polylabel) {
 				// CONSIDER: pick precision intelligently
diff --git a/src/osm_mem_tiles.cpp b/src/osm_mem_tiles.cpp
index cd9912f0..654c6618 100644
--- a/src/osm_mem_tiles.cpp
+++ b/src/osm_mem_tiles.cpp
@@ -34,6 +34,7 @@ LatpLon OsmMemTiles::buildNodeGeometry(
 		Linestring& ls = getOrBuildLinestring(objectID);
 		Point centroid;
 		Polygon p;
+		p.outer().reserve(ls.size());
 		geom::assign_points(p, ls);
 		geom::centroid(p, centroid);
 		return LatpLon{(int32_t)(centroid.y()*10000000.0), (int32_t)(centroid.x()*10000000.0)};
@@ -124,8 +125,10 @@ void OsmMemTiles::populateMultiPolygon(MultiPolygon& dst, NodeID objectID) {
 	Linestring ls;
 	populateLinestring(ls, objectID);
 	Polygon p;
+	p.outer().reserve(ls.size());
 	geom::assign_points(p, ls);
-	dst.push_back(p);
+	dst.reserve(dst.size() + 1);
+	dst.push_back(std::move(p));
 }
 
 void OsmMemTiles::Clear() {

From 1e0dc102f8db52fe55be4dc8b3d4ef8f2683786c Mon Sep 17 00:00:00 2001
From: Symmetricity <184246+Symmetricity@users.noreply.github.com>
Date: Mon, 25 May 2026 18:45:01 +0200
Subject: [PATCH 11/21] Reserve split linestring output

Split way geometry builds a temporary linestring by appending source points before moving completed pieces into the output geometry. Reserve the temporary linestring's storage from the known source size and, at each split, after any completed piece has been moved out, reserve again for the number of source points that remain.

Liechtenstein semantic comparison against the previous accepted stack reported changed_tiles 0. Austria timing/RSS was neutral, with the small differences depending on run order, while heaptrack on Liechtenstein showed about 118k fewer allocation calls with no stable heap regression.

Co-authored-by: Codex <noreply@openai.com>
---
 src/osm_mem_tiles.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/osm_mem_tiles.cpp b/src/osm_mem_tiles.cpp
index 654c6618..e27bc703 100644
--- a/src/osm_mem_tiles.cpp
+++ b/src/osm_mem_tiles.cpp
@@ -60,6 +60,7 @@ Geometry OsmMemTiles::buildWayGeometry(
 			return out;
 
 		Linestring current_ls;
+		current_ls.reserve(ls.size());
 		geom::append(current_ls, ls[0]);
 
 		for(size_t i = 1; i < ls.size(); ++i) {
@@ -68,6 +69,7 @@ Geometry OsmMemTiles::buildWayGeometry(
 				if(current_ls.size() > 1)
 					out.push_back(std::move(current_ls));
 				current_ls.clear();
+				current_ls.reserve(ls.size() - i);
 			}
 			geom::append(current_ls, ls[i]);
 		}

From 16c3b3185b764d39a59f95cd863b0da3ff200c7b Mon Sep 17 00:00:00 2001
From: Symmetricity <184246+Symmetricity@users.noreply.github.com>
Date: Mon, 25 May 2026 20:10:19 +0200
Subject: [PATCH 12/21] Avoid copying low zoom object lists

Low zoom tile collection only reads the materialized low zoom object vectors, but the helper accepted those vectors by value. That copied the full vector-of-vectors for each low zoom tile collection, working against the memory and thrash-reduction goal of the low zoom path added for large extracts.

Pass the low zoom object lists by const reference instead. This keeps behavior unchanged while avoiding the unnecessary copy.

Liechtenstein semantic comparison against the previous accepted housekeeping stack reported changed_tiles 0. Austria timing/RSS profiling was wall-time neutral and showed a stable ~80-85 MB RSS reduction across both run orders; perf counters showed no meaningful regression.

Co-authored-by: Codex <noreply@openai.com>
---
 include/tile_data.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/tile_data.h b/include/tile_data.h
index 012cab93..cd85b608 100644
--- a/include/tile_data.h
+++ b/include/tile_data.h
@@ -114,7 +114,7 @@ inline OutputObjectID outputObjectWithId<OutputObjectXYID>(const OutputObjectXYI
 
 template<typename OO> void collectLowZoomObjectsForTile(
 	const unsigned int& indexZoom,
-	typename std::vector<std::vector<OO>> objects,
+	const typename std::vector<std::vector<OO>>& objects,
 	unsigned int zoom,
 	const TileCoordinates& dstIndex,
 	std::vector<OutputObjectID>& output

From f07b42589aca85f46fc0a7eae9b259858d6d58ef Mon Sep 17 00:00:00 2001
From: Symmetricity <184246+Symmetricity@users.noreply.github.com>
Date: Mon, 25 May 2026 21:06:28 +0200
Subject: [PATCH 13/21] Reuse polygon area projection buffer

Area() reprojected every polygon into a fresh DegPoint polygon before asking Boost for spherical area. Reuse a projected DegPoint polygon buffer stored as a member of the OsmLuaProcessing instance and fill it directly instead, preserving the existing lon/latp to lon/lat conversion while avoiding repeated ring allocations.

The candidate matched the accepted stack semantically on Liechtenstein. On warmed Austria runs it was runtime-neutral and reduced peak RSS by roughly 35-90 MiB, while Liechtenstein heaptrack showed about 110k fewer allocation calls.

Co-authored-by: Codex <noreply@openai.com>
---
 include/osm_lua_processing.h |  3 +++
 src/osm_lua_processing.cpp   | 36 +++++++++++++++++++++++++++++-------
 2 files changed, 32 insertions(+), 7 deletions(-)

diff --git a/include/osm_lua_processing.h b/include/osm_lua_processing.h
index 539c745c..b8d536c1 100644
--- a/include/osm_lua_processing.h
+++ b/include/osm_lua_processing.h
@@ -276,6 +276,8 @@ class OsmLuaProcessing {
 	const inline Point getPoint() {
 		return Point(lon/10000000.0,latp/10000000.0);
 	}
+
+	double projectedPolygonArea(const Polygon &p);
 	
 	OSMStore &osmStore;	// global OSM store
 
@@ -310,6 +312,7 @@ class OsmLuaProcessing {
 	bool multiLinestringInited;
 	MultiPolygon multiPolygonCache;
 	bool multiPolygonInited;
+	geom::model::polygon<DegPoint> areaPolygonCache;
 
 	NodeID lastStoredGeometryId;
 	OutputGeometryType lastStoredGeometryType;
diff --git a/src/osm_lua_processing.cpp b/src/osm_lua_processing.cpp
index bcd88732..076505b2 100644
--- a/src/osm_lua_processing.cpp
+++ b/src/osm_lua_processing.cpp
@@ -512,21 +512,43 @@ void reverse_project(DegPoint& p) {
     geom::set<1>(p, latp2lat(geom::get<1>(p)));
 }
 
+template <typename DstRing, typename SrcRing>
+void projectRing(DstRing& dst, const SrcRing& src) {
+	dst.resize(src.size());
+	for (std::size_t i = 0; i < src.size(); ++i) {
+		geom::set<0>(dst[i], geom::get<0>(src[i]));
+		geom::set<1>(dst[i], latp2lat(geom::get<1>(src[i])));
+	}
+}
+
+#if BOOST_VERSION >= 106700
+double OsmLuaProcessing::projectedPolygonArea(const Polygon &p) {
+	areaPolygonCache.inners().resize(p.inners().size());
+	projectRing(areaPolygonCache.outer(), p.outer());
+	for (std::size_t i = 0; i < p.inners().size(); ++i) {
+		projectRing(areaPolygonCache.inners()[i], p.inners()[i]);
+	}
+
+	geom::strategy::area::spherical<> sph_strategy(RadiusMeter);
+	return geom::area(areaPolygonCache, sph_strategy);
+}
+#endif
+
 // Returns area
 double OsmLuaProcessing::Area() {
 	if (!IsClosed()) return 0;
 
 #if BOOST_VERSION >= 106700
-	geom::strategy::area::spherical<> sph_strategy(RadiusMeter);
 	if (isRelation) {
 		// Boost won't calculate area of a multipolygon, so we just total up the member polygons
-		return multiPolygonArea(multiPolygonCached());
+		double totalArea = 0;
+		const MultiPolygon &mp = multiPolygonCached();
+		for (MultiPolygon::const_iterator it = mp.begin(); it != mp.end(); ++it) {
+			totalArea += projectedPolygonArea(*it);
+		}
+		return totalArea;
 	} else if (isWay) {
-		// Reproject back into lat/lon and then run Boo
-		geom::model::polygon<DegPoint> p;
-		geom::assign(p,polygonCached());
-		geom::for_each_point(p, reverse_project);
-		return geom::area(p, sph_strategy);
+		return projectedPolygonArea(polygonCached());
 	}
 #else
 	if (isRelation) {

From b343b56e5680dc96519c71e96f3a364d6b481d1f Mon Sep 17 00:00:00 2001
From: Symmetricity <184246+Symmetricity@users.noreply.github.com>
Date: Tue, 26 May 2026 05:47:01 +0200
Subject: [PATCH 14/21] Reuse scaled polygon output buffer

Polygon output scaled each multipolygon into fresh geometry storage before simplify/correct/write. Add destination-taking scale helpers and reuse a thread-local scaled multipolygon buffer in the writer so ring storage can be retained across objects on the same worker.

The candidate matched the accepted stack semantically on Liechtenstein. On warmed Austria runs it was runtime-neutral with no stable RSS regression, and Liechtenstein heaptrack showed about 65k fewer allocation calls. The 64 MB Massif swing matched the mmap allocator chunk size and flipped with run order.

Co-authored-by: Codex <noreply@openai.com>
---
 include/coordinates_geom.h |  2 ++
 src/coordinates_geom.cpp   | 47 +++++++++++++++++++++++++-------------
 src/tile_worker.cpp        |  4 +++-
 3 files changed, 36 insertions(+), 17 deletions(-)

diff --git a/include/coordinates_geom.h b/include/coordinates_geom.h
index 7dacbfaf..1c035d9c 100644
--- a/include/coordinates_geom.h
+++ b/include/coordinates_geom.h
@@ -23,7 +23,9 @@ class TileBbox {
 	TileBbox(TileCoordinates i, uint z, bool h, bool e);
 
 	std::pair<int,int> scaleLatpLon(double latp, double lon) const;
+	void scaleRing(Ring &dst, Ring const &src) const;
 	Ring scaleRing(Ring const &src) const;
+	void scaleGeometry(MultiPolygon &dst, MultiPolygon const &src) const;
 	MultiPolygon scaleGeometry(MultiPolygon const &src) const;
 	std::pair<double, double> floorLatpLon(double latp, double lon) const;
 
diff --git a/src/coordinates_geom.cpp b/src/coordinates_geom.cpp
index 020216f8..54344299 100644
--- a/src/coordinates_geom.cpp
+++ b/src/coordinates_geom.cpp
@@ -33,8 +33,8 @@ pair<int,int> TileBbox::scaleLatpLon(double latp, double lon) const {
 
 // Scaling with naive self-intersection check - if we've added the new point
 // within the last 5 points, then backtrack to the last time we added it
-Ring TileBbox::scaleRing(Ring const &src) const {
-	Ring points;
+void TileBbox::scaleRing(Ring &points, Ring const &src) const {
+	points.clear();
 	points.reserve(src.size());
 	for(auto const &i: src) {
 		auto scaled = scaleLatpLon(i.y(), i.x()); // -> .first is x, .second is y
@@ -48,31 +48,46 @@ Ring TileBbox::scaleRing(Ring const &src) const {
 		}
 		if (!found) points.push_back(Point(scaled.first,scaled.second));
 	}
+}
+
+Ring TileBbox::scaleRing(Ring const &src) const {
+	Ring points;
+	scaleRing(points, src);
 	return points;
 }
 
-MultiPolygon TileBbox::scaleGeometry(MultiPolygon const &src) const {
-	MultiPolygon dst;
-	dst.reserve(src.size());
+void TileBbox::scaleGeometry(MultiPolygon &dst, MultiPolygon const &src) const {
+	if (dst.size() < src.size())
+		dst.resize(src.size());
+
+	size_t polygonCount = 0;
 	for(auto const &poly: src) {
-		Polygon p;
+		Polygon &p = dst[polygonCount];
 
 		// Copy the outer ring
-		Ring points = scaleRing(poly.outer());
-		if (points.size()<4) continue;
-		p.outer() = std::move(points);
+		scaleRing(p.outer(), poly.outer());
+		if (p.outer().size()<4)
+			continue;
 
 		// Copy the inner rings
-		p.inners().reserve(poly.inners().size());
+		if (p.inners().size() < poly.inners().size())
+			p.inners().resize(poly.inners().size());
+		size_t innerCount = 0;
 		for(auto const &r: poly.inners()) {
-			points = scaleRing(r);
-			if (points.size()<4) continue;
-			p.inners().push_back(std::move(points));
+			Ring &points = p.inners()[innerCount];
+			scaleRing(points, r);
+			if (points.size()>=4)
+				innerCount++;
 		}
-
-		// Add to multipolygon
-		dst.push_back(std::move(p));
+		p.inners().resize(innerCount);
+		polygonCount++;
 	}
+	dst.resize(polygonCount);
+}
+
+MultiPolygon TileBbox::scaleGeometry(MultiPolygon const &src) const {
+	MultiPolygon dst;
+	scaleGeometry(dst, src);
 	return dst;
 }
 
diff --git a/src/tile_worker.cpp b/src/tile_worker.cpp
index 7e894332..cebbfd6b 100644
--- a/src/tile_worker.cpp
+++ b/src/tile_worker.cpp
@@ -10,6 +10,7 @@ using namespace std;
 extern bool verbose;
 
 thread_local bool enabledUserSignal = false;
+thread_local MultiPolygon scaledMultiPolygon;
 typedef std::vector<OutputObjectID>::const_iterator OutputObjectsConstIt;
 typedef std::pair<OutputObjectsConstIt, OutputObjectsConstIt> OutputObjectsConstItPair;
 
@@ -220,7 +221,8 @@ void writeMultiPolygon(
 	unsigned simplifyAlgo,
 	const MultiPolygon& mp
 ) {
-	MultiPolygon current = bbox.scaleGeometry(mp);
+	bbox.scaleGeometry(scaledMultiPolygon, mp);
+	MultiPolygon &current = scaledMultiPolygon;
 	if (simplifyLevel>0) {
 		if (simplifyAlgo == LayerDef::VISVALINGAM) {
 			current = simplifyVis(current, simplifyLevel/bbox.xscale);

From 2fb50f8b6f54267d75e39737b7b782e0c669ad28 Mon Sep 17 00:00:00 2001
From: Symmetricity <184246+Symmetricity@users.noreply.github.com>
Date: Tue, 26 May 2026 06:16:10 +0200
Subject: [PATCH 15/21] Fill way geometry buffers directly

Avoid materializing temporary vectors in the hot way geometry path. OsmMemTiles now fills a reusable thread-local way-node buffer, while WayStore implementations expose fill-into-buffer overloads that preserve the existing return-by-value API for other callers.

SortedWayStore also reuses a per-thread decoded NodeID buffer when expanding encoded ways. This removes repeated short-lived vector allocations without changing the generated tile semantics checked by the Liechtenstein fixture.

Co-authored-by: Codex <noreply@openai.com>
---
 include/sharded_way_store.h |  1 +
 include/sorted_way_store.h  |  2 ++
 include/way_store.h         |  1 +
 include/way_stores.h        |  1 +
 src/osm_mem_tiles.cpp       |  7 ++++---
 src/sharded_way_store.cpp   | 13 +++++++++----
 src/sorted_way_store.cpp    | 22 ++++++++++++++++++----
 src/way_stores.cpp          | 11 ++++++++---
 8 files changed, 44 insertions(+), 14 deletions(-)

diff --git a/include/sharded_way_store.h b/include/sharded_way_store.h
index 40a3d331..d57ebef6 100644
--- a/include/sharded_way_store.h
+++ b/include/sharded_way_store.h
@@ -14,6 +14,7 @@ class ShardedWayStore : public WayStore {
 	void reopen() override;
 	void batchStart() override;
 	std::vector<LatpLon> at(WayID wayid) const override;
+	void at(WayID wayid, std::vector<LatpLon>& output) const override;
 	bool requiresNodes() const override;
 	void insertLatpLons(std::vector<WayStore::ll_element_t> &newWays) override;
 	void insertNodes(const std::vector<std::pair<WayID, std::vector<NodeID>>>& newWays) override;
diff --git a/include/sorted_way_store.h b/include/sorted_way_store.h
index b99ba7de..2df99249 100644
--- a/include/sorted_way_store.h
+++ b/include/sorted_way_store.h
@@ -88,6 +88,7 @@ class SortedWayStore: public WayStore {
 	void reopen() override;
 	void batchStart() override;
 	std::vector<LatpLon> at(WayID wayid) const override;
+	void at(WayID wayid, std::vector<LatpLon>& output) const override;
 	bool requiresNodes() const override { return true; }
 	void insertLatpLons(std::vector<WayStore::ll_element_t> &newWays) override;
 	void insertNodes(const std::vector<std::pair<WayID, std::vector<NodeID>>>& newWays) override;
@@ -107,6 +108,7 @@ class SortedWayStore: public WayStore {
 	);
 
 	static std::vector<NodeID> decodeWay(uint16_t flags, const uint8_t* input);
+	static void decodeWay(uint16_t flags, const uint8_t* input, std::vector<NodeID>& output);
 
 private:
 	bool compressWays;
diff --git a/include/way_store.h b/include/way_store.h
index 36862344..1898ea11 100644
--- a/include/way_store.h
+++ b/include/way_store.h
@@ -15,6 +15,7 @@ class WayStore {
 	// meaningful for SortedWayStore
 	virtual void batchStart() = 0;
 	virtual std::vector<LatpLon> at(WayID wayid) const = 0;
+	virtual void at(WayID wayid, std::vector<LatpLon>& output) const = 0;
 	virtual bool requiresNodes() const = 0;
 	virtual void insertLatpLons(std::vector<ll_element_t>& newWays) = 0;
 	virtual void insertNodes(const std::vector<std::pair<WayID, std::vector<NodeID>>>& newWays) = 0;
diff --git a/include/way_stores.h b/include/way_stores.h
index 0f94e845..0502ddc6 100644
--- a/include/way_stores.h
+++ b/include/way_stores.h
@@ -15,6 +15,7 @@ class BinarySearchWayStore: public WayStore {
 	void reopen() override;
 	void batchStart() override {}
 	std::vector<LatpLon> at(WayID wayid) const override;
+	void at(WayID wayid, std::vector<LatpLon>& output) const override;
 	bool requiresNodes() const override { return false; }
 	void insertLatpLons(std::vector<WayStore::ll_element_t> &newWays) override;
 	void insertNodes(const std::vector<std::pair<WayID, std::vector<NodeID>>>& newWays) override;
diff --git a/src/osm_mem_tiles.cpp b/src/osm_mem_tiles.cpp
index e27bc703..60b88d56 100644
--- a/src/osm_mem_tiles.cpp
+++ b/src/osm_mem_tiles.cpp
@@ -4,6 +4,7 @@
 using namespace std;
 
 thread_local GeometryCache<Linestring> linestringCache;
+thread_local std::vector<LatpLon> wayNodes;
 
 OsmMemTiles::OsmMemTiles(
 	size_t threadNum,
@@ -87,10 +88,10 @@ Geometry OsmMemTiles::buildWayGeometry(
 }
 
 void OsmMemTiles::populateLinestring(Linestring& ls, NodeID objectID) const {
-	std::vector<LatpLon> nodes = wayStore.at(OSM_ID(objectID));
-	ls.reserve(nodes.size());
+	wayStore.at(OSM_ID(objectID), wayNodes);
+	ls.reserve(wayNodes.size());
 
-	for (const LatpLon& node : nodes) {
+	for (const LatpLon& node : wayNodes) {
 		boost::geometry::range::push_back(ls, boost::geometry::make<Point>(node.lon/10000000.0, node.latp/10000000.0));
 	}
 }
diff --git a/src/sharded_way_store.cpp b/src/sharded_way_store.cpp
index d9741082..35ad99a5 100644
--- a/src/sharded_way_store.cpp
+++ b/src/sharded_way_store.cpp
@@ -24,16 +24,22 @@ void ShardedWayStore::batchStart() {
 }
 
 std::vector<LatpLon> ShardedWayStore::at(WayID wayid) const {
+	std::vector<LatpLon> rv;
+	at(wayid, rv);
+	return rv;
+}
+
+void ShardedWayStore::at(WayID wayid, std::vector<LatpLon>& output) const {
 	for (int i = 0; i < shards(); i++) {
 		size_t index = (lastWayShard + i) % shards();
 		if (stores[index]->contains(0, wayid)) {
 			lastWayShard = index;
-			return stores[index]->at(wayid);
+			stores[index]->at(wayid, output);
+			return;
 		}
 	}
 
-	// Superfluous return to silence a compiler warning
-	return stores[shards() - 1]->at(wayid);
+	stores[shards() - 1]->at(wayid, output);
 }
 
 bool ShardedWayStore::requiresNodes() const {
@@ -78,4 +84,3 @@ const WayStore& ShardedWayStore::shard(size_t shard) const {
 }
 
 size_t ShardedWayStore::shards() const { return nodeStore.shards(); }
-
diff --git a/src/sorted_way_store.cpp b/src/sorted_way_store.cpp
index 0e47150e..daba19c3 100644
--- a/src/sorted_way_store.cpp
+++ b/src/sorted_way_store.cpp
@@ -28,6 +28,7 @@ namespace SortedWayStoreTypes {
 		uint64_t groupStart;
 		std::vector<std::pair<WayID, std::vector<NodeID>>>* localWays;
 		std::vector<uint8_t> encodedWay;
+		std::vector<NodeID> decodedWay;
 	};
 
 	thread_local std::deque<std::pair<const SortedWayStore*, ThreadStorage>> threadStorage;
@@ -136,6 +137,12 @@ bool SortedWayStore::contains(size_t shard, WayID id) const {
 }
 
 std::vector<LatpLon> SortedWayStore::at(WayID id) const {
+	std::vector<LatpLon> rv;
+	at(id, rv);
+	return rv;
+}
+
+void SortedWayStore::at(WayID id, std::vector<LatpLon>& rv) const {
 	const size_t groupIndex = id / (GroupSize * ChunkSize);
 	const size_t chunk = (id % (GroupSize * ChunkSize)) / ChunkSize;
 	const uint64_t chunkMaskByte = chunk / 8;
@@ -192,12 +199,13 @@ std::vector<LatpLon> SortedWayStore::at(WayID id) const {
 		wayPtr = (EncodedWay*)(endOfWayOffsetPtr + chunkPtr->wayOffsets[wayOffset] * LargeWayAlignment);
 	}
 
-	std::vector<NodeID> nodes = SortedWayStore::decodeWay(wayPtr->flags, wayPtr->data);
-	std::vector<LatpLon> rv;
+	ThreadStorage& tls = s(this);
+	SortedWayStore::decodeWay(wayPtr->flags, wayPtr->data, tls.decodedWay);
+	const std::vector<NodeID>& nodes = tls.decodedWay;
+	rv.clear();
 	rv.reserve(nodes.size());
 	for (const NodeID& node : nodes)
 		rv.push_back(nodeStore.at(node));
-	return rv;
 }
 
 void SortedWayStore::insertLatpLons(std::vector<WayStore::ll_element_t> &newWays) {
@@ -317,11 +325,18 @@ void SortedWayStore::collectOrphans(const std::vector<std::pair<WayID, std::vect
 
 std::vector<NodeID> SortedWayStore::decodeWay(uint16_t flags, const uint8_t* input) {
 	std::vector<NodeID> rv;
+	decodeWay(flags, input, rv);
+	return rv;
+};
+
+void SortedWayStore::decodeWay(uint16_t flags, const uint8_t* input, std::vector<NodeID>& rv) {
+	rv.clear();
 
 	bool isCompressed = flags & CompressedWay;
 	bool isClosed = flags & ClosedWay;
 
 	const uint16_t length = flags & 0b0000011111111111;
+	rv.reserve(length + (isClosed ? 1 : 0));
 
 	if (!(flags & UniformUpperBits)) {
 		// The nodes don't all share the same upper int; unpack which
@@ -367,7 +382,6 @@ std::vector<NodeID> SortedWayStore::decodeWay(uint16_t flags, const uint8_t* inp
 
 	if (isClosed)
 		rv.push_back(rv[0]);
-	return rv;
 };
 
 uint16_t SortedWayStore::encodeWay(const std::vector<NodeID>& way, std::vector<uint8_t>& output, bool compress) {
diff --git a/src/way_stores.cpp b/src/way_stores.cpp
index 790ad816..86431d2e 100644
--- a/src/way_stores.cpp
+++ b/src/way_stores.cpp
@@ -23,8 +23,14 @@ bool BinarySearchWayStore::contains(size_t shard, WayID id) const {
 }
 
 std::vector<LatpLon> BinarySearchWayStore::at(WayID wayid) const {
+	std::vector<LatpLon> rv;
+	at(wayid, rv);
+	return rv;
+}
+
+void BinarySearchWayStore::at(WayID wayid, std::vector<LatpLon>& rv) const {
 	std::lock_guard<std::mutex> lock(mutex);
-	
+
 	auto iter = std::lower_bound(mLatpLonLists->begin(), mLatpLonLists->end(), wayid, [](auto const &e, auto wayid) { 
 		return e.first < wayid; 
 	});
@@ -32,12 +38,11 @@ std::vector<LatpLon> BinarySearchWayStore::at(WayID wayid) const {
 	if(iter == mLatpLonLists->end() || iter->first != wayid)
 		throw std::out_of_range("Could not find way with id " + std::to_string(wayid));
 
-	std::vector<LatpLon> rv;
+	rv.clear();
 	rv.reserve(iter->second.size());
 	// TODO: copy iter->second to rv more efficiently
 	for (const LatpLon& el : iter->second)
 		rv.push_back(el);
-	return rv;
 }
 
 void BinarySearchWayStore::insertLatpLons(std::vector<WayStore::ll_element_t> &newWays) {

From b3efb5387dddba97c5db76d383385fe07959be31 Mon Sep 17 00:00:00 2001
From: Symmetricity <184246+Symmetricity@users.noreply.github.com>
Date: Tue, 26 May 2026 06:56:28 +0200
Subject: [PATCH 16/21] Skip bounded line clipping

The line geometry path splits ways into bbox-overlapping sections and then always runs the result through Boost intersection against the extended tile box. For sections whose retained points are already inside that extended box, the intersection is an identity operation but still allocates and walks geometry.

Track whether any retained point falls outside the extended box while building the split linestring output. Return the split result directly when no clipping is needed, and keep the existing Boost intersection path for sections that still extend outside the box.

Liechtenstein semantic output matched the previous stack, and profiling showed about 241k fewer heap allocation calls on the heaptrack fixture. Austria runtime was neutral/noisy, so this should be treated as allocation cleanup rather than a wall-time improvement.

Co-authored-by: Codex <noreply@openai.com>
---
 src/osm_mem_tiles.cpp | 15 ++++++++++++++-
 src/tile_data.cpp     | 15 ++++++++++++++-
 2 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/src/osm_mem_tiles.cpp b/src/osm_mem_tiles.cpp
index 60b88d56..0e9a6f07 100644
--- a/src/osm_mem_tiles.cpp
+++ b/src/osm_mem_tiles.cpp
@@ -60,6 +60,14 @@ Geometry OsmMemTiles::buildWayGeometry(
 		if(ls.empty())
 			return out;
 
+		Box extBox = bbox.getExtendBox();
+		const double minX = extBox.min_corner().x(), maxX = extBox.max_corner().x();
+		const double minY = extBox.min_corner().y(), maxY = extBox.max_corner().y();
+		auto pointInsideExtBox = [minX, maxX, minY, maxY](const Point& p) {
+			return p.x() >= minX && p.x() <= maxX && p.y() >= minY && p.y() <= maxY;
+		};
+		bool needsIntersection = !pointInsideExtBox(ls[0]);
+
 		Linestring current_ls;
 		current_ls.reserve(ls.size());
 		geom::append(current_ls, ls[0]);
@@ -73,13 +81,18 @@ Geometry OsmMemTiles::buildWayGeometry(
 				current_ls.reserve(ls.size() - i);
 			}
 			geom::append(current_ls, ls[i]);
+			if (!needsIntersection)
+				needsIntersection = !pointInsideExtBox(ls[i]);
 		}
 
 		if(current_ls.size() > 1)
 			out.push_back(std::move(current_ls));
 
+		if (!needsIntersection)
+			return out;
+
 		MultiLinestring result;
-		geom::intersection(out, bbox.getExtendBox(), result);
+		geom::intersection(out, extBox, result);
 		return result;
 
 	}
diff --git a/src/tile_data.cpp b/src/tile_data.cpp
index 472ab3ed..66099de4 100644
--- a/src/tile_data.cpp
+++ b/src/tile_data.cpp
@@ -227,6 +227,14 @@ Geometry TileDataSource::buildWayGeometry(OutputGeometryType const geomType,
 			if(ls.empty())
 				return out;
 
+			Box extBox = bbox.getExtendBox();
+			const double minX = extBox.min_corner().x(), maxX = extBox.max_corner().x();
+			const double minY = extBox.min_corner().y(), maxY = extBox.max_corner().y();
+			auto pointInsideExtBox = [minX, maxX, minY, maxY](const Point& p) {
+				return p.x() >= minX && p.x() <= maxX && p.y() >= minY && p.y() <= maxY;
+			};
+			bool needsIntersection = !pointInsideExtBox(ls[0]);
+
 			Linestring current_ls;
 			geom::append(current_ls, ls[0]);
 
@@ -238,13 +246,18 @@ Geometry TileDataSource::buildWayGeometry(OutputGeometryType const geomType,
 					current_ls.clear();
 				}
 				geom::append(current_ls, ls[i]);
+				if (!needsIntersection)
+					needsIntersection = !pointInsideExtBox(ls[i]);
 			}
 
 			if(current_ls.size() > 1)
 				out.push_back(std::move(current_ls));
 
+			if (!needsIntersection)
+				return out;
+
 			MultiLinestring result;
-			geom::intersection(out, bbox.getExtendBox(), result);
+			geom::intersection(out, extBox, result);
 			return result;
 		}
 

From 8aa0b214110370eb985e79cb0f812162de1ad5f5 Mon Sep 17 00:00:00 2001
From: Symmetricity <184246+Symmetricity@users.noreply.github.com>
Date: Tue, 26 May 2026 07:18:31 +0200
Subject: [PATCH 17/21] Move corrected ring results

The geometry correction helper returns a single corrected ring in the common path where no self-intersections are found. Returning it through an initializer list copies the ring into the result vector, which creates avoidable point-vector allocations in the Layer() geometry correction path.

Build the one-element result vector explicitly and move the corrected ring into it. This keeps the same behavior while avoiding the ring copy.

Liechtenstein semantic output matched the previous housekeeping stack. Profiling showed about 41k fewer allocation calls on the heaptrack fixture. Austria runtime was neutral to slightly slower, so this is an allocation cleanup rather than a speed improvement.

Co-authored-by: Codex <noreply@openai.com>
---
 include/geometry/correct.hpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/include/geometry/correct.hpp b/include/geometry/correct.hpp
index 7adb5d34..29ac232f 100644
--- a/include/geometry/correct.hpp
+++ b/include/geometry/correct.hpp
@@ -281,9 +281,11 @@ static inline std::vector<ring_t> correct(ring_t const &ring, boost::geometry::o
 	dissolve_find_intersections(new_ring, pseudo_vertices, start_keys);
 
 	if(start_keys.empty()) {
-		if(std::abs(boost::geometry::area(new_ring)) > remove_spike_min_area) 
-			return { new_ring };
-		else
+		if(std::abs(boost::geometry::area(new_ring)) > remove_spike_min_area) {
+			std::vector<ring_t> result;
+			result.push_back(std::move(new_ring));
+			return result;
+		} else
 			return { };
 	}
 

From f76472abc985cd4aeca3c438040b095f228c2179 Mon Sep 17 00:00:00 2001
From: Symmetricity <184246+Symmetricity@users.noreply.github.com>
Date: Tue, 26 May 2026 20:54:13 +0200
Subject: [PATCH 18/21] Forward combined geometry results

The geometry correction helper result_combine() receives polygons through a forwarding reference (a deduced T&&) and every current caller passes std::move(...). Because the named parameter is an lvalue inside result_combine(), pushing it into the result vector copied the polygon instead of moving it.

Forward the parameter into the result vector so rvalue callers keep move semantics. This preserves behavior while eliminating unnecessary geometry copies in the correction path.

Liechtenstein semantic output matched the previous housekeeping stack. Profiling showed about 24k fewer allocation calls on the heaptrack fixture and small Austria RSS/runtime improvements in both forward and reverse order.

Co-authored-by: Codex <noreply@openai.com>
---
 include/geometry/correct.hpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/geometry/correct.hpp b/include/geometry/correct.hpp
index 29ac232f..538e2825 100644
--- a/include/geometry/correct.hpp
+++ b/include/geometry/correct.hpp
@@ -10,6 +10,7 @@
  * ----------------------------------------------------------------------------
  */
 
+#include <utility>
 #include <vector>
 #include <boost/geometry.hpp>
 #include <boost/geometry/geometries/point_xy.hpp>
@@ -25,7 +26,7 @@ namespace impl {
 template<typename C, typename T>
 static inline void result_combine(C &result, T &&new_element)
 {
-    result.push_back(new_element);
+    result.push_back(std::forward<T>(new_element));
 
    	for(std::size_t i = 0; i < result.size() - 1; ) {
         if(!boost::geometry::intersects(result[i], result.back())) {

From db92420896a8f3a7fc2508429e4e5015d3210ef9 Mon Sep 17 00:00:00 2001
From: Symmetricity <184246+Symmetricity@users.noreply.github.com>
Date: Tue, 26 May 2026 21:24:39 +0200
Subject: [PATCH 19/21] Reuse dissolve intersection output buffer

dissolve_find_intersections() currently constructs a fresh vector for each segment pair tested by the rtree callback. Heap profiling showed this path still contributed a large temporary-allocation bucket during tile generation.

Keep one output vector for the duration of the function, reserve the common two-point segment intersection capacity, and clear it before each Boost intersection call. This preserves the existing callback behavior while avoiding repeated vector construction and allocation churn.

Co-authored-by: Codex <noreply@openai.com>
---
 include/geometry/correct.hpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/geometry/correct.hpp b/include/geometry/correct.hpp
index 538e2825..fba96c69 100644
--- a/include/geometry/correct.hpp
+++ b/include/geometry/correct.hpp
@@ -97,6 +97,8 @@ static inline void dissolve_find_intersections(
 	if(ring.empty()) return;
 
 	boost::geometry::index::rtree<std::pair< boost::geometry::model::segment<point_t>, std::size_t >, boost::geometry::index::quadratic<16>> index;
+	std::vector<point_t> output;
+	output.reserve(2);
 
 	// Generate all by-pass intersections in the graph
 	// Generate a list of all by-pass intersections
@@ -113,7 +115,7 @@ static inline void dissolve_find_intersections(
 			auto const &line_2 = iter.first;
 			auto j = iter.second;
 			
-			std::vector<point_t> output;
+			output.clear();
 			boost::geometry::intersection(line_1, line_2, output);
 
 			for(auto const &p: output) {

From ed0b09dff2490859077bfb59ec04b330fc92c5e1 Mon Sep 17 00:00:00 2001
From: Symmetricity <184246+Symmetricity@users.noreply.github.com>
Date: Wed, 27 May 2026 06:10:50 +0200
Subject: [PATCH 20/21] Move uncached multipolygons into clipping

Freshly populated multipolygons are only used to build the mutable clipping buffer, so move them into that buffer instead of copying them first. Cached clip entries still use the existing copy path because fast_clip mutates its input and cache entries must remain reusable.

If an uncached fast-clip result needs the Boost intersection fallback, re-populate the original multipolygon before intersecting so the fallback keeps the existing source-geometry behavior.

Semantic comparison against the accepted stack on the Liechtenstein fixture produced no changed tiles. Profiling showed a modest allocation-call reduction; wall time and RSS were effectively neutral, and the 64 MB Massif movement is treated as mmap-backed allocator noise rather than native RSS evidence.

Co-authored-by: Codex <noreply@openai.com>
---
 src/tile_data.cpp | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/src/tile_data.cpp b/src/tile_data.cpp
index 66099de4..29004c93 100644
--- a/src/tile_data.cpp
+++ b/src/tile_data.cpp
@@ -340,7 +340,10 @@ Geometry TileDataSource::buildWayGeometry(OutputGeometryType const geomType,
 			}
 
 			MultiPolygon mp;
-			geom::assign(mp, input);
+			if (cachedClip == nullptr)
+				mp = std::move(uncached);
+			else
+				geom::assign(mp, input);
 			fast_clip(mp, box);
 			geom::correct(mp);
 			geom::validity_failure_type failure = geom::validity_failure_type::no_failure;
@@ -353,7 +356,13 @@ Geometry TileDataSource::buildWayGeometry(OutputGeometryType const geomType,
 				}
 				if (!valid && (failure==geom::failure_self_intersections || failure==geom::failure_intersecting_interiors)) {
 					MultiPolygon output;
-					geom::intersection(input, box, output);
+					if (cachedClip == nullptr) {
+						MultiPolygon original;
+						populateMultiPolygon(original, objectID);
+						geom::intersection(original, box, output);
+					} else {
+						geom::intersection(input, box, output);
+					}
 					geom::correct(output);
 
 					// retry with Boost intersection if fast_clip has caused self-intersections

From ba8db48b733df28f398ffeeffb4323227272dcca Mon Sep 17 00:00:00 2001
From: Symmetricity <184246+Symmetricity@users.noreply.github.com>
Date: Wed, 27 May 2026 14:45:37 +0200
Subject: [PATCH 21/21] Skip redundant fast clip edge passes

fast_clip clips each ring against all four box edges even when no point is outside a given edge. Checking the existing bit code before running an edge pass avoids copying the ring through scratch output for no-op sides while preserving the existing clipping, validity, and fallback behavior.

Liechtenstein semantic comparison against the accepted housekeeping stack produced no changed tiles. Profiling showed fewer allocation calls and favorable CPU counters, with wall time and native RSS effectively neutral on the Austria fixture.

Co-authored-by: Codex <noreply@openai.com>
---
 src/geom.cpp | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/geom.cpp b/src/geom.cpp
index b19a9c65..ceb2a3b7 100644
--- a/src/geom.cpp
+++ b/src/geom.cpp
@@ -200,6 +200,15 @@ char bit_code(Point const &p, Box const &bbox) {
 void fast_clip(Ring &points, Box const &bbox, Ring &result) {
 	// clip against each side of the clip rectangle
 	for (char edge = 1; edge <= 8; edge *= 2) {
+		bool needsClip = false;
+		for (auto const &p: points) {
+			if (bit_code(p, bbox) & edge) {
+				needsClip = true;
+				break;
+			}
+		}
+		if (!needsClip) continue;
+
 		result.clear();
 		result.reserve(points.size() + 4);
 		Point prev = points[points.size() - 1];