
Commit bff9e72

fix: harden input validation, XML escaping, and streaming safety
- Add XML escaping to all user-controlled values in xml_responses.py
- Add gzip decompression size limit to prevent decompression bombs
- Harden chunked decoder: buffer limits, chunk size validation, error on truncation
- Cache KEK in Settings via PrivateAttr to avoid per-request SHA256
- Wrap range header parsing in try/except for malformed input
- Safe int() parsing for content-length headers across 3 locations
- Add usedforsecurity=False to all MD5 calls for FIPS compliance
- Remove dead code and duplicate imports in upload_part.py
- Fix over-indentation in upload_part SHA256 mismatch block
1 parent 52e1488 commit bff9e72

11 files changed

Lines changed: 127 additions & 69 deletions
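
The xml_responses.py diff is not included in the excerpt below, so for the first bullet above here is only an illustrative sketch of escaping user-controlled values with the standard library. The complete_multipart signature is taken from the call site in lifecycle.py further down; the body and the exact escaping calls are assumptions, not the commit's code:

from xml.sax.saxutils import escape


def complete_multipart(location: str, bucket: str, key: str, etag: str) -> str:
    # Hypothetical shape of an xml_responses helper; the real implementation may differ.
    # escape() neutralizes &, <, > in user-controlled values (bucket/key names)
    # before they are interpolated into the XML body.
    return (
        '<?xml version="1.0" encoding="UTF-8"?>'
        "<CompleteMultipartUploadResult>"
        f"<Location>{escape(location)}</Location>"
        f"<Bucket>{escape(bucket)}</Bucket>"
        f"<Key>{escape(key)}</Key>"
        f"<ETag>{escape(etag)}</ETag>"
        "</CompleteMultipartUploadResult>"
    )

For values placed in attribute positions, xml.sax.saxutils.quoteattr is the matching helper.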


s3proxy/config.py

Lines changed: 7 additions & 7 deletions
@@ -2,7 +2,7 @@
 
 import hashlib
 
-from pydantic import Field, field_validator
+from pydantic import Field, PrivateAttr
 from pydantic_settings import BaseSettings, SettingsConfigDict
 
 
@@ -49,16 +49,16 @@ class Settings(BaseSettings):
     # Logging
     log_level: str = Field(default="INFO", description="Log level (DEBUG, INFO, WARNING, ERROR)")
 
-    @field_validator("encrypt_key")
-    @classmethod
-    def hash_encrypt_key(cls, v: str) -> str:
-        """Store the raw key - we'll hash it when needed."""
-        return v
+    # Cached KEK derived from encrypt_key (computed once in model_post_init)
+    _kek: bytes = PrivateAttr()
+
+    def model_post_init(self, __context: object) -> None:
+        self._kek = hashlib.sha256(self.encrypt_key.encode()).digest()
 
     @property
     def kek(self) -> bytes:
         """Get the 32-byte Key Encryption Key (SHA256 of encrypt_key)."""
-        return hashlib.sha256(self.encrypt_key.encode()).digest()
+        return self._kek
 
     @property
     def s3_endpoint(self) -> str:
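
A minimal standalone sketch of the PrivateAttr + model_post_init caching pattern used above; DemoSettings and its default value are hypothetical stand-ins, not the project's Settings class:

import hashlib

from pydantic import PrivateAttr
from pydantic_settings import BaseSettings


class DemoSettings(BaseSettings):
    encrypt_key: str = "example-key"  # hypothetical default, for the demo only

    # Private attrs are not validated fields, so they are assigned after init
    _kek: bytes = PrivateAttr()

    def model_post_init(self, __context: object) -> None:
        # Hash once at construction instead of on every property access
        self._kek = hashlib.sha256(self.encrypt_key.encode()).digest()

    @property
    def kek(self) -> bytes:
        return self._kek


settings = DemoSettings()
assert settings.kek == hashlib.sha256(settings.encrypt_key.encode()).digest()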

s3proxy/handlers/base.py

Lines changed: 16 additions & 10 deletions
@@ -98,15 +98,18 @@ def _parse_range(self, header: str, size: int) -> tuple[int, int]:
         if not header.startswith("bytes="):
             raise S3Error.invalid_range("Invalid range header format")
         spec = header[6:]
-        if spec.startswith("-"):
-            start = max(0, size - int(spec[1:]))
-            end = size - 1
-        elif spec.endswith("-"):
-            start = int(spec[:-1])
-            end = size - 1
-        else:
-            parts = spec.split("-")
-            start, end = int(parts[0]), min(int(parts[1]), size - 1)
+        try:
+            if spec.startswith("-"):
+                start = max(0, size - int(spec[1:]))
+                end = size - 1
+            elif spec.endswith("-"):
+                start = int(spec[:-1])
+                end = size - 1
+            else:
+                parts = spec.split("-")
+                start, end = int(parts[0]), min(int(parts[1]), size - 1)
+        except (ValueError, IndexError):
+            raise S3Error.invalid_range("Invalid range header format")
         if start > end or start >= size:
             raise S3Error.invalid_range("Range not satisfiable")
         return start, end
@@ -117,7 +120,10 @@ def _parse_copy_source_range(
         if not range_header:
             return 0, total_size - 1
         range_str = range_header.replace("bytes=", "")
-        start, end = map(int, range_str.split("-"))
+        try:
+            start, end = map(int, range_str.split("-"))
+        except (ValueError, TypeError):
+            raise S3Error.invalid_range("Invalid copy source range format")
         return start, end
 
     def _get_effective_etag(self, metadata: dict, fallback_etag: str) -> str:
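
A quick self-contained check of the failure modes the new try/except absorbs: each of these Range specs previously escaped _parse_range as a bare ValueError or IndexError and now surfaces as S3Error.invalid_range (the header list is illustrative, not exhaustive):

for header in ["bytes=abc-def", "bytes=5-x", "bytes=-", "bytes=12--", "bytes=5"]:
    spec = header[6:]
    try:
        if spec.startswith("-"):
            int(spec[1:])
        elif spec.endswith("-"):
            int(spec[:-1])
        else:
            parts = spec.split("-")
            int(parts[0]), int(parts[1])  # "bytes=5" has no dash -> IndexError
    except (ValueError, IndexError) as exc:
        print(f"{header!r} -> {type(exc).__name__}")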

s3proxy/handlers/multipart/copy.py

Lines changed: 1 addition & 1 deletion
@@ -50,7 +50,7 @@ async def handle_upload_part_copy(self, request: Request, creds: S3Credentials)
         ciphertext = crypto.encrypt_part(plaintext, state.dek, upload_id, part_num)
         resp = await client.upload_part(bucket, key, upload_id, part_num, ciphertext)
 
-        body_md5 = hashlib.md5(plaintext).hexdigest()
+        body_md5 = hashlib.md5(plaintext, usedforsecurity=False).hexdigest()
         await self.multipart_manager.add_part(
             bucket, key, upload_id,
             PartMetadata(
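
Context for the usedforsecurity=False flag applied throughout this commit: on Python 3.9+ it marks a digest as non-cryptographic, so hashlib.md5() stays usable on FIPS-restricted OpenSSL builds that would otherwise reject it; here MD5 only fingerprints part bodies for ETag-style values. A minimal illustration:

import hashlib

# Non-cryptographic use: a content fingerprint for an ETag-style value.
# Without the flag, FIPS-mode builds may refuse to construct an MD5 object.
etag = hashlib.md5(b"example part body", usedforsecurity=False).hexdigest()
print(etag)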

s3proxy/handlers/multipart/lifecycle.py

Lines changed: 1 addition & 1 deletion
@@ -184,7 +184,7 @@ async def handle_complete_multipart_upload(
         )
 
         location = f"{self.settings.s3_endpoint}/{bucket}/{key}"
-        etag = hashlib.md5(str(state.total_plaintext_size).encode()).hexdigest()
+        etag = hashlib.md5(str(state.total_plaintext_size).encode(), usedforsecurity=False).hexdigest()
 
         return Response(
             content=xml_responses.complete_multipart(location, bucket, key, etag),

s3proxy/handlers/multipart/upload_part.py

Lines changed: 15 additions & 17 deletions
@@ -7,7 +7,7 @@
 import time
 from collections import deque
 from collections.abc import AsyncIterator
-from typing import TYPE_CHECKING, NoReturn
+from typing import NoReturn
 
 import structlog
 from botocore.exceptions import ClientError
@@ -23,12 +23,9 @@
     PartMetadata,
     StateMissingError,
 )
-from ...streaming import decode_aws_chunked, decode_aws_chunked_stream
+from ...streaming import decode_aws_chunked_stream
 from ..base import BaseHandler
 
-if TYPE_CHECKING:
-    from collections.abc import AsyncIterator
-
 logger: BoundLogger = structlog.get_logger(__name__)
 
 # Limit concurrent internal part uploads to bound memory usage
@@ -49,7 +46,10 @@ async def handle_upload_part(self, request: Request, creds: S3Credentials) -> Re
         # Parse request info
         content_encoding = request.headers.get("content-encoding", "")
         content_sha = request.headers.get("x-amz-content-sha256", "")
-        content_length = int(request.headers.get("content-length", "0"))
+        try:
+            content_length = int(request.headers.get("content-length", "0"))
+        except ValueError:
+            content_length = 0
 
         upload_start_time = time.monotonic()
         logger.info(
@@ -94,14 +94,14 @@ async def handle_upload_part(self, request: Request, creds: S3Credentials) -> Re
 
         # Late signature verification for large signed uploads
         if is_large_signed and content_sha and result["computed_sha256"] != content_sha:
-                logger.warning(
-                    "UPLOAD_PART_SHA256_MISMATCH",
-                    bucket=bucket, key=key, part_num=part_num,
-                    expected=content_sha, computed=result["computed_sha256"],
-                )
-                raise S3Error.signature_does_not_match(
-                    "Signature verification failed"
-                )
+            logger.warning(
+                "UPLOAD_PART_SHA256_MISMATCH",
+                bucket=bucket, key=key, part_num=part_num,
+                expected=content_sha, computed=result["computed_sha256"],
+            )
+            raise S3Error.signature_does_not_match(
+                "Signature verification failed"
+            )
 
         upload_duration = time.monotonic() - upload_start_time
         logger.info(
@@ -156,7 +156,7 @@ async def _stream_and_upload(
         # Initialize state
         buffer_chunks: deque[bytes] = deque()
         buffer_size = 0
-        md5_hash = hashlib.md5()
+        md5_hash = hashlib.md5(usedforsecurity=False)
         sha256_hash = hashlib.sha256()
         total_plaintext_size = 0
         total_ciphertext_size = 0
@@ -282,8 +282,6 @@ async def _get_stream_source(
             content_length_mb=f"{content_length / 1024 / 1024:.2f}MB",
         )
         body = await request.body()
-        if needs_chunked_decode:
-            body = decode_aws_chunked(body)
 
         async def body_iter():
            yield body

s3proxy/handlers/objects/misc.py

Lines changed: 3 additions & 2 deletions
@@ -58,7 +58,8 @@ async def handle_head_object(self, request: Request, creds: S3Credentials) -> Re
             "Content-Length": str(meta.total_plaintext_size),
             "Content-Type": resp.get("ContentType", "application/octet-stream"),
             "ETag": f'"{hashlib.md5(
-                str(meta.total_plaintext_size).encode()
+                str(meta.total_plaintext_size).encode(),
+                usedforsecurity=False,
             ).hexdigest()}"',
             **extra_headers,
         }
@@ -261,7 +262,7 @@ async def _copy_encrypted(
 
         # Re-encrypt
         encrypted = crypto.encrypt_object(plaintext, self.settings.kek)
-        etag = hashlib.md5(plaintext).hexdigest()
+        etag = hashlib.md5(plaintext, usedforsecurity=False).hexdigest()
 
         # Build destination metadata
         dest_metadata = {

s3proxy/handlers/objects/put.py

Lines changed: 7 additions & 4 deletions
@@ -58,7 +58,10 @@ async def handle_put_object(self, request: Request, creds: S3Credentials) -> Res
         expires = request.headers.get("expires")
         tagging = request.headers.get("x-amz-tagging")
 
-        content_length = int(request.headers.get("content-length", "0"))
+        try:
+            content_length = int(request.headers.get("content-length", "0"))
+        except ValueError:
+            content_length = 0
         is_unsigned = content_sha == "UNSIGNED-PAYLOAD"
         is_streaming_sig = content_sha.startswith("STREAMING-")
         needs_chunked_decode = "aws-chunked" in content_encoding or is_streaming_sig
@@ -122,7 +125,7 @@ async def _put_buffered(
             plaintext_mb=round(len(body) / 1024 / 1024, 2),
             ciphertext_mb=round(len(encrypted.ciphertext) / 1024 / 1024, 2),
         )
-        etag = hashlib.md5(body).hexdigest()
+        etag = hashlib.md5(body, usedforsecurity=False).hexdigest()
 
         await client.put_object(
             bucket, key, encrypted.ciphertext,
@@ -164,7 +167,7 @@ async def _put_streaming(
         parts_complete: list[dict[str, Any]] = []
         total_plaintext_size = 0
         part_num = 0
-        md5_hash = hashlib.md5()
+        md5_hash = hashlib.md5(usedforsecurity=False)
         sha256_hash = hashlib.sha256() if expected_sha256 else None
         buffer = bytearray()
 
@@ -173,7 +176,7 @@ async def upload_part(data: bytes) -> None:
             part_num += 1
             nonce = crypto.derive_part_nonce(upload_id, part_num)
             data_len = len(data)
-            data_md5 = hashlib.md5(data).hexdigest()
+            data_md5 = hashlib.md5(data, usedforsecurity=False).hexdigest()
             ciphertext = crypto.encrypt(data, dek, nonce)
             cipher_len = len(ciphertext)
             del data  # Free memory

s3proxy/request_handler.py

Lines changed: 4 additions & 1 deletion
@@ -91,7 +91,10 @@ async def handle_proxy_request(
         memory_limit = concurrency.get_memory_limit()
 
         if memory_limit > 0 and needs_limit:
-            content_length = int(request.headers.get("content-length", "0"))
+            try:
+                content_length = int(request.headers.get("content-length", "0"))
+            except ValueError:
+                content_length = 0
             memory_needed = concurrency.estimate_memory_footprint(method, content_length)
 
             logger.info(
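
This is the third call site, after put.py and upload_part.py above, to get the guarded content-length parse. Expressed as a hypothetical standalone helper for illustration only; the commit inlines the try/except at each site rather than factoring it out:

def parse_content_length(raw: str | None) -> int:
    """Parse a Content-Length header value, treating missing or malformed input as 0."""
    try:
        return int(raw or "0")
    except ValueError:
        return 0


assert parse_content_length("1048576") == 1048576
assert parse_content_length(None) == 0
assert parse_content_length("not-a-number") == 0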

s3proxy/state/metadata.py

Lines changed: 17 additions & 1 deletion
@@ -66,10 +66,26 @@ def encode_multipart_metadata(meta: MultipartMetadata) -> str:
     return base64.b64encode(compressed).decode()
 
 
+
+# Maximum decompressed metadata size (10 MB) — prevents gzip bombs
+MAX_METADATA_SIZE = 10 * 1024 * 1024
+
+
+def _safe_gzip_decompress(data: bytes, max_size: int = MAX_METADATA_SIZE) -> bytes:
+    """Decompress gzip data with a size limit to prevent decompression bombs."""
+    with gzip.GzipFile(fileobj=__import__("io").BytesIO(data)) as f:
+        result = f.read(max_size + 1)
+    if len(result) > max_size:
+        raise ValueError(
+            f"Decompressed metadata exceeds {max_size} bytes limit"
+        )
+    return result
+
+
 def decode_multipart_metadata(encoded: str) -> MultipartMetadata:
     """Decode metadata from base64-compressed JSON."""
     compressed = base64.b64decode(encoded)
-    json_bytes = gzip.decompress(compressed)
+    json_bytes = _safe_gzip_decompress(compressed)
     data = json_loads(json_bytes)
 
     return MultipartMetadata(
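
To see the cap in action, a small self-contained sketch that mirrors the helper above (with io imported normally rather than via __import__) and feeds it a highly compressible payload:

import gzip
import io

MAX_METADATA_SIZE = 10 * 1024 * 1024  # same 10 MB cap as above


def safe_gzip_decompress(data: bytes, max_size: int = MAX_METADATA_SIZE) -> bytes:
    with gzip.GzipFile(fileobj=io.BytesIO(data)) as f:
        # Read at most max_size + 1 bytes; anything past the cap means "too big"
        result = f.read(max_size + 1)
    if len(result) > max_size:
        raise ValueError(f"Decompressed metadata exceeds {max_size} bytes limit")
    return result


# 20 MB of zeros gzips to a few tens of KB, but decompression stops at the cap
bomb = gzip.compress(b"\x00" * (20 * 1024 * 1024))
try:
    safe_gzip_decompress(bomb)
except ValueError as exc:
    print(exc)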

s3proxy/streaming/chunked.py

Lines changed: 43 additions & 12 deletions
@@ -13,6 +13,26 @@
 # Streaming chunk size for reads/writes
 STREAM_CHUNK_SIZE = 64 * 1024  # 64KB chunks for streaming
 
+# Safety limits for chunked decoding
+_MAX_CHUNK_HEADER_SIZE = 4096  # Max header line (hex size + signature)
+_MAX_CHUNK_SIZE = 64 * 1024 * 1024  # 64 MB max per chunk
+_MAX_BUFFER_SIZE = 66 * 1024 * 1024  # Slightly above max chunk to hold chunk + framing
+
+
+def _parse_chunk_size(header: bytes) -> int:
+    """Parse and validate chunk size from header bytes."""
+    size_str = header.split(b";")[0].strip()
+    if not size_str:
+        raise ValueError("Empty chunk size")
+    chunk_size = int(size_str, 16)
+    if chunk_size < 0:
+        raise ValueError(f"Negative chunk size: {chunk_size}")
+    if chunk_size > _MAX_CHUNK_SIZE:
+        raise ValueError(
+            f"Chunk size {chunk_size} exceeds maximum {_MAX_CHUNK_SIZE}"
+        )
+    return chunk_size
+
 
 def decode_aws_chunked(body: bytes) -> bytes:
     """Decode aws-chunked transfer encoding from buffered body.
@@ -22,25 +42,27 @@ def decode_aws_chunked(body: bytes) -> bytes:
 
     Returns:
         Decoded bytes without chunk headers
+
+    Raises:
+        ValueError: If chunked encoding is malformed or truncated.
     """
     result = bytearray()
     pos = 0
     while pos < len(body):
         header_end = body.find(b"\r\n", pos)
         if header_end == -1:
-            break
+            raise ValueError("Truncated chunk: missing header terminator")
         header = body[pos:header_end]
-        size_str = header.split(b";")[0]
-        try:
-            chunk_size = int(size_str, 16)
-        except ValueError:
-            break
+        chunk_size = _parse_chunk_size(header)
         if chunk_size == 0:
             break
         data_start = header_end + 2
         data_end = data_start + chunk_size
         if data_end > len(body):
-            break
+            raise ValueError(
+                f"Truncated chunk: expected {chunk_size} bytes, "
+                f"only {len(body) - data_start} available"
+            )
         result.extend(body[data_start:data_end])
         pos = data_end + 2
     return bytes(result)
@@ -59,23 +81,32 @@ async def decode_aws_chunked_stream(
 
     Yields:
         Decoded data chunks
+
+    Raises:
+        ValueError: If buffer exceeds safety limits or encoding is malformed.
     """
     buffer = bytearray()
 
     async for raw_chunk in request.stream():
         buffer.extend(raw_chunk)
 
+        if len(buffer) > _MAX_BUFFER_SIZE:
+            raise ValueError(
+                f"Chunked decode buffer ({len(buffer)} bytes) exceeds "
+                f"maximum ({_MAX_BUFFER_SIZE} bytes)"
+            )
+
         while True:
             header_end = buffer.find(b"\r\n")
             if header_end == -1:
+                if len(buffer) > _MAX_CHUNK_HEADER_SIZE:
+                    raise ValueError(
+                        f"Chunk header exceeds {_MAX_CHUNK_HEADER_SIZE} bytes"
+                    )
                break
 
             header = buffer[:header_end]
-            size_str = header.split(b";")[0]
-            try:
-                chunk_size = int(size_str, 16)
-            except ValueError:
-                break
+            chunk_size = _parse_chunk_size(header)
 
             if chunk_size == 0:
                 return
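
A usage sketch of the hardened buffered decoder, importing from the module path shown above; the chunk-signature values are dummies:

from s3proxy.streaming.chunked import decode_aws_chunked

# aws-chunked framing: hex payload size (optionally ";chunk-signature=..."),
# CRLF, payload, CRLF, terminated by a zero-size chunk.
body = (
    b"5;chunk-signature=aaaa\r\n"
    b"hello\r\n"
    b"6;chunk-signature=bbbb\r\n"
    b" world\r\n"
    b"0;chunk-signature=cccc\r\n"
    b"\r\n"
)
assert decode_aws_chunked(body) == b"hello world"

# Truncated input now raises instead of silently returning partial data.
try:
    decode_aws_chunked(b"5;chunk-signature=aaaa\r\nhel")
except ValueError as exc:
    print(exc)  # Truncated chunk: expected 5 bytes, only 3 available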

0 commit comments
