diff --git a/docs/learn/how-it-works.md b/docs/learn/how-it-works.md index d4366df..485f702 100644 --- a/docs/learn/how-it-works.md +++ b/docs/learn/how-it-works.md @@ -37,11 +37,20 @@ The request path is literally the `model` string you pass in, joined to `models_ Important: the SDK sends only the raw pixel bytes. It does **not** send image metadata such as width/height/shape, color space, file name, or format. +For `client.models.embed_image(model, image, ...)`: + +1. The input image is serialized with `image.tobytes()`. +2. Bytes are compressed with **LZ4 frame**. +3. The request includes `x-output-dtype` to let the service return the desired numeric type. +4. Additional keyword headers are supported and sent as `x-*` headers (e.g. `pool_tokens="false"` becomes `x-pool-tokens: false`; do not include the `x_` prefix in the argument name). + ### What the SDK expects back - **Classification**: JSON (`response.json()`), typically a float (binary) or a mapping of class → probability. - **Segmentation**: a binary payload (response body) that is LZ4-compressed float16 data. The SDK decompresses it, interprets it as `np.float16`, and reshapes it to `(num_classes, height, width)`. +- **Embedding**: an LZ4-compressed binary payload plus an `x-output-shape` header, + used to reshape the output array. The SDK determines `height` and `width` from the input image: diff --git a/rationai/resources/models.py b/rationai/resources/models.py index 3a297a7..a7d3c25 100644 --- a/rationai/resources/models.py +++ b/rationai/resources/models.py @@ -70,6 +70,7 @@ def embed_image[DType: np.generic]( image: Image | NDArray[np.uint8], output_dtype: type[DType] = np.float32, # type: ignore[assignment] timeout: TimeoutTypes | UseClientDefault = USE_CLIENT_DEFAULT, + **headers: str, ) -> NDArray[DType]: """Compute an embedding vector for an image using the specified model. @@ -78,16 +79,24 @@ def embed_image[DType: np.generic]( image: The image to embed. It must be uint8 RGB image. output_dtype: Output numpy dtype for embeddings (e.g. np.float16, np.float32). timeout: Optional timeout for the request. + **headers: Additional x- headers. Keyword underscores are converted + to hyphens and prefixed with 'x-', e.g. pool_tokens="false" + becomes x-pool-tokens: false. Returns: NDArray[DType]: The embedding array reshaped according to the `x-output-shape` response header. """ compressed_data = lz4.frame.compress(image.tobytes()) + request_headers = {"x-output-dtype": np.dtype(output_dtype).name} + request_headers.update( + {f"x-{k.replace('_', '-')}": v for k, v in headers.items()} + ) + response = self._post( model, data=compressed_data, - headers={"x-output-dtype": np.dtype(output_dtype).name}, + headers=request_headers, timeout=timeout, ) response.raise_for_status() @@ -160,6 +169,7 @@ async def embed_image[DType: np.generic]( image: Image | NDArray[np.uint8], output_dtype: type[DType] = np.float32, # type: ignore[assignment] timeout: TimeoutTypes | UseClientDefault = USE_CLIENT_DEFAULT, + **headers: str, ) -> NDArray[DType]: """Compute an embedding vector for an image using the specified model. @@ -168,16 +178,24 @@ async def embed_image[DType: np.generic]( image: The image to embed. It must be uint8 RGB image. output_dtype: Output numpy dtype for embeddings (e.g. np.float16, np.float32). timeout: Optional timeout for the request. + **headers: Additional x- headers. Keyword underscores are converted + to hyphens and prefixed with 'x-', e.g. pool_tokens="false" + becomes x-pool-tokens: false. Returns: NDArray[DType]: The embedding array reshaped according to the `x-output-shape` response header. """ compressed_data = lz4.frame.compress(image.tobytes()) + request_headers = {"x-output-dtype": np.dtype(output_dtype).name} + request_headers.update( + {f"x-{k.replace('_', '-')}": v for k, v in headers.items()} + ) + response = await self._post( model, data=compressed_data, - headers={"x-output-dtype": np.dtype(output_dtype).name}, + headers=request_headers, timeout=timeout, ) response.raise_for_status()