Merge branch 'main' into feat/configurable-embed-batch-size

fede-kamel · web-flow · commit 13b57e6336b9 · 2026-01-25T20:34:17.000-05:00
diff --git a/.fern/metadata.json b/.fern/metadata.json
@@ -9,8 +9,7 @@
       "fastavro": "^1.9.4",
       "requests": "^2.0.0",
       "types-requests": "^2.0.0",
-      "tokenizers": ">=0.15,<1",
-      "httpx-sse": "^0.4.0"
+      "tokenizers": ">=0.15,<1"
     },
     "improved_imports": true,
     "pydantic_config": {
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -3,7 +3,7 @@ name = "cohere"
 
 [tool.poetry]
 name = "cohere"
-version = "5.20.1"
+version = "5.20.2"
 description = ""
 readme = "README.md"
 authors = []
@@ -38,7 +38,6 @@ Repository = 'https://github.com/cohere-ai/cohere-python'
 python = "^3.9"
 fastavro = "^1.9.4"
 httpx = ">=0.21.2"
-httpx-sse = "^0.4.0"
 pydantic = ">= 1.9.2"
 pydantic-core = ">=2.18.2"
 requests = "^2.0.0"
diff --git a/reference.md b/reference.md
@@ -1615,7 +1615,7 @@ client.rerank(
     ],
     query="What is the capital of the United States?",
     top_n=3,
-    model="rerank-v3.5",
+    model="rerank-v4.0-pro",
 )
 
 ```
@@ -2492,10 +2492,7 @@ If tool_choice isn't specified, then the model is free to choose whether to use
 <dl>
 <dd>
 
-**priority:** `typing.Optional[int]` 
-
-The priority of the request (lower means earlier handling; default 0 highest priority).
-Higher priority requests are handled first, and dropped last when the system is under load.
+**priority:** `typing.Optional[int]` — Controls how early the request is handled. Lower numbers indicate higher priority (default: 0, the highest). When the system is under load, higher-priority requests are processed first and are the least likely to be dropped.
     
 </dd>
 </dl>
@@ -2793,10 +2790,7 @@ If tool_choice isn't specified, then the model is free to choose whether to use
 <dl>
 <dd>
 
-**priority:** `typing.Optional[int]` 
-
-The priority of the request (lower means earlier handling; default 0 highest priority).
-Higher priority requests are handled first, and dropped last when the system is under load.
+**priority:** `typing.Optional[int]` — Controls how early the request is handled. Lower numbers indicate higher priority (default: 0, the highest). When the system is under load, higher-priority requests are processed first and are the least likely to be dropped.
     
 </dd>
 </dl>
@@ -2972,10 +2966,7 @@ If `NONE` is selected, when the input exceeds the maximum input token length an
 <dl>
 <dd>
 
-**priority:** `typing.Optional[int]` 
-
-The priority of the request (lower means earlier handling; default 0 highest priority).
-Higher priority requests are handled first, and dropped last when the system is under load.
+**priority:** `typing.Optional[int]` — Controls how early the request is handled. Lower numbers indicate higher priority (default: 0, the highest). When the system is under load, higher-priority requests are processed first and are the least likely to be dropped.
     
 </dd>
 </dl>
@@ -3038,7 +3029,7 @@ client.v2.rerank(
     ],
     query="What is the capital of the United States?",
     top_n=3,
-    model="rerank-v3.5",
+    model="rerank-v4.0-pro",
 )
 
 ```
@@ -3102,10 +3093,7 @@ For optimal performance we recommend against sending more than 1,000 documents i
 <dl>
 <dd>
 
-**priority:** `typing.Optional[int]` 
-
-The priority of the request (lower means earlier handling; default 0 highest priority).
-Higher priority requests are handled first, and dropped last when the system is under load.
+**priority:** `typing.Optional[int]` — Controls how early the request is handled. Lower numbers indicate higher priority (default: 0, the highest). When the system is under load, higher-priority requests are processed first and are the least likely to be dropped.
     
 </dd>
 </dl>
diff --git a/requirements.txt b/requirements.txt
@@ -1,6 +1,5 @@
 fastavro==1.9.4
 httpx>=0.21.2
-httpx-sse==0.4.0
 pydantic>= 1.9.2
 pydantic-core>=2.18.2
 requests==2.0.0
diff --git a/src/cohere/base_client.py b/src/cohere/base_client.py
@@ -1309,7 +1309,7 @@ def rerank(
             ],
             query="What is the capital of the United States?",
             top_n=3,
-            model="rerank-v3.5",
+            model="rerank-v4.0-pro",
         )
         """
         _response = self._raw_client.rerank(
@@ -2859,7 +2859,7 @@ async def main() -> None:
                 ],
                 query="What is the capital of the United States?",
                 top_n=3,
-                model="rerank-v3.5",
+                model="rerank-v4.0-pro",
             )
 
 
diff --git a/src/cohere/core/client_wrapper.py b/src/cohere/core/client_wrapper.py
@@ -24,10 +24,10 @@ def __init__(
 
     def get_headers(self) -> typing.Dict[str, str]:
         headers: typing.Dict[str, str] = {
-            "User-Agent": "cohere/5.20.1",
+            "User-Agent": "cohere/5.20.2",
             "X-Fern-Language": "Python",
             "X-Fern-SDK-Name": "cohere",
-            "X-Fern-SDK-Version": "5.20.1",
+            "X-Fern-SDK-Version": "5.20.2",
             **(self.get_custom_headers() or {}),
         }
         if self._client_name is not None:
diff --git a/src/cohere/types/api_meta_billed_units.py b/src/cohere/types/api_meta_billed_units.py
@@ -18,6 +18,11 @@ class ApiMetaBilledUnits(UncheckedBaseModel):
     The number of billed input tokens.
     """
 
+    image_tokens: typing.Optional[float] = pydantic.Field(default=None)
+    """
+    The number of billed image tokens.
+    """
+
     output_tokens: typing.Optional[float] = pydantic.Field(default=None)
     """
     The number of billed output tokens.
diff --git a/src/cohere/v2/client.py b/src/cohere/v2/client.py
@@ -160,8 +160,7 @@ def chat_stream(
         thinking : typing.Optional[Thinking]
 
         priority : typing.Optional[int]
-            The priority of the request (lower means earlier handling; default 0 highest priority).
-            Higher priority requests are handled first, and dropped last when the system is under load.
+            Controls how early the request is handled. Lower numbers indicate higher priority (default: 0, the highest). When the system is under load, higher-priority requests are processed first and are the least likely to be dropped.
 
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
@@ -331,8 +330,7 @@ def chat(
         thinking : typing.Optional[Thinking]
 
         priority : typing.Optional[int]
-            The priority of the request (lower means earlier handling; default 0 highest priority).
-            Higher priority requests are handled first, and dropped last when the system is under load.
+            Controls how early the request is handled. Lower numbers indicate higher priority (default: 0, the highest). When the system is under load, higher-priority requests are processed first and are the least likely to be dropped.
 
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
@@ -451,8 +449,7 @@ def embed(
             If `NONE` is selected, when the input exceeds the maximum input token length an error will be returned.
 
         priority : typing.Optional[int]
-            The priority of the request (lower means earlier handling; default 0 highest priority).
-            Higher priority requests are handled first, and dropped last when the system is under load.
+            Controls how early the request is handled. Lower numbers indicate higher priority (default: 0, the highest). When the system is under load, higher-priority requests are processed first and are the least likely to be dropped.
 
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
@@ -650,8 +647,7 @@ def rerank(
             Defaults to `4096`. Long documents will be automatically truncated to the specified number of tokens.
 
         priority : typing.Optional[int]
-            The priority of the request (lower means earlier handling; default 0 highest priority).
-            Higher priority requests are handled first, and dropped last when the system is under load.
+            Controls how early the request is handled. Lower numbers indicate higher priority (default: 0, the highest). When the system is under load, higher-priority requests are processed first and are the least likely to be dropped.
 
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
@@ -679,7 +675,7 @@ def rerank(
             ],
             query="What is the capital of the United States?",
             top_n=3,
-            model="rerank-v3.5",
+            model="rerank-v4.0-pro",
         )
         """
         _response = self._raw_client.rerank(
@@ -825,8 +821,7 @@ async def chat_stream(
         thinking : typing.Optional[Thinking]
 
         priority : typing.Optional[int]
-            The priority of the request (lower means earlier handling; default 0 highest priority).
-            Higher priority requests are handled first, and dropped last when the system is under load.
+            Controls how early the request is handled. Lower numbers indicate higher priority (default: 0, the highest). When the system is under load, higher-priority requests are processed first and are the least likely to be dropped.
 
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
@@ -1005,8 +1000,7 @@ async def chat(
         thinking : typing.Optional[Thinking]
 
         priority : typing.Optional[int]
-            The priority of the request (lower means earlier handling; default 0 highest priority).
-            Higher priority requests are handled first, and dropped last when the system is under load.
+            Controls how early the request is handled. Lower numbers indicate higher priority (default: 0, the highest). When the system is under load, higher-priority requests are processed first and are the least likely to be dropped.
 
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
@@ -1133,8 +1127,7 @@ async def embed(
             If `NONE` is selected, when the input exceeds the maximum input token length an error will be returned.
 
         priority : typing.Optional[int]
-            The priority of the request (lower means earlier handling; default 0 highest priority).
-            Higher priority requests are handled first, and dropped last when the system is under load.
+            Controls how early the request is handled. Lower numbers indicate higher priority (default: 0, the highest). When the system is under load, higher-priority requests are processed first and are the least likely to be dropped.
 
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
@@ -1219,8 +1212,7 @@ async def rerank(
             Defaults to `4096`. Long documents will be automatically truncated to the specified number of tokens.
 
         priority : typing.Optional[int]
-            The priority of the request (lower means earlier handling; default 0 highest priority).
-            Higher priority requests are handled first, and dropped last when the system is under load.
+            Controls how early the request is handled. Lower numbers indicate higher priority (default: 0, the highest). When the system is under load, higher-priority requests are processed first and are the least likely to be dropped.
 
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
@@ -1253,7 +1245,7 @@ async def main() -> None:
                 ],
                 query="What is the capital of the United States?",
                 top_n=3,
-                model="rerank-v3.5",
+                model="rerank-v4.0-pro",
             )
 
 
diff --git a/src/cohere/v2/raw_client.py b/src/cohere/v2/raw_client.py
@@ -169,8 +169,7 @@ def chat_stream(
         thinking : typing.Optional[Thinking]
 
         priority : typing.Optional[int]
-            The priority of the request (lower means earlier handling; default 0 highest priority).
-            Higher priority requests are handled first, and dropped last when the system is under load.
+            Controls how early the request is handled. Lower numbers indicate higher priority (default: 0, the highest). When the system is under load, higher-priority requests are processed first and are the least likely to be dropped.
 
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
@@ -513,8 +512,7 @@ def chat(
         thinking : typing.Optional[Thinking]
 
         priority : typing.Optional[int]
-            The priority of the request (lower means earlier handling; default 0 highest priority).
-            Higher priority requests are handled first, and dropped last when the system is under load.
+            Controls how early the request is handled. Lower numbers indicate higher priority (default: 0, the highest). When the system is under load, higher-priority requests are processed first and are the least likely to be dropped.
 
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
@@ -782,8 +780,7 @@ def embed(
             If `NONE` is selected, when the input exceeds the maximum input token length an error will be returned.
 
         priority : typing.Optional[int]
-            The priority of the request (lower means earlier handling; default 0 highest priority).
-            Higher priority requests are handled first, and dropped last when the system is under load.
+            Controls how early the request is handled. Lower numbers indicate higher priority (default: 0, the highest). When the system is under load, higher-priority requests are processed first and are the least likely to be dropped.
 
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
@@ -1000,8 +997,7 @@ def rerank(
             Defaults to `4096`. Long documents will be automatically truncated to the specified number of tokens.
 
         priority : typing.Optional[int]
-            The priority of the request (lower means earlier handling; default 0 highest priority).
-            Higher priority requests are handled first, and dropped last when the system is under load.
+            Controls how early the request is handled. Lower numbers indicate higher priority (default: 0, the highest). When the system is under load, higher-priority requests are processed first and are the least likely to be dropped.
 
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
@@ -1297,8 +1293,7 @@ async def chat_stream(
         thinking : typing.Optional[Thinking]
 
         priority : typing.Optional[int]
-            The priority of the request (lower means earlier handling; default 0 highest priority).
-            Higher priority requests are handled first, and dropped last when the system is under load.
+            Controls how early the request is handled. Lower numbers indicate higher priority (default: 0, the highest). When the system is under load, higher-priority requests are processed first and are the least likely to be dropped.
 
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
@@ -1641,8 +1636,7 @@ async def chat(
         thinking : typing.Optional[Thinking]
 
         priority : typing.Optional[int]
-            The priority of the request (lower means earlier handling; default 0 highest priority).
-            Higher priority requests are handled first, and dropped last when the system is under load.
+            Controls how early the request is handled. Lower numbers indicate higher priority (default: 0, the highest). When the system is under load, higher-priority requests are processed first and are the least likely to be dropped.
 
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
@@ -1910,8 +1904,7 @@ async def embed(
             If `NONE` is selected, when the input exceeds the maximum input token length an error will be returned.
 
         priority : typing.Optional[int]
-            The priority of the request (lower means earlier handling; default 0 highest priority).
-            Higher priority requests are handled first, and dropped last when the system is under load.
+            Controls how early the request is handled. Lower numbers indicate higher priority (default: 0, the highest). When the system is under load, higher-priority requests are processed first and are the least likely to be dropped.
 
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
@@ -2128,8 +2121,7 @@ async def rerank(
             Defaults to `4096`. Long documents will be automatically truncated to the specified number of tokens.
 
         priority : typing.Optional[int]
-            The priority of the request (lower means earlier handling; default 0 highest priority).
-            Higher priority requests are handled first, and dropped last when the system is under load.
+            Controls how early the request is handled. Lower numbers indicate higher priority (default: 0, the highest). When the system is under load, higher-priority requests are processed first and are the least likely to be dropped.
 
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
diff --git a/tests/test_async_client.py b/tests/test_async_client.py
@@ -169,7 +169,7 @@ async def test_rerank(self) -> None:
             'Capital punishment (the death penalty) has existed in the United States since beforethe United States was a country. As of 2017, capital punishment is legal in 30 of the 50 states.']
 
         response = await self.co.rerank(
-            model='rerank-english-v2.0',
+            model='rerank-v3.5',
             query='What is the capital of the United States?',
             documents=docs,
             top_n=3,
diff --git a/tests/test_client.py b/tests/test_client.py
@@ -206,7 +206,7 @@ def test_rerank(self) -> None:
             'Capital punishment (the death penalty) has existed in the United States since beforethe United States was a country. As of 2017, capital punishment is legal in 30 of the 50 states.']
 
         response = co.rerank(
-            model='rerank-english-v2.0',
+            model='rerank-v3.5',
             query='What is the capital of the United States?',
             documents=docs,
             top_n=3,