Support Multimodal datasets

nikita-smetanin · nikita-smetanin · commit b93a6730d471 · 2025-12-22T18:39:38.000Z
diff --git a/src/together/constants.py b/src/together/constants.py
@@ -1,5 +1,6 @@
 import enum
 
+
 # Session constants
 TIMEOUT_SECS = 600
 MAX_SESSION_LIFETIME_SECS = 180
@@ -40,6 +41,11 @@
 # the number of bytes in a gigabyte, used to convert bytes to GB for readable comparison
 NUM_BYTES_IN_GB = 2**30
 
+# Multimodal limits
+MAX_IMAGES_PER_EXAMPLE = 10
+MAX_IMAGE_BYTES = 10 * 1024 * 1024  # 10MB
+# Max length = Header length + base64 factor (4/3) * image bytes
+MAX_BASE64_IMAGE_LENGTH = len("data:image/jpeg;base64,") + 4 * MAX_IMAGE_BYTES // 3
 
 # expected columns for Parquet files
 PARQUET_EXPECTED_COLUMNS = ["input_ids", "attention_mask", "labels"]
diff --git a/src/together/utils/files.py b/src/together/utils/files.py
@@ -1,27 +1,39 @@
 from __future__ import annotations
 
+import csv
 import json
 import os
-import csv
 from pathlib import Path
 from traceback import format_exc
 from typing import Any, Dict, List
 
 from tqdm import tqdm
 
 from together.constants import (
+    JSONL_REQUIRED_COLUMNS_MAP,
+    MAX_BASE64_IMAGE_LENGTH,
     MAX_FILE_SIZE_GB,
+    MAX_IMAGES_PER_EXAMPLE,
     MIN_SAMPLES,
     NUM_BYTES_IN_GB,
     PARQUET_EXPECTED_COLUMNS,
-    JSONL_REQUIRED_COLUMNS_MAP,
-    REQUIRED_COLUMNS_MESSAGE,
     POSSIBLE_ROLES_CONVERSATION,
+    REQUIRED_COLUMNS_MESSAGE,
     DatasetFormat,
 )
 from together.types import FilePurpose
 
 
+# MessageContent is a string or a list of dicts with 'type': 'text' or 'image_url', and 'text' or 'image_url.url'
+# Example: "Hello" or [
+#   {"type": "text", "text": "Hello"},
+#   {"type": "image_url", "image_url": {
+#     "url": "data:image/jpeg;base64,..."
+#   }}
+# ]
+MessageContent = str | list[dict[str, Any]]
+
+
 class InvalidFileFormatError(ValueError):
     """Exception raised for invalid file formats during file checks."""
 
@@ -70,7 +82,7 @@ def check_file(
 
     if file_size > MAX_FILE_SIZE_GB * NUM_BYTES_IN_GB:
         report_dict["message"] = (
-            f"Maximum supported file size is {MAX_FILE_SIZE_GB} GB. Found file with size of {round(file_size / NUM_BYTES_IN_GB ,3)} GB."
+            f"Maximum supported file size is {MAX_FILE_SIZE_GB} GB. Found file with size of {round(file_size / NUM_BYTES_IN_GB, 3)} GB."
         )
         report_dict["is_check_passed"] = False
     elif file_size == 0:
@@ -103,7 +115,9 @@ def check_file(
     return report_dict
 
 
-def _check_conversation_type(messages: List[Dict[str, str | bool]], idx: int) -> None:
+def _check_conversation_type(
+    messages: List[Dict[str, str | int | MessageContent]], idx: int
+) -> None:
     """Check that the conversation has correct type.
 
     Args:
@@ -175,7 +189,9 @@ def _check_conversation_roles(
         )
 
 
-def _check_message_weight(message: Dict[str, str | bool], idx: int) -> None:
+def _check_message_weight(
+    message: Dict[str, str | int | MessageContent], idx: int
+) -> int | None:
     """Check that the message has a weight with the correct type and value.
 
     Args:
@@ -199,11 +215,14 @@ def _check_message_weight(message: Dict[str, str | bool], idx: int) -> None:
                 line_number=idx + 1,
                 error_source="key_value",
             )
+        return weight
+
+    return None
 
 
 def _check_message_role(
-    message: Dict[str, str | bool], previous_role: str | None, idx: int
-) -> str | bool:
+    message: Dict[str, str | int | MessageContent], previous_role: str | None, idx: int
+) -> str:
     """Check that the message has correct roles.
 
     Args:
@@ -217,6 +236,14 @@ def _check_message_role(
     Raises:
         InvalidFileFormatError: If the message role is invalid.
     """
+    if not isinstance(message["role"], str):
+        raise InvalidFileFormatError(
+            message=f"Invalid role `{message['role']}` in conversation on line {idx + 1}. "
+            f"Role must be a string. Found {type(message['role'])}",
+            line_number=idx + 1,
+            error_source="key_value",
+        )
+
     if message["role"] not in POSSIBLE_ROLES_CONVERSATION:
         raise InvalidFileFormatError(
             message=f"Invalid role `{message['role']}` in conversation on line {idx + 1}. "
@@ -234,8 +261,132 @@ def _check_message_role(
     return message["role"]
 
 
+def _check_message_content(
+    message_content: str | int | MessageContent, role: str, idx: int
+) -> tuple[bool, int]:
+    """Check that the message content has the correct type.
+    Message content can be either a) a string or b) an OpenAI-style multimodal list of content items
+    Example:
+        a) "Hello", or
+        b) [
+             {"type": "text", "text": "Hello"},
+             {"type": "image_url", "image_url": {
+                "url": "data:image/jpeg;base64,..."
+             }}
+           ]
+
+    Args:
+        message: The message to check.
+        idx: Line number in the file.
+
+    Returns:
+        tuple[bool, int]: A tuple with message is multimodal and the number of images in the message content.
+    """
+    # Text-only message content
+    if isinstance(message_content, str):
+        return False, 0
+
+    # Multimodal message content
+    if isinstance(message_content, list):
+        num_images = 0
+        for item in message_content:
+            if not isinstance(item, dict):
+                raise InvalidFileFormatError(
+                    "The dataset is malformed, the `content` field must be a list of dicts.",
+                    line_number=idx + 1,
+                    error_source="key_value",
+                )
+            if "type" not in item:
+                raise InvalidFileFormatError(
+                    "The dataset is malformed, the `content` field must be a list of dicts with a `type` field.",
+                    line_number=idx + 1,
+                    error_source="key_value",
+                )
+
+            if item["type"] == "text":
+                if "text" not in item or not isinstance(item["text"], str):
+                    raise InvalidFileFormatError(
+                        "The dataset is malformed, the `text` field must be present in the `content` item field and be"
+                        f" a string. Got '{item.get('text')!r}' instead.",
+                        line_number=idx + 1,
+                        error_source="key_value",
+                    )
+            elif item["type"] == "image_url":
+                if role != "user":
+                    raise InvalidFileFormatError(
+                        "The dataset is malformed, only user messages can contain images.",
+                        line_number=idx + 1,
+                        error_source="key_value",
+                    )
+
+                if "image_url" not in item or not isinstance(item["image_url"], dict):
+                    raise InvalidFileFormatError(
+                        "The dataset is malformed, the `image_url` field must be present in the `content` field and "
+                        f"be a dictionary. Got {item.get('image_url')!r} instead.",
+                        line_number=idx + 1,
+                        error_source="key_value",
+                    )
+
+                image_data = item["image_url"].get("url")
+                if not image_data or not isinstance(image_data, str):
+                    raise InvalidFileFormatError(
+                        "The dataset is malformed, the `url` field must be present in the `image_url` field and be "
+                        f"a string. Got {image_data!r} instead.",
+                        line_number=idx + 1,
+                        error_source="key_value",
+                    )
+
+                if not any(
+                    image_data.startswith(f"data:image/{fmt};base64,")
+                    for fmt in ["jpeg", "png", "webp"]
+                ):
+                    raise InvalidFileFormatError(
+                        "The dataset is malformed, the `url` field must be either a JPEG, PNG or WEBP base64-encoded "
+                        "image in 'data:image/<format>;base64,<base64_encoded_image>' format. "
+                        f"Got '{image_data[:100]}...' instead.",
+                        line_number=idx + 1,
+                    )
+
+                if len(image_data) > MAX_BASE64_IMAGE_LENGTH:
+                    raise InvalidFileFormatError(
+                        "The dataset is malformed, the `url` field must contain base64-encoded image "
+                        f"that is less than 10MB, found ~{len(image_data) * 3 // 4} bytes.",
+                        line_number=idx + 1,
+                        error_source="key_value",
+                    )
+
+                num_images += 1
+            else:
+                raise InvalidFileFormatError(
+                    "The dataset is malformed, the `type` field must be either 'text' or 'image_url'. "
+                    f"Got {item['type']!r}.",
+                    line_number=idx + 1,
+                    error_source="key_value",
+                )
+
+        if num_images > MAX_IMAGES_PER_EXAMPLE:
+            raise InvalidFileFormatError(
+                f"The dataset is malformed, the `content` field must contain at most "
+                f"{MAX_IMAGES_PER_EXAMPLE} images, found {num_images}.",
+                line_number=idx + 1,
+                error_source="key_value",
+            )
+
+        # We still consider text-only messages in such format as multimodal, even if they don't have any images
+        # included - so we can process datasets with rather sparse images (i.e. not in each sample) consistently.
+        return True, num_images
+
+    raise InvalidFileFormatError(
+        message=f"Invalid content type on line {idx + 1} of the input file. Found {type(message_content)}",
+        line_number=idx + 1,
+        error_source="key_value",
+    )
+
+
 def validate_messages(
-    messages: List[Dict[str, str | bool]], idx: int, require_assistant_role: bool = True
+    messages: List[Dict[str, str | int | MessageContent]],
+    idx: int,
+    require_assistant_role: bool = True,
 ) -> None:
     """Validate the messages column.
 
@@ -249,15 +400,45 @@ def validate_messages(
     """
     _check_conversation_type(messages, idx)
 
-    has_weights = any("weight" in message for message in messages)
     previous_role = None
     assistant_role_exists = False
 
+    messages_are_multimodal: bool | None = None
+    total_number_of_images = 0
+
     for message in messages:
-        if has_weights:
-            _check_message_weight(message, idx)
+        message_weight = _check_message_weight(message, idx)
         previous_role = _check_message_role(message, previous_role, idx)
         assistant_role_exists |= previous_role == "assistant"
+        is_multimodal, number_of_images = _check_message_content(
+            message["content"], role=previous_role, idx=idx
+        )
+        # Multimodal validation
+        if number_of_images > 0 and message_weight is not None and message_weight != 0:
+            raise InvalidFileFormatError(
+                "Messages with images cannot have non-zero weights.",
+                line_number=idx + 1,
+                error_source="key_value",
+            )
+        if messages_are_multimodal is None:
+            # Detect the format of the messages in the conversation.
+            messages_are_multimodal = is_multimodal
+        elif messages_are_multimodal != is_multimodal:
+            # Due to the format limitation, we cannot mix multimodal and text only messages in the same sample.
+            raise InvalidFileFormatError(
+                "Messages in the conversation must be either all in multimodal or all intext only format.",
+                line_number=idx + 1,
+                error_source="key_value",
+            )
+        total_number_of_images += number_of_images
+
+    if total_number_of_images > MAX_IMAGES_PER_EXAMPLE:
+        raise InvalidFileFormatError(
+            f"The dataset is malformed, the `messages` must contain at most {MAX_IMAGES_PER_EXAMPLE} images. "
+            f"Found {total_number_of_images} images.",
+            line_number=idx + 1,
+            error_source="key_value",
+        )
 
     _check_conversation_roles(require_assistant_role, assistant_role_exists, idx)
 
@@ -347,12 +528,7 @@ def validate_preference_openai(example: Dict[str, Any], idx: int = 0) -> None:
                 error_source="key_value",
             )
 
-        if not isinstance(example[key][0]["content"], str):
-            raise InvalidFileFormatError(
-                message=f"The dataset is malformed, the 'content' field in `{key}` must be a string on line {idx + 1}.",
-                line_number=idx + 1,
-                error_source="key_value",
-            )
+        _check_message_content(example[key][0]["content"], role="assistant", idx=idx)
 
 
 def _check_utf8(file: Path) -> Dict[str, Any]:
@@ -454,8 +630,7 @@ def _check_csv(file: Path, purpose: FilePurpose | str) -> Dict[str, Any]:
             report_dict["load_csv"] = False
             if idx < 0:
                 report_dict["message"] = (
-                    "Unable to decode file. "
-                    "File may be empty or in an unsupported format. "
+                    "Unable to decode file. File may be empty or in an unsupported format. "
                 )
             else:
                 report_dict["message"] = (
@@ -542,13 +717,10 @@ def _check_jsonl(file: Path, purpose: FilePurpose | str) -> Dict[str, Any]:
                         )
                     else:
                         for column in JSONL_REQUIRED_COLUMNS_MAP[current_format]:
-                            if not isinstance(json_line[column], str):
-                                raise InvalidFileFormatError(
-                                    message=f'Invalid value type for "{column}" key on line {idx + 1}. '
-                                    f"Expected string. Found {type(json_line[column])}.",
-                                    line_number=idx + 1,
-                                    error_source="key_value",
-                                )
+                            role = "assistant" if column in {"completion"} else "user"
+                            _check_message_content(
+                                json_line[column], role=role, idx=idx
+                            )
 
                     if dataset_format is None:
                         dataset_format = current_format
@@ -578,8 +750,7 @@ def _check_jsonl(file: Path, purpose: FilePurpose | str) -> Dict[str, Any]:
             report_dict["load_json"] = False
             if idx < 0:
                 report_dict["message"] = (
-                    "Unable to decode file. "
-                    "File may be empty or in an unsupported format. "
+                    "Unable to decode file. File may be empty or in an unsupported format. "
                 )
             else:
                 report_dict["message"] = (