Skip to content

Commit 0890d35

Browse files
Support Multimodal datasets
1 parent: a71eee3 · commit: 0890d35

2 files changed

Lines changed: 75 additions & 6 deletions

File tree

src/together/utils/files.py

Lines changed: 0 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -159,12 +159,6 @@ def _check_conversation_type(
159 159
line_number=idx + 1,
160 160
error_source="key_value",
161 161
)
162-
if not isinstance(message[column], str):
163-
raise InvalidFileFormatError(
164-
message=f"Column `{column}` is not a string on line {idx + 1}. Found {type(message[column])}",
165-
line_number=idx + 1,
166-
error_source="text_field",
167-
)
168 162

169 163

170 164
def _check_conversation_roles(

tests/unit/test_files_checks.py

Lines changed: 75 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -41,6 +41,39 @@ def test_check_jsonl_valid_instruction(tmp_path: Path):
41 41
assert report["has_min_samples"]
42 42

43 43

44+
def test_check_jsonl_valid_instruction_multimodal(tmp_path: Path):
45+
file = tmp_path / "valid_instruction_multimodal.jsonl"
46+
content = [
47+
{
48+
"prompt": [
49+
{
50+
"type": "text",
51+
"text": "What's the difference between these two images?",
52+
},
53+
{
54+
"type": "image_url",
55+
"image_url": {"url": "data:image/jpeg;base64,..."},
56+
},
57+
{
58+
"type": "image_url",
59+
"image_url": {"url": "data:image/jpeg;base64,..."},
60+
},
61+
],
62+
"completion": "The first image is a cat, the second image is a dog.",
63+
},
64+
]
65+
66+
with file.open("w") as f:
67+
f.write("\n".join(json.dumps(item) for item in content))
68+
69+
report = check_file(file)
70+
71+
assert report["is_check_passed"]
72+
assert report["utf8"]
73+
assert report["num_samples"] == len(content)
74+
assert report["has_min_samples"]
75+
76+
44 77
def test_check_jsonl_valid_conversational_single_turn(tmp_path: Path):
45 78
# Create a valid JSONL file with conversational format and 1 user-assistant turn pair
46 79
file = tmp_path / "valid_conversational_single_turn.jsonl"
@@ -120,6 +153,48 @@ def test_check_jsonl_valid_conversational_multiple_turns(tmp_path: Path):
120 153
assert report["has_min_samples"]
121 154

122 155

156+
def test_check_jsonl_valid_conversational_multimodal_single_turn(tmp_path: Path):
157+
file = tmp_path / "valid_conversational_multimodal_single_turn.jsonl"
158+
content = [
159+
{
160+
"messages": [
161+
{
162+
"role": "user",
163+
"content": [
164+
{
165+
"type": "text",
166+
"text": "What's the difference between these two images?",
167+
},
168+
{
169+
"type": "image_url",
170+
"image_url": {"url": "data:image/jpeg;base64,..."},
171+
},
172+
{
173+
"type": "image_url",
174+
"image_url": {"url": "data:image/jpeg;base64,..."},
175+
},
176+
],
177+
},
178+
{
179+
"role": "assistant",
180+
"content": [{"type": "text", "text": "Hi there!"}],
181+
},
182+
]
183+
},
184+
]
185+
186+
with file.open("w") as f:
187+
f.write("\n".join(json.dumps(item) for item in content))
188+
189+
report = check_file(file)
190+
191+
print(report)
192+
assert report["is_check_passed"]
193+
assert report["utf8"]
194+
assert report["num_samples"] == len(content)
195+
assert report["has_min_samples"]
196+
197+
123 198
def test_check_jsonl_empty_file(tmp_path: Path):
124 199
# Create an empty JSONL file
125 200
file = tmp_path / "empty.jsonl"

0 commit comments

Comments (0)