-
Notifications
You must be signed in to change notification settings - Fork 16
Expand file tree
/
Copy pathapply_chat_template_config.yaml
More file actions
46 lines (43 loc) · 1.76 KB
/
apply_chat_template_config.yaml
File metadata and controls
46 lines (43 loc) · 1.76 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# Input/output locations and dataset-splitting options for applying the chat template.
settings:
  # Source and destination JSONL files.
  # NOTE(review): paths are presumably relative to the working directory -- confirm.
  src_path: 'data/smol-smoltalk_train_first_10K.jsonl'
  dst_path: 'prepared_data/smol-smoltalk_train_first_10K.jsonl'
  # Name of the data column holding the conversation; the template sees it as `messages`.
  messages_key: 'messages'
  # Config file for the pbin creation step (presumably run afterwards -- verify with the consumer).
  pbin_creation_config_file_path: 'configs/packed_chat_dataset_config.yaml'
  split_config:
    # Share per partition; the three values add up to 100 (presumably percentages -- TODO confirm).
    splitting:
      train: 70
      val: 15
      test: 15
    # Presumably seeds the split so that it is reproducible -- TODO confirm.
    seed: 1234
# Role labels: every role key on the left is mapped to the label on the right.
# NOTE(review): the template compares turn.role with chat_template_data.assistant_role
# ("Assistant"), so this mapping is presumably applied before rendering -- confirm.
instruction_data_transformation:
  role_mapping:
    system: 'System'
    user: 'User'
    assistant: 'Assistant'
# Jinja2 template that renders one conversation into a single text.
# - `messages` is a required template variable; it corresponds to the data column
#   named by settings.messages_key.
# - b_include_to_loss_token and e_include_to_loss_token are required to be part of
#   each chat template for proper loss masking!
# Rendering order: the system instruction plus '\n'; then, per turn, "<role>:",
# followed by b_include_to_loss_token for assistant turns or a single space otherwise,
# the turn content plus '\n', and -- for assistant turns only -- e_assistant_token and
# e_include_to_loss_token.
# NOTE(review): the body is a literal block scalar, so the line breaks between the
# statements are part of the string; whether they reach the rendered text depends on
# how the consumer loads the template -- confirm.
jinja2_chat_template: |
  {{ chat_template_data.system_instruction + '\n' }}
  {% for turn in messages %}
  {{ turn.role + ':' }}
  {% if turn.role == chat_template_data.assistant_role %}
  {{ chat_template_data.special_tokens.b_include_to_loss_token}}
  {% else %}
  {{ " " }}
  {% endif %}
  {{ turn.content + '\n'}}
  {% if turn.role == chat_template_data.assistant_role %}
  {{ chat_template_data.special_tokens.e_assistant_token}}
  {{ chat_template_data.special_tokens.e_include_to_loss_token}}
  {% endif %}
  {% endfor %}
# The key-value pairs of chat_template_data are passed to the Jinja2 template and
# are not type checked for full compliance with the chat template!
# Values made available to the Jinja2 template under the name chat_template_data.
chat_template_data:
  # Role label the template treats as the assistant; only its turns get the loss-span tokens.
  assistant_role: 'Assistant'
  # Emitted once, at the very start of every rendered conversation.
  system_instruction: >-
    You are Mody, a helpful assistant trained by the modalities team.
    Answer friendly and informatively to the user's messages.
  # Currently works only with HF tokenizers, because the special tokens are added to the tokenizer.
  special_tokens:
    # Open and close the span that the template wraps around assistant content.
    b_include_to_loss_token: '<|im_start|>'
    e_include_to_loss_token: '<|im_end|>'
    # Placed after the assistant content, directly before e_include_to_loss_token.
    e_assistant_token: '<|endoftext|>'