# Source: vortex/configs/evo2-40b-1m.yml (Zymrael/vortex, branch main)
# Checkpoint identifier.
# NOTE(review): the name contains "8k" while max_seqlen in this file is
# 1048576 - confirm the name is intentionally that of the base checkpoint.
model_name: shc-evo2-40b-8k-11T-v2

# Core dimensions
vocab_size: 512
hidden_size: 8192
num_layers: 50
# Number of independent filters in Hyena-LI
num_filters: 8192

# Layer-index assignment. The four lists are disjoint and together cover
# every index 0..49, so each of the num_layers layers appears exactly once.
# (hcs / hcm / hcl presumably denote the short / medium / long Hyena
# variants - confirm against the model code.)
hcs_layer_idxs: [0, 4, 7, 11, 14, 18, 21, 25, 28, 32, 36, 39, 43, 46]
hcm_layer_idxs: [1, 5, 8, 12, 15, 19, 22, 26, 29, 33, 37, 40, 44, 47]
hcl_layer_idxs: [2, 6, 9, 13, 16, 20, 23, 27, 30, 34, 38, 41, 45, 48]
attn_layer_idxs: [3, 10, 17, 24, 31, 35, 42, 49]

# Per-variant filter settings (this file sets no filter length for hcl).
hcs_filter_length: 7
hcs_filter_groups: 512
hcm_filter_length: 128
hcm_filter_groups: 512
hcl_filter_groups: 8192

# Length of the short, depthwise FIR applied to input projections
short_filter_length: 3
num_attention_heads: 64
short_filter_bias: false  # add bias to FIR
# Initializers are given as dotted names (plain strings in this file).
mlp_init_method: torch.nn.init.zeros_
mlp_output_init_method: torch.nn.init.zeros_
eps: 0.000001
state_size: 16
# Rotary position embedding settings.
# NOTE(review): 128 * 8192 = 1048576, which equals max_seqlen in this file;
# presumably the scaling factor stretches an 8k base context - confirm.
rotary_emb_base: 1000000
rotary_emb_scaling_factor: 128
use_interpolated_rotary_pos_emb: true
make_vocab_size_divisible_by: 8
# Force GLU inner_size to be a multiple of this value
# (inner_mlp_size below is 22528 = 176 * 128).
inner_size_multiple_of: 128
inner_mlp_size: 22528
log_intermediate_values: false
# Number of groups in GQA
proj_groups: 1
# Number of groups in grouped Hyena filters
# (original comment was truncated after "grouped" - confirm wording)
hyena_filter_groups: 1
# Split strategy for channels
column_split_hyena: false
column_split: true
interleave: true
# Layers > 0 use an nn.Identity activation (per the original note;
# confirm against the model code)
evo2_style_activations: true

use_fp8_input_projections: true

# Legacy options for MP / PP inference
model_parallel_size: 1
pipe_parallel_size: 1

# Embedding / projection options
tie_embeddings: true
mha_out_proj_bias: true
hyena_out_proj_bias: true
hyena_flip_x1x2: false
qkv_proj_bias: false

# Sequence and batch limits (1048576 = 2^20)
max_seqlen: 1048576
max_batch_size: 1

final_norm: true

# Optional kernel / backend toggles
use_flash_attn: true
use_flash_rmsnorm: false
use_flash_depthwise: false
use_flashfft: false
use_laughing_hyena: false

# Inference settings
inference_mode: true
tokenizer_type: CharLevelTokenizer
prefill_style: fft
mlp_activation: gelu
print_activations: false