|
211 | 211 | "use_distributed_optimizer": true |
212 | 212 | } |
213 | 213 | }, |
| 214 | + { |
| 215 | + "id": "3_distopt_zero2", |
| 216 | + "args": { |
| 217 | + "dtype": "float32", |
| 218 | + "nthread_per_process": 8, |
| 219 | + "num_iteration": 10, |
| 220 | + "batch_size": 10, |
| 221 | + "total_batch_size": 5120, |
| 222 | + "use_distributed_optimizer": true, |
| 223 | + "zero_stage": 2 |
| 224 | + } |
| 225 | + }, |
214 | 226 | { |
215 | 227 | "id": "3_bfloat16_distopt", |
216 | 228 | "args": { |
|
222 | 234 | "use_distributed_optimizer": true |
223 | 235 | } |
224 | 236 | }, |
| 237 | + { |
| 238 | + "id": "3_bfloat16_distopt_zero2", |
| 239 | + "args": { |
| 240 | + "dtype": "bfloat16", |
| 241 | + "nthread_per_process": 8, |
| 242 | + "num_iteration": 10, |
| 243 | + "batch_size": 10, |
| 244 | + "total_batch_size": 5120, |
| 245 | + "use_distributed_optimizer": true, |
| 246 | + "zero_stage": 2 |
| 247 | + } |
| 248 | + }, |
225 | 249 | { |
226 | 250 | "id": "4_distopt", |
227 | 251 | "args": { |
|
234 | 258 | "use_distributed_optimizer": true |
235 | 259 | } |
236 | 260 | }, |
| 261 | + { |
| 262 | + "id": "4_distopt_zero2", |
| 263 | + "args": { |
| 264 | + "dtype": "float32", |
| 265 | + "nthread_per_process": 8, |
| 266 | + "num_iteration": 10, |
| 267 | + "batch_size": 40, |
| 268 | + "total_batch_size": 5120, |
| 269 | + "tensor_parallel": 4, |
| 270 | + "use_distributed_optimizer": true, |
| 271 | + "zero_stage": 2 |
| 272 | + } |
| 273 | + }, |
237 | 274 | { |
238 | 275 | "id": "4_bfloat16_distopt", |
239 | 276 | "args": { |
|
246 | 283 | "use_distributed_optimizer": true |
247 | 284 | } |
248 | 285 | }, |
| 286 | + { |
| 287 | + "id": "4_bfloat16_distopt_zero2", |
| 288 | + "args": { |
| 289 | + "dtype": "bfloat16", |
| 290 | + "nthread_per_process": 8, |
| 291 | + "num_iteration": 10, |
| 292 | + "batch_size": 40, |
| 293 | + "total_batch_size": 5120, |
| 294 | + "tensor_parallel": 4, |
| 295 | + "use_distributed_optimizer": true, |
| 296 | + "zero_stage": 2 |
| 297 | + } |
| 298 | + }, |
249 | 299 | { |
250 | 300 | "id": "5_distopt", |
251 | 301 | "args": { |
|
259 | 309 | "use_distributed_optimizer": true |
260 | 310 | } |
261 | 311 | }, |
| 312 | + { |
| 313 | + "id": "5_distopt_zero2", |
| 314 | + "args": { |
| 315 | + "dtype": "float32", |
| 316 | + "nthread_per_process": 8, |
| 317 | + "num_iteration": 10, |
| 318 | + "batch_size": 40, |
| 319 | + "total_batch_size": 5120, |
| 320 | + "tensor_parallel": 4, |
| 321 | + "sequence_parallel": true, |
| 322 | + "use_distributed_optimizer": true, |
| 323 | + "zero_stage": 2 |
| 324 | + } |
| 325 | + }, |
262 | 326 | { |
263 | 327 | "id": "5_bfloat16_distopt", |
264 | 328 | "args": { |
|
272 | 336 | "use_distributed_optimizer": true |
273 | 337 | } |
274 | 338 | }, |
| 339 | + { |
| 340 | + "id": "5_bfloat16_distopt_zero2", |
| 341 | + "args": { |
| 342 | + "dtype": "bfloat16", |
| 343 | + "nthread_per_process": 8, |
| 344 | + "num_iteration": 10, |
| 345 | + "batch_size": 40, |
| 346 | + "total_batch_size": 5120, |
| 347 | + "tensor_parallel": 4, |
| 348 | + "sequence_parallel": true, |
| 349 | + "use_distributed_optimizer": true, |
| 350 | + "zero_stage": 2 |
| 351 | + } |
| 352 | + }, |
275 | 353 | { |
276 | 354 | "id": "8_distopt", |
277 | 355 | "args": { |
|
0 commit comments