From 1102650eda9d2bcdf2b2a571144dd85168cfdbe6 Mon Sep 17 00:00:00 2001
From: Ali Taghibakhshi <ataghibakhsh@login-eos01.eos.clusters.nvidia.com>
Date: Mon, 8 Jul 2024 09:32:14 -0700
Subject: [PATCH 1/9] add mamba

---
 .../conf/fine_tuning/mamba/sft.yaml           | 237 ++++++++++++++++++
 launcher_scripts/nemo_launcher/core/stages.py |   3 +
 2 files changed, 240 insertions(+)
 create mode 100644 launcher_scripts/conf/fine_tuning/mamba/sft.yaml

diff --git a/launcher_scripts/conf/fine_tuning/mamba/sft.yaml b/launcher_scripts/conf/fine_tuning/mamba/sft.yaml
new file mode 100644
index 000000000..30cdbf1f9
--- /dev/null
+++ b/launcher_scripts/conf/fine_tuning/mamba/sft.yaml
@@ -0,0 +1,237 @@
+run:
+  name: sft_mamba
+  results_dir: ${base_results_dir}/${fine_tuning.run.name}
+  time_limit: "00:45:00"
+  dependency: "singleton"
+
+trainer:
+  devices: 8
+  accelerator: gpu
+  num_nodes: 1
+  precision: bf16
+  logger: False # logger provided by exp_manager
+  enable_checkpointing: False
+  use_distributed_sampler: False
+  max_epochs: 9999
+  max_steps: 10000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
+  log_every_n_steps: 1 # frequency with which training steps are logged
+  val_check_interval: 200 # If an int n > 1, runs validation every n training steps; if a float in 0.0 - 1.0, runs validation at that fraction of each epoch, e.g. 0.25 runs validation every quarter epoch
+  gradient_clip_val: 1.0
+  limit_val_batches: 1024
+  limit_test_batches: 500
+
+exp_manager:
+  explicit_log_dir: ${fine_tuning.run.results_dir}/results
+  exp_dir: null
+  name: ${name}
+  create_wandb_logger: True
+  wandb_logger_kwargs:
+    project: mamba
+    name: sft-test
+  resume_if_exists: False
+  resume_ignore_no_checkpoint: True
+  create_checkpoint_callback: True
+  checkpoint_callback_params:
+    monitor: validation_${model.data.validation_ds.metric.name}
+    save_top_k: 1
+    mode: min
+    save_nemo_on_train_end: True
+    filename: '${name}--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}'
+    model_parallel_size: ${fine_tuning.model.tensor_model_parallel_size}
+    always_save_nemo: False
+    save_best_model: True
+  create_early_stopping_callback: True
+  early_stopping_callback_params:
+    monitor: "val_loss"
+    mode: "min"
+    min_delta: 0.001
+    patience: 10
+    verbose: True
+    strict: False # Should be False to avoid a runtime error where EarlyStopping says monitor is unavailable, which sometimes happens with resumed training.
+
+
+model:
+  seed: 1234
+  tensor_model_parallel_size: 1 # intra-layer model parallelism
+  pipeline_model_parallel_size: 1 # inter-layer model parallelism
+
+  encoder_seq_length: 1024
+  global_batch_size: 8
+  micro_batch_size: 1
+  restore_from_path: ??? # Path to an existing .nemo model you wish to add new tasks to or run inference with
+  resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.
+  save_nemo_on_validation_end: False # Saves an inference ready .nemo file every time a checkpoint is saved during training.
+  sync_batch_comm: False
+  megatron_amp_O2: False
+
+  ## Sequence Parallelism
+  # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially
+  # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details.
+  sequence_parallel: False
+
+  ## Activation Checkpoint
+  activations_checkpoint_granularity: null # 'selective' or 'full'
+  activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective'
+  # 'uniform' divides the total number of transformer layers and checkpoints the input activation
+  # of each chunk at the specified granularity
+  # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity
+  activations_checkpoint_num_layers: null # not used with 'selective'
+  activations_checkpoint_layers_per_pipeline: null
+  answer_only_loss: True
+  gradient_as_bucket_view: False
+
+  hidden_dropout: 0.0
+  attention_dropout: 0.0
+  ffn_dropout: 0.0
+  
+  peft:
+    peft_scheme: "lora"  # can be either adapter,ia3, lora, or ptuning
+    restore_from_path: null
+
+    # Used for adapter peft training
+    adapter_tuning:
+      type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter'
+      adapter_dim: 32
+      adapter_dropout: 0.0
+      norm_position: 'pre' # This can be set to 'pre', 'post' or null, 'pre' is normally what is used.
+      column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal
+      row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal
+      norm_type: 'mixedfusedlayernorm' # IGNORED if linear_adapter is used, options are ['layernorm', 'mixedfusedlayernorm']
+      layer_selection: null  # selects in which layers to add adapters, e.g. [1,12] will add adapters to layer 1 (lowest) and 12. null will apply adapters to all layers
+      weight_tying: False
+      position_embedding_strategy: null # used only when weight_tying is True
+
+    lora_tuning:
+      target_modules: ['all'] # this can either be 'attention_qkv','attention_dense','mlp_fc1','mlp_fc2', attention (qkv & dense), mlp (fc1 & fc2)
+      adapter_dim: 32
+      alpha: 32
+      adapter_dropout: 0.0
+      column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal
+      row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal
+      layer_selection: null # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers
+      weight_tying: False
+      position_embedding_strategy: null # used only when weight_tying is True
+
+    # Used for p-tuning peft training
+    p_tuning:
+      virtual_tokens: 10  # The number of virtual tokens the prompt encoder should add at the start of the sequence
+      bottleneck_dim: 1024  # the size of the prompt encoder mlp bottleneck
+      embedding_dim: 1024  # the size of the prompt encoder embeddings
+      init_std: 0.023
+
+    ia3_tuning:
+      layer_selection:  null  # selects in which layers to add ia3 adapters. e.g. [1,12] will add ia3 adapters to layer 1 (lowest) and 12. null will apply adapters to all layers
+    
+    selective_tuning:
+      tunable_base_param_names: ["self_attention", "word_embeddings"]  # TODO: regex support @adithyre
+
+
+  data:
+    train_ds:
+      # Example of how to specify paths to multiple datasets
+      # file_names:
+      #   - /path/to/squad.jsonl
+      #   - /path/to/mnli.jsonl
+      #   - /path/to/boolq.jsonl
+      # Example of how each dataset is formatted
+      # {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'}
+      file_names: ??? # Path to a list of JSONL files corresponding to the source data.
+      global_batch_size: ${fine_tuning.model.global_batch_size}
+      micro_batch_size: ${fine_tuning.model.micro_batch_size}
+      shuffle: True
+      num_workers: 0
+      memmap_workers: 2
+      pin_memory: True
+      max_seq_length: 2048
+      min_seq_length: 1
+      drop_last: True
+      # Example of how to specify concat_sampling_probabilities
+      # concat_sampling_probabilities:
+      #   - 0.5
+      #   - 0.25
+      #   - 0.25
+      concat_sampling_probabilities: [1.0] # When providing a list of datasets, this arg defines the sampling probabilities from each dataset when strategy='random'
+      label_key: 'output'
+      add_eos: True
+      add_sep: False
+      add_bos: True
+      truncation_field: "input" # Can be multiple keys separated with ','. Options: keys in prompt_template
+      index_mapping_dir: null # Path to a directory to write index mapping files.
+      prompt_template: "{input} {output}" # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}"
+      truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] 
+      ceil_to_power_2: True
+      get_attention_mask_from_fusion: True
+      pad_to_max_length: True
+    validation_ds:
+        file_names: ??? # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds.
+        names: null # Names of the corresponding datasets used to log metrics.
+        global_batch_size: ${fine_tuning.model.global_batch_size}
+        micro_batch_size: ${fine_tuning.model.micro_batch_size}
+        shuffle: False
+        num_workers: 0
+        memmap_workers: ${model.data.train_ds.memmap_workers}
+        pin_memory: True
+        max_seq_length: 2048
+        min_seq_length: 1
+        drop_last: False
+        label_key: ${fine_tuning.model.data.train_ds.label_key}
+        add_eos: ${fine_tuning.model.data.train_ds.add_eos}
+        add_sep: ${fine_tuning.model.data.train_ds.add_sep}
+        add_bos: ${fine_tuning.model.data.train_ds.add_bos}
+        write_predictions_to_file: False
+        output_file_path_prefix: null # Prefix of the file to write predictions to.
+        truncation_field: ${fine_tuning.model.data.train_ds.truncation_field} # Options: keys in prompt_template
+        index_mapping_dir: null # Path to a directory to write index mapping files.
+        prompt_template: ${fine_tuning.model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}"
+        tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics
+        truncation_method: 'right' # Truncation from which position, Options: ['left', 'right']
+        ceil_to_power_2: True
+        get_attention_mask_from_fusion: True
+        pad_to_max_length: True
+        metric:
+          name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss']
+          average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported.
+          num_classes: null
+    test_ds:
+      file_names: null # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds.
+      names: null # Names of the corresponding datasets used to log metrics.
+      global_batch_size: ${fine_tuning.model.global_batch_size}
+      micro_batch_size: ${fine_tuning.model.micro_batch_size}
+      shuffle: False
+      num_workers: 0
+      memmap_workers: ${fine_tuning.model.data.train_ds.memmap_workers}
+      pin_memory: True
+      max_seq_length: 2048
+      min_seq_length: 1
+      drop_last: False
+      label_key: ${fine_tuning.model.data.train_ds.label_key}
+      add_eos: ${fine_tuning.model.data.train_ds.add_eos}
+      add_sep: ${fine_tuning.model.data.train_ds.add_sep}
+      add_bos: ${fine_tuning.model.data.train_ds.add_bos}
+      write_predictions_to_file: False
+      output_file_path_prefix: null # Prefix of the file to write predictions to.
+      truncation_field: ${fine_tuning.model.data.train_ds.truncation_field} # Options: keys in prompt_template
+      index_mapping_dir: null # Path to a directory to write index mapping files.
+      prompt_template: ${fine_tuning.model.data.train_ds.prompt_template}
+      tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics
+      truncation_method: 'right' # Truncation from which position, Options: ['left', 'right']
+      ceil_to_power_2: True
+      get_attention_mask_from_fusion: True
+      pad_to_max_length: True
+      metric:
+        name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss']
+        average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported.
+        num_classes: null
+
+  optim:
+    name: distributed_fused_adam
+    lr: 2e-4
+    weight_decay: 0.01 
+    betas: 
+    - 0.9
+    - 0.98
+    sched:
+      name: CosineAnnealing
+      warmup_steps: 500
+      constant_steps: 50000
+      min_lr: 2e-5
diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py
index c6daa0239..79b1bfdaf 100755
--- a/launcher_scripts/nemo_launcher/core/stages.py
+++ b/launcher_scripts/nemo_launcher/core/stages.py
@@ -42,6 +42,7 @@
     "gemma",
     "falcon",
     "baichuan2",
+    "mamba",
     "mistral",
     "mistral_embedding",
     "mixtral",
@@ -1004,6 +1005,8 @@ def _get_nemo_code_path(self, model_type: str) -> Path:
             / "examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py",
             "t5": self._nemo_code_path
             / "examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py",
+            "mamba": self._nemo_code_path
+            / "examples/nlp/language_modeling/tuning/megatron_mamba_finetuning.py",
             "mt5": self._nemo_code_path
             / "examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py",
             "falcon": self._nemo_code_path

From 959ac872531683d7adbca3585a1a0a539cdc3835 Mon Sep 17 00:00:00 2001
From: Ali Taghibakhshi <ataghibakhsh@login-eos01.eos.clusters.nvidia.com>
Date: Mon, 8 Jul 2024 11:39:19 -0700
Subject: [PATCH 2/9] fix naming

---
 launcher_scripts/conf/fine_tuning/mamba/sft.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/launcher_scripts/conf/fine_tuning/mamba/sft.yaml b/launcher_scripts/conf/fine_tuning/mamba/sft.yaml
index 30cdbf1f9..705436e13 100644
--- a/launcher_scripts/conf/fine_tuning/mamba/sft.yaml
+++ b/launcher_scripts/conf/fine_tuning/mamba/sft.yaml
@@ -23,7 +23,7 @@ trainer:
 exp_manager:
   explicit_log_dir: ${fine_tuning.run.results_dir}/results
   exp_dir: null
-  name: ${name}
+  name: mamba
   create_wandb_logger: True
   wandb_logger_kwargs:
     project: mamba

From ec44ff58f60689d3fc97e5382eb10c19e0c53328 Mon Sep 17 00:00:00 2001
From: Ali Taghibakhshi <ataghibakhsh@login-eos01.eos.clusters.nvidia.com>
Date: Mon, 8 Jul 2024 11:53:44 -0700
Subject: [PATCH 3/9] fix more naming

---
 launcher_scripts/conf/fine_tuning/mamba/sft.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/launcher_scripts/conf/fine_tuning/mamba/sft.yaml b/launcher_scripts/conf/fine_tuning/mamba/sft.yaml
index 705436e13..975f8aa57 100644
--- a/launcher_scripts/conf/fine_tuning/mamba/sft.yaml
+++ b/launcher_scripts/conf/fine_tuning/mamba/sft.yaml
@@ -164,7 +164,7 @@ model:
       pad_to_max_length: True
     validation_ds:
         file_names: ??? # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds.
-        names: null # Names of the corresponding datasets used to log metrics.
+        name: "squad" # Names of the corresponding datasets used to log metrics.
         global_batch_size: ${fine_tuning.model.global_batch_size}
         micro_batch_size: ${fine_tuning.model.micro_batch_size}
         shuffle: False

From f3267f5cd5cc92dcc50e2ae5b2c850ac7865579c Mon Sep 17 00:00:00 2001
From: Ali Taghibakhshi <ataghibakhsh@login-eos01.eos.clusters.nvidia.com>
Date: Mon, 8 Jul 2024 11:56:44 -0700
Subject: [PATCH 4/9] fix more naming

---
 launcher_scripts/conf/fine_tuning/mamba/sft.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/launcher_scripts/conf/fine_tuning/mamba/sft.yaml b/launcher_scripts/conf/fine_tuning/mamba/sft.yaml
index 975f8aa57..891550584 100644
--- a/launcher_scripts/conf/fine_tuning/mamba/sft.yaml
+++ b/launcher_scripts/conf/fine_tuning/mamba/sft.yaml
@@ -32,7 +32,7 @@ exp_manager:
   resume_ignore_no_checkpoint: True
   create_checkpoint_callback: True
   checkpoint_callback_params:
-    monitor: validation_${model.data.validation_ds.metric.name}
+    monitor: validation_${fine_tuning.model.data.validation_ds.metric.name}
     save_top_k: 1
     mode: min
     save_nemo_on_train_end: True

From 0aefd0e623c13d8a783acd5298add576f2cb8eeb Mon Sep 17 00:00:00 2001
From: Ali Taghibakhshi <ataghibakhsh@login-eos01.eos.clusters.nvidia.com>
Date: Mon, 8 Jul 2024 12:02:51 -0700
Subject: [PATCH 5/9] fix more naming

---
 launcher_scripts/conf/fine_tuning/mamba/sft.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/launcher_scripts/conf/fine_tuning/mamba/sft.yaml b/launcher_scripts/conf/fine_tuning/mamba/sft.yaml
index 891550584..60a7625bc 100644
--- a/launcher_scripts/conf/fine_tuning/mamba/sft.yaml
+++ b/launcher_scripts/conf/fine_tuning/mamba/sft.yaml
@@ -36,7 +36,7 @@ exp_manager:
     save_top_k: 1
     mode: min
     save_nemo_on_train_end: True
-    filename: '${name}--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}'
+    filename: '${fine_tuning.run.name}--{${fine_tuning.exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}'
     model_parallel_size: ${fine_tuning.model.tensor_model_parallel_size}
     always_save_nemo: False
     save_best_model: True

From 0acc834d4155b6b31956172f611b608ee5a4f1e3 Mon Sep 17 00:00:00 2001
From: Ali Taghibakhshi <ataghibakhsh@login-eos01.eos.clusters.nvidia.com>
Date: Mon, 8 Jul 2024 12:06:54 -0700
Subject: [PATCH 6/9] fix more naming

---
 launcher_scripts/conf/fine_tuning/mamba/sft.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/launcher_scripts/conf/fine_tuning/mamba/sft.yaml b/launcher_scripts/conf/fine_tuning/mamba/sft.yaml
index 60a7625bc..c16039fd0 100644
--- a/launcher_scripts/conf/fine_tuning/mamba/sft.yaml
+++ b/launcher_scripts/conf/fine_tuning/mamba/sft.yaml
@@ -169,7 +169,7 @@ model:
         micro_batch_size: ${fine_tuning.model.micro_batch_size}
         shuffle: False
         num_workers: 0
-        memmap_workers: ${model.data.train_ds.memmap_workers}
+        memmap_workers: ${fine_tuning.model.data.train_ds.memmap_workers}
         pin_memory: True
         max_seq_length: 2048
         min_seq_length: 1

From 1b43a8fe315803d5d8f5acf7defa1ec6491b157b Mon Sep 17 00:00:00 2001
From: Ali Taghibakhshi <ataghibakhsh@login-eos01.eos.clusters.nvidia.com>
Date: Tue, 9 Jul 2024 08:56:04 -0700
Subject: [PATCH 7/9] set peft default to none

---
 launcher_scripts/conf/fine_tuning/mamba/sft.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/launcher_scripts/conf/fine_tuning/mamba/sft.yaml b/launcher_scripts/conf/fine_tuning/mamba/sft.yaml
index c16039fd0..404a807bc 100644
--- a/launcher_scripts/conf/fine_tuning/mamba/sft.yaml
+++ b/launcher_scripts/conf/fine_tuning/mamba/sft.yaml
@@ -85,7 +85,7 @@ model:
   ffn_dropout: 0.0
   
   peft:
-    peft_scheme: "lora"  # can be either adapter,ia3, lora, or ptuning
+    peft_scheme: "null"  # can be either adapter,ia3, lora, or ptuning
     restore_from_path: null
 
     # Used for adapter peft training

From 7080399984b1543644b98f19f163d1c8d01b5b18 Mon Sep 17 00:00:00 2001
From: Ali Taghibakhshi <ataghibakhsh@login-eos01.eos.clusters.nvidia.com>
Date: Tue, 9 Jul 2024 09:11:45 -0700
Subject: [PATCH 8/9] set peft default to none

---
 launcher_scripts/conf/fine_tuning/mamba/sft.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/launcher_scripts/conf/fine_tuning/mamba/sft.yaml b/launcher_scripts/conf/fine_tuning/mamba/sft.yaml
index 404a807bc..393f8b335 100644
--- a/launcher_scripts/conf/fine_tuning/mamba/sft.yaml
+++ b/launcher_scripts/conf/fine_tuning/mamba/sft.yaml
@@ -85,7 +85,7 @@ model:
   ffn_dropout: 0.0
   
   peft:
-    peft_scheme: "null"  # can be either adapter,ia3, lora, or ptuning
+    peft_scheme: "none"  # can be either adapter, ia3, lora, ptuning, or none (the default set here)
     restore_from_path: null
 
     # Used for adapter peft training

From 8c7e32b7b8f9e80321cbd10ebc9f67bb22d883ec Mon Sep 17 00:00:00 2001
From: Ali Taghibakhshi <ataghibakhsh@login-eos01.eos.clusters.nvidia.com>
Date: Tue, 9 Jul 2024 10:07:46 -0700
Subject: [PATCH 9/9] wandb to false

---
 launcher_scripts/conf/fine_tuning/mamba/sft.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/launcher_scripts/conf/fine_tuning/mamba/sft.yaml b/launcher_scripts/conf/fine_tuning/mamba/sft.yaml
index 393f8b335..b3b2b4770 100644
--- a/launcher_scripts/conf/fine_tuning/mamba/sft.yaml
+++ b/launcher_scripts/conf/fine_tuning/mamba/sft.yaml
@@ -24,7 +24,7 @@ exp_manager:
   explicit_log_dir: ${fine_tuning.run.results_dir}/results
   exp_dir: null
   name: mamba
-  create_wandb_logger: True
+  create_wandb_logger: False
   wandb_logger_kwargs:
     project: mamba
     name: sft-test