From 1cf72d3f6c640c114843cd98e391c44a3b7a1c9e Mon Sep 17 00:00:00 2001 From: sekyondaMeta <127536312+sekyondaMeta@users.noreply.github.com> Date: Mon, 30 Mar 2026 14:42:39 -0400 Subject: [PATCH 1/5] Test Mosaic Tutorial Post 2.11 --- .jenkins/validate_tutorials_built.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.jenkins/validate_tutorials_built.py b/.jenkins/validate_tutorials_built.py index c19cae44bd..4e961a9317 100644 --- a/.jenkins/validate_tutorials_built.py +++ b/.jenkins/validate_tutorials_built.py @@ -39,7 +39,7 @@ "intermediate_source/tensorboard_profiler_tutorial", # reenable after 2.0 release. "advanced_source/semi_structured_sparse", # reenable after 3303 is fixed. "intermediate_source/torchrec_intro_tutorial.py", #failing with 2.8 reenable after 3498 - "beginner_source/mosaic_memory_profiling_tutorial.py", # failing with 2.11 issue #3774 + #"beginner_source/mosaic_memory_profiling_tutorial.py", # failing with 2.11 issue #3774 ] def tutorial_source_dirs() -> List[Path]: From 3c8fc5994c2f9c0ad4f366bf686e17ab8b71649b Mon Sep 17 00:00:00 2001 From: sekyondaMeta <127536312+sekyondaMeta@users.noreply.github.com> Date: Mon, 30 Mar 2026 19:00:15 -0400 Subject: [PATCH 2/5] Update validate_tutorials_built.py --- .jenkins/validate_tutorials_built.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.jenkins/validate_tutorials_built.py b/.jenkins/validate_tutorials_built.py index 4e961a9317..33be1680f0 100644 --- a/.jenkins/validate_tutorials_built.py +++ b/.jenkins/validate_tutorials_built.py @@ -39,7 +39,7 @@ "intermediate_source/tensorboard_profiler_tutorial", # reenable after 2.0 release. "advanced_source/semi_structured_sparse", # reenable after 3303 is fixed. 
"intermediate_source/torchrec_intro_tutorial.py", #failing with 2.8 reenable after 3498 - #"beginner_source/mosaic_memory_profiling_tutorial.py", # failing with 2.11 issue #3774 + #"beginner_source/mosaic_memory_profiling_tutorial.py", # failing with 2.11 RC issue #3774 ] def tutorial_source_dirs() -> List[Path]: From 6e486afced64bf494a1ea593e048762377186c95 Mon Sep 17 00:00:00 2001 From: sekyonda <127536312+sekyondaMeta@users.noreply.github.com> Date: Wed, 1 Apr 2026 10:28:54 -0400 Subject: [PATCH 3/5] Fix activation checkpointing crash by using use_reentrant=False Switches gradient_checkpointing_enable() to use non-reentrant checkpointing, which properly preserves dropout RNG state during recomputation and resolves the SystemError during loss.backward(). Issue: #3774 --- beginner_source/mosaic_memory_profiling_tutorial.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/beginner_source/mosaic_memory_profiling_tutorial.py b/beginner_source/mosaic_memory_profiling_tutorial.py index db188a5e90..9262953f24 100644 --- a/beginner_source/mosaic_memory_profiling_tutorial.py +++ b/beginner_source/mosaic_memory_profiling_tutorial.py @@ -309,7 +309,9 @@ def run_training_ac( model = GPT2LMHeadModel.from_pretrained("gpt2") if activation_checkpointing: - model.gradient_checkpointing_enable() + model.gradient_checkpointing_enable( + gradient_checkpointing_kwargs={"use_reentrant": False} + ) print("Activation checkpointing is ENABLED") else: print("Activation checkpointing is DISABLED") From f92d01b67fb6858f91c94a3765c8b55eb6b7a423 Mon Sep 17 00:00:00 2001 From: sekyonda <127536312+sekyondaMeta@users.noreply.github.com> Date: Wed, 1 Apr 2026 11:02:34 -0400 Subject: [PATCH 4/5] Revert "Fix activation checkpointing crash by using use_reentrant=False" This reverts commit 6e486afced64bf494a1ea593e048762377186c95. 
--- beginner_source/mosaic_memory_profiling_tutorial.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/beginner_source/mosaic_memory_profiling_tutorial.py b/beginner_source/mosaic_memory_profiling_tutorial.py index 9262953f24..db188a5e90 100644 --- a/beginner_source/mosaic_memory_profiling_tutorial.py +++ b/beginner_source/mosaic_memory_profiling_tutorial.py @@ -309,9 +309,7 @@ def run_training_ac( model = GPT2LMHeadModel.from_pretrained("gpt2") if activation_checkpointing: - model.gradient_checkpointing_enable( - gradient_checkpointing_kwargs={"use_reentrant": False} - ) + model.gradient_checkpointing_enable() print("Activation checkpointing is ENABLED") else: print("Activation checkpointing is DISABLED") From 2eaef420fce16e8f6d5f23cb0fdaede065ca6e67 Mon Sep 17 00:00:00 2001 From: sekyonda <127536312+sekyondaMeta@users.noreply.github.com> Date: Wed, 1 Apr 2026 12:04:30 -0400 Subject: [PATCH 5/5] Disable dropout to work around PyTorch 2.11 checkpoint recomputation bug Disable dropout (resid_pdrop=0, attn_pdrop=0, embd_pdrop=0) in the run_training_ac function to avoid SystemError from _VF.dropout returning NULL during backward recomputation of GPT2Block. Dropout is irrelevant to the memory profiling purpose of this tutorial. Issue: #3774 --- beginner_source/mosaic_memory_profiling_tutorial.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/beginner_source/mosaic_memory_profiling_tutorial.py b/beginner_source/mosaic_memory_profiling_tutorial.py index db188a5e90..4d0a19ee3e 100644 --- a/beginner_source/mosaic_memory_profiling_tutorial.py +++ b/beginner_source/mosaic_memory_profiling_tutorial.py @@ -306,7 +306,13 @@ def run_training_ac( # Load model print(f"Loading GPT-2 (activation_checkpointing={activation_checkpointing})...") - model = GPT2LMHeadModel.from_pretrained("gpt2") + # Disable dropout to avoid PyTorch 2.11 checkpoint recomputation bug (#3774).
+ # _VF.dropout returns NULL without setting an exception during backward + # recomputation of GPT2Block. Dropout is irrelevant to memory profiling. + # Original: model = GPT2LMHeadModel.from_pretrained("gpt2") + model = GPT2LMHeadModel.from_pretrained( + "gpt2", resid_pdrop=0, attn_pdrop=0, embd_pdrop=0 + ) if activation_checkpointing: model.gradient_checkpointing_enable()