Skip to content

Commit 915723b

Browse files
committed
up
1 parent bed9aee commit 915723b

5 files changed

Lines changed: 248 additions & 73 deletions

File tree

backends/apple/coreml/compiler/coreml_preprocess.py

Lines changed: 33 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -48,14 +48,37 @@ class COMPILE_SPEC_KEYS(Enum):
4848

4949

5050
class MULTIMETHOD_WEIGHT_SHARING_STRATEGY(Enum):
51-
# Methods are processed independently with no weight sharing.
51+
"""Strategy for sharing weights across methods in multi-method models.
52+
53+
When exporting a model with multiple methods (e.g., prefill and decode),
54+
these strategies control how CoreML models are organized and how weights
55+
are shared. Different strategies have different tradeoffs — experiment
56+
with them to find the best fit for your use case.
57+
58+
DISABLED:
59+
Each method is compiled into its own independent CoreML model.
60+
No weight sharing occurs; weights are duplicated across methods.
61+
Simplest strategy with no constraints on model structure.
62+
63+
POSITIONAL:
64+
Partitions are aligned by index across methods. Partition 0 from
65+
all methods are combined into one multifunction CoreML model,
66+
partition 1 into another, and so on. This enables weight sharing
67+
for parameters that appear at the same partition index. Requires
68+
all methods to have the same number of partitions.
69+
70+
ONE_BLOB:
71+
All partitions from all methods are packed into a single
72+
multifunction CoreML model. This maximizes weight sharing
73+
opportunities (any parameter can be shared across any method)
74+
and does not require partition counts to match. However, it may
75+
result in longer compile times and higher peak memory since the
76+
entire model — including any method-specific (non-shared) weights
77+
— lives in a single blob.
78+
"""
79+
5280
DISABLED = "disabled"
53-
# Partitions must align positionally across methods; enables weight sharing
54-
# via NamedDataStore. Raises an error if partition counts don't match.
5581
POSITIONAL = "positional"
56-
# All partitions from all methods are combined into a single multifunction
57-
# model. No partition count alignment is required. Function names use
58-
# "{method_name}__{partition_idx}" encoding.
5982
ONE_BLOB = "one_blob"
6083

6184

@@ -843,7 +866,9 @@ def _preprocess_positional(
843866
f"Method '{method_name}' has {len(programs)} partitions, but "
844867
f"'{first_method}' has {num_partitions}. POSITIONAL weight sharing "
845868
"strategy requires all methods to have the same number of partitions. "
846-
"Use MULTIMETHOD_WEIGHT_SHARING_STRATEGY.DISABLED if methods should "
869+
"Use MULTIMETHOD_WEIGHT_SHARING_STRATEGY.ONE_BLOB (which supports "
870+
"different partition counts per method) or "
871+
"MULTIMETHOD_WEIGHT_SHARING_STRATEGY.DISABLED if methods should "
847872
"be processed independently."
848873
)
849874

@@ -1034,7 +1059,7 @@ def _preprocess_one_blob(
10341059
method_spec = method_model.get_spec()
10351060
input_names = [inp.name for inp in method_spec.description.input]
10361061
output_names = [out.name for out in method_spec.description.output]
1037-
methods_metadata[method_name] = MethodMetadata(
1062+
methods_metadata[function_name] = MethodMetadata(
10381063
inputNames=input_names,
10391064
outputNames=output_names,
10401065
)

backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -656,16 +656,22 @@ - (nullable ETCoreMLAsset *)modelAssetWithMetadata:(const ModelMetadata&)metadat
656656
return nil;
657657
}
658658

659-
std::string method_name_str = [methodName UTF8String];
660-
const MethodMetadata* method_metadata = metadataValue.get_method_metadata(method_name_str);
659+
if (functionName == nil || functionName.length == 0) {
660+
ETCoreMLLogErrorAndSetNSError(error,
661+
ETCoreMLErrorCorruptedModel,
662+
"functionName must be non-nil and non-empty for multifunction model metadata lookup.");
663+
return nil;
664+
}
665+
std::string lookup_key = [functionName UTF8String];
666+
const MethodMetadata* method_metadata = metadataValue.get_method_metadata(lookup_key);
661667
if (method_metadata != nullptr) {
662668
metadataValue.input_names = method_metadata->input_names;
663669
metadataValue.output_names = method_metadata->output_names;
664670
} else {
665671
ETCoreMLLogErrorAndSetNSError(error,
666672
ETCoreMLErrorCorruptedModel,
667-
"Method '%@' not found in multifunction model metadata.",
668-
methodName);
673+
"Function '%@' not found in multifunction model metadata.",
674+
functionName);
669675
return nil;
670676
}
671677
}

backends/apple/coreml/test/test_coreml_multifunction.py

Lines changed: 89 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,14 @@
99

1010
import coremltools as ct
1111
import torch
12-
12+
import torch.nn as nn
1313
from executorch.backends.apple.coreml.compiler.coreml_preprocess import (
1414
CoreMLBackend,
1515
MULTIMETHOD_WEIGHT_SHARING_STRATEGY,
1616
)
1717
from executorch.backends.apple.coreml.partition import CoreMLPartitioner
1818
from executorch.exir import EdgeCompileConfig, to_edge_transform_and_lower
19+
from executorch.exir.graph_break import remove_graph_break_ops
1920

2021

2122
def is_fbcode():
@@ -320,6 +321,92 @@ def test_multifunction_one_blob_simple_model(self):
320321
)
321322
)
322323

324+
def test_multifunction_one_blob_multiple_partitions(self):
325+
"""Test ONE_BLOB with multiple partitions per method.
326+
327+
Uses graph breaks to force the CoreML partitioner to create multiple
328+
partitions within each method (forward and prefill). The two partitions
329+
have a different number of inputs and outputs so their metadata
330+
(input/output name lists) differ.
331+
332+
Partition 0: 1 input (x) → 2 outputs (a, b)
333+
Partition 1: 2 inputs (a, b) → 1 output (result)
334+
"""
335+
336+
class _GraphBreak(nn.Module):
337+
def forward(self, x):
338+
return torch.ops.executorch_utils.graph_break.Tensor(x)
339+
340+
class MultiPartitionModel(nn.Module):
341+
def __init__(self):
342+
super().__init__()
343+
self.linear_a = nn.Linear(16, 16)
344+
self.linear_b = nn.Linear(16, 16)
345+
self.graph_break_a = _GraphBreak()
346+
self.graph_break_b = _GraphBreak()
347+
self.linear_out = nn.Linear(32, 16)
348+
349+
def forward(self, x):
350+
a = self.linear_a(x)
351+
b = self.linear_b(x)
352+
a = self.graph_break_a(a)
353+
b = self.graph_break_b(b)
354+
combined = torch.cat([a, b], dim=-1)
355+
return self.linear_out(combined)
356+
357+
model = MultiPartitionModel()
358+
model.eval()
359+
360+
decode_inputs = (torch.randn(1, 1, 16),)
361+
prefill_inputs = (torch.randn(1, 8, 16),)
362+
363+
exported_programs = {
364+
"forward": torch.export.export(model, decode_inputs),
365+
"prefill": torch.export.export(model, prefill_inputs),
366+
}
367+
368+
partitioner = CoreMLPartitioner(
369+
compile_specs=self._get_compile_specs(
370+
strategy=MULTIMETHOD_WEIGHT_SHARING_STRATEGY.ONE_BLOB,
371+
),
372+
)
373+
374+
edge_manager = to_edge_transform_and_lower(
375+
exported_programs,
376+
partitioner=[partitioner],
377+
compile_config=self.edge_compile_config,
378+
)
379+
380+
self.assertIn("forward", edge_manager.methods)
381+
self.assertIn("prefill", edge_manager.methods)
382+
383+
remove_graph_break_ops(edge_manager)
384+
385+
et_program = edge_manager.to_executorch()
386+
387+
if _TEST_RUNTIME:
388+
runtime = Runtime.get()
389+
program = runtime.load_program(et_program.buffer)
390+
391+
self.assertIn("forward", program.method_names)
392+
self.assertIn("prefill", program.method_names)
393+
394+
forward_method = program.load_method("forward")
395+
decode_output = forward_method.execute(decode_inputs)
396+
expected_decode = model(*decode_inputs)
397+
self.assertTrue(
398+
torch.allclose(decode_output[0], expected_decode, atol=1e-4, rtol=1e-4)
399+
)
400+
401+
prefill_method = program.load_method("prefill")
402+
prefill_output = prefill_method.execute(prefill_inputs)
403+
expected_prefill = model(*prefill_inputs)
404+
self.assertTrue(
405+
torch.allclose(
406+
prefill_output[0], expected_prefill, atol=1e-4, rtol=1e-4
407+
)
408+
)
409+
323410

324411
if __name__ == "__main__":
325412
test_runner = TestCoreMLMultifunction()
@@ -328,4 +415,5 @@ def test_multifunction_one_blob_simple_model(self):
328415
test_runner.test_multifunction_without_weight_sharing()
329416
test_runner.test_multifunction_with_constant_methods()
330417
test_runner.test_multifunction_one_blob_simple_model()
418+
test_runner.test_multifunction_one_blob_multiple_partitions()
331419
print("All tests passed!")

examples/apple/coreml/llama/export_static_llm_coreml.py

Lines changed: 2 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,7 @@
2424

2525
import coremltools as ct
2626
import torch
27-
import torch.nn as nn
2827
import torch.utils._pytree as pytree
29-
3028
from executorch.backends.apple.coreml.compiler.coreml_preprocess import (
3129
CoreMLBackend,
3230
MULTIMETHOD_WEIGHT_SHARING_STRATEGY,
@@ -42,69 +40,13 @@
4240
from executorch.exir import EdgeCompileConfig, to_edge_transform_and_lower
4341
from executorch.exir.backend.utils import format_delegated_graph
4442
from executorch.exir.capture._config import ExecutorchBackendConfig
43+
from executorch.exir.graph_break import BlockWithGraphBreak, remove_graph_break_ops
4544
from executorch.exir.passes import MemoryPlanningPass
4645
from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass
4746
from executorch.extension.export_util.utils import save_pte_program
48-
from torch.library import impl, Library
4947
from torchao.quantization.granularity import PerAxis, PerGroup
5048
from torchao.quantization.quant_api import IntxWeightOnlyConfig, quantize_
5149

52-
# Define custom graph break op
53-
lib = Library("executorch_utils", "DEF")
54-
lib.define("graph_break.Tensor(Tensor x) -> Tensor")
55-
56-
57-
@impl(lib, "graph_break.Tensor", "CompositeExplicitAutograd")
58-
def graph_break_impl(x):
59-
return x
60-
61-
62-
class ExecutorchGraphBreakModule(nn.Module):
63-
def __init__(self):
64-
super().__init__()
65-
66-
def forward(self, *args, **kwargs):
67-
return tuple(
68-
(
69-
torch.ops.executorch_utils.graph_break.Tensor(a)
70-
if isinstance(a, torch.Tensor)
71-
else a
72-
)
73-
for a in args
74-
)
75-
76-
77-
class BlockWithGraphBreak(nn.Module):
78-
def __init__(self, block: nn.Module, break_before: bool = True):
79-
super().__init__()
80-
self.graph_break = ExecutorchGraphBreakModule()
81-
self.block = block
82-
self.break_before = break_before
83-
84-
def forward(self, *args, **kwargs):
85-
if self.break_before:
86-
new_args = self.graph_break(*args)
87-
out = self.block(*new_args, **kwargs)
88-
return out
89-
else:
90-
out = self.block(*args, **kwargs)
91-
out = self.graph_break(*out)
92-
return out
93-
94-
95-
def remove_graph_break_(edge_manager):
96-
"""Remove graph break ops from all methods in the edge manager."""
97-
from executorch.exir.dialects._ops import ops as exir_ops
98-
99-
# Get all method names
100-
method_names = edge_manager.methods
101-
for method_name in method_names:
102-
ep = edge_manager.exported_program(method_name)
103-
for n in ep.graph_module.graph.nodes:
104-
if n.target == exir_ops.edge.executorch_utils.graph_break.Tensor:
105-
n.replace_all_uses_with(n.args[0])
106-
ep.graph_module.graph.eliminate_dead_code()
107-
10850

10951
def load_model(
11052
checkpoint_path: str,
@@ -695,7 +637,7 @@ def main():
695637

696638
# Convert to ExecuTorch
697639
print("\nConverting to ExecuTorch...")
698-
remove_graph_break_(edge_manager)
640+
remove_graph_break_ops(edge_manager)
699641
executorch_program = edge_manager.to_executorch(
700642
ExecutorchBackendConfig(
701643
extract_delegate_segments=True,

0 commit comments

Comments (0)