From 89ad9186e831a6ae765583242ca065da6ce3330e Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Mon, 27 May 2024 04:35:00 +0000 Subject: [PATCH 01/61] torch wip --- python/ark/data_type.py | 33 +++++++++++++++++++++++---------- python/ark/torch_mock.py | 11 +++++++++++ 2 files changed, 34 insertions(+), 10 deletions(-) create mode 100644 python/ark/torch_mock.py diff --git a/python/ark/data_type.py b/python/ark/data_type.py index fe95d0d88..de64c1d7d 100644 --- a/python/ark/data_type.py +++ b/python/ark/data_type.py @@ -3,26 +3,29 @@ import numpy from . import _ark_core - +try: + import torch +except ImportError: + from . import torch_mock as torch _REGISTRY_DATA_TYPE = { - "fp32": {"np": numpy.float32}, - "fp16": {"np": numpy.float16}, - "bf16": {"np": None}, - "int32": {"np": numpy.int32}, - "uint32": {"np": numpy.uint32}, - "int8": {"np": numpy.int8}, - "uint8": {"np": numpy.uint8}, - "byte": {"np": numpy.ubyte}, + "fp32": {"np": numpy.float32, "torch": torch.float32}, + "fp16": {"np": numpy.float16, "torch": torch.float16}, + "bf16": {"np": None, "torch": torch.bfloat16}, + "int32": {"np": numpy.int32, "torch": torch.int32}, + "uint32": {"np": numpy.uint32, "torch": None}, + "int8": {"np": numpy.int8, "torch": torch.int8}, + "uint8": {"np": numpy.uint8, "torch": torch.uint8}, + "byte": {"np": numpy.ubyte, "torch": torch.uint8}, } - class MetaDataType(type): def __new__(cls, name, bases, attrs): new_class = super().__new__(cls, name, bases, attrs) if name in _REGISTRY_DATA_TYPE: reg = _REGISTRY_DATA_TYPE[name] new_class.to_numpy = staticmethod(lambda: reg["np"]) + new_class.to_torch = staticmethod(lambda: reg["torch"]) new_class.ctype = staticmethod( lambda: getattr(_ark_core, name.upper()) ) @@ -104,6 +107,16 @@ def to_numpy() -> numpy.dtype: """ ... + @staticmethod + def to_torch() -> torch.dtype: + """ + Return the corresponding torch data type. + + Returns: + torch.dtype: The corresponding torch data type. + """ + ... 
+ @staticmethod def ctype() -> _ark_core._DataType: """ diff --git a/python/ark/torch_mock.py b/python/ark/torch_mock.py new file mode 100644 index 000000000..e58a3eda8 --- /dev/null +++ b/python/ark/torch_mock.py @@ -0,0 +1,11 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +class dtype: ... +class float32: ... +class float16: ... +class bfloat16: ... +class int32: ... +class int8: ... +class uint8: ... +class ubyte: ... From ab1998ecef18116bd92f4ea91b14c69becc66655 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Sun, 26 May 2024 21:43:10 -0700 Subject: [PATCH 02/61] Update ut-cuda.yml --- .github/workflows/ut-cuda.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/ut-cuda.yml b/.github/workflows/ut-cuda.yml index e938ca877..5a78818ff 100644 --- a/.github/workflows/ut-cuda.yml +++ b/.github/workflows/ut-cuda.yml @@ -7,6 +7,8 @@ on: pull_request: branches: - main + types: + - ready_for_review jobs: UnitTest: From ece4f553f62dc2da591321be3f7d5e34bff2c80d Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Mon, 27 May 2024 07:24:41 +0000 Subject: [PATCH 03/61] torch wip --- python/ark/data_type.py | 2 ++ python/ark/module.py | 33 ++++++++++++++++++++++++++++----- python/ark/tensor.py | 35 +++++++++++++++++++++++++++++++++++ python/ark/torch_mock.py | 18 ++++++++++++++++++ 4 files changed, 83 insertions(+), 5 deletions(-) diff --git a/python/ark/data_type.py b/python/ark/data_type.py index de64c1d7d..f5ccd9e5b 100644 --- a/python/ark/data_type.py +++ b/python/ark/data_type.py @@ -3,6 +3,7 @@ import numpy from . 
import _ark_core + try: import torch except ImportError: @@ -19,6 +20,7 @@ "byte": {"np": numpy.ubyte, "torch": torch.uint8}, } + class MetaDataType(type): def __new__(cls, name, bases, attrs): new_class = super().__new__(cls, name, bases, attrs) diff --git a/python/ark/module.py b/python/ark/module.py index 62b941281..459beeda6 100644 --- a/python/ark/module.py +++ b/python/ark/module.py @@ -3,9 +3,14 @@ import logging import numpy as np -from typing import Any, Dict +from typing import Any, Dict, Union from .tensor import Parameter +try: + import torch +except ImportError: + from . import torch_mock as torch + class Module: """ @@ -57,7 +62,9 @@ def params_dict(self, prefix="") -> Dict[str, Parameter]: return params_dict def load_state_dict( - self, state_dict: Dict[str, np.ndarray], prefix: str = "" + self, + state_dict: Dict[str, Union[np.ndarray, torch.Tensor]], + prefix: str = "", ): """ Loads a model from a state_dict and copy the parameters to the device GPU. @@ -68,20 +75,36 @@ def load_state_dict( all_keys = set(state_dict.keys()) pd = self.params_dict(prefix) for name, param in pd.items(): - param.from_numpy(state_dict[name]) + data = state_dict.get(name, None) + if isinstance(data, np.ndarray): + param.from_numpy(data) + elif isinstance(data, torch.Tensor): + param.from_torch(data) + else: + continue all_keys.remove(name) if all_keys: logging.warning( f"{len(all_keys)} unused parameter(s) in state_dict" ) - def state_dict(self, prefix: str = "") -> Dict[str, np.ndarray]: + def state_dict( + self, prefix: str = "", mode: str = "numpy" + ) -> Dict[str, Union[np.ndarray, torch.Tensor]]: """ Copies the parameters from the device GPU to the host and saves the model to a state_dict. Must be called after the executor is launched. 
""" - return {k: v.to_numpy() for k, v in self.params_dict(prefix).items()} + if mode == "numpy": + return { + k: v.to_numpy() for k, v in self.params_dict(prefix).items() + } + elif mode == "torch": + return { + k: v.to_torch() for k, v in self.params_dict(prefix).items() + } + raise ValueError(f"Unsupported mode: {mode}") def forward(self, *args: Any, **kwargs: Any) -> Any: ... diff --git a/python/ark/tensor.py b/python/ark/tensor.py index 316d18566..625f82bce 100644 --- a/python/ark/tensor.py +++ b/python/ark/tensor.py @@ -8,6 +8,15 @@ from .data_type import DataType from .runtime import Runtime +try: + import torch + + _no_torch = False +except ImportError: + from . import torch_mock as torch + + _no_torch = True + NullTensor = _NullTensor @@ -89,6 +98,32 @@ def from_numpy(self, ndarray: np.ndarray) -> "Tensor": rt.executor.tensor_write(self._tensor, ndarray) return self + def to_torch(self, tensor: torch.Tensor = None) -> torch.Tensor: + """ """ + if _no_torch: + raise ImportError("torch is not available") + torch_type = self.dtype().to_torch() + if tensor is None: + return torch.from_numpy(self.to_numpy()) + elif tensor.shape != self.shape(): + raise ValueError("torch tensor shape does not match the tensor") + elif tensor.dtype != torch_type: + raise ValueError("torch tensor dtype does not match the tensor") + elif not tensor.is_contiguous(): + raise ValueError("torch tensor is not contiguous in memory") + elif tensor.numel() != self.nelems(): + raise ValueError("torch tensor size does not match the tensor") + tensor.copy_(torch.from_numpy(self.to_numpy())) + return tensor + + def from_torch(self, tensor: torch.Tensor) -> "Tensor": + """ """ + if _no_torch: + raise ImportError("torch is not available") + if tensor.is_cuda: + tensor = tensor.cpu() + return self.from_numpy(tensor.numpy()) + class Parameter(Tensor): """ diff --git a/python/ark/torch_mock.py b/python/ark/torch_mock.py index e58a3eda8..68333e431 100644 --- a/python/ark/torch_mock.py +++ 
b/python/ark/torch_mock.py @@ -1,11 +1,29 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. + class dtype: ... + + class float32: ... + + class float16: ... + + class bfloat16: ... + + class int32: ... + + class int8: ... + + class uint8: ... + + class ubyte: ... + + +class Tensor: ... From 952b7610c31288cc8851aa6466461f2ba7a2393f Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Mon, 27 May 2024 23:14:40 +0000 Subject: [PATCH 04/61] runtime module --- ark/api/planner.cpp | 4 +- examples/tutorial/torch_tutorial.py | 23 ++++++++ python/ark/__init__.py | 2 +- python/ark/data_type.py | 22 +++++++ python/ark/module.py | 71 +++++++++++++++++++++- python/ark/tensor.py | 91 ++++++++++++++++++++--------- 6 files changed, 181 insertions(+), 32 deletions(-) create mode 100644 examples/tutorial/torch_tutorial.py diff --git a/ark/api/planner.cpp b/ark/api/planner.cpp index ad5048c0e..5c9d09f2e 100644 --- a/ark/api/planner.cpp +++ b/ark/api/planner.cpp @@ -56,8 +56,8 @@ static void check_config_field(const ModelOpRef op, const Json &config, std::string DefaultPlanner::Impl::plan(bool pretty) const { const auto gpu_info = GpuManager::get_instance(gpu_id_)->info(); size_t num_sm = gpu_info.num_sm; - Json task_infos; - Json processor_groups; + Json task_infos = Json::array(); + Json processor_groups = Json::array(); size_t max_num_warps = 1; size_t max_num_processors = 1; size_t next_node_id = 0; diff --git a/examples/tutorial/torch_tutorial.py b/examples/tutorial/torch_tutorial.py new file mode 100644 index 000000000..5677d41cd --- /dev/null +++ b/examples/tutorial/torch_tutorial.py @@ -0,0 +1,23 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +import ark +import torch + + +class ArkAddModule(ark.RuntimeModule): + def build_forward(self, x: ark.Tensor, y: ark.Tensor) -> ark.Tensor: + return ark.add(x, y) + +# ARK module for addition +module = ArkAddModule() + +# Define two torch arrays +x = torch.ones(64) * 2 +y = torch.ones(64) * 3 + +# Run the ARK module +z = module(x, y) + +# Print the result +print(z) diff --git a/python/ark/__init__.py b/python/ark/__init__.py index 92e9c39c3..2a4d164e4 100644 --- a/python/ark/__init__.py +++ b/python/ark/__init__.py @@ -38,7 +38,7 @@ def set_world_size(world_size): from .init import init from .tensor import Dims, Tensor, Parameter -from .module import Module +from .module import Module, RuntimeModule from .runtime import Runtime, DefaultPlanner from .serialize import save, load from .data_type import ( diff --git a/python/ark/data_type.py b/python/ark/data_type.py index f5ccd9e5b..8ab982106 100644 --- a/python/ark/data_type.py +++ b/python/ark/data_type.py @@ -64,6 +64,28 @@ def from_numpy(np_type: numpy.dtype) -> "DataType": f" to ark data type." ) + @staticmethod + def from_torch(torch_type: torch.dtype) -> "DataType": + """ + Return the corresponding ark data type. + + Parameters: + torch_type (torch.dtype): The torch data type. + + Returns: + DataType: The corresponding ark data type. + + Raises: + ValueError: If there is no defined conversion from torch data type to ark data type. + """ + for type_name, reg in _REGISTRY_DATA_TYPE.items(): + if reg["torch"] == torch_type: + return DataType.from_name(type_name) + raise ValueError( + f"Undefined conversion from torch data type {torch_type}" + f" to ark data type." 
+ ) + @staticmethod def from_name(type_name: str) -> "DataType": """ diff --git a/python/ark/module.py b/python/ark/module.py index 459beeda6..b7919d2cd 100644 --- a/python/ark/module.py +++ b/python/ark/module.py @@ -3,14 +3,19 @@ import logging import numpy as np -from typing import Any, Dict, Union -from .tensor import Parameter +from typing import Any, Dict, List, Union +from .tensor import Tensor, Parameter +from .runtime import Runtime, DefaultPlanner try: import torch + + _no_torch = False except ImportError: from . import torch_mock as torch + _no_torch = True + class Module: """ @@ -109,3 +114,65 @@ def state_dict( def forward(self, *args: Any, **kwargs: Any) -> Any: ... def backward(self, *args: Any, **kwargs: Any) -> Any: ... + + def initialize(self): + for param in self.parameters.values(): + param.initialize() + for module in self.sub_modules.values(): + module.initialize() + + +def _recursive_ark_to_torch(object): + if isinstance(object, Tensor): + return object.to_torch() + if isinstance(object, dict): + return {k: _recursive_ark_to_torch(v) for k, v in object.items()} + if isinstance(object, list): + return [_recursive_ark_to_torch(v) for v in object] + return object + + +class RuntimeModule(Module): + def __init__(self): + if _no_torch: + raise ImportError("torch is not available") + super().__init__() + self.built_forward = False + self.built_backward = False + self.forward_input_tensor_args: List[Tensor] = [] + self.forward_input_tensor_kwargs: Dict[str, Tensor] = {} + self.forward_output = None + self.backward_tensor_args = [] + self.backward_tensor_kwargs = {} + + def build_forward(self, *args: Any, **kwargs: Any) -> Any: ... + + def build_backward(self, *args: Any, **kwargs: Any) -> Any: ... 
+ + def forward(self, *args: Any, **kwargs: Any) -> Any: + if not self.built_forward: + for arg in args: + if isinstance(arg, torch.Tensor): + self.forward_input_tensor_args.append( + Tensor.from_torch(arg) + ) + for key, value in kwargs.items(): + if isinstance(value, torch.Tensor): + self.forward_input_tensor_kwargs[key] = Tensor.from_torch( + value + ) + self.forward_output = self.build_forward( + *self.forward_input_tensor_args, + **self.forward_input_tensor_kwargs, + ) + self.built_forward = True + + with Runtime.get_runtime() as rt: + rt.launch(plan=DefaultPlanner().plan()) + for arg in self.forward_input_tensor_args: + arg.initialize() + for value in self.forward_input_tensor_kwargs.values(): + value.initialize() + + rt.run() + return _recursive_ark_to_torch(self.forward_output) diff --git a/python/ark/tensor.py b/python/ark/tensor.py index 625f82bce..f264bb440 100644 --- a/python/ark/tensor.py +++ b/python/ark/tensor.py @@ -2,11 +2,12 @@ # Licensed under the MIT license. import numpy as np -from typing import List +from typing import Callable, List, Union, Type from _ark_core import _Dims, _Tensor, _NullTensor from .data_type import DataType from .runtime import Runtime +from .model import Model try: import torch @@ -24,14 +25,19 @@ class Dims(_Dims): pass +Initializer = Type[Callable[[], Union[torch.Tensor, np.ndarray]]] + + class Tensor: - def __init__(self, _tensor: _Tensor): + def __init__(self, _tensor: _Tensor, initializer: Initializer = None): """ Initializes a new instance of the Tensor class. Args: _tensor (_ark_core._Tensor): The underlying _Tensor object. 
""" self._tensor = _tensor + self.initializer: Initializer = initializer + Model.get_model().add_tensor(self) def shape(self) -> List[int]: """ @@ -80,24 +86,6 @@ def to_numpy(self, ndarray: np.ndarray = None) -> np.ndarray: rt.executor.tensor_read(self._tensor, ndarray) return ndarray - def from_numpy(self, ndarray: np.ndarray) -> "Tensor": - """ - Copies the tensor from a host numpy array to the device. - """ - rt = Runtime.get_runtime() - if not rt.launched(): - raise RuntimeError( - "Tensor is not allocated yet. `Tensor.from_numpy()` is " - "usable only after you call `Runtime.launch()`." - ) - ndarray = ndarray.astype(self.dtype().to_numpy()) - if not ndarray.flags["C_CONTIGUOUS"]: - ndarray = np.ascontiguousarray(ndarray) - if ndarray.nbytes != self.nelems() * self.dtype().element_size(): - raise ValueError("ndarray size does not match the tensor") - rt.executor.tensor_write(self._tensor, ndarray) - return self - def to_torch(self, tensor: torch.Tensor = None) -> torch.Tensor: """ """ if _no_torch: @@ -116,13 +104,62 @@ def to_torch(self, tensor: torch.Tensor = None) -> torch.Tensor: tensor.copy_(torch.from_numpy(self.to_numpy())) return tensor - def from_torch(self, tensor: torch.Tensor) -> "Tensor": - """ """ - if _no_torch: - raise ImportError("torch is not available") - if tensor.is_cuda: - tensor = tensor.cpu() - return self.from_numpy(tensor.numpy()) + @staticmethod + def from_numpy(ndarray: np.ndarray): + return Tensor( + Model.get_model().tensor( + Dims(list(ndarray.shape)), + DataType.from_numpy(ndarray.dtype).ctype(), + Dims(), + Dims(), + Dims(), + "", + ), + lambda: ndarray, + ) + + @staticmethod + def from_torch(tensor: torch.Tensor): + return Tensor( + Model.get_model().tensor( + Dims(list(tensor.shape)), + DataType.from_torch(tensor.dtype).ctype(), + Dims(), + Dims(), + Dims(), + "", + ), + lambda: tensor, + ) + + def copy(self, data: Union[np.ndarray, torch.Tensor]) -> "Tensor": + """ + Copies the tensor from a host numpy array to the device. 
+ """ + rt = Runtime.get_runtime() + if not rt.launched(): + raise RuntimeError( + "Tensor is not allocated yet. `Tensor.from_numpy()` is " + "usable only after you call `Runtime.launch()`." + ) + if isinstance(data, torch.Tensor): + data = data.cpu().numpy() + data = data.astype(self.dtype().to_numpy()) + if not data.flags["C_CONTIGUOUS"]: + data = np.ascontiguousarray(data) + if data.nbytes != self.nelems() * self.dtype().element_size(): + raise ValueError("data size does not match the tensor") + rt.executor.tensor_write(self._tensor, data) + return self + + def initialize(self) -> "Tensor": + """ + Initializes the tensor. + """ + if self.initializer is not None: + data = self.initializer() + self.copy(data) + return self class Parameter(Tensor): From a40926812f7b02f02e1e48a981c65e21c4dadfaa Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Mon, 27 May 2024 23:20:44 +0000 Subject: [PATCH 05/61] fix --- python/ark/tensor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/ark/tensor.py b/python/ark/tensor.py index f264bb440..5168791a8 100644 --- a/python/ark/tensor.py +++ b/python/ark/tensor.py @@ -37,7 +37,6 @@ def __init__(self, _tensor: _Tensor, initializer: Initializer = None): """ self._tensor = _tensor self.initializer: Initializer = initializer - Model.get_model().add_tensor(self) def shape(self) -> List[int]: """ From 8e4622707b34cd4a71579bd65d7ba484e2424969 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Mon, 27 May 2024 23:52:16 +0000 Subject: [PATCH 06/61] fix --- ark/include/kernels/kernel_template.in | 5 ++++- examples/tutorial/torch_tutorial.py | 6 +++++- python/ark/module.py | 20 +++++++++++++------- 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/ark/include/kernels/kernel_template.in b/ark/include/kernels/kernel_template.in index bc842ea4a..5bba320a5 100644 --- a/ark/include/kernels/kernel_template.in +++ b/ark/include/kernels/kernel_template.in @@ -59,9 +59,12 @@ void @NAME@(int *_iter) { 
sync_gpu<@NUM_BLOCKS@>(ARK_LOOP_SYNC_STATE); ark_loop_body(_buf, _i); } + if (threadIdx.x == 0) { + __threadfence_system(); + } + sync_gpu<@NUM_BLOCKS@>(ARK_LOOP_SYNC_STATE); if (threadIdx.x == 0 && blockIdx.x == 0) { atomicStoreRelaxed(_iter, 0); } - sync_gpu<@NUM_BLOCKS@>(ARK_LOOP_SYNC_STATE); } } diff --git a/examples/tutorial/torch_tutorial.py b/examples/tutorial/torch_tutorial.py index 5677d41cd..e9482a7cc 100644 --- a/examples/tutorial/torch_tutorial.py +++ b/examples/tutorial/torch_tutorial.py @@ -9,6 +9,7 @@ class ArkAddModule(ark.RuntimeModule): def build_forward(self, x: ark.Tensor, y: ark.Tensor) -> ark.Tensor: return ark.add(x, y) + # ARK module for addition module = ArkAddModule() @@ -19,5 +20,8 @@ def build_forward(self, x: ark.Tensor, y: ark.Tensor) -> ark.Tensor: # Run the ARK module z = module(x, y) +w = module(x, z) + # Print the result -print(z) +print(z) # 5 +print(w) # 7 diff --git a/python/ark/module.py b/python/ark/module.py index b7919d2cd..a266f522d 100644 --- a/python/ark/module.py +++ b/python/ark/module.py @@ -6,6 +6,8 @@ from typing import Any, Dict, List, Union from .tensor import Tensor, Parameter from .runtime import Runtime, DefaultPlanner +from .ops import tensor +from .data_type import DataType try: import torch @@ -154,12 +156,16 @@ def forward(self, *args: Any, **kwargs: Any) -> Any: for arg in args: if isinstance(arg, torch.Tensor): self.forward_input_tensor_args.append( - Tensor.from_torch(arg) + tensor( + list(arg.shape), + DataType.from_torch(arg.dtype), + ) ) for key, value in kwargs.items(): if isinstance(value, torch.Tensor): - self.forward_input_tensor_kwargs[key] = Tensor.from_torch( - value + self.forward_input_tensor_kwargs[key] = tensor( + list(value.shape), + DataType.from_torch(value.dtype), ) self.forward_output = self.build_forward( *self.forward_input_tensor_args, @@ -169,10 +175,10 @@ def forward(self, *args: Any, **kwargs: Any) -> Any: with Runtime.get_runtime() as rt: rt.launch(plan=DefaultPlanner().plan()) - 
for arg in self.forward_input_tensor_args: - arg.initialize() - for value in self.forward_input_tensor_kwargs.values(): - value.initialize() + for tns, arg in zip(self.forward_input_tensor_args, args): + tns.copy(arg) + for key, value in self.forward_input_tensor_kwargs.items(): + value.copy(kwargs[key]) rt.run() return _recursive_ark_to_torch(self.forward_output) From eee7ec2b4bb1cde335e99d780657c70e497542c9 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 28 May 2024 19:00:09 +0000 Subject: [PATCH 07/61] some fixes --- python/ark/module.py | 23 ++++++++++++++++------- python/ark/tensor.py | 28 +++++++++++++++++++++------- python/executor_py.cpp | 15 ++++++++++++++- 3 files changed, 51 insertions(+), 15 deletions(-) diff --git a/python/ark/module.py b/python/ark/module.py index a266f522d..faeeea40d 100644 --- a/python/ark/module.py +++ b/python/ark/module.py @@ -83,12 +83,9 @@ def load_state_dict( pd = self.params_dict(prefix) for name, param in pd.items(): data = state_dict.get(name, None) - if isinstance(data, np.ndarray): - param.from_numpy(data) - elif isinstance(data, torch.Tensor): - param.from_torch(data) - else: + if data is None: continue + param.copy(data) all_keys.remove(name) if all_keys: logging.warning( @@ -143,6 +140,8 @@ def __init__(self): self.built_backward = False self.forward_input_tensor_args: List[Tensor] = [] self.forward_input_tensor_kwargs: Dict[str, Tensor] = {} + self.forward_input_args = [] + self.forward_input_kwargs = {} self.forward_output = None self.backward_tensor_args = [] self.backward_tensor_kwargs = {} @@ -161,15 +160,25 @@ def forward(self, *args: Any, **kwargs: Any) -> Any: DataType.from_torch(arg.dtype), ) ) + self.forward_input_args.append( + self.forward_input_tensor_args[-1] + ) + else: + self.forward_input_args.append(arg) for key, value in kwargs.items(): if isinstance(value, torch.Tensor): self.forward_input_tensor_kwargs[key] = tensor( list(value.shape), DataType.from_torch(value.dtype), ) + 
self.forward_input_kwargs[key] = ( + self.forward_input_tensor_kwargs[key] + ) + else: + self.forward_input_kwargs[key] = value self.forward_output = self.build_forward( - *self.forward_input_tensor_args, - **self.forward_input_tensor_kwargs, + *self.forward_input_args, + **self.forward_input_kwargs, ) self.built_forward = True diff --git a/python/ark/tensor.py b/python/ark/tensor.py index 5168791a8..a567264d5 100644 --- a/python/ark/tensor.py +++ b/python/ark/tensor.py @@ -142,13 +142,27 @@ def copy(self, data: Union[np.ndarray, torch.Tensor]) -> "Tensor": "usable only after you call `Runtime.launch()`." ) if isinstance(data, torch.Tensor): - data = data.cpu().numpy() - data = data.astype(self.dtype().to_numpy()) - if not data.flags["C_CONTIGUOUS"]: - data = np.ascontiguousarray(data) - if data.nbytes != self.nelems() * self.dtype().element_size(): - raise ValueError("data size does not match the tensor") - rt.executor.tensor_write(self._tensor, data) + if data.dtype != self.dtype().to_torch(): + raise ValueError("data dtype does not match the tensor") + if not data.is_contiguous(): + data = data.contiguous() + if data.numel() != self.nelems(): + raise ValueError("data size does not match the tensor") + rt.executor.tensor_write( + self._tensor, + data.data_ptr(), + data.numel() * data.element_size(), + ) + elif isinstance(data, np.ndarray): + if data.dtype != self.dtype().to_numpy(): + raise ValueError("data dtype does not match the tensor") + if not data.flags["C_CONTIGUOUS"]: + data = np.ascontiguousarray(data) + if data.nbytes != self.nelems() * self.dtype().element_size(): + raise ValueError("data size does not match the tensor") + rt.executor.tensor_write(self._tensor, data) + else: + raise ValueError("data must be a numpy array or a torch tensor") return self def initialize(self) -> "Tensor": diff --git a/python/executor_py.cpp b/python/executor_py.cpp index dc2840329..13a81608e 100644 --- a/python/executor_py.cpp +++ b/python/executor_py.cpp @@ -17,6 +17,11 
@@ static void tensor_write(ark::Executor *exe, const ark::Tensor &tensor, info.size * info.itemsize); } +static void tensor_write(ark::Executor *exe, const ark::Tensor &tensor, + size_t host_address, size_t bytes) { + exe->tensor_write(tensor, reinterpret_cast(host_address), bytes); +} + static void tensor_read(ark::Executor *exe, const ark::Tensor &tensor, py::buffer host_buffer) { py::buffer_info info = host_buffer.request(); @@ -39,5 +44,13 @@ void register_executor(py::module &m) { .def("destroy", &ark::Executor::destroy) .def("destroyed", &ark::Executor::destroyed) .def("tensor_read", &tensor_read, py::arg("tensor"), py::arg("data")) - .def("tensor_write", &tensor_write, py::arg("tensor"), py::arg("data")); + .def( + "tensor_write", + py::overload_cast( + &tensor_write), + py::arg("tensor"), py::arg("data")) + .def("tensor_write", + py::overload_cast(&tensor_write), + py::arg("tensor"), py::arg("address"), py::arg("bytes")); } From 87b9b0127de668f810847d04d4c2a08178439ee0 Mon Sep 17 00:00:00 2001 From: Noli Gerawork <86308445+naturalcandy@users.noreply.github.com> Date: Tue, 18 Jun 2024 11:20:45 -0400 Subject: [PATCH 08/61] Python API Multiple Runtime Support (#216) - Introduced support for multiple Runtime instances - Added utility functions for multi-runtime management - Ensured backward compatibility with existing usage patterns of Runtime - Added unit tests for multi-runtime functionality --------- Co-authored-by: noli --- ark/api/executor.cpp | 101 +++++++++++++++++++++ ark/include/ark/executor.hpp | 6 ++ python/ark/init.py | 5 +- python/ark/ops.py | 138 ++++++++++++++++++++++------ python/ark/runtime.py | 139 +++++++++++++++++++++++------ python/ark/tensor.py | 69 ++++++++++---- python/executor_py.cpp | 30 ++++++- python/unittest/test.py | 1 + python/unittest/test_conversion.py | 93 +++++++++++++++++++ python/unittest/test_runtime.py | 121 ++++++++++++++++++++++--- 10 files changed, 610 insertions(+), 93 deletions(-) create mode 100644 
python/unittest/test_conversion.py diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp index 198d22e51..a0711bfe8 100644 --- a/ark/api/executor.cpp +++ b/ark/api/executor.cpp @@ -3,12 +3,15 @@ #include "ark/executor.hpp" +#include + #include #include #include #include #include +#include "ark/data_type.hpp" #include "ark/model.hpp" #include "ark/planner.hpp" #include "codegen.hpp" @@ -154,6 +157,8 @@ class Executor::Impl { void tensor_read(const Tensor tensor, void *data, size_t bytes) const; void tensor_write(const Tensor tensor, const void *data, size_t bytes) const; + DLDeviceType get_device_type() const; + DLManagedTensor *get_dl_tensor(const Tensor &tensor) const; private: void init_communicator(); @@ -783,6 +788,94 @@ void Executor::Impl::tensor_write(const Tensor tensor, const void *data, copy_stream_->sync(); } +DLDeviceType Executor::Impl::get_device_type() const { +#if defined(ARK_CUDA) + return kDLCUDA; +#elif defined(ARK_ROCM) + return kDLROCM; +#else + return kDLCPU; +#endif +} + +DLDataType get_dl_dtype(const DataType &ark_data_type) { + DLDataType dl_data_type; + dl_data_type.lanes = 1; + if (ark_data_type == FP32) { + dl_data_type.code = kDLFloat; + dl_data_type.bits = 32; + } else if (ark_data_type == FP16) { + dl_data_type.code = kDLFloat; + dl_data_type.bits = 16; + } else if (ark_data_type == BF16) { + dl_data_type.code = kDLBfloat; + dl_data_type.bits = 16; + } else if (ark_data_type == INT32) { + dl_data_type.code = kDLInt; + dl_data_type.bits = 32; + } else if (ark_data_type == UINT32) { + dl_data_type.code = kDLUInt; + dl_data_type.bits = 32; + } else if (ark_data_type == INT8) { + dl_data_type.code = kDLInt; + dl_data_type.bits = 8; + } else if (ark_data_type == UINT8) { + dl_data_type.code = kDLUInt; + dl_data_type.bits = 8; + } else if (ark_data_type == BYTE) { + dl_data_type.code = kDLUInt; + dl_data_type.bits = 8; + } else { + ERR(InvalidUsageError, "Unsupported data type"); + } + return dl_data_type; +} + +DLManagedTensor 
*Executor::Impl::get_dl_tensor(const Tensor &tensor) const { + DLTensor dl_tensor; + dl_tensor.data = + buffer_->ref(buffer_id_to_offset_.at(tensor.ref()->buffer()->id())); + size_t offset_in_elements = + tensor.offsets().is_no_dim() ? 0 : tensor.offsets().vector()[0]; + dl_tensor.byte_offset = offset_in_elements * tensor.data_type().bytes(); + dl_tensor.device.device_type = get_device_type(); + dl_tensor.device.device_id = static_cast(gpu_id_); + dl_tensor.ndim = static_cast(tensor.shape().ndims()); + dl_tensor.dtype = get_dl_dtype(tensor.data_type()); + + dl_tensor.shape = + tensor.shape().is_no_dim() ? nullptr : new int64_t[dl_tensor.ndim]; + dl_tensor.strides = + tensor.strides().is_no_dim() ? nullptr : new int64_t[dl_tensor.ndim]; + auto shape = tensor.shape(); + if (dl_tensor.shape) { + for (int i = 0; i < dl_tensor.ndim; ++i) { + dl_tensor.shape[i] = shape[i]; + } + } + if (dl_tensor.strides) { + dl_tensor.strides[dl_tensor.ndim - 1] = 1; + for (int i = dl_tensor.ndim - 2; i >= 0; --i) { + dl_tensor.strides[i] = + dl_tensor.shape[i + 1] * dl_tensor.strides[i + 1]; + } + } + DLManagedTensor *dl_managed_tensor = new DLManagedTensor(); + dl_managed_tensor->dl_tensor = dl_tensor; + dl_managed_tensor->manager_ctx = nullptr; + dl_managed_tensor->deleter = [](DLManagedTensor *self) { + if (self->dl_tensor.shape) { + delete[] self->dl_tensor.shape; + self->dl_tensor.shape = nullptr; + } + if (self->dl_tensor.strides) { + delete[] self->dl_tensor.strides; + self->dl_tensor.strides = nullptr; + } + }; + return dl_managed_tensor; +} + Executor::Executor(int rank, int world_size, int gpu_id, const std::string &name, const std::string &plan) : impl_(std::make_unique(rank, world_size, gpu_id, name, @@ -818,6 +911,14 @@ void Executor::tensor_write(const Tensor tensor, const void *data, impl_->tensor_write(tensor, data, bytes); } +DLDeviceType Executor::get_device_type() const { + return impl_->get_device_type(); +} + +DLManagedTensor *Executor::get_dl_tensor(const Tensor 
&tensor) const { + return impl_->get_dl_tensor(tensor); +} + DefaultExecutor::DefaultExecutor(const Model &model, int gpu_id, const std::string &name) : Executor( diff --git a/ark/include/ark/executor.hpp b/ark/include/ark/executor.hpp index 4682af7d0..54c49cd29 100644 --- a/ark/include/ark/executor.hpp +++ b/ark/include/ark/executor.hpp @@ -4,6 +4,8 @@ #ifndef ARK_EXECUTOR_HPP #define ARK_EXECUTOR_HPP +#include + #include #include #include @@ -62,6 +64,10 @@ class Executor { void tensor_write(const Tensor tensor, const void *data, size_t bytes) const; + DLManagedTensor *get_dl_tensor(const Tensor &tensor) const; + + DLDeviceType get_device_type() const; + private: class Impl; std::unique_ptr impl_; diff --git a/python/ark/init.py b/python/ark/init.py index be71e8e02..dbf7c1569 100644 --- a/python/ark/init.py +++ b/python/ark/init.py @@ -9,7 +9,6 @@ def init(): """Initializes ARK.""" Model.reset() - if _RuntimeState.executor is not None: - if not _RuntimeState.executor.destroyed(): - _RuntimeState.executor.destroy() + if _RuntimeState.runtime: + _RuntimeState.delete_all() _ark_core.init() diff --git a/python/ark/ops.py b/python/ark/ops.py index bc1c3ed13..86b021aef 100644 --- a/python/ark/ops.py +++ b/python/ark/ops.py @@ -59,6 +59,8 @@ def add( tensor_add = ark.add(tensor1, tensor2) """ if isinstance(input, Tensor) and isinstance(other, Tensor): + if input.runtime_id != other.runtime_id: + raise ValueError("Tensors must be on the same runtime") a = input._tensor b = other._tensor elif isinstance(input, Tensor): @@ -75,7 +77,9 @@ def add( ) if output is not NullTensor: output = output._tensor - return Tensor(Model.get_model().add(a, b, output, name)) + return Tensor( + Model.get_model().add(a, b, output, name), runtime_id=input.runtime_id + ) def cast( @@ -88,7 +92,8 @@ def cast( if output is not NullTensor: output = output._tensor return Tensor( - Model.get_model().cast(input._tensor, dtype.ctype(), output, name) + Model.get_model().cast(input._tensor, 
dtype.ctype(), output, name), + runtime_id=input.runtime_id, ) @@ -97,10 +102,12 @@ def constant( shape: Iterable[int], dtype: DataType = fp32, name: str = "constant", + runtime_id: int = -1, ) -> Tensor: """Constant.""" return Tensor( - Model.get_model().constant(value, Dims(shape), dtype.ctype(), name) + Model.get_model().constant(value, Dims(shape), dtype.ctype(), name), + runtime_id=runtime_id, ) @@ -112,7 +119,10 @@ def copy( output = output._tensor if isinstance(input, Tensor): intput = intput._tensor - return Tensor(Model.get_model().copy(intput, output, name)) + return Tensor( + Model.get_model().copy(intput, output, name), + runtime_id=input.runtime_id, + ) def div( @@ -130,8 +140,13 @@ def div( if output is not NullTensor: output = output._tensor if isinstance(other, Tensor): + if input.runtime_id != other.runtime_id: + raise ValueError("Tensors must be on the same runtime") other = other._tensor - return Tensor(Model.get_model().div(input._tensor, other, output, name)) + return Tensor( + Model.get_model().div(input._tensor, other, output, name), + runtime_id=input.runtime_id, + ) def embedding( @@ -141,10 +156,15 @@ def embedding( name: str = "embedding", ) -> Tensor: """Embedding layer.""" + if input.runtime_id != weight.runtime_id: + raise ValueError("Tensors must be on the same runtime") if output is not NullTensor: output = output._tensor return Tensor( - Model.get_model().embedding(input._tensor, weight._tensor, output, name) + Model.get_model().embedding( + input._tensor, weight._tensor, output, name + ), + runtime_id=input.runtime_id, ) @@ -158,7 +178,10 @@ def exp( """ if output is not NullTensor: output = output._tensor - return Tensor(Model.get_model().exp(input._tensor, output, name)) + return Tensor( + Model.get_model().exp(input._tensor, output, name), + runtime_id=input.runtime_id, + ) def gelu( @@ -174,7 +197,10 @@ def gelu( """ if output is not NullTensor: output = output._tensor - return Tensor(Model.get_model().gelu(input._tensor, 
output, name)) + return Tensor( + Model.get_model().gelu(input._tensor, output, name), + runtime_id=input.runtime_id, + ) def identity( @@ -189,8 +215,13 @@ def identity( for dep in deps: if not isinstance(dep, Tensor): raise TypeError("All dependencies should be a tensor") + if input.runtime_id != dep.runtime_id: + raise ValueError("All tensors must be on the same runtime") dep_tensors.append(dep._tensor) - return Tensor(Model.get_model().identity(input._tensor, dep_tensors, name)) + return Tensor( + Model.get_model().identity(input._tensor, dep_tensors, name), + runtime_id=input.runtime_id, + ) def matmul( @@ -210,6 +241,8 @@ def matmul( Usage: tensor_matmul = ark.matmul(tensor1, tensor2) """ + if input.runtime_id != other.runtime_id: + raise ValueError("Tensors must be on the same runtime") if output is not NullTensor: output = output._tensor return Tensor( @@ -220,7 +253,8 @@ def matmul( transpose_input, transpose_other, name, - ) + ), + runtime_id=input.runtime_id, ) @@ -239,8 +273,13 @@ def mul( if output is not NullTensor: output = output._tensor if isinstance(other, Tensor): + if input.runtime_id != other.runtime_id: + raise ValueError("Tensors must be on the same runtime") other = other._tensor - return Tensor(Model.get_model().mul(input._tensor, other, output, name)) + return Tensor( + Model.get_model().mul(input._tensor, other, output, name), + runtime_id=input.runtime_id, + ) def noop(input: Tensor, name: str = "noop"): @@ -268,7 +307,8 @@ def reduce_max( return Tensor( Model.get_model().reduce_max( input._tensor, axis, keepdims, output, name - ) + ), + runtime_id=input.runtime_id, ) @@ -290,7 +330,8 @@ def reduce_mean( return Tensor( Model.get_model().reduce_mean( input._tensor, axis, keepdims, output, name - ) + ), + runtime_id=input.runtime_id, ) @@ -314,7 +355,8 @@ def reduce_sum( return Tensor( Model.get_model().reduce_sum( input._tensor, axis, keepdims, output, name - ) + ), + runtime_id=input.runtime_id, ) @@ -329,7 +371,10 @@ def relu( """ if 
output is not NullTensor: output = output._tensor - return Tensor(Model.get_model().relu(input._tensor, output, name)) + return Tensor( + Model.get_model().relu(input._tensor, output, name), + runtime_id=input.runtime_id, + ) def reshape( @@ -357,7 +402,8 @@ def reshape( if len(shape) > 4: raise ValueError("Only support tensors with up to 4 dimensions") return Tensor( - Model.get_model().reshape(input._tensor, Dims(shape), allowzero, name) + Model.get_model().reshape(input._tensor, Dims(shape), allowzero, name), + runtime_id=input.runtime_id, ) @@ -374,8 +420,11 @@ def rope( """ if output is not NullTensor: output = output._tensor + if input.runtime_id != other.runtime_id: + raise ValueError("Tensors must be on the same runtime") return Tensor( - Model.get_model().rope(input._tensor, other._tensor, output, name) + Model.get_model().rope(input._tensor, other._tensor, output, name), + runtime_id=input.runtime_id, ) @@ -389,7 +438,10 @@ def rsqrt( """ if output is not NullTensor: output = output._tensor - return Tensor(Model.get_model().rsqrt(input._tensor, output, name)) + return Tensor( + Model.get_model().rsqrt(input._tensor, output, name), + runtime_id=input.runtime_id, + ) def sharding( @@ -407,7 +459,9 @@ def sharding( _tensor_list = Model.get_model().sharding( input._tensor, axis, dim_per_shard, name ) - return [Tensor(_tensor) for _tensor in _tensor_list] + return [ + Tensor(_tensor, runtime_id=input.runtime_id) for _tensor in _tensor_list + ] def sigmoid( @@ -421,7 +475,10 @@ def sigmoid( """ if output is not NullTensor: output = output._tensor - return Tensor(Model.get_model().sigmoid(input._tensor, output, name)) + return Tensor( + Model.get_model().sigmoid(input._tensor, output, name), + runtime_id=input.runtime_id, + ) def sqrt( @@ -434,7 +491,10 @@ def sqrt( """ if output is not NullTensor: output = output._tensor - return Tensor(Model.get_model().sqrt(input._tensor, output, name)) + return Tensor( + Model.get_model().sqrt(input._tensor, output, name), + 
runtime_id=input.runtime_id, + ) def sub( @@ -452,8 +512,13 @@ def sub( if output is not NullTensor: output = output._tensor if isinstance(other, Tensor): + if input.runtime_id != other.runtime_id: + raise ValueError("Tensors must be on the same runtime") other = other._tensor - return Tensor(Model.get_model().sub(input._tensor, other, output, name)) + return Tensor( + Model.get_model().sub(input._tensor, other, output, name), + runtime_id=input.runtime_id, + ) def tensor( @@ -463,6 +528,7 @@ def tensor( offsets: Iterable[int] = [], padded_shape: Iterable[int] = [], name: str = "", + runtime_id: int = -1, ) -> Tensor: """ Construct a tensor with given shape and data type. @@ -470,7 +536,10 @@ def tensor( tensor = ark.tensor([1, 2, 3, 4], dtype=ark.fp32) tensor = ark.tensor([1, 2], dtype=ark.fp16) """ - return Tensor(_tensor(shape, dtype, strides, offsets, padded_shape, name)) + return Tensor( + _tensor(shape, dtype, strides, offsets, padded_shape, name), + runtime_id=runtime_id, + ) def transpose( @@ -496,7 +565,8 @@ def transpose( if len(perm) > 4: raise ValueError("Only support perm up to 4 dimensions") return Tensor( - Model.get_model().transpose(input._tensor, perm, output, name) + Model.get_model().transpose(input._tensor, perm, output, name), + runtime_id=input.runtime_id, ) @@ -515,11 +585,15 @@ def mean( def ones( - shape: Iterable[int], dtype: DataType = fp32, name: str = "ones" + shape: Iterable[int], + dtype: DataType = fp32, + name: str = "ones", + runtime_id: int = -1, ) -> Tensor: """Ones.""" return Tensor( - Model.get_model().constant(1, Dims(shape), dtype.ctype(), name) + Model.get_model().constant(1, Dims(shape), dtype.ctype(), name), + runtime_id=runtime_id, ) @@ -530,12 +604,14 @@ def parameter( offsets: Iterable[int] = [], padded_shape: Iterable[int] = [], name: str = "", + runtime_id: int = -1, ) -> Parameter: """ Construct a parameter with given shape and data type. 
""" return Parameter( - _tensor(shape, dtype, strides, offsets, padded_shape, name) + _tensor(shape, dtype, strides, offsets, padded_shape, name), + runtime_id=runtime_id, ) @@ -569,11 +645,15 @@ def layernorm( def zeros( - shape: Iterable[int], dtype: DataType = fp32, name: str = "zeros" + shape: Iterable[int], + dtype: DataType = fp32, + name: str = "zeros", + runtime_id: int = -1, ) -> Tensor: """Zeros.""" return Tensor( - Model.get_model().constant(0, Dims(shape), dtype.ctype(), name) + Model.get_model().constant(0, Dims(shape), dtype.ctype(), name), + runtime_id=runtime_id, ) diff --git a/python/ark/runtime.py b/python/ark/runtime.py index 7480ce7da..798eaf9d5 100644 --- a/python/ark/runtime.py +++ b/python/ark/runtime.py @@ -3,7 +3,7 @@ import logging from enum import Enum -from typing import Callable +from typing import Callable, Dict, List from _ark_core import _Executor, _DefaultPlanner from .model import Model @@ -14,8 +14,36 @@ class _RuntimeState: The _RuntimeState class is used to store the state of the model. """ - runtime = None - executor = None + runtime: Dict[int, "Runtime"] = {} + + @staticmethod + def reset_all(): + """ + Resets all runtimes. + """ + runtime_ids = list(_RuntimeState.runtime.keys()) + for runtime_id in runtime_ids: + _RuntimeState.runtime[runtime_id].reset() + + @staticmethod + def delete_all(): + """ + Deletes all runtimes. + """ + runtime_ids = list(_RuntimeState.runtime.keys()) + for runtime_id in runtime_ids: + _RuntimeState.runtime[runtime_id].reset(delete=True) + + @staticmethod + def print_runtime_states(): + """ + Print runtimes and their corresponding states. 
+ """ + print(f"{'Runtime ID':<12} | {'Status':<20}") + print(f"{'-'*12} | {'-'*20}") + for runtime_id, runtime in _RuntimeState.runtime.items(): + runtime_id = "-1(Default)" if runtime_id == -1 else runtime_id + print(f"{runtime_id:<12} | {runtime.state:<20}") class DefaultPlanner(_DefaultPlanner): @@ -61,22 +89,48 @@ class State(Enum): LaunchedNotRunning = 1 Running = 2 + def __init__(self, runtime_id: int = -1): + self.runtime_id = runtime_id + self.executor: Executor = None + self.state: Runtime.State = Runtime.State.Init + _RuntimeState.runtime[runtime_id] = self + + def get_state(self) -> "Runtime.State": + """ + Get the runtime state. + """ + return self.state + @staticmethod - def get_runtime() -> "Runtime": + def exists(runtime_id: int) -> bool: """ - Get the runtime. + Check if a runtime exists with the given ID. """ - if _RuntimeState.runtime is None: - _RuntimeState.runtime = Runtime() - return _RuntimeState.runtime + return runtime_id in _RuntimeState.runtime - def __init__(self): - self.executor: Executor = None - self.state: Runtime.State = Runtime.State.Init - _RuntimeState.runtime = self + @staticmethod + def get_all_ids() -> List[int]: + """ + Get a list of all existing runtime IDs. + """ + return list(_RuntimeState.runtime.keys()) - def __del__(self): - self.reset() + @staticmethod + def get_runtime(runtime_id=-1) -> "Runtime": + """ + Get the runtime by ID. If runtime_id is not provided, use a default ID of -1. + If the runtime does not exist, create a new runtime with the given ID. + """ + if runtime_id not in _RuntimeState.runtime: + _RuntimeState.runtime[runtime_id] = Runtime(runtime_id) + return _RuntimeState.runtime[runtime_id] + + @staticmethod + def see_runtime_statuses() -> "Dict[int, Runtime]": + """ + Returns the runtime dictionary containing all of the runtimes. + """ + return _RuntimeState.runtime def __enter__(self): return self @@ -113,7 +167,9 @@ def launch( initialized. 
The executor will compile the cuda kernels and launch the ARK runtime. """ if self.launched(): - logging.warn("Runtime is already launched, skip launching") + logging.warn( + f"Runtime {self.runtime_id} is already launched, skip launching" + ) return if not plan: if not plan_path: @@ -124,19 +180,19 @@ def launch( # If the RuntimeState is init, we need to create a new executor and # compile the kernels if self.state == Runtime.State.Init: - if _RuntimeState.executor is not None: - if not _RuntimeState.executor.destroyed(): - logging.warn("Destroying an old executor") - _RuntimeState.executor.destroy() - - _RuntimeState.executor = Executor( + if self.executor is not None: + if not self.executor.destroyed(): + logging.warn( + f"Runtime {self.runtime_id}, has already been launched. Destroying the old executor" + ) + self.executor.destroy() + self.executor = Executor( rank, world_size, gpu_id, "ArkRuntime", plan, ) - self.executor = _RuntimeState.executor self.executor.compile() self.executor.launch() self.state = Runtime.State.LaunchedNotRunning @@ -146,8 +202,8 @@ def run(self, iter=1, non_blocking=False): Run the ARK program for iter iterations and wait for the kernel to finish. """ if self.state != Runtime.State.LaunchedNotRunning: - logging.error("ARK runtime is not launched") - raise RuntimeError("ARK runtime is not launched") + logging.error(f"ARK runtime {self.runtime_id} is not launched") + raise RuntimeError(f"ARK runtime {self.runtime_id} is not launched") self.state = Runtime.State.Running self.executor.run(iter) if not non_blocking: @@ -158,7 +214,9 @@ def wait(self): Wait for the kernel to finish. 
""" if self.state != Runtime.State.Running: - logging.warn("ARK runtime is not running, skip waiting") + logging.warn( + f"ARK runtime {self.runtime_id} is not running, skip waiting" + ) return self.executor.wait() self.state = Runtime.State.LaunchedNotRunning @@ -169,15 +227,17 @@ def stop(self) -> float: Once this is called, we need to call `launch()` again to run the model again. """ if not self.launched(): - logging.warn("ARK runtime is never launched, skip stopping") + logging.warn( + f"ARK runtime {self.runtime_id} is never launched, skip stopping" + ) return elapsed = self.executor.stop() self.state = Runtime.State.LaunchedNotRunning return elapsed - def reset(self): + def reset(self, delete=False): """ - Reset the runtime. + Reset the runtime. If delete is True, delete the runtime associated with the runtime_id. """ if self.launched(): self.stop() @@ -186,3 +246,26 @@ def reset(self): self.executor.destroy() self.executor = None self.state = Runtime.State.Init + if delete: + del _RuntimeState.runtime[self.runtime_id] + + @staticmethod + def reset_all_runtimes(): + """ + Reset all runtimes. + """ + _RuntimeState.reset_all() + + @staticmethod + def delete_all_runtimes(): + """ + Delete all runtimes. + """ + _RuntimeState.delete_all() + + @staticmethod + def print_runtime_states(): + """ + Print runtimes and their corresponding states. + """ + _RuntimeState.print_runtime_states() diff --git a/python/ark/tensor.py b/python/ark/tensor.py index a567264d5..00e266929 100644 --- a/python/ark/tensor.py +++ b/python/ark/tensor.py @@ -29,14 +29,22 @@ class Dims(_Dims): class Tensor: - def __init__(self, _tensor: _Tensor, initializer: Initializer = None): + def __init__( + self, + _tensor: _Tensor, + initializer: Initializer = None, + runtime_id: int = -1, + ): """ Initializes a new instance of the Tensor class. Args: _tensor (_ark_core._Tensor): The underlying _Tensor object. + intializer (Initializer): The initializer for the Tensor. 
+ runtime_id (int): The ID of the Runtime to use. Defaults to -1, which is the default Runtime. """ self._tensor = _tensor self.initializer: Initializer = initializer + self.runtime_id = runtime_id def shape(self) -> List[int]: """ @@ -69,7 +77,7 @@ def to_numpy(self, ndarray: np.ndarray = None) -> np.ndarray: an empty numpy array without the data buffer will be returned. """ np_type = self.dtype().to_numpy() - rt = Runtime.get_runtime() + rt = Runtime.get_runtime(self.runtime_id) if not rt.launched(): return np.ndarray(self.shape(), dtype=np_type, buffer=None) if ndarray is None: @@ -85,7 +93,9 @@ def to_numpy(self, ndarray: np.ndarray = None) -> np.ndarray: rt.executor.tensor_read(self._tensor, ndarray) return ndarray - def to_torch(self, tensor: torch.Tensor = None) -> torch.Tensor: + def to_torch( + self, tensor: torch.Tensor = None, runtime_id: int = -1 + ) -> torch.Tensor: """ """ if _no_torch: raise ImportError("torch is not available") @@ -100,22 +110,42 @@ def to_torch(self, tensor: torch.Tensor = None) -> torch.Tensor: raise ValueError("torch tensor is not contiguous in memory") elif tensor.numel() != self.nelems(): raise ValueError("torch tensor size does not match the tensor") - tensor.copy_(torch.from_numpy(self.to_numpy())) + tensor.copy_(torch.from_numpy(self.to_numpy(self.runtime_id))) return tensor - @staticmethod - def from_numpy(ndarray: np.ndarray): - return Tensor( - Model.get_model().tensor( - Dims(list(ndarray.shape)), - DataType.from_numpy(ndarray.dtype).ctype(), - Dims(), - Dims(), - Dims(), - "", - ), - lambda: ndarray, - ) + def get_torch_view(self) -> torch.Tensor: + """ + Returns a torch tensor that shares the same memory with the device tensor. + """ + if _no_torch: + raise ImportError("torch is not available") + rt = Runtime.get_runtime(self.runtime_id) + if not rt.launched(): + raise RuntimeError( + "Tensor is not allocated yet. `Tensor.get_torch_view()` is " + "usable only after you call `Runtime.launch()`." 
+ ) + dl_tensor = rt.executor.get_dl_tensor(self._tensor) + torch_view = torch.utils.dlpack.from_dlpack(dl_tensor) + return torch_view + + def from_numpy(self, ndarray: np.ndarray) -> "Tensor": + """ + Copies the tensor from a host numpy array to the device. + """ + rt = Runtime.get_runtime(self.runtime_id) + if not rt.launched(): + raise RuntimeError( + "Tensor is not allocated yet. `Tensor.from_numpy()` is " + "usable only after you call `Runtime.launch()`." + ) + ndarray = ndarray.astype(self.dtype().to_numpy()) + if not ndarray.flags["C_CONTIGUOUS"]: + ndarray = np.ascontiguousarray(ndarray) + if ndarray.nbytes != self.nelems() * self.dtype().element_size(): + raise ValueError("ndarray size does not match the tensor") + rt.executor.tensor_write(self._tensor, ndarray) + return self @staticmethod def from_torch(tensor: torch.Tensor): @@ -135,7 +165,7 @@ def copy(self, data: Union[np.ndarray, torch.Tensor]) -> "Tensor": """ Copies the tensor from a host numpy array to the device. """ - rt = Runtime.get_runtime() + rt = Runtime.get_runtime(self.runtime_id) if not rt.launched(): raise RuntimeError( "Tensor is not allocated yet. `Tensor.from_numpy()` is " @@ -180,8 +210,9 @@ class Parameter(Tensor): A tensor as a parameter. """ - def __init__(self, _tensor: _Tensor): + def __init__(self, _tensor: _Tensor, runtime_id: int = -1): """ Initializes a new instance of the Parameter class. """ super().__init__(_tensor) + self.runtime_id = runtime_id diff --git a/python/executor_py.cpp b/python/executor_py.cpp index 13a81608e..59bee5a9b 100644 --- a/python/executor_py.cpp +++ b/python/executor_py.cpp @@ -1,13 +1,14 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. 
+#include #include #include #include #include #include - +#include namespace py = pybind11; static void tensor_write(ark::Executor *exe, const ark::Tensor &tensor, @@ -29,6 +30,29 @@ static void tensor_read(ark::Executor *exe, const ark::Tensor &tensor, info.size * info.itemsize); } +DLManagedTensor *to_dlpack(ark::Executor &exe, const ark::Tensor &tensor) { + DLManagedTensor *dl_tensor = exe.get_dl_tensor(tensor); + return dl_tensor; +} + +void free_capsule(PyObject *capsule) { + const char *name = PyCapsule_GetName(capsule); + auto *dl_managed_tensor = + static_cast(PyCapsule_GetPointer(capsule, name)); + if (dl_managed_tensor) { + dl_managed_tensor->deleter(dl_managed_tensor); + dl_managed_tensor = nullptr; + } +} + +py::capsule to_dlpack_capsule(ark::Executor &self, const ark::Tensor &tensor) { + DLManagedTensor *dl_managed_tensor = to_dlpack(self, tensor); + const char *capsule_name = "dltensor"; + PyObject *dl_capsule = PyCapsule_New(static_cast(dl_managed_tensor), + capsule_name, free_capsule); + return py::reinterpret_steal(dl_capsule); +} + void register_executor(py::module &m) { py::class_(m, "_Executor") .def( @@ -52,5 +76,7 @@ void register_executor(py::module &m) { .def("tensor_write", py::overload_cast(&tensor_write), - py::arg("tensor"), py::arg("address"), py::arg("bytes")); + py::arg("tensor"), py::arg("address"), py::arg("bytes")) + .def("get_dl_tensor", &to_dlpack_capsule), + py::arg("tensor"); } diff --git a/python/unittest/test.py b/python/unittest/test.py index f6f9b97af..e43ff11e2 100644 --- a/python/unittest/test.py +++ b/python/unittest/test.py @@ -9,3 +9,4 @@ from test_model import * from test_runtime import * +from test_conversion import * diff --git a/python/unittest/test_conversion.py b/python/unittest/test_conversion.py new file mode 100644 index 000000000..8f941a033 --- /dev/null +++ b/python/unittest/test_conversion.py @@ -0,0 +1,93 @@ +import torch +import numpy as np +import ark + + +def initialize_tensor(dimensions, dtype): + 
tensor = ark.tensor(dimensions, dtype) + tensor_host = np.random.rand(*dimensions).astype(dtype.to_numpy()) + return tensor, tensor_host + + +# Test function to validate the integrity of the PyTorch view of the ARK tensor, +# including its data and attributes such as shape and data type. +def test_values_fixed_dims(num_dims: int, size: int, dtype: ark.DataType): + ark.init() + dimensions = [size] * num_dims + + input_tensor, input_tensor_host = initialize_tensor(dimensions, dtype) + other_tensor, other_tensor_host = initialize_tensor(dimensions, dtype) + output_tensor = ark.add(input_tensor, other_tensor) + + runtime = ark.Runtime() + runtime.launch() + + input_tensor.from_numpy(input_tensor_host) + other_tensor.from_numpy(other_tensor_host) + + input_view = input_tensor.get_torch_view() + other_view = other_tensor.get_torch_view() + output_view = output_tensor.get_torch_view() + + runtime.run() + + input_view_numpy = input_view.cpu().numpy() + other_view_numpy = other_view.cpu().numpy() + output_view_numpy = output_view.cpu().numpy() + + output_tensor_host = output_tensor.to_numpy() + + runtime.stop() + runtime.delete_all_runtimes() + + assert np.allclose(input_tensor_host, input_view_numpy) + assert np.allclose(other_tensor_host, other_view_numpy) + assert np.allclose(output_tensor_host, output_view_numpy) + + +# Function to check if there is a difference between two arrays at a specific index +def check_diff(input_tensor_host, input_view_numpy, value, index): + mask = np.ones(input_tensor_host.shape, dtype=bool) + mask[index] = False + if not np.allclose(input_tensor_host[mask], input_view_numpy[mask]): + print("Difference found at index: ", index) + return False + if input_view_numpy[index] != value: + print(input_view_numpy[index], value) + return False + return True + + +# Test function to check if changes to the torch views are reflected in the original tensors +def test_aliasing(dtype: ark.DataType): + ark.init() + dimensions = [4, 4] + input_tensor, 
input_tensor_host = initialize_tensor(dimensions, dtype) + other_tensor, other_tensor_host = initialize_tensor(dimensions, dtype) + output_tensor = ark.mul(input_tensor, other_tensor) + runtime = ark.Runtime() + runtime.launch() + input_tensor.from_numpy(input_tensor_host) + other_tensor.from_numpy(other_tensor_host) + + input_view = input_tensor.get_torch_view() + other_view = other_tensor.get_torch_view() + output_view = output_tensor.get_torch_view() + # make changes to the views + input_view[1, 1] = 20 + other_view[0, 0] = 30 + runtime.run() + output_view[3, 0] = 40 + + output_tensor_host = output_tensor.to_numpy() + input_view_numpy = input_view.cpu().numpy() + other_view_numpy = other_view.cpu().numpy() + output_view_numpy = output_view.cpu().numpy() + # Check if changes to the views are reflected in the original tensors + print(input_view_numpy) + assert check_diff(input_tensor_host, input_view_numpy, 20, (1, 1)) + assert check_diff(other_tensor_host, other_view_numpy, 30, (0, 0)) + assert check_diff(output_tensor_host, output_view_numpy, 40, (3, 0)) + + runtime.stop() + runtime.reset() diff --git a/python/unittest/test_runtime.py b/python/unittest/test_runtime.py index bd9098fe8..fd34bb96b 100644 --- a/python/unittest/test_runtime.py +++ b/python/unittest/test_runtime.py @@ -4,21 +4,20 @@ import ark import json +empty_plan = json.dumps( + { + "Rank": 0, + "WorldSize": 1, + "NumProcessors": 1, + "NumWarpsPerProcessor": 1, + "TaskInfos": [], + "ProcessorGroups": [], + } +) + def test_runtime_relaunch(): ark.init() - - empty_plan = json.dumps( - { - "Rank": 0, - "WorldSize": 1, - "NumProcessors": 1, - "NumWarpsPerProcessor": 1, - "TaskInfos": [], - "ProcessorGroups": [], - } - ) - with ark.Runtime.get_runtime() as rt: assert rt.launched() == False rt.launch(plan=empty_plan) @@ -28,3 +27,101 @@ def test_runtime_relaunch(): assert rt.launched() == False rt.launch(plan=empty_plan) assert rt.launched() == True + + +def test_multiple_runtime_launch(): + ark.init() 
+ num_runtimes = 5 + for i in range(num_runtimes): + rt = ark.Runtime.get_runtime(i) + assert rt.launched() == False + rt.launch(gpu_id=i, plan=empty_plan) + assert rt.launched() == True + for i in range(num_runtimes): + rt = ark.Runtime.get_runtime(i) + assert rt.launched() == True + ark.Runtime.delete_all_runtimes() + + +def test_stop_runtime(): + ark.init() + rt1 = ark.Runtime.get_runtime(1) + rt1.launch(plan=empty_plan, gpu_id=1) + rt2 = ark.Runtime.get_runtime(2) + rt2.launch(plan=empty_plan, gpu_id=2) + rt1.stop() + rt1.reset() + assert rt1.state == ark.Runtime.State.Init + assert rt2.state == ark.Runtime.State.LaunchedNotRunning + ark.Runtime.delete_all_runtimes() + + +def test_reset_runtime(): + ark.init() + rt1 = ark.Runtime.get_runtime(0) + rt1.launch(plan=empty_plan, gpu_id=1) + rt2 = ark.Runtime.get_runtime(1) + rt2.launch(plan=empty_plan, gpu_id=2) + rt1.reset() + assert rt1.launched() == False + assert rt2.launched() == True + rt1.launch(plan=empty_plan) + assert rt1.launched() == True + ark.Runtime.delete_all_runtimes() + + +def test_multiple_runtimes_complex(): + ark.init() + num_runtimes = 3 + runtime_list = [ark.Runtime.get_runtime(i) for i in range(num_runtimes)] + default_runtime = ark.Runtime.get_runtime() + runtime_list.append(default_runtime) + for i, rt in enumerate(runtime_list): + rt.launch(plan=empty_plan, gpu_id=i) + assert rt.launched() == True + runtime_list[0].stop() + assert runtime_list[0].state == ark.Runtime.State.LaunchedNotRunning + for rt in runtime_list[1:]: + assert rt.launched() == True + runtime_list[1].reset() + assert runtime_list[1].state == ark.Runtime.State.Init + assert runtime_list[0].state == ark.Runtime.State.LaunchedNotRunning + assert runtime_list[2].state == ark.Runtime.State.LaunchedNotRunning + runtime_list[1].launch(plan=empty_plan, gpu_id=1) + for rt in runtime_list: + assert rt.launched() == True + ark.Runtime.delete_all_runtimes() + + +def test_runtime_state_after_reset(): + ark.init() + rt = 
ark.Runtime.get_runtime() + rt.launch(plan=empty_plan) + rt.reset() + assert rt.launched() == False + assert rt.running() == False + ark.Runtime.delete_all_runtimes() + + +def test_see_runtime_statuses(): + ark.init() + num_runtimes = 3 + runtimes = [ark.Runtime.get_runtime(i) for i in range(num_runtimes)] + runtime_statuses = ark.Runtime.see_runtime_statuses() + assert len(runtime_statuses) == num_runtimes + for i in range(num_runtimes): + assert i in runtime_statuses + for i, rt in enumerate(runtimes): + assert runtime_statuses[i] == rt + ark.Runtime.delete_all_runtimes() + + +def test_multiple_runtimes_init(): + ark.init() + runtimes = [ark.Runtime.get_runtime(i) for i in range(3)] + for rt in runtimes: + assert rt.state == ark.Runtime.State.Init + ark.init() + runtimes = ark.Runtime.see_runtime_statuses() + assert len(runtimes) == 0 + ark.Runtime.delete_all_runtimes() From 9a0556bde84a4dd6a76f39155d60957c9165ad52 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 18 Jun 2024 21:30:02 +0000 Subject: [PATCH 09/61] cmake dlpack --- .gitmodules | 4 ++++ ark/CMakeLists.txt | 1 + third_party/CMakeLists.txt | 13 +++++++++++++ third_party/dlpack | 1 + 4 files changed, 19 insertions(+) create mode 160000 third_party/dlpack diff --git a/.gitmodules b/.gitmodules index ced5dcf94..ec484eb61 100644 --- a/.gitmodules +++ b/.gitmodules @@ -17,3 +17,7 @@ [submodule "third_party/json"] path = third_party/json url = https://github.com/nlohmann/json + +[submodule "third_party/dlpack"] + path = third_party/dlpack + url = https://github.com/dmlc/dlpack diff --git a/ark/CMakeLists.txt b/ark/CMakeLists.txt index 4457d3c0b..ce03b65ed 100644 --- a/ark/CMakeLists.txt +++ b/ark/CMakeLists.txt @@ -17,6 +17,7 @@ set(COMMON_LIBS ARK::numa ARK::ibverbs pthread rt) target_include_directories(ark_obj PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include) target_include_directories(ark_obj PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) target_include_directories(ark_obj SYSTEM PRIVATE + ${DLPACK_INCLUDE_DIRS} 
${JSON_INCLUDE_DIRS} ${MSCCLPP_INCLUDE_DIRS} ${IBVERBS_INCLUDE_DIRS} diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt index 75916d962..cc4b5eb5c 100644 --- a/third_party/CMakeLists.txt +++ b/third_party/CMakeLists.txt @@ -35,6 +35,19 @@ if (NOT json_POPULATED) endif() set(JSON_INCLUDE_DIRS ${json_SOURCE_DIR}/include PARENT_SCOPE) +# DLPack +FetchContent_Declare( + dlpack + GIT_REPOSITORY https://github.com/dmlc/dlpack + GIT_TAG v0.8 + SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/dlpack +) +FetchContent_GetProperties(dlpack) +if (NOT dlpack_POPULATED) + FetchContent_Populate(dlpack) +endif() +set(DLPACK_INCLUDE_DIRS ${dlpack_SOURCE_DIR}/include PARENT_SCOPE) + if(USE_CUDA) # Configure CUTLASS FetchContent_Declare( diff --git a/third_party/dlpack b/third_party/dlpack new file mode 160000 index 000000000..365b823ce --- /dev/null +++ b/third_party/dlpack @@ -0,0 +1 @@ +Subproject commit 365b823cedb281cd0240ca601aba9b78771f91a3 From 75f7831b700783e899beaa15f950f125a7520d6c Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 18 Jun 2024 22:38:35 +0000 Subject: [PATCH 10/61] include dlpack for pybind --- python/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index efb9aea3e..bd25d01e6 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -20,3 +20,4 @@ file(GLOB_RECURSE BIND_SOURCES CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/*.c pybind11_add_module(ark_py ${BIND_SOURCES}) set_target_properties(ark_py PROPERTIES OUTPUT_NAME _ark_core) target_link_libraries(ark_py PRIVATE ark_static) +target_include_directories(ark_py SYSTEM PRIVATE ${DLPACK_INCLUDE_DIRS}) From 94b44f20a15c892d5a47e1597d838891ca600553 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Mon, 24 Jun 2024 23:51:22 +0000 Subject: [PATCH 11/61] support d2d copy --- ark/api/executor.cpp | 99 ++++++++++++++++++++---------- ark/include/ark/executor.hpp | 10 ++- python/ark/tensor.py | 42 +++++++++---- 
python/executor_py.cpp | 33 +++++++--- python/unittest/test_conversion.py | 37 ++++++++++- 5 files changed, 162 insertions(+), 59 deletions(-) diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp index a0711bfe8..96e53c8cf 100644 --- a/ark/api/executor.cpp +++ b/ark/api/executor.cpp @@ -147,6 +147,8 @@ class Executor::Impl { const std::string &plan); ~Impl() = default; + int gpu_id() const { return gpu_id_; } + void compile(); void launch(int64_t max_spin_count); void run(int iter); @@ -154,9 +156,10 @@ class Executor::Impl { float stop(int64_t max_spin_count); void barrier(); - void tensor_read(const Tensor tensor, void *data, size_t bytes) const; + void tensor_read(const Tensor tensor, void *data, size_t bytes, + bool is_d2d) const; void tensor_write(const Tensor tensor, const void *data, - size_t bytes) const; + size_t bytes, bool is_d2d) const; DLDeviceType get_device_type() const; DLManagedTensor *get_dl_tensor(const Tensor &tensor) const; @@ -731,57 +734,83 @@ void Executor::Impl::barrier() { } void Executor::Impl::tensor_read(const Tensor tensor, void *data, - size_t bytes) const { + size_t bytes, bool is_d2d) const { GLOG(gpuSetDevice(gpu_id_)); size_t tensor_data_bytes = tensor.shape().nelems() * tensor.data_type().bytes(); - if (bytes < tensor_data_bytes) { - ERR(InvalidUsageError, "Data buffer (", bytes, - ") is smaller than the tensor data (", tensor_data_bytes, ")."); + if (bytes != tensor_data_bytes) { + ERR(InvalidUsageError, "Destination bytes (", bytes, + ") mismatches the tensor data bytes (", tensor_data_bytes, ")."); } - size_t tensor_bytes = - tensor.strides().nelems() * tensor.data_type().bytes(); - void *src = - buffer_->ref(buffer_id_to_offset_.at(tensor.ref()->buffer()->id())); + size_t buffer_id = tensor.ref()->buffer()->id(); + if (buffer_id_to_offset_.find(buffer_id) == buffer_id_to_offset_.end()) { + ERR(NotFoundError, "Invalid buffer ID: ", buffer_id); + } + size_t offset = buffer_id_to_offset_.at(buffer_id); + auto kind = (is_d2d) 
? gpuMemcpyDeviceToDevice : gpuMemcpyDeviceToHost; + void *src = buffer_->ref(offset); if (tensor.strides() == tensor.shape()) { - GLOG(gpuMemcpyAsync(data, src, bytes, gpuMemcpyDeviceToHost, - copy_stream_->get())); - copy_stream_->sync(); + GLOG(gpuMemcpyAsync(data, src, bytes, kind, copy_stream_->get())); } else { + size_t tensor_bytes = + tensor.strides().nelems() * tensor.data_type().bytes(); std::vector tensor_host(tensor_bytes); GLOG(gpuMemcpyAsync(tensor_host.data(), src, tensor_bytes, gpuMemcpyDeviceToHost, copy_stream_->get())); copy_stream_->sync(); - tensor_to_data(tensor_host.data(), static_cast(data), - tensor.shape(), tensor.strides(), tensor.offsets(), - tensor.data_type().bytes()); + if (!is_d2d) { + tensor_to_data(tensor_host.data(), static_cast(data), + tensor.shape(), tensor.strides(), tensor.offsets(), + tensor.data_type().bytes()); + return; + } + // TODO: convert data layout on the device directly + std::vector data_host(bytes); + tensor_to_data(tensor_host.data(), data_host.data(), + tensor.shape(), tensor.strides(), tensor.offsets(), + tensor.data_type().bytes()); + GLOG(gpuMemcpyAsync(data, data_host.data(), bytes, + gpuMemcpyHostToDevice, copy_stream_->get())); } + copy_stream_->sync(); } void Executor::Impl::tensor_write(const Tensor tensor, const void *data, - size_t bytes) const { + size_t bytes, bool is_d2d) const { GLOG(gpuSetDevice(gpu_id_)); size_t tensor_data_bytes = tensor.shape().nelems() * tensor.data_type().bytes(); - if (bytes < tensor_data_bytes) { - ERR(InvalidUsageError, "Data buffer (", bytes, - ") is smaller than the tensor data (", tensor_data_bytes, ")."); + if (bytes != tensor_data_bytes) { + ERR(InvalidUsageError, "Source bytes (", bytes, + ") mismatches the tensor data bytes (", tensor_data_bytes, ")."); + } + size_t buffer_id = tensor.ref()->buffer()->id(); + if (buffer_id_to_offset_.find(buffer_id) == buffer_id_to_offset_.end()) { + ERR(NotFoundError, "Invalid buffer ID: ", buffer_id); } + size_t offset = 
buffer_id_to_offset_.at(buffer_id); size_t tensor_bytes = tensor.strides().nelems() * tensor.data_type().bytes(); - void *dst = - buffer_->ref(buffer_id_to_offset_.at(tensor.ref()->buffer()->id())); + auto kind = (is_d2d) ? gpuMemcpyDeviceToDevice : gpuMemcpyHostToDevice; + void *dst = buffer_->ref(offset); if (tensor.strides() == tensor.shape()) { - GLOG(gpuMemcpyAsync(dst, data, tensor_bytes, gpuMemcpyHostToDevice, - copy_stream_->get())); + GLOG(gpuMemcpyAsync(dst, data, tensor_bytes, kind, copy_stream_->get())); } else { std::vector tensor_host(tensor_bytes); - GLOG(gpuMemcpyAsync(tensor_host.data(), dst, tensor_bytes, - gpuMemcpyDeviceToHost, copy_stream_->get())); - copy_stream_->sync(); - data_to_tensor(tensor_host.data(), static_cast(data), - tensor.shape(), tensor.strides(), tensor.offsets(), - tensor.data_type().bytes()); + if (!is_d2d) { + data_to_tensor(tensor_host.data(), static_cast(data), + tensor.shape(), tensor.strides(), tensor.offsets(), + tensor.data_type().bytes()); + } else { + // TODO: convert data layout on the device directly + std::vector tmp(bytes); + GLOG(gpuMemcpyAsync(tmp.data(), data, bytes, + gpuMemcpyDeviceToHost, copy_stream_->get())); + copy_stream_->sync(); + data_to_tensor(tensor_host.data(), tmp.data(), + tensor.shape(), tensor.strides(), tensor.offsets(), + tensor.data_type().bytes()); + } GLOG(gpuMemcpyAsync(dst, tensor_host.data(), tensor_bytes, gpuMemcpyHostToDevice, copy_stream_->get())); } @@ -883,6 +912,8 @@ Executor::Executor(int rank, int world_size, int gpu_id, Executor::~Executor() = default; +int Executor::gpu_id() const { return impl_->gpu_id(); } + void Executor::compile() { impl_->compile(); } void Executor::launch(int64_t max_spin_count) { impl_->launch(max_spin_count); } @@ -902,13 +933,13 @@ void Executor::destroy() { impl_.reset(nullptr); } bool Executor::destroyed() const { return impl_.get() == nullptr; } void Executor::tensor_read(const Tensor tensor, void *data, - size_t bytes) const { - 
impl_->tensor_read(tensor, data, bytes); + size_t bytes, bool is_d2d) const { + impl_->tensor_read(tensor, data, bytes, is_d2d); } void Executor::tensor_write(const Tensor tensor, const void *data, - size_t bytes) const { - impl_->tensor_write(tensor, data, bytes); + size_t bytes, bool is_d2d) const { + impl_->tensor_write(tensor, data, bytes, is_d2d); } DLDeviceType Executor::get_device_type() const { diff --git a/ark/include/ark/executor.hpp b/ark/include/ark/executor.hpp index 54c49cd29..a5d6f0273 100644 --- a/ark/include/ark/executor.hpp +++ b/ark/include/ark/executor.hpp @@ -23,6 +23,9 @@ class Executor { ~Executor(); + /// Return the GPU ID. + int gpu_id() const; + /// Compile the model. This must be called before `launch()`. void compile(); @@ -59,10 +62,11 @@ class Executor { data.size() * sizeof(T)); } - void tensor_read(const Tensor tensor, void *data, size_t bytes) const; + void tensor_read(const Tensor tensor, void *data, size_t bytes, + bool is_d2d = false) const; - void tensor_write(const Tensor tensor, const void *data, - size_t bytes) const; + void tensor_write(const Tensor tensor, const void *data, size_t bytes, + bool is_d2d = false) const; DLManagedTensor *get_dl_tensor(const Tensor &tensor) const; diff --git a/python/ark/tensor.py b/python/ark/tensor.py index 00e266929..eff1bf20e 100644 --- a/python/ark/tensor.py +++ b/python/ark/tensor.py @@ -77,10 +77,17 @@ def to_numpy(self, ndarray: np.ndarray = None) -> np.ndarray: an empty numpy array without the data buffer will be returned. """ np_type = self.dtype().to_numpy() + if np_type is None: + raise ValueError( + f"Tensor data type {self.dtype().__name__} is not supported by numpy." + ) rt = Runtime.get_runtime(self.runtime_id) if not rt.launched(): - return np.ndarray(self.shape(), dtype=np_type, buffer=None) - if ndarray is None: + raise RuntimeError( + "Tensor is not allocated yet. `Tensor.to_numpy()` is " + "usable only after you call `Runtime.launch()`." 
+ ) + elif ndarray is None: ndarray = np.zeros(self.shape(), dtype=np_type) elif not ndarray.flags["C_CONTIGUOUS"]: raise ValueError("ndarray is not contiguous in memory") @@ -99,9 +106,18 @@ def to_torch( """ """ if _no_torch: raise ImportError("torch is not available") + rt = Runtime.get_runtime(self.runtime_id) + if not rt.launched(): + raise RuntimeError( + "Tensor is not allocated yet. `Tensor.to_torch()` is " + "usable only after you call `Runtime.launch()`." + ) torch_type = self.dtype().to_torch() if tensor is None: - return torch.from_numpy(self.to_numpy()) + dev_name = f"cuda:{rt.executor.gpu_id()}" + tensor = torch.zeros( + self.shape(), dtype=torch_type, device=torch.device(dev_name) + ) elif tensor.shape != self.shape(): raise ValueError("torch tensor shape does not match the tensor") elif tensor.dtype != torch_type: @@ -110,7 +126,10 @@ def to_torch( raise ValueError("torch tensor is not contiguous in memory") elif tensor.numel() != self.nelems(): raise ValueError("torch tensor size does not match the tensor") - tensor.copy_(torch.from_numpy(self.to_numpy(self.runtime_id))) + tensor_bytes = self.nelems() * self.dtype().element_size() + rt.executor.tensor_read( + self._tensor, tensor.data_ptr(), tensor_bytes, True + ) return tensor def get_torch_view(self) -> torch.Tensor: @@ -163,7 +182,8 @@ def from_torch(tensor: torch.Tensor): def copy(self, data: Union[np.ndarray, torch.Tensor]) -> "Tensor": """ - Copies the tensor from a host numpy array to the device. + Copies data into this tensor. The data type may differ, + but the size must match. """ rt = Runtime.get_runtime(self.runtime_id) if not rt.launched(): @@ -171,24 +191,22 @@ def copy(self, data: Union[np.ndarray, torch.Tensor]) -> "Tensor": "Tensor is not allocated yet. `Tensor.from_numpy()` is " "usable only after you call `Runtime.launch()`." 
) + tensor_bytes = self.nelems() * self.dtype().element_size() if isinstance(data, torch.Tensor): - if data.dtype != self.dtype().to_torch(): - raise ValueError("data dtype does not match the tensor") if not data.is_contiguous(): data = data.contiguous() - if data.numel() != self.nelems(): + if data.numel() * data.element_size() != tensor_bytes: raise ValueError("data size does not match the tensor") rt.executor.tensor_write( self._tensor, data.data_ptr(), - data.numel() * data.element_size(), + tensor_bytes, + data.device.type == "cuda", ) elif isinstance(data, np.ndarray): - if data.dtype != self.dtype().to_numpy(): - raise ValueError("data dtype does not match the tensor") if not data.flags["C_CONTIGUOUS"]: data = np.ascontiguousarray(data) - if data.nbytes != self.nelems() * self.dtype().element_size(): + if data.nbytes != tensor_bytes: raise ValueError("data size does not match the tensor") rt.executor.tensor_write(self._tensor, data) else: diff --git a/python/executor_py.cpp b/python/executor_py.cpp index 59bee5a9b..b6cf8a7a8 100644 --- a/python/executor_py.cpp +++ b/python/executor_py.cpp @@ -15,19 +15,24 @@ static void tensor_write(ark::Executor *exe, const ark::Tensor &tensor, py::buffer host_buffer) { py::buffer_info info = host_buffer.request(); exe->tensor_write(tensor, reinterpret_cast(info.ptr), - info.size * info.itemsize); + info.size * info.itemsize, false); } static void tensor_write(ark::Executor *exe, const ark::Tensor &tensor, - size_t host_address, size_t bytes) { - exe->tensor_write(tensor, reinterpret_cast(host_address), bytes); + size_t address, size_t bytes, bool is_d2d) { + exe->tensor_write(tensor, reinterpret_cast(address), bytes, is_d2d); } static void tensor_read(ark::Executor *exe, const ark::Tensor &tensor, py::buffer host_buffer) { py::buffer_info info = host_buffer.request(); exe->tensor_read(tensor, reinterpret_cast(info.ptr), - info.size * info.itemsize); + info.size * info.itemsize, false); +} + +static void 
tensor_read(ark::Executor *exe, const ark::Tensor &tensor, + size_t address, size_t bytes, bool is_d2d) { + exe->tensor_read(tensor, reinterpret_cast(address), bytes, is_d2d); } DLManagedTensor *to_dlpack(ark::Executor &exe, const ark::Tensor &tensor) { @@ -59,6 +64,7 @@ void register_executor(py::module &m) { py::init(), py::arg("rank"), py::arg("world_size"), py::arg("gpu_id"), py::arg("name"), py::arg("plan")) + .def("gpu_id", &ark::Executor::gpu_id) .def("compile", &ark::Executor::compile) .def("launch", &ark::Executor::launch, py::arg("max_spin_count") = -1) .def("run", &ark::Executor::run, py::arg("iter")) @@ -67,7 +73,16 @@ void register_executor(py::module &m) { .def("barrier", &ark::Executor::barrier) .def("destroy", &ark::Executor::destroy) .def("destroyed", &ark::Executor::destroyed) - .def("tensor_read", &tensor_read, py::arg("tensor"), py::arg("data")) + .def( + "tensor_read", + py::overload_cast( + &tensor_read), + py::arg("tensor"), py::arg("data")) + .def("tensor_read", + py::overload_cast(&tensor_read), + py::arg("tensor"), py::arg("address"), py::arg("bytes"), + py::arg("is_d2d")) .def( "tensor_write", py::overload_cast( @@ -75,8 +90,8 @@ void register_executor(py::module &m) { py::arg("tensor"), py::arg("data")) .def("tensor_write", py::overload_cast(&tensor_write), - py::arg("tensor"), py::arg("address"), py::arg("bytes")) - .def("get_dl_tensor", &to_dlpack_capsule), - py::arg("tensor"); + size_t, bool>(&tensor_write), + py::arg("tensor"), py::arg("address"), py::arg("bytes"), + py::arg("is_d2d")) + .def("get_dl_tensor", &to_dlpack_capsule); } diff --git a/python/unittest/test_conversion.py b/python/unittest/test_conversion.py index 8f941a033..5befa1c34 100644 --- a/python/unittest/test_conversion.py +++ b/python/unittest/test_conversion.py @@ -1,7 +1,14 @@ -import torch +import pytest import numpy as np import ark +try: + import torch + + _no_torch = False +except ImportError: + _no_torch = True + def initialize_tensor(dimensions, dtype): 
tensor = ark.tensor(dimensions, dtype) @@ -11,6 +18,8 @@ def initialize_tensor(dimensions, dtype): # Test function to validate the integrity of the PyTorch view of the ARK tensor, # including its data and attributes such as shape and data type. +@pytest.mark.parametrize("num_dims,size", [(1, 5), (1, 1024), (2, 5), (2, 32)]) +@pytest.mark.parametrize("dtype", [ark.fp16, ark.fp32]) def test_values_fixed_dims(num_dims: int, size: int, dtype: ark.DataType): ark.init() dimensions = [size] * num_dims @@ -59,6 +68,7 @@ def check_diff(input_tensor_host, input_view_numpy, value, index): # Test function to check if changes to the torch views are reflected in the original tensors +@pytest.mark.parametrize("dtype", [ark.fp16, ark.fp32]) def test_aliasing(dtype: ark.DataType): ark.init() dimensions = [4, 4] @@ -91,3 +101,28 @@ def test_aliasing(dtype: ark.DataType): runtime.stop() runtime.reset() + + +def test_conversion_torch(): + if _no_torch: + pytest.skip("PyTorch not available") + + dimensions = [4, 4] + + ark.init() + t = ark.constant(7, dimensions) + + with ark.Runtime() as rt: + rt.launch() + + torch_tensor = t.to_torch() + + assert torch_tensor.shape == (4, 4) + assert torch_tensor.dtype == torch.float32 + assert torch_tensor.device.type == "cuda" + assert torch.all(torch_tensor == 0) + + rt.run() + + torch_tensor = t.to_torch() + assert torch.all(torch_tensor == 7) From 20c23f34b17ecfa24d96ffa8799c3c173b468c53 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Mon, 24 Jun 2024 23:58:59 +0000 Subject: [PATCH 12/61] lint --- ark/api/executor.cpp | 46 +++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp index 96e53c8cf..ae3e5f499 100644 --- a/ark/api/executor.cpp +++ b/ark/api/executor.cpp @@ -158,8 +158,8 @@ class Executor::Impl { void tensor_read(const Tensor tensor, void *data, size_t bytes, bool is_d2d) const; - void tensor_write(const Tensor tensor, const void *data, - 
size_t bytes, bool is_d2d) const; + void tensor_write(const Tensor tensor, const void *data, size_t bytes, + bool is_d2d) const; DLDeviceType get_device_type() const; DLManagedTensor *get_dl_tensor(const Tensor &tensor) const; @@ -733,8 +733,8 @@ void Executor::Impl::barrier() { } } -void Executor::Impl::tensor_read(const Tensor tensor, void *data, - size_t bytes, bool is_d2d) const { +void Executor::Impl::tensor_read(const Tensor tensor, void *data, size_t bytes, + bool is_d2d) const { GLOG(gpuSetDevice(gpu_id_)); size_t tensor_data_bytes = tensor.shape().nelems() * tensor.data_type().bytes(); @@ -760,15 +760,15 @@ void Executor::Impl::tensor_read(const Tensor tensor, void *data, copy_stream_->sync(); if (!is_d2d) { tensor_to_data(tensor_host.data(), static_cast(data), - tensor.shape(), tensor.strides(), tensor.offsets(), - tensor.data_type().bytes()); + tensor.shape(), tensor.strides(), tensor.offsets(), + tensor.data_type().bytes()); return; } // TODO: convert data layout on the device directly std::vector data_host(bytes); - tensor_to_data(tensor_host.data(), data_host.data(), - tensor.shape(), tensor.strides(), tensor.offsets(), - tensor.data_type().bytes()); + tensor_to_data(tensor_host.data(), data_host.data(), tensor.shape(), + tensor.strides(), tensor.offsets(), + tensor.data_type().bytes()); GLOG(gpuMemcpyAsync(data, data_host.data(), bytes, gpuMemcpyHostToDevice, copy_stream_->get())); } @@ -794,22 +794,24 @@ void Executor::Impl::tensor_write(const Tensor tensor, const void *data, auto kind = (is_d2d) ? 
gpuMemcpyDeviceToDevice : gpuMemcpyHostToDevice; void *dst = buffer_->ref(offset); if (tensor.strides() == tensor.shape()) { - GLOG(gpuMemcpyAsync(dst, data, tensor_bytes, kind, copy_stream_->get())); + GLOG( + gpuMemcpyAsync(dst, data, tensor_bytes, kind, copy_stream_->get())); } else { std::vector tensor_host(tensor_bytes); if (!is_d2d) { - data_to_tensor(tensor_host.data(), static_cast(data), - tensor.shape(), tensor.strides(), tensor.offsets(), - tensor.data_type().bytes()); + data_to_tensor(tensor_host.data(), + static_cast(data), tensor.shape(), + tensor.strides(), tensor.offsets(), + tensor.data_type().bytes()); } else { // TODO: convert data layout on the device directly std::vector tmp(bytes); - GLOG(gpuMemcpyAsync(tmp.data(), data, bytes, - gpuMemcpyDeviceToHost, copy_stream_->get())); + GLOG(gpuMemcpyAsync(tmp.data(), data, bytes, gpuMemcpyDeviceToHost, + copy_stream_->get())); copy_stream_->sync(); - data_to_tensor(tensor_host.data(), tmp.data(), - tensor.shape(), tensor.strides(), tensor.offsets(), - tensor.data_type().bytes()); + data_to_tensor(tensor_host.data(), tmp.data(), tensor.shape(), + tensor.strides(), tensor.offsets(), + tensor.data_type().bytes()); } GLOG(gpuMemcpyAsync(dst, tensor_host.data(), tensor_bytes, gpuMemcpyHostToDevice, copy_stream_->get())); @@ -932,13 +934,13 @@ void Executor::destroy() { impl_.reset(nullptr); } bool Executor::destroyed() const { return impl_.get() == nullptr; } -void Executor::tensor_read(const Tensor tensor, void *data, - size_t bytes, bool is_d2d) const { +void Executor::tensor_read(const Tensor tensor, void *data, size_t bytes, + bool is_d2d) const { impl_->tensor_read(tensor, data, bytes, is_d2d); } -void Executor::tensor_write(const Tensor tensor, const void *data, - size_t bytes, bool is_d2d) const { +void Executor::tensor_write(const Tensor tensor, const void *data, size_t bytes, + bool is_d2d) const { impl_->tensor_write(tensor, data, bytes, is_d2d); } From ebe85604cb7249b4e0d7d6c3eed69758c4c6825f Mon 
Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 25 Jun 2024 01:21:42 +0000 Subject: [PATCH 13/61] Seperate DLPack from C++ interfaces --- ark/api/executor.cpp | 127 +++++------------------------------ ark/include/ark/executor.hpp | 8 +-- python/executor_py.cpp | 90 ++++++++++++++++++++++++- 3 files changed, 106 insertions(+), 119 deletions(-) diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp index ae3e5f499..ebfa7016d 100644 --- a/ark/api/executor.cpp +++ b/ark/api/executor.cpp @@ -156,12 +156,12 @@ class Executor::Impl { float stop(int64_t max_spin_count); void barrier(); + uintptr_t tensor_address(const Tensor tensor) const; + void tensor_read(const Tensor tensor, void *data, size_t bytes, bool is_d2d) const; void tensor_write(const Tensor tensor, const void *data, size_t bytes, bool is_d2d) const; - DLDeviceType get_device_type() const; - DLManagedTensor *get_dl_tensor(const Tensor &tensor) const; private: void init_communicator(); @@ -733,6 +733,15 @@ void Executor::Impl::barrier() { } } +uintptr_t Executor::Impl::tensor_address(const Tensor tensor) const { + size_t buffer_id = tensor.ref()->buffer()->id(); + if (buffer_id_to_offset_.find(buffer_id) == buffer_id_to_offset_.end()) { + ERR(NotFoundError, "Invalid buffer ID: ", buffer_id); + } + size_t offset = buffer_id_to_offset_.at(buffer_id); + return reinterpret_cast(buffer_->ref(offset)); +} + void Executor::Impl::tensor_read(const Tensor tensor, void *data, size_t bytes, bool is_d2d) const { GLOG(gpuSetDevice(gpu_id_)); @@ -742,13 +751,8 @@ void Executor::Impl::tensor_read(const Tensor tensor, void *data, size_t bytes, ERR(InvalidUsageError, "Destination bytes (", bytes, ") mismatches the tensor data bytes (", tensor_data_bytes, ")."); } - size_t buffer_id = tensor.ref()->buffer()->id(); - if (buffer_id_to_offset_.find(buffer_id) == buffer_id_to_offset_.end()) { - ERR(NotFoundError, "Invalid buffer ID: ", buffer_id); - } - size_t offset = buffer_id_to_offset_.at(buffer_id); auto kind = (is_d2d) ? 
gpuMemcpyDeviceToDevice : gpuMemcpyDeviceToHost; - void *src = buffer_->ref(offset); + void *src = reinterpret_cast(tensor_address(tensor)); if (tensor.strides() == tensor.shape()) { GLOG(gpuMemcpyAsync(data, src, bytes, kind, copy_stream_->get())); } else { @@ -784,15 +788,10 @@ void Executor::Impl::tensor_write(const Tensor tensor, const void *data, ERR(InvalidUsageError, "Source bytes (", bytes, ") mismatches the tensor data bytes (", tensor_data_bytes, ")."); } - size_t buffer_id = tensor.ref()->buffer()->id(); - if (buffer_id_to_offset_.find(buffer_id) == buffer_id_to_offset_.end()) { - ERR(NotFoundError, "Invalid buffer ID: ", buffer_id); - } - size_t offset = buffer_id_to_offset_.at(buffer_id); size_t tensor_bytes = tensor.strides().nelems() * tensor.data_type().bytes(); auto kind = (is_d2d) ? gpuMemcpyDeviceToDevice : gpuMemcpyHostToDevice; - void *dst = buffer_->ref(offset); + void *dst = reinterpret_cast(tensor_address(tensor)); if (tensor.strides() == tensor.shape()) { GLOG( gpuMemcpyAsync(dst, data, tensor_bytes, kind, copy_stream_->get())); @@ -819,94 +818,6 @@ void Executor::Impl::tensor_write(const Tensor tensor, const void *data, copy_stream_->sync(); } -DLDeviceType Executor::Impl::get_device_type() const { -#if defined(ARK_CUDA) - return kDLCUDA; -#elif defined(ARK_ROCM) - return kDLROCM; -#else - return kDLCPU; -#endif -} - -DLDataType get_dl_dtype(const DataType &ark_data_type) { - DLDataType dl_data_type; - dl_data_type.lanes = 1; - if (ark_data_type == FP32) { - dl_data_type.code = kDLFloat; - dl_data_type.bits = 32; - } else if (ark_data_type == FP16) { - dl_data_type.code = kDLFloat; - dl_data_type.bits = 16; - } else if (ark_data_type == BF16) { - dl_data_type.code = kDLBfloat; - dl_data_type.bits = 16; - } else if (ark_data_type == INT32) { - dl_data_type.code = kDLInt; - dl_data_type.bits = 32; - } else if (ark_data_type == UINT32) { - dl_data_type.code = kDLUInt; - dl_data_type.bits = 32; - } else if (ark_data_type == INT8) { - 
dl_data_type.code = kDLInt; - dl_data_type.bits = 8; - } else if (ark_data_type == UINT8) { - dl_data_type.code = kDLUInt; - dl_data_type.bits = 8; - } else if (ark_data_type == BYTE) { - dl_data_type.code = kDLUInt; - dl_data_type.bits = 8; - } else { - ERR(InvalidUsageError, "Unsupported data type"); - } - return dl_data_type; -} - -DLManagedTensor *Executor::Impl::get_dl_tensor(const Tensor &tensor) const { - DLTensor dl_tensor; - dl_tensor.data = - buffer_->ref(buffer_id_to_offset_.at(tensor.ref()->buffer()->id())); - size_t offset_in_elements = - tensor.offsets().is_no_dim() ? 0 : tensor.offsets().vector()[0]; - dl_tensor.byte_offset = offset_in_elements * tensor.data_type().bytes(); - dl_tensor.device.device_type = get_device_type(); - dl_tensor.device.device_id = static_cast(gpu_id_); - dl_tensor.ndim = static_cast(tensor.shape().ndims()); - dl_tensor.dtype = get_dl_dtype(tensor.data_type()); - - dl_tensor.shape = - tensor.shape().is_no_dim() ? nullptr : new int64_t[dl_tensor.ndim]; - dl_tensor.strides = - tensor.strides().is_no_dim() ? 
nullptr : new int64_t[dl_tensor.ndim]; - auto shape = tensor.shape(); - if (dl_tensor.shape) { - for (int i = 0; i < dl_tensor.ndim; ++i) { - dl_tensor.shape[i] = shape[i]; - } - } - if (dl_tensor.strides) { - dl_tensor.strides[dl_tensor.ndim - 1] = 1; - for (int i = dl_tensor.ndim - 2; i >= 0; --i) { - dl_tensor.strides[i] = - dl_tensor.shape[i + 1] * dl_tensor.strides[i + 1]; - } - } - DLManagedTensor *dl_managed_tensor = new DLManagedTensor(); - dl_managed_tensor->dl_tensor = dl_tensor; - dl_managed_tensor->manager_ctx = nullptr; - dl_managed_tensor->deleter = [](DLManagedTensor *self) { - if (self->dl_tensor.shape) { - delete[] self->dl_tensor.shape; - self->dl_tensor.shape = nullptr; - } - if (self->dl_tensor.strides) { - delete[] self->dl_tensor.strides; - self->dl_tensor.strides = nullptr; - } - }; - return dl_managed_tensor; -} - Executor::Executor(int rank, int world_size, int gpu_id, const std::string &name, const std::string &plan) : impl_(std::make_unique(rank, world_size, gpu_id, name, @@ -934,6 +845,10 @@ void Executor::destroy() { impl_.reset(nullptr); } bool Executor::destroyed() const { return impl_.get() == nullptr; } +uintptr_t Executor::tensor_address(const Tensor tensor) const { + return impl_->tensor_address(tensor); +} + void Executor::tensor_read(const Tensor tensor, void *data, size_t bytes, bool is_d2d) const { impl_->tensor_read(tensor, data, bytes, is_d2d); @@ -944,14 +859,6 @@ void Executor::tensor_write(const Tensor tensor, const void *data, size_t bytes, impl_->tensor_write(tensor, data, bytes, is_d2d); } -DLDeviceType Executor::get_device_type() const { - return impl_->get_device_type(); -} - -DLManagedTensor *Executor::get_dl_tensor(const Tensor &tensor) const { - return impl_->get_dl_tensor(tensor); -} - DefaultExecutor::DefaultExecutor(const Model &model, int gpu_id, const std::string &name) : Executor( diff --git a/ark/include/ark/executor.hpp b/ark/include/ark/executor.hpp index a5d6f0273..b8cdaf273 100644 --- 
a/ark/include/ark/executor.hpp +++ b/ark/include/ark/executor.hpp @@ -4,8 +4,6 @@ #ifndef ARK_EXECUTOR_HPP #define ARK_EXECUTOR_HPP -#include - #include #include #include @@ -50,6 +48,8 @@ class Executor { bool destroyed() const; + uintptr_t tensor_address(const Tensor tensor) const; + template void tensor_read(const Tensor tensor, std::vector &data) const { tensor_read(tensor, reinterpret_cast(data.data()), @@ -68,10 +68,6 @@ class Executor { void tensor_write(const Tensor tensor, const void *data, size_t bytes, bool is_d2d = false) const; - DLManagedTensor *get_dl_tensor(const Tensor &tensor) const; - - DLDeviceType get_device_type() const; - private: class Impl; std::unique_ptr impl_; diff --git a/python/executor_py.cpp b/python/executor_py.cpp index b6cf8a7a8..e5ab4f964 100644 --- a/python/executor_py.cpp +++ b/python/executor_py.cpp @@ -9,6 +9,7 @@ #include #include #include +#include namespace py = pybind11; static void tensor_write(ark::Executor *exe, const ark::Tensor &tensor, @@ -35,9 +36,92 @@ static void tensor_read(ark::Executor *exe, const ark::Tensor &tensor, exe->tensor_read(tensor, reinterpret_cast(address), bytes, is_d2d); } -DLManagedTensor *to_dlpack(ark::Executor &exe, const ark::Tensor &tensor) { - DLManagedTensor *dl_tensor = exe.get_dl_tensor(tensor); - return dl_tensor; +static DLDataType get_dl_dtype(const ark::DataType &ark_data_type) { + DLDataType dl_data_type; + dl_data_type.lanes = 1; + if (ark_data_type == ark::FP32) { + dl_data_type.code = kDLFloat; + dl_data_type.bits = 32; + } else if (ark_data_type == ark::FP16) { + dl_data_type.code = kDLFloat; + dl_data_type.bits = 16; + } else if (ark_data_type == ark::BF16) { + dl_data_type.code = kDLBfloat; + dl_data_type.bits = 16; + } else if (ark_data_type == ark::INT32) { + dl_data_type.code = kDLInt; + dl_data_type.bits = 32; + } else if (ark_data_type == ark::UINT32) { + dl_data_type.code = kDLUInt; + dl_data_type.bits = 32; + } else if (ark_data_type == ark::INT8) { + dl_data_type.code 
= kDLInt; + dl_data_type.bits = 8; + } else if (ark_data_type == ark::UINT8) { + dl_data_type.code = kDLUInt; + dl_data_type.bits = 8; + } else if (ark_data_type == ark::BYTE) { + dl_data_type.code = kDLUInt; + dl_data_type.bits = 8; + } else { + throw std::runtime_error("unexpected error"); + } + return dl_data_type; +} + +static DLDeviceType get_device_type() { +#if defined(ARK_CUDA) + return kDLCUDA; +#elif defined(ARK_ROCM) + return kDLROCM; +#else + return kDLCPU; +#endif +} + +static DLManagedTensor *to_dlpack(ark::Executor &exe, + const ark::Tensor &tensor) { + DLTensor dl_tensor; + dl_tensor.data = reinterpret_cast(exe.tensor_address(tensor)); + size_t offset_in_elements = + tensor.offsets().is_no_dim() ? 0 : tensor.offsets().vector()[0]; + dl_tensor.byte_offset = offset_in_elements * tensor.data_type().bytes(); + dl_tensor.device.device_type = get_device_type(); + dl_tensor.device.device_id = static_cast(exe.gpu_id()); + dl_tensor.ndim = static_cast(tensor.shape().ndims()); + dl_tensor.dtype = get_dl_dtype(tensor.data_type()); + + dl_tensor.shape = + tensor.shape().is_no_dim() ? nullptr : new int64_t[dl_tensor.ndim]; + dl_tensor.strides = + tensor.strides().is_no_dim() ? 
nullptr : new int64_t[dl_tensor.ndim]; + auto shape = tensor.shape(); + if (dl_tensor.shape) { + for (int i = 0; i < dl_tensor.ndim; ++i) { + dl_tensor.shape[i] = shape[i]; + } + } + if (dl_tensor.strides) { + dl_tensor.strides[dl_tensor.ndim - 1] = 1; + for (int i = dl_tensor.ndim - 2; i >= 0; --i) { + dl_tensor.strides[i] = + dl_tensor.shape[i + 1] * dl_tensor.strides[i + 1]; + } + } + DLManagedTensor *dl_managed_tensor = new DLManagedTensor(); + dl_managed_tensor->dl_tensor = dl_tensor; + dl_managed_tensor->manager_ctx = nullptr; + dl_managed_tensor->deleter = [](DLManagedTensor *self) { + if (self->dl_tensor.shape) { + delete[] self->dl_tensor.shape; + self->dl_tensor.shape = nullptr; + } + if (self->dl_tensor.strides) { + delete[] self->dl_tensor.strides; + self->dl_tensor.strides = nullptr; + } + }; + return dl_managed_tensor; } void free_capsule(PyObject *capsule) { From 08c9b899c22b759a6f4f194b7932f48d08eeb8f4 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 25 Jun 2024 01:30:50 +0000 Subject: [PATCH 14/61] Update workflow trigger --- .github/workflows/ut-cuda.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/ut-cuda.yml b/.github/workflows/ut-cuda.yml index 5a78818ff..918c1a4a8 100644 --- a/.github/workflows/ut-cuda.yml +++ b/.github/workflows/ut-cuda.yml @@ -7,8 +7,7 @@ on: pull_request: branches: - main - types: - - ready_for_review + types: [opened, synchronize, reopened, ready_for_review] jobs: UnitTest: From 1fa08afa36010116cdcd6d89e64db104f3fa23d1 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 25 Jun 2024 20:53:29 +0000 Subject: [PATCH 15/61] expose exceptions --- ark/api/dims.cpp | 1 - ark/include/ark.hpp | 1 + ark/{ => include/ark}/error.hpp | 15 ++++++++++----- ark/logging.h | 2 +- python/ark/__init__.py | 12 ++++++++++++ python/ark/error.py | 12 ++++++++++++ python/ark_py.cpp | 2 ++ python/error_py.cpp | 25 +++++++++++++++++++++++++ python/unittest/test_error.py | 12 ++++++++++++ 9 files 
changed, 75 insertions(+), 7 deletions(-) rename ark/{ => include/ark}/error.hpp (70%) create mode 100644 python/ark/error.py create mode 100644 python/error_py.cpp create mode 100644 python/unittest/test_error.py diff --git a/ark/api/dims.cpp b/ark/api/dims.cpp index a2830a060..a1f03b426 100644 --- a/ark/api/dims.cpp +++ b/ark/api/dims.cpp @@ -5,7 +5,6 @@ #include -#include "error.hpp" #include "logging.h" namespace ark { diff --git a/ark/include/ark.hpp b/ark/include/ark.hpp index a7b2f7f70..2ca796172 100644 --- a/ark/include/ark.hpp +++ b/ark/include/ark.hpp @@ -10,6 +10,7 @@ #include #include +#include #include #include #include diff --git a/ark/error.hpp b/ark/include/ark/error.hpp similarity index 70% rename from ark/error.hpp rename to ark/include/ark/error.hpp index e08acd975..78d02cab3 100644 --- a/ark/error.hpp +++ b/ark/include/ark/error.hpp @@ -1,17 +1,21 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -#ifndef ARK_ERROR_HPP_ -#define ARK_ERROR_HPP_ +#ifndef ARK_ERROR_HPP +#define ARK_ERROR_HPP #include #include namespace ark { -class BaseError : public std::runtime_error { +class BaseError : public std::exception { + private: + std::string msg_; + public: - BaseError(const std::string &msg) : std::runtime_error(msg) {} + BaseError(const std::string &msg) : msg_(msg) {} + const char *what() const noexcept override { return msg_.c_str(); } }; #define REGISTER_ERROR_TYPE(_name) \ @@ -20,6 +24,7 @@ class BaseError : public std::runtime_error { _name(const std::string &msg) : BaseError(msg) {} \ }; +REGISTER_ERROR_TYPE(InternalError) REGISTER_ERROR_TYPE(InvalidUsageError) REGISTER_ERROR_TYPE(NotFoundError) REGISTER_ERROR_TYPE(ModelError) @@ -32,4 +37,4 @@ REGISTER_ERROR_TYPE(UnitTestError) } // namespace ark -#endif // ARK_ERROR_HPP_ +#endif // ARK_ERROR_HPP diff --git a/ark/logging.h b/ark/logging.h index d29793ff7..6eb8aaf91 100644 --- a/ark/logging.h +++ b/ark/logging.h @@ -8,7 +8,7 @@ #include #include -#include "error.hpp" 
+#include "ark/error.hpp" namespace ark { diff --git a/python/ark/__init__.py b/python/ark/__init__.py index 2a4d164e4..3d162c3e4 100644 --- a/python/ark/__init__.py +++ b/python/ark/__init__.py @@ -91,3 +91,15 @@ def set_world_size(world_size): ones, zeros, ) +from .error import ( + InternalError, + InvalidUsageError, + NotFoundError, + ModelError, + SchedulerError, + ExecutorError, + SystemError, + GpuError, + RuntimeError, +) + diff --git a/python/ark/error.py b/python/ark/error.py new file mode 100644 index 000000000..d3ac3aee8 --- /dev/null +++ b/python/ark/error.py @@ -0,0 +1,12 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +from _ark_core import _InternalError as InternalError +from _ark_core import _InvalidUsageError as InvalidUsageError +from _ark_core import _NotFoundError as NotFoundError +from _ark_core import _ModelError as ModelError +from _ark_core import _SchedulerError as SchedulerError +from _ark_core import _ExecutorError as ExecutorError +from _ark_core import _SystemError as SystemError +from _ark_core import _GpuError as GpuError +from _ark_core import _RuntimeError as RuntimeError diff --git a/python/ark_py.cpp b/python/ark_py.cpp index 35c3b21c3..1bc4255d6 100644 --- a/python/ark_py.cpp +++ b/python/ark_py.cpp @@ -9,6 +9,7 @@ namespace py = pybind11; extern void register_data_type(py::module &m); extern void register_dims(py::module &m); +extern void register_error(py::module &m); extern void register_executor(py::module &m); extern void register_init(py::module &m); extern void register_model_graph(py::module &m); @@ -23,6 +24,7 @@ PYBIND11_MODULE(_ark_core, m) { register_data_type(m); register_dims(m); + register_error(m); register_executor(m); register_init(m); register_model_graph(m); diff --git a/python/error_py.cpp b/python/error_py.cpp new file mode 100644 index 000000000..863d8423d --- /dev/null +++ b/python/error_py.cpp @@ -0,0 +1,25 @@ +// Copyright (c) Microsoft Corporation. 
+// Licensed under the MIT license. + +#include +#include +#include + +#include + +namespace py = pybind11; + +#define REGISTER_ERROR_PY(_name) \ + py::register_exception(m, "_" #_name) + +void register_error(py::module &m) { + REGISTER_ERROR_PY(InternalError); + REGISTER_ERROR_PY(InvalidUsageError); + REGISTER_ERROR_PY(NotFoundError); + REGISTER_ERROR_PY(ModelError); + REGISTER_ERROR_PY(SchedulerError); + REGISTER_ERROR_PY(ExecutorError); + REGISTER_ERROR_PY(SystemError); + REGISTER_ERROR_PY(GpuError); + REGISTER_ERROR_PY(RuntimeError); +} diff --git a/python/unittest/test_error.py b/python/unittest/test_error.py new file mode 100644 index 000000000..c063c05c5 --- /dev/null +++ b/python/unittest/test_error.py @@ -0,0 +1,12 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import ark + + +def test_error(): + ark.init() + try: + ark.tensor([0]) + except Exception as e: + assert isinstance(e, ark.InvalidUsageError) From 59caff1eddb0a01c4f7bdf6e082b96d22e10ad6e Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Wed, 26 Jun 2024 23:25:35 +0000 Subject: [PATCH 16/61] Build python module by default --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index ee1e3566e..9ba2f2c55 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,6 +17,7 @@ option(USE_CUDA "Use NVIDIA/CUDA." OFF) option(USE_ROCM "Use AMD/ROCm." OFF) option(BYPASS_GPU_CHECK "Bypass GPU check." OFF) option(BUILD_TESTS "Build unit tests." ON) +option(BUILD_PYTHON "Build Python module." 
ON) if(BYPASS_GPU_CHECK) if(USE_CUDA) From efb2c78145cab0832971205911320264bbe74870 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Sat, 29 Jun 2024 03:51:19 +0000 Subject: [PATCH 17/61] revert --- ark/include/kernels/kernel_template.in | 1 + 1 file changed, 1 insertion(+) diff --git a/ark/include/kernels/kernel_template.in b/ark/include/kernels/kernel_template.in index 876e6a1b4..ea1862920 100644 --- a/ark/include/kernels/kernel_template.in +++ b/ark/include/kernels/kernel_template.in @@ -64,5 +64,6 @@ void @NAME@(char *_buf, int *_iter) { if (threadIdx.x == 0 && blockIdx.x == 0) { atomicStoreRelaxed(_iter, 0); } + sync_gpu<@NUM_BLOCKS@>(ARK_LOOP_SYNC_STATE); } } From 8975f9d4a0574f0421e79f6dd49e7443e7244606 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Sat, 29 Jun 2024 04:03:20 +0000 Subject: [PATCH 18/61] Do not use `sys.path` for importing `_ark_core` --- python/ark/__init__.py | 5 +---- python/ark/error.py | 18 +++++++++--------- python/ark/init.py | 2 +- python/ark/model.py | 2 +- python/ark/runtime.py | 2 +- python/ark/tensor.py | 2 +- 6 files changed, 14 insertions(+), 17 deletions(-) diff --git a/python/ark/__init__.py b/python/ark/__init__.py index 3d162c3e4..031afc7ba 100644 --- a/python/ark/__init__.py +++ b/python/ark/__init__.py @@ -7,9 +7,7 @@ if os.environ.get("ARK_ROOT", None) is None: os.environ["ARK_ROOT"] = os.path.abspath(os.path.dirname(__file__)) -sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) - -import _ark_core +from . import _ark_core from .model import Model @@ -102,4 +100,3 @@ def set_world_size(world_size): GpuError, RuntimeError, ) - diff --git a/python/ark/error.py b/python/ark/error.py index d3ac3aee8..40f7391ac 100644 --- a/python/ark/error.py +++ b/python/ark/error.py @@ -1,12 +1,12 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
-from _ark_core import _InternalError as InternalError -from _ark_core import _InvalidUsageError as InvalidUsageError -from _ark_core import _NotFoundError as NotFoundError -from _ark_core import _ModelError as ModelError -from _ark_core import _SchedulerError as SchedulerError -from _ark_core import _ExecutorError as ExecutorError -from _ark_core import _SystemError as SystemError -from _ark_core import _GpuError as GpuError -from _ark_core import _RuntimeError as RuntimeError +from ._ark_core import _InternalError as InternalError +from ._ark_core import _InvalidUsageError as InvalidUsageError +from ._ark_core import _NotFoundError as NotFoundError +from ._ark_core import _ModelError as ModelError +from ._ark_core import _SchedulerError as SchedulerError +from ._ark_core import _ExecutorError as ExecutorError +from ._ark_core import _SystemError as SystemError +from ._ark_core import _GpuError as GpuError +from ._ark_core import _RuntimeError as RuntimeError diff --git a/python/ark/init.py b/python/ark/init.py index dbf7c1569..32f530791 100644 --- a/python/ark/init.py +++ b/python/ark/init.py @@ -1,7 +1,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -import _ark_core +from . import _ark_core from .model import Model from .runtime import _RuntimeState diff --git a/python/ark/model.py b/python/ark/model.py index e6208fc16..87af88f49 100644 --- a/python/ark/model.py +++ b/python/ark/model.py @@ -2,7 +2,7 @@ # Licensed under the MIT license. 
from typing import NewType -from _ark_core import _Model +from ._ark_core import _Model _ModelState = NewType("_ModelState", None) diff --git a/python/ark/runtime.py b/python/ark/runtime.py index 798eaf9d5..efae6ab3c 100644 --- a/python/ark/runtime.py +++ b/python/ark/runtime.py @@ -5,7 +5,7 @@ from enum import Enum from typing import Callable, Dict, List -from _ark_core import _Executor, _DefaultPlanner +from ._ark_core import _Executor, _DefaultPlanner from .model import Model diff --git a/python/ark/tensor.py b/python/ark/tensor.py index eff1bf20e..ac2886960 100644 --- a/python/ark/tensor.py +++ b/python/ark/tensor.py @@ -4,7 +4,7 @@ import numpy as np from typing import Callable, List, Union, Type -from _ark_core import _Dims, _Tensor, _NullTensor +from ._ark_core import _Dims, _Tensor, _NullTensor from .data_type import DataType from .runtime import Runtime from .model import Model From 153837ba60497413d70c90fed945eaa037c84a29 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 2 Jul 2024 04:09:10 +0000 Subject: [PATCH 19/61] wip --- ark/api/executor.cpp | 51 +- ark/codegen.cpp | 3 +- ark/include/ark/executor.hpp | 7 +- ark/include/kernels/common/broadcast.h | 4 +- ark/model/model_json.cpp | 11 +- ark/model/model_json.hpp | 2 +- ark/model/model_op.cpp | 5 +- ark/ops/ops_all_reduce_test.cpp | 2 +- ark/ops/ops_arithmetic_test.cpp | 48 +- ark/ops/ops_embedding_test.cpp | 2 +- ark/ops/ops_matmul.cpp | 30 +- ark/ops/ops_test_common.cpp | 10 +- ark/ops/ops_test_common.hpp | 6 +- examples/llama/README.md | 4 +- examples/llama/model_test.py | 88 +- plan_gpu0.json | 2504 ++++++++++++++++++++++++ python/ark/__init__.py | 1 + python/ark/profiler.py | 30 + python/executor_py.cpp | 1 + 19 files changed, 2706 insertions(+), 103 deletions(-) create mode 100644 plan_gpu0.json create mode 100644 python/ark/profiler.py diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp index b052040ef..4af9df7c0 100644 --- a/ark/api/executor.cpp +++ b/ark/api/executor.cpp @@ -143,11 
+143,13 @@ static size_t tensor_stride_bytes(const Json &tensor) { class Executor::Impl { public: - Impl(int rank, int world_size, int gpu_id, const std::string &name, - const std::string &plan); + Impl(int rank, int world_size, int gpu_id, const std::string &name); ~Impl() = default; + void init(const std::string &plan); + int gpu_id() const { return gpu_id_; } + std::string plan() const { return plan_json_.dump_pretty(); } void compile(); void launch(int64_t max_spin_count); @@ -173,11 +175,13 @@ class Executor::Impl { const int rank_; const int world_size_; int gpu_id_; + std::string name_; bool is_launched_ = false; bool is_recording_ = false; float elapsed_msec_ = -1; + PlanJson plan_json_; std::map buffer_id_to_offset_; size_t total_bytes_; std::shared_ptr codegen_; @@ -199,8 +203,8 @@ class Executor::Impl { }; Executor::Impl::Impl(int rank, int world_size, int gpu_id, - const std::string &name, const std::string &plan) - : rank_(rank), world_size_(world_size), gpu_id_(gpu_id) { + const std::string &name) + : rank_(rank), world_size_(world_size), gpu_id_(gpu_id), name_(name) { if (rank < 0 || rank >= world_size) { ERR(InvalidUsageError, "Invalid rank ", rank, " with world size ", world_size); @@ -211,17 +215,18 @@ Executor::Impl::Impl(int rank, int world_size, int gpu_id, if (world_size_ > 1) { init_communicator(); } +} - Json plan_json; +void Executor::Impl::init(const std::string &plan) { auto &plan_path = get_env().enforce_plan_path; if (!plan_path.empty()) { LOG(INFO, "Enforce executor plan path: ", plan_path); - plan_json = Json::parse(read_file(plan_path)); + plan_json_ = Json::parse(read_file(plan_path)); } else { - plan_json = Json::parse(plan); + plan_json_ = Json::parse(plan); } - buffer_id_to_offset_ = init_buffers(plan_json); + buffer_id_to_offset_ = init_buffers(plan_json_); std::string buffer_id_to_offset_str; for (const auto &kv : buffer_id_to_offset_) { @@ -230,7 +235,7 @@ Executor::Impl::Impl(int rank, int world_size, int gpu_id, } codegen_ = 
- std::make_shared(plan_json, buffer_id_to_offset_, name); + std::make_shared(plan_json_, buffer_id_to_offset_, name_); auto gpu_manager = GpuManager::get_instance(gpu_id_); timer_begin_ = gpu_manager->create_event(); @@ -249,13 +254,13 @@ Executor::Impl::Impl(int rank, int world_size, int gpu_id, static_cast(gpu_manager->info().smem_block_total); if (world_size_ > 1) { - auto remote_ranks = init_remote_ranks(plan_json); + auto remote_ranks = init_remote_ranks(plan_json_); init_channels(remote_ranks); } kernel_ = std::shared_ptr(new GpuKernel( gpu_id_, codegen_->code(), {threads_per_block, 1, 1}, {num_sm, 1, 1}, - std::max(smem_block_total, size_t(4)), name, + std::max(smem_block_total, size_t(4)), name_, {std::pair{buffer_->ref(), sizeof(buffer_->ref())}, std::pair{flag, sizeof(flag)}})); } @@ -812,13 +817,18 @@ void Executor::Impl::tensor_write(const Tensor tensor, const void *data, Executor::Executor(int rank, int world_size, int gpu_id, const std::string &name, const std::string &plan) - : impl_(std::make_unique(rank, world_size, gpu_id, name, - plan)) {} + : impl_(std::make_unique(rank, world_size, gpu_id, name)) { + if (!plan.empty()) { + impl_->init(plan); + } +} Executor::~Executor() = default; int Executor::gpu_id() const { return impl_->gpu_id(); } +std::string Executor::plan() const { return impl_->plan(); } + void Executor::compile() { impl_->compile(); } void Executor::launch(int64_t max_spin_count) { impl_->launch(max_spin_count); } @@ -852,14 +862,17 @@ void Executor::tensor_write(const Tensor tensor, const void *data, size_t bytes, } DefaultExecutor::DefaultExecutor(const Model &model, int gpu_id, - const std::string &name) + const std::vector& config_rules, + const std::string& name) : Executor( model.rank(), model.world_size(), (gpu_id < 0) ? (model.rank() % get_env().num_ranks_per_host) : gpu_id, - name, - DefaultPlanner(model, (gpu_id < 0) ? 
(model.rank() % - get_env().num_ranks_per_host) - : gpu_id) - .plan()) {} + name, "") { + DefaultPlanner planner(model, impl_->gpu_id()); + for (const auto &rule : config_rules) { + planner.install_config_rule(rule); + } + impl_->init(planner.plan()); +} } // namespace ark diff --git a/ark/codegen.cpp b/ark/codegen.cpp index cd6206284..09ff28dd3 100644 --- a/ark/codegen.cpp +++ b/ark/codegen.cpp @@ -305,7 +305,8 @@ std::string CodeGenerator::Impl::resource_group( n_slots = total_warps / num_warps_per_task; } if (n_slots == 0) { - ERR(SchedulerError, "not enough resources for task group"); + ERR(SchedulerError, "not enough resources for task group: ", + tg.dump()); } size_t task_b = *task_range.begin(); diff --git a/ark/include/ark/executor.hpp b/ark/include/ark/executor.hpp index b8cdaf273..2473e1b14 100644 --- a/ark/include/ark/executor.hpp +++ b/ark/include/ark/executor.hpp @@ -5,6 +5,7 @@ #define ARK_EXECUTOR_HPP #include +#include #include #include #include @@ -24,6 +25,9 @@ class Executor { /// Return the GPU ID. int gpu_id() const; + /// Return the plan string. + std::string plan() const; + /// Compile the model. This must be called before `launch()`. void compile(); @@ -68,7 +72,7 @@ class Executor { void tensor_write(const Tensor tensor, const void *data, size_t bytes, bool is_d2d = false) const; - private: + protected: class Impl; std::unique_ptr impl_; }; @@ -78,6 +82,7 @@ class Model; class DefaultExecutor : public Executor { public: DefaultExecutor(const Model &model, int gpu_id = -1, + const std::vector& config_rules = {}, const std::string &name = "DefaultExecutor"); }; diff --git a/ark/include/kernels/common/broadcast.h b/ark/include/kernels/common/broadcast.h index 97b12e004..858938613 100644 --- a/ark/include/kernels/common/broadcast.h +++ b/ark/include/kernels/common/broadcast.h @@ -186,9 +186,9 @@ struct Broadcast2Intrinsic { (BroadcastInput0 && BroadcastInput1) ? OutNelemPerThread : BroadcastInput0 - ? math::gcd::value + ? 
math::gcd::value : BroadcastInput1 - ? math::gcd::value + ? math::gcd::value : math::gcd::value>::value; diff --git a/ark/model/model_json.cpp b/ark/model/model_json.cpp index 0057ef0aa..97ce71967 100644 --- a/ark/model/model_json.cpp +++ b/ark/model/model_json.cpp @@ -272,7 +272,16 @@ static void verify_format_plan(const Json &json) { } } -PlanJson::PlanJson(const Json &json) : Json(json) { verify_format_plan(*this); } +PlanJson::PlanJson(const Json &json) + : Json((json != nullptr) ? json + : Json{{"Rank", 0}, + {"WorldSize", 1}, + {"NumProcessors", 1}, + {"NumWarpsPerProcessor", 1}, + {"TaskInfos", Json::array()}, + {"ProcessorGroups", Json::array()}}) { + verify_format_plan(*this); +} static std::stringstream &dump_pretty_plan(const Json &json, std::stringstream &ss, int indent, diff --git a/ark/model/model_json.hpp b/ark/model/model_json.hpp index cf5fbbce2..e42640a9a 100644 --- a/ark/model/model_json.hpp +++ b/ark/model/model_json.hpp @@ -18,7 +18,7 @@ class ModelJson : public Json { class PlanJson : public Json { public: - PlanJson(const Json &json); + PlanJson(const Json &json = nullptr); std::string dump_pretty(int indent = 0, int indent_step = 2) const; }; diff --git a/ark/model/model_op.cpp b/ark/model/model_op.cpp index 6cdba5d02..b5a0645c8 100644 --- a/ark/model/model_op.cpp +++ b/ark/model/model_op.cpp @@ -202,8 +202,11 @@ std::shared_ptr ModelOp::deserialize(const Json &serialized) { } else if (!serialized.contains("Args")) { ERR(InvalidUsageError, "ModelOp deserialization failed: missing Args"); } + // Run `ModelOpT::from_name` before `construct()` to ensure all operators + // are registered. 
+ auto op_type = ModelOpT::from_name(serialized["Type"]); auto ret = model_op_factory()->construct(serialized["Type"]); - ret->type_ = ModelOpT::from_name(serialized["Type"]); + ret->type_ = op_type; ret->name_ = serialized["Name"]; ret->is_virtual_ = serialized["IsVirtual"]; for (const auto &t : serialized["ReadTensors"]) { diff --git a/ark/ops/ops_all_reduce_test.cpp b/ark/ops/ops_all_reduce_test.cpp index 9e2c6f675..54c6426fa 100644 --- a/ark/ops/ops_all_reduce_test.cpp +++ b/ark/ops/ops_all_reduce_test.cpp @@ -94,7 +94,7 @@ void test_all_reduce_internal(ark::DimType nelem) { auto result = ark::op_test("all_reduce", m, {ones}, {output}, baseline_all_reduce, - {ones_vec.data()}, false, gpu_id, NumGpus); + {ones_vec.data()}); UNITTEST_LOG(result); UNITTEST_EQ(result.max_diff[0], 0.0f); return ark::unittest::SUCCESS; diff --git a/ark/ops/ops_arithmetic_test.cpp b/ark/ops/ops_arithmetic_test.cpp index 3fdc5ac7e..c7c18b603 100644 --- a/ark/ops/ops_arithmetic_test.cpp +++ b/ark/ops/ops_arithmetic_test.cpp @@ -2,6 +2,7 @@ // Licensed under the MIT license. 
#include "ops_test_common.hpp" +#include "model/model_json.hpp" template void baseline_add(std::vector &outputs, @@ -142,12 +143,25 @@ ark::unittest::State test_add_fp32() { ark::unittest::State test_add_fp16() { ark::Model m; - ark::Tensor t0 = m.tensor({8192}, ark::FP16); - ark::Tensor t1 = m.tensor({8192}, ark::FP16); + ark::Tensor t0 = m.tensor({32, 2048, 2048}, ark::FP16); + ark::Tensor t1 = m.tensor({32, 2048, 2048}, ark::FP16); ark::Tensor out = m.add(t0, t1); auto result = - ark::op_test("add_fp16", m, {t0, t1}, {out}, baseline_add); + ark::op_test("add_fp16", m, {t0, t1}, {out}, baseline_add, {}, + { + ark::DefaultPlanner::ConfigRule([](const std::string op_str, const std::string) { + auto op = ark::Json::parse(op_str); + ark::Json config; + if (op.at("Type") == "Add") { + config["NumWarps"] = 4; + config["SramBytes"] = 0; + config["Tile"] = {128, 256}; + config["NumTasks"] = 4096; + } + return config.dump(); + }) + }); UNITTEST_LOG(result); UNITTEST_EQ(result.max_diff[0], 0.0f); return ark::unittest::SUCCESS; @@ -416,20 +430,20 @@ ark::unittest::State test_div_invalid() { int main() { ark::init(); - UNITTEST(test_add_fp32); + // UNITTEST(test_add_fp32); UNITTEST(test_add_fp16); - UNITTEST(test_add_bf16); - UNITTEST(test_add_overwrite); - UNITTEST(test_add_broadcast); - UNITTEST(test_add_invalid); - UNITTEST(test_sub_fp32); - UNITTEST(test_sub_invalid); - UNITTEST(test_mul_fp32); - UNITTEST(test_mul_fp16); - UNITTEST(test_mul_overwrite); - UNITTEST(test_mul_broadcast); - UNITTEST(test_mul_invalid); - UNITTEST(test_div_fp32); - UNITTEST(test_div_invalid); + // UNITTEST(test_add_bf16); + // UNITTEST(test_add_overwrite); + // UNITTEST(test_add_broadcast); + // UNITTEST(test_add_invalid); + // UNITTEST(test_sub_fp32); + // UNITTEST(test_sub_invalid); + // UNITTEST(test_mul_fp32); + // UNITTEST(test_mul_fp16); + // UNITTEST(test_mul_overwrite); + // UNITTEST(test_mul_broadcast); + // UNITTEST(test_mul_invalid); + // UNITTEST(test_div_fp32); + // 
UNITTEST(test_div_invalid); return ark::unittest::SUCCESS; } diff --git a/ark/ops/ops_embedding_test.cpp b/ark/ops/ops_embedding_test.cpp index 822973106..4f9df046a 100644 --- a/ark/ops/ops_embedding_test.cpp +++ b/ark/ops/ops_embedding_test.cpp @@ -80,7 +80,7 @@ ark::unittest::State test_embedding() { } auto result = ark::op_test("embedding_" + type_str, m, {ti, tw}, {to}, baseline_embedding, - {ti_data.data(), tw_data.data()}, true); + {ti_data.data(), tw_data.data()}); UNITTEST_LOG(result); UNITTEST_EQ(result.max_diff[0], 0.0f); return ark::unittest::SUCCESS; diff --git a/ark/ops/ops_matmul.cpp b/ark/ops/ops_matmul.cpp index b259f99c8..b4553a4ed 100644 --- a/ark/ops/ops_matmul.cpp +++ b/ark/ops/ops_matmul.cpp @@ -189,45 +189,55 @@ std::vector ModelOpMatmul::impl_args([ } static const Json get_default_config(const ArchRef arch, - const ModelDataType &data_type) { + const ModelDataType &data_type, + const Dims &mnk) { + if (data_type != FP32.ref() && data_type != FP16.ref() && + data_type != BF16.ref()) { + ERR(InvalidUsageError, + "Unsupported data type: ", data_type->type_name()); + } + if (!arch->belongs_to(ARCH_CUDA) && !arch->belongs_to(ARCH_ROCM)) { + ERR(InvalidUsageError, "Unsupported architecture: ", arch->name()); + } + DimType tm = (mnk[0] > mnk[1]) ? 256 : 128; + DimType tn = (mnk[0] > mnk[1]) ? 
128 : 256; if (arch->belongs_to(ARCH_CUDA_80) && data_type == FP32.ref()) { return {{"NumWarps", 8}, {"SramBytes", 147456}, - {"TileShapeMNK", {128, 256, 32}}}; + {"TileShapeMNK", {tm, tn, 32}}}; } else if (arch->belongs_to(ARCH_CUDA_80) && data_type == FP16.ref()) { return {{"NumWarps", 8}, {"SramBytes", 147456}, - {"TileShapeMNK", {128, 256, 64}}}; + {"TileShapeMNK", {tm, tn, 64}}}; } else if (arch->belongs_to(ARCH_CUDA_80) && data_type == BF16.ref()) { return {{"NumWarps", 8}, {"SramBytes", 147456}, - {"TileShapeMNK", {128, 256, 64}}}; + {"TileShapeMNK", {tm, tn, 64}}}; } else if (arch->belongs_to(ARCH_ROCM_942) && data_type == FP32.ref()) { return {{"NumWarps", 4}, {"SramBytes", 24672}, - {"TileShapeMNK", {128, 256, 16}}}; + {"TileShapeMNK", {tm, tn, 16}}}; } else if (arch->belongs_to(ARCH_ROCM_942) && data_type == FP16.ref()) { return {{"NumWarps", 4}, {"SramBytes", 24672}, - {"TileShapeMNK", {128, 256, 32}}}; + {"TileShapeMNK", {tm, tn, 32}}}; } else if (arch->belongs_to(ARCH_ROCM_942) && data_type == BF16.ref()) { return {{"NumWarps", 4}, {"SramBytes", 24672}, - {"TileShapeMNK", {128, 256, 32}}}; + {"TileShapeMNK", {tm, tn, 32}}}; } - ERR(InvalidUsageError, "Unsupported arch and data type: ", arch->name(), - " and ", data_type->type_name()); + ERR(InternalError, "Unexpected error"); return {}; } Json ModelOpMatmul::default_config(const ArchRef arch) const { auto result = result_tensors_[0]; - Json config = get_default_config(arch, result->data_type()); check_fields_args(args_, {"TransposeInput", "TransposeOther"}); Dims mnk = calc_problem_size(read_tensors_[0]->padded_shape(), read_tensors_[1]->padded_shape(), args_.at("TransposeInput").value(), args_.at("TransposeOther").value()); + Json config = get_default_config(arch, result->data_type(), mnk); size_t tile_x = config.at("TileShapeMNK")[0]; size_t tile_y = config.at("TileShapeMNK")[1]; if (mnk[0] % tile_x != 0 || mnk[1] % tile_y != 0) { diff --git a/ark/ops/ops_test_common.cpp 
b/ark/ops/ops_test_common.cpp index 50317fba7..ad2c208b6 100644 --- a/ark/ops/ops_test_common.cpp +++ b/ark/ops/ops_test_common.cpp @@ -36,8 +36,9 @@ OpsTestResult op_test(const std::string &test_name_prefix, const Model &model, const std::vector &outputs, OpsTestBaseline baseline, const std::vector &inputs_data, - bool print_on_error, int rank, int world_size) { - DefaultExecutor exe(model); + const std::vector& config_rules, + bool print_on_error) { + DefaultExecutor exe(model, -1, config_rules); exe.compile(); std::vector>> inputs_data_storages; @@ -133,7 +134,7 @@ OpsTestResult op_test(const std::string &test_name_prefix, const Model &model, for (auto t : gt) { gt_ptrs.push_back(t->data()); } - baseline(gt_ptrs, output_shapes, inputs_data_refs, input_shapes, rank); + baseline(gt_ptrs, output_shapes, inputs_data_refs, input_shapes, model.rank()); std::stringstream test_name; test_name << test_name_prefix; @@ -147,6 +148,7 @@ OpsTestResult op_test(const std::string &test_name_prefix, const Model &model, OpsTestResult result; result.test_name = test_name.str(); + result.plan = exe.plan(); // Compare results with the ground truth. for (size_t i = 0; i < outputs.size(); i++) { @@ -187,7 +189,7 @@ OpsTestResult op_test(const std::string &test_name_prefix, const Model &model, GLOG(gpuDeviceSynchronize()); // Throughput test. - if (world_size > 1) { + if (model.world_size() > 1) { // For multi-GPU, we need to make sure that all GPUs run the same // number of iterations. Rather than doing allgather, we just // use a magic number here. 
diff --git a/ark/ops/ops_test_common.hpp b/ark/ops/ops_test_common.hpp index 01e97dbb1..a32d9b748 100644 --- a/ark/ops/ops_test_common.hpp +++ b/ark/ops/ops_test_common.hpp @@ -10,6 +10,7 @@ #include "ark/model.hpp" #include "ark/model_ref.hpp" +#include "ark/planner.hpp" #include "ark/random.hpp" #include "bfloat16.h" #include "half.h" @@ -133,6 +134,7 @@ TensorCompareResult tensor_compare(T *ground_truth, T *res, Dims shape, struct OpsTestResult { std::string test_name; + std::string plan; int iter; float msec_per_iter; std::vector mse; @@ -170,8 +172,8 @@ OpsTestResult op_test(const std::string &test_name_prefix, const Model &model, const std::vector &outputs, OpsTestBaseline baseline, const std::vector &inputs_data = {}, - bool print_on_error = false, int rank = 0, - int world_size = 1); + const std::vector& config_rules = {}, + bool print_on_error = false); OpsTestGpuMem to_gpu(void *host_ptr, size_t size); diff --git a/examples/llama/README.md b/examples/llama/README.md index 090dd1de3..1fe040ae0 100644 --- a/examples/llama/README.md +++ b/examples/llama/README.md @@ -29,10 +29,10 @@ Llama2 examples over ARK. 4. Download Llama2 model weights and tokenizer weights. * The model and tokenizer should be compatible with the [official PyTorch implementation](https://github.com/facebookresearch/llama/blob/main/llama). -5. Run the model accuracy test. `--pth_path` is the path to the model weights file (`consolidated.00.pth`). +5. Run the model accuracy test. `--ckpt_dir` is the directory where the model weight files are at (e.g., `consolidated.00.pth`). ```bash - python3 model_test.py --pth_path=/path/to/model/weights.pth + python3 model_test.py --ckpt_dir=/directory/of/model/weights ``` 6. Test text generation. `--pth_path` is the path to the model weights file (`consolidated.00.pth`), `--tok_path` is the path to the tokenizer weights file (`tokenizer.model`), and `--params_path` is the path to the model parameters (`params.json`). 
diff --git a/examples/llama/model_test.py b/examples/llama/model_test.py index 737d3ec8b..585341640 100644 --- a/examples/llama/model_test.py +++ b/examples/llama/model_test.py @@ -58,30 +58,34 @@ def run_ark( ] output = module(*module_inputs) - runtime = ark.Runtime() - # Prefer num_warps_per_sm = 16 for nvidia and 8 for amd - runtime.launch(num_warps_per_sm=8) + with ark.Runtime() as rt: + rt.launch(plan_path="/mnt/changhohwang/ark/plan_gpu0.json") - # Load model parameters - if state_dict: - module.load_state_dict(state_dict) + # Load model parameters + if state_dict: + print("Loading state_dict") + module.load_state_dict(state_dict) + print("Loading state_dict done") - # Load input data into tensors - tensors = [i for i in module_inputs if isinstance(i, ark.Tensor)] - tensor_data = [i for i in inputs if isinstance(i, np.ndarray)] - for tensor, ndarray in zip(tensors, tensor_data): - tensor.from_numpy(ndarray) + # Load input data into tensors + tensors = [i for i in module_inputs if isinstance(i, ark.Tensor)] + tensor_data = [i for i in inputs if isinstance(i, np.ndarray)] + for tensor, ndarray in zip(tensors, tensor_data): + tensor.from_numpy(ndarray) - start_time = time.time() + start_time = time.time() - # Run the model - runtime.run(iter=iterations) + # Run the model + print("Run:", iterations) - end_time = time.time() + rt.run(iter=iterations) + print("Run done") - if isinstance(output, list) or isinstance(output, tuple): - outputs = [o.to_numpy() for o in output] - outputs = [output.to_numpy()] + end_time = time.time() + + if isinstance(output, list) or isinstance(output, tuple): + outputs = [o.to_numpy() for o in output] + outputs = [output.to_numpy()] return RunResults(outputs=outputs, runtime=end_time - start_time) @@ -160,7 +164,9 @@ def test_module( else: prefix = module_name_prefix + "." 
if module_name_prefix else "" # Load the state_dict from the given path + print("Loading ckpt:", ckpt_path) state_dict_pt = torch.load(ckpt_path) + print("Loading ckpt done") state_dict_pt = { k[len(prefix) :]: v for k, v in state_dict_pt.items() @@ -182,6 +188,7 @@ def test_module( rank=rank, world_size=world_size, ) + print("Run ARK done") if not test_thru_ark_only: # PyTorch module @@ -195,6 +202,7 @@ def test_module( inputs_pt, iterations=test_thru_iterations if test_thru else 1, ) + print("Run PyTorch done") if test_thru: print( @@ -447,26 +455,26 @@ def test_transformer_block( ) output = module(feature_tensor, 0, freqs_cis_ark_tensor, None) - ark.Model.get_model().create_nodes() - print(ark.Model.get_model().serialize()) - - # test_module( - # module_class_ark=model_ark.TransformerBlock, - # module_args_ark=[ - # 0, - # args, - # ark.DataType.from_numpy(dtype), - # rank, - # world_size, - # ], - # inputs_ark=[feature, 0, freqs_cis_ark, None], - # module_class_pt=model_pt.TransformerBlock, - # module_args_pt=[0, args], - # inputs_pt=[feature.astype(dtype), 0, freqs_cis, None], - # module_name_prefix="layers.0", - # rank=rank, - # world_size=world_size, - # ) + # print(ark.Model.get_model().serialize()) + + test_module( + module_class_ark=model_ark.TransformerBlock, + module_args_ark=[ + 0, + args, + ark.DataType.from_numpy(dtype), + rank, + world_size, + ], + inputs_ark=[feature, 0, freqs_cis_ark, None], + module_class_pt=model_pt.TransformerBlock, + module_args_pt=[0, args], + inputs_pt=[feature.astype(dtype), 0, freqs_cis, None], + module_name_prefix="layers.0", + rank=rank, + world_size=world_size, + test_thru=True, + ) def test_transformer( @@ -570,7 +578,7 @@ def worker( # Configurations args = ModelArgs7B() batch_size = 1 - seq_len = 512 + seq_len = 2048 dtype = np.float16 world_size = ngpus @@ -578,7 +586,7 @@ def worker( args.vocab_size = 32000 # Reduce max_seq_len due to OOM from the PyTorch model - args.max_seq_len = 512 + args.max_seq_len = 2048 # 
Verify the configurations assert batch_size <= args.max_batch_size diff --git a/plan_gpu0.json b/plan_gpu0.json new file mode 100644 index 000000000..49b6bdd98 --- /dev/null +++ b/plan_gpu0.json @@ -0,0 +1,2504 @@ +{ + "Rank": 0, + "WorldSize": 1, + "NumProcessors": 304, + "NumWarpsPerProcessor": 4, + "TaskInfos": [ + { + "Id": 0, + "NumWarps": 4, + "SramBytes": 24672, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul", + "IsVirtual": false, + "ReadTensors": [ + {"Id":4,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":0,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":6,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":7,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "TransposeInput": {"BOOL":false}, + "TransposeOther": {"BOOL":true} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [128,256,32], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 1, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Rope", + "Name": "rope", + "IsVirtual": false, + "ReadTensors": [ + {"Id":12,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":5,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + 
{"Id":15,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":16,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [32,128], + "NumTasks": 2048 + } + } + ] + }, + { + "Id": 2, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Transpose", + "Name": "transpose", + "IsVirtual": false, + "ReadTensors": [ + {"Id":16,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":19,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":20,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Permutation": {"DIMS":[0,2,1,3]} + }, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [8,128], + "NumTasks": 8192 + } + } + ] + }, + { + "Id": 3, + "NumWarps": 4, + "SramBytes": 24672, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":4,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":1,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + 
{"Id":8,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":9,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "TransposeInput": {"BOOL":false}, + "TransposeOther": {"BOOL":true} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [128,256,32], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 4, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Rope", + "Name": "rope_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":13,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":5,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":17,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":18,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [32,128], + "NumTasks": 2048 + } + } + ] + }, + { + "Id": 5, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Transpose", + "Name": "transpose_2", + "IsVirtual": false, + "ReadTensors": [ + {"Id":18,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + 
"WriteTensors": [ + {"Id":23,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":24,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Permutation": {"DIMS":[0,2,3,1]} + }, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [8,128], + "NumTasks": 8192 + } + } + ] + }, + { + "Id": 6, + "NumWarps": 4, + "SramBytes": 24672, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_2", + "IsVirtual": false, + "ReadTensors": [ + {"Id":4,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":2,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":10,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":11,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "TransposeInput": {"BOOL":false}, + "TransposeOther": {"BOOL":true} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [128,256,32], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 7, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Transpose", + "Name": "transpose_1", + "IsVirtual": false, + "ReadTensors": [ + 
{"Id":14,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":21,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":22,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Permutation": {"DIMS":[0,2,1,3]} + }, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [8,128], + "NumTasks": 8192 + } + } + ] + }, + { + "Id": 8, + "NumWarps": 4, + "SramBytes": 24672, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_3", + "IsVirtual": false, + "ReadTensors": [ + {"Id":20,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":24,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":25,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":26,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "TransposeInput": {"BOOL":false}, + "TransposeOther": {"BOOL":false} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [128,256,32], + "NumTasks": 4096 + } + } + ] + }, + { + "Id": 9, + "NumWarps": 4, + "SramBytes": 0, + 
"Ops": [ + { + "Type": "ScalarMul", + "Name": "mul", + "IsVirtual": false, + "ReadTensors": [ + {"Id":26,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":27,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":28,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Factor": {"FLOAT":0.0883883461356163} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 4096 + } + } + ] + }, + { + "Id": 10, + "NumWarps": 1, + "SramBytes": 256, + "Ops": [ + { + "Type": "ReduceMax", + "Name": "reduce_max", + "IsVirtual": false, + "ReadTensors": [ + {"Id":28,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":29,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":30,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Axis": {"INT":3}, + "KeepDim": {"BOOL":true} + }, + "Config": { + "NumWarps": 1, + "ImplType": "WarpWise", + "SramBytes": 256, + "NumTasks": 65536 + } + } + ] + }, + { + "Id": 11, + "NumWarps": 4, + "SramBytes": 0, + "Ops": [ + { + "Type": "Sub", + "Name": "sub", + "IsVirtual": false, + "ReadTensors": [ + 
{"Id":28,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":30,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":28,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":31,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 4, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 4096 + } + } + ] + }, + { + "Id": 12, + "NumWarps": 4, + "SramBytes": 0, + "Ops": [ + { + "Type": "Exp", + "Name": "exp", + "IsVirtual": false, + "ReadTensors": [ + {"Id":31,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":31,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":32,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 4, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 4096 + } + } + ] + }, + { + "Id": 13, + "NumWarps": 1, + "SramBytes": 256, + "Ops": [ + { + "Type": "ReduceSum", + "Name": "reduce_sum", + "IsVirtual": false, + "ReadTensors": [ + 
{"Id":32,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":33,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":34,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Axis": {"INT":3}, + "KeepDim": {"BOOL":true} + }, + "Config": { + "NumWarps": 1, + "ImplType": "WarpWise", + "SramBytes": 256, + "NumTasks": 65536 + } + } + ] + }, + { + "Id": 14, + "NumWarps": 4, + "SramBytes": 0, + "Ops": [ + { + "Type": "Div", + "Name": "div", + "IsVirtual": false, + "ReadTensors": [ + {"Id":32,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":34,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":32,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":35,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 4, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 4096 + } + } + ] + }, + { + "Id": 15, + "NumWarps": 4, + "SramBytes": 24672, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_4", + "IsVirtual": false, + 
"ReadTensors": [ + {"Id":35,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":22,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":36,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":37,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "TransposeInput": {"BOOL":false}, + "TransposeOther": {"BOOL":false} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [256,128,32], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 16, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Transpose", + "Name": "transpose_3", + "IsVirtual": false, + "ReadTensors": [ + {"Id":37,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":38,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":39,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Permutation": {"DIMS":[0,2,1,3]} + }, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [8,128], + "NumTasks": 8192 + } + } + ] + }, + { + "Id": 17, + "NumWarps": 4, + 
"SramBytes": 24672, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_5", + "IsVirtual": false, + "ReadTensors": [ + {"Id":40,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":3,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":41,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":20,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":42,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":20,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "TransposeInput": {"BOOL":false}, + "TransposeOther": {"BOOL":true} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [128,256,32], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 18, + "NumWarps": 4, + "SramBytes": 0, + "Ops": [ + { + "Type": "Cast", + "Name": "cast", + "IsVirtual": false, + "ReadTensors": [ + {"Id":52,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":30,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":54,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":55,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 4, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 19, + "NumWarps": 4, + "SramBytes": 0, + 
"Ops": [ + { + "Type": "Mul", + "Name": "mul_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":55,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":55,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":56,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":57,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 4, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 20, + "NumWarps": 1, + "SramBytes": 256, + "Ops": [ + { + "Type": "ReduceMean", + "Name": "reduce_mean", + "IsVirtual": false, + "ReadTensors": [ + {"Id":57,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":58,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":59,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Axis": {"INT":2}, + "KeepDim": {"BOOL":true} + }, + "Config": { + "NumWarps": 1, + "ImplType": "WarpWise", + "SramBytes": 256, + "NumTasks": 2048 + } + } + ] + }, + { + "Id": 21, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Rsqrt", + "Name": "rsqrt", + 
"IsVirtual": false, + "ReadTensors": [ + {"Id":59,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":60,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":61,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [64,1], + "NumTasks": 32 + } + } + ] + }, + { + "Id": 22, + "NumWarps": 4, + "SramBytes": 0, + "Ops": [ + { + "Type": "Mul", + "Name": "mul_2", + "IsVirtual": false, + "ReadTensors": [ + {"Id":55,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":61,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":62,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":63,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 4, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 23, + "NumWarps": 4, + "SramBytes": 0, + "Ops": [ + { + "Type": "Mul", + "Name": "mul_3", + "IsVirtual": false, + "ReadTensors": [ + 
{"Id":63,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":50,"DataType":"FP32","Shape":[1,1,4096],"Strides":[1,1,4096],"Offsets":[0,0,0],"PaddedShape":[1,1,4096],"Buffer":{"Id":28,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":63,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":64,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 4, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 24, + "NumWarps": 4, + "SramBytes": 0, + "Ops": [ + { + "Type": "Cast", + "Name": "cast_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":64,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":65,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":66,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 4, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 25, + "NumWarps": 4, + "SramBytes": 24672, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_6", + "IsVirtual": false, + "ReadTensors": [ + 
{"Id":66,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":43,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":21,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":67,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":68,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "TransposeInput": {"BOOL":false}, + "TransposeOther": {"BOOL":true} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [128,256,32], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 26, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Rope", + "Name": "rope_2", + "IsVirtual": false, + "ReadTensors": [ + {"Id":73,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":53,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":31,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":76,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":77,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [1,64], + 
"NumTasks": 131072 + } + } + ] + }, + { + "Id": 27, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Transpose", + "Name": "transpose_4", + "IsVirtual": false, + "ReadTensors": [ + {"Id":77,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":80,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":81,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Permutation": {"DIMS":[0,2,1,3]} + }, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [8,128], + "NumTasks": 8192 + } + } + ] + }, + { + "Id": 28, + "NumWarps": 4, + "SramBytes": 24672, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_7", + "IsVirtual": false, + "ReadTensors": [ + {"Id":66,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":44,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":22,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":69,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":70,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "TransposeInput": {"BOOL":false}, + "TransposeOther": {"BOOL":true} + }, + "Config": { + "NumWarps": 
4, + "SramBytes": 24672, + "TileShapeMNK": [128,256,32], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 29, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Rope", + "Name": "rope_3", + "IsVirtual": false, + "ReadTensors": [ + {"Id":74,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":53,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":31,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":78,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":79,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [1,64], + "NumTasks": 131072 + } + } + ] + }, + { + "Id": 30, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Transpose", + "Name": "transpose_6", + "IsVirtual": false, + "ReadTensors": [ + {"Id":79,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":84,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":45,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":85,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":45,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Permutation": {"DIMS":[0,2,3,1]} + }, + "Config": { + 
"NumWarps": 1, + "SramBytes": 0, + "Tile": [8,8], + "NumTasks": 131072 + } + } + ] + }, + { + "Id": 31, + "NumWarps": 4, + "SramBytes": 24672, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_8", + "IsVirtual": false, + "ReadTensors": [ + {"Id":66,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":45,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":23,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":71,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":72,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "TransposeInput": {"BOOL":false}, + "TransposeOther": {"BOOL":true} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [128,256,32], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 32, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Transpose", + "Name": "transpose_5", + "IsVirtual": false, + "ReadTensors": [ + {"Id":75,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":82,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":83,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + 
"Permutation": {"DIMS":[0,2,1,3]} + }, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [8,128], + "NumTasks": 8192 + } + } + ] + }, + { + "Id": 33, + "NumWarps": 4, + "SramBytes": 24672, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_9", + "IsVirtual": false, + "ReadTensors": [ + {"Id":81,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":85,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":45,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":86,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":46,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":87,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":46,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "TransposeInput": {"BOOL":false}, + "TransposeOther": {"BOOL":false} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [128,256,32], + "NumTasks": 4096 + } + } + ] + }, + { + "Id": 34, + "NumWarps": 4, + "SramBytes": 0, + "Ops": [ + { + "Type": "ScalarMul", + "Name": "mul_4", + "IsVirtual": false, + "ReadTensors": [ + {"Id":87,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":46,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":88,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + 
{"Id":89,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Factor": {"FLOAT":0.0883883461356163} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 4096 + } + } + ] + }, + { + "Id": 35, + "NumWarps": 1, + "SramBytes": 256, + "Ops": [ + { + "Type": "ReduceMax", + "Name": "reduce_max_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":89,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":90,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":48,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":91,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":48,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Axis": {"INT":3}, + "KeepDim": {"BOOL":true} + }, + "Config": { + "NumWarps": 1, + "ImplType": "WarpWise", + "SramBytes": 256, + "NumTasks": 65536 + } + } + ] + }, + { + "Id": 36, + "NumWarps": 4, + "SramBytes": 0, + "Ops": [ + { + "Type": "Sub", + "Name": "sub_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":89,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":91,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":48,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + 
{"Id":89,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":92,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 4, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 4096 + } + } + ] + }, + { + "Id": 37, + "NumWarps": 4, + "SramBytes": 0, + "Ops": [ + { + "Type": "Exp", + "Name": "exp_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":92,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":92,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":93,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 4, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 4096 + } + } + ] + }, + { + "Id": 38, + "NumWarps": 1, + "SramBytes": 256, + "Ops": [ + { + "Type": "ReduceSum", + "Name": "reduce_sum_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":93,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":94,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":49,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + 
{"Id":95,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":49,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Axis": {"INT":3}, + "KeepDim": {"BOOL":true} + }, + "Config": { + "NumWarps": 1, + "ImplType": "WarpWise", + "SramBytes": 256, + "NumTasks": 65536 + } + } + ] + }, + { + "Id": 39, + "NumWarps": 4, + "SramBytes": 0, + "Ops": [ + { + "Type": "Div", + "Name": "div_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":93,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":95,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":49,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":93,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":96,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 4, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 4096 + } + } + ] + }, + { + "Id": 40, + "NumWarps": 4, + "SramBytes": 24672, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_10", + "IsVirtual": false, + "ReadTensors": [ + {"Id":96,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":83,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + 
{"Id":97,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":50,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":98,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":50,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "TransposeInput": {"BOOL":false}, + "TransposeOther": {"BOOL":false} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [256,128,32], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 41, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Transpose", + "Name": "transpose_7", + "IsVirtual": false, + "ReadTensors": [ + {"Id":98,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":50,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":99,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":51,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":100,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":51,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Permutation": {"DIMS":[0,2,1,3]} + }, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [8,128], + "NumTasks": 8192 + } + } + ] + }, + { + "Id": 42, + "NumWarps": 4, + "SramBytes": 24672, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_11", + "IsVirtual": false, + "ReadTensors": [ + {"Id":101,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":51,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + 
{"Id":46,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":102,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":52,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":103,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":52,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "TransposeInput": {"BOOL":false}, + "TransposeOther": {"BOOL":true} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [128,256,32], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 43, + "NumWarps": 4, + "SramBytes": 0, + "Ops": [ + { + "Type": "Add", + "Name": "add", + "IsVirtual": false, + "ReadTensors": [ + {"Id":52,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":30,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":103,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":52,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":104,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":53,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":105,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":53,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 4, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 44, + "NumWarps": 4, + "SramBytes": 0, + "Ops": [ + { + "Type": "Cast", + "Name": "cast_2", + "IsVirtual": false, + "ReadTensors": [ + 
{"Id":105,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":53,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":106,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":107,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 4, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 45, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Mul", + "Name": "mul_5", + "IsVirtual": false, + "ReadTensors": [ + {"Id":107,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":107,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":108,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":55,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":109,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":55,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 46, + "NumWarps": 1, + "SramBytes": 256, + "Ops": [ + { + "Type": "ReduceMean", + "Name": "reduce_mean_1", + "IsVirtual": false, + "ReadTensors": [ + 
{"Id":109,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":55,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":110,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":56,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":111,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":56,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Axis": {"INT":2}, + "KeepDim": {"BOOL":true} + }, + "Config": { + "NumWarps": 1, + "ImplType": "WarpWise", + "SramBytes": 256, + "NumTasks": 2048 + } + } + ] + }, + { + "Id": 47, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Rsqrt", + "Name": "rsqrt_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":111,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":56,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":112,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":57,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":113,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":57,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [64,1], + "NumTasks": 32 + } + } + ] + }, + { + "Id": 48, + "NumWarps": 4, + "SramBytes": 0, + "Ops": [ + { + "Type": "Mul", + "Name": "mul_6", + "IsVirtual": false, + "ReadTensors": [ + {"Id":107,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + 
{"Id":113,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":57,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":114,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":115,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 4, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 49, + "NumWarps": 4, + "SramBytes": 0, + "Ops": [ + { + "Type": "Mul", + "Name": "mul_7", + "IsVirtual": false, + "ReadTensors": [ + {"Id":115,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":51,"DataType":"FP32","Shape":[1,1,4096],"Strides":[1,1,4096],"Offsets":[0,0,0],"PaddedShape":[1,1,4096],"Buffer":{"Id":29,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":115,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":116,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 4, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 50, + "NumWarps": 4, + "SramBytes": 0, + "Ops": [ + { + "Type": "Cast", + "Name": "cast_3", + "IsVirtual": false, + "ReadTensors": [ + 
{"Id":116,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":117,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":59,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":118,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":59,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 4, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 51, + "NumWarps": 4, + "SramBytes": 24672, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_12", + "IsVirtual": false, + "ReadTensors": [ + {"Id":118,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":59,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":47,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":25,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":119,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":60,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":120,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":60,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "TransposeInput": {"BOOL":false}, + "TransposeOther": {"BOOL":true} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [128,256,32], + "NumTasks": 688 + } + } + ] + }, + { + "Id": 52, + "NumWarps": 4, + "SramBytes": 0, + "Ops": [ + { + "Type": "Sigmoid", + "Name": "sigmoid", + "IsVirtual": false, + "ReadTensors": [ 
+ {"Id":120,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":60,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":121,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":61,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":122,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":61,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 4, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 688 + } + } + ] + }, + { + "Id": 53, + "NumWarps": 4, + "SramBytes": 0, + "Ops": [ + { + "Type": "Mul", + "Name": "mul_8", + "IsVirtual": false, + "ReadTensors": [ + {"Id":120,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":60,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":122,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":61,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":123,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":62,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":124,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":62,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 4, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 688 + } + } + ] + }, + { + "Id": 54, + "NumWarps": 4, + "SramBytes": 24672, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_13", + "IsVirtual": false, + "ReadTensors": [ + 
{"Id":118,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":59,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":49,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":27,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":125,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":63,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":126,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":63,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "TransposeInput": {"BOOL":false}, + "TransposeOther": {"BOOL":true} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [128,256,32], + "NumTasks": 688 + } + } + ] + }, + { + "Id": 55, + "NumWarps": 4, + "SramBytes": 0, + "Ops": [ + { + "Type": "Mul", + "Name": "mul_9", + "IsVirtual": false, + "ReadTensors": [ + {"Id":124,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":62,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":126,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":63,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":127,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":64,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":128,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":64,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 4, + "SramBytes": 0, + "Tile": [128,256], + 
"NumTasks": 688 + } + } + ] + }, + { + "Id": 56, + "NumWarps": 4, + "SramBytes": 24672, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_14", + "IsVirtual": false, + "ReadTensors": [ + {"Id":128,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":64,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":48,"DataType":"FP16","Shape":[4096,11008],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,11008],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":129,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":65,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":130,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":65,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "TransposeInput": {"BOOL":false}, + "TransposeOther": {"BOOL":true} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [128,256,32], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 57, + "NumWarps": 4, + "SramBytes": 0, + "Ops": [ + { + "Type": "Add", + "Name": "add_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":105,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":53,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":130,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":65,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":131,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":66,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + 
{"Id":132,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":66,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 4, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 256 + } + } + ] + } + ], + "ProcessorGroups": [ + { + "ProcessorRange": [0,256], + "ResourceGroups": [ + { + "ProcessorRange": [0,256], + "WarpRange": [0,4], + "SramRange": [0,24672], + "TaskGroups": [ + {"TaskId":0,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":1,"TaskRange":[0,2048],"Granularity":4} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":2,"TaskRange":[0,8192],"Granularity":4} + ] + } + ] + }, + { + "ProcessorRange": [0,256], + "ResourceGroups": [ + { + "ProcessorRange": [0,256], + "WarpRange": [0,4], + "SramRange": [0,24672], + "TaskGroups": [ + {"TaskId":3,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":4,"TaskRange":[0,2048],"Granularity":4} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":5,"TaskRange":[0,8192],"Granularity":4} + ] + } + ] + }, + { + "ProcessorRange": [0,256], + "ResourceGroups": [ + { + "ProcessorRange": [0,256], + "WarpRange": [0,4], + "SramRange": [0,24672], + "TaskGroups": [ + {"TaskId":6,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": 
[0,0], + "TaskGroups": [ + {"TaskId":7,"TaskRange":[0,8192],"Granularity":4} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,24672], + "TaskGroups": [ + {"TaskId":8,"TaskRange":[0,4096],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":9,"TaskRange":[0,4096],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,256], + "TaskGroups": [ + {"TaskId":10,"TaskRange":[0,65536],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":11,"TaskRange":[0,4096],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":12,"TaskRange":[0,4096],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,256], + "TaskGroups": [ + {"TaskId":13,"TaskRange":[0,65536],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":14,"TaskRange":[0,4096],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,256], + "ResourceGroups": [ + { + "ProcessorRange": [0,256], + "WarpRange": [0,4], + "SramRange": [0,24672], + "TaskGroups": [ + {"TaskId":15,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + 
"TaskGroups": [ + {"TaskId":16,"TaskRange":[0,8192],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,256], + "ResourceGroups": [ + { + "ProcessorRange": [0,256], + "WarpRange": [0,4], + "SramRange": [0,24672], + "TaskGroups": [ + {"TaskId":17,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":18,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":19,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,256], + "TaskGroups": [ + {"TaskId":20,"TaskRange":[0,2048],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,32], + "ResourceGroups": [ + { + "ProcessorRange": [0,32], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":21,"TaskRange":[0,32],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":22,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":23,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":24,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,256], + "ResourceGroups": [ + { + "ProcessorRange": [0,256], + "WarpRange": [0,4], + "SramRange": [0,24672], + "TaskGroups": [ + 
{"TaskId":25,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":26,"TaskRange":[0,131072],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":27,"TaskRange":[0,8192],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,256], + "ResourceGroups": [ + { + "ProcessorRange": [0,256], + "WarpRange": [0,4], + "SramRange": [0,24672], + "TaskGroups": [ + {"TaskId":28,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":29,"TaskRange":[0,131072],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":30,"TaskRange":[0,131072],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,256], + "ResourceGroups": [ + { + "ProcessorRange": [0,256], + "WarpRange": [0,4], + "SramRange": [0,24672], + "TaskGroups": [ + {"TaskId":31,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":32,"TaskRange":[0,8192],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,24672], + "TaskGroups": [ + {"TaskId":33,"TaskRange":[0,4096],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + 
{"TaskId":34,"TaskRange":[0,4096],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,256], + "TaskGroups": [ + {"TaskId":35,"TaskRange":[0,65536],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":36,"TaskRange":[0,4096],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":37,"TaskRange":[0,4096],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,256], + "TaskGroups": [ + {"TaskId":38,"TaskRange":[0,65536],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":39,"TaskRange":[0,4096],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,256], + "ResourceGroups": [ + { + "ProcessorRange": [0,256], + "WarpRange": [0,4], + "SramRange": [0,24672], + "TaskGroups": [ + {"TaskId":40,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":41,"TaskRange":[0,8192],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,256], + "ResourceGroups": [ + { + "ProcessorRange": [0,256], + "WarpRange": [0,4], + "SramRange": [0,24672], + "TaskGroups": [ + {"TaskId":42,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + 
{"TaskId":43,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":44,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":45,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,256], + "TaskGroups": [ + {"TaskId":46,"TaskRange":[0,2048],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,32], + "ResourceGroups": [ + { + "ProcessorRange": [0,32], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":47,"TaskRange":[0,32],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":48,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":49,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":50,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,24672], + "TaskGroups": [ + {"TaskId":51,"TaskRange":[0,688],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + 
{"TaskId":52,"TaskRange":[0,688],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":53,"TaskRange":[0,688],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,24672], + "TaskGroups": [ + {"TaskId":54,"TaskRange":[0,688],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":55,"TaskRange":[0,688],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,256], + "ResourceGroups": [ + { + "ProcessorRange": [0,256], + "WarpRange": [0,4], + "SramRange": [0,24672], + "TaskGroups": [ + {"TaskId":56,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":57,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + } + ] +} diff --git a/python/ark/__init__.py b/python/ark/__init__.py index 031afc7ba..f2f604be9 100644 --- a/python/ark/__init__.py +++ b/python/ark/__init__.py @@ -100,3 +100,4 @@ def set_world_size(world_size): GpuError, RuntimeError, ) +from .profiler import Profiler diff --git a/python/ark/profiler.py b/python/ark/profiler.py new file mode 100644 index 000000000..b959ceb18 --- /dev/null +++ b/python/ark/profiler.py @@ -0,0 +1,30 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +import json +import sys +import time +from .runtime import Runtime + + +class Profiler: + def __init__(self, plan: str): + self.plan = json.loads(plan) + + def run(self): + num_processor_groups = len(self.plan["ProcessorGroups"]) + new_plan = { + "Rank": self.plan["Rank"], "WorldSize": self.plan["WorldSize"], + "NumProcessors": self.plan["NumProcessors"], + "NumWarpsPerProcessor": self.plan["NumWarpsPerProcessor"], + "TaskInfos": self.plan["TaskInfos"], + "ProcessorGroups": [{}]} + for i in range(num_processor_groups): + new_plan["ProcessorGroups"][0] = self.plan["ProcessorGroups"][i] + with Runtime() as rt: + rt.launch(plan=json.dumps(new_plan)) + start_time = time.time() + iter = 1000 + rt.run(iter=iter) + end_time = time.time() + sys.stderr.write(f"Processor group {i} runtime: {(end_time - start_time)/iter:.6f} seconds/iter\n") diff --git a/python/executor_py.cpp b/python/executor_py.cpp index e5ab4f964..a6e5308ee 100644 --- a/python/executor_py.cpp +++ b/python/executor_py.cpp @@ -149,6 +149,7 @@ void register_executor(py::module &m) { py::arg("rank"), py::arg("world_size"), py::arg("gpu_id"), py::arg("name"), py::arg("plan")) .def("gpu_id", &ark::Executor::gpu_id) + .def("plan", &ark::Executor::plan) .def("compile", &ark::Executor::compile) .def("launch", &ark::Executor::launch, py::arg("max_spin_count") = -1) .def("run", &ark::Executor::run, py::arg("iter")) From ff8c4b8fc4ff178befa375ffc8ac546806fa6c4b Mon Sep 17 00:00:00 2001 From: Noli Gerawork <86308445+naturalcandy@users.noreply.github.com> Date: Tue, 2 Jul 2024 21:25:07 -0400 Subject: [PATCH 20/61] torch to ark (#217) - Adds Torch to ARK tensor conversion support - New ModelBufferManager class handles external buffer registration and simplifies buffer access during kernel initialization - Adds test cases for ARK to Torch conversion support --------- Co-authored-by: Changho Hwang --- ark/api/executor.cpp | 53 ++++++++++++++++--- ark/api/tensor.cpp | 18 ++++++- ark/codegen.cpp | 36 +++++++++---- 
ark/codegen.hpp | 4 +- ark/include/ark/tensor.hpp | 2 + ark/model/model_buffer.cpp | 55 ++++++++++++++++++-- ark/model/model_buffer.hpp | 15 ++++++ ark/model_buffer_manager.hpp | 58 +++++++++++++++++++++ python/ark/tensor.py | 26 +++++----- python/tensor_py.cpp | 46 ++++++++++++++++- python/unittest/test_conversion.py | 81 +++++++++++++++++++++++++++++- 11 files changed, 355 insertions(+), 39 deletions(-) create mode 100644 ark/model_buffer_manager.hpp diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp index 4af9df7c0..0a780bcc0 100644 --- a/ark/api/executor.cpp +++ b/ark/api/executor.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include "ark/data_type.hpp" #include "ark/model.hpp" @@ -24,6 +25,7 @@ #include "gpu/gpu_manager.h" #include "logging.h" #include "model/model_buffer.hpp" +#include "model_buffer_manager.hpp" #include "model/model_data_type.hpp" #include "model/model_tensor.hpp" #include "utils/utils_net.hpp" @@ -234,8 +236,15 @@ void Executor::Impl::init(const std::string &plan) { std::to_string(kv.first) + ": " + std::to_string(kv.second) + ", "; } - codegen_ = - std::make_shared(plan_json_, buffer_id_to_offset_, name_); + ModelBufferManager &buffer_manager = ModelBufferManager::get_instance(); + + if (!buffer_manager.is_empty()) { + codegen_ = std::make_shared( + plan_json_, buffer_id_to_offset_, name, &buffer_manager); + } else { + codegen_ = std::make_shared(plan_json_, + buffer_id_to_offset_, name); + } auto gpu_manager = GpuManager::get_instance(gpu_id_); timer_begin_ = gpu_manager->create_event(); @@ -367,7 +376,16 @@ std::map Executor::Impl::init_buffers(const Json &plan_json) { } continue; } - buffer_id_to_offset[buf_info->buffer->id()] = offset; + if (buf_info->buffer->is_external()) { + if (buf_info->buffer->device_id() != gpu_id_) { + ERR(InvalidUsageError, + "PyTorch tensor and model execution are on different GPUs"); + } + continue; + } else { + buffer_id_to_offset[buf_info->buffer->id()] = offset; + offset += 
buf_info->bytes; + } for (const auto &tag_info : buf_info->buffer->send_tags()) { remote_rank_to_send_tags_and_offsets[tag_info.first] .first.push_back(tag_info.second); @@ -380,7 +398,6 @@ std::map Executor::Impl::init_buffers(const Json &plan_json) { remote_rank_to_recv_tags_and_offsets[tag_info.first] .second.push_back(offset); } - offset += buf_info->bytes; } total_bytes_ = offset; @@ -456,7 +473,11 @@ std::map Executor::Impl::init_buffers(const Json &plan_json) { bootstrap->recv(tags.data(), len * sizeof(int), remote_rank, 1); bootstrap->recv(offsets.data(), len * sizeof(size_t), remote_rank, 2); for (int i = 0; i < len; ++i) { - buffer_id_to_offset[send_tag_to_buffer_id[tags[i]]] = offsets[i]; + if (!buffer_id_to_info[send_tag_to_buffer_id[tags[i]]] + ->buffer->is_external()) { + buffer_id_to_offset[send_tag_to_buffer_id[tags[i]]] = + offsets[i]; + } } } for (auto &kv : remote_rank_to_recv_tag_to_buffer_id) { @@ -472,10 +493,13 @@ std::map Executor::Impl::init_buffers(const Json &plan_json) { bootstrap->recv(tags.data(), len * sizeof(int), remote_rank, 4); bootstrap->recv(offsets.data(), len * sizeof(size_t), remote_rank, 5); for (int i = 0; i < len; ++i) { - buffer_id_to_offset[recv_tag_to_buffer_id[tags[i]]] = offsets[i]; + if (!buffer_id_to_info[recv_tag_to_buffer_id[tags[i]]] + ->buffer->is_external()) { + buffer_id_to_offset[recv_tag_to_buffer_id[tags[i]]] = + offsets[i]; + } } } - return buffer_id_to_offset; } @@ -742,6 +766,11 @@ uintptr_t Executor::Impl::tensor_address(const Tensor tensor) const { void Executor::Impl::tensor_read(const Tensor tensor, void *data, size_t bytes, bool is_d2d) const { GLOG(gpuSetDevice(gpu_id_)); + if (tensor.ref()->buffer()->is_external()) { + ERR(InvalidUsageError, + "Reading data from a tensor preallocated by PyTorch is not " + "supported. 
Use PyTorch's native methods."); + } size_t tensor_data_bytes = tensor.shape().nelems() * tensor.data_type().bytes(); if (bytes != tensor_data_bytes) { @@ -779,6 +808,11 @@ void Executor::Impl::tensor_read(const Tensor tensor, void *data, size_t bytes, void Executor::Impl::tensor_write(const Tensor tensor, const void *data, size_t bytes, bool is_d2d) const { GLOG(gpuSetDevice(gpu_id_)); + if (tensor.ref()->buffer()->is_external()) { + ERR(InvalidUsageError, + "Writing data to a tensor preallocated by PyTorch is not " + "supported. Use PyTorch's native methods."); + } size_t tensor_data_bytes = tensor.shape().nelems() * tensor.data_type().bytes(); if (bytes != tensor_data_bytes) { @@ -843,7 +877,10 @@ float Executor::stop(int64_t max_spin_count) { void Executor::barrier() { impl_->barrier(); } -void Executor::destroy() { impl_.reset(nullptr); } +void Executor::destroy() { + ModelBufferManager::get_instance().clear_buffers(); + impl_.reset(nullptr); +} bool Executor::destroyed() const { return impl_.get() == nullptr; } diff --git a/ark/api/tensor.cpp b/ark/api/tensor.cpp index 4b03c3ac8..4d33bd9f1 100644 --- a/ark/api/tensor.cpp +++ b/ark/api/tensor.cpp @@ -3,11 +3,25 @@ #include "ark/tensor.hpp" +#include "model/model_buffer.hpp" #include "model/model_data_type.hpp" #include "model/model_tensor.hpp" namespace ark { +Tensor::Tensor(void* data_ptr, int32_t device_id, + const std::vector& shape, + const DataType& dtype) { + size_t external_data_size = std::accumulate(shape.begin(), shape.end(), 1, + std::multiplies()) * + dtype.bytes(); + auto buffer = + std::make_shared(data_ptr, external_data_size, device_id); + auto tensor = std::make_shared(dtype.ref(), buffer, Dims(shape), + Dims(shape), Dims(), Dims()); + ref_ = tensor; +} + size_t Tensor::id() const { if (ref_) { return ref_->id(); @@ -43,14 +57,14 @@ Dims Tensor::padded_shape() const { return Dims(); } -const DataType &Tensor::data_type() const { +const DataType& Tensor::data_type() const { if (ref_) { return 
DataType::from_name(ref_->data_type()->type_name()); } return NONE; } -std::ostream &operator<<(std::ostream &os, const Tensor &tensor) { +std::ostream& operator<<(std::ostream& os, const Tensor& tensor) { if (tensor.is_null()) { os << "null"; } else { diff --git a/ark/codegen.cpp b/ark/codegen.cpp index 09ff28dd3..a97e5e45b 100644 --- a/ark/codegen.cpp +++ b/ark/codegen.cpp @@ -10,6 +10,7 @@ #include "file_io.h" #include "logging.h" #include "model/model_buffer.hpp" +#include "model_buffer_manager.hpp" #include "model/model_data_type.hpp" #include "model/model_op.hpp" #include "model/model_tensor.hpp" @@ -43,7 +44,7 @@ class CodeGenerator::Impl { public: Impl(const PlanJson &plan, const std::map &buffer_id_to_offset, - const std::string &name); + const std::string &name, ModelBufferManager *buffer_manager); ~Impl() = default; private: @@ -64,6 +65,8 @@ class CodeGenerator::Impl { std::string sync_process_range(const Range &ranges, int state_id); + ModelBufferManager *buffer_manager_; + protected: friend class CodeGenerator; @@ -78,14 +81,18 @@ class CodeGenerator::Impl { CodeGenerator::Impl::Impl(const PlanJson &plan, const std::map &buffer_id_to_offset, - const std::string &name) - : buffer_id_to_offset_(buffer_id_to_offset), name_(name) { + const std::string &name, + ModelBufferManager *buffer_manager) + : buffer_id_to_offset_(buffer_id_to_offset), + name_(name), + buffer_manager_(buffer_manager) { rank_ = plan.at("Rank"); world_size_ = plan.at("WorldSize"); num_procs_ = plan.at("NumProcessors"); num_warps_per_proc_ = plan.at("NumWarpsPerProcessor"); std::stringstream definitions_ss; + for (auto &task_json : plan.at("TaskInfos")) { definitions_ss << this->def_task(task_json); } @@ -224,11 +231,19 @@ std::string CodeGenerator::Impl::def_task(const Json &task_json) { auto &arg = impl_args[i]; if (arg.type_name() == "TENSOR") { auto tns = arg.value(); - size_t buffer_offset = - buffer_id_to_offset_.at(tns->buffer()->id()); - size_t offset = buffer_offset + 
ModelOffset(tns).value(); - ss << "(" << tns->data_type()->type_str() << "*)&_buf[" - << offset << "]"; + if (tns->buffer()->is_external()) { + void *buf_addr = + ModelBufferManager::get_instance().get_buffer( + tns->buffer()->id()); + ss << "(" << tns->data_type()->type_str() << "*)" + << buf_addr; + } else { + size_t buffer_offset = + buffer_id_to_offset_.at(tns->buffer()->id()); + size_t offset = buffer_offset + ModelOffset(tns).value(); + ss << "(" << tns->data_type()->type_str() << "*)&_buf[" + << offset << "]"; + } } else if (arg.type_name() == "OFFSET") { auto moff = arg.value(); size_t buffer_offset = @@ -431,8 +446,9 @@ std::string CodeGenerator::Impl::sync_process_range(const Range &range, CodeGenerator::CodeGenerator( const PlanJson &plan, const std::map &buffer_id_to_offset, - const std::string &name) - : impl_(std::make_shared(plan, buffer_id_to_offset, name)) {} + const std::string &name, ModelBufferManager *buffer_manager) + : impl_(std::make_shared(plan, buffer_id_to_offset, name, + buffer_manager)) {} std::string CodeGenerator::code() const { return impl_->code_; } diff --git a/ark/codegen.hpp b/ark/codegen.hpp index 4f8307e7e..a2976e644 100644 --- a/ark/codegen.hpp +++ b/ark/codegen.hpp @@ -8,6 +8,7 @@ #include #include +#include "model_buffer_manager.hpp" #include "model/model_json.hpp" namespace ark { @@ -16,7 +17,8 @@ class CodeGenerator { public: CodeGenerator(const PlanJson &plan, const std::map &buffer_id_to_offset, - const std::string &name = "ark_kernel"); + const std::string &name = "ark_kernel", + ModelBufferManager *buffer_manager = nullptr); ~CodeGenerator() = default; diff --git a/ark/include/ark/tensor.hpp b/ark/include/ark/tensor.hpp index 747ce5fea..d13748175 100644 --- a/ark/include/ark/tensor.hpp +++ b/ark/include/ark/tensor.hpp @@ -31,6 +31,8 @@ class Tensor { Tensor(ModelTensorRef ref) : ref_(ref) {} Tensor(const Tensor &other) = default; Tensor &operator=(const Tensor &other) = default; + Tensor(void *data_ptr, int32_t 
device_id, const std::vector &shape, + const DataType &dtype); bool operator==(const Tensor &other) const { return ref_ == other.ref_; } bool operator!=(const Tensor &other) const { return ref_ != other.ref_; } diff --git a/ark/model/model_buffer.cpp b/ark/model/model_buffer.cpp index 4ce91b5e4..ce8f37727 100644 --- a/ark/model/model_buffer.cpp +++ b/ark/model/model_buffer.cpp @@ -4,13 +4,13 @@ #include "model_buffer.hpp" #include "logging.h" +#include "model_buffer_manager.hpp" namespace ark { -ModelBuffer::ModelBuffer(int rank) : rank_(rank) { - static size_t id = 0; - id_ = id++; -} +size_t ModelBuffer::curr_id = 0; + +ModelBuffer::ModelBuffer(int rank) : rank_(rank) { id_ = curr_id++; } ModelBuffer::ModelBuffer(size_t id, int rank, const std::vector &send_tags, @@ -24,6 +24,23 @@ ModelBuffer::ModelBuffer(size_t id, int rank, } } +ModelBuffer::ModelBuffer(void *data, size_t size, int32_t device_id) + : rank_(-1), + external_data_(data), + external_data_size_(size), + device_id_(device_id), + is_external_(true) { + id_ = curr_id++; +} + +ModelBuffer::ModelBuffer(size_t id, void *data, size_t size, int32_t device_id) + : id_(id), + rank_(-1), + external_data_(data), + external_data_size_(size), + device_id_(device_id), + is_external_(true) {} + void ModelBuffer::tag_send(int remote_rank, int tag) { send_tags_.insert(TagInfo{remote_rank, tag}); } @@ -46,6 +63,14 @@ Json ModelBuffer::serialize() const { } j["SendTags"] = send_tags; j["RecvTags"] = recv_tags; + j["IsExternal"] = is_external_; + if (is_external_) { + ModelBufferManager::get_instance().register_buffer(id_, external_data_, + external_data_size_); + j["ExternalDataSize"] = external_data_size_; + j["DeviceId"] = device_id_; + } + // external_data_ptr_ is not included in JSON return j; } @@ -62,6 +87,28 @@ std::shared_ptr ModelBuffer::deserialize(const Json &serialized) { } else if (!serialized.contains("RecvTags")) { ERR(InvalidUsageError, "ModelBuffer deserialization failed: missing RecvTags"); + } else 
if (!serialized.contains("IsExternal")) { + ERR(InvalidUsageError, + "ModelBuffer deserialization failed: missing IsExternal"); + } + if (serialized["IsExternal"]) { + if (!serialized.contains("ExternalDataSize")) { + ERR(InvalidUsageError, + "ModelBuffer deserialization failed: missing ExternalDataSize"); + } else if (!serialized.contains("DeviceId")) { + ERR(InvalidUsageError, + "ModelBuffer deserialization failed: missing DeviceId"); + } + void *data_ptr = + ModelBufferManager::get_instance().get_buffer(serialized["Id"]); + if (!data_ptr) { + ERR(InvalidUsageError, + "ModelBuffer deserialization failed: external buffer not found " + "in BufferManager"); + } + return std::make_shared(serialized["Id"], data_ptr, + serialized["ExternalDataSize"], + serialized["DeviceId"]); } return std::make_shared(serialized["Id"], serialized["Rank"], serialized["SendTags"], diff --git a/ark/model/model_buffer.hpp b/ark/model/model_buffer.hpp index 7ad3db206..e7f1045b2 100644 --- a/ark/model/model_buffer.hpp +++ b/ark/model/model_buffer.hpp @@ -22,6 +22,10 @@ class ModelBuffer { ModelBuffer(size_t id, int rank, const std::vector &send_tags, const std::vector &recv_tags); + // externally managed buffer + ModelBuffer(void *data, size_t size, int32_t device_id); + ModelBuffer(size_t id, void *data, size_t size, int32_t device_id); + size_t id() const { return id_; } int rank() const { return rank_; } @@ -44,11 +48,22 @@ class ModelBuffer { static std::shared_ptr deserialize(const Json &serialized); + // external buffer management + size_t external_data_size() const { return external_data_size_; } + void *external_data() const { return external_data_; } + int32_t device_id() const { return device_id_; } + bool is_external() const { return is_external_; } + private: + static size_t curr_id; size_t id_; int rank_; std::set send_tags_; std::set recv_tags_; + void *external_data_ = nullptr; + size_t external_data_size_ = 0; + int32_t device_id_; + bool is_external_ = false; }; } // 
namespace ark diff --git a/ark/model_buffer_manager.hpp b/ark/model_buffer_manager.hpp new file mode 100644 index 000000000..7b705f4c8 --- /dev/null +++ b/ark/model_buffer_manager.hpp @@ -0,0 +1,58 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#ifndef ARK_MODEL_BUFFER_MANAGER_HPP_ +#define ARK_MODEL_BUFFER_MANAGER_HPP_ + +#include +#include + +namespace ark { +// Manages externally allocated buffers not in the ARK memory space. +class ModelBufferManager { + public: + static ModelBufferManager& get_instance() { + static ModelBufferManager instance; + return instance; + } + + void register_buffer(size_t id, void* data, size_t size) { + buffers_[id] = std::make_tuple(data, size); + } + + void* get_buffer(size_t id) { + auto it = buffers_.find(id); + if (it != buffers_.end()) { + return std::get<0>(it->second); + } + return nullptr; + } + + size_t get_buffer_size(size_t id) { + auto it = buffers_.find(id); + if (it != buffers_.end()) { + return std::get<1>(it->second); + } + return 0; + } + + const std::unordered_map>& get_buffers() + const { + return buffers_; + } + + void clear_buffers() { buffers_.clear(); } + + bool is_empty() const { return buffers_.empty(); } + + private: + std::unordered_map> + buffers_; // Maps buffer IDs to pointers and sizes. 
+ size_t next_compact_id_ = 0; + ModelBufferManager() {} + ModelBufferManager(const ModelBufferManager&) = delete; + ModelBufferManager& operator=(const ModelBufferManager&) = delete; +}; +} // namespace ark + +#endif // ARK_MODEL_BUFFER_MANAGER_HPP_ diff --git a/python/ark/tensor.py b/python/ark/tensor.py index ac2886960..8f26dc96e 100644 --- a/python/ark/tensor.py +++ b/python/ark/tensor.py @@ -167,18 +167,20 @@ def from_numpy(self, ndarray: np.ndarray) -> "Tensor": return self @staticmethod - def from_torch(tensor: torch.Tensor): - return Tensor( - Model.get_model().tensor( - Dims(list(tensor.shape)), - DataType.from_torch(tensor.dtype).ctype(), - Dims(), - Dims(), - Dims(), - "", - ), - lambda: tensor, - ) + def from_torch(tensor: torch.Tensor, runtime_id: int = -1) -> "Tensor": + """ + Returns an ARK tensor that shares the same memory with the torch tensor. + """ + if _no_torch: + raise ImportError("torch is not available") + elif not tensor.is_contiguous(): + raise ValueError("Torch tensor must be contiguous.") + elif tensor.device.type == "cpu": + raise ValueError("Torch tensor must be on a device.") + ark_dtype = DataType.from_torch(tensor.dtype) + dl_capsule = torch.utils.dlpack.to_dlpack(tensor) + ark_tensor = _Tensor(dl_capsule, ark_dtype.ctype()) + return Tensor(ark_tensor, runtime_id=runtime_id) def copy(self, data: Union[np.ndarray, torch.Tensor]) -> "Tensor": """ diff --git a/python/tensor_py.cpp b/python/tensor_py.cpp index fbd909d3d..16eb03421 100644 --- a/python/tensor_py.cpp +++ b/python/tensor_py.cpp @@ -1,6 +1,7 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. 
+#include #include #include #include @@ -9,8 +10,51 @@ namespace py = pybind11; -void register_tensor(py::module &m) { +struct DLTensorMetadata { + void* data_ptr; + int32_t device_id; + DLDeviceType device_type; + int32_t ndim; + DLDataType dtype; + std::vector shape; + std::vector strides; + uint64_t byte_offset; +}; + +static DLTensorMetadata extractDLTensorMetadata(DLManagedTensor* dl_tensor) { + DLTensorMetadata metadata; + metadata.data_ptr = dl_tensor->dl_tensor.data; + metadata.device_id = dl_tensor->dl_tensor.device.device_id; + metadata.device_type = dl_tensor->dl_tensor.device.device_type; + metadata.ndim = dl_tensor->dl_tensor.ndim; + metadata.dtype = dl_tensor->dl_tensor.dtype; + metadata.shape.assign( + dl_tensor->dl_tensor.shape, + dl_tensor->dl_tensor.shape + dl_tensor->dl_tensor.ndim); + if (dl_tensor->dl_tensor.strides != nullptr) { + metadata.strides.assign( + dl_tensor->dl_tensor.strides, + dl_tensor->dl_tensor.strides + dl_tensor->dl_tensor.ndim); + } + metadata.byte_offset = dl_tensor->dl_tensor.byte_offset; + return metadata; +} + +void register_tensor(py::module& m) { py::class_(m, "_Tensor") + .def(py::init([](py::capsule capsule, const ark::DataType& dtype) { + DLManagedTensor* dl_tensor = (DLManagedTensor*)capsule; + if (!dl_tensor) { + throw std::runtime_error( + "Capsule does not contain a DLManagedTensor"); + } + DLTensorMetadata metadata = extractDLTensorMetadata(dl_tensor); + int32_t device_id = metadata.device_id; + void* data_ptr = metadata.data_ptr; + auto shape = metadata.shape; + + return new ark::Tensor(data_ptr, device_id, shape, dtype); + })) .def("id", &ark::Tensor::id) .def("shape", &ark::Tensor::shape, py::return_value_policy::reference) .def("strides", &ark::Tensor::strides, diff --git a/python/unittest/test_conversion.py b/python/unittest/test_conversion.py index 5befa1c34..833b88662 100644 --- a/python/unittest/test_conversion.py +++ b/python/unittest/test_conversion.py @@ -1,6 +1,7 @@ import pytest import numpy as np 
import ark +from typing import Callable try: import torch @@ -9,6 +10,8 @@ except ImportError: _no_torch = True +# ARK to Torch tests + def initialize_tensor(dimensions, dtype): tensor = ark.tensor(dimensions, dtype) @@ -69,7 +72,7 @@ def check_diff(input_tensor_host, input_view_numpy, value, index): # Test function to check if changes to the torch views are reflected in the original tensors @pytest.mark.parametrize("dtype", [ark.fp16, ark.fp32]) -def test_aliasing(dtype: ark.DataType): +def test_ark_to_torch_aliasing(dtype: ark.DataType): ark.init() dimensions = [4, 4] input_tensor, input_tensor_host = initialize_tensor(dimensions, dtype) @@ -126,3 +129,79 @@ def test_conversion_torch(): torch_tensor = t.to_torch() assert torch.all(torch_tensor == 7) + + +# Torch to ARK tests + +ArkBinOp = Callable[[ark.Tensor, ark.Tensor], ark.Tensor] +TorchBinOp = Callable[[torch.Tensor, torch.Tensor], torch.Tensor] +ArkUnOp = Callable[[ark.Tensor], ark.Tensor] +TorchUnOp = Callable[[torch.Tensor], torch.Tensor] + + +# Verify the accuracy of binary operations involving ARK view tensors +@pytest.mark.parametrize( + "dtype, ark_op, torch_op, tensor_dims", + [(torch.float16, ark.add, torch.add, (2, 3))], +) +def test_bin_op(dtype, ark_op: ArkBinOp, torch_op: TorchBinOp, tensor_dims): + ark.init() + input_tensor = torch.randn(tensor_dims, dtype=dtype, device="cuda:0") + other_tensor = torch.randn(tensor_dims, dtype=dtype, device="cuda:0") + expected_output = torch_op(input_tensor, other_tensor).cpu().numpy() + input_ark_view = ark.Tensor.from_torch(input_tensor) + other_ark_view = ark.Tensor.from_torch(other_tensor) + output = ark_op(input_ark_view, other_ark_view) + runtime = ark.Runtime() + runtime.launch() + runtime.run() + output_host = output.to_numpy() + runtime.stop() + runtime.reset() + assert np.allclose(output_host, expected_output) + + +# Verify the accuracy of unary operations involving ARK view tensors +@pytest.mark.parametrize( + "dtype, ark_op, torch_op, tensor_dims", 
+ [(torch.float16, ark.exp, torch.exp, (3, 3))], +) +def test_unary_op(dtype, ark_op: ArkUnOp, torch_op: TorchUnOp, tensor_dims): + ark.init() + input_tensor = torch.randn(tensor_dims, dtype=dtype, device="cuda:0") + expected_output = torch_op(input_tensor).cpu().numpy() + input_ark_view = ark.Tensor.from_torch(input_tensor) + output = ark_op(input_ark_view) + runtime = ark.Runtime() + runtime.launch() + runtime.run() + output_host = output.to_numpy() + runtime.stop() + runtime.reset() + assert np.allclose(output_host, expected_output) + + +# Test function to check if changes in torch tensors are reflected in ARK views +@pytest.mark.parametrize("dtype, tensor_dims", [(torch.float16, (64, 64))]) +def test_torch_to_ark_aliasing(dtype, tensor_dims): + ark.init() + # Initialize a PyTorch tensor + input_tensor = torch.randn(tensor_dims, dtype=dtype, device="cuda:0") + other_tensor = torch.randn(tensor_dims, dtype=dtype, device="cuda:0") + + input_ark_view = ark.Tensor.from_torch(input_tensor) + other_ark_view = ark.Tensor.from_torch(other_tensor) + + output = ark.add(input_ark_view, other_ark_view) + # Perform in place operations + input_tensor += other_tensor + other_tensor += input_tensor + expected_output = (input_tensor + other_tensor).cpu().numpy() + + runtime = ark.Runtime() + runtime.launch() + runtime.run() + output_host = output.to_numpy() + runtime.stop() + runtime.reset() + assert np.allclose(output_host, expected_output) From fe35541e02029b0d9a8da4cbdccf2565cbf516b0 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Wed, 3 Jul 2024 06:51:43 +0000 Subject: [PATCH 21/61] wip --- ark/api/executor.cpp | 22 +- ark/api/planner.cpp | 1 + ark/codegen.cpp | 16 +- ark/codegen.hpp | 3 +- ark/model/model_json.cpp | 14 +- ark/model_buffer_manager.hpp | 5 +- cmake/Utils.cmake | 2 +- docs/plan_file.md | 18 + examples/llama/model_test.py | 2 +- examples/tutorial/default_plan.json | 115 +++--- examples/tutorial/model.json | 46 +-- examples/tutorial/plan.json | 63 ++-- 
examples/tutorial/plan_1_larger_tile.json | 47 +-- examples/tutorial/plan_2_split_k.json | 63 ++-- examples/tutorial/plan_3_overwrite.json | 63 ++-- examples/tutorial/plan_tutorial.py | 4 +- plan_gpu0.json | 415 +++++++++++----------- python/ark/__init__.py | 3 +- python/ark/planner.py | 184 ++++++++++ python/ark/profiler.py | 30 +- python/ark/runtime.py | 52 +-- python/unittest/test_runtime.py | 27 +- 22 files changed, 686 insertions(+), 509 deletions(-) create mode 100644 python/ark/planner.py diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp index 0a780bcc0..20b162b16 100644 --- a/ark/api/executor.cpp +++ b/ark/api/executor.cpp @@ -228,6 +228,16 @@ void Executor::Impl::init(const std::string &plan) { plan_json_ = Json::parse(plan); } + auto gpu_manager = GpuManager::get_instance(gpu_id_); + + if (!gpu_manager->info().arch->belongs_to( + Arch::from_name(plan_json_.at("Architecture")))) { + LOG(WARN, "Architecture name of the plan `", + plan_json_.at("Architecture").get(), + "` is not compatible with the GPU architecture `", + gpu_manager->info().arch->name(), "`."); + } + buffer_id_to_offset_ = init_buffers(plan_json_); std::string buffer_id_to_offset_str; @@ -236,17 +246,9 @@ void Executor::Impl::init(const std::string &plan) { std::to_string(kv.first) + ": " + std::to_string(kv.second) + ", "; } - ModelBufferManager &buffer_manager = ModelBufferManager::get_instance(); + codegen_ = std::make_shared(plan_json_, buffer_id_to_offset_, + name_); - if (!buffer_manager.is_empty()) { - codegen_ = std::make_shared( - plan_json_, buffer_id_to_offset_, name, &buffer_manager); - } else { - codegen_ = std::make_shared(plan_json_, - buffer_id_to_offset_, name); - } - - auto gpu_manager = GpuManager::get_instance(gpu_id_); timer_begin_ = gpu_manager->create_event(); timer_end_ = gpu_manager->create_event(); buffer_ = gpu_manager->malloc(total_bytes_, 65536); diff --git a/ark/api/planner.cpp b/ark/api/planner.cpp index 5c9d09f2e..14e1b7b41 100644 --- 
a/ark/api/planner.cpp +++ b/ark/api/planner.cpp @@ -119,6 +119,7 @@ std::string DefaultPlanner::Impl::plan(bool pretty) const { Json plan; plan["Rank"] = model_.rank(); plan["WorldSize"] = model_.world_size(); + plan["Architecture"] = gpu_info.arch->name(); plan["NumProcessors"] = max_num_processors; plan["NumWarpsPerProcessor"] = max_num_warps; plan["TaskInfos"] = task_infos; diff --git a/ark/codegen.cpp b/ark/codegen.cpp index a97e5e45b..55327329a 100644 --- a/ark/codegen.cpp +++ b/ark/codegen.cpp @@ -44,7 +44,7 @@ class CodeGenerator::Impl { public: Impl(const PlanJson &plan, const std::map &buffer_id_to_offset, - const std::string &name, ModelBufferManager *buffer_manager); + const std::string &name); ~Impl() = default; private: @@ -65,8 +65,6 @@ class CodeGenerator::Impl { std::string sync_process_range(const Range &ranges, int state_id); - ModelBufferManager *buffer_manager_; - protected: friend class CodeGenerator; @@ -81,11 +79,8 @@ class CodeGenerator::Impl { CodeGenerator::Impl::Impl(const PlanJson &plan, const std::map &buffer_id_to_offset, - const std::string &name, - ModelBufferManager *buffer_manager) - : buffer_id_to_offset_(buffer_id_to_offset), - name_(name), - buffer_manager_(buffer_manager) { + const std::string &name) + : buffer_id_to_offset_(buffer_id_to_offset), name_(name) { rank_ = plan.at("Rank"); world_size_ = plan.at("WorldSize"); num_procs_ = plan.at("NumProcessors"); @@ -446,9 +441,8 @@ std::string CodeGenerator::Impl::sync_process_range(const Range &range, CodeGenerator::CodeGenerator( const PlanJson &plan, const std::map &buffer_id_to_offset, - const std::string &name, ModelBufferManager *buffer_manager) - : impl_(std::make_shared(plan, buffer_id_to_offset, name, - buffer_manager)) {} + const std::string &name) + : impl_(std::make_shared(plan, buffer_id_to_offset, name)) {} std::string CodeGenerator::code() const { return impl_->code_; } diff --git a/ark/codegen.hpp b/ark/codegen.hpp index a2976e644..1ed8ec9f2 100644 --- 
a/ark/codegen.hpp +++ b/ark/codegen.hpp @@ -17,8 +17,7 @@ class CodeGenerator { public: CodeGenerator(const PlanJson &plan, const std::map &buffer_id_to_offset, - const std::string &name = "ark_kernel", - ModelBufferManager *buffer_manager = nullptr); + const std::string &name = "ark_kernel"); ~CodeGenerator() = default; diff --git a/ark/model/model_json.cpp b/ark/model/model_json.cpp index 97ce71967..86eb843e2 100644 --- a/ark/model/model_json.cpp +++ b/ark/model/model_json.cpp @@ -250,9 +250,13 @@ static void verify_format_processor_group(const Json &json) { } static void verify_format_plan(const Json &json) { - const std::vector required_fields = { - "Rank", "WorldSize", "NumProcessors", "NumWarpsPerProcessor", - "TaskInfos", "ProcessorGroups"}; + const std::vector required_fields = {"Rank", + "WorldSize", + "Architecture", + "NumProcessors", + "NumWarpsPerProcessor", + "TaskInfos", + "ProcessorGroups"}; for (const auto &field : required_fields) { if (!json.contains(field)) { ERR(NotFoundError, "PlanJson: " + field + " not found"); @@ -276,6 +280,7 @@ PlanJson::PlanJson(const Json &json) : Json((json != nullptr) ? 
json : Json{{"Rank", 0}, {"WorldSize", 1}, + {"Architecture", "ANY"}, {"NumProcessors", 1}, {"NumWarpsPerProcessor", 1}, {"TaskInfos", Json::array()}, @@ -292,6 +297,9 @@ static std::stringstream &dump_pretty_plan(const Json &json, dump_pretty_item(json.at("WorldSize"), "WorldSize", ss, indent + indent_step) << ",\n"; + dump_pretty_item(json.at("Architecture"), "Architecture", ss, + indent + indent_step) + << ",\n"; dump_pretty_item(json.at("NumProcessors"), "NumProcessors", ss, indent + indent_step) << ",\n"; diff --git a/ark/model_buffer_manager.hpp b/ark/model_buffer_manager.hpp index 7b705f4c8..4baaec7fe 100644 --- a/ark/model_buffer_manager.hpp +++ b/ark/model_buffer_manager.hpp @@ -46,9 +46,8 @@ class ModelBufferManager { bool is_empty() const { return buffers_.empty(); } private: - std::unordered_map> - buffers_; // Maps buffer IDs to pointers and sizes. - size_t next_compact_id_ = 0; + // Maps buffer IDs to pointers and sizes. + std::unordered_map> buffers_; ModelBufferManager() {} ModelBufferManager(const ModelBufferManager&) = delete; ModelBufferManager& operator=(const ModelBufferManager&) = delete; diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake index 9bb83fb42..b1fd1b132 100644 --- a/cmake/Utils.cmake +++ b/cmake/Utils.cmake @@ -14,7 +14,7 @@ if(GIT_CLANG_FORMAT) COMMAND ${GIT_CLANG_FORMAT} --style=file --diff || true ) add_custom_target(cpplint-autofix - COMMAND ${GIT_CLANG_FORMAT} --style=file || true + COMMAND ${GIT_CLANG_FORMAT} --style=file --extensions cc,cpp,h,hpp,cu,in,hip || true ) else() message(STATUS "git-clang-format not found.") diff --git a/docs/plan_file.md b/docs/plan_file.md index 90a4537a2..c06ccc35d 100644 --- a/docs/plan_file.md +++ b/docs/plan_file.md @@ -6,6 +6,7 @@ See an example plan file: [Example 1](../examples/tutorial/default_plan.json) - Rank (Int) - WorldSize (Int) + - Architecture (String) - NumProcessors (Int) - NumWarpsPerProcessor (Int) - TaskInfos (Array of TaskInfo) @@ -42,6 +43,23 @@ See an example plan file: 
[Example 1](../examples/tutorial/default_plan.json) `ProcessorRange`, `WarpRange`, `SramRange`, and `TaskRange` are in the "range" format, i.e., `[Begin, End, Step]` that indicates an arithmetic integer sequence with a common difference of `Step`, starting from `Begin` and ends before `End` (does not include `End`). They alternatively can be in the format `[Begin, End]` that assumes `Step` is 1. +## Architecture + +A name that refers to the hardware architecture where the plan is supposed to run over. The following names are currently supported. + +- `ANY`: compatible with all architectures. + +- NVIDIA Family + - `CUDA`: compatible with all supported NVIDIA architectures. + - `CUDA_70`: compatible with NVIDIA Volta architecture. + - `CUDA_80`: compatible with NVIDIA Ampere architecture. + - `CUDA_90`: compatible with NVIDIA Hopper architecture. + +- AMD Family + - `ROCM`: compatible with all supported AMD architectures. + - `ROCM_90A`: compatible with AMD CDNA 2 (GFX90A) architecture. + - `ROCM_942`: compatible with AMD CDNA 3 (GFX942) architecture. + ## TaskInfo A `TaskInfo` object describes a sequential set of operators. The followings describe each field of `TaskInfo`. 
diff --git a/examples/llama/model_test.py b/examples/llama/model_test.py index 585341640..71485be45 100644 --- a/examples/llama/model_test.py +++ b/examples/llama/model_test.py @@ -59,7 +59,7 @@ def run_ark( output = module(*module_inputs) with ark.Runtime() as rt: - rt.launch(plan_path="/mnt/changhohwang/ark/plan_gpu0.json") + rt.launch(ark.Plan.from_file("/mnt/changhohwang/ark/plan_gpu0.json")) # Load model parameters if state_dict: diff --git a/examples/tutorial/default_plan.json b/examples/tutorial/default_plan.json index c6b4be243..bb774a5b8 100644 --- a/examples/tutorial/default_plan.json +++ b/examples/tutorial/default_plan.json @@ -1,36 +1,37 @@ { "Rank": 0, "WorldSize": 1, - "NumProcessors": 108, - "NumWarpsPerProcessor": 8, + "Architecture": "ROCM_942", + "NumProcessors": 304, + "NumWarpsPerProcessor": 4, "TaskInfos": [ { "Id": 0, - "NumWarps": 8, - "SramBytes": 147456, + "NumWarps": 4, + "SramBytes": 24672, "Ops": [ { "Type": "Matmul", "Name": "matmul", "IsVirtual": false, "ReadTensors": [ - {"Id":0,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":1,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":0,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":1,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":4,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}} + 
{"Id":4,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":5,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":5,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "TransposeInput": {"BOOL":false}, "TransposeOther": {"BOOL":true} }, "Config": { - "NumWarps": 8, - "SramBytes": 147456, - "TileShapeMNK": [128,256,64], + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [128,256,32], "NumTasks": 172 } } @@ -46,13 +47,13 @@ "Name": "sigmoid", "IsVirtual": false, "ReadTensors": [ - {"Id":5,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":5,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":6,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":6,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":7,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]}} + 
{"Id":7,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -74,14 +75,14 @@ "Name": "mul", "IsVirtual": false, "ReadTensors": [ - {"Id":5,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":7,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":5,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":7,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":8,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":8,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":9,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":9,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -95,31 +96,31 @@ }, { "Id": 3, - "NumWarps": 8, - "SramBytes": 147456, + "NumWarps": 4, + "SramBytes": 24672, "Ops": [ { "Type": "Matmul", 
"Name": "matmul_1", "IsVirtual": false, "ReadTensors": [ - {"Id":0,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":3,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":0,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":3,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":10,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":10,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":11,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":11,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "TransposeInput": {"BOOL":false}, "TransposeOther": {"BOOL":true} }, "Config": { - "NumWarps": 8, - "SramBytes": 147456, - "TileShapeMNK": [128,256,64], + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [128,256,32], "NumTasks": 172 } } @@ -135,14 +136,14 @@ "Name": "mul_1", "IsVirtual": false, "ReadTensors": [ - 
{"Id":9,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":11,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":9,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":11,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":12,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":12,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":13,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":13,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -156,31 +157,31 @@ }, { "Id": 5, - "NumWarps": 8, - "SramBytes": 147456, + "NumWarps": 4, + "SramBytes": 24672, "Ops": [ { "Type": "Matmul", "Name": "matmul_2", "IsVirtual": false, "ReadTensors": [ - {"Id":13,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - 
{"Id":2,"DataType":"FP16","Shape":[4096,11008],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,11008],"Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":13,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":2,"DataType":"FP16","Shape":[4096,11008],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,11008],"Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":14,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":14,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":15,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":15,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "TransposeInput": {"BOOL":false}, "TransposeOther": {"BOOL":true} }, "Config": { - "NumWarps": 8, - "SramBytes": 147456, - "TileShapeMNK": [128,256,64], + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [128,256,32], "NumTasks": 64 } } @@ -189,12 +190,12 @@ ], "ProcessorGroups": [ { - "ProcessorRange": [0,108], + "ProcessorRange": [0,172], "ResourceGroups": [ { - "ProcessorRange": [0,108], - "WarpRange": [0,8], - "SramRange": [0,147456], + "ProcessorRange": [0,172], + "WarpRange": [0,4], + "SramRange": [0,24672], "TaskGroups": [ {"TaskId":0,"TaskRange":[0,172],"Granularity":1} ] @@ -202,10 +203,10 @@ ] }, { - "ProcessorRange": 
[0,108], + "ProcessorRange": [0,304], "ResourceGroups": [ { - "ProcessorRange": [0,108], + "ProcessorRange": [0,304], "WarpRange": [0,1], "SramRange": [0,0], "TaskGroups": [ @@ -215,10 +216,10 @@ ] }, { - "ProcessorRange": [0,108], + "ProcessorRange": [0,304], "ResourceGroups": [ { - "ProcessorRange": [0,108], + "ProcessorRange": [0,304], "WarpRange": [0,1], "SramRange": [0,0], "TaskGroups": [ @@ -228,12 +229,12 @@ ] }, { - "ProcessorRange": [0,108], + "ProcessorRange": [0,172], "ResourceGroups": [ { - "ProcessorRange": [0,108], - "WarpRange": [0,8], - "SramRange": [0,147456], + "ProcessorRange": [0,172], + "WarpRange": [0,4], + "SramRange": [0,24672], "TaskGroups": [ {"TaskId":3,"TaskRange":[0,172],"Granularity":1} ] @@ -241,10 +242,10 @@ ] }, { - "ProcessorRange": [0,108], + "ProcessorRange": [0,304], "ResourceGroups": [ { - "ProcessorRange": [0,108], + "ProcessorRange": [0,304], "WarpRange": [0,1], "SramRange": [0,0], "TaskGroups": [ @@ -258,8 +259,8 @@ "ResourceGroups": [ { "ProcessorRange": [0,64], - "WarpRange": [0,8], - "SramRange": [0,147456], + "WarpRange": [0,4], + "SramRange": [0,24672], "TaskGroups": [ {"TaskId":5,"TaskRange":[0,64],"Granularity":1} ] @@ -267,4 +268,4 @@ ] } ] -} +} \ No newline at end of file diff --git a/examples/tutorial/model.json b/examples/tutorial/model.json index 1bc9233a5..a6ba8e8be 100644 --- a/examples/tutorial/model.json +++ b/examples/tutorial/model.json @@ -12,14 +12,14 @@ "Name": "matmul", "IsVirtual": false, "ReadTensors": [ - {"Id":0,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":1,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[]}} + 
{"Id":0,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":1,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":4,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":4,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":5,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":5,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -31,13 +31,13 @@ "Name": "sigmoid", "IsVirtual": false, "ReadTensors": [ - {"Id":5,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":5,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":6,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]}} + 
{"Id":6,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":7,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":7,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {} }, @@ -46,14 +46,14 @@ "Name": "mul", "IsVirtual": false, "ReadTensors": [ - {"Id":5,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":7,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":5,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":7,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":8,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":8,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - 
{"Id":9,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":9,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {} } @@ -69,14 +69,14 @@ "Name": "matmul_1", "IsVirtual": false, "ReadTensors": [ - {"Id":0,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":3,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":0,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":3,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":10,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":10,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":11,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":11,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": 
{ "TransposeInput": {"BOOL":false}, @@ -95,14 +95,14 @@ "Name": "mul_1", "IsVirtual": false, "ReadTensors": [ - {"Id":9,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":11,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":9,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":11,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":12,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":12,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":13,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":13,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {} }, @@ -111,14 +111,14 @@ "Name": "matmul_2", "IsVirtual": false, "ReadTensors": [ - {"Id":13,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - 
{"Id":2,"DataType":"FP16","Shape":[4096,11008],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,11008],"Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":13,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":2,"DataType":"FP16","Shape":[4096,11008],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,11008],"Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":14,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":14,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":15,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":15,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "TransposeInput": {"BOOL":false}, diff --git a/examples/tutorial/plan.json b/examples/tutorial/plan.json index c0854e505..335c27549 100644 --- a/examples/tutorial/plan.json +++ b/examples/tutorial/plan.json @@ -1,6 +1,7 @@ { "Rank": 0, "WorldSize": 1, + "Architecture": "CUDA_80", "NumProcessors": 108, "NumWarpsPerProcessor": 8, "TaskInfos": [ @@ -14,14 +15,14 @@ "Name": "matmul", "IsVirtual": false, "ReadTensors": [ - {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, - 
{"Id":1,"DataType":"FP16","Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]} + {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, + {"Id":1,"DataType":"FP16","Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]} ], "WriteTensors": [ - {"Id":4,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":4,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "ResultTensors": [ - {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -46,13 +47,13 @@ "Name": "sigmoid", "IsVirtual": false, "ReadTensors": [ - {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "WriteTensors": [ - 
{"Id":6,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":6,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "ResultTensors": [ - {"Id":7,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":7,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "Args": {}, "Config": { @@ -74,14 +75,14 @@ "Name": "mul", "IsVirtual": false, "ReadTensors": [ - {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}, - {"Id":7,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}, + {"Id":7,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "WriteTensors": [ - {"Id":8,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + 
{"Id":8,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "ResultTensors": [ - {"Id":9,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":9,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "Args": {}, "Config": { @@ -103,14 +104,14 @@ "Name": "matmul_1", "IsVirtual": false, "ReadTensors": [ - {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, - {"Id":3,"DataType":"FP16","Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]} + {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, + {"Id":3,"DataType":"FP16","Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]} ], "WriteTensors": [ - {"Id":10,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":10,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "ResultTensors": [ - 
{"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -135,14 +136,14 @@ "Name": "mul_1", "IsVirtual": false, "ReadTensors": [ - {"Id":9,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}, - {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":9,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}, + {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "WriteTensors": [ - {"Id":12,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":12,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "ResultTensors": [ - {"Id":13,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + 
{"Id":13,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "Args": {}, "Config": { @@ -164,14 +165,14 @@ "Name": "matmul_1", "IsVirtual": false, "ReadTensors": [ - {"Id":16,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,8320],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,8320]}, - {"Id":17,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[4096,8320],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,8320]} + {"Id":16,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,8320],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,8320]}, + {"Id":17,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[4096,8320],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,8320]} ], "WriteTensors": [ - {"Id":14,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} + {"Id":14,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} ], "ResultTensors": [ - {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} + {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -196,14 +197,14 @@ "Name": "matmul_1", "IsVirtual": false, "ReadTensors": [ - 
{"Id":18,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,2688],"Strides":[1,512,11008],"Offsets":[0,0,8320],"PaddedShape":[1,512,2688]}, - {"Id":19,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[4096,2688],"Strides":[4096,11008],"Offsets":[0,8320],"PaddedShape":[4096,2688]} + {"Id":18,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,2688],"Strides":[1,512,11008],"Offsets":[0,0,8320],"PaddedShape":[1,512,2688]}, + {"Id":19,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[4096,2688],"Strides":[4096,11008],"Offsets":[0,8320],"PaddedShape":[4096,2688]} ], "WriteTensors": [ - {"Id":20,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} + {"Id":20,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} ], "ResultTensors": [ - {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} + {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -228,14 +229,14 @@ "Name": "add_1", "IsVirtual": false, "ReadTensors": [ - {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, - 
{"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} + {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, + {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} ], "WriteTensors": [ - {"Id":23,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} + {"Id":23,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} ], "ResultTensors": [ - {"Id":15,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} + {"Id":15,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} ], "Args": {}, "Config": { diff --git a/examples/tutorial/plan_1_larger_tile.json b/examples/tutorial/plan_1_larger_tile.json index 3a3f66530..04d2e9d60 100644 --- a/examples/tutorial/plan_1_larger_tile.json +++ b/examples/tutorial/plan_1_larger_tile.json @@ -1,6 +1,7 @@ { "Rank": 0, "WorldSize": 1, + "Architecture": "CUDA_80", "NumProcessors": 108, "NumWarpsPerProcessor": 8, "TaskInfos": [ @@ -14,14 +15,14 @@ "Name": "matmul", "IsVirtual": false, "ReadTensors": [ - {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, - 
{"Id":1,"DataType":"FP16","Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]} + {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, + {"Id":1,"DataType":"FP16","Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]} ], "WriteTensors": [ - {"Id":4,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":4,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "ResultTensors": [ - {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -46,13 +47,13 @@ "Name": "sigmoid", "IsVirtual": false, "ReadTensors": [ - {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "WriteTensors": [ - 
{"Id":6,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":6,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "ResultTensors": [ - {"Id":7,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":7,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "Args": {}, "Config": { @@ -74,14 +75,14 @@ "Name": "mul", "IsVirtual": false, "ReadTensors": [ - {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}, - {"Id":7,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}, + {"Id":7,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "WriteTensors": [ - {"Id":8,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + 
{"Id":8,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "ResultTensors": [ - {"Id":9,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":9,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "Args": {}, "Config": { @@ -103,14 +104,14 @@ "Name": "matmul_1", "IsVirtual": false, "ReadTensors": [ - {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, - {"Id":3,"DataType":"FP16","Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]} + {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, + {"Id":3,"DataType":"FP16","Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]} ], "WriteTensors": [ - {"Id":10,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":10,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "ResultTensors": [ - 
{"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -135,14 +136,14 @@ "Name": "mul_1", "IsVirtual": false, "ReadTensors": [ - {"Id":9,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}, - {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":9,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}, + {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "WriteTensors": [ - {"Id":12,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":12,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "ResultTensors": [ - {"Id":13,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + 
{"Id":13,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "Args": {}, "Config": { @@ -164,14 +165,14 @@ "Name": "matmul_1", "IsVirtual": false, "ReadTensors": [ - {"Id":13,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}, - {"Id":2,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[4096,11008],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,11008]} + {"Id":13,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}, + {"Id":2,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[4096,11008],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,11008]} ], "WriteTensors": [ - {"Id":14,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} + {"Id":14,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} ], "ResultTensors": [ - {"Id":15,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} + {"Id":15,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} ], "Args": { "TransposeInput": {"BOOL":false}, diff --git a/examples/tutorial/plan_2_split_k.json b/examples/tutorial/plan_2_split_k.json index 493515d8c..837944171 100644 --- 
a/examples/tutorial/plan_2_split_k.json +++ b/examples/tutorial/plan_2_split_k.json @@ -1,6 +1,7 @@ { "Rank": 0, "WorldSize": 1, + "Architecture": "CUDA_80", "NumProcessors": 108, "NumWarpsPerProcessor": 8, "TaskInfos": [ @@ -14,14 +15,14 @@ "Name": "matmul", "IsVirtual": false, "ReadTensors": [ - {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, - {"Id":1,"DataType":"FP16","Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]} + {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, + {"Id":1,"DataType":"FP16","Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]} ], "WriteTensors": [ - {"Id":4,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":4,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "ResultTensors": [ - {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -46,13 +47,13 @@ "Name": "sigmoid", "IsVirtual": false, "ReadTensors": [ - 
{"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "WriteTensors": [ - {"Id":6,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":6,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "ResultTensors": [ - {"Id":7,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":7,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "Args": {}, "Config": { @@ -74,14 +75,14 @@ "Name": "mul", "IsVirtual": false, "ReadTensors": [ - {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}, - {"Id":7,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}, + 
{"Id":7,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "WriteTensors": [ - {"Id":8,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":8,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "ResultTensors": [ - {"Id":9,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":9,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "Args": {}, "Config": { @@ -103,14 +104,14 @@ "Name": "matmul_1", "IsVirtual": false, "ReadTensors": [ - {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, - {"Id":3,"DataType":"FP16","Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]} + {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, + {"Id":3,"DataType":"FP16","Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]} ], "WriteTensors": [ - 
{"Id":10,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":10,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "ResultTensors": [ - {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -135,14 +136,14 @@ "Name": "mul_1", "IsVirtual": false, "ReadTensors": [ - {"Id":9,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}, - {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":9,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}, + {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "WriteTensors": [ - {"Id":12,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + 
{"Id":12,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "ResultTensors": [ - {"Id":13,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":13,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "Args": {}, "Config": { @@ -164,14 +165,14 @@ "Name": "matmul_1", "IsVirtual": false, "ReadTensors": [ - {"Id":16,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,8320],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,8320]}, - {"Id":17,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[4096,8320],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,8320]} + {"Id":16,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,8320],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,8320]}, + {"Id":17,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[4096,8320],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,8320]} ], "WriteTensors": [ - {"Id":14,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} + {"Id":14,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} ], "ResultTensors": [ - 
{"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} + {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -196,14 +197,14 @@ "Name": "matmul_1", "IsVirtual": false, "ReadTensors": [ - {"Id":18,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,2688],"Strides":[1,512,11008],"Offsets":[0,0,8320],"PaddedShape":[1,512,2688]}, - {"Id":19,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[4096,2688],"Strides":[4096,11008],"Offsets":[0,8320],"PaddedShape":[4096,2688]} + {"Id":18,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,2688],"Strides":[1,512,11008],"Offsets":[0,0,8320],"PaddedShape":[1,512,2688]}, + {"Id":19,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[4096,2688],"Strides":[4096,11008],"Offsets":[0,8320],"PaddedShape":[4096,2688]} ], "WriteTensors": [ - {"Id":20,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} + {"Id":20,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} ], "ResultTensors": [ - {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} + 
{"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -228,14 +229,14 @@ "Name": "add_1", "IsVirtual": false, "ReadTensors": [ - {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, - {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} + {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, + {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} ], "WriteTensors": [ - {"Id":23,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} + {"Id":23,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} ], "ResultTensors": [ - {"Id":15,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} + {"Id":15,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} ], "Args": {}, "Config": { diff --git a/examples/tutorial/plan_3_overwrite.json b/examples/tutorial/plan_3_overwrite.json index c0854e505..335c27549 100644 --- 
a/examples/tutorial/plan_3_overwrite.json +++ b/examples/tutorial/plan_3_overwrite.json @@ -1,6 +1,7 @@ { "Rank": 0, "WorldSize": 1, + "Architecture": "CUDA_80", "NumProcessors": 108, "NumWarpsPerProcessor": 8, "TaskInfos": [ @@ -14,14 +15,14 @@ "Name": "matmul", "IsVirtual": false, "ReadTensors": [ - {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, - {"Id":1,"DataType":"FP16","Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]} + {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, + {"Id":1,"DataType":"FP16","Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]} ], "WriteTensors": [ - {"Id":4,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":4,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "ResultTensors": [ - {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -46,13 +47,13 @@ "Name": "sigmoid", "IsVirtual": false, "ReadTensors": [ - 
{"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "WriteTensors": [ - {"Id":6,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":6,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "ResultTensors": [ - {"Id":7,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":7,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "Args": {}, "Config": { @@ -74,14 +75,14 @@ "Name": "mul", "IsVirtual": false, "ReadTensors": [ - {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}, - {"Id":7,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}, + 
{"Id":7,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "WriteTensors": [ - {"Id":8,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":8,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "ResultTensors": [ - {"Id":9,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":9,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "Args": {}, "Config": { @@ -103,14 +104,14 @@ "Name": "matmul_1", "IsVirtual": false, "ReadTensors": [ - {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, - {"Id":3,"DataType":"FP16","Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]} + {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, + {"Id":3,"DataType":"FP16","Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]} ], "WriteTensors": [ - 
{"Id":10,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":10,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "ResultTensors": [ - {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -135,14 +136,14 @@ "Name": "mul_1", "IsVirtual": false, "ReadTensors": [ - {"Id":9,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}, - {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":9,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}, + {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "WriteTensors": [ - {"Id":12,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + 
{"Id":12,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "ResultTensors": [ - {"Id":13,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":13,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "Args": {}, "Config": { @@ -164,14 +165,14 @@ "Name": "matmul_1", "IsVirtual": false, "ReadTensors": [ - {"Id":16,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,8320],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,8320]}, - {"Id":17,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[4096,8320],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,8320]} + {"Id":16,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,8320],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,8320]}, + {"Id":17,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[4096,8320],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,8320]} ], "WriteTensors": [ - {"Id":14,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} + {"Id":14,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} ], "ResultTensors": [ - 
{"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} + {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -196,14 +197,14 @@ "Name": "matmul_1", "IsVirtual": false, "ReadTensors": [ - {"Id":18,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,2688],"Strides":[1,512,11008],"Offsets":[0,0,8320],"PaddedShape":[1,512,2688]}, - {"Id":19,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[4096,2688],"Strides":[4096,11008],"Offsets":[0,8320],"PaddedShape":[4096,2688]} + {"Id":18,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,2688],"Strides":[1,512,11008],"Offsets":[0,0,8320],"PaddedShape":[1,512,2688]}, + {"Id":19,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[4096,2688],"Strides":[4096,11008],"Offsets":[0,8320],"PaddedShape":[4096,2688]} ], "WriteTensors": [ - {"Id":20,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} + {"Id":20,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} ], "ResultTensors": [ - {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} + 
{"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -228,14 +229,14 @@ "Name": "add_1", "IsVirtual": false, "ReadTensors": [ - {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, - {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} + {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, + {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} ], "WriteTensors": [ - {"Id":23,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} + {"Id":23,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} ], "ResultTensors": [ - {"Id":15,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} + {"Id":15,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} ], "Args": {}, "Config": { diff --git a/examples/tutorial/plan_tutorial.py b/examples/tutorial/plan_tutorial.py index 056523e15..989f29c5e 100644 --- 
a/examples/tutorial/plan_tutorial.py +++ b/examples/tutorial/plan_tutorial.py @@ -339,7 +339,7 @@ def main(plan_path: str): plan = planner.plan() with open("default_plan.json", "w") as f: - f.write(plan) + f.write(str(plan)) rt.launch(plan=plan) # Initialize @@ -364,7 +364,7 @@ def main(plan_path: str): print(f"File {plan_path} does not exist. Exiting...") return with ark.Runtime.get_runtime() as rt: - rt.launch(plan_path=plan_path) + rt.launch(plan=ark.Plan.from_file(plan_path)) # Initialize InputModule.initialize() diff --git a/plan_gpu0.json b/plan_gpu0.json index 49b6bdd98..63c1943e3 100644 --- a/plan_gpu0.json +++ b/plan_gpu0.json @@ -1,6 +1,7 @@ { "Rank": 0, "WorldSize": 1, + "Architecture": "ROCM_942", "NumProcessors": 304, "NumWarpsPerProcessor": 4, "TaskInfos": [ @@ -14,14 +15,14 @@ "Name": "matmul", "IsVirtual": false, "ReadTensors": [ - {"Id":4,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":0,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":4,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":0,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":6,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":6,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], 
"ResultTensors": [ - {"Id":7,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":7,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -46,14 +47,14 @@ "Name": "rope", "IsVirtual": false, "ReadTensors": [ - {"Id":12,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":5,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":12,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":5,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":15,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":15,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":16,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}} + 
{"Id":16,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -75,13 +76,13 @@ "Name": "transpose", "IsVirtual": false, "ReadTensors": [ - {"Id":16,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":16,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":19,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":19,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":20,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":20,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "Permutation": {"DIMS":[0,2,1,3]} @@ -105,14 +106,14 @@ "Name": "matmul_1", "IsVirtual": false, "ReadTensors": [ - {"Id":4,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - 
{"Id":1,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":4,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":1,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":8,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":8,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":9,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":9,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -137,14 +138,14 @@ "Name": "rope_1", "IsVirtual": false, "ReadTensors": [ - {"Id":13,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":5,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]}} + 
{"Id":13,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":5,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":17,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":17,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":18,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":18,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -166,13 +167,13 @@ "Name": "transpose_2", "IsVirtual": false, "ReadTensors": [ - {"Id":18,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":18,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":23,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[]}} + 
{"Id":23,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":24,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":24,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "Permutation": {"DIMS":[0,2,3,1]} @@ -196,14 +197,14 @@ "Name": "matmul_2", "IsVirtual": false, "ReadTensors": [ - {"Id":4,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":2,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":4,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":2,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":10,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":10,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - 
{"Id":11,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":11,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -228,13 +229,13 @@ "Name": "transpose_1", "IsVirtual": false, "ReadTensors": [ - {"Id":14,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":14,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":21,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":21,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":22,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":22,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "Permutation": {"DIMS":[0,2,1,3]} @@ -258,14 +259,14 @@ "Name": "matmul_3", "IsVirtual": false, "ReadTensors": [ - 
{"Id":20,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":24,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":20,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":24,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":25,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":25,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":26,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":26,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -290,13 +291,13 @@ "Name": "mul", "IsVirtual": false, "ReadTensors": [ - {"Id":26,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[]}} + 
{"Id":26,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":27,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":27,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":28,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":28,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "Factor": {"FLOAT":0.0883883461356163} @@ -320,13 +321,13 @@ "Name": "reduce_max", "IsVirtual": false, "ReadTensors": [ - {"Id":28,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":28,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":29,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":29,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], 
"ResultTensors": [ - {"Id":30,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":30,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "Axis": {"INT":3}, @@ -351,14 +352,14 @@ "Name": "sub", "IsVirtual": false, "ReadTensors": [ - {"Id":28,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":30,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":28,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":30,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":28,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":28,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":31,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}} + 
{"Id":31,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -380,13 +381,13 @@ "Name": "exp", "IsVirtual": false, "ReadTensors": [ - {"Id":31,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":31,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":31,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":31,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":32,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":32,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -408,13 +409,13 @@ "Name": "reduce_sum", "IsVirtual": false, "ReadTensors": [ - {"Id":32,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}} + 
{"Id":32,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":33,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":33,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":34,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":34,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "Axis": {"INT":3}, @@ -439,14 +440,14 @@ "Name": "div", "IsVirtual": false, "ReadTensors": [ - {"Id":32,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":34,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":32,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":34,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - 
{"Id":32,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":32,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":35,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":35,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -468,14 +469,14 @@ "Name": "matmul_4", "IsVirtual": false, "ReadTensors": [ - {"Id":35,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":22,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":35,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":22,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":36,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[]}} + 
{"Id":36,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":37,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":37,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -500,13 +501,13 @@ "Name": "transpose_3", "IsVirtual": false, "ReadTensors": [ - {"Id":37,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":37,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":38,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":38,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":39,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":39,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "Permutation": 
{"DIMS":[0,2,1,3]} @@ -530,14 +531,14 @@ "Name": "matmul_5", "IsVirtual": false, "ReadTensors": [ - {"Id":40,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":3,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":40,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":3,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":41,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":20,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":41,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":20,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":42,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":20,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":42,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":20,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -562,13 +563,13 @@ "Name": "cast", "IsVirtual": false, "ReadTensors": [ - {"Id":52,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":30,"Rank":-1,"SendTags":[],"RecvTags":[]}} + 
{"Id":52,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":30,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":54,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":54,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":55,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":55,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -590,14 +591,14 @@ "Name": "mul_1", "IsVirtual": false, "ReadTensors": [ - {"Id":55,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":55,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":55,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":55,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - 
{"Id":56,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":56,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":57,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":57,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -619,13 +620,13 @@ "Name": "reduce_mean", "IsVirtual": false, "ReadTensors": [ - {"Id":57,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":57,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":58,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":58,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":59,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[]}} + 
{"Id":59,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "Axis": {"INT":2}, @@ -650,13 +651,13 @@ "Name": "rsqrt", "IsVirtual": false, "ReadTensors": [ - {"Id":59,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":59,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":60,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":60,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":61,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":61,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -678,14 +679,14 @@ "Name": "mul_2", "IsVirtual": false, "ReadTensors": [ - {"Id":55,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":61,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[]}} + 
{"Id":55,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":61,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":62,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":62,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":63,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":63,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -707,14 +708,14 @@ "Name": "mul_3", "IsVirtual": false, "ReadTensors": [ - {"Id":63,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":50,"DataType":"FP32","Shape":[1,1,4096],"Strides":[1,1,4096],"Offsets":[0,0,0],"PaddedShape":[1,1,4096],"Buffer":{"Id":28,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":63,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + 
{"Id":50,"DataType":"FP32","Shape":[1,1,4096],"Strides":[1,1,4096],"Offsets":[0,0,0],"PaddedShape":[1,1,4096],"Buffer":{"Id":28,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":63,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":63,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":64,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":64,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -736,13 +737,13 @@ "Name": "cast_1", "IsVirtual": false, "ReadTensors": [ - {"Id":64,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":64,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":65,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":65,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - 
{"Id":66,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":66,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -764,14 +765,14 @@ "Name": "matmul_6", "IsVirtual": false, "ReadTensors": [ - {"Id":66,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":43,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":21,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":66,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":43,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":21,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":67,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":67,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":68,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[]}} + 
{"Id":68,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -796,14 +797,14 @@ "Name": "rope_2", "IsVirtual": false, "ReadTensors": [ - {"Id":73,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":53,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":31,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":73,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":53,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":31,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":76,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":76,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":77,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":77,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -825,13 +826,13 @@ "Name": 
"transpose_4", "IsVirtual": false, "ReadTensors": [ - {"Id":77,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":77,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":80,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":80,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":81,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":81,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "Permutation": {"DIMS":[0,2,1,3]} @@ -855,14 +856,14 @@ "Name": "matmul_7", "IsVirtual": false, "ReadTensors": [ - {"Id":66,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":44,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":22,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":66,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + 
{"Id":44,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":22,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":69,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":69,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":70,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":70,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -887,14 +888,14 @@ "Name": "rope_3", "IsVirtual": false, "ReadTensors": [ - {"Id":74,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":53,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":31,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":74,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":53,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":31,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - 
{"Id":78,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":78,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":79,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":79,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -916,13 +917,13 @@ "Name": "transpose_6", "IsVirtual": false, "ReadTensors": [ - {"Id":79,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":79,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":84,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":45,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":84,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":45,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":85,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":45,"Rank":-1,"SendTags":[],"RecvTags":[]}} + 
{"Id":85,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":45,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "Permutation": {"DIMS":[0,2,3,1]} @@ -946,14 +947,14 @@ "Name": "matmul_8", "IsVirtual": false, "ReadTensors": [ - {"Id":66,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":45,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":23,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":66,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":45,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":23,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":71,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":71,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":72,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":72,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -978,13 +979,13 @@ "Name": "transpose_5", "IsVirtual": false, "ReadTensors": [ - 
{"Id":75,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":75,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":82,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":82,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":83,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":83,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "Permutation": {"DIMS":[0,2,1,3]} @@ -1008,14 +1009,14 @@ "Name": "matmul_9", "IsVirtual": false, "ReadTensors": [ - {"Id":81,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":85,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":45,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":81,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + 
{"Id":85,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":45,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":86,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":46,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":86,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":46,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":87,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":46,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":87,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":46,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -1040,13 +1041,13 @@ "Name": "mul_4", "IsVirtual": false, "ReadTensors": [ - {"Id":87,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":46,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":87,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":46,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":88,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":88,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], 
"ResultTensors": [ - {"Id":89,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":89,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "Factor": {"FLOAT":0.0883883461356163} @@ -1070,13 +1071,13 @@ "Name": "reduce_max_1", "IsVirtual": false, "ReadTensors": [ - {"Id":89,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":89,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":90,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":48,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":90,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":48,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":91,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":48,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":91,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":48,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "Axis": {"INT":3}, @@ -1101,14 +1102,14 @@ "Name": "sub_1", "IsVirtual": false, "ReadTensors": [ - 
{"Id":89,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":91,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":48,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":89,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":91,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":48,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":89,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":89,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":92,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":92,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -1130,13 +1131,13 @@ "Name": "exp_1", "IsVirtual": false, "ReadTensors": [ - {"Id":92,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}} + 
{"Id":92,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":92,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":92,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":93,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":93,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -1158,13 +1159,13 @@ "Name": "reduce_sum_1", "IsVirtual": false, "ReadTensors": [ - {"Id":93,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":93,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":94,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":49,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":94,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":49,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - 
{"Id":95,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":49,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":95,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":49,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "Axis": {"INT":3}, @@ -1189,14 +1190,14 @@ "Name": "div_1", "IsVirtual": false, "ReadTensors": [ - {"Id":93,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":95,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":49,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":93,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":95,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":49,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":93,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":93,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":96,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}} + 
{"Id":96,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -1218,14 +1219,14 @@ "Name": "matmul_10", "IsVirtual": false, "ReadTensors": [ - {"Id":96,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":83,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":96,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":83,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":97,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":50,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":97,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":50,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":98,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":50,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":98,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":50,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -1250,13 
+1251,13 @@ "Name": "transpose_7", "IsVirtual": false, "ReadTensors": [ - {"Id":98,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":50,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":98,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":50,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":99,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":51,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":99,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":51,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":100,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":51,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":100,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":51,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "Permutation": {"DIMS":[0,2,1,3]} @@ -1280,14 +1281,14 @@ "Name": "matmul_11", "IsVirtual": false, "ReadTensors": [ - {"Id":101,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":51,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":46,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":101,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":51,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + 
{"Id":46,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":102,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":52,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":102,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":52,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":103,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":52,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":103,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":52,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -1312,14 +1313,14 @@ "Name": "add", "IsVirtual": false, "ReadTensors": [ - {"Id":52,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":30,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":103,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":52,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":52,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":30,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":103,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":52,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - 
{"Id":104,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":53,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":104,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":53,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":105,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":53,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":105,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":53,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -1341,13 +1342,13 @@ "Name": "cast_2", "IsVirtual": false, "ReadTensors": [ - {"Id":105,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":53,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":105,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":53,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":106,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":106,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":107,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[]}} + 
{"Id":107,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -1369,14 +1370,14 @@ "Name": "mul_5", "IsVirtual": false, "ReadTensors": [ - {"Id":107,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":107,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":107,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":107,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":108,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":55,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":108,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":55,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":109,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":55,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":109,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":55,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -1398,13 +1399,13 @@ "Name": "reduce_mean_1", "IsVirtual": false, "ReadTensors": [ - 
{"Id":109,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":55,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":109,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":55,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":110,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":56,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":110,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":56,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":111,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":56,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":111,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":56,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "Axis": {"INT":2}, @@ -1429,13 +1430,13 @@ "Name": "rsqrt_1", "IsVirtual": false, "ReadTensors": [ - {"Id":111,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":56,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":111,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":56,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":112,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":57,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":112,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":57,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], 
"ResultTensors": [ - {"Id":113,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":57,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":113,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":57,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -1457,14 +1458,14 @@ "Name": "mul_6", "IsVirtual": false, "ReadTensors": [ - {"Id":107,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":113,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":57,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":107,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":113,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":57,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":114,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":114,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":115,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[]}} + 
{"Id":115,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -1486,14 +1487,14 @@ "Name": "mul_7", "IsVirtual": false, "ReadTensors": [ - {"Id":115,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":51,"DataType":"FP32","Shape":[1,1,4096],"Strides":[1,1,4096],"Offsets":[0,0,0],"PaddedShape":[1,1,4096],"Buffer":{"Id":29,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":115,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":51,"DataType":"FP32","Shape":[1,1,4096],"Strides":[1,1,4096],"Offsets":[0,0,0],"PaddedShape":[1,1,4096],"Buffer":{"Id":29,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":115,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":115,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":116,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":116,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -1515,13 +1516,13 @@ "Name": "cast_3", "IsVirtual": false, "ReadTensors": [ - 
{"Id":116,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":116,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":117,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":59,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":117,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":59,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":118,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":59,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":118,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":59,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -1543,14 +1544,14 @@ "Name": "matmul_12", "IsVirtual": false, "ReadTensors": [ - {"Id":118,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":59,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":47,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":25,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":118,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":59,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + 
{"Id":47,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":25,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":119,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":60,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":119,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":60,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":120,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":60,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":120,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":60,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -1575,13 +1576,13 @@ "Name": "sigmoid", "IsVirtual": false, "ReadTensors": [ - {"Id":120,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":60,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":120,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":60,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":121,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":61,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":121,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":61,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - 
{"Id":122,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":61,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":122,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":61,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -1603,14 +1604,14 @@ "Name": "mul_8", "IsVirtual": false, "ReadTensors": [ - {"Id":120,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":60,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":122,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":61,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":120,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":60,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":122,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":61,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":123,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":62,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":123,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":62,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":124,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":62,"Rank":-1,"SendTags":[],"RecvTags":[]}} + 
{"Id":124,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":62,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -1632,14 +1633,14 @@ "Name": "matmul_13", "IsVirtual": false, "ReadTensors": [ - {"Id":118,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":59,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":49,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":27,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":118,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":59,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":49,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":27,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":125,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":63,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":125,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":63,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":126,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":63,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":126,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":63,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -1664,14 +1665,14 @@ "Name": "mul_9", "IsVirtual": false, "ReadTensors": [ - 
{"Id":124,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":62,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":126,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":63,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":124,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":62,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":126,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":63,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":127,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":64,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":127,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":64,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":128,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":64,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":128,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":64,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -1693,14 +1694,14 @@ "Name": "matmul_14", "IsVirtual": false, "ReadTensors": [ - {"Id":128,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":64,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - 
{"Id":48,"DataType":"FP16","Shape":[4096,11008],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,11008],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":128,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":64,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":48,"DataType":"FP16","Shape":[4096,11008],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,11008],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":129,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":65,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":129,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":65,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":130,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":65,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":130,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":65,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -1725,14 +1726,14 @@ "Name": "add_1", "IsVirtual": false, "ReadTensors": [ - {"Id":105,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":53,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":130,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":65,"Rank":-1,"SendTags":[],"RecvTags":[]}} + 
{"Id":105,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":53,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":130,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":65,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":131,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":66,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":131,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":66,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":132,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":66,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":132,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":66,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { diff --git a/python/ark/__init__.py b/python/ark/__init__.py index f2f604be9..e96972906 100644 --- a/python/ark/__init__.py +++ b/python/ark/__init__.py @@ -37,7 +37,7 @@ def set_world_size(world_size): from .init import init from .tensor import Dims, Tensor, Parameter from .module import Module, RuntimeModule -from .runtime import Runtime, DefaultPlanner +from .runtime import Runtime from .serialize import save, load from .data_type import ( DataType, @@ -100,4 +100,5 @@ def set_world_size(world_size): GpuError, RuntimeError, ) +from .planner import DefaultPlanner, Plan from .profiler import Profiler diff --git a/python/ark/planner.py b/python/ark/planner.py new file mode 100644 index 000000000..8814896d2 --- /dev/null +++ b/python/ark/planner.py @@ -0,0 +1,184 
@@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import copy +import json +from typing import Callable, Dict, List, Any + +from ._ark_core import _DefaultPlanner +from .model import Model + + +def idnt(indent): + return " " * indent + + +def dquote(s): + return '"' + s + '"' + + +def denser_json_obj(obj, key, level, indent, indent_step, ret=""): + if len(obj) == 0: + if key: + return ret + idnt(indent) + dquote(key) + ": {}" + else: + return ret + idnt(indent) + "{}" + ret += idnt(indent) + if key: + ret += dquote(key) + ": {\n" + else: + ret += "{\n" + num_item = len(obj) + for k, v in obj.items(): + is_obj_or_arr = isinstance(v, dict) or isinstance(v, list) + is_num_arr = isinstance(v, list) and v and isinstance(v[0], int) + if level <= 0 or not is_obj_or_arr or is_num_arr: + ret += ( + idnt(indent + indent_step) + + dquote(k) + + ": " + + json.dumps(v, separators=(",", ":")) + ) + elif isinstance(v, dict): + ret += denser_json_obj( + v, k, level - 1, indent + indent_step, indent_step + ) + elif isinstance(v, list): + ret += denser_json_arr( + v, k, level - 1, indent + indent_step, indent_step + ) + num_item -= 1 + if num_item > 0: + ret += ",\n" + else: + ret += "\n" + ret += idnt(indent) + "}" + return ret + + +def denser_json_arr(obj, key, level, indent, indent_step, ret=""): + if len(obj) == 0: + if key: + return ret + idnt(indent) + dquote(key) + ": []" + else: + return ret + idnt(indent) + "[]" + ret += idnt(indent) + if key: + ret += dquote(key) + ": [\n" + else: + ret += "[\n" + num_item = len(obj) + for v in obj: + is_obj_or_arr = isinstance(v, dict) or isinstance(v, list) + is_num_arr = ( + isinstance(v, list) + and v + and (isinstance(v[0], int) or isinstance(v[0], float)) + ) + if level <= 0 or not is_obj_or_arr or is_num_arr: + ret += idnt(indent + indent_step) + json.dumps( + v, separators=(",", ":") + ) + elif isinstance(v, dict): + ret += denser_json_obj( + v, "", level - 1, indent + indent_step, indent_step + ) + 
elif isinstance(v, list): + ret += denser_json_arr( + v, "", level - 1, indent + indent_step, indent_step + ) + num_item -= 1 + if num_item > 0: + ret += ",\n" + else: + ret += "\n" + ret += idnt(indent) + "]" + return ret + + +def denser_json(obj, level, indent_step=2): + if isinstance(obj, dict): + return denser_json_obj(obj, "", level, 0, indent_step, "") + elif isinstance(obj, list): + return denser_json_arr(obj, "", level, 0, indent_step, "") + return json.dumps(obj, indent=indent_step) + + +class Plan: + def __init__(self, plan: Dict[str, Any]): + if plan is None: + plan = {} + plan["Rank"] = 0 + plan["WorldSize"] = 1 + plan["Architecture"] = "ANY" + plan["NumProcessors"] = 1 + plan["NumWarpsPerProcessor"] = 1 + plan["TaskInfos"] = [] + plan["ProcessorGroups"] = [] + else: + plan = copy.deepcopy(plan) + self.plan = plan + + def __str__(self) -> str: + return denser_json(self.plan, 5) + + @property + def rank(self) -> int: + return self.plan["Rank"] + + @property + def world_size(self) -> int: + return self.plan["WorldSize"] + + @property + def architecture(self) -> str: + return self.plan["Architecture"] + + @property + def num_processors(self) -> int: + return self.plan["NumProcessors"] + + @property + def num_warps_per_processor(self) -> int: + return self.plan["NumWarpsPerProcessor"] + + @property + def task_infos(self) -> List[Dict[str, Any]]: + return self.plan["TaskInfos"] + + @property + def processor_groups(self) -> List[Dict[str, Any]]: + return self.plan["ProcessorGroups"] + + @staticmethod + def from_str(plan_str: str) -> "Plan": + plan = json.loads(plan_str) + return Plan(plan) + + @staticmethod + def from_file(file_path: str) -> "Plan": + with open(file_path, "r") as f: + plan = json.load(f) + return Plan(plan) + + +class DefaultPlanner(_DefaultPlanner): + def __init__(self, device_id: int = 0): + compressed = Model.get_model().compress() + super().__init__(compressed, device_id) + + def install_config_rule(self, rule: Callable[[str, str], str]): 
+ """ + Install a configuration rule. + + Args: + rule: A function that takes an operator description and a target + architecture name and returns a configuration description. + """ + super().install_config_rule(rule) + + def plan(self) -> Plan: + """ + Generate an execution plan. + """ + return Plan.from_str(super().plan(pretty=False)) diff --git a/python/ark/profiler.py b/python/ark/profiler.py index b959ceb18..feb78e0de 100644 --- a/python/ark/profiler.py +++ b/python/ark/profiler.py @@ -1,30 +1,36 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -import json import sys import time + from .runtime import Runtime +from .planner import Plan class Profiler: - def __init__(self, plan: str): - self.plan = json.loads(plan) + def __init__(self, plan: Plan): + self.plan = plan def run(self): - num_processor_groups = len(self.plan["ProcessorGroups"]) + num_processor_groups = len(self.plan.processor_groups) new_plan = { - "Rank": self.plan["Rank"], "WorldSize": self.plan["WorldSize"], - "NumProcessors": self.plan["NumProcessors"], - "NumWarpsPerProcessor": self.plan["NumWarpsPerProcessor"], - "TaskInfos": self.plan["TaskInfos"], - "ProcessorGroups": [{}]} + "Rank": self.plan.rank, + "WorldSize": self.plan.world_size, + "Architecture": self.plan.architecture, + "NumProcessors": self.plan.num_processors, + "NumWarpsPerProcessor": self.plan.num_warps_per_processor, + "TaskInfos": self.plan.task_infos, + "ProcessorGroups": [None], + } for i in range(num_processor_groups): - new_plan["ProcessorGroups"][0] = self.plan["ProcessorGroups"][i] + new_plan["ProcessorGroups"][0] = self.plan.processor_groups[i] with Runtime() as rt: - rt.launch(plan=json.dumps(new_plan)) + rt.launch(plan=str(new_plan)) start_time = time.time() iter = 1000 rt.run(iter=iter) end_time = time.time() - sys.stderr.write(f"Processor group {i} runtime: {(end_time - start_time)/iter:.6f} seconds/iter\n") + sys.stderr.write( + f"Processor group {i} runtime: {(end_time - 
start_time)/iter:.6f} seconds/iter\n" + ) diff --git a/python/ark/runtime.py b/python/ark/runtime.py index efae6ab3c..40bfaaa63 100644 --- a/python/ark/runtime.py +++ b/python/ark/runtime.py @@ -3,10 +3,11 @@ import logging from enum import Enum -from typing import Callable, Dict, List +from typing import Dict, List -from ._ark_core import _Executor, _DefaultPlanner +from ._ark_core import _Executor from .model import Model +from .planner import DefaultPlanner, Plan class _RuntimeState: @@ -46,33 +47,9 @@ def print_runtime_states(): print(f"{runtime_id:<12} | {runtime.state:<20}") -class DefaultPlanner(_DefaultPlanner): - def __init__(self, gpu_id: int = 0): - compressed = Model.get_model().compress() - super().__init__(compressed, gpu_id) - - def install_config_rule(self, rule: Callable[[str, str], str]): - """ - Install a configuration rule. - - Args: - rule: A function that takes an operator description and a target - architecture name and returns a configuration description. - """ - super().install_config_rule(rule) - - def plan(self, pretty: bool = True) -> str: - """ - Generate an execution plan. - - Args: - pretty: Whether to generate a pretty plan. - """ - return super().plan(pretty) - - class Executor(_Executor): - pass + def __init__(self, plan: Plan, device_id: int, name: str): + super().__init__(plan.rank, plan.world_size, device_id, name, str(plan)) class Runtime: @@ -155,11 +132,8 @@ def running(self) -> bool: def launch( self, - rank: int = 0, - world_size: int = 1, - gpu_id: int = 0, - plan: str = "", - plan_path: str = "", + plan: Plan = None, + device_id: int = 0, ): """ Create an executor and schedule the ARK model. 
The scheduler will generate @@ -172,11 +146,7 @@ def launch( ) return if not plan: - if not plan_path: - plan = DefaultPlanner(gpu_id).plan() - else: - with open(plan_path, "r") as f: - plan = f.read() + plan = DefaultPlanner(device_id).plan() # If the RuntimeState is init, we need to create a new executor and # compile the kernels if self.state == Runtime.State.Init: @@ -187,11 +157,9 @@ def launch( ) self.executor.destroy() self.executor = Executor( - rank, - world_size, - gpu_id, - "ArkRuntime", plan, + device_id, + "ArkRuntime", ) self.executor.compile() self.executor.launch() diff --git a/python/unittest/test_runtime.py b/python/unittest/test_runtime.py index fd34bb96b..b075c64ea 100644 --- a/python/unittest/test_runtime.py +++ b/python/unittest/test_runtime.py @@ -2,18 +2,9 @@ # Licensed under the MIT license. import ark -import json -empty_plan = json.dumps( - { - "Rank": 0, - "WorldSize": 1, - "NumProcessors": 1, - "NumWarpsPerProcessor": 1, - "TaskInfos": [], - "ProcessorGroups": [], - } -) + +empty_plan = ark.Plan(None) def test_runtime_relaunch(): @@ -35,7 +26,7 @@ def test_multiple_runtime_launch(): for i in range(num_runtimes): rt = ark.Runtime.get_runtime(i) assert rt.launched() == False - rt.launch(gpu_id=i, plan=empty_plan) + rt.launch(plan=empty_plan, device_id=i) assert rt.launched() == True for i in range(num_runtimes): rt = ark.Runtime.get_runtime(i) @@ -46,9 +37,9 @@ def test_multiple_runtime_launch(): def test_stop_runtime(): ark.init() rt1 = ark.Runtime.get_runtime(1) - rt1.launch(plan=empty_plan, gpu_id=1) + rt1.launch(plan=empty_plan, device_id=1) rt2 = ark.Runtime.get_runtime(2) - rt2.launch(plan=empty_plan, gpu_id=2) + rt2.launch(plan=empty_plan, device_id=2) rt1.stop() rt1.reset() assert rt1.state == ark.Runtime.State.Init @@ -59,9 +50,9 @@ def test_stop_runtime(): def test_reset_runtime(): ark.init() rt1 = ark.Runtime.get_runtime(0) - rt1.launch(plan=empty_plan, gpu_id=1) + rt1.launch(plan=empty_plan, device_id=1) rt2 = 
ark.Runtime.get_runtime(1) - rt2.launch(plan=empty_plan, gpu_id=2) + rt2.launch(plan=empty_plan, device_id=2) rt1.reset() assert rt1.launched() == False assert rt2.launched() == True @@ -77,7 +68,7 @@ def test_multiple_runtimes_complex(): default_runtime = ark.Runtime.get_runtime() runtime_list.append(default_runtime) for i, rt in enumerate(runtime_list): - rt.launch(plan=empty_plan, gpu_id=i) + rt.launch(plan=empty_plan, device_id=i) assert rt.launched() == True runtime_list[0].stop() assert runtime_list[0].state == ark.Runtime.State.LaunchedNotRunning @@ -87,7 +78,7 @@ def test_multiple_runtimes_complex(): assert runtime_list[1].state == ark.Runtime.State.Init assert runtime_list[0].state == ark.Runtime.State.LaunchedNotRunning assert runtime_list[2].state == ark.Runtime.State.LaunchedNotRunning - runtime_list[1].launch(plan=empty_plan, gpu_id=1) + runtime_list[1].launch(plan=empty_plan, device_id=1) for rt in runtime_list: assert rt.launched() == True ark.Runtime.delete_all_runtimes() From 0cb10b92c601306d537eb3de6259cf73e59b33df Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Wed, 3 Jul 2024 07:58:34 +0000 Subject: [PATCH 22/61] fix a reduction perf bug --- ark/include/kernels/reduce.h | 18 +++++++++--------- plan_gpu0.json | 36 ++++++++++++++++++------------------ python/ark/profiler.py | 24 +++++++++++++++--------- 3 files changed, 42 insertions(+), 36 deletions(-) diff --git a/ark/include/kernels/reduce.h b/ark/include/kernels/reduce.h index 30c8b7831..3d0b4e008 100644 --- a/ark/include/kernels/reduce.h +++ b/ark/include/kernels/reduce.h @@ -53,7 +53,7 @@ DEVICE bf16 warpReduce(bf16 val) { template DEVICE DataType warpsReduce(DataType val, int tid, int smem_per_warp) { val = warpReduce(val); - if (LanesNum > Arch::ThreadsPerWarp) { + if constexpr (LanesNum > Arch::ThreadsPerWarp) { ReduceSharedStorage *shared = UnitOp::template shared_memory>( smem_per_warp); @@ -351,8 +351,8 @@ struct WwiseReduce { /// @param in Input tensor. 
/// @param uop_idx Index of the unit operator. template - static DEVICE void runW(DataType *out, DataType *in, int uop_idx, - int smem_per_warp) { + static DEVICE void run(DataType *out, DataType *in, int uop_idx, + int smem_per_warp) { using ShapeChecker = ReduceShapeChecker; constexpr int NelemPerThread = @@ -450,8 +450,8 @@ template ::runW(out, in, uop_idx, - smem_per_warp); + SmemBytes, ReduceTypeSum, Axis>::run(out, in, uop_idx, + smem_per_warp); } template ::runW(out, in, uop_idx, - smem_per_warp); + SmemBytes, ReduceTypeMean, Axis>::run(out, in, uop_idx, + smem_per_warp); } template ::runW(out, in, uop_idx, - smem_per_warp); + SmemBytes, ReduceTypeMax, Axis>::run(out, in, uop_idx, + smem_per_warp); } } // namespace ark diff --git a/plan_gpu0.json b/plan_gpu0.json index 63c1943e3..99e2da8fa 100644 --- a/plan_gpu0.json +++ b/plan_gpu0.json @@ -314,7 +314,7 @@ { "Id": 10, "NumWarps": 1, - "SramBytes": 256, + "SramBytes": 0, "Ops": [ { "Type": "ReduceMax", @@ -336,7 +336,7 @@ "Config": { "NumWarps": 1, "ImplType": "WarpWise", - "SramBytes": 256, + "SramBytes": 0, "NumTasks": 65536 } } @@ -402,7 +402,7 @@ { "Id": 13, "NumWarps": 1, - "SramBytes": 256, + "SramBytes": 0, "Ops": [ { "Type": "ReduceSum", @@ -424,7 +424,7 @@ "Config": { "NumWarps": 1, "ImplType": "WarpWise", - "SramBytes": 256, + "SramBytes": 0, "NumTasks": 65536 } } @@ -613,7 +613,7 @@ { "Id": 20, "NumWarps": 1, - "SramBytes": 256, + "SramBytes": 0, "Ops": [ { "Type": "ReduceMean", @@ -635,7 +635,7 @@ "Config": { "NumWarps": 1, "ImplType": "WarpWise", - "SramBytes": 256, + "SramBytes": 0, "NumTasks": 2048 } } @@ -1064,7 +1064,7 @@ { "Id": 35, "NumWarps": 1, - "SramBytes": 256, + "SramBytes": 0, "Ops": [ { "Type": "ReduceMax", @@ -1086,7 +1086,7 @@ "Config": { "NumWarps": 1, "ImplType": "WarpWise", - "SramBytes": 256, + "SramBytes": 0, "NumTasks": 65536 } } @@ -1152,7 +1152,7 @@ { "Id": 38, "NumWarps": 1, - "SramBytes": 256, + "SramBytes": 0, "Ops": [ { "Type": "ReduceSum", @@ -1174,7 +1174,7 @@ 
"Config": { "NumWarps": 1, "ImplType": "WarpWise", - "SramBytes": 256, + "SramBytes": 0, "NumTasks": 65536 } } @@ -1392,7 +1392,7 @@ { "Id": 46, "NumWarps": 1, - "SramBytes": 256, + "SramBytes": 0, "Ops": [ { "Type": "ReduceMean", @@ -1414,7 +1414,7 @@ "Config": { "NumWarps": 1, "ImplType": "WarpWise", - "SramBytes": 256, + "SramBytes": 0, "NumTasks": 2048 } } @@ -1883,7 +1883,7 @@ { "ProcessorRange": [0,304], "WarpRange": [0,4], - "SramRange": [0,256], + "SramRange": [0,0], "TaskGroups": [ {"TaskId":10,"TaskRange":[0,65536],"Granularity":1} ] @@ -1922,7 +1922,7 @@ { "ProcessorRange": [0,304], "WarpRange": [0,4], - "SramRange": [0,256], + "SramRange": [0,0], "TaskGroups": [ {"TaskId":13,"TaskRange":[0,65536],"Granularity":1} ] @@ -2013,7 +2013,7 @@ { "ProcessorRange": [0,304], "WarpRange": [0,4], - "SramRange": [0,256], + "SramRange": [0,0], "TaskGroups": [ {"TaskId":20,"TaskRange":[0,2048],"Granularity":1} ] @@ -2208,7 +2208,7 @@ { "ProcessorRange": [0,304], "WarpRange": [0,4], - "SramRange": [0,256], + "SramRange": [0,0], "TaskGroups": [ {"TaskId":35,"TaskRange":[0,65536],"Granularity":1} ] @@ -2247,7 +2247,7 @@ { "ProcessorRange": [0,304], "WarpRange": [0,4], - "SramRange": [0,256], + "SramRange": [0,0], "TaskGroups": [ {"TaskId":38,"TaskRange":[0,65536],"Granularity":1} ] @@ -2351,7 +2351,7 @@ { "ProcessorRange": [0,304], "WarpRange": [0,4], - "SramRange": [0,256], + "SramRange": [0,0], "TaskGroups": [ {"TaskId":46,"TaskRange":[0,2048],"Granularity":1} ] diff --git a/python/ark/profiler.py b/python/ark/profiler.py index feb78e0de..529a0d506 100644 --- a/python/ark/profiler.py +++ b/python/ark/profiler.py @@ -8,11 +8,22 @@ from .planner import Plan +def timeit(plan: Plan): + with Runtime() as rt: + rt.launch(plan=plan) + start_time = time.time() + iter = 1000 + rt.run(iter=iter) + end_time = time.time() + return (end_time - start_time) / iter + + class Profiler: def __init__(self, plan: Plan): self.plan = plan def run(self): + sys.stderr.write(f"End-to-end: 
{timeit(self.plan):.6f} seconds/iter\n") num_processor_groups = len(self.plan.processor_groups) new_plan = { "Rank": self.plan.rank, @@ -25,12 +36,7 @@ def run(self): } for i in range(num_processor_groups): new_plan["ProcessorGroups"][0] = self.plan.processor_groups[i] - with Runtime() as rt: - rt.launch(plan=str(new_plan)) - start_time = time.time() - iter = 1000 - rt.run(iter=iter) - end_time = time.time() - sys.stderr.write( - f"Processor group {i} runtime: {(end_time - start_time)/iter:.6f} seconds/iter\n" - ) + lat_per_iter = timeit(Plan(new_plan)) + sys.stderr.write( + f"Processor group {i}: {lat_per_iter:.6f} seconds/iter\n" + ) From 0fde9c5dc486ba1edb20235115575d360558ece9 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 4 Jul 2024 07:17:32 +0000 Subject: [PATCH 23/61] optimize --- ark/include/kernels/common/sync.h | 12 +-- ark/ops/ops_broadcast.cpp | 4 +- examples/llama/model_test.py | 2 +- plan_gpu0.json | 172 ++++++++---------------------- 4 files changed, 51 insertions(+), 139 deletions(-) diff --git a/ark/include/kernels/common/sync.h b/ark/include/kernels/common/sync.h index 85f7639c9..f47625600 100644 --- a/ark/include/kernels/common/sync.h +++ b/ark/include/kernels/common/sync.h @@ -106,25 +106,19 @@ DEVICE void sync_warps() { static_assert(Arch::ThreadsPerWarp == 64, ""); if constexpr (NumWarps == 1) { __builtin_amdgcn_wave_barrier(); - } else if constexpr (NumWarps == 16) { - __syncthreads(); } else { static_assert(ARK_SMEM_RESERVED_BYTES >= sizeof(sync::WarpGroupState), ""); - int lane_id = threadIdx.x & 63; - if (lane_id == 0) { + if ((threadIdx.x & 63) == 0) { constexpr int MaxOldCnt = NumWarps - 1; - int warp_id = threadIdx.x >> 6; - int group_id = warp_id / NumWarps; + int group_id = (threadIdx.x >> 6) / NumWarps; sync::WarpGroupState *state = reinterpret_cast(_ARK_SMEM); unsigned int tmp = state->is_inc_flag[group_id] ^ 1; if (atomicInc(&state->cnt[group_id], MaxOldCnt) == MaxOldCnt) { state->flag[group_id] = tmp; } else { - while 
(atomicAdd(&state->flag[group_id], 0) != tmp) - __builtin_amdgcn_s_sleep(1); - __asm__ __volatile__("s_wakeup"); + while (atomicAdd(&state->flag[group_id], 0) != tmp); } state->is_inc_flag[group_id] = tmp; } diff --git a/ark/ops/ops_broadcast.cpp b/ark/ops/ops_broadcast.cpp index 3985a0500..f20e8c4dc 100644 --- a/ark/ops/ops_broadcast.cpp +++ b/ark/ops/ops_broadcast.cpp @@ -27,8 +27,8 @@ ModelOpBroadcast1::ModelOpBroadcast1(const std::string &type_name, std::string ModelOpBroadcast1::impl_name(const Json &config) const { check_fields_config(config, {"NumWarps", "Tile"}); int num_warps = config.at("NumWarps"); - auto &tile_shape = config.at("Tile"); - Dims unit_out_dims{tile_shape[0], tile_shape[1]}; + const auto& tile_shape = config.at("Tile").get>(); + Dims unit_out_dims(tile_shape); return function_name_string( pascal_to_snake(type()->type_name()), diff --git a/examples/llama/model_test.py b/examples/llama/model_test.py index 71485be45..053015c04 100644 --- a/examples/llama/model_test.py +++ b/examples/llama/model_test.py @@ -473,7 +473,7 @@ def test_transformer_block( module_name_prefix="layers.0", rank=rank, world_size=world_size, - test_thru=True, + test_thru=False, ) diff --git a/plan_gpu0.json b/plan_gpu0.json index 99e2da8fa..cad05f774 100644 --- a/plan_gpu0.json +++ b/plan_gpu0.json @@ -31,7 +31,7 @@ "Config": { "NumWarps": 4, "SramBytes": 24672, - "TileShapeMNK": [128,256,32], + "TileShapeMNK": [256,128,32], "NumTasks": 256 } } @@ -39,7 +39,7 @@ }, { "Id": 1, - "NumWarps": 1, + "NumWarps": 4, "SramBytes": 0, "Ops": [ { @@ -58,17 +58,17 @@ ], "Args": {}, "Config": { - "NumWarps": 1, + "NumWarps": 4, "SramBytes": 0, - "Tile": [32,128], - "NumTasks": 2048 + "Tile": [256,1,128], + "NumTasks": 256 } } ] }, { "Id": 2, - "NumWarps": 1, + "NumWarps": 4, "SramBytes": 0, "Ops": [ { @@ -88,10 +88,10 @@ "Permutation": {"DIMS":[0,2,1,3]} }, "Config": { - "NumWarps": 1, + "NumWarps": 4, "SramBytes": 0, - "Tile": [8,128], - "NumTasks": 8192 + "Tile": [256,128], + 
"NumTasks": 256 } } ] @@ -122,7 +122,7 @@ "Config": { "NumWarps": 4, "SramBytes": 24672, - "TileShapeMNK": [128,256,32], + "TileShapeMNK": [256,128,32], "NumTasks": 256 } } @@ -130,7 +130,7 @@ }, { "Id": 4, - "NumWarps": 1, + "NumWarps": 4, "SramBytes": 0, "Ops": [ { @@ -149,17 +149,17 @@ ], "Args": {}, "Config": { - "NumWarps": 1, + "NumWarps": 4, "SramBytes": 0, - "Tile": [32,128], - "NumTasks": 2048 + "Tile": [256,1,128], + "NumTasks": 256 } } ] }, { "Id": 5, - "NumWarps": 1, + "NumWarps": 4, "SramBytes": 0, "Ops": [ { @@ -170,19 +170,19 @@ {"Id":18,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":23,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + {"Id":23,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":24,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + {"Id":24,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { - "Permutation": {"DIMS":[0,2,3,1]} + "Permutation": {"DIMS":[0,2,1,3]} }, "Config": { - "NumWarps": 1, + "NumWarps": 4, "SramBytes": 0, - "Tile": [8,128], - "NumTasks": 8192 + "Tile": [256,128], + "NumTasks": 256 } } ] @@ -213,7 +213,7 @@ "Config": { "NumWarps": 4, "SramBytes": 24672, - "TileShapeMNK": [128,256,32], + "TileShapeMNK": [256,128,32], "NumTasks": 
256 } } @@ -221,7 +221,7 @@ }, { "Id": 7, - "NumWarps": 1, + "NumWarps": 4, "SramBytes": 0, "Ops": [ { @@ -241,10 +241,10 @@ "Permutation": {"DIMS":[0,2,1,3]} }, "Config": { - "NumWarps": 1, + "NumWarps": 4, "SramBytes": 0, - "Tile": [8,128], - "NumTasks": 8192 + "Tile": [256,128], + "NumTasks": 256 } } ] @@ -260,7 +260,7 @@ "IsVirtual": false, "ReadTensors": [ {"Id":20,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":24,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + {"Id":24,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ {"Id":25,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} @@ -270,12 +270,12 @@ ], "Args": { "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":false} + "TransposeOther": {"BOOL":true} }, "Config": { "NumWarps": 4, "SramBytes": 24672, - "TileShapeMNK": [128,256,32], + "TileShapeMNK": [256,128,32], "NumTasks": 4096 } } @@ -305,7 +305,7 @@ "Config": { "NumWarps": 4, "SramBytes": 0, - "Tile": [128,256], + "Tile": [256,128], "NumTasks": 4096 } } @@ -1747,119 +1747,36 @@ } ], "ProcessorGroups": [ - { - "ProcessorRange": [0,256], - "ResourceGroups": [ - { - "ProcessorRange": [0,256], - "WarpRange": [0,4], - "SramRange": [0,24672], - "TaskGroups": [ - {"TaskId":0,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, { "ProcessorRange": [0,304], "ResourceGroups": [ { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": 
[0,0], - "TaskGroups": [ - {"TaskId":1,"TaskRange":[0,2048],"Granularity":4} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":2,"TaskRange":[0,8192],"Granularity":4} - ] - } - ] - }, - { - "ProcessorRange": [0,256], - "ResourceGroups": [ - { - "ProcessorRange": [0,256], + "ProcessorRange": [0,86], "WarpRange": [0,4], "SramRange": [0,24672], "TaskGroups": [ - {"TaskId":3,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":4,"TaskRange":[0,2048],"Granularity":4} + {"TaskId":0,"TaskRange":[0,256],"Granularity":1}, + {"TaskId":1,"TaskRange":[0,256],"Granularity":1}, + {"TaskId":2,"TaskRange":[0,256],"Granularity":1} ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ + }, { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":5,"TaskRange":[0,8192],"Granularity":4} - ] - } - ] - }, - { - "ProcessorRange": [0,256], - "ResourceGroups": [ - { - "ProcessorRange": [0,256], + "ProcessorRange": [86,172], "WarpRange": [0,4], "SramRange": [0,24672], "TaskGroups": [ - {"TaskId":6,"TaskRange":[0,256],"Granularity":1} + {"TaskId":3,"TaskRange":[0,256],"Granularity":1}, + {"TaskId":4,"TaskRange":[0,256],"Granularity":1}, + {"TaskId":5,"TaskRange":[0,256],"Granularity":1} ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":7,"TaskRange":[0,8192],"Granularity":4} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ + }, { - "ProcessorRange": [0,304], + "ProcessorRange": [172,258], "WarpRange": [0,4], "SramRange": [0,24672], "TaskGroups": [ - 
{"TaskId":8,"TaskRange":[0,4096],"Granularity":1} + {"TaskId":6,"TaskRange":[0,256],"Granularity":1}, + {"TaskId":7,"TaskRange":[0,256],"Granularity":1} ] } ] @@ -1870,8 +1787,9 @@ { "ProcessorRange": [0,304], "WarpRange": [0,4], - "SramRange": [0,0], + "SramRange": [0,24672], "TaskGroups": [ + {"TaskId":8,"TaskRange":[0,4096],"Granularity":1}, {"TaskId":9,"TaskRange":[0,4096],"Granularity":1} ] } From c4be6d1bf7b7fcacdd11dd3efad7b4170461ce41 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Fri, 5 Jul 2024 00:14:05 +0000 Subject: [PATCH 24/61] wip --- ark/codegen.cpp | 6 +- arkprof.py | 4 + examples/llama/model_test.py | 23 +- examples/llama/plan_llama2_7b_b1_s2048.json | 1723 +++++++++++++++++++ python/ark/profiler.py | 12 +- 5 files changed, 1751 insertions(+), 17 deletions(-) create mode 100644 arkprof.py create mode 100644 examples/llama/plan_llama2_7b_b1_s2048.json diff --git a/ark/codegen.cpp b/ark/codegen.cpp index 55327329a..587bcae59 100644 --- a/ark/codegen.cpp +++ b/ark/codegen.cpp @@ -298,10 +298,14 @@ std::string CodeGenerator::Impl::resource_group( size_t proc_b = *rg_proc_range.begin(); size_t proc_e = *rg_proc_range.end(); size_t proc_s = rg_proc_range.step(); + std::map task_infos_map; + for (auto &task_info : task_infos) { + task_infos_map[task_info.at("Id").get()] = task_info; + } std::stringstream ss; for (auto &tg : rg_json["TaskGroups"]) { size_t task_id = tg["TaskId"]; - auto &task_info = task_infos[task_id]; + auto &task_info = task_infos_map.at(task_id); Range task_range(tg["TaskRange"][0], tg["TaskRange"][1]); size_t task_gran = tg["Granularity"]; size_t num_warps_per_task = task_info["NumWarps"]; diff --git a/arkprof.py b/arkprof.py new file mode 100644 index 000000000..782bba560 --- /dev/null +++ b/arkprof.py @@ -0,0 +1,4 @@ +import ark +import sys + +ark.Profiler(ark.Plan.from_file(sys.argv[1])).run(iter=1000, profile_processor_groups=False) diff --git a/examples/llama/model_test.py b/examples/llama/model_test.py index 
053015c04..19c680854 100644 --- a/examples/llama/model_test.py +++ b/examples/llama/model_test.py @@ -59,7 +59,8 @@ def run_ark( output = module(*module_inputs) with ark.Runtime() as rt: - rt.launch(ark.Plan.from_file("/mnt/changhohwang/ark/plan_gpu0.json")) + plan = ark.Plan.from_file("plan_llama2_7b_b1_s2048.json") + rt.launch(plan) # Load model parameters if state_dict: @@ -438,22 +439,22 @@ def test_transformer_block( low=-1, high=1, size=(batch_size, seq_len, args.dim) ).astype(dtype) - module = model_ark.Attention( - args, ark.DataType.from_numpy(dtype), rank, world_size - ) + # module = model_ark.Attention( + # args, ark.DataType.from_numpy(dtype), rank, world_size + # ) # module_inputs = [ # ark.tensor(list(i.shape), ark.DataType.from_numpy(i.dtype)) # if isinstance(i, np.ndarray) # else i # for i in inputs # ] - feature_tensor = ark.tensor( - list(feature.shape), ark.DataType.from_numpy(feature.dtype) - ) - freqs_cis_ark_tensor = ark.tensor( - list(freqs_cis_ark.shape), ark.DataType.from_numpy(freqs_cis_ark.dtype) - ) - output = module(feature_tensor, 0, freqs_cis_ark_tensor, None) + # feature_tensor = ark.tensor( + # list(feature.shape), ark.DataType.from_numpy(feature.dtype) + # ) + # freqs_cis_ark_tensor = ark.tensor( + # list(freqs_cis_ark.shape), ark.DataType.from_numpy(freqs_cis_ark.dtype) + # ) + # output = module(feature_tensor, 0, freqs_cis_ark_tensor, None) # print(ark.Model.get_model().serialize()) diff --git a/examples/llama/plan_llama2_7b_b1_s2048.json b/examples/llama/plan_llama2_7b_b1_s2048.json new file mode 100644 index 000000000..d0e46d228 --- /dev/null +++ b/examples/llama/plan_llama2_7b_b1_s2048.json @@ -0,0 +1,1723 @@ +{ + "Rank": 0, + "WorldSize": 1, + "Architecture": "ROCM_942", + "NumProcessors": 304, + "NumWarpsPerProcessor": 4, + "TaskInfos": [ + { + "Id": 0, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Cast", + "Name": "cast", + "IsVirtual": false, + "ReadTensors": [ + 
{"Id":9,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":11,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":12,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [1,64], + "NumTasks": 131072 + } + } + ] + }, + { + "Id": 1, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Mul", + "Name": "mul", + "IsVirtual": false, + "ReadTensors": [ + {"Id":12,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":12,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":13,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":14,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [1,64], + "NumTasks": 131072 + } + } + ] + }, + { + "Id": 2, + "NumWarps": 1, + "SramBytes": 256, + "Ops": [ + { + "Type": "ReduceMean", + "Name": "reduce_mean", + 
"IsVirtual": false, + "ReadTensors": [ + {"Id":14,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":15,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":16,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": { + "Axis": {"INT":2}, + "KeepDim": {"BOOL":true} + }, + "Config": { + "NumWarps": 1, + "ImplType": "WarpWise", + "SramBytes": 256, + "NumTasks": 2048 + } + } + ] + }, + { + "Id": 3, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Rsqrt", + "Name": "rsqrt", + "IsVirtual": false, + "ReadTensors": [ + {"Id":16,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":17,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":18,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [64,1], + "NumTasks": 32 + } + } + ] + }, + { + "Id": 4, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Mul", + "Name": "mul_1", + "IsVirtual": false, + "ReadTensors": [ + 
{"Id":12,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":18,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":19,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":20,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [1,64], + "NumTasks": 131072 + } + } + ] + }, + { + "Id": 5, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Mul", + "Name": "mul_2", + "IsVirtual": false, + "ReadTensors": [ + {"Id":20,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":7,"DataType":"FP32","Shape":[1,1,4096],"Strides":[1,1,4096],"Offsets":[0,0,0],"PaddedShape":[1,1,4096],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":20,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":21,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, 
+ "Tile": [1,64], + "NumTasks": 131072 + } + } + ] + }, + { + "Id": 6, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Cast", + "Name": "cast_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":21,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":22,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":23,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [1,64], + "NumTasks": 131072 + } + } + ] + }, + { + "Id": 7, + "NumWarps": 4, + "SramBytes": 24672, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul", + "IsVirtual": false, + "ReadTensors": [ + {"Id":23,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":0,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":24,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":25,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": { + "TransposeInput": 
{"BOOL":false}, + "TransposeOther": {"BOOL":true} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [128,256,32], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 8, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Rope", + "Name": "rope", + "IsVirtual": false, + "ReadTensors": [ + {"Id":30,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":10,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":33,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":20,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":34,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":20,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [1,64], + "NumTasks": 131072 + } + } + ] + }, + { + "Id": 9, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Transpose", + "Name": "transpose", + "IsVirtual": false, + "ReadTensors": [ + {"Id":34,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":20,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":37,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":22,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + 
{"Id":38,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":22,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": { + "Permutation": {"DIMS":[0,2,1,3]} + }, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [8,8], + "NumTasks": 131072 + } + } + ] + }, + { + "Id": 10, + "NumWarps": 4, + "SramBytes": 24672, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":23,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":1,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":26,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":27,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": { + "TransposeInput": {"BOOL":false}, + "TransposeOther": {"BOOL":true} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [128,256,32], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 11, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Rope", + "Name": "rope_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":31,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + 
{"Id":10,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":35,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":21,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":36,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":21,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [1,64], + "NumTasks": 131072 + } + } + ] + }, + { + "Id": 12, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Transpose", + "Name": "transpose_2", + "IsVirtual": false, + "ReadTensors": [ + {"Id":36,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":21,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":41,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":42,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": { + "Permutation": {"DIMS":[0,2,3,1]} + }, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [8,8], + "NumTasks": 131072 + } + } + ] + }, + { + "Id": 13, + "NumWarps": 4, + "SramBytes": 24672, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_2", + "IsVirtual": false, + "ReadTensors": [ + 
{"Id":23,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":2,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":28,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":29,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": { + "TransposeInput": {"BOOL":false}, + "TransposeOther": {"BOOL":true} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [128,256,32], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 14, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Transpose", + "Name": "transpose_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":32,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":39,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":23,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":40,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":23,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": { + "Permutation": {"DIMS":[0,2,1,3]} + }, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [8,8], + 
"NumTasks": 131072 + } + } + ] + }, + { + "Id": 15, + "NumWarps": 4, + "SramBytes": 24672, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_3", + "IsVirtual": false, + "ReadTensors": [ + {"Id":38,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":22,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":42,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":43,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":25,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":44,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":25,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": { + "TransposeInput": {"BOOL":false}, + "TransposeOther": {"BOOL":false} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [128,256,32], + "NumTasks": 4096 + } + } + ] + }, + { + "Id": 16, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "ScalarMul", + "Name": "mul_3", + "IsVirtual": false, + "ReadTensors": [ + {"Id":44,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":25,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":45,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + 
{"Id":46,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": { + "Factor": {"FLOAT":0.0883883461356163} + }, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [1,64], + "NumTasks": 2097152 + } + } + ] + }, + { + "Id": 17, + "NumWarps": 1, + "SramBytes": 256, + "Ops": [ + { + "Type": "ReduceMax", + "Name": "reduce_max", + "IsVirtual": false, + "ReadTensors": [ + {"Id":46,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":47,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":27,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":48,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":27,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": { + "Axis": {"INT":3}, + "KeepDim": {"BOOL":true} + }, + "Config": { + "NumWarps": 1, + "ImplType": "WarpWise", + "SramBytes": 256, + "NumTasks": 65536 + } + } + ] + }, + { + "Id": 18, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Sub", + "Name": "sub", + "IsVirtual": false, + "ReadTensors": [ + {"Id":46,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":48,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":27,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + 
{"Id":46,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":49,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [1,64], + "NumTasks": 2097152 + } + } + ] + }, + { + "Id": 19, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Exp", + "Name": "exp", + "IsVirtual": false, + "ReadTensors": [ + {"Id":49,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":49,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":50,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [1,64], + "NumTasks": 2097152 + } + } + ] + }, + { + "Id": 20, + "NumWarps": 1, + "SramBytes": 256, + "Ops": [ + { + "Type": "ReduceSum", + "Name": "reduce_sum", + "IsVirtual": false, + "ReadTensors": [ + {"Id":50,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + 
{"Id":51,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":28,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":52,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":28,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": { + "Axis": {"INT":3}, + "KeepDim": {"BOOL":true} + }, + "Config": { + "NumWarps": 1, + "ImplType": "WarpWise", + "SramBytes": 256, + "NumTasks": 65536 + } + } + ] + }, + { + "Id": 21, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Div", + "Name": "div", + "IsVirtual": false, + "ReadTensors": [ + {"Id":50,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":52,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":28,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":50,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":53,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [1,64], + "NumTasks": 2097152 + } + } + ] + }, + { + "Id": 22, + "NumWarps": 4, + "SramBytes": 24672, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_4", + "IsVirtual": false, + "ReadTensors": [ + 
{"Id":53,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":40,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":23,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":54,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":29,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":55,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":29,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": { + "TransposeInput": {"BOOL":false}, + "TransposeOther": {"BOOL":false} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [256,128,32], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 23, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Transpose", + "Name": "transpose_3", + "IsVirtual": false, + "ReadTensors": [ + {"Id":55,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":29,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":56,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":30,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":57,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":30,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": { + "Permutation": {"DIMS":[0,2,1,3]} + }, + "Config": { + "NumWarps": 
1, + "SramBytes": 0, + "Tile": [8,8], + "NumTasks": 131072 + } + } + ] + }, + { + "Id": 24, + "NumWarps": 4, + "SramBytes": 24672, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_5", + "IsVirtual": false, + "ReadTensors": [ + {"Id":58,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":30,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":3,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":59,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":31,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":60,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":31,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": { + "TransposeInput": {"BOOL":false}, + "TransposeOther": {"BOOL":true} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [128,256,32], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 25, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Add", + "Name": "add", + "IsVirtual": false, + "ReadTensors": [ + {"Id":9,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":60,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":31,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + 
{"Id":61,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":62,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [1,64], + "NumTasks": 131072 + } + } + ] + }, + { + "Id": 26, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Cast", + "Name": "cast_2", + "IsVirtual": false, + "ReadTensors": [ + {"Id":62,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":63,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":64,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [1,64], + "NumTasks": 131072 + } + } + ] + }, + { + "Id": 27, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Mul", + "Name": "mul_4", + "IsVirtual": false, + "ReadTensors": [ + {"Id":64,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":64,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + 
], + "WriteTensors": [ + {"Id":65,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":66,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [1,64], + "NumTasks": 131072 + } + } + ] + }, + { + "Id": 28, + "NumWarps": 1, + "SramBytes": 256, + "Ops": [ + { + "Type": "ReduceMean", + "Name": "reduce_mean_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":66,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":67,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":68,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": { + "Axis": {"INT":2}, + "KeepDim": {"BOOL":true} + }, + "Config": { + "NumWarps": 1, + "ImplType": "WarpWise", + "SramBytes": 256, + "NumTasks": 2048 + } + } + ] + }, + { + "Id": 29, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Rsqrt", + "Name": "rsqrt_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":68,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + 
{"Id":69,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":70,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [64,1], + "NumTasks": 32 + } + } + ] + }, + { + "Id": 30, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Mul", + "Name": "mul_5", + "IsVirtual": false, + "ReadTensors": [ + {"Id":64,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":70,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":71,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":72,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [1,64], + "NumTasks": 131072 + } + } + ] + }, + { + "Id": 31, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Mul", + "Name": "mul_6", + "IsVirtual": false, + "ReadTensors": [ + {"Id":72,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + 
{"Id":8,"DataType":"FP32","Shape":[1,1,4096],"Strides":[1,1,4096],"Offsets":[0,0,0],"PaddedShape":[1,1,4096],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":72,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":73,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [1,64], + "NumTasks": 131072 + } + } + ] + }, + { + "Id": 32, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Cast", + "Name": "cast_3", + "IsVirtual": false, + "ReadTensors": [ + {"Id":73,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":74,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":75,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [1,64], + "NumTasks": 131072 + } + } + ] + }, + { + "Id": 33, + "NumWarps": 4, + "SramBytes": 24672, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_6", + "IsVirtual": false, + "ReadTensors": [ + 
{"Id":75,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":4,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":76,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":77,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": { + "TransposeInput": {"BOOL":false}, + "TransposeOther": {"BOOL":true} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [128,256,32], + "NumTasks": 688 + } + } + ] + }, + { + "Id": 34, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Sigmoid", + "Name": "sigmoid", + "IsVirtual": false, + "ReadTensors": [ + {"Id":77,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":78,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":79,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [1,64], + "NumTasks": 352256 + } + } + ] + }, + { + "Id": 35, + 
"NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Mul", + "Name": "mul_7", + "IsVirtual": false, + "ReadTensors": [ + {"Id":77,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":79,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":80,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":81,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [1,64], + "NumTasks": 352256 + } + } + ] + }, + { + "Id": 36, + "NumWarps": 4, + "SramBytes": 24672, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_7", + "IsVirtual": false, + "ReadTensors": [ + {"Id":75,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":6,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":82,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + 
{"Id":83,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": { + "TransposeInput": {"BOOL":false}, + "TransposeOther": {"BOOL":true} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [128,256,32], + "NumTasks": 688 + } + } + ] + }, + { + "Id": 37, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Mul", + "Name": "mul_8", + "IsVirtual": false, + "ReadTensors": [ + {"Id":81,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":83,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":84,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":85,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [1,64], + "NumTasks": 352256 + } + } + ] + }, + { + "Id": 38, + "NumWarps": 4, + "SramBytes": 24672, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_8", + "IsVirtual": false, + "ReadTensors": [ + {"Id":85,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + 
{"Id":5,"DataType":"FP16","Shape":[4096,11008],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":86,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":87,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": { + "TransposeInput": {"BOOL":false}, + "TransposeOther": {"BOOL":true} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [128,256,32], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 39, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Add", + "Name": "add_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":62,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":87,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":88,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":45,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":89,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":45,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [1,64], + "NumTasks": 131072 + } + } + ] + } + ], + "ProcessorGroups": [ + { + 
"ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":0,"TaskRange":[0,131072],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":1,"TaskRange":[0,131072],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,256], + "TaskGroups": [ + {"TaskId":2,"TaskRange":[0,2048],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,32], + "ResourceGroups": [ + { + "ProcessorRange": [0,32], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":3,"TaskRange":[0,32],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":4,"TaskRange":[0,131072],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":5,"TaskRange":[0,131072],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":6,"TaskRange":[0,131072],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,256], + "ResourceGroups": [ + { + "ProcessorRange": [0,256], + "WarpRange": [0,4], + "SramRange": [0,24672], + "TaskGroups": [ + {"TaskId":7,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":8,"TaskRange":[0,131072],"Granularity":1} + ] + } + ] + }, + { + 
"ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":9,"TaskRange":[0,131072],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,256], + "ResourceGroups": [ + { + "ProcessorRange": [0,256], + "WarpRange": [0,4], + "SramRange": [0,24672], + "TaskGroups": [ + {"TaskId":10,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":11,"TaskRange":[0,131072],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":12,"TaskRange":[0,131072],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,256], + "ResourceGroups": [ + { + "ProcessorRange": [0,256], + "WarpRange": [0,4], + "SramRange": [0,24672], + "TaskGroups": [ + {"TaskId":13,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":14,"TaskRange":[0,131072],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,24672], + "TaskGroups": [ + {"TaskId":15,"TaskRange":[0,4096],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":16,"TaskRange":[0,2097152],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,256], + "TaskGroups": [ + {"TaskId":17,"TaskRange":[0,65536],"Granularity":1} + ] + } + ] + }, + 
{ + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":18,"TaskRange":[0,2097152],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":19,"TaskRange":[0,2097152],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,256], + "TaskGroups": [ + {"TaskId":20,"TaskRange":[0,65536],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":21,"TaskRange":[0,2097152],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,256], + "ResourceGroups": [ + { + "ProcessorRange": [0,256], + "WarpRange": [0,4], + "SramRange": [0,24672], + "TaskGroups": [ + {"TaskId":22,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":23,"TaskRange":[0,131072],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,256], + "ResourceGroups": [ + { + "ProcessorRange": [0,256], + "WarpRange": [0,4], + "SramRange": [0,24672], + "TaskGroups": [ + {"TaskId":24,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":25,"TaskRange":[0,131072],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":26,"TaskRange":[0,131072],"Granularity":1} + ] + } + ] + 
}, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":27,"TaskRange":[0,131072],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,256], + "TaskGroups": [ + {"TaskId":28,"TaskRange":[0,2048],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,32], + "ResourceGroups": [ + { + "ProcessorRange": [0,32], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":29,"TaskRange":[0,32],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":30,"TaskRange":[0,131072],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":31,"TaskRange":[0,131072],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":32,"TaskRange":[0,131072],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,24672], + "TaskGroups": [ + {"TaskId":33,"TaskRange":[0,688],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":34,"TaskRange":[0,352256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":35,"TaskRange":[0,352256],"Granularity":1} + ] + } + ] + }, + { 
+ "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,24672], + "TaskGroups": [ + {"TaskId":36,"TaskRange":[0,688],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":37,"TaskRange":[0,352256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,256], + "ResourceGroups": [ + { + "ProcessorRange": [0,256], + "WarpRange": [0,4], + "SramRange": [0,24672], + "TaskGroups": [ + {"TaskId":38,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":39,"TaskRange":[0,131072],"Granularity":1} + ] + } + ] + } + ] +} \ No newline at end of file diff --git a/python/ark/profiler.py b/python/ark/profiler.py index 529a0d506..56233247c 100644 --- a/python/ark/profiler.py +++ b/python/ark/profiler.py @@ -8,11 +8,10 @@ from .planner import Plan -def timeit(plan: Plan): +def timeit(plan: Plan, iter: int): with Runtime() as rt: rt.launch(plan=plan) start_time = time.time() - iter = 1000 rt.run(iter=iter) end_time = time.time() return (end_time - start_time) / iter @@ -22,8 +21,11 @@ class Profiler: def __init__(self, plan: Plan): self.plan = plan - def run(self): - sys.stderr.write(f"End-to-end: {timeit(self.plan):.6f} seconds/iter\n") + def run(self, iter: int = 1000, profile_processor_groups: bool = False): + sys.stderr.write(f"End-to-end: {timeit(self.plan, iter):.6f} seconds/iter\n") + + if not profile_processor_groups: + return num_processor_groups = len(self.plan.processor_groups) new_plan = { "Rank": self.plan.rank, @@ -36,7 +38,7 @@ def run(self): } for i in range(num_processor_groups): new_plan["ProcessorGroups"][0] = self.plan.processor_groups[i] - lat_per_iter = timeit(Plan(new_plan)) + lat_per_iter = 
timeit(Plan(new_plan), iter) sys.stderr.write( f"Processor group {i}: {lat_per_iter:.6f} seconds/iter\n" ) From cc30912486c24f71617ee2200c7429ea2e610d51 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Fri, 5 Jul 2024 07:12:49 +0000 Subject: [PATCH 25/61] optimization --- examples/llama/plan_llama2_7b_b1_s2048.json | 732 ++++---------------- 1 file changed, 126 insertions(+), 606 deletions(-) diff --git a/examples/llama/plan_llama2_7b_b1_s2048.json b/examples/llama/plan_llama2_7b_b1_s2048.json index d0e46d228..15b0de2d0 100644 --- a/examples/llama/plan_llama2_7b_b1_s2048.json +++ b/examples/llama/plan_llama2_7b_b1_s2048.json @@ -27,17 +27,10 @@ "Config": { "NumWarps": 1, "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 131072 + "Tile": [1,4096], + "NumTasks": 2048 } - } - ] - }, - { - "Id": 1, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ + }, { "Type": "Mul", "Name": "mul", @@ -56,17 +49,10 @@ "Config": { "NumWarps": 1, "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 131072 + "Tile": [1,4096], + "NumTasks": 2048 } - } - ] - }, - { - "Id": 2, - "NumWarps": 1, - "SramBytes": 256, - "Ops": [ + }, { "Type": "ReduceMean", "Name": "reduce_mean", @@ -87,7 +73,7 @@ "Config": { "NumWarps": 1, "ImplType": "WarpWise", - "SramBytes": 256, + "SramBytes": 0, "NumTasks": 2048 } } @@ -144,17 +130,10 @@ "Config": { "NumWarps": 1, "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 131072 + "Tile": [1,4096], + "NumTasks": 2048 } - } - ] - }, - { - "Id": 5, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ + }, { "Type": "Mul", "Name": "mul_2", @@ -173,17 +152,10 @@ "Config": { "NumWarps": 1, "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 131072 + "Tile": [1,4096], + "NumTasks": 2048 } - } - ] - }, - { - "Id": 6, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ + }, { "Type": "Cast", "Name": "cast_1", @@ -201,8 +173,8 @@ "Config": { "NumWarps": 1, "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 131072 + "Tile": [1,4096], + "NumTasks": 2048 } } ] @@ -233,17 +205,10 @@ "Config": { "NumWarps": 
4, "SramBytes": 24672, - "TileShapeMNK": [128,256,32], + "TileShapeMNK": [256,128,32], "NumTasks": 256 } - } - ] - }, - { - "Id": 8, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ + }, { "Type": "Rope", "Name": "rope", @@ -260,19 +225,12 @@ ], "Args": {}, "Config": { - "NumWarps": 1, + "NumWarps": 4, "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 131072 + "Tile": [256,1,128], + "NumTasks": 256 } - } - ] - }, - { - "Id": 9, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ + }, { "Type": "Transpose", "Name": "transpose", @@ -290,10 +248,10 @@ "Permutation": {"DIMS":[0,2,1,3]} }, "Config": { - "NumWarps": 1, + "NumWarps": 4, "SramBytes": 0, - "Tile": [8,8], - "NumTasks": 131072 + "Tile": [256,128], + "NumTasks": 256 } } ] @@ -324,17 +282,10 @@ "Config": { "NumWarps": 4, "SramBytes": 24672, - "TileShapeMNK": [128,256,32], + "TileShapeMNK": [256,128,32], "NumTasks": 256 } - } - ] - }, - { - "Id": 11, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ + }, { "Type": "Rope", "Name": "rope_1", @@ -351,19 +302,12 @@ ], "Args": {}, "Config": { - "NumWarps": 1, + "NumWarps": 4, "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 131072 + "Tile": [256,128], + "NumTasks": 256 } - } - ] - }, - { - "Id": 12, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ + }, { "Type": "Transpose", "Name": "transpose_2", @@ -372,19 +316,19 @@ {"Id":36,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":21,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":41,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + {"Id":41,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - 
{"Id":42,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + {"Id":42,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { - "Permutation": {"DIMS":[0,2,3,1]} + "Permutation": {"DIMS":[0,2,1,3]} }, "Config": { - "NumWarps": 1, + "NumWarps": 4, "SramBytes": 0, - "Tile": [8,8], - "NumTasks": 131072 + "Tile": [256,128], + "NumTasks": 256 } } ] @@ -415,17 +359,10 @@ "Config": { "NumWarps": 4, "SramBytes": 24672, - "TileShapeMNK": [128,256,32], + "TileShapeMNK": [256,128,32], "NumTasks": 256 } - } - ] - }, - { - "Id": 14, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ + }, { "Type": "Transpose", "Name": "transpose_1", @@ -443,10 +380,10 @@ "Permutation": {"DIMS":[0,2,1,3]} }, "Config": { - "NumWarps": 1, + "NumWarps": 4, "SramBytes": 0, - "Tile": [8,8], - "NumTasks": 131072 + "Tile": [256,128], + "NumTasks": 256 } } ] @@ -462,7 +399,7 @@ "IsVirtual": false, "ReadTensors": [ {"Id":38,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":22,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":42,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + {"Id":42,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ 
{"Id":43,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":25,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} @@ -472,22 +409,15 @@ ], "Args": { "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":false} + "TransposeOther": {"BOOL":true} }, "Config": { "NumWarps": 4, "SramBytes": 24672, - "TileShapeMNK": [128,256,32], + "TileShapeMNK": [256,128,32], "NumTasks": 4096 } - } - ] - }, - { - "Id": 16, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ + }, { "Type": "ScalarMul", "Name": "mul_3", @@ -505,10 +435,10 @@ "Factor": {"FLOAT":0.0883883461356163} }, "Config": { - "NumWarps": 1, + "NumWarps": 4, "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 2097152 + "Tile": [256,128], + "NumTasks": 4096 } } ] @@ -516,7 +446,7 @@ { "Id": 17, "NumWarps": 1, - "SramBytes": 256, + "SramBytes": 0, "Ops": [ { "Type": "ReduceMax", @@ -538,17 +468,10 @@ "Config": { "NumWarps": 1, "ImplType": "WarpWise", - "SramBytes": 256, + "SramBytes": 0, "NumTasks": 65536 } - } - ] - }, - { - "Id": 18, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ + }, { "Type": "Sub", "Name": "sub", @@ -567,17 +490,10 @@ "Config": { "NumWarps": 1, "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 2097152 + "Tile": [1,2048], + "NumTasks": 65536 } - } - ] - }, - { - "Id": 19, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ + }, { "Type": "Exp", "Name": "exp", @@ -595,17 +511,10 @@ "Config": { "NumWarps": 1, "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 2097152 + "Tile": [1,2048], + "NumTasks": 65536 } - } - ] - }, - { - "Id": 20, - "NumWarps": 1, - "SramBytes": 256, - "Ops": [ + }, { "Type": "ReduceSum", "Name": "reduce_sum", @@ -626,17 +535,10 @@ "Config": { "NumWarps": 1, "ImplType": "WarpWise", - "SramBytes": 256, + "SramBytes": 0, "NumTasks": 65536 } - } - ] - }, - { - "Id": 21, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ + }, { "Type": "Div", "Name": "div", @@ -655,8 +557,8 @@ "Config": { "NumWarps": 1, 
"SramBytes": 0, - "Tile": [1,64], - "NumTasks": 2097152 + "Tile": [1,2048], + "NumTasks": 65536 } } ] @@ -690,14 +592,7 @@ "TileShapeMNK": [256,128,32], "NumTasks": 256 } - } - ] - }, - { - "Id": 23, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ + }, { "Type": "Transpose", "Name": "transpose_3", @@ -715,10 +610,10 @@ "Permutation": {"DIMS":[0,2,1,3]} }, "Config": { - "NumWarps": 1, + "NumWarps": 4, "SramBytes": 0, - "Tile": [8,8], - "NumTasks": 131072 + "Tile": [256,1,128], + "NumTasks": 256 } } ] @@ -749,17 +644,10 @@ "Config": { "NumWarps": 4, "SramBytes": 24672, - "TileShapeMNK": [128,256,32], + "TileShapeMNK": [256,128,32], "NumTasks": 256 } - } - ] - }, - { - "Id": 25, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ + }, { "Type": "Add", "Name": "add", @@ -776,19 +664,12 @@ ], "Args": {}, "Config": { - "NumWarps": 1, + "NumWarps": 4, "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 131072 + "Tile": [256,128], + "NumTasks": 256 } - } - ] - }, - { - "Id": 26, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ + }, { "Type": "Cast", "Name": "cast_2", @@ -804,19 +685,12 @@ ], "Args": {}, "Config": { - "NumWarps": 1, + "NumWarps": 4, "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 131072 + "Tile": [256,128], + "NumTasks": 256 } - } - ] - }, - { - "Id": 27, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ + }, { "Type": "Mul", "Name": "mul_4", @@ -833,10 +707,10 @@ ], "Args": {}, "Config": { - "NumWarps": 1, + "NumWarps": 4, "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 131072 + "Tile": [256,128], + "NumTasks": 256 } } ] @@ -844,7 +718,7 @@ { "Id": 28, "NumWarps": 1, - "SramBytes": 256, + "SramBytes": 0, "Ops": [ { "Type": "ReduceMean", @@ -866,7 +740,7 @@ "Config": { "NumWarps": 1, "ImplType": "WarpWise", - "SramBytes": 256, + "SramBytes": 0, "NumTasks": 2048 } } @@ -923,17 +797,10 @@ "Config": { "NumWarps": 1, "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 131072 + "Tile": [1,4096], + "NumTasks": 2048 } - } - ] - }, - { - "Id": 31, - "NumWarps": 1, - "SramBytes": 0, - 
"Ops": [ + }, { "Type": "Mul", "Name": "mul_6", @@ -952,17 +819,10 @@ "Config": { "NumWarps": 1, "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 131072 + "Tile": [1,4096], + "NumTasks": 2048 } - } - ] - }, - { - "Id": 32, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ + }, { "Type": "Cast", "Name": "cast_3", @@ -980,8 +840,8 @@ "Config": { "NumWarps": 1, "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 131072 + "Tile": [1,4096], + "NumTasks": 2048 } } ] @@ -1012,17 +872,10 @@ "Config": { "NumWarps": 4, "SramBytes": 24672, - "TileShapeMNK": [128,256,32], + "TileShapeMNK": [256,128,32], "NumTasks": 688 } - } - ] - }, - { - "Id": 34, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ + }, { "Type": "Sigmoid", "Name": "sigmoid", @@ -1038,19 +891,12 @@ ], "Args": {}, "Config": { - "NumWarps": 1, + "NumWarps": 4, "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 352256 + "Tile": [256,128], + "NumTasks": 688 } - } - ] - }, - { - "Id": 35, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ + }, { "Type": "Mul", "Name": "mul_7", @@ -1067,10 +913,10 @@ ], "Args": {}, "Config": { - "NumWarps": 1, + "NumWarps": 4, "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 352256 + "Tile": [256,128], + "NumTasks": 688 } } ] @@ -1101,17 +947,10 @@ "Config": { "NumWarps": 4, "SramBytes": 24672, - "TileShapeMNK": [128,256,32], + "TileShapeMNK": [256,128,32], "NumTasks": 688 } - } - ] - }, - { - "Id": 37, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ + }, { "Type": "Mul", "Name": "mul_8", @@ -1128,10 +967,10 @@ ], "Args": {}, "Config": { - "NumWarps": 1, + "NumWarps": 4, "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 352256 + "Tile": [256,128], + "NumTasks": 688 } } ] @@ -1162,17 +1001,10 @@ "Config": { "NumWarps": 4, "SramBytes": 24672, - "TileShapeMNK": [128,256,32], + "TileShapeMNK": [256,128,32], "NumTasks": 256 } - } - ] - }, - { - "Id": 39, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ + }, { "Type": "Add", "Name": "add_1", @@ -1189,10 +1021,10 @@ ], "Args": {}, "Config": { - "NumWarps": 1, + 
"NumWarps": 4, "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 131072 + "Tile": [256,128], + "NumTasks": 256 } } ] @@ -1204,23 +1036,23 @@ "ResourceGroups": [ { "ProcessorRange": [0,304], - "WarpRange": [0,1], + "WarpRange": [0,4], "SramRange": [0,0], "TaskGroups": [ - {"TaskId":0,"TaskRange":[0,131072],"Granularity":1} + {"TaskId":0,"TaskRange":[0,2048],"Granularity":4} ] } ] }, { - "ProcessorRange": [0,304], + "ProcessorRange": [0,32], "ResourceGroups": [ { - "ProcessorRange": [0,304], + "ProcessorRange": [0,32], "WarpRange": [0,1], "SramRange": [0,0], "TaskGroups": [ - {"TaskId":1,"TaskRange":[0,131072],"Granularity":1} + {"TaskId":3,"TaskRange":[0,32],"Granularity":1} ] } ] @@ -1230,101 +1062,23 @@ "ResourceGroups": [ { "ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,256], + "WarpRange": [0,4], + "SramRange": [0,0], "TaskGroups": [ - {"TaskId":2,"TaskRange":[0,2048],"Granularity":1} + {"TaskId":4,"TaskRange":[0,2048],"Granularity":4} ] } ] }, { - "ProcessorRange": [0,32], + "ProcessorRange": [0,256], "ResourceGroups": [ { - "ProcessorRange": [0,32], - "WarpRange": [0,1], - "SramRange": [0,0], + "ProcessorRange": [0,256], + "WarpRange": [0,4], + "SramRange": [0,24672], "TaskGroups": [ - {"TaskId":3,"TaskRange":[0,32],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":4,"TaskRange":[0,131072],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":5,"TaskRange":[0,131072],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":6,"TaskRange":[0,131072],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,256], - 
"ResourceGroups": [ - { - "ProcessorRange": [0,256], - "WarpRange": [0,4], - "SramRange": [0,24672], - "TaskGroups": [ - {"TaskId":7,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":8,"TaskRange":[0,131072],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":9,"TaskRange":[0,131072],"Granularity":1} + {"TaskId":7,"TaskRange":[0,256],"Granularity":1} ] } ] @@ -1342,32 +1096,6 @@ } ] }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":11,"TaskRange":[0,131072],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":12,"TaskRange":[0,131072],"Granularity":1} - ] - } - ] - }, { "ProcessorRange": [0,256], "ResourceGroups": [ @@ -1381,19 +1109,6 @@ } ] }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":14,"TaskRange":[0,131072],"Granularity":1} - ] - } - ] - }, { "ProcessorRange": [0,304], "ResourceGroups": [ @@ -1412,75 +1127,10 @@ "ResourceGroups": [ { "ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":16,"TaskRange":[0,2097152],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,256], - "TaskGroups": [ - {"TaskId":17,"TaskRange":[0,65536],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - 
"ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":18,"TaskRange":[0,2097152],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":19,"TaskRange":[0,2097152],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,256], - "TaskGroups": [ - {"TaskId":20,"TaskRange":[0,65536],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,1], + "WarpRange": [0,4], "SramRange": [0,0], "TaskGroups": [ - {"TaskId":21,"TaskRange":[0,2097152],"Granularity":1} + {"TaskId":17,"TaskRange":[0,65536],"Granularity":4} ] } ] @@ -1498,19 +1148,6 @@ } ] }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":23,"TaskRange":[0,131072],"Granularity":1} - ] - } - ] - }, { "ProcessorRange": [0,256], "ResourceGroups": [ @@ -1529,49 +1166,10 @@ "ResourceGroups": [ { "ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":25,"TaskRange":[0,131072],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":26,"TaskRange":[0,131072],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,1], + "WarpRange": [0,4], "SramRange": [0,0], "TaskGroups": [ - {"TaskId":27,"TaskRange":[0,131072],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,256], - 
"TaskGroups": [ - {"TaskId":28,"TaskRange":[0,2048],"Granularity":1} + {"TaskId":28,"TaskRange":[0,2048],"Granularity":4} ] } ] @@ -1594,36 +1192,10 @@ "ResourceGroups": [ { "ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":30,"TaskRange":[0,131072],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":31,"TaskRange":[0,131072],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,1], + "WarpRange": [0,4], "SramRange": [0,0], "TaskGroups": [ - {"TaskId":32,"TaskRange":[0,131072],"Granularity":1} + {"TaskId":30,"TaskRange":[0,2048],"Granularity":4} ] } ] @@ -1641,32 +1213,6 @@ } ] }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":34,"TaskRange":[0,352256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":35,"TaskRange":[0,352256],"Granularity":1} - ] - } - ] - }, { "ProcessorRange": [0,304], "ResourceGroups": [ @@ -1680,19 +1226,6 @@ } ] }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":37,"TaskRange":[0,352256],"Granularity":1} - ] - } - ] - }, { "ProcessorRange": [0,256], "ResourceGroups": [ @@ -1705,19 +1238,6 @@ ] } ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":39,"TaskRange":[0,131072],"Granularity":1} - ] - } - ] } ] } \ No newline at end of file From 34a87d867669aae49b2a29056aadfed694d97b33 Mon 
Sep 17 00:00:00 2001 From: Changho Hwang Date: Mon, 8 Jul 2024 02:10:40 +0000 Subject: [PATCH 26/61] optimize --- examples/llama/plan_llama2_7b_b1_s2048.json | 97 ++++++++++++++++----- 1 file changed, 76 insertions(+), 21 deletions(-) diff --git a/examples/llama/plan_llama2_7b_b1_s2048.json b/examples/llama/plan_llama2_7b_b1_s2048.json index 15b0de2d0..d5c9fe552 100644 --- a/examples/llama/plan_llama2_7b_b1_s2048.json +++ b/examples/llama/plan_llama2_7b_b1_s2048.json @@ -3,7 +3,7 @@ "WorldSize": 1, "Architecture": "ROCM_942", "NumProcessors": 304, - "NumWarpsPerProcessor": 4, + "NumWarpsPerProcessor": 8, "TaskInfos": [ { "Id": 0, @@ -948,7 +948,7 @@ "NumWarps": 4, "SramBytes": 24672, "TileShapeMNK": [256,128,32], - "NumTasks": 688 + "NumTasks": 602 } }, { @@ -970,7 +970,61 @@ "NumWarps": 4, "SramBytes": 0, "Tile": [256,128], - "NumTasks": 688 + "NumTasks": 602 + } + } + ] + }, + { + "Id": 37, + "NumWarps": 4, + "SramBytes": 16480, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_7", + "IsVirtual": false, + "ReadTensors": [ + {"Id":102,"DataType":"FP16","Shape":[1,1792,4096],"Strides":[1,2048,4096],"Offsets":[0,256,0],"PaddedShape":[1,1792,4096],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":6,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":101,"DataType":"FP16","Shape":[1,1792,11008],"Strides":[1,2048,11008],"Offsets":[0,256,0],"PaddedShape":[1,1792,11008],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":100,"DataType":"FP16","Shape":[1,1792,11008],"Strides":[1,2048,11008],"Offsets":[0,256,0],"PaddedShape":[1,1792,11008],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": { + "TransposeInput": {"BOOL":false}, + "TransposeOther": {"BOOL":true} + }, + "Config": 
{ + "NumWarps": 4, + "SramBytes": 16480, + "TileShapeMNK": [128,128,32], + "NumTasks": 172 + } + }, + { + "Type": "Mul", + "Name": "mul_8", + "IsVirtual": false, + "ReadTensors": [ + {"Id":81,"DataType":"FP16","Shape":[1,1792,11008],"Strides":[1,2048,11008],"Offsets":[0,256,0],"PaddedShape":[1,1792,11008],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":83,"DataType":"FP16","Shape":[1,1792,11008],"Strides":[1,2048,11008],"Offsets":[0,256,0],"PaddedShape":[1,1792,11008],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":84,"DataType":"FP16","Shape":[1,1792,11008],"Strides":[1,2048,11008],"Offsets":[0,256,0],"PaddedShape":[1,1792,11008],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":85,"DataType":"FP16","Shape":[1,1792,11008],"Strides":[1,2048,11008],"Offsets":[0,256,0],"PaddedShape":[1,1792,11008],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": {}, + "Config": { + "NumWarps": 4, + "SramBytes": 0, + "Tile": [128,128], + "NumTasks": 172 } } ] @@ -1036,10 +1090,10 @@ "ResourceGroups": [ { "ProcessorRange": [0,304], - "WarpRange": [0,4], + "WarpRange": [0,8], "SramRange": [0,0], "TaskGroups": [ - {"TaskId":0,"TaskRange":[0,2048],"Granularity":4} + {"TaskId":0,"TaskRange":[0,2048],"Granularity":7} ] } ] @@ -1062,10 +1116,10 @@ "ResourceGroups": [ { "ProcessorRange": [0,304], - "WarpRange": [0,4], + "WarpRange": [0,8], "SramRange": [0,0], "TaskGroups": [ - {"TaskId":4,"TaskRange":[0,2048],"Granularity":4} + {"TaskId":4,"TaskRange":[0,2048],"Granularity":7} ] } ] @@ -1114,10 +1168,10 @@ "ResourceGroups": [ { "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,24672], + "WarpRange": [0,8], + "SramRange": [0,49344], "TaskGroups": [ - {"TaskId":15,"TaskRange":[0,4096],"Granularity":1} + {"TaskId":15,"TaskRange":[0,4096],"Granularity":2} ] } ] @@ -1127,10 
+1181,10 @@ "ResourceGroups": [ { "ProcessorRange": [0,304], - "WarpRange": [0,4], + "WarpRange": [0,8], "SramRange": [0,0], "TaskGroups": [ - {"TaskId":17,"TaskRange":[0,65536],"Granularity":4} + {"TaskId":17,"TaskRange":[0,65536],"Granularity":8} ] } ] @@ -1166,10 +1220,10 @@ "ResourceGroups": [ { "ProcessorRange": [0,304], - "WarpRange": [0,4], + "WarpRange": [0,8], "SramRange": [0,0], "TaskGroups": [ - {"TaskId":28,"TaskRange":[0,2048],"Granularity":4} + {"TaskId":28,"TaskRange":[0,2048],"Granularity":7} ] } ] @@ -1192,10 +1246,10 @@ "ResourceGroups": [ { "ProcessorRange": [0,304], - "WarpRange": [0,4], + "WarpRange": [0,8], "SramRange": [0,0], "TaskGroups": [ - {"TaskId":30,"TaskRange":[0,2048],"Granularity":4} + {"TaskId":30,"TaskRange":[0,2048],"Granularity":7} ] } ] @@ -1205,8 +1259,8 @@ "ResourceGroups": [ { "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,24672], + "WarpRange": [0,8], + "SramRange": [0,49344], "TaskGroups": [ {"TaskId":33,"TaskRange":[0,688],"Granularity":1} ] @@ -1218,10 +1272,11 @@ "ResourceGroups": [ { "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,24672], + "WarpRange": [0,8], + "SramRange": [0,49344], "TaskGroups": [ - {"TaskId":36,"TaskRange":[0,688],"Granularity":1} + {"TaskId":36,"TaskRange":[0,602],"Granularity":2}, + {"TaskId":37,"TaskRange":[0,172],"Granularity":1} ] } ] From 866112de65a6fd5d3c3d89d80cdc53ff27c8c36a Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 9 Jul 2024 01:07:21 +0000 Subject: [PATCH 27/61] optimize --- ark/include/kernels/common/sync.h | 3 + ark/include/kernels/reduce.h | 41 +++++++-- examples/llama/plan_llama2_7b_b1_s2048.json | 94 +-------------------- 3 files changed, 36 insertions(+), 102 deletions(-) diff --git a/ark/include/kernels/common/sync.h b/ark/include/kernels/common/sync.h index f47625600..456a32eb7 100644 --- a/ark/include/kernels/common/sync.h +++ b/ark/include/kernels/common/sync.h @@ -106,6 +106,9 @@ DEVICE void sync_warps() { 
static_assert(Arch::ThreadsPerWarp == 64, ""); if constexpr (NumWarps == 1) { __builtin_amdgcn_wave_barrier(); + } else if constexpr (NumWarps == ARK_WARPS_PER_BLOCK) { + // asm volatile("s_waitcnt lgkmcnt(0) \n s_barrier " ::); + __syncthreads(); } else { static_assert(ARK_SMEM_RESERVED_BYTES >= sizeof(sync::WarpGroupState), ""); diff --git a/ark/include/kernels/reduce.h b/ark/include/kernels/reduce.h index 3d0b4e008..2dd79d2c3 100644 --- a/ark/include/kernels/reduce.h +++ b/ark/include/kernels/reduce.h @@ -355,8 +355,15 @@ struct WwiseReduce { int smem_per_warp) { using ShapeChecker = ReduceShapeChecker; + constexpr int InConsecBytes = sizeof(DataType) * InShape::W; constexpr int NelemPerThread = - DefaultNelemPerThread::value; + (InConsecBytes % 16 == 0) + ? 16 / sizeof(DataType) + : (InConsecBytes % 8 == 0) + ? 8 / sizeof(DataType) + : (InConsecBytes % 4 == 0) + ? 4 / sizeof(DataType) + : (InConsecBytes % 2 == 0) ? 2 / sizeof(DataType) : 1; constexpr int NonReduceDimLength = UnitOutDims::N * UnitOutDims::C * UnitOutDims::H; @@ -397,22 +404,38 @@ struct WwiseReduce { &in[idx_in]); } - DataType finalSum; - ReduceType::template identity<1>(&finalSum); + static_assert(math::is_pow2::value, + "NelemPerThread must be power of 2"); + if constexpr (NelemPerThread > 8) { #pragma unroll - for (int i = 0; i < NelemPerThread; ++i) { - ReduceType::template reduce<1>(&finalSum, &finalSum, &reduced[i]); + for (int i = 8; i < NelemPerThread; i += 8) { + ReduceType::template reduce<8>(&reduced[0], &reduced[0], &reduced[i]); + } + ReduceType::template reduce<4>(&reduced[0], &reduced[0], &reduced[4]); + ReduceType::template reduce<2>(&reduced[0], &reduced[0], &reduced[2]); + ReduceType::template reduce<1>(&reduced[0], &reduced[0], &reduced[1]); + } else if constexpr (NelemPerThread == 8) { + ReduceType::template reduce<4>(&reduced[0], &reduced[0], &reduced[4]); + ReduceType::template reduce<2>(&reduced[0], &reduced[0], &reduced[2]); + ReduceType::template reduce<1>(&reduced[0], 
&reduced[0], &reduced[1]); + } else if constexpr (NelemPerThread == 4) { + ReduceType::template reduce<2>(&reduced[0], &reduced[0], &reduced[2]); + ReduceType::template reduce<1>(&reduced[0], &reduced[0], &reduced[1]); + } else if constexpr (NelemPerThread == 2) { + ReduceType::template reduce<1>(&reduced[0], &reduced[0], &reduced[1]); } - UnitOp::sync_threads(); + if constexpr (InShape::W % ThreadsPerRow != 0) { + UnitOp::sync_threads(); + } // final reduction on shared memory using warp shuffle. - finalSum = warpsReduce( - finalSum, tid, smem_per_warp); + reduced[0] = warpsReduce( + reduced[0], tid, smem_per_warp); // write the result to output. if (tid % ThreadsPerRow == 0) { - ReduceType::template postReduce<1>(&out[idx_out], &finalSum, + ReduceType::template postReduce<1>(&out[idx_out], &reduced[0], InShape::W); } diff --git a/examples/llama/plan_llama2_7b_b1_s2048.json b/examples/llama/plan_llama2_7b_b1_s2048.json index d5c9fe552..b0bc757dc 100644 --- a/examples/llama/plan_llama2_7b_b1_s2048.json +++ b/examples/llama/plan_llama2_7b_b1_s2048.json @@ -230,29 +230,6 @@ "Tile": [256,1,128], "NumTasks": 256 } - }, - { - "Type": "Transpose", - "Name": "transpose", - "IsVirtual": false, - "ReadTensors": [ - {"Id":34,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":20,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":37,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":22,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":38,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":22,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "Permutation": {"DIMS":[0,2,1,3]} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 0, 
- "Tile": [256,128], - "NumTasks": 256 - } } ] }, @@ -307,29 +284,6 @@ "Tile": [256,128], "NumTasks": 256 } - }, - { - "Type": "Transpose", - "Name": "transpose_2", - "IsVirtual": false, - "ReadTensors": [ - {"Id":36,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":21,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":41,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":42,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "Permutation": {"DIMS":[0,2,1,3]} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [256,128], - "NumTasks": 256 - } } ] }, @@ -362,29 +316,6 @@ "TileShapeMNK": [256,128,32], "NumTasks": 256 } - }, - { - "Type": "Transpose", - "Name": "transpose_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":32,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":39,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":23,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":40,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":23,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "Permutation": {"DIMS":[0,2,1,3]} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [256,128], - 
"NumTasks": 256 - } } ] }, @@ -592,29 +523,6 @@ "TileShapeMNK": [256,128,32], "NumTasks": 256 } - }, - { - "Type": "Transpose", - "Name": "transpose_3", - "IsVirtual": false, - "ReadTensors": [ - {"Id":55,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":29,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":56,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":30,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":57,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":30,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "Permutation": {"DIMS":[0,2,1,3]} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [256,1,128], - "NumTasks": 256 - } } ] }, @@ -1184,7 +1092,7 @@ "WarpRange": [0,8], "SramRange": [0,0], "TaskGroups": [ - {"TaskId":17,"TaskRange":[0,65536],"Granularity":8} + {"TaskId":17,"TaskRange":[0,65536],"Granularity":1} ] } ] From 68e787ae377c282c9d117e6650eb112a34c54a9c Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 9 Jul 2024 20:51:44 +0000 Subject: [PATCH 28/61] fix bf16 matmul --- ark/ops/ops_matmul.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ark/ops/ops_matmul.cpp b/ark/ops/ops_matmul.cpp index b4553a4ed..a24b95d72 100644 --- a/ark/ops/ops_matmul.cpp +++ b/ark/ops/ops_matmul.cpp @@ -223,7 +223,7 @@ static const Json get_default_config(const ArchRef arch, {"TileShapeMNK", {tm, tn, 32}}}; } else if (arch->belongs_to(ARCH_ROCM_942) && data_type == BF16.ref()) { return {{"NumWarps", 4}, - {"SramBytes", 24672}, + {"SramBytes", 24624}, {"TileShapeMNK", {tm, tn, 32}}}; } ERR(InternalError, "Unexpected error"); From b18bdb2e66d30c34b21657e15bb6cf491f108544 Mon Sep 
17 00:00:00 2001 From: Changho Hwang Date: Wed, 10 Jul 2024 23:44:07 +0000 Subject: [PATCH 29/61] Enhance executor interfaces --- ark/api/executor.cpp | 295 +++++++++++++++++++---------- ark/gpu/gpu_event.cpp | 11 +- ark/gpu/gpu_event.h | 4 +- ark/gpu/gpu_kernel.cpp | 2 +- ark/gpu/gpu_kernel.h | 2 +- ark/gpu/gpu_manager.cpp | 18 +- ark/gpu/gpu_manager.h | 4 +- ark/include/ark/executor.hpp | 46 +++-- ark/model/model_json.cpp | 11 +- ark/model/model_json.hpp | 2 +- ark/model/model_op.cpp | 5 +- ark/ops/ops_all_reduce_test.cpp | 7 +- ark/ops/ops_communication_test.cpp | 8 +- ark/ops/ops_embedding_test.cpp | 6 +- ark/ops/ops_test_common.cpp | 20 +- ark/ops/ops_test_common.hpp | 15 +- cmake/Utils.cmake | 2 +- python/ark/runtime.py | 4 +- python/ark/tensor.py | 10 +- python/executor_py.cpp | 59 +++++- 20 files changed, 344 insertions(+), 187 deletions(-) diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp index 14625161f..2f50a4280 100644 --- a/ark/api/executor.cpp +++ b/ark/api/executor.cpp @@ -140,10 +140,17 @@ static size_t tensor_stride_bytes(const Json &tensor) { class Executor::Impl { public: - Impl(int rank, int world_size, int gpu_id, const std::string &name, - const std::string &plan); + Impl(int device_id, Stream stream, const std::string &name); ~Impl() = default; + void init(const PlanJson& plan); + + int device_id() const { return device_id_; } + + Stream stream() const { return reinterpret_cast(stream_raw_); } + + std::string plan() const { return plan_json_.dump_pretty(); } + void compile(); void launch(int64_t max_spin_count); void run(int iter); @@ -151,9 +158,12 @@ class Executor::Impl { float stop(int64_t max_spin_count); void barrier(); - void tensor_read(const Tensor tensor, void *data, size_t bytes) const; - void tensor_write(const Tensor tensor, const void *data, - size_t bytes) const; + uintptr_t tensor_address(const Tensor tensor) const; + + void tensor_read(const Tensor tensor, void *data, size_t bytes, + Stream stream, bool is_d2d) const; 
+ void tensor_write(const Tensor tensor, const void *data, size_t bytes, + Stream stream, bool is_d2d) const; private: void init_communicator(); @@ -162,14 +172,18 @@ class Executor::Impl { void init_channels(const std::set &remote_ranks); protected: - const int rank_; - const int world_size_; - int gpu_id_; + int device_id_; + std::string name_; + gpuStream stream_raw_; + + int rank_; + int world_size_; bool is_launched_ = false; bool is_recording_ = false; float elapsed_msec_ = -1; + PlanJson plan_json_; std::map buffer_id_to_offset_; size_t total_bytes_; std::shared_ptr codegen_; @@ -177,8 +191,7 @@ class Executor::Impl { std::shared_ptr timer_end_; std::shared_ptr buffer_; std::shared_ptr flag_; - std::shared_ptr main_stream_; - std::shared_ptr copy_stream_; + std::shared_ptr stream_; std::shared_ptr kernel_; // For communication @@ -190,30 +203,35 @@ class Executor::Impl { rank_to_sm_channels_; }; -Executor::Impl::Impl(int rank, int world_size, int gpu_id, - const std::string &name, const std::string &plan) - : rank_(rank), world_size_(world_size), gpu_id_(gpu_id) { - if (rank < 0 || rank >= world_size) { - ERR(InvalidUsageError, "Invalid rank ", rank, " with world size ", - world_size); +Executor::Impl::Impl(int device_id, Stream stream, const std::string &name) + : device_id_(device_id), name_(name) { + if (device_id < 0) { + ERR(InvalidUsageError, "Invalid device ID ", device_id); } - if (gpu_id < 0) { - ERR(InvalidUsageError, "Invalid GPU ID ", gpu_id); + if (stream) { + stream_raw_ = reinterpret_cast(stream); + } else { + stream_ = GpuManager::get_instance(device_id_)->create_stream(); + stream_raw_ = stream_->get(); + } +} + +void Executor::Impl::init(const PlanJson &plan_json) { + plan_json_ = plan_json; + rank_ = plan_json_["Rank"].get(); + world_size_ = plan_json_["WorldSize"].get(); + + if (rank_ < 0 || rank_ >= world_size_) { + ERR(InvalidUsageError, "Invalid rank ", rank_, " with world size ", + world_size_); } if (world_size_ > 1) { 
init_communicator(); } - Json plan_json; - auto &plan_path = get_env().enforce_plan_path; - if (!plan_path.empty()) { - LOG(INFO, "Enforce executor plan path: ", plan_path); - plan_json = Json::parse(read_file(plan_path)); - } else { - plan_json = Json::parse(plan); - } + auto gpu_manager = GpuManager::get_instance(device_id_); - buffer_id_to_offset_ = init_buffers(plan_json); + buffer_id_to_offset_ = init_buffers(plan_json_); std::string buffer_id_to_offset_str; for (const auto &kv : buffer_id_to_offset_) { @@ -221,17 +239,14 @@ Executor::Impl::Impl(int rank, int world_size, int gpu_id, std::to_string(kv.first) + ": " + std::to_string(kv.second) + ", "; } - codegen_ = - std::make_shared(plan_json, buffer_id_to_offset_, name); + codegen_ = std::make_shared(plan_json_, buffer_id_to_offset_, + name_); - auto gpu_manager = GpuManager::get_instance(gpu_id_); timer_begin_ = gpu_manager->create_event(); timer_end_ = gpu_manager->create_event(); buffer_ = gpu_manager->malloc(total_bytes_, 65536); flag_ = gpu_manager->malloc_host( sizeof(int), gpuHostAllocMapped | gpuHostAllocWriteCombined); - main_stream_ = gpu_manager->create_stream(); - copy_stream_ = gpu_manager->create_stream(); int threads_per_block = static_cast( codegen_->num_warps_per_proc() * gpu_manager->info().threads_per_warp); @@ -241,13 +256,13 @@ Executor::Impl::Impl(int rank, int world_size, int gpu_id, static_cast(gpu_manager->info().smem_block_total); if (world_size_ > 1) { - auto remote_ranks = init_remote_ranks(plan_json); + auto remote_ranks = init_remote_ranks(plan_json_); init_channels(remote_ranks); } kernel_ = std::shared_ptr(new GpuKernel( - gpu_id_, codegen_->code(), {threads_per_block, 1, 1}, {num_sm, 1, 1}, - std::max(smem_block_total, size_t(4)), name, + device_id_, codegen_->code(), {threads_per_block, 1, 1}, {num_sm, 1, 1}, + std::max(smem_block_total, size_t(4)), name_, {std::pair{buffer_->ref(), sizeof(buffer_->ref())}, std::pair{flag, sizeof(flag)}})); } @@ -509,7 +524,7 @@ void 
Executor::Impl::init_channels(const std::set &remote_ranks) { mscclpp::TransportFlags all_transports = mscclpp::Transport::CudaIpc | mscclpp::Transport::Ethernet; if (!get_env().disable_ib) { - all_transports |= IBs[gpu_id_]; + all_transports |= IBs[device_id_]; } mscclpp::RegisteredMemory regmem = comm_->registerMemory(buffer_->ref(), buffer_->bytes(), all_transports); @@ -530,12 +545,12 @@ void Executor::Impl::init_channels(const std::set &remote_ranks) { if (remote_node == this_node) { add_connection(remote_rank, mscclpp::Transport::CudaIpc); if (!get_env().disable_ib) { - add_connection(remote_rank, IBs[gpu_id_]); + add_connection(remote_rank, IBs[device_id_]); } } else { add_connection(remote_rank, get_env().disable_ib ? mscclpp::Transport::Ethernet - : IBs[gpu_id_]); + : IBs[device_id_]); } comm_->sendMemoryOnSetup(regmem, remote_rank, 0); rank_to_remote_regmem_future[remote_rank] = @@ -623,22 +638,22 @@ void Executor::Impl::launch(int64_t max_spin_count) { sm_handles[i] = it2->second[0]->deviceHandle(); } } - GLOG(gpuSetDevice(gpu_id_)); + GLOG(gpuSetDevice(device_id_)); GLOG(gpuMemcpyAsync( proxy_chan_addr, proxy_handles.data(), proxy_handles.size() * sizeof(mscclpp::SimpleProxyChannel::DeviceHandle), - gpuMemcpyHostToDevice, copy_stream_->get())); + gpuMemcpyHostToDevice, stream_raw_)); GLOG(gpuMemcpyAsync( proxy_secondary_chan_addr, proxy_secondary_handles.data(), proxy_secondary_handles.size() * sizeof(mscclpp::SimpleProxyChannel::DeviceHandle), - gpuMemcpyHostToDevice, copy_stream_->get())); + gpuMemcpyHostToDevice, stream_raw_)); GLOG(gpuMemcpyAsync( sm_chan_addr, sm_handles.data(), sm_handles.size() * sizeof(mscclpp::SmChannel::DeviceHandle), - gpuMemcpyHostToDevice, copy_stream_->get())); - copy_stream_->sync(); + gpuMemcpyHostToDevice, stream_raw_)); + GLOG(gpuStreamSynchronize(stream_raw_)); } elapsed_msec_ = -1; @@ -648,7 +663,7 @@ void Executor::Impl::launch(int64_t max_spin_count) { LOG(WARN, "Ignore launching twice."); return; } - 
timer_begin_->record(main_stream_); + timer_begin_->record(stream_raw_); if (world_size_ > 1) { proxy_service_->startProxy(); @@ -656,8 +671,8 @@ void Executor::Impl::launch(int64_t max_spin_count) { // Initialize loop flags. atomicStoreRelaxed(flag_->ref(), 0); - kernel_->launch(main_stream_); - timer_end_->record(main_stream_); + kernel_->launch(stream_raw_); + timer_end_->record(stream_raw_); is_recording_ = true; is_launched_ = true; } @@ -677,7 +692,7 @@ void Executor::Impl::wait(int64_t max_spin_count) { continue; } // Check if the kernel encountered an error. - gpuError res = main_stream_->query(); + gpuError res = gpuStreamQuery(stream_raw_); if (res == gpuSuccess) { if (atomicLoadRelaxed(flag_->ref()) > 0) { LOG(WARN, "Stream is finished but the loop flag is still set."); @@ -699,7 +714,7 @@ void Executor::Impl::wait(int64_t max_spin_count) { float Executor::Impl::stop(int64_t max_spin_count) { this->wait(max_spin_count); atomicStoreRelaxed(flag_->ref(), -1); - main_stream_->sync(); + GLOG(gpuStreamSynchronize(stream_raw_)); if (is_recording_) { elapsed_msec_ = timer_end_->elapsed_msec(*timer_begin_); is_recording_ = false; @@ -717,71 +732,140 @@ void Executor::Impl::barrier() { } } -void Executor::Impl::tensor_read(const Tensor tensor, void *data, - size_t bytes) const { - GLOG(gpuSetDevice(gpu_id_)); +uintptr_t Executor::Impl::tensor_address(const Tensor tensor) const { + size_t buffer_id = tensor.ref()->buffer()->id(); + if (buffer_id_to_offset_.find(buffer_id) == buffer_id_to_offset_.end()) { + ERR(NotFoundError, "Invalid buffer ID: ", buffer_id); + } + size_t offset = buffer_id_to_offset_.at(buffer_id); + return reinterpret_cast(buffer_->ref(offset)); +} + +void Executor::Impl::tensor_read(const Tensor tensor, void *data, size_t bytes, + Stream stream, bool is_d2d) const { + GLOG(gpuSetDevice(device_id_)); + std::shared_ptr copy_stream; + gpuStream copy_stream_raw; + if (stream) { + copy_stream_raw = reinterpret_cast(stream); + if ((stream == 
stream_raw_) && is_launched_) { + LOG(WARN, + "Reading from a tensor in the same stream of the kernel " + "may cause a deadlock."); + } + } else { + copy_stream = GpuManager::get_instance(device_id_)->create_stream(); + copy_stream_raw = copy_stream->get(); + } size_t tensor_data_bytes = tensor.shape().nelems() * tensor.data_type().bytes(); - if (bytes < tensor_data_bytes) { - ERR(InvalidUsageError, "Data buffer (", bytes, - ") is smaller than the tensor data (", tensor_data_bytes, ")."); + if (bytes != tensor_data_bytes) { + ERR(InvalidUsageError, "Destination bytes (", bytes, + ") mismatches the tensor data bytes (", tensor_data_bytes, ")."); } - size_t tensor_bytes = - tensor.strides().nelems() * tensor.data_type().bytes(); - void *src = - buffer_->ref(buffer_id_to_offset_.at(tensor.ref()->buffer()->id())); + auto kind = (is_d2d) ? gpuMemcpyDeviceToDevice : gpuMemcpyDeviceToHost; + void *src = reinterpret_cast(tensor_address(tensor)); if (tensor.strides() == tensor.shape()) { - GLOG(gpuMemcpyAsync(data, src, bytes, gpuMemcpyDeviceToHost, - copy_stream_->get())); - copy_stream_->sync(); + GLOG(gpuMemcpyAsync(data, src, bytes, kind, copy_stream_raw)); } else { + size_t tensor_bytes = + tensor.strides().nelems() * tensor.data_type().bytes(); std::vector tensor_host(tensor_bytes); GLOG(gpuMemcpyAsync(tensor_host.data(), src, tensor_bytes, - gpuMemcpyDeviceToHost, copy_stream_->get())); - copy_stream_->sync(); - tensor_to_data(tensor_host.data(), static_cast(data), - tensor.shape(), tensor.strides(), tensor.offsets(), + gpuMemcpyDeviceToHost, copy_stream_raw)); + GLOG(gpuStreamSynchronize(copy_stream_raw)); + if (!is_d2d) { + tensor_to_data(tensor_host.data(), static_cast(data), + tensor.shape(), tensor.strides(), tensor.offsets(), + tensor.data_type().bytes()); + return; + } + // TODO: convert data layout on the device directly + std::vector data_host(bytes); + tensor_to_data(tensor_host.data(), data_host.data(), tensor.shape(), + tensor.strides(), tensor.offsets(), 
tensor.data_type().bytes()); + GLOG(gpuMemcpyAsync(data, data_host.data(), bytes, + gpuMemcpyHostToDevice, copy_stream_raw)); } + GLOG(gpuStreamSynchronize(copy_stream_raw)); } void Executor::Impl::tensor_write(const Tensor tensor, const void *data, - size_t bytes) const { - GLOG(gpuSetDevice(gpu_id_)); + size_t bytes, Stream stream, + bool is_d2d) const { + GLOG(gpuSetDevice(device_id_)); + std::shared_ptr copy_stream; + gpuStream copy_stream_raw; + if (stream) { + copy_stream_raw = reinterpret_cast(stream); + if ((stream == stream_raw_) && is_launched_) { + LOG(WARN, + "Writing to a tensor in the same stream of the kernel " + "may cause a deadlock."); + } + } else { + copy_stream = GpuManager::get_instance(device_id_)->create_stream(); + copy_stream_raw = copy_stream->get(); + } size_t tensor_data_bytes = tensor.shape().nelems() * tensor.data_type().bytes(); - if (bytes < tensor_data_bytes) { - ERR(InvalidUsageError, "Data buffer (", bytes, - ") is smaller than the tensor data (", tensor_data_bytes, ")."); + if (bytes != tensor_data_bytes) { + ERR(InvalidUsageError, "Source bytes (", bytes, + ") mismatches the tensor data bytes (", tensor_data_bytes, ")."); } size_t tensor_bytes = tensor.strides().nelems() * tensor.data_type().bytes(); - void *dst = - buffer_->ref(buffer_id_to_offset_.at(tensor.ref()->buffer()->id())); + auto kind = (is_d2d) ? 
gpuMemcpyDeviceToDevice : gpuMemcpyHostToDevice; + void *dst = reinterpret_cast(tensor_address(tensor)); if (tensor.strides() == tensor.shape()) { - GLOG(gpuMemcpyAsync(dst, data, tensor_bytes, gpuMemcpyHostToDevice, - copy_stream_->get())); + GLOG(gpuMemcpyAsync(dst, data, tensor_bytes, kind, copy_stream_raw)); } else { std::vector tensor_host(tensor_bytes); - GLOG(gpuMemcpyAsync(tensor_host.data(), dst, tensor_bytes, - gpuMemcpyDeviceToHost, copy_stream_->get())); - copy_stream_->sync(); - data_to_tensor(tensor_host.data(), static_cast(data), - tensor.shape(), tensor.strides(), tensor.offsets(), - tensor.data_type().bytes()); + if (!is_d2d) { + GLOG(gpuMemcpyAsync(tensor_host.data(), dst, tensor_bytes, + gpuMemcpyDeviceToHost, copy_stream_raw)); + GLOG(gpuStreamSynchronize(copy_stream_raw)); + data_to_tensor(tensor_host.data(), + static_cast(data), tensor.shape(), + tensor.strides(), tensor.offsets(), + tensor.data_type().bytes()); + } else { + // TODO: convert data layout on the device directly + std::vector tmp(bytes); + GLOG(gpuMemcpyAsync(tmp.data(), data, bytes, gpuMemcpyDeviceToHost, + copy_stream_raw)); + GLOG(gpuStreamSynchronize(copy_stream_raw)); + data_to_tensor(tensor_host.data(), tmp.data(), tensor.shape(), + tensor.strides(), tensor.offsets(), + tensor.data_type().bytes()); + } GLOG(gpuMemcpyAsync(dst, tensor_host.data(), tensor_bytes, - gpuMemcpyHostToDevice, copy_stream_->get())); + gpuMemcpyHostToDevice, copy_stream_raw)); } - copy_stream_->sync(); + GLOG(gpuStreamSynchronize(copy_stream_raw)); } -Executor::Executor(int rank, int world_size, int gpu_id, - const std::string &name, const std::string &plan) - : impl_(std::make_unique(rank, world_size, gpu_id, name, - plan)) {} +Executor::Executor(int device_id, Stream stream, const std::string &name, + const std::string &plan) + : impl_(std::make_unique(device_id, stream, name)) { + auto &plan_path = get_env().enforce_plan_path; + if (!plan_path.empty()) { + LOG(INFO, "Enforce executor plan path: ", 
plan_path); + impl_->init(Json::parse(read_file(plan_path))); + } else if (!plan.empty()) { + impl_->init(Json::parse(plan)); + } +} Executor::~Executor() = default; +int Executor::device_id() const { return impl_->device_id(); } + +Stream Executor::stream() const { return impl_->stream(); } + +std::string Executor::plan() const { return impl_->plan(); } + void Executor::compile() { impl_->compile(); } void Executor::launch(int64_t max_spin_count) { impl_->launch(max_spin_count); } @@ -800,25 +884,32 @@ void Executor::destroy() { impl_.reset(nullptr); } bool Executor::destroyed() const { return impl_.get() == nullptr; } -void Executor::tensor_read(const Tensor tensor, void *data, - size_t bytes) const { - impl_->tensor_read(tensor, data, bytes); +uintptr_t Executor::tensor_address(const Tensor tensor) const { + return impl_->tensor_address(tensor); } -void Executor::tensor_write(const Tensor tensor, const void *data, - size_t bytes) const { - impl_->tensor_write(tensor, data, bytes); +void Executor::tensor_read(const Tensor tensor, void *data, size_t bytes, + Stream stream, bool is_d2d) const { + impl_->tensor_read(tensor, data, bytes, stream, is_d2d); } -DefaultExecutor::DefaultExecutor(const Model &model, int gpu_id, - const std::string &name) - : Executor( - model.rank(), model.world_size(), - (gpu_id < 0) ? (model.rank() % get_env().num_ranks_per_host) : gpu_id, - name, - DefaultPlanner(model, (gpu_id < 0) ? (model.rank() % - get_env().num_ranks_per_host) - : gpu_id) - .plan()) {} +void Executor::tensor_write(const Tensor tensor, const void *data, size_t bytes, + Stream stream, bool is_d2d) const { + impl_->tensor_write(tensor, data, bytes, stream, is_d2d); +} + +DefaultExecutor::DefaultExecutor( + const Model &model, int device_id, Stream stream, + const std::vector &config_rules, + const std::string &name) + : Executor((device_id < 0) ? 
(model.rank() % get_env().num_ranks_per_host) + : device_id, + stream, name, "") { + DefaultPlanner planner(model, impl_->device_id()); + for (const auto &rule : config_rules) { + planner.install_config_rule(rule); + } + impl_->init(Json::parse(planner.plan())); +} } // namespace ark diff --git a/ark/gpu/gpu_event.cpp b/ark/gpu/gpu_event.cpp index 93ec3fd52..cbc45d9a6 100644 --- a/ark/gpu/gpu_event.cpp +++ b/ark/gpu/gpu_event.cpp @@ -3,7 +3,6 @@ #include "gpu/gpu_event.h" -#include "gpu/gpu.h" #include "gpu/gpu_logging.h" #include "gpu/gpu_manager.h" @@ -15,7 +14,7 @@ class GpuEvent::Impl { Impl(const Impl&) = delete; Impl& operator=(const Impl&) = delete; - void record(std::shared_ptr stream); + void record(gpuStream stream); float elapsed_msec(const GpuEvent& other) const; private: @@ -32,8 +31,8 @@ GpuEvent::Impl::Impl(bool disable_timing) { GpuEvent::Impl::~Impl() { GLOG(gpuEventDestroy(event_)); } -void GpuEvent::Impl::record(std::shared_ptr stream) { - GLOG(gpuEventRecord(event_, stream->get())); +void GpuEvent::Impl::record(gpuStream stream) { + GLOG(gpuEventRecord(event_, stream)); } float GpuEvent::Impl::elapsed_msec(const GpuEvent& other) const { @@ -45,9 +44,7 @@ float GpuEvent::Impl::elapsed_msec(const GpuEvent& other) const { GpuEvent::GpuEvent(bool disable_timing) : pimpl_(std::make_shared(disable_timing)) {} -void GpuEvent::record(std::shared_ptr stream) { - pimpl_->record(stream); -} +void GpuEvent::record(gpuStream stream) { pimpl_->record(stream); } float GpuEvent::elapsed_msec(const GpuEvent& other) const { return pimpl_->elapsed_msec(other); diff --git a/ark/gpu/gpu_event.h b/ark/gpu/gpu_event.h index 4599ecaa4..081f0203b 100644 --- a/ark/gpu/gpu_event.h +++ b/ark/gpu/gpu_event.h @@ -6,6 +6,8 @@ #include +#include "gpu/gpu.h" + namespace ark { class GpuStream; @@ -17,7 +19,7 @@ class GpuEvent { GpuEvent(const GpuEvent &) = delete; GpuEvent &operator=(const GpuEvent &) = delete; - void record(std::shared_ptr stream); + void record(gpuStream 
stream); float elapsed_msec(const GpuEvent &other) const; protected: diff --git a/ark/gpu/gpu_kernel.cpp b/ark/gpu/gpu_kernel.cpp index 44ff43a1d..46f467f51 100644 --- a/ark/gpu/gpu_kernel.cpp +++ b/ark/gpu/gpu_kernel.cpp @@ -68,7 +68,7 @@ void GpuKernel::compile() { dynamic_smem_size_bytes)); } -void GpuKernel::launch(std::shared_ptr stream) { +void GpuKernel::launch(gpuStream stream) { if (!this->is_compiled()) { ERR(InvalidUsageError, "Kernel is not compiled yet."); } diff --git a/ark/gpu/gpu_kernel.h b/ark/gpu/gpu_kernel.h index c3b60aec4..b3be79071 100644 --- a/ark/gpu/gpu_kernel.h +++ b/ark/gpu/gpu_kernel.h @@ -27,7 +27,7 @@ class GpuKernel { const std::string& kernel_name, std::initializer_list> args = {}); void compile(); - void launch(std::shared_ptr stream); + void launch(gpuStream stream); gpuDeviceptr get_global(const std::string& name, bool ignore_not_found = false) const; diff --git a/ark/gpu/gpu_manager.cpp b/ark/gpu/gpu_manager.cpp index 3a6d0a066..fc841fa32 100644 --- a/ark/gpu/gpu_manager.cpp +++ b/ark/gpu/gpu_manager.cpp @@ -20,11 +20,10 @@ class GpuManager::Impl { int gpu_id_; GpuManager::Info info_; - std::shared_ptr main_stream_; void launch(gpuFunction kernel, const std::array &grid_dim, const std::array &block_dim, int smem_bytes, - std::shared_ptr stream, void **params, void **extra); + gpuStream stream, void **params, void **extra); }; GpuManager::Impl::Impl(int gpu_id) : gpu_id_(gpu_id) { @@ -76,11 +75,11 @@ GpuManager::Impl::Impl(int gpu_id) : gpu_id_(gpu_id) { void GpuManager::Impl::launch(gpuFunction kernel, const std::array &grid_dim, const std::array &block_dim, - int smem_bytes, std::shared_ptr stream, - void **params, void **extra) { + int smem_bytes, gpuStream stream, void **params, + void **extra) { GLOG_DRV(gpuModuleLaunchKernel( kernel, grid_dim[0], grid_dim[1], grid_dim[2], block_dim[0], - block_dim[1], block_dim[2], smem_bytes, stream->get(), params, extra)); + block_dim[1], block_dim[2], smem_bytes, stream, params, extra)); 
} std::shared_ptr GpuManager::get_instance(int gpu_id) { @@ -102,9 +101,7 @@ std::shared_ptr GpuManager::get_instance(int gpu_id) { } } -GpuManager::GpuManager(int gpu_id) : pimpl_(std::make_shared(gpu_id)) { - this->pimpl_->main_stream_ = std::shared_ptr(new GpuStream()); -} +GpuManager::GpuManager(int gpu_id) : pimpl_(std::make_shared(gpu_id)) {} std::shared_ptr GpuManager::malloc(size_t bytes, size_t align, bool expose) { @@ -126,8 +123,6 @@ std::shared_ptr GpuManager::create_stream() const { return std::shared_ptr(new GpuStream()); } -int GpuManager::get_gpu_id() const { return pimpl_->gpu_id_; } - const GpuManager::Info &GpuManager::info() const { return pimpl_->info_; } void GpuManager::set_current() const { GLOG(gpuSetDevice(pimpl_->gpu_id_)); } @@ -135,8 +130,7 @@ void GpuManager::set_current() const { GLOG(gpuSetDevice(pimpl_->gpu_id_)); } void GpuManager::launch(gpuFunction function, const std::array &grid_dim, const std::array &block_dim, int smem_bytes, - std::shared_ptr stream, void **params, - void **extra) const { + gpuStream stream, void **params, void **extra) const { this->set_current(); pimpl_->launch(function, grid_dim, block_dim, smem_bytes, stream, params, extra); diff --git a/ark/gpu/gpu_manager.h b/ark/gpu/gpu_manager.h index 05014ac47..93a48cf7b 100644 --- a/ark/gpu/gpu_manager.h +++ b/ark/gpu/gpu_manager.h @@ -30,11 +30,9 @@ class GpuManager { std::shared_ptr create_event(bool disable_timing = false) const; std::shared_ptr create_stream() const; - int get_gpu_id() const; void launch(gpuFunction function, const std::array &grid_dim, const std::array &block_dim, int smem_bytes, - std::shared_ptr stream, void **params, - void **extra) const; + gpuStream stream, void **params, void **extra) const; struct Info; const Info &info() const; diff --git a/ark/include/ark/executor.hpp b/ark/include/ark/executor.hpp index 4682af7d0..75dc81c17 100644 --- a/ark/include/ark/executor.hpp +++ b/ark/include/ark/executor.hpp @@ -5,6 +5,7 @@ #define 
ARK_EXECUTOR_HPP #include +#include #include #include #include @@ -12,15 +13,27 @@ namespace ark { +using Stream = void *; + /// Convenience class for executing a model. class Executor { public: /// Constructor. - Executor(int rank, int world_size, int gpu_id, const std::string &name, + Executor(int device_id, Stream stream, const std::string &name, const std::string &plan); + /// Destructor. ~Executor(); + /// Return the device ID. + int device_id() const; + + /// Return the stream of the executor. + Stream stream() const; + + /// Return the plan string. + std::string plan() const; + /// Compile the model. This must be called before `launch()`. void compile(); @@ -39,30 +52,39 @@ class Executor { /// again. float stop(int64_t max_spin_count = -1); + /// Barrier for all rank executors. void barrier(); + /// Destroy the executor. void destroy(); + /// Return whether the executor is destroyed. bool destroyed() const; + /// Return the raw virtual address of the tensor. + uintptr_t tensor_address(const Tensor tensor) const; + template - void tensor_read(const Tensor tensor, std::vector &data) const { + void tensor_read(const Tensor tensor, std::vector &data, + Stream stream = nullptr) const { tensor_read(tensor, reinterpret_cast(data.data()), - data.size() * sizeof(T)); + data.size() * sizeof(T), stream); } template - void tensor_write(const Tensor tensor, const std::vector &data) const { + void tensor_write(const Tensor tensor, const std::vector &data, + Stream stream = nullptr) const { tensor_write(tensor, reinterpret_cast(data.data()), - data.size() * sizeof(T)); + data.size() * sizeof(T), stream); } - void tensor_read(const Tensor tensor, void *data, size_t bytes) const; + void tensor_read(const Tensor tensor, void *data, size_t bytes, + Stream stream = nullptr, bool is_d2d = false) const; - void tensor_write(const Tensor tensor, const void *data, - size_t bytes) const; + void tensor_write(const Tensor tensor, const void *data, size_t bytes, + Stream stream = 
nullptr, bool is_d2d = false) const; - private: + protected: class Impl; std::unique_ptr impl_; }; @@ -71,8 +93,10 @@ class Model; class DefaultExecutor : public Executor { public: - DefaultExecutor(const Model &model, int gpu_id = -1, - const std::string &name = "DefaultExecutor"); + DefaultExecutor( + const Model &model, int device_id = -1, Stream stream = nullptr, + const std::vector &config_rules = {}, + const std::string &name = "DefaultExecutor"); }; } // namespace ark diff --git a/ark/model/model_json.cpp b/ark/model/model_json.cpp index 0057ef0aa..97ce71967 100644 --- a/ark/model/model_json.cpp +++ b/ark/model/model_json.cpp @@ -272,7 +272,16 @@ static void verify_format_plan(const Json &json) { } } -PlanJson::PlanJson(const Json &json) : Json(json) { verify_format_plan(*this); } +PlanJson::PlanJson(const Json &json) + : Json((json != nullptr) ? json + : Json{{"Rank", 0}, + {"WorldSize", 1}, + {"NumProcessors", 1}, + {"NumWarpsPerProcessor", 1}, + {"TaskInfos", Json::array()}, + {"ProcessorGroups", Json::array()}}) { + verify_format_plan(*this); +} static std::stringstream &dump_pretty_plan(const Json &json, std::stringstream &ss, int indent, diff --git a/ark/model/model_json.hpp b/ark/model/model_json.hpp index cf5fbbce2..e42640a9a 100644 --- a/ark/model/model_json.hpp +++ b/ark/model/model_json.hpp @@ -18,7 +18,7 @@ class ModelJson : public Json { class PlanJson : public Json { public: - PlanJson(const Json &json); + PlanJson(const Json &json = nullptr); std::string dump_pretty(int indent = 0, int indent_step = 2) const; }; diff --git a/ark/model/model_op.cpp b/ark/model/model_op.cpp index 6cdba5d02..b5a0645c8 100644 --- a/ark/model/model_op.cpp +++ b/ark/model/model_op.cpp @@ -202,8 +202,11 @@ std::shared_ptr ModelOp::deserialize(const Json &serialized) { } else if (!serialized.contains("Args")) { ERR(InvalidUsageError, "ModelOp deserialization failed: missing Args"); } + // Run `ModelOpT::from_name` before `construct()` to ensure all operators + // are 
registered. + auto op_type = ModelOpT::from_name(serialized["Type"]); auto ret = model_op_factory()->construct(serialized["Type"]); - ret->type_ = ModelOpT::from_name(serialized["Type"]); + ret->type_ = op_type; ret->name_ = serialized["Name"]; ret->is_virtual_ = serialized["IsVirtual"]; for (const auto &t : serialized["ReadTensors"]) { diff --git a/ark/ops/ops_all_reduce_test.cpp b/ark/ops/ops_all_reduce_test.cpp index 9e2c6f675..030146680 100644 --- a/ark/ops/ops_all_reduce_test.cpp +++ b/ark/ops/ops_all_reduce_test.cpp @@ -91,10 +91,9 @@ void test_all_reduce_internal(ark::DimType nelem) { std::vector ones_vec(ones.shape().nelems(), ark::half_t(1.0f)); - auto result = - ark::op_test("all_reduce", m, {ones}, {output}, - baseline_all_reduce, - {ones_vec.data()}, false, gpu_id, NumGpus); + auto result = ark::op_test( + "all_reduce", m, {ones}, {output}, + baseline_all_reduce, {ones_vec.data()}); UNITTEST_LOG(result); UNITTEST_EQ(result.max_diff[0], 0.0f); return ark::unittest::SUCCESS; diff --git a/ark/ops/ops_communication_test.cpp b/ark/ops/ops_communication_test.cpp index 2b63642e6..f01de9789 100644 --- a/ark/ops/ops_communication_test.cpp +++ b/ark/ops/ops_communication_test.cpp @@ -229,9 +229,7 @@ ark::unittest::State test_communication_send_recv_bidir_sm() { ark::Tensor tns2 = model.identity(tns2_data, {tns}); tns2 = model.recv(tns2_data, remote_gpu_id, tag); - ark::DefaultPlanner planner(model, gpu_id); - planner.install_config_rule(config_rule); - ark::Executor exe(gpu_id, 2, gpu_id, "Executor", planner.plan()); + ark::DefaultExecutor exe(model, gpu_id, nullptr, {config_rule}); exe.compile(); std::vector data(1024); @@ -275,9 +273,7 @@ ark::unittest::State test_communication_send_recv_bidir_sm() { ark::Tensor sum = model.add(tns2, tns_data); - ark::DefaultPlanner planner(model, gpu_id); - planner.install_config_rule(config_rule); - ark::Executor exe(gpu_id, 2, gpu_id, "Executor", planner.plan()); + ark::DefaultExecutor exe(model, gpu_id, nullptr, 
{config_rule}); exe.compile(); std::vector data(1024); diff --git a/ark/ops/ops_embedding_test.cpp b/ark/ops/ops_embedding_test.cpp index 822973106..8cc95abd2 100644 --- a/ark/ops/ops_embedding_test.cpp +++ b/ark/ops/ops_embedding_test.cpp @@ -78,9 +78,9 @@ ark::unittest::State test_embedding() { } else if (std::is_same::value) { type_str = "bf16"; } - auto result = ark::op_test("embedding_" + type_str, m, {ti, tw}, {to}, - baseline_embedding, - {ti_data.data(), tw_data.data()}, true); + auto result = + ark::op_test("embedding_" + type_str, m, {ti, tw}, {to}, + baseline_embedding, {ti_data.data(), tw_data.data()}); UNITTEST_LOG(result); UNITTEST_EQ(result.max_diff[0], 0.0f); return ark::unittest::SUCCESS; diff --git a/ark/ops/ops_test_common.cpp b/ark/ops/ops_test_common.cpp index 50317fba7..60ffc9dc2 100644 --- a/ark/ops/ops_test_common.cpp +++ b/ark/ops/ops_test_common.cpp @@ -31,13 +31,13 @@ std::ostream &operator<<(std::ostream &os, const OpsTestResult &result) { return os; } -OpsTestResult op_test(const std::string &test_name_prefix, const Model &model, - const std::vector &inputs, - const std::vector &outputs, - OpsTestBaseline baseline, - const std::vector &inputs_data, - bool print_on_error, int rank, int world_size) { - DefaultExecutor exe(model); +OpsTestResult op_test( + const std::string &test_name_prefix, const Model &model, + const std::vector &inputs, const std::vector &outputs, + OpsTestBaseline baseline, const std::vector &inputs_data, + const std::vector &config_rules, + bool print_on_error) { + DefaultExecutor exe(model, -1, nullptr, config_rules); exe.compile(); std::vector>> inputs_data_storages; @@ -133,7 +133,8 @@ OpsTestResult op_test(const std::string &test_name_prefix, const Model &model, for (auto t : gt) { gt_ptrs.push_back(t->data()); } - baseline(gt_ptrs, output_shapes, inputs_data_refs, input_shapes, rank); + baseline(gt_ptrs, output_shapes, inputs_data_refs, input_shapes, + model.rank()); std::stringstream test_name; test_name << 
test_name_prefix; @@ -147,6 +148,7 @@ OpsTestResult op_test(const std::string &test_name_prefix, const Model &model, OpsTestResult result; result.test_name = test_name.str(); + result.plan = exe.plan(); // Compare results with the ground truth. for (size_t i = 0; i < outputs.size(); i++) { @@ -187,7 +189,7 @@ OpsTestResult op_test(const std::string &test_name_prefix, const Model &model, GLOG(gpuDeviceSynchronize()); // Throughput test. - if (world_size > 1) { + if (model.world_size() > 1) { // For multi-GPU, we need to make sure that all GPUs run the same // number of iterations. Rather than doing allgather, we just // use a magic number here. diff --git a/ark/ops/ops_test_common.hpp b/ark/ops/ops_test_common.hpp index 01e97dbb1..c5d640f3b 100644 --- a/ark/ops/ops_test_common.hpp +++ b/ark/ops/ops_test_common.hpp @@ -10,6 +10,7 @@ #include "ark/model.hpp" #include "ark/model_ref.hpp" +#include "ark/planner.hpp" #include "ark/random.hpp" #include "bfloat16.h" #include "half.h" @@ -133,6 +134,7 @@ TensorCompareResult tensor_compare(T *ground_truth, T *res, Dims shape, struct OpsTestResult { std::string test_name; + std::string plan; int iter; float msec_per_iter; std::vector mse; @@ -165,13 +167,12 @@ using OpsTestBaseline = std::function &inputs, - const std::vector &outputs, - OpsTestBaseline baseline, - const std::vector &inputs_data = {}, - bool print_on_error = false, int rank = 0, - int world_size = 1); +OpsTestResult op_test( + const std::string &test_name_prefix, const Model &model, + const std::vector &inputs, const std::vector &outputs, + OpsTestBaseline baseline, const std::vector &inputs_data = {}, + const std::vector &config_rules = {}, + bool print_on_error = false); OpsTestGpuMem to_gpu(void *host_ptr, size_t size); diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake index 9bb83fb42..855cb824b 100644 --- a/cmake/Utils.cmake +++ b/cmake/Utils.cmake @@ -14,7 +14,7 @@ if(GIT_CLANG_FORMAT) COMMAND ${GIT_CLANG_FORMAT} --style=file --diff || true ) 
add_custom_target(cpplint-autofix - COMMAND ${GIT_CLANG_FORMAT} --style=file || true + COMMAND ${GIT_CLANG_FORMAT} --style=file --force --extensions cc,cpp,h,hpp,cu,in,hip || true ) else() message(STATUS "git-clang-format not found.") diff --git a/python/ark/runtime.py b/python/ark/runtime.py index 7480ce7da..33db1fb5c 100644 --- a/python/ark/runtime.py +++ b/python/ark/runtime.py @@ -106,6 +106,7 @@ def launch( gpu_id: int = 0, plan: str = "", plan_path: str = "", + stream: int = 0, ): """ Create an executor and schedule the ARK model. The scheduler will generate @@ -130,9 +131,8 @@ def launch( _RuntimeState.executor.destroy() _RuntimeState.executor = Executor( - rank, - world_size, gpu_id, + stream, "ArkRuntime", plan, ) diff --git a/python/ark/tensor.py b/python/ark/tensor.py index 316d18566..d69f2aabc 100644 --- a/python/ark/tensor.py +++ b/python/ark/tensor.py @@ -48,7 +48,9 @@ def dtype(self) -> DataType: """ return DataType.from_ctype(self._tensor.data_type()) - def to_numpy(self, ndarray: np.ndarray = None) -> np.ndarray: + def to_numpy( + self, ndarray: np.ndarray = None, stream: int = 0 + ) -> np.ndarray: """ Copy a tensor from device to host. If `ndarray` is None, a new numpy array will be created. If the tensor is not allocated, @@ -68,10 +70,10 @@ def to_numpy(self, ndarray: np.ndarray = None) -> np.ndarray: raise ValueError("ndarray dtype does not match the tensor") elif ndarray.nbytes != self.nelems() * self.dtype().element_size(): raise ValueError("ndarray size does not match the tensor") - rt.executor.tensor_read(self._tensor, ndarray) + rt.executor.tensor_read(self._tensor, ndarray, stream) return ndarray - def from_numpy(self, ndarray: np.ndarray) -> "Tensor": + def from_numpy(self, ndarray: np.ndarray, stream: int = 0) -> "Tensor": """ Copies the tensor from a host numpy array to the device. 
""" @@ -86,7 +88,7 @@ def from_numpy(self, ndarray: np.ndarray) -> "Tensor": ndarray = np.ascontiguousarray(ndarray) if ndarray.nbytes != self.nelems() * self.dtype().element_size(): raise ValueError("ndarray size does not match the tensor") - rt.executor.tensor_write(self._tensor, ndarray) + rt.executor.tensor_write(self._tensor, ndarray, stream) return self diff --git a/python/executor_py.cpp b/python/executor_py.cpp index dc2840329..979cb2952 100644 --- a/python/executor_py.cpp +++ b/python/executor_py.cpp @@ -11,25 +11,48 @@ namespace py = pybind11; static void tensor_write(ark::Executor *exe, const ark::Tensor &tensor, - py::buffer host_buffer) { + py::buffer host_buffer, uintptr_t stream) { py::buffer_info info = host_buffer.request(); exe->tensor_write(tensor, reinterpret_cast(info.ptr), - info.size * info.itemsize); + info.size * info.itemsize, + reinterpret_cast(stream), false); +} + +static void tensor_write(ark::Executor *exe, const ark::Tensor &tensor, + size_t address, size_t bytes, uintptr_t stream, + bool is_d2d) { + exe->tensor_write(tensor, reinterpret_cast(address), bytes, + reinterpret_cast(stream), is_d2d); } static void tensor_read(ark::Executor *exe, const ark::Tensor &tensor, - py::buffer host_buffer) { + py::buffer host_buffer, uintptr_t stream) { py::buffer_info info = host_buffer.request(); exe->tensor_read(tensor, reinterpret_cast(info.ptr), - info.size * info.itemsize); + info.size * info.itemsize, + reinterpret_cast(stream), false); +} + +static void tensor_read(ark::Executor *exe, const ark::Tensor &tensor, + size_t address, size_t bytes, uintptr_t stream, + bool is_d2d) { + exe->tensor_read(tensor, reinterpret_cast(address), bytes, + reinterpret_cast(stream), is_d2d); } void register_executor(py::module &m) { py::class_(m, "_Executor") - .def( - py::init(), - py::arg("rank"), py::arg("world_size"), py::arg("gpu_id"), - py::arg("name"), py::arg("plan")) + .def(py::init([](int device_id, uintptr_t stream, + const std::string &name, 
const std::string &plan) { + return new ark::Executor( + device_id, reinterpret_cast(stream), name, plan); + })) + .def("device_id", &ark::Executor::device_id) + .def("stream", + [](ark::Executor *self) { + return reinterpret_cast(self->stream()); + }) + .def("plan", &ark::Executor::plan) .def("compile", &ark::Executor::compile) .def("launch", &ark::Executor::launch, py::arg("max_spin_count") = -1) .def("run", &ark::Executor::run, py::arg("iter")) @@ -38,6 +61,22 @@ void register_executor(py::module &m) { .def("barrier", &ark::Executor::barrier) .def("destroy", &ark::Executor::destroy) .def("destroyed", &ark::Executor::destroyed) - .def("tensor_read", &tensor_read, py::arg("tensor"), py::arg("data")) - .def("tensor_write", &tensor_write, py::arg("tensor"), py::arg("data")); + .def("tensor_read", + py::overload_cast(&tensor_read), + py::arg("tensor"), py::arg("data"), py::arg("stream")) + .def("tensor_read", + py::overload_cast(&tensor_read), + py::arg("tensor"), py::arg("address"), py::arg("bytes"), + py::arg("stream"), py::arg("is_d2d")) + .def("tensor_write", + py::overload_cast(&tensor_write), + py::arg("tensor"), py::arg("data"), py::arg("stream")) + .def("tensor_write", + py::overload_cast(&tensor_write), + py::arg("tensor"), py::arg("address"), py::arg("bytes"), + py::arg("stream"), py::arg("is_d2d")); } From 215469044ae49a4a453f576b2a396a5c96992aec Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Wed, 10 Jul 2024 23:53:32 +0000 Subject: [PATCH 30/61] Update lint workflow --- .github/workflows/lint.yml | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 758eaf564..a918dcede 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -13,11 +13,8 @@ jobs: - name: Check out Git repository uses: actions/checkout@v4 - - name: Install ClangFormat - run: sudo apt-get install -y clang-format - - - name: Run clang-format - run: clang-format -style=file -Werror 
--dry-run `find ark python examples -type f -name *.h -o -name *.hpp -o -name *.c -o -name *.cc -o -name *.cpp -o -name *.cu` + - name: Run git-clang-format + run: git-clang-format --style=file --diff - name: Set up Python uses: actions/setup-python@v4 From 705f9f86d8bf8b70005a03fd875e8cc080c99af1 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 11 Jul 2024 00:02:45 +0000 Subject: [PATCH 31/61] Optimize operators --- ark/include/kernels/common/broadcast.h | 4 +- ark/include/kernels/common/sync.h | 12 ++---- ark/include/kernels/reduce.h | 59 ++++++++++++++++++-------- ark/ops/ops_broadcast.cpp | 3 +- ark/ops/ops_matmul.cpp | 32 +++++++++----- 5 files changed, 69 insertions(+), 41 deletions(-) diff --git a/ark/include/kernels/common/broadcast.h b/ark/include/kernels/common/broadcast.h index 97b12e004..858938613 100644 --- a/ark/include/kernels/common/broadcast.h +++ b/ark/include/kernels/common/broadcast.h @@ -186,9 +186,9 @@ struct Broadcast2Intrinsic { (BroadcastInput0 && BroadcastInput1) ? OutNelemPerThread : BroadcastInput0 - ? math::gcd::value + ? math::gcd::value : BroadcastInput1 - ? math::gcd::value + ? 
math::gcd::value : math::gcd::value>::value; diff --git a/ark/include/kernels/common/sync.h b/ark/include/kernels/common/sync.h index 85f7639c9..cf22e357d 100644 --- a/ark/include/kernels/common/sync.h +++ b/ark/include/kernels/common/sync.h @@ -106,25 +106,21 @@ DEVICE void sync_warps() { static_assert(Arch::ThreadsPerWarp == 64, ""); if constexpr (NumWarps == 1) { __builtin_amdgcn_wave_barrier(); - } else if constexpr (NumWarps == 16) { + } else if constexpr (NumWarps == ARK_WARPS_PER_BLOCK) { __syncthreads(); } else { static_assert(ARK_SMEM_RESERVED_BYTES >= sizeof(sync::WarpGroupState), ""); - int lane_id = threadIdx.x & 63; - if (lane_id == 0) { + if ((threadIdx.x & 63) == 0) { constexpr int MaxOldCnt = NumWarps - 1; - int warp_id = threadIdx.x >> 6; - int group_id = warp_id / NumWarps; + int group_id = (threadIdx.x >> 6) / NumWarps; sync::WarpGroupState *state = reinterpret_cast(_ARK_SMEM); unsigned int tmp = state->is_inc_flag[group_id] ^ 1; if (atomicInc(&state->cnt[group_id], MaxOldCnt) == MaxOldCnt) { state->flag[group_id] = tmp; } else { - while (atomicAdd(&state->flag[group_id], 0) != tmp) - __builtin_amdgcn_s_sleep(1); - __asm__ __volatile__("s_wakeup"); + while (atomicAdd(&state->flag[group_id], 0) != tmp); } state->is_inc_flag[group_id] = tmp; } diff --git a/ark/include/kernels/reduce.h b/ark/include/kernels/reduce.h index 30c8b7831..2dd79d2c3 100644 --- a/ark/include/kernels/reduce.h +++ b/ark/include/kernels/reduce.h @@ -53,7 +53,7 @@ DEVICE bf16 warpReduce(bf16 val) { template DEVICE DataType warpsReduce(DataType val, int tid, int smem_per_warp) { val = warpReduce(val); - if (LanesNum > Arch::ThreadsPerWarp) { + if constexpr (LanesNum > Arch::ThreadsPerWarp) { ReduceSharedStorage *shared = UnitOp::template shared_memory>( smem_per_warp); @@ -351,12 +351,19 @@ struct WwiseReduce { /// @param in Input tensor. /// @param uop_idx Index of the unit operator. 
template - static DEVICE void runW(DataType *out, DataType *in, int uop_idx, - int smem_per_warp) { + static DEVICE void run(DataType *out, DataType *in, int uop_idx, + int smem_per_warp) { using ShapeChecker = ReduceShapeChecker; + constexpr int InConsecBytes = sizeof(DataType) * InShape::W; constexpr int NelemPerThread = - DefaultNelemPerThread::value; + (InConsecBytes % 16 == 0) + ? 16 / sizeof(DataType) + : (InConsecBytes % 8 == 0) + ? 8 / sizeof(DataType) + : (InConsecBytes % 4 == 0) + ? 4 / sizeof(DataType) + : (InConsecBytes % 2 == 0) ? 2 / sizeof(DataType) : 1; constexpr int NonReduceDimLength = UnitOutDims::N * UnitOutDims::C * UnitOutDims::H; @@ -397,22 +404,38 @@ struct WwiseReduce { &in[idx_in]); } - DataType finalSum; - ReduceType::template identity<1>(&finalSum); + static_assert(math::is_pow2::value, + "NelemPerThread must be power of 2"); + if constexpr (NelemPerThread > 8) { #pragma unroll - for (int i = 0; i < NelemPerThread; ++i) { - ReduceType::template reduce<1>(&finalSum, &finalSum, &reduced[i]); + for (int i = 8; i < NelemPerThread; i += 8) { + ReduceType::template reduce<8>(&reduced[0], &reduced[0], &reduced[i]); + } + ReduceType::template reduce<4>(&reduced[0], &reduced[0], &reduced[4]); + ReduceType::template reduce<2>(&reduced[0], &reduced[0], &reduced[2]); + ReduceType::template reduce<1>(&reduced[0], &reduced[0], &reduced[1]); + } else if constexpr (NelemPerThread == 8) { + ReduceType::template reduce<4>(&reduced[0], &reduced[0], &reduced[4]); + ReduceType::template reduce<2>(&reduced[0], &reduced[0], &reduced[2]); + ReduceType::template reduce<1>(&reduced[0], &reduced[0], &reduced[1]); + } else if constexpr (NelemPerThread == 4) { + ReduceType::template reduce<2>(&reduced[0], &reduced[0], &reduced[2]); + ReduceType::template reduce<1>(&reduced[0], &reduced[0], &reduced[1]); + } else if constexpr (NelemPerThread == 2) { + ReduceType::template reduce<1>(&reduced[0], &reduced[0], &reduced[1]); } - UnitOp::sync_threads(); + if constexpr 
(InShape::W % ThreadsPerRow != 0) { + UnitOp::sync_threads(); + } // final reduction on shared memory using warp shuffle. - finalSum = warpsReduce( - finalSum, tid, smem_per_warp); + reduced[0] = warpsReduce( + reduced[0], tid, smem_per_warp); // write the result to output. if (tid % ThreadsPerRow == 0) { - ReduceType::template postReduce<1>(&out[idx_out], &finalSum, + ReduceType::template postReduce<1>(&out[idx_out], &reduced[0], InShape::W); } @@ -450,8 +473,8 @@ template ::runW(out, in, uop_idx, - smem_per_warp); + SmemBytes, ReduceTypeSum, Axis>::run(out, in, uop_idx, + smem_per_warp); } template ::runW(out, in, uop_idx, - smem_per_warp); + SmemBytes, ReduceTypeMean, Axis>::run(out, in, uop_idx, + smem_per_warp); } template ::runW(out, in, uop_idx, - smem_per_warp); + SmemBytes, ReduceTypeMax, Axis>::run(out, in, uop_idx, + smem_per_warp); } } // namespace ark diff --git a/ark/ops/ops_broadcast.cpp b/ark/ops/ops_broadcast.cpp index 3985a0500..e5559fc32 100644 --- a/ark/ops/ops_broadcast.cpp +++ b/ark/ops/ops_broadcast.cpp @@ -27,8 +27,7 @@ ModelOpBroadcast1::ModelOpBroadcast1(const std::string &type_name, std::string ModelOpBroadcast1::impl_name(const Json &config) const { check_fields_config(config, {"NumWarps", "Tile"}); int num_warps = config.at("NumWarps"); - auto &tile_shape = config.at("Tile"); - Dims unit_out_dims{tile_shape[0], tile_shape[1]}; + Dims unit_out_dims(config.at("Tile").get>()); return function_name_string( pascal_to_snake(type()->type_name()), diff --git a/ark/ops/ops_matmul.cpp b/ark/ops/ops_matmul.cpp index b259f99c8..a24b95d72 100644 --- a/ark/ops/ops_matmul.cpp +++ b/ark/ops/ops_matmul.cpp @@ -189,45 +189,55 @@ std::vector ModelOpMatmul::impl_args([ } static const Json get_default_config(const ArchRef arch, - const ModelDataType &data_type) { + const ModelDataType &data_type, + const Dims &mnk) { + if (data_type != FP32.ref() && data_type != FP16.ref() && + data_type != BF16.ref()) { + ERR(InvalidUsageError, + "Unsupported data type: ", 
data_type->type_name()); + } + if (!arch->belongs_to(ARCH_CUDA) && !arch->belongs_to(ARCH_ROCM)) { + ERR(InvalidUsageError, "Unsupported architecture: ", arch->name()); + } + DimType tm = (mnk[0] > mnk[1]) ? 256 : 128; + DimType tn = (mnk[0] > mnk[1]) ? 128 : 256; if (arch->belongs_to(ARCH_CUDA_80) && data_type == FP32.ref()) { return {{"NumWarps", 8}, {"SramBytes", 147456}, - {"TileShapeMNK", {128, 256, 32}}}; + {"TileShapeMNK", {tm, tn, 32}}}; } else if (arch->belongs_to(ARCH_CUDA_80) && data_type == FP16.ref()) { return {{"NumWarps", 8}, {"SramBytes", 147456}, - {"TileShapeMNK", {128, 256, 64}}}; + {"TileShapeMNK", {tm, tn, 64}}}; } else if (arch->belongs_to(ARCH_CUDA_80) && data_type == BF16.ref()) { return {{"NumWarps", 8}, {"SramBytes", 147456}, - {"TileShapeMNK", {128, 256, 64}}}; + {"TileShapeMNK", {tm, tn, 64}}}; } else if (arch->belongs_to(ARCH_ROCM_942) && data_type == FP32.ref()) { return {{"NumWarps", 4}, {"SramBytes", 24672}, - {"TileShapeMNK", {128, 256, 16}}}; + {"TileShapeMNK", {tm, tn, 16}}}; } else if (arch->belongs_to(ARCH_ROCM_942) && data_type == FP16.ref()) { return {{"NumWarps", 4}, {"SramBytes", 24672}, - {"TileShapeMNK", {128, 256, 32}}}; + {"TileShapeMNK", {tm, tn, 32}}}; } else if (arch->belongs_to(ARCH_ROCM_942) && data_type == BF16.ref()) { return {{"NumWarps", 4}, - {"SramBytes", 24672}, - {"TileShapeMNK", {128, 256, 32}}}; + {"SramBytes", 24624}, + {"TileShapeMNK", {tm, tn, 32}}}; } - ERR(InvalidUsageError, "Unsupported arch and data type: ", arch->name(), - " and ", data_type->type_name()); + ERR(InternalError, "Unexpected error"); return {}; } Json ModelOpMatmul::default_config(const ArchRef arch) const { auto result = result_tensors_[0]; - Json config = get_default_config(arch, result->data_type()); check_fields_args(args_, {"TransposeInput", "TransposeOther"}); Dims mnk = calc_problem_size(read_tensors_[0]->padded_shape(), read_tensors_[1]->padded_shape(), args_.at("TransposeInput").value(), args_.at("TransposeOther").value()); + 
Json config = get_default_config(arch, result->data_type(), mnk); size_t tile_x = config.at("TileShapeMNK")[0]; size_t tile_y = config.at("TileShapeMNK")[1]; if (mnk[0] % tile_x != 0 || mnk[1] % tile_y != 0) { From a3114e45eea5d8c7929915e7ca1b1f9cc6ef1591 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 11 Jul 2024 00:04:40 +0000 Subject: [PATCH 32/61] fix --- ark/error.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/ark/error.hpp b/ark/error.hpp index e08acd975..5ad21824b 100644 --- a/ark/error.hpp +++ b/ark/error.hpp @@ -20,6 +20,7 @@ class BaseError : public std::runtime_error { _name(const std::string &msg) : BaseError(msg) {} \ }; +REGISTER_ERROR_TYPE(InternalError) REGISTER_ERROR_TYPE(InvalidUsageError) REGISTER_ERROR_TYPE(NotFoundError) REGISTER_ERROR_TYPE(ModelError) From 6116424e2a692a3cec2eb749565f1ae03637e5e6 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 11 Jul 2024 00:28:47 +0000 Subject: [PATCH 33/61] delete an unused file --- plan_gpu0.json | 2423 ------------------------------------------------ 1 file changed, 2423 deletions(-) delete mode 100644 plan_gpu0.json diff --git a/plan_gpu0.json b/plan_gpu0.json deleted file mode 100644 index cad05f774..000000000 --- a/plan_gpu0.json +++ /dev/null @@ -1,2423 +0,0 @@ -{ - "Rank": 0, - "WorldSize": 1, - "Architecture": "ROCM_942", - "NumProcessors": 304, - "NumWarpsPerProcessor": 4, - "TaskInfos": [ - { - "Id": 0, - "NumWarps": 4, - "SramBytes": 24672, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul", - "IsVirtual": false, - "ReadTensors": [ - {"Id":4,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":0,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - 
{"Id":6,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":7,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 24672, - "TileShapeMNK": [256,128,32], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 1, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "Rope", - "Name": "rope", - "IsVirtual": false, - "ReadTensors": [ - {"Id":12,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":5,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":15,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":16,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [256,1,128], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 2, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "Transpose", - "Name": "transpose", - "IsVirtual": false, - "ReadTensors": [ - 
{"Id":16,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":19,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":20,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "Permutation": {"DIMS":[0,2,1,3]} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [256,128], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 3, - "NumWarps": 4, - "SramBytes": 24672, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":4,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":1,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":8,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":9,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 24672, - "TileShapeMNK": [256,128,32], - "NumTasks": 
256 - } - } - ] - }, - { - "Id": 4, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "Rope", - "Name": "rope_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":13,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":5,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":17,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":18,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [256,1,128], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 5, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "Transpose", - "Name": "transpose_2", - "IsVirtual": false, - "ReadTensors": [ - {"Id":18,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":23,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":24,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": 
{ - "Permutation": {"DIMS":[0,2,1,3]} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [256,128], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 6, - "NumWarps": 4, - "SramBytes": 24672, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul_2", - "IsVirtual": false, - "ReadTensors": [ - {"Id":4,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":2,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":10,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":11,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 24672, - "TileShapeMNK": [256,128,32], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 7, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "Transpose", - "Name": "transpose_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":14,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":21,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - 
{"Id":22,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "Permutation": {"DIMS":[0,2,1,3]} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [256,128], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 8, - "NumWarps": 4, - "SramBytes": 24672, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul_3", - "IsVirtual": false, - "ReadTensors": [ - {"Id":20,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":24,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":25,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":26,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 24672, - "TileShapeMNK": [256,128,32], - "NumTasks": 4096 - } - } - ] - }, - { - "Id": 9, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "ScalarMul", - "Name": "mul", - "IsVirtual": false, - "ReadTensors": [ - {"Id":26,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - 
{"Id":27,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":28,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "Factor": {"FLOAT":0.0883883461356163} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [256,128], - "NumTasks": 4096 - } - } - ] - }, - { - "Id": 10, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ - { - "Type": "ReduceMax", - "Name": "reduce_max", - "IsVirtual": false, - "ReadTensors": [ - {"Id":28,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":29,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":30,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "Axis": {"INT":3}, - "KeepDim": {"BOOL":true} - }, - "Config": { - "NumWarps": 1, - "ImplType": "WarpWise", - "SramBytes": 0, - "NumTasks": 65536 - } - } - ] - }, - { - "Id": 11, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "Sub", - "Name": "sub", - "IsVirtual": false, - "ReadTensors": [ - {"Id":28,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - 
{"Id":30,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":28,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":31,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 4096 - } - } - ] - }, - { - "Id": 12, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "Exp", - "Name": "exp", - "IsVirtual": false, - "ReadTensors": [ - {"Id":31,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":31,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":32,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 4096 - } - } - ] - }, - { - "Id": 13, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ - { - "Type": "ReduceSum", - "Name": "reduce_sum", - "IsVirtual": false, - "ReadTensors": [ - 
{"Id":32,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":33,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":34,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "Axis": {"INT":3}, - "KeepDim": {"BOOL":true} - }, - "Config": { - "NumWarps": 1, - "ImplType": "WarpWise", - "SramBytes": 0, - "NumTasks": 65536 - } - } - ] - }, - { - "Id": 14, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "Div", - "Name": "div", - "IsVirtual": false, - "ReadTensors": [ - {"Id":32,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":34,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":32,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":35,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 4096 - } - } - ] - }, - { - "Id": 
15, - "NumWarps": 4, - "SramBytes": 24672, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul_4", - "IsVirtual": false, - "ReadTensors": [ - {"Id":35,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":22,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":36,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":37,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":false} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 24672, - "TileShapeMNK": [256,128,32], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 16, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ - { - "Type": "Transpose", - "Name": "transpose_3", - "IsVirtual": false, - "ReadTensors": [ - {"Id":37,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":38,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - 
{"Id":39,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "Permutation": {"DIMS":[0,2,1,3]} - }, - "Config": { - "NumWarps": 1, - "SramBytes": 0, - "Tile": [8,128], - "NumTasks": 8192 - } - } - ] - }, - { - "Id": 17, - "NumWarps": 4, - "SramBytes": 24672, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul_5", - "IsVirtual": false, - "ReadTensors": [ - {"Id":40,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":3,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":41,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":20,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":42,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":20,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 24672, - "TileShapeMNK": [128,256,32], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 18, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "Cast", - "Name": "cast", - "IsVirtual": false, - "ReadTensors": [ - {"Id":52,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":30,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - 
{"Id":54,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":55,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 19, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "Mul", - "Name": "mul_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":55,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":55,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":56,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":57,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 20, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ - { - "Type": "ReduceMean", - "Name": "reduce_mean", - "IsVirtual": false, - "ReadTensors": [ - 
{"Id":57,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":58,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":59,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "Axis": {"INT":2}, - "KeepDim": {"BOOL":true} - }, - "Config": { - "NumWarps": 1, - "ImplType": "WarpWise", - "SramBytes": 0, - "NumTasks": 2048 - } - } - ] - }, - { - "Id": 21, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ - { - "Type": "Rsqrt", - "Name": "rsqrt", - "IsVirtual": false, - "ReadTensors": [ - {"Id":59,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":60,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":61,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 1, - "SramBytes": 0, - "Tile": [64,1], - "NumTasks": 32 - } - } - ] - }, - { - "Id": 22, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "Mul", - "Name": "mul_2", - "IsVirtual": false, - "ReadTensors": [ - 
{"Id":55,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":61,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":62,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":63,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 23, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "Mul", - "Name": "mul_3", - "IsVirtual": false, - "ReadTensors": [ - {"Id":63,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":50,"DataType":"FP32","Shape":[1,1,4096],"Strides":[1,1,4096],"Offsets":[0,0,0],"PaddedShape":[1,1,4096],"Buffer":{"Id":28,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":63,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":64,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 
0, - "Tile": [128,256], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 24, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "Cast", - "Name": "cast_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":64,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":65,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":66,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 25, - "NumWarps": 4, - "SramBytes": 24672, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul_6", - "IsVirtual": false, - "ReadTensors": [ - {"Id":66,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":43,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":21,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":67,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":68,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "TransposeInput": 
{"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 24672, - "TileShapeMNK": [128,256,32], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 26, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ - { - "Type": "Rope", - "Name": "rope_2", - "IsVirtual": false, - "ReadTensors": [ - {"Id":73,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":53,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":31,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":76,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":77,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 1, - "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 131072 - } - } - ] - }, - { - "Id": 27, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ - { - "Type": "Transpose", - "Name": "transpose_4", - "IsVirtual": false, - "ReadTensors": [ - {"Id":77,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":80,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - 
{"Id":81,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "Permutation": {"DIMS":[0,2,1,3]} - }, - "Config": { - "NumWarps": 1, - "SramBytes": 0, - "Tile": [8,128], - "NumTasks": 8192 - } - } - ] - }, - { - "Id": 28, - "NumWarps": 4, - "SramBytes": 24672, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul_7", - "IsVirtual": false, - "ReadTensors": [ - {"Id":66,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":44,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":22,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":69,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":70,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 24672, - "TileShapeMNK": [128,256,32], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 29, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ - { - "Type": "Rope", - "Name": "rope_3", - "IsVirtual": false, - "ReadTensors": [ - {"Id":74,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - 
{"Id":53,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":31,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":78,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":79,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 1, - "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 131072 - } - } - ] - }, - { - "Id": 30, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ - { - "Type": "Transpose", - "Name": "transpose_6", - "IsVirtual": false, - "ReadTensors": [ - {"Id":79,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":84,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":45,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":85,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":45,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "Permutation": {"DIMS":[0,2,3,1]} - }, - "Config": { - "NumWarps": 1, - "SramBytes": 0, - "Tile": [8,8], - "NumTasks": 131072 - } - } - ] - }, - { - "Id": 31, - "NumWarps": 4, - "SramBytes": 24672, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul_8", - "IsVirtual": false, - "ReadTensors": [ - 
{"Id":66,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":45,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":23,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":71,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":72,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 24672, - "TileShapeMNK": [128,256,32], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 32, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ - { - "Type": "Transpose", - "Name": "transpose_5", - "IsVirtual": false, - "ReadTensors": [ - {"Id":75,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":82,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":83,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "Permutation": {"DIMS":[0,2,1,3]} - }, - "Config": { - "NumWarps": 1, - "SramBytes": 0, - "Tile": [8,128], - 
"NumTasks": 8192 - } - } - ] - }, - { - "Id": 33, - "NumWarps": 4, - "SramBytes": 24672, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul_9", - "IsVirtual": false, - "ReadTensors": [ - {"Id":81,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":85,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":45,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":86,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":46,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":87,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":46,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":false} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 24672, - "TileShapeMNK": [128,256,32], - "NumTasks": 4096 - } - } - ] - }, - { - "Id": 34, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "ScalarMul", - "Name": "mul_4", - "IsVirtual": false, - "ReadTensors": [ - {"Id":87,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":46,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":88,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - 
{"Id":89,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "Factor": {"FLOAT":0.0883883461356163} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 4096 - } - } - ] - }, - { - "Id": 35, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ - { - "Type": "ReduceMax", - "Name": "reduce_max_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":89,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":90,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":48,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":91,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":48,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "Axis": {"INT":3}, - "KeepDim": {"BOOL":true} - }, - "Config": { - "NumWarps": 1, - "ImplType": "WarpWise", - "SramBytes": 0, - "NumTasks": 65536 - } - } - ] - }, - { - "Id": 36, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "Sub", - "Name": "sub_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":89,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":91,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":48,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - 
{"Id":89,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":92,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 4096 - } - } - ] - }, - { - "Id": 37, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "Exp", - "Name": "exp_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":92,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":92,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":93,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 4096 - } - } - ] - }, - { - "Id": 38, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ - { - "Type": "ReduceSum", - "Name": "reduce_sum_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":93,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - 
{"Id":94,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":49,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":95,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":49,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "Axis": {"INT":3}, - "KeepDim": {"BOOL":true} - }, - "Config": { - "NumWarps": 1, - "ImplType": "WarpWise", - "SramBytes": 0, - "NumTasks": 65536 - } - } - ] - }, - { - "Id": 39, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "Div", - "Name": "div_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":93,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":95,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":49,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":93,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":96,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 4096 - } - } - ] - }, - { - "Id": 40, - "NumWarps": 4, - "SramBytes": 24672, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul_10", - "IsVirtual": false, - "ReadTensors": [ - 
{"Id":96,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":83,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":97,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":50,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":98,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":50,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":false} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 24672, - "TileShapeMNK": [256,128,32], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 41, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ - { - "Type": "Transpose", - "Name": "transpose_7", - "IsVirtual": false, - "ReadTensors": [ - {"Id":98,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":50,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":99,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":51,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":100,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":51,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "Permutation": {"DIMS":[0,2,1,3]} - }, - "Config": { - 
"NumWarps": 1, - "SramBytes": 0, - "Tile": [8,128], - "NumTasks": 8192 - } - } - ] - }, - { - "Id": 42, - "NumWarps": 4, - "SramBytes": 24672, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul_11", - "IsVirtual": false, - "ReadTensors": [ - {"Id":101,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":51,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":46,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":102,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":52,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":103,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":52,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 24672, - "TileShapeMNK": [128,256,32], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 43, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "Add", - "Name": "add", - "IsVirtual": false, - "ReadTensors": [ - {"Id":52,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":30,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":103,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":52,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - 
{"Id":104,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":53,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":105,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":53,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 44, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "Cast", - "Name": "cast_2", - "IsVirtual": false, - "ReadTensors": [ - {"Id":105,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":53,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":106,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":107,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 45, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ - { - "Type": "Mul", - "Name": "mul_5", - "IsVirtual": false, - "ReadTensors": [ - {"Id":107,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - 
{"Id":107,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":108,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":55,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":109,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":55,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 1, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 46, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ - { - "Type": "ReduceMean", - "Name": "reduce_mean_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":109,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":55,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":110,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":56,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":111,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":56,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "Axis": {"INT":2}, - "KeepDim": {"BOOL":true} - }, - "Config": { - "NumWarps": 1, - "ImplType": "WarpWise", - "SramBytes": 0, - "NumTasks": 2048 - } - } - ] - }, - { - "Id": 47, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ - { - "Type": "Rsqrt", - "Name": "rsqrt_1", - "IsVirtual": false, - "ReadTensors": [ - 
{"Id":111,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":56,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":112,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":57,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":113,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":57,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 1, - "SramBytes": 0, - "Tile": [64,1], - "NumTasks": 32 - } - } - ] - }, - { - "Id": 48, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "Mul", - "Name": "mul_6", - "IsVirtual": false, - "ReadTensors": [ - {"Id":107,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":113,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":57,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":114,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":115,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 49, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "Mul", - "Name": "mul_7", - "IsVirtual": false, - "ReadTensors": [ - 
{"Id":115,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":51,"DataType":"FP32","Shape":[1,1,4096],"Strides":[1,1,4096],"Offsets":[0,0,0],"PaddedShape":[1,1,4096],"Buffer":{"Id":29,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":115,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":116,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 50, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "Cast", - "Name": "cast_3", - "IsVirtual": false, - "ReadTensors": [ - {"Id":116,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":117,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":59,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":118,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":59,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 51, - "NumWarps": 4, - "SramBytes": 24672, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul_12", - 
"IsVirtual": false, - "ReadTensors": [ - {"Id":118,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":59,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":47,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":25,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":119,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":60,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":120,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":60,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 24672, - "TileShapeMNK": [128,256,32], - "NumTasks": 688 - } - } - ] - }, - { - "Id": 52, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "Sigmoid", - "Name": "sigmoid", - "IsVirtual": false, - "ReadTensors": [ - {"Id":120,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":60,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":121,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":61,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":122,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":61,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [128,256], - 
"NumTasks": 688 - } - } - ] - }, - { - "Id": 53, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "Mul", - "Name": "mul_8", - "IsVirtual": false, - "ReadTensors": [ - {"Id":120,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":60,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":122,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":61,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":123,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":62,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":124,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":62,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 688 - } - } - ] - }, - { - "Id": 54, - "NumWarps": 4, - "SramBytes": 24672, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul_13", - "IsVirtual": false, - "ReadTensors": [ - {"Id":118,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":59,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":49,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":27,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":125,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":63,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - 
{"Id":126,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":63,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 24672, - "TileShapeMNK": [128,256,32], - "NumTasks": 688 - } - } - ] - }, - { - "Id": 55, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "Mul", - "Name": "mul_9", - "IsVirtual": false, - "ReadTensors": [ - {"Id":124,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":62,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":126,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":63,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":127,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":64,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":128,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":64,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 688 - } - } - ] - }, - { - "Id": 56, - "NumWarps": 4, - "SramBytes": 24672, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul_14", - "IsVirtual": false, - "ReadTensors": [ - {"Id":128,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":64,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - 
{"Id":48,"DataType":"FP16","Shape":[4096,11008],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,11008],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":129,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":65,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":130,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":65,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 24672, - "TileShapeMNK": [128,256,32], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 57, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "Add", - "Name": "add_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":105,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":53,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":130,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":65,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":131,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":66,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":132,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":66,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 256 - } - } - ] - } - ], - "ProcessorGroups": [ - { - 
"ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,86], - "WarpRange": [0,4], - "SramRange": [0,24672], - "TaskGroups": [ - {"TaskId":0,"TaskRange":[0,256],"Granularity":1}, - {"TaskId":1,"TaskRange":[0,256],"Granularity":1}, - {"TaskId":2,"TaskRange":[0,256],"Granularity":1} - ] - }, - { - "ProcessorRange": [86,172], - "WarpRange": [0,4], - "SramRange": [0,24672], - "TaskGroups": [ - {"TaskId":3,"TaskRange":[0,256],"Granularity":1}, - {"TaskId":4,"TaskRange":[0,256],"Granularity":1}, - {"TaskId":5,"TaskRange":[0,256],"Granularity":1} - ] - }, - { - "ProcessorRange": [172,258], - "WarpRange": [0,4], - "SramRange": [0,24672], - "TaskGroups": [ - {"TaskId":6,"TaskRange":[0,256],"Granularity":1}, - {"TaskId":7,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,24672], - "TaskGroups": [ - {"TaskId":8,"TaskRange":[0,4096],"Granularity":1}, - {"TaskId":9,"TaskRange":[0,4096],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":10,"TaskRange":[0,65536],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":11,"TaskRange":[0,4096],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":12,"TaskRange":[0,4096],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":13,"TaskRange":[0,65536],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - 
"ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":14,"TaskRange":[0,4096],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,256], - "ResourceGroups": [ - { - "ProcessorRange": [0,256], - "WarpRange": [0,4], - "SramRange": [0,24672], - "TaskGroups": [ - {"TaskId":15,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":16,"TaskRange":[0,8192],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,256], - "ResourceGroups": [ - { - "ProcessorRange": [0,256], - "WarpRange": [0,4], - "SramRange": [0,24672], - "TaskGroups": [ - {"TaskId":17,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":18,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":19,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":20,"TaskRange":[0,2048],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,32], - "ResourceGroups": [ - { - "ProcessorRange": [0,32], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":21,"TaskRange":[0,32],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":22,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ 
- { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":23,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":24,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,256], - "ResourceGroups": [ - { - "ProcessorRange": [0,256], - "WarpRange": [0,4], - "SramRange": [0,24672], - "TaskGroups": [ - {"TaskId":25,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":26,"TaskRange":[0,131072],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":27,"TaskRange":[0,8192],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,256], - "ResourceGroups": [ - { - "ProcessorRange": [0,256], - "WarpRange": [0,4], - "SramRange": [0,24672], - "TaskGroups": [ - {"TaskId":28,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":29,"TaskRange":[0,131072],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":30,"TaskRange":[0,131072],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,256], - "ResourceGroups": [ - { - "ProcessorRange": [0,256], - "WarpRange": [0,4], - "SramRange": [0,24672], - "TaskGroups": [ - {"TaskId":31,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - 
"ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":32,"TaskRange":[0,8192],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,24672], - "TaskGroups": [ - {"TaskId":33,"TaskRange":[0,4096],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":34,"TaskRange":[0,4096],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":35,"TaskRange":[0,65536],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":36,"TaskRange":[0,4096],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":37,"TaskRange":[0,4096],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":38,"TaskRange":[0,65536],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":39,"TaskRange":[0,4096],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,256], - "ResourceGroups": [ - { - "ProcessorRange": [0,256], - "WarpRange": [0,4], - "SramRange": [0,24672], - "TaskGroups": [ - {"TaskId":40,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - 
"ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":41,"TaskRange":[0,8192],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,256], - "ResourceGroups": [ - { - "ProcessorRange": [0,256], - "WarpRange": [0,4], - "SramRange": [0,24672], - "TaskGroups": [ - {"TaskId":42,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":43,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":44,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":45,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":46,"TaskRange":[0,2048],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,32], - "ResourceGroups": [ - { - "ProcessorRange": [0,32], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":47,"TaskRange":[0,32],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":48,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":49,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": 
[0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":50,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,24672], - "TaskGroups": [ - {"TaskId":51,"TaskRange":[0,688],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":52,"TaskRange":[0,688],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":53,"TaskRange":[0,688],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,24672], - "TaskGroups": [ - {"TaskId":54,"TaskRange":[0,688],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":55,"TaskRange":[0,688],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,256], - "ResourceGroups": [ - { - "ProcessorRange": [0,256], - "WarpRange": [0,4], - "SramRange": [0,24672], - "TaskGroups": [ - {"TaskId":56,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":57,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - } - ] -} From 67e3b2601f00997d6debe8f9dd3e7c633ceee08b Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 11 Jul 2024 01:44:53 +0000 Subject: [PATCH 34/61] update test --- ark/ops/ops_scalar_test.cpp | 43 +++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git 
a/ark/ops/ops_scalar_test.cpp b/ark/ops/ops_scalar_test.cpp index 9e9e635b8..6ae0022f0 100644 --- a/ark/ops/ops_scalar_test.cpp +++ b/ark/ops/ops_scalar_test.cpp @@ -263,31 +263,28 @@ ark::unittest::State test_scalar_mul_fp16_offset() { { ark::Model m; ark::Tensor buf = m.tensor({1024}, ark::FP16); - ark::Tensor tns = m.refer(buf, {2}, {1024}, {3}); - ark::Tensor out = m.mul(tns, 2, tns); - - ark::DefaultExecutor exe(m); - exe.compile(); + ark::Tensor tns = m.refer(buf, {2}, {1024}, {6}); + ark::Tensor doubled = m.mul(tns, 2, tns); + ark::Tensor out = m.identity(buf, {doubled}); std::vector data(1024, ark::half_t(2)); - exe.tensor_write(buf, data); - - exe.launch(); - exe.run(1); - exe.stop(); - - data.clear(); - data.resize(1024); - - exe.tensor_read(buf, data); - - for (size_t i = 0; i < data.size(); ++i) { - if (i == 3 || i == 4) { - UNITTEST_EQ(data[i], 4); - } else { - UNITTEST_EQ(data[i], 2); - } - } + auto result = ark::op_test( + "scalar_mul_fp16_offset", m, {buf}, {out}, + [](std::vector &outputs, const std::vector &, + const std::vector &, const std::vector &, + int) { + ark::half_t *out = static_cast(outputs[0]); + for (size_t i = 0; i < 1024; ++i) { + if (i == 6 || i == 7) { + out[i] = 4; + } else { + out[i] = 2; + } + } + }, + {data.data()}); + UNITTEST_LOG(result); + UNITTEST_EQ(result.max_diff[0], 0.0f); } return ark::unittest::SUCCESS; } From e1f178bd3c7bbb0023e1ffc3eceee72564116d10 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Fri, 12 Jul 2024 04:37:51 +0000 Subject: [PATCH 35/61] fix merge & updates --- ark/api/executor.cpp | 3 +-- python/ark/runtime.py | 8 ++++---- python/ark/tensor.py | 17 ++++++++++------- python/executor_py.cpp | 2 +- python/unittest/unittest_common.py | 22 ++++++++++++++++++++++ 5 files changed, 38 insertions(+), 14 deletions(-) create mode 100644 python/unittest/unittest_common.py diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp index 1af298e89..ad6cb8550 100644 --- a/ark/api/executor.cpp +++ 
b/ark/api/executor.cpp @@ -233,7 +233,6 @@ void Executor::Impl::init(const PlanJson &plan_json) { if (world_size_ > 1) { init_communicator(); } -} auto gpu_manager = GpuManager::get_instance(device_id_); @@ -384,7 +383,7 @@ std::map Executor::Impl::init_buffers(const Json &plan_json) { continue; } if (buf_info->buffer->is_external()) { - if (buf_info->buffer->device_id() != gpu_id_) { + if (buf_info->buffer->device_id() != device_id_) { ERR(InvalidUsageError, "PyTorch tensor and model execution are on different GPUs"); } diff --git a/python/ark/runtime.py b/python/ark/runtime.py index 93acb6bf8..1e56fe1ca 100644 --- a/python/ark/runtime.py +++ b/python/ark/runtime.py @@ -142,7 +142,7 @@ def launch( initialized. The executor will compile the cuda kernels and launch the ARK runtime. """ if self.launched(): - logging.warn( + logging.warning( f"Runtime {self.runtime_id} is already launched, skip launching" ) return @@ -153,7 +153,7 @@ def launch( if self.state == Runtime.State.Init: if self.executor is not None: if not self.executor.destroyed(): - logging.warn( + logging.warning( f"Runtime {self.runtime_id}, has already been launched. Destroying the old executor" ) self.executor.destroy() @@ -184,7 +184,7 @@ def wait(self): Wait for the kernel to finish. """ if self.state != Runtime.State.Running: - logging.warn( + logging.warning( f"ARK runtime {self.runtime_id} is not running, skip waiting" ) return @@ -197,7 +197,7 @@ def stop(self) -> float: Once this is called, we need to call `launch()` again to run the model again. 
""" if not self.launched(): - logging.warn( + logging.warning( f"ARK runtime {self.runtime_id} is never launched, skip stopping" ) return diff --git a/python/ark/tensor.py b/python/ark/tensor.py index e377cf852..335020769 100644 --- a/python/ark/tensor.py +++ b/python/ark/tensor.py @@ -103,7 +103,7 @@ def to_numpy( return ndarray def to_torch( - self, tensor: torch.Tensor = None, runtime_id: int = -1 + self, tensor: torch.Tensor = None, stream: int = 0 ) -> torch.Tensor: """ """ if _no_torch: @@ -116,21 +116,24 @@ def to_torch( ) torch_type = self.dtype().to_torch() if tensor is None: - dev_name = f"cuda:{rt.executor.gpu_id()}" + dev_name = f"cuda:{rt.executor.device_id()}" tensor = torch.zeros( self.shape(), dtype=torch_type, device=torch.device(dev_name) ) - elif tensor.shape != self.shape(): - raise ValueError("torch tensor shape does not match the tensor") + elif list(tensor.shape) != self.shape(): + raise ValueError(f"torch tensor shape {list(tensor.shape)} " + f"does not match the tensor {self.shape()}") elif tensor.dtype != torch_type: - raise ValueError("torch tensor dtype does not match the tensor") + raise ValueError(f"torch tensor dtype {tensor.dtype} " + f"does not match the tensor {torch_type}") elif not tensor.is_contiguous(): raise ValueError("torch tensor is not contiguous in memory") elif tensor.numel() != self.nelems(): - raise ValueError("torch tensor size does not match the tensor") + raise ValueError(f"torch tensor size {tensor.numel()} " + f"does not match the tensor {self.nelems()}") tensor_bytes = self.nelems() * self.dtype().element_size() rt.executor.tensor_read( - self._tensor, tensor.data_ptr(), tensor_bytes, True + self._tensor, tensor.data_ptr(), tensor_bytes, stream, True ) return tensor diff --git a/python/executor_py.cpp b/python/executor_py.cpp index fffbb2c30..8455fa585 100644 --- a/python/executor_py.cpp +++ b/python/executor_py.cpp @@ -93,7 +93,7 @@ static DLManagedTensor *to_dlpack(ark::Executor &exe, 
tensor.offsets().is_no_dim() ? 0 : tensor.offsets().vector()[0]; dl_tensor.byte_offset = offset_in_elements * tensor.data_type().bytes(); dl_tensor.device.device_type = get_device_type(); - dl_tensor.device.device_id = static_cast(exe.gpu_id()); + dl_tensor.device.device_id = static_cast(exe.device_id()); dl_tensor.ndim = static_cast(tensor.shape().ndims()); dl_tensor.dtype = get_dl_dtype(tensor.data_type()); diff --git a/python/unittest/unittest_common.py b/python/unittest/unittest_common.py new file mode 100644 index 000000000..9548410b5 --- /dev/null +++ b/python/unittest/unittest_common.py @@ -0,0 +1,22 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import pytest +import ark + + +def pytest_ark(need_torch: bool = False): + """ + Decorator for ARK unit tests. + """ + def decorator(test_func): + if need_torch: + try: + import torch + except ImportError: + return pytest.mark.skip(reason="torch is not installed")(test_func) + def wrapper(*args, **kwargs): + ark.init() + test_func(*args, **kwargs) + return wrapper + return decorator From ce1959ecb5fb064b4e653b3cad7cf3dcba63a9d7 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Fri, 12 Jul 2024 06:49:30 +0000 Subject: [PATCH 36/61] Add `loop_mode` argument --- ark/api/executor.cpp | 116 ++++++++++++++------- ark/api/planner.cpp | 2 +- ark/codegen.cpp | 2 +- ark/gpu/{gpu.h => gpu.hpp} | 7 +- ark/gpu/gpu_compile.cpp | 4 +- ark/gpu/{gpu_compile.h => gpu_compile.hpp} | 6 +- ark/gpu/gpu_event.cpp | 6 +- ark/gpu/{gpu_event.h => gpu_event.hpp} | 8 +- ark/gpu/gpu_kernel.cpp | 33 ++---- ark/gpu/{gpu_kernel.h => gpu_kernel.hpp} | 19 ++-- ark/gpu/gpu_kernel_test.cpp | 8 +- ark/gpu/{gpu_logging.h => gpu_logging.hpp} | 8 +- ark/gpu/gpu_manager.cpp | 4 +- ark/gpu/{gpu_manager.h => gpu_manager.hpp} | 14 +-- ark/gpu/gpu_memory.cpp | 8 +- ark/gpu/{gpu_memory.h => gpu_memory.hpp} | 10 +- ark/gpu/gpu_stream.cpp | 6 +- ark/gpu/{gpu_stream.h => gpu_stream.hpp} | 8 +- ark/include/ark/executor.hpp | 4 +- 
ark/include/kernels/kernel_template.in | 17 ++- ark/ops/ops_matmul_test.cpp | 2 +- ark/ops/ops_test_common.cpp | 2 +- python/ark/runtime.py | 4 +- python/executor_py.cpp | 8 +- 24 files changed, 173 insertions(+), 133 deletions(-) rename ark/gpu/{gpu.h => gpu.hpp} (98%) rename ark/gpu/{gpu_compile.h => gpu_compile.hpp} (78%) rename ark/gpu/{gpu_event.h => gpu_event.hpp} (84%) rename ark/gpu/{gpu_kernel.h => gpu_kernel.hpp} (68%) rename ark/gpu/{gpu_logging.h => gpu_logging.hpp} (92%) rename ark/gpu/{gpu_manager.h => gpu_manager.hpp} (88%) rename ark/gpu/{gpu_memory.h => gpu_memory.hpp} (87%) rename ark/gpu/{gpu_stream.h => gpu_stream.hpp} (79%) diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp index 2f50a4280..91c8e39de 100644 --- a/ark/api/executor.cpp +++ b/ark/api/executor.cpp @@ -14,11 +14,11 @@ #include "codegen.hpp" #include "env.h" #include "file_io.h" -#include "gpu/gpu.h" -#include "gpu/gpu_event.h" -#include "gpu/gpu_kernel.h" -#include "gpu/gpu_logging.h" -#include "gpu/gpu_manager.h" +#include "gpu/gpu.hpp" +#include "gpu/gpu_event.hpp" +#include "gpu/gpu_kernel.hpp" +#include "gpu/gpu_logging.hpp" +#include "gpu/gpu_manager.hpp" #include "logging.h" #include "model/model_buffer.hpp" #include "model/model_data_type.hpp" @@ -140,7 +140,7 @@ static size_t tensor_stride_bytes(const Json &tensor) { class Executor::Impl { public: - Impl(int device_id, Stream stream, const std::string &name); + Impl(int device_id, Stream stream, const std::string &name, bool loop_mode); ~Impl() = default; void init(const PlanJson& plan); @@ -174,6 +174,8 @@ class Executor::Impl { protected: int device_id_; std::string name_; + bool loop_mode_; + gpuStream stream_raw_; int rank_; @@ -203,8 +205,9 @@ class Executor::Impl { rank_to_sm_channels_; }; -Executor::Impl::Impl(int device_id, Stream stream, const std::string &name) - : device_id_(device_id), name_(name) { +Executor::Impl::Impl(int device_id, Stream stream, const std::string &name, + bool loop_mode) + : 
device_id_(device_id), name_(name), loop_mode_(loop_mode) { if (device_id < 0) { ERR(InvalidUsageError, "Invalid device ID ", device_id); } @@ -251,7 +254,6 @@ void Executor::Impl::init(const PlanJson &plan_json) { int threads_per_block = static_cast( codegen_->num_warps_per_proc() * gpu_manager->info().threads_per_warp); int num_sm = static_cast(codegen_->num_procs()); - int *flag = flag_->ref(); size_t smem_block_total = static_cast(gpu_manager->info().smem_block_total); @@ -260,11 +262,19 @@ void Executor::Impl::init(const PlanJson &plan_json) { init_channels(remote_ranks); } + std::string kernel_name; + if (loop_mode_) { + kernel_name = "ark_loop_kernel"; + } else { + kernel_name = "ark_kernel"; + } + if (!name_.empty()) { + kernel_name += "_" + name_; + } + kernel_ = std::shared_ptr(new GpuKernel( device_id_, codegen_->code(), {threads_per_block, 1, 1}, {num_sm, 1, 1}, - std::max(smem_block_total, size_t(4)), name_, - {std::pair{buffer_->ref(), sizeof(buffer_->ref())}, - std::pair{flag, sizeof(flag)}})); + std::max(smem_block_total, size_t(4)), kernel_name)); } void Executor::Impl::init_communicator() { @@ -669,51 +679,76 @@ void Executor::Impl::launch(int64_t max_spin_count) { proxy_service_->startProxy(); } - // Initialize loop flags. - atomicStoreRelaxed(flag_->ref(), 0); - kernel_->launch(stream_raw_); - timer_end_->record(stream_raw_); + if (loop_mode_) { + // Initialize loop flags. 
+ atomicStoreRelaxed(flag_->ref(), 0); + void *buf_ptr = buffer_->ref(); + void *flag_ptr = flag_->ref(); + std::vector args = {&buf_ptr, &flag_ptr}; + kernel_->launch(stream_raw_, args); + } is_recording_ = true; is_launched_ = true; } void Executor::Impl::run(int iter) { - if (iter > 0) { + if (iter <= 0) return; + if (loop_mode_) { while (atomicLoadRelaxed(flag_->ref()) > 0) { } atomicStoreRelaxed(flag_->ref(), iter); + } else { + void *buf_ptr = buffer_->ref(); + int i = 0; + std::vector args = {&buf_ptr, reinterpret_cast(&i)}; + for (; i < iter; i++) { + kernel_->launch(stream_raw_, args); + } } } void Executor::Impl::wait(int64_t max_spin_count) { int64_t cnt = max_spin_count; - while (atomicLoadRelaxed(flag_->ref()) > 0) { - if (cnt-- > 0) { - continue; - } - // Check if the kernel encountered an error. - gpuError res = gpuStreamQuery(stream_raw_); - if (res == gpuSuccess) { - if (atomicLoadRelaxed(flag_->ref()) > 0) { - LOG(WARN, "Stream is finished but the loop flag is still set."); - break; + if (loop_mode_) { + while (atomicLoadRelaxed(flag_->ref()) > 0) { + if (cnt-- > 0) { + continue; + } + // Check if the kernel encountered an error. + gpuError res = gpuStreamQuery(stream_raw_); + if (res == gpuSuccess) { + if (atomicLoadRelaxed(flag_->ref()) > 0) { + LOG(WARN, + "Stream is finished but the loop flag is still set."); + break; + } else { + LOG(WARN, + "wait() is delayed by a stream query. Regarding " + "timing measurements may be inaccurate."); + break; + } + } else if (res == gpuErrorNotReady) { + cnt = max_spin_count; } else { - LOG(WARN, - "wait() is delayed by a stream query. 
Regarding " - "timing measurements may be inaccurate."); - break; + GLOG(res); } - } else if (res == gpuErrorNotReady) { - cnt = max_spin_count; - } else { - GLOG(res); } + } else { + if (max_spin_count >= 0) { + LOG(WARN, "max_spin_count is ignored in non-loop mode."); + } + GLOG(gpuStreamSynchronize(stream_raw_)); } } float Executor::Impl::stop(int64_t max_spin_count) { this->wait(max_spin_count); - atomicStoreRelaxed(flag_->ref(), -1); + if (is_recording_) { + timer_end_->record(stream_raw_); + } + if (loop_mode_) { + atomicStoreRelaxed(flag_->ref(), -1); + } GLOG(gpuStreamSynchronize(stream_raw_)); if (is_recording_) { elapsed_msec_ = timer_end_->elapsed_msec(*timer_begin_); @@ -847,8 +882,9 @@ void Executor::Impl::tensor_write(const Tensor tensor, const void *data, } Executor::Executor(int device_id, Stream stream, const std::string &name, - const std::string &plan) - : impl_(std::make_unique(device_id, stream, name)) { + const std::string &plan, bool loop_mode) + : impl_(std::make_unique(device_id, stream, name, + loop_mode)) { auto &plan_path = get_env().enforce_plan_path; if (!plan_path.empty()) { LOG(INFO, "Enforce executor plan path: ", plan_path); @@ -901,10 +937,10 @@ void Executor::tensor_write(const Tensor tensor, const void *data, size_t bytes, DefaultExecutor::DefaultExecutor( const Model &model, int device_id, Stream stream, const std::vector &config_rules, - const std::string &name) + const std::string &name, bool loop_mode) : Executor((device_id < 0) ? 
(model.rank() % get_env().num_ranks_per_host) : device_id, - stream, name, "") { + stream, name, "", loop_mode) { DefaultPlanner planner(model, impl_->device_id()); for (const auto &rule : config_rules) { planner.install_config_rule(rule); diff --git a/ark/api/planner.cpp b/ark/api/planner.cpp index 5c9d09f2e..d7fdbf807 100644 --- a/ark/api/planner.cpp +++ b/ark/api/planner.cpp @@ -6,7 +6,7 @@ #include "ark/model.hpp" #include "env.h" #include "file_io.h" -#include "gpu/gpu_manager.h" +#include "gpu/gpu_manager.hpp" #include "model/model_json.hpp" #include "model/model_node.hpp" #include "model/model_op.hpp" diff --git a/ark/codegen.cpp b/ark/codegen.cpp index cd6206284..02a5d9ad9 100644 --- a/ark/codegen.cpp +++ b/ark/codegen.cpp @@ -174,7 +174,7 @@ CodeGenerator::Impl::Impl(const PlanJson &plan, {"@NUM_WARPS_PER_BLOCK@", std::to_string(num_warps_per_proc_)}, {"@DEFINITIONS@", definitions_ss.str()}, {"@BODY@", body_ss.str()}, - {"@NAME@", name_}, + {"@NAME@", (name_.empty() ? "" : "_" + name_)}, }; code_ = replace(template_code, replacements); } diff --git a/ark/gpu/gpu.h b/ark/gpu/gpu.hpp similarity index 98% rename from ark/gpu/gpu.h rename to ark/gpu/gpu.hpp index 2f1eba3ba..531d6c7ee 100644 --- a/ark/gpu/gpu.h +++ b/ark/gpu/gpu.hpp @@ -1,8 +1,8 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. 
-#ifndef ARK_GPU_H_ -#define ARK_GPU_H_ +#ifndef ARK_GPU_HPP_ +#define ARK_GPU_HPP_ #include @@ -125,6 +125,7 @@ ARK_GPU_DEFINE_CONSTANT_ALIAS(gpuPointerAttributeSyncMemops, // runtime API ARK_GPU_DEFINE_FUNC_ALIAS(gpuGetErrorString, cudaGetErrorString, hipGetErrorString); +ARK_GPU_DEFINE_FUNC_ALIAS(gpuGetLastError, cudaGetLastError, hipGetLastError); ARK_GPU_DEFINE_FUNC_ALIAS(gpuDeviceGetAttribute, cudaDeviceGetAttribute, hipDeviceGetAttribute); ARK_GPU_DEFINE_FUNC_ALIAS(gpuDeviceSynchronize, cudaDeviceSynchronize, @@ -183,4 +184,4 @@ ARK_GPU_DEFINE_FUNC_ALIAS(gpuPointerSetAttribute, cuPointerSetAttribute, } // namespace ark -#endif // ARK_GPU_H_ +#endif // ARK_GPU_HPP_ diff --git a/ark/gpu/gpu_compile.cpp b/ark/gpu/gpu_compile.cpp index b1c078af4..11e172f07 100644 --- a/ark/gpu/gpu_compile.cpp +++ b/ark/gpu/gpu_compile.cpp @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -#include "gpu/gpu_compile.h" +#include "gpu/gpu_compile.hpp" #include #include @@ -22,7 +22,7 @@ #include "cpu_timer.h" #include "env.h" #include "file_io.h" -#include "gpu/gpu_logging.h" +#include "gpu/gpu_logging.hpp" #include "utils/utils_string.hpp" #define ARK_DEBUG_KERNEL 0 diff --git a/ark/gpu/gpu_compile.h b/ark/gpu/gpu_compile.hpp similarity index 78% rename from ark/gpu/gpu_compile.h rename to ark/gpu/gpu_compile.hpp index 58048e78c..8b9e1a9fd 100644 --- a/ark/gpu/gpu_compile.h +++ b/ark/gpu/gpu_compile.hpp @@ -1,8 +1,8 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. 
-#ifndef ARK_GPU_COMPILE_H_ -#define ARK_GPU_COMPILE_H_ +#ifndef ARK_GPU_COMPILE_HPP_ +#define ARK_GPU_COMPILE_HPP_ #include #include @@ -16,4 +16,4 @@ const std::string gpu_compile(const std::vector &codes, } // namespace ark -#endif // ARK_GPU_COMPILE_H_ +#endif // ARK_GPU_COMPILE_HPP_ diff --git a/ark/gpu/gpu_event.cpp b/ark/gpu/gpu_event.cpp index cbc45d9a6..06779b91a 100644 --- a/ark/gpu/gpu_event.cpp +++ b/ark/gpu/gpu_event.cpp @@ -1,10 +1,10 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -#include "gpu/gpu_event.h" +#include "gpu/gpu_event.hpp" -#include "gpu/gpu_logging.h" -#include "gpu/gpu_manager.h" +#include "gpu/gpu_logging.hpp" +#include "gpu/gpu_manager.hpp" namespace ark { class GpuEvent::Impl { diff --git a/ark/gpu/gpu_event.h b/ark/gpu/gpu_event.hpp similarity index 84% rename from ark/gpu/gpu_event.h rename to ark/gpu/gpu_event.hpp index 081f0203b..bd2a7c952 100644 --- a/ark/gpu/gpu_event.h +++ b/ark/gpu/gpu_event.hpp @@ -1,12 +1,12 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -#ifndef ARK_GPU_EVENT_H_ -#define ARK_GPU_EVENT_H_ +#ifndef ARK_GPU_EVENT_HPP_ +#define ARK_GPU_EVENT_HPP_ #include -#include "gpu/gpu.h" +#include "gpu/gpu.hpp" namespace ark { @@ -33,4 +33,4 @@ class GpuEvent { }; } // namespace ark -#endif // ARK_GPU_EVENT_H_ +#endif // ARK_GPU_EVENT_HPP_ diff --git a/ark/gpu/gpu_kernel.cpp b/ark/gpu/gpu_kernel.cpp index 46f467f51..d4412f80e 100644 --- a/ark/gpu/gpu_kernel.cpp +++ b/ark/gpu/gpu_kernel.cpp @@ -1,50 +1,38 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. 
-#include "gpu_kernel.h" +#include "gpu_kernel.hpp" #include #include -#include "gpu.h" -#include "gpu_compile.h" -#include "gpu_logging.h" -#include "gpu_manager.h" +#include "gpu.hpp" +#include "gpu_compile.hpp" +#include "gpu_logging.hpp" +#include "gpu_manager.hpp" namespace ark { GpuKernel::GpuKernel(int gpu_id, const std::string& code, const std::array& block_dim, const std::array& grid_dim, size_t smem_bytes, - const std::string& kernel_name, - std::initializer_list> args) { - this->init(gpu_id, code, block_dim, grid_dim, smem_bytes, kernel_name, - args); + const std::string& kernel_name) { + this->init(gpu_id, code, block_dim, grid_dim, smem_bytes, kernel_name); } void GpuKernel::init(int gpu_id, const std::string& code, const std::array& block_dim, const std::array& grid_dim, size_t smem_bytes, - const std::string& kernel_name, - std::initializer_list> args) { + const std::string& kernel_name) { gpu_manager_ = GpuManager::get_instance(gpu_id); code_ = code; block_dim_ = block_dim; grid_dim_ = grid_dim; smem_bytes_ = smem_bytes; kernel_name_ = kernel_name; - params_ptr_.resize(args.size()); - args_.resize(args.size()); if (kernel_name_.size() == 0) { ERR(InvalidUsageError, "Invalid kernel name: ", kernel_name_); } - size_t idx = 0; - for (auto& pair : args) { - args_[idx].reset(new uint8_t[pair.second]); - std::memcpy(args_[idx].get(), &(pair.first), pair.second); - params_ptr_[idx] = static_cast(args_[idx].get()); - idx++; - } } void GpuKernel::compile() { @@ -68,12 +56,13 @@ void GpuKernel::compile() { dynamic_smem_size_bytes)); } -void GpuKernel::launch(gpuStream stream) { +void GpuKernel::launch(gpuStream stream, std::vector& args) { if (!this->is_compiled()) { ERR(InvalidUsageError, "Kernel is not compiled yet."); } gpu_manager_->launch(function_, grid_dim_, block_dim_, smem_bytes_, stream, - params_ptr_.data(), nullptr); + args.data(), nullptr); + GLOG(gpuGetLastError()); } gpuDeviceptr GpuKernel::get_global(const std::string& name, diff --git 
a/ark/gpu/gpu_kernel.h b/ark/gpu/gpu_kernel.hpp similarity index 68% rename from ark/gpu/gpu_kernel.h rename to ark/gpu/gpu_kernel.hpp index b3be79071..5308cfead 100644 --- a/ark/gpu/gpu_kernel.h +++ b/ark/gpu/gpu_kernel.hpp @@ -1,13 +1,14 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -#ifndef ARK_GPU_KERNEL_H_ -#define ARK_GPU_KERNEL_H_ +#ifndef ARK_GPU_KERNEL_HPP_ +#define ARK_GPU_KERNEL_HPP_ #include #include +#include -#include "gpu_stream.h" +#include "gpu_stream.hpp" namespace ark { @@ -18,16 +19,14 @@ class GpuKernel { GpuKernel(int gpu_id, const std::string& codes, const std::array& block_dim, const std::array& grid_dim, size_t smem_bytes, - const std::string& kernel_name, - std::initializer_list> args = {}); + const std::string& kernel_name); void init(int gpu_id, const std::string& codes, const std::array& block_dim, const std::array& grid_dim, size_t smem_bytes, - const std::string& kernel_name, - std::initializer_list> args = {}); + const std::string& kernel_name); void compile(); - void launch(gpuStream stream); + void launch(gpuStream stream, std::vector& args); gpuDeviceptr get_global(const std::string& name, bool ignore_not_found = false) const; @@ -43,10 +42,8 @@ class GpuKernel { std::string bin_; gpuModule module_; gpuFunction function_ = nullptr; - std::vector params_ptr_; - std::vector> args_; }; } // namespace ark -#endif // ARK_GPU_KERNEL_H_ +#endif // ARK_GPU_KERNEL_HPP_ diff --git a/ark/gpu/gpu_kernel_test.cpp b/ark/gpu/gpu_kernel_test.cpp index 870ad7ab9..342ef9656 100644 --- a/ark/gpu/gpu_kernel_test.cpp +++ b/ark/gpu/gpu_kernel_test.cpp @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. 
-#include "gpu/gpu_kernel.h" +#include "gpu/gpu_kernel.hpp" #include "unittest/unittest_utils.h" @@ -9,7 +9,13 @@ const std::string void_kernel = "extern \"C\" __global__ void kernel() {}"; ark::unittest::State test_gpu_kernel() { ark::GpuKernel kernel(0, void_kernel, {1, 1, 1}, {1, 1, 1}, 0, "kernel"); + UNITTEST_TRUE(!kernel.is_compiled()); kernel.compile(); + UNITTEST_TRUE(kernel.is_compiled()); + std::vector args; + for (int i = 0; i < 10; i++) { + kernel.launch(nullptr, args); + } return ark::unittest::SUCCESS; } diff --git a/ark/gpu/gpu_logging.h b/ark/gpu/gpu_logging.hpp similarity index 92% rename from ark/gpu/gpu_logging.h rename to ark/gpu/gpu_logging.hpp index b14435b8b..5e35cc003 100644 --- a/ark/gpu/gpu_logging.h +++ b/ark/gpu/gpu_logging.hpp @@ -1,10 +1,10 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -#ifndef ARK_GPU_LOGGING_H_ -#define ARK_GPU_LOGGING_H_ +#ifndef ARK_GPU_LOGGING_HPP_ +#define ARK_GPU_LOGGING_HPP_ -#include "gpu/gpu.h" +#include "gpu/gpu.hpp" #include "logging.h" #define GLOG(cmd) \ @@ -29,4 +29,4 @@ } \ } while (0) -#endif // ARK_GPU_LOGGING_H_ +#endif // ARK_GPU_LOGGING_HPP_ diff --git a/ark/gpu/gpu_manager.cpp b/ark/gpu/gpu_manager.cpp index fc841fa32..572932e35 100644 --- a/ark/gpu/gpu_manager.cpp +++ b/ark/gpu/gpu_manager.cpp @@ -1,11 +1,11 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -#include "gpu/gpu_manager.h" +#include "gpu/gpu_manager.hpp" #include -#include "gpu/gpu_logging.h" +#include "gpu/gpu_logging.hpp" #include "utils/utils_string.hpp" namespace ark { diff --git a/ark/gpu/gpu_manager.h b/ark/gpu/gpu_manager.hpp similarity index 88% rename from ark/gpu/gpu_manager.h rename to ark/gpu/gpu_manager.hpp index 93a48cf7b..eeeda4d94 100644 --- a/ark/gpu/gpu_manager.h +++ b/ark/gpu/gpu_manager.hpp @@ -1,16 +1,16 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. 
-#ifndef ARK_GPU_MANAGER_H_ -#define ARK_GPU_MANAGER_H_ +#ifndef ARK_GPU_MANAGER_HPP_ +#define ARK_GPU_MANAGER_HPP_ #include #include "arch.hpp" -#include "gpu/gpu.h" -#include "gpu/gpu_event.h" -#include "gpu/gpu_memory.h" -#include "gpu/gpu_stream.h" +#include "gpu/gpu.hpp" +#include "gpu/gpu_event.hpp" +#include "gpu/gpu_memory.hpp" +#include "gpu/gpu_stream.hpp" namespace ark { @@ -62,4 +62,4 @@ class GpuManager { } // namespace ark -#endif // ARK_GPU_MANAGER_H_ +#endif // ARK_GPU_MANAGER_HPP_ diff --git a/ark/gpu/gpu_memory.cpp b/ark/gpu/gpu_memory.cpp index 184db457c..9a854f521 100644 --- a/ark/gpu/gpu_memory.cpp +++ b/ark/gpu/gpu_memory.cpp @@ -1,11 +1,11 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -#include "gpu/gpu_memory.h" +#include "gpu/gpu_memory.hpp" -#include "gpu/gpu.h" -#include "gpu/gpu_logging.h" -#include "gpu/gpu_manager.h" +#include "gpu/gpu.hpp" +#include "gpu/gpu_logging.hpp" +#include "gpu/gpu_manager.hpp" namespace ark { diff --git a/ark/gpu/gpu_memory.h b/ark/gpu/gpu_memory.hpp similarity index 87% rename from ark/gpu/gpu_memory.h rename to ark/gpu/gpu_memory.hpp index cd7a6f04f..6b277d40b 100644 --- a/ark/gpu/gpu_memory.h +++ b/ark/gpu/gpu_memory.hpp @@ -1,13 +1,13 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. 
-#ifndef ARK_GPU_MEMORY_H_ -#define ARK_GPU_MEMORY_H_ +#ifndef ARK_GPU_MEMORY_HPP_ +#define ARK_GPU_MEMORY_HPP_ #include #include -#include "gpu/gpu.h" +#include "gpu/gpu.hpp" namespace ark { @@ -40,7 +40,7 @@ class GpuHostMemory { GpuHostMemory(const GpuHostMemory&) = delete; GpuHostMemory& operator=(const GpuHostMemory&) = delete; - template + template T* ref() const { return reinterpret_cast(ptr_); } @@ -54,4 +54,4 @@ class GpuHostMemory { } // namespace ark -#endif // ARK_GPU_MEMORY_H_ +#endif // ARK_GPU_MEMORY_HPP_ diff --git a/ark/gpu/gpu_stream.cpp b/ark/gpu/gpu_stream.cpp index 52502365a..17d4e21f5 100644 --- a/ark/gpu/gpu_stream.cpp +++ b/ark/gpu/gpu_stream.cpp @@ -1,10 +1,10 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -#include "gpu/gpu_stream.h" +#include "gpu/gpu_stream.hpp" -#include "gpu/gpu_logging.h" -#include "gpu/gpu_manager.h" +#include "gpu/gpu_logging.hpp" +#include "gpu/gpu_manager.hpp" namespace ark { class GpuStream::Impl { diff --git a/ark/gpu/gpu_stream.h b/ark/gpu/gpu_stream.hpp similarity index 79% rename from ark/gpu/gpu_stream.h rename to ark/gpu/gpu_stream.hpp index e76f01827..9d8775f95 100644 --- a/ark/gpu/gpu_stream.h +++ b/ark/gpu/gpu_stream.hpp @@ -1,12 +1,12 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -#ifndef ARK_GPU_STREAM_H_ -#define ARK_GPU_STREAM_H_ +#ifndef ARK_GPU_STREAM_HPP_ +#define ARK_GPU_STREAM_HPP_ #include -#include "gpu/gpu.h" +#include "gpu/gpu.hpp" namespace ark { @@ -30,4 +30,4 @@ class GpuStream { }; } // namespace ark -#endif // ARK_GPU_STREAM_H_ +#endif // ARK_GPU_STREAM_HPP_ diff --git a/ark/include/ark/executor.hpp b/ark/include/ark/executor.hpp index 75dc81c17..f0a108a1f 100644 --- a/ark/include/ark/executor.hpp +++ b/ark/include/ark/executor.hpp @@ -20,7 +20,7 @@ class Executor { public: /// Constructor. 
Executor(int device_id, Stream stream, const std::string &name, - const std::string &plan); + const std::string &plan, bool loop_mode = true); /// Destructor. ~Executor(); @@ -96,7 +96,7 @@ class DefaultExecutor : public Executor { DefaultExecutor( const Model &model, int device_id = -1, Stream stream = nullptr, const std::vector &config_rules = {}, - const std::string &name = "DefaultExecutor"); + const std::string &name = "DefaultExecutor", bool loop_mode = true); }; } // namespace ark diff --git a/ark/include/kernels/kernel_template.in b/ark/include/kernels/kernel_template.in index ea1862920..a8a56f141 100644 --- a/ark/include/kernels/kernel_template.in +++ b/ark/include/kernels/kernel_template.in @@ -33,12 +33,12 @@ __device__ sync::State ARK_LOOP_SYNC_STATE; @DEFINITIONS@ -__device__ void ark_loop_body(char *_buf, int _iter) { +__device__ void ark_body(char *_buf, int _iter) { @BODY@ } extern "C" __global__ __launch_bounds__(ARK_WARPS_PER_BLOCK * Arch::ThreadsPerWarp, 1) -void @NAME@(char *_buf, int *_iter) { +void ark_loop_kernel@NAME@(char *_buf, int *_iter) { int *shared_mem = (int *)_ARK_SMEM; for (int i = threadIdx.x; i < ARK_SMEM_RESERVED_BYTES / sizeof(int); i += blockDim.x) { shared_mem[i] = 0; @@ -52,10 +52,10 @@ void @NAME@(char *_buf, int *_iter) { sync_gpu<@NUM_BLOCKS@>(ARK_LOOP_SYNC_STATE); if (ARK_ITER < 0) return; - ark_loop_body(_buf, 0); + ark_body(_buf, 0); for (int _i = 1; _i < ARK_ITER; ++_i) { sync_gpu<@NUM_BLOCKS@>(ARK_LOOP_SYNC_STATE); - ark_loop_body(_buf, _i); + ark_body(_buf, _i); } if (threadIdx.x == 0) { __threadfence_system(); @@ -67,3 +67,12 @@ void @NAME@(char *_buf, int *_iter) { sync_gpu<@NUM_BLOCKS@>(ARK_LOOP_SYNC_STATE); } } + +extern "C" __global__ __launch_bounds__(ARK_WARPS_PER_BLOCK * Arch::ThreadsPerWarp, 1) +void ark_kernel@NAME@(char *_buf, int _iter) { + int *shared_mem = (int *)_ARK_SMEM; + for (int i = threadIdx.x; i < ARK_SMEM_RESERVED_BYTES / sizeof(int); i += blockDim.x) { + shared_mem[i] = 0; + } + 
ark_body(_buf, _iter); +} diff --git a/ark/ops/ops_matmul_test.cpp b/ark/ops/ops_matmul_test.cpp index 4304a19e2..6d09b54d6 100644 --- a/ark/ops/ops_matmul_test.cpp +++ b/ark/ops/ops_matmul_test.cpp @@ -3,7 +3,7 @@ #include -#include "gpu/gpu.h" +#include "gpu/gpu.hpp" #include "logging.h" #include "model/model_node.hpp" #include "model/model_op.hpp" diff --git a/ark/ops/ops_test_common.cpp b/ark/ops/ops_test_common.cpp index 60ffc9dc2..bec69c456 100644 --- a/ark/ops/ops_test_common.cpp +++ b/ark/ops/ops_test_common.cpp @@ -10,7 +10,7 @@ #include "ark/planner.hpp" #include "ark/random.hpp" #include "env.h" -#include "gpu/gpu_logging.h" +#include "gpu/gpu_logging.hpp" #include "logging.h" #include "model/model_data_type.hpp" #include "model/model_tensor.hpp" diff --git a/python/ark/runtime.py b/python/ark/runtime.py index 33db1fb5c..d54f85c36 100644 --- a/python/ark/runtime.py +++ b/python/ark/runtime.py @@ -101,12 +101,11 @@ def running(self) -> bool: def launch( self, - rank: int = 0, - world_size: int = 1, gpu_id: int = 0, plan: str = "", plan_path: str = "", stream: int = 0, + loop_mode: bool = True, ): """ Create an executor and schedule the ARK model. 
The scheduler will generate @@ -135,6 +134,7 @@ def launch( stream, "ArkRuntime", plan, + loop_mode, ) self.executor = _RuntimeState.executor self.executor.compile() diff --git a/python/executor_py.cpp b/python/executor_py.cpp index 979cb2952..e782a99fe 100644 --- a/python/executor_py.cpp +++ b/python/executor_py.cpp @@ -43,9 +43,11 @@ static void tensor_read(ark::Executor *exe, const ark::Tensor &tensor, void register_executor(py::module &m) { py::class_(m, "_Executor") .def(py::init([](int device_id, uintptr_t stream, - const std::string &name, const std::string &plan) { - return new ark::Executor( - device_id, reinterpret_cast(stream), name, plan); + const std::string &name, const std::string &plan, + bool loop_mode) { + return new ark::Executor(device_id, + reinterpret_cast(stream), + name, plan, loop_mode); })) .def("device_id", &ark::Executor::device_id) .def("stream", From 55755bbe2e2fbc36195f7786280689bde3170ec2 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Sun, 14 Jul 2024 14:19:35 -0700 Subject: [PATCH 37/61] do not force noinline --- ark/codegen.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ark/codegen.cpp b/ark/codegen.cpp index cd6206284..0d4b14a09 100644 --- a/ark/codegen.cpp +++ b/ark/codegen.cpp @@ -213,7 +213,7 @@ std::string CodeGenerator::Impl::def_task(const Json &task_json) { for (auto &op_json : task_json["Ops"]) { ss << this->def_op(op_json, task_json["Id"], op_idx++); } - ss << "__noinline__ __device__ void t" << task_json["Id"] + ss << "__device__ void t" << task_json["Id"] << "(char* _buf, int _idx, int _spw) {\n"; op_idx = 0; for (auto &op_json : task_json["Ops"]) { From b29eaaefb5b969a8e0ec8b8e3813e5e3245e7825 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Sun, 14 Jul 2024 21:25:20 +0000 Subject: [PATCH 38/61] wip --- arkprof.py | 4 +++- python/ark/profiler.py | 10 +++++----- python/ark/runtime.py | 11 +++++++++-- python/ark/tensor.py | 18 ++++++++++++------ python/unittest/unittest_common.py | 8 +++++++- 
5 files changed, 36 insertions(+), 15 deletions(-) diff --git a/arkprof.py b/arkprof.py index 782bba560..9e67c2dfc 100644 --- a/arkprof.py +++ b/arkprof.py @@ -1,4 +1,6 @@ import ark import sys -ark.Profiler(ark.Plan.from_file(sys.argv[1])).run(iter=1000, profile_processor_groups=False) +ark.Profiler(ark.Plan.from_file(sys.argv[1])).run( + iter=1000, profile_processor_groups=False +) diff --git a/python/ark/profiler.py b/python/ark/profiler.py index 56233247c..c161b24e6 100644 --- a/python/ark/profiler.py +++ b/python/ark/profiler.py @@ -8,9 +8,9 @@ from .planner import Plan -def timeit(plan: Plan, iter: int): +def timeit(plan: Plan, iter: int, loop_mode: bool): with Runtime() as rt: - rt.launch(plan=plan) + rt.launch(plan=plan, loop_mode=loop_mode) start_time = time.time() rt.run(iter=iter) end_time = time.time() @@ -21,8 +21,8 @@ class Profiler: def __init__(self, plan: Plan): self.plan = plan - def run(self, iter: int = 1000, profile_processor_groups: bool = False): - sys.stderr.write(f"End-to-end: {timeit(self.plan, iter):.6f} seconds/iter\n") + def run(self, iter: int = 1000, loop_mode: bool = True, profile_processor_groups: bool = False): + sys.stderr.write(f"End-to-end: {timeit(self.plan, iter, loop_mode):.6f} seconds/iter\n") if not profile_processor_groups: return @@ -38,7 +38,7 @@ def run(self, iter: int = 1000, profile_processor_groups: bool = False): } for i in range(num_processor_groups): new_plan["ProcessorGroups"][0] = self.plan.processor_groups[i] - lat_per_iter = timeit(Plan(new_plan), iter) + lat_per_iter = timeit(Plan(new_plan), iter, loop_mode) sys.stderr.write( f"Processor group {i}: {lat_per_iter:.6f} seconds/iter\n" ) diff --git a/python/ark/runtime.py b/python/ark/runtime.py index b3dbe7887..51a5b7905 100644 --- a/python/ark/runtime.py +++ b/python/ark/runtime.py @@ -48,8 +48,15 @@ def print_runtime_states(): class Executor(_Executor): - def __init__(self, device_id: int, stream: int, name: str, plan: Plan): - super().__init__(device_id, 
stream, name, str(plan)) + def __init__( + self, + device_id: int, + stream: int, + name: str, + plan: Plan, + loop_mode: bool = True, + ): + super().__init__(device_id, stream, name, str(plan), loop_mode) class Runtime: diff --git a/python/ark/tensor.py b/python/ark/tensor.py index 335020769..657da1065 100644 --- a/python/ark/tensor.py +++ b/python/ark/tensor.py @@ -121,16 +121,22 @@ def to_torch( self.shape(), dtype=torch_type, device=torch.device(dev_name) ) elif list(tensor.shape) != self.shape(): - raise ValueError(f"torch tensor shape {list(tensor.shape)} " - f"does not match the tensor {self.shape()}") + raise ValueError( + f"torch tensor shape {list(tensor.shape)} " + f"does not match the tensor {self.shape()}" + ) elif tensor.dtype != torch_type: - raise ValueError(f"torch tensor dtype {tensor.dtype} " - f"does not match the tensor {torch_type}") + raise ValueError( + f"torch tensor dtype {tensor.dtype} " + f"does not match the tensor {torch_type}" + ) elif not tensor.is_contiguous(): raise ValueError("torch tensor is not contiguous in memory") elif tensor.numel() != self.nelems(): - raise ValueError(f"torch tensor size {tensor.numel()} " - f"does not match the tensor {self.nelems()}") + raise ValueError( + f"torch tensor size {tensor.numel()} " + f"does not match the tensor {self.nelems()}" + ) tensor_bytes = self.nelems() * self.dtype().element_size() rt.executor.tensor_read( self._tensor, tensor.data_ptr(), tensor_bytes, stream, True diff --git a/python/unittest/unittest_common.py b/python/unittest/unittest_common.py index 9548410b5..0c385e89a 100644 --- a/python/unittest/unittest_common.py +++ b/python/unittest/unittest_common.py @@ -9,14 +9,20 @@ def pytest_ark(need_torch: bool = False): """ Decorator for ARK unit tests. 
""" + def decorator(test_func): if need_torch: try: import torch except ImportError: - return pytest.mark.skip(reason="torch is not installed")(test_func) + return pytest.mark.skip(reason="torch is not installed")( + test_func + ) + def wrapper(*args, **kwargs): ark.init() test_func(*args, **kwargs) + return wrapper + return decorator From a7a5d46c001b143781022e2d28aaa3eee0c502b3 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Sun, 14 Jul 2024 23:56:21 +0000 Subject: [PATCH 39/61] Fix CK tile indexing --- third_party/patches/composable_kernel.patch | 89 +++++++++++++++++++-- 1 file changed, 83 insertions(+), 6 deletions(-) diff --git a/third_party/patches/composable_kernel.patch b/third_party/patches/composable_kernel.patch index 43b1afcaa..e12f19332 100644 --- a/third_party/patches/composable_kernel.patch +++ b/third_party/patches/composable_kernel.patch @@ -561,7 +561,7 @@ index 2d5dc90bf..160eef036 100644 }); diff --git a/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp b/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp -index 7bb47e9d3..2b2e8c604 100644 +index 7bb47e9d3..d495c7297 100644 --- a/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp +++ b/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp @@ -60,7 +60,7 @@ struct BlockToCTileMap_M00_N0_M01 @@ -582,7 +582,84 @@ index 7bb47e9d3..2b2e8c604 100644 { return true; } -@@ -315,7 +315,7 @@ struct BlockToCTileMap_KSplit_M00_N0_M01Adapt +@@ -177,58 +177,7 @@ struct BlockToCTileMap_M00_N0_M01Adapt + index_t idx_N0 = block_1d_id % N0; + index_t idx_M0 = block_1d_id / N0; + +- const auto M01_adapt = (idx_M0 < M0 - M0 % M01_) ? 
M01_ : M0 % M01_; +- +- index_t idx_M00 = idx_M0 / M01_; +- index_t idx_M01 = idx_M0 % M01_; +- index_t idx_N0_M01_local = idx_N0 + idx_M01 * N0; +- +- /** +- * idxN0 +- * +- * |< mtx N >| +- * +- * NPerBlock NPerBlock NPerBlock NPerBlock +- * N_0 N_1 N_2 N_3 +- * - |-----------|-----------|-----------|-----|-----|- +- * ^ | - - 0 |/----> 2 | | | | +- * | | | / | | | | | M_0 MPerBlock +- * | M | /| | | | | | +- * |-0---|---/-|-----|-----|-----------|-----|-----|- +- * | 1 | / | | | blockid | | | +- * idxM0 | | | / | V | 5 | | | M_1 MPerBlock +- * | - V 1 | - 3 | | | | +- * |-----------|-----------|-----------|-----|-----|- +- * mtx M | | | | | | +- * | | | | | | M_2 MPerBlock +- * | | | | | | +- * |-----------|-----------|-----------|-----|-----|- +- * | | | | | | +- * | | | | | | M_3 MPerBlock +- * | | | | | | +- * |-----------|-----------|-----------|-----|-----|- +- * V | | | | | | +- * - |-----------|-----------|-----------|-----|-----|- M_4 MPerBlock +- * | | | | | | +- * |-----------|-----------|-----------|-----|-----|- +- * Example: +- * assume: +- * M0 = 5 +- * N0 = 4 +- * block_1d_id = 5 +- * M01 = 2 +- * +- * idx_N0 = 1 +- * idx_M0 = 1 +- * M01_adapt = 2 +- * idx_M00 = 0 +- * idx_M01 = 1 +- * idx_N0_M01_local = 5 +- * output {1, 2} +- */ +- +- return make_tuple(idx_N0_M01_local % M01_adapt + idx_M00 * M01_, +- idx_N0_M01_local / M01_adapt); ++ return make_tuple(idx_M0, idx_N0); + } + + template +@@ -297,15 +246,7 @@ struct BlockToCTileMap_KSplit_M00_N0_M01Adapt + index_t idx_N0 = block_1d_id % N0; + index_t idx_M0 = block_1d_id / N0; + +- const auto M01_adapt = (idx_M0 < M0 - M0 % M01_) ? 
M01_ : M0 % M01_; +- +- index_t idx_M00 = idx_M0 / M01_; +- index_t idx_M01 = idx_M0 % M01_; +- index_t idx_N0_M01_local = idx_N0 + idx_M01 * N0; +- +- return make_tuple(idx_ksplit, +- idx_N0_M01_local % M01_adapt + idx_M00 * M01_, +- idx_N0_M01_local / M01_adapt); ++ return make_tuple(idx_ksplit, idx_M0, idx_N0); + } + + template +@@ -315,7 +256,7 @@ struct BlockToCTileMap_KSplit_M00_N0_M01Adapt return true; // always valid provided that user gets grid size from CalculateGridSize() } @@ -591,7 +668,7 @@ index 7bb47e9d3..2b2e8c604 100644 private: index_t M01_; -@@ -373,7 +373,7 @@ struct BlockToCTileMap_M00_N00_M01_N01 +@@ -373,7 +314,7 @@ struct BlockToCTileMap_M00_N00_M01_N01 return true; } @@ -600,7 +677,7 @@ index 7bb47e9d3..2b2e8c604 100644 { if constexpr(DeviceCTileIndexCheck) return true; // validity check moved to kernel -@@ -485,7 +485,7 @@ struct BlockToCTileMap_KSplit_M00_N00_M01_N01 +@@ -485,7 +426,7 @@ struct BlockToCTileMap_KSplit_M00_N00_M01_N01 return true; } @@ -609,7 +686,7 @@ index 7bb47e9d3..2b2e8c604 100644 { if constexpr(DeviceCTileIndexCheck) return true; // validity check moved to kernel -@@ -609,7 +609,7 @@ struct OffsettedBlockToCTileMap +@@ -609,7 +550,7 @@ struct OffsettedBlockToCTileMap } template @@ -618,7 +695,7 @@ index 7bb47e9d3..2b2e8c604 100644 { return block_to_ctile_map_.CheckValidity(c_grid_desc_m_n); } -@@ -666,7 +666,7 @@ struct BlockToCTileMap_3DGrid_KSplit +@@ -666,7 +607,7 @@ struct BlockToCTileMap_3DGrid_KSplit } template From 9c19a5ec8543863d159c96f05c007b63943c2566 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Mon, 29 Jul 2024 02:56:23 +0000 Subject: [PATCH 40/61] wip --- .vscode/settings.json | 2 - ark/api/context_manager.cpp | 42 +++++++++ ark/api/context_manager_test.cpp | 54 +++++++++++ ark/api/model.cpp | 4 +- ark/api/model_graph.cpp | 4 +- ark/api/model_test.cpp | 24 ++--- ark/api/planner.cpp | 4 +- ark/include/ark.hpp | 1 + ark/include/ark/context_manager.hpp | 24 +++++ ark/include/ark/model.hpp | 64 
+++++++------ ark/include/ark/model_graph.hpp | 3 +- ark/model/model_graph_impl.cpp | 40 ++++++++- ark/model/model_graph_impl.hpp | 36 +++++++- ark/model/model_node.hpp | 3 + ark/model/model_op.cpp | 11 +++ ark/model/model_op.hpp | 9 +- ark/ops/ops_arithmetic.cpp | 20 +++-- ark/ops/ops_cast.cpp | 10 +-- ark/ops/ops_communication.cpp | 14 +-- ark/ops/ops_copy.cpp | 5 +- ark/ops/ops_embedding.cpp | 4 +- ark/ops/ops_identity.cpp | 2 +- ark/ops/ops_math.cpp | 31 ++++--- ark/ops/ops_matmul.cpp | 6 +- ark/ops/ops_noop.cpp | 2 +- ark/ops/ops_reduce.cpp | 12 +-- ark/ops/ops_refer.cpp | 2 +- ark/ops/ops_reshape.cpp | 4 +- ark/ops/ops_rope.cpp | 5 +- ark/ops/ops_scalar.cpp | 31 ++++--- ark/ops/ops_tensor.cpp | 2 +- ark/ops/ops_transpose.cpp | 5 +- arkprof.py | 1 + examples/tutorial/context_tutorial.py | 117 ++++++++++++++++++++++++ python/ark/__init__.py | 2 +- python/ark/context_manager.py | 24 +++++ python/ark/ops.py | 125 ++++++++++++++++++++------ python/ark/profiler.py | 11 ++- python/ark_py.cpp | 2 + python/context_manager_py.cpp | 15 ++++ python/model_py.cpp | 86 ++++++++++-------- 41 files changed, 676 insertions(+), 187 deletions(-) create mode 100644 ark/api/context_manager.cpp create mode 100644 ark/api/context_manager_test.cpp create mode 100644 ark/include/ark/context_manager.hpp create mode 100644 examples/tutorial/context_tutorial.py create mode 100644 python/ark/context_manager.py create mode 100644 python/context_manager_py.cpp diff --git a/.vscode/settings.json b/.vscode/settings.json index 640196a66..00260f078 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -3,8 +3,6 @@ "cmake.environment": { "ARK_ROOT": "${workspaceFolder}/build", "ARK_IGNORE_BINARY_CACHE": "1", - "ARK_DISABLE_GRAPH_OPT": "0", - "ARK_IPC_LISTEN_PORT_BASE": "42000", // "ARK_LOG_LEVEL": "DEBUG" }, "cmake.ctestArgs": [ diff --git a/ark/api/context_manager.cpp b/ark/api/context_manager.cpp new file mode 100644 index 000000000..6d16d9e79 --- /dev/null +++ 
b/ark/api/context_manager.cpp @@ -0,0 +1,42 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#include "ark/context_manager.hpp" + +#include "model/model_graph_impl.hpp" + +namespace ark { + +class ContextManager::Impl { + public: + Impl(std::shared_ptr context_stack, + const std::map& context_map); + + ~Impl(); + + private: + std::shared_ptr context_stack_; + std::vector keys_; +}; + +ContextManager::Impl::Impl( + std::shared_ptr context_stack, + const std::map& context_map) + : context_stack_(context_stack) { + for (const auto& [key, value] : context_map) { + context_stack_->push(key, value); + keys_.push_back(key); + } +} + +ContextManager::Impl::~Impl() { + for (auto it = keys_.rbegin(); it != keys_.rend(); ++it) { + context_stack_->pop(*it); + } +} + +ContextManager::ContextManager( + Model& model, const std::map& context_map) + : impl_(std::make_shared(model.impl_->context_stack_, context_map)) {} + +} // namespace ark diff --git a/ark/api/context_manager_test.cpp b/ark/api/context_manager_test.cpp new file mode 100644 index 000000000..ff60b43bf --- /dev/null +++ b/ark/api/context_manager_test.cpp @@ -0,0 +1,54 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+ +#include "ark/model.hpp" +#include "ark/context_manager.hpp" + +#include "model/model_node.hpp" +#include "unittest/unittest_utils.h" + +ark::unittest::State test_context_manager() { + ark::Model model; + ark::Tensor t0 = model.tensor({1}, ark::FP32); + ark::Tensor t1 = model.tensor({1}, ark::FP32); + ark::Tensor t2 = model.add(t0, t1); + + ark::Tensor t3; + ark::Tensor t4; + ark::Tensor t5; + { + ark::ContextManager cm0_1(model, {{"key0", "val1"}}); + t3 = model.relu(t2); + + ark::ContextManager cm1_1(model, {{"key1", "val2"}}); + t4 = model.sqrt(t3); + } + { + ark::ContextManager cm0_2(model, {{"key0", "val3"}}); + t5 = model.exp(t2); + } + + UNITTEST_TRUE(model.verify()); + + auto compressed = model.compress(false); + UNITTEST_TRUE(compressed.verify()); + + auto nodes = compressed.nodes(); + UNITTEST_EQ(nodes.size(), 4); + + UNITTEST_EQ(nodes[0]->context.size(), 0); + UNITTEST_EQ(nodes[1]->context.size(), 1); + UNITTEST_EQ(nodes[1]->context.at("key0"), "val1"); + UNITTEST_EQ(nodes[2]->context.size(), 2); + UNITTEST_EQ(nodes[2]->context.at("key0"), "val1"); + UNITTEST_EQ(nodes[2]->context.at("key1"), "val2"); + UNITTEST_EQ(nodes[3]->context.size(), 1); + UNITTEST_EQ(nodes[3]->context.at("key0"), "val3"); + + return ark::unittest::SUCCESS; +} + +int main() { + UNITTEST(test_context_manager); + return 0; +} diff --git a/ark/api/model.cpp b/ark/api/model.cpp index ab536a33c..a5a258f71 100644 --- a/ark/api/model.cpp +++ b/ark/api/model.cpp @@ -9,9 +9,9 @@ namespace ark { -Model Model::compress() const { +Model Model::compress(bool merge_nodes) const { Model model(*this); - model.compress_nodes(); + model.compress_nodes(merge_nodes); return model; } diff --git a/ark/api/model_graph.cpp b/ark/api/model_graph.cpp index b6061a34e..d11808467 100644 --- a/ark/api/model_graph.cpp +++ b/ark/api/model_graph.cpp @@ -33,7 +33,9 @@ int ModelGraph::rank() const { return impl_->rank(); } int ModelGraph::world_size() const { return impl_->world_size(); } -void 
ModelGraph::compress_nodes() { impl_->compress_nodes(); } +void ModelGraph::compress_nodes(bool merge_nodes) { + impl_->compress_nodes(merge_nodes); +} bool ModelGraph::compressed() const { return impl_->compressed(); } diff --git a/ark/api/model_test.cpp b/ark/api/model_test.cpp index a9d332a97..785bfcd7b 100644 --- a/ark/api/model_test.cpp +++ b/ark/api/model_test.cpp @@ -36,7 +36,7 @@ ark::unittest::State test_model_basics() { // (AddOp,) // - compressed = model.compress(); + compressed = model.compress(true); UNITTEST_TRUE(compressed.verify()); UNITTEST_TRUE(compressed.compressed()); UNITTEST_EQ(compressed.nodes().size(), 1); @@ -70,7 +70,7 @@ ark::unittest::State test_model_basics() { // (AddOp,AddOp,) // - compressed = model.compress(); + compressed = model.compress(true); UNITTEST_TRUE(compressed.verify()); UNITTEST_EQ(compressed.nodes().size(), 1); @@ -104,7 +104,7 @@ ark::unittest::State test_model_basics() { // (AddOp,AddOp,ReluOp,) // - compressed = model.compress(); + compressed = model.compress(true); UNITTEST_TRUE(compressed.verify()); UNITTEST_EQ(compressed.nodes().size(), 1); @@ -143,7 +143,7 @@ ark::unittest::State test_model_basics() { // (AddOp,AddOp,ReluOp,AddOp,) // - compressed = model.compress(); + compressed = model.compress(true); UNITTEST_TRUE(compressed.verify()); auto nodes = compressed.nodes(); @@ -190,7 +190,7 @@ ark::unittest::State test_model_basics() { // (AddOp,) --+--> (AddOp,) // - compressed = model.compress(); + compressed = model.compress(true); UNITTEST_TRUE(compressed.verify()); nodes = compressed.nodes(); @@ -250,7 +250,7 @@ ark::unittest::State test_model_basics() { // (AddOp,) // - compressed = model.compress(); + compressed = model.compress(true); UNITTEST_TRUE(compressed.verify()); nodes = compressed.nodes(); @@ -312,7 +312,7 @@ ark::unittest::State test_model_basics() { // (AddOp,) // - compressed = model.compress(); + compressed = model.compress(true); UNITTEST_TRUE(compressed.verify()); nodes = compressed.nodes(); @@ 
-353,7 +353,7 @@ ark::unittest::State test_model_dependent_inputs() { ark::Tensor x4 = m.mul(x2, x3); ark::Tensor y = m.add(x0, x4); - auto compressed = m.compress(); + auto compressed = m.compress(true); auto nodes = compressed.nodes(); UNITTEST_EQ(nodes.size(), 4); auto nodes_iter = nodes.begin(); @@ -399,7 +399,7 @@ ark::unittest::State test_model_noop() { UNITTEST_TRUE(model.verify()); - auto compressed = model.compress(); + auto compressed = model.compress(true); UNITTEST_TRUE(compressed.verify()); UNITTEST_EQ(compressed.nodes().size(), 0); return ark::unittest::SUCCESS; @@ -425,7 +425,7 @@ ark::unittest::State test_model_identity() { ark::Tensor t4 = model.relu(t3); UNITTEST_TRUE(model.verify()); - auto compressed = model.compress(); + auto compressed = model.compress(true); UNITTEST_TRUE(compressed.verify()); auto nodes = compressed.nodes(); UNITTEST_EQ(nodes.size(), 3); @@ -478,7 +478,7 @@ ark::unittest::State test_model_sharding() { ark::Tensor t5 = model.relu(t4); UNITTEST_TRUE(model.verify()); - auto compressed = model.compress(); + auto compressed = model.compress(true); UNITTEST_TRUE(compressed.verify()); auto nodes = compressed.nodes(); UNITTEST_EQ(nodes.size(), 4); @@ -526,7 +526,7 @@ ark::unittest::State test_model_cumulate() { UNITTEST_TRUE(model.verify()); - auto compressed = model.compress(); + auto compressed = model.compress(true); auto nodes = compressed.nodes(); UNITTEST_EQ(nodes.size(), 5); diff --git a/ark/api/planner.cpp b/ark/api/planner.cpp index f4e7fa8ee..dba149a1e 100644 --- a/ark/api/planner.cpp +++ b/ark/api/planner.cpp @@ -69,7 +69,9 @@ std::string DefaultPlanner::Impl::plan(bool pretty) const { task_info["Id"] = next_node_id++; Json config; - if (!config_rules_.empty()) { + if (!op->config().empty()) { + config = op->config(); + } else if (!config_rules_.empty()) { const std::string op_str = op->serialize().dump(); for (auto &rule : config_rules_) { auto config_str = rule(op_str, gpu_info.arch->name()); diff --git 
a/ark/include/ark.hpp b/ark/include/ark.hpp index 2ca796172..e76687bce 100644 --- a/ark/include/ark.hpp +++ b/ark/include/ark.hpp @@ -8,6 +8,7 @@ #include // clang-format on +#include #include #include #include diff --git a/ark/include/ark/context_manager.hpp b/ark/include/ark/context_manager.hpp new file mode 100644 index 000000000..58271ea8c --- /dev/null +++ b/ark/include/ark/context_manager.hpp @@ -0,0 +1,24 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#ifndef ARK_CONTEXT_MANAGER_HPP +#define ARK_CONTEXT_MANAGER_HPP + +#include +#include + +namespace ark { + +class ContextManager { + public: + ContextManager(Model& model, + const std::map& context_map); + + private: + class Impl; + std::shared_ptr impl_; +}; + +} // namespace ark + +#endif // ARK_CONTEXT_MANAGER_HPP diff --git a/ark/include/ark/model.hpp b/ark/include/ark/model.hpp index 66551a037..35efe53d5 100644 --- a/ark/include/ark/model.hpp +++ b/ark/include/ark/model.hpp @@ -26,7 +26,7 @@ class Model : public ModelGraph { Model &operator=(const Model &other) = default; - Model compress() const; + Model compress(bool merge_nodes = false) const; int unique_tag(); @@ -87,23 +87,29 @@ class Model : public ModelGraph { // result in `output`. // Currently, only reduction along the last dimension is supported. Tensor reduce_sum(Tensor input, int axis, bool keepdims = true, - Tensor output = NullTensor, const std::string &name = ""); + Tensor output = NullTensor, + const std::string &config = "", + const std::string &name = ""); Tensor reduce_mean(Tensor input, int axis, bool keepdims = true, Tensor output = NullTensor, + const std::string &config = "", const std::string &name = ""); Tensor reduce_max(Tensor input, int axis, bool keepdims = true, - Tensor output = NullTensor, const std::string &name = ""); + Tensor output = NullTensor, + const std::string &config = "", + const std::string &name = ""); // Transposes the `input` tensor according to the given `permutation`. 
// For example, transpose(input, {0, 1 ,3, 2}) will swap the last two // dimensions of the input tensor. Currently, only 4D tensors are supported. Tensor transpose(Tensor input, const std::vector &permutation, - Tensor output = NullTensor, const std::string &name = ""); + Tensor output = NullTensor, const std::string &config = "", + const std::string &name = ""); // Performs matrix multiplication between the `input` tensor and another // `other` tensor, storing the result in `output`. Tensor matmul(Tensor input, Tensor other, Tensor output = NullTensor, bool trans_input = false, bool trans_other = false, - const std::string &name = ""); + const std::string &config = "", const std::string &name = ""); // Implements the 'im2col' method for 2D convolution layers, which takes an // `input` tensor and reshapes it to a 2D matrix by extracting image patches // from the input tensor based on the provided parameters. @@ -120,72 +126,76 @@ class Model : public ModelGraph { Tensor output = NullTensor, const std::string &name = ""); // Calculates the exponential of the `input` tensor, element-wise. Tensor exp(Tensor input, Tensor output = NullTensor, - const std::string &name = ""); + const std::string &config = "", const std::string &name = ""); // Calculates the square root of the `input` tensor, element-wise. Tensor sqrt(Tensor input, Tensor output = NullTensor, - const std::string &name = ""); + const std::string &config = "", const std::string &name = ""); // Calculates the reverse square root of the `input` tensor, element-wise. 
Tensor rsqrt(Tensor input, Tensor output = NullTensor, - const std::string &name = ""); + const std::string &config = "", const std::string &name = ""); // ReLU activation Tensor relu(Tensor input, Tensor output = NullTensor, - const std::string &name = ""); + const std::string &config = "", const std::string &name = ""); // Copy the `input` tensor to `output` tensor Tensor copy(Tensor input, Tensor output = NullTensor, - const std::string &name = ""); + const std::string &config = "", const std::string &name = ""); Tensor copy(float val, Tensor output = NullTensor, - const std::string &name = ""); + const std::string &config = "", const std::string &name = ""); // Applies the Gaussian Error Linear Unit (GELU) activation function to the // `input` tensor, element-wise. GELU is a smooth approximation of the // rectifier function and is widely used in deep learning models. Tensor gelu(Tensor input, Tensor output = NullTensor, - const std::string &name = ""); + const std::string &config = "", const std::string &name = ""); // Sigmoid activation Tensor sigmoid(Tensor input, Tensor output = NullTensor, + const std::string &config = "", const std::string &name = ""); // Performs rotary position embedding (RoPE) on the `input` tensor Tensor rope(Tensor input, Tensor other, Tensor output = NullTensor, - const std::string &name = ""); + const std::string &config = "", const std::string &name = ""); // Performs an element-wise addition operator between the `input` tensor // and the `other` tensor Tensor add(Tensor input, Tensor other, Tensor output = NullTensor, - const std::string &name = ""); + const std::string &config = "", const std::string &name = ""); Tensor add(Tensor input, float value, Tensor output = NullTensor, - const std::string &name = ""); + const std::string &config = "", const std::string &name = ""); // Performs an element-wise subtraction operator between the `input` tensor // and the `other` tensor Tensor sub(Tensor input, Tensor other, Tensor output = 
NullTensor, - const std::string &name = ""); + const std::string &config = "", const std::string &name = ""); Tensor sub(Tensor input, float value, Tensor output = NullTensor, - const std::string &name = ""); + const std::string &config = "", const std::string &name = ""); // Performs an element-wise multiplication operator between the `input` // tensor and the `other` tensor, Tensor mul(Tensor input, Tensor other, Tensor output = NullTensor, - const std::string &name = ""); + const std::string &config = "", const std::string &name = ""); Tensor mul(Tensor input, float value, Tensor output = NullTensor, - const std::string &name = ""); + const std::string &config = "", const std::string &name = ""); // Performs an element-wise division operator between the `input` // tensor and the `other` tensor, Tensor div(Tensor input, Tensor other, Tensor output = NullTensor, - const std::string &name = ""); + const std::string &config = "", const std::string &name = ""); Tensor div(Tensor input, float value, Tensor output = NullTensor, - const std::string &name = ""); + const std::string &config = "", const std::string &name = ""); Tensor send(Tensor input, int remote_rank, int tag, - Tensor output = NullTensor, const std::string &name = ""); + Tensor output = NullTensor, const std::string &config = "", + const std::string &name = ""); // Blocks the execution until the corresponding 'send' operator with the // specified `id` is completed. - Tensor send_done(Tensor input, const std::string &name = ""); + Tensor send_done(Tensor input, const std::string &config = "", + const std::string &name = ""); // Receives a tensor from a source rank (@p src_rank), identified by the // `id` parameter. Blocks the execution until the corresponding 'recv' // operator is completed. 
Tensor recv(Tensor output, int remote_rank, int tag, - const std::string &name = ""); + const std::string &config = "", const std::string &name = ""); // Tensor put_packet(Tensor input, Tensor local_tmp_buf, Tensor recv_buf, int id, int rank, int dst_rank, size_t dst_offset, - int flag, const std::string &name = ""); + int flag, const std::string &config = "", + const std::string &name = ""); // Performs an all-reduce operator across all ranks, aggregating the input // tensors. Takes the `input` tensor, the current GPU's rank, and the // total number of ranks `rank_num`. @@ -200,10 +210,12 @@ class Model : public ModelGraph { const std::string &name = ""); /// Embedding layer. Tensor embedding(Tensor input, Tensor weight, Tensor output = NullTensor, + const std::string &config = "", const std::string &name = ""); /// Tensor type casting. Tensor cast(Tensor input, const DataType &data_type, - Tensor output = NullTensor, const std::string &name = ""); + Tensor output = NullTensor, const std::string &config = "", + const std::string &name = ""); // sync across multi devices Tensor device_sync(Tensor input, int npeers, const std::string &name = ""); diff --git a/ark/include/ark/model_graph.hpp b/ark/include/ark/model_graph.hpp index bd7c59033..f6390a2a9 100644 --- a/ark/include/ark/model_graph.hpp +++ b/ark/include/ark/model_graph.hpp @@ -25,7 +25,7 @@ class ModelGraph { int world_size() const; - void compress_nodes(); + void compress_nodes(bool merge_nodes = false); bool compressed() const; @@ -38,6 +38,7 @@ class ModelGraph { protected: friend class Model; + friend class ContextManager; class Impl; std::unique_ptr impl_; diff --git a/ark/model/model_graph_impl.cpp b/ark/model/model_graph_impl.cpp index 17410d23f..53a7fa851 100644 --- a/ark/model/model_graph_impl.cpp +++ b/ark/model/model_graph_impl.cpp @@ -17,6 +17,36 @@ namespace ark { +ModelGraphContextStack::ModelGraphContextStack(const ModelGraphContextStack &other) { + for (const auto &pair : other.storage_) { + 
for (const auto &value : pair.second) { + this->storage_[pair.first].push_back(value); + } + } +} + +void ModelGraphContextStack::push(const std::string &key, const std::string &value) { + this->storage_[key].push_back(std::make_shared(value)); +} + +void ModelGraphContextStack::pop(const std::string &key) { + auto it = this->storage_.find(key); + if (it == this->storage_.end() || it->second.empty()) { + ERR(ModelError, "context stack is empty"); + } + it->second.pop_back(); +} + +std::map ModelGraphContextStack::current_context() const { + std::map cur; + for (const auto &pair : this->storage_) { + if (!pair.second.empty()) { + cur[pair.first] = *pair.second.back(); + } + } + return cur; +} + ModelGraph::Impl::Impl(const ModelGraph::Impl &other) { *this = other; } ModelGraph::Impl &ModelGraph::Impl::operator=(const ModelGraph::Impl &other) { @@ -25,6 +55,7 @@ ModelGraph::Impl &ModelGraph::Impl::operator=(const ModelGraph::Impl &other) { for (const auto &node : other.nodes_) { ModelNodeRef new_node = std::make_shared(); new_node->ops = node->ops; + new_node->context = node->context; node_map.emplace(node, new_node); nodes_.push_back(new_node); } @@ -61,13 +92,16 @@ ModelGraph::Impl &ModelGraph::Impl::operator=(const ModelGraph::Impl &other) { rank_ = other.rank_; world_size_ = other.world_size_; compressed_ = other.compressed_; + context_stack_ = std::make_shared(*(other.context_stack_)); return *this; } -void ModelGraph::Impl::compress_nodes() { +void ModelGraph::Impl::compress_nodes(bool merge_nodes) { if (!compressed_) { this->recursive_remove_virtual_nodes(); - this->recursive_merge_nodes(); + if (merge_nodes) { + this->recursive_merge_nodes(); + } compressed_ = true; } } @@ -171,6 +205,8 @@ ModelNodeRef ModelGraph::Impl::add_op(ModelOpRef op) { producer->consumers.push_back(node); } + node->context = context_stack_->current_context(); + nodes_.push_back(node); return node; } diff --git a/ark/model/model_graph_impl.hpp b/ark/model/model_graph_impl.hpp index 
6c109b51e..fbfc54c7e 100644 --- a/ark/model/model_graph_impl.hpp +++ b/ark/model/model_graph_impl.hpp @@ -4,6 +4,7 @@ #ifndef ARK_MODEL_GRAPH_IMPL_HPP_ #define ARK_MODEL_GRAPH_IMPL_HPP_ +#include #include #include #include @@ -18,17 +19,39 @@ namespace ark { +class ModelGraphContextStack { + private: + std::map>> storage_; + + public: + ModelGraphContextStack() = default; + + ModelGraphContextStack(const ModelGraphContextStack &other); + + ~ModelGraphContextStack() = default; + + void push(const std::string &key, const std::string &value); + + void pop(const std::string &key); + + std::map current_context() const; +}; + class ModelGraph::Impl { public: Impl(int rank, int world_size) - : rank_(rank), world_size_(world_size), compressed_(false){}; + : rank_(rank), + world_size_(world_size), + compressed_(false), + context_stack_(std::make_shared()) {}; Impl(const Impl &other); Impl &operator=(const Impl &other); template - ModelOpRef create_op(const std::string &name, Args &&... args) { + ModelOpRef create_op(const std::string &config, const std::string &name, + Args &&...args) { ModelOpRef op = std::make_shared(std::forward(args)...); std::string name_copy; if (name.empty()) { @@ -41,6 +64,7 @@ class ModelGraph::Impl { if (count > 0) { name_copy += "_" + std::to_string(count); } + op->set_config(config); op->set_name(name_copy); add_op(op); return op; @@ -50,7 +74,7 @@ class ModelGraph::Impl { int world_size() const { return world_size_; } - void compress_nodes(); + void compress_nodes(bool merge_nodes = false); bool compressed() const { return compressed_; } @@ -100,6 +124,12 @@ class ModelGraph::Impl { /// True if `compress_nodes` has been called. bool compressed_; + + protected: + friend class ContextManager; + + /// Graph context stack. 
+ std::shared_ptr context_stack_; }; } // namespace ark diff --git a/ark/model/model_node.hpp b/ark/model/model_node.hpp index 7838ca120..c86b4d29a 100644 --- a/ark/model/model_node.hpp +++ b/ark/model/model_node.hpp @@ -26,6 +26,9 @@ class ModelNode { /// The list of @ref ModelNode that this @ref ModelNode depends on. UniqueList producers; + + /// Graph context of this node. + std::map context; }; } // namespace ark diff --git a/ark/model/model_op.cpp b/ark/model/model_op.cpp index b5a0645c8..e9689cdcb 100644 --- a/ark/model/model_op.cpp +++ b/ark/model/model_op.cpp @@ -87,6 +87,14 @@ const ModelOpType ModelOpT::from_name(const std::string &type_name) { return it->second; } +void ModelOp::set_config(const std::string &config) { + if (!config.empty()) { + config_ = Json::parse(config); + } else { + config_.clear(); + } +} + std::vector ModelOp::input_tensors() const { // input_tensors = read_tensors || write_tensors std::set input_tensors; @@ -179,6 +187,9 @@ Json ModelOp::serialize() const { for (auto &arg : args_) { j["Args"][arg.first] = arg.second.serialize(); } + if (!config_.empty()) { + j["Config"] = config_; + } return j; } diff --git a/ark/model/model_op.hpp b/ark/model/model_op.hpp index e8c220258..091a9f163 100644 --- a/ark/model/model_op.hpp +++ b/ark/model/model_op.hpp @@ -50,8 +50,8 @@ class ModelOp { return ""; } - virtual std::vector impl_args([ - [maybe_unused]] const Json &config) const { + virtual std::vector impl_args( + [[maybe_unused]] const Json &config) const { return {}; } @@ -60,10 +60,14 @@ class ModelOp { return {{"NumTasks", 0}, {"NumWarps", 0}, {"SramBytes", 0}}; } + void set_config(const std::string &config); + void set_name(const std::string &name) { name_ = name; } ModelOpType type() const { return type_; } + const Json &config() const { return config_; } + const std::string &name() const { return name_; } bool is_virtual() const { return is_virtual_; } @@ -100,6 +104,7 @@ class ModelOp { const std::vector &template_args = {}); 
ModelOpType type_; + Json config_; std::string name_; bool is_virtual_; std::vector read_tensors_; diff --git a/ark/ops/ops_arithmetic.cpp b/ark/ops/ops_arithmetic.cpp index aeece0d77..ef85b5d22 100644 --- a/ark/ops/ops_arithmetic.cpp +++ b/ark/ops/ops_arithmetic.cpp @@ -12,9 +12,10 @@ ModelOpAdd::ModelOpAdd(ModelTensorRef input, ModelTensorRef other, : ModelOpBroadcast2("Add", input, other, output) {} Tensor Model::add(Tensor input, Tensor other, Tensor output, - const std::string &name) { + const std::string &config, const std::string &name) { return impl_ - ->create_op(name, input.ref_, other.ref_, output.ref_) + ->create_op(config, name, input.ref_, other.ref_, + output.ref_) ->result_tensors()[0]; } @@ -23,9 +24,10 @@ ModelOpMul::ModelOpMul(ModelTensorRef input, ModelTensorRef other, : ModelOpBroadcast2("Mul", input, other, output) {} Tensor Model::mul(Tensor input, Tensor other, Tensor output, - const std::string &name) { + const std::string &config, const std::string &name) { return impl_ - ->create_op(name, input.ref_, other.ref_, output.ref_) + ->create_op(config, name, input.ref_, other.ref_, + output.ref_) ->result_tensors()[0]; } @@ -34,9 +36,10 @@ ModelOpSub::ModelOpSub(ModelTensorRef input, ModelTensorRef other, : ModelOpBroadcast2("Sub", input, other, output) {} Tensor Model::sub(Tensor input, Tensor other, Tensor output, - const std::string &name) { + const std::string &config, const std::string &name) { return impl_ - ->create_op(name, input.ref_, other.ref_, output.ref_) + ->create_op(config, name, input.ref_, other.ref_, + output.ref_) ->result_tensors()[0]; } @@ -45,9 +48,10 @@ ModelOpDiv::ModelOpDiv(ModelTensorRef input, ModelTensorRef other, : ModelOpBroadcast2("Div", input, other, output) {} Tensor Model::div(Tensor input, Tensor other, Tensor output, - const std::string &name) { + const std::string &config, const std::string &name) { return impl_ - ->create_op(name, input.ref_, other.ref_, output.ref_) + ->create_op(config, name, input.ref_, 
other.ref_, + output.ref_) ->result_tensors()[0]; } diff --git a/ark/ops/ops_cast.cpp b/ark/ops/ops_cast.cpp index 9873c8367..e9527ad8c 100644 --- a/ark/ops/ops_cast.cpp +++ b/ark/ops/ops_cast.cpp @@ -105,7 +105,7 @@ ModelOpByteCast::ModelOpByteCast(ModelTensorRef input, ModelDataType data_type, } Tensor Model::cast(Tensor input, const DataType &data_type, Tensor output, - const std::string &name) { + const std::string &config, const std::string &name) { check_null(input.ref()); if (output.is_null()) { if (input.data_type() == data_type) { @@ -119,14 +119,14 @@ Tensor Model::cast(Tensor input, const DataType &data_type, Tensor output, byte_cast_helper(input.ref(), data_type.ref(), new_shape, new_strides, new_offsets, new_padded_shape); return impl_ - ->create_op(name, input.ref(), data_type.ref(), - new_shape, new_strides, - new_offsets, new_padded_shape) + ->create_op( + config, name, input.ref(), data_type.ref(), new_shape, + new_strides, new_offsets, new_padded_shape) ->result_tensors()[0]; } } return impl_ - ->create_op(name, input.ref(), data_type.ref(), + ->create_op(config, name, input.ref(), data_type.ref(), output.ref()) ->result_tensors()[0]; } diff --git a/ark/ops/ops_communication.cpp b/ark/ops/ops_communication.cpp index e335f869e..4e76d2ede 100644 --- a/ark/ops/ops_communication.cpp +++ b/ark/ops/ops_communication.cpp @@ -157,23 +157,25 @@ Json ModelOpRecv::default_config([[maybe_unused]] const ArchRef arch) const { } Tensor Model::send(Tensor input, int remote_rank, int tag, Tensor output, - const std::string &name) { + const std::string &config, const std::string &name) { tags_.insert(tag); return impl_ - ->create_op(name, input.ref(), remote_rank, tag, + ->create_op(config, name, input.ref(), remote_rank, tag, output.ref()) ->result_tensors()[0]; } -Tensor Model::send_done(Tensor input, const std::string &name) { - return impl_->create_op(name, input.ref()) +Tensor Model::send_done(Tensor input, const std::string &config, + const std::string &name) 
{ + return impl_->create_op(config, name, input.ref()) ->result_tensors()[0]; } Tensor Model::recv(Tensor output, int remote_rank, int tag, - const std::string &name) { + const std::string &config, const std::string &name) { tags_.insert(tag); - return impl_->create_op(name, output.ref(), remote_rank, tag) + return impl_ + ->create_op(config, name, output.ref(), remote_rank, tag) ->result_tensors()[0]; } diff --git a/ark/ops/ops_copy.cpp b/ark/ops/ops_copy.cpp index 4f32966b8..4914c34a4 100644 --- a/ark/ops/ops_copy.cpp +++ b/ark/ops/ops_copy.cpp @@ -20,8 +20,9 @@ ModelOpCopy::ModelOpCopy(ModelTensorRef input, ModelTensorRef output) verify(); } -Tensor Model::copy(Tensor input, Tensor output, const std::string &name) { - return impl_->create_op(name, input.ref_, output.ref_) +Tensor Model::copy(Tensor input, Tensor output, const std::string &config, + const std::string &name) { + return impl_->create_op(config, name, input.ref_, output.ref_) ->result_tensors()[0]; } diff --git a/ark/ops/ops_embedding.cpp b/ark/ops/ops_embedding.cpp index 542c0fcac..466b9a4e5 100644 --- a/ark/ops/ops_embedding.cpp +++ b/ark/ops/ops_embedding.cpp @@ -70,9 +70,9 @@ Json ModelOpEmbedding::default_config([ } Tensor Model::embedding(Tensor input, Tensor weight, Tensor output, - const std::string &name) { + const std::string &config, const std::string &name) { return impl_ - ->create_op(name, input.ref_, weight.ref_, + ->create_op(config, name, input.ref_, weight.ref_, output.ref_) ->result_tensors()[0]; } diff --git a/ark/ops/ops_identity.cpp b/ark/ops/ops_identity.cpp index 065cd9a52..dd398d8a5 100644 --- a/ark/ops/ops_identity.cpp +++ b/ark/ops/ops_identity.cpp @@ -31,7 +31,7 @@ Tensor Model::identity(Tensor input, const std::vector &deps, for (auto &dep : deps) { deps_ref.emplace_back(dep.ref_); } - return impl_->create_op(name, input.ref_, deps_ref) + return impl_->create_op("", name, input.ref_, deps_ref) ->result_tensors()[0]; } diff --git a/ark/ops/ops_math.cpp 
b/ark/ops/ops_math.cpp index 1067c561a..b2833dcca 100644 --- a/ark/ops/ops_math.cpp +++ b/ark/ops/ops_math.cpp @@ -24,48 +24,55 @@ ModelOpMath::ModelOpMath(const std::string &type_name, ModelTensorRef input, ModelOpExp::ModelOpExp(ModelTensorRef input, ModelTensorRef output) : ModelOpMath("Exp", input, output) {} -Tensor Model::exp(Tensor input, Tensor output, const std::string &name) { - return impl_->create_op(name, input.ref_, output.ref_) +Tensor Model::exp(Tensor input, Tensor output, const std::string &config, + const std::string &name) { + return impl_->create_op(config, name, input.ref_, output.ref_) ->result_tensors()[0]; } ModelOpGelu::ModelOpGelu(ModelTensorRef input, ModelTensorRef output) : ModelOpMath("Gelu", input, output) {} -Tensor Model::gelu(Tensor input, Tensor output, const std::string &name) { - return impl_->create_op(name, input.ref_, output.ref_) +Tensor Model::gelu(Tensor input, Tensor output, const std::string &config, + const std::string &name) { + return impl_->create_op(config, name, input.ref_, output.ref_) ->result_tensors()[0]; } ModelOpRelu::ModelOpRelu(ModelTensorRef input, ModelTensorRef output) : ModelOpMath("Relu", input, output) {} -Tensor Model::relu(Tensor input, Tensor output, const std::string &name) { - return impl_->create_op(name, input.ref_, output.ref_) +Tensor Model::relu(Tensor input, Tensor output, const std::string &config, + const std::string &name) { + return impl_->create_op(config, name, input.ref_, output.ref_) ->result_tensors()[0]; } ModelOpRsqrt::ModelOpRsqrt(ModelTensorRef input, ModelTensorRef output) : ModelOpMath("Rsqrt", input, output) {} -Tensor Model::rsqrt(Tensor input, Tensor output, const std::string &name) { - return impl_->create_op(name, input.ref_, output.ref_) +Tensor Model::rsqrt(Tensor input, Tensor output, const std::string &config, + const std::string &name) { + return impl_->create_op(config, name, input.ref_, output.ref_) ->result_tensors()[0]; } 
ModelOpSigmoid::ModelOpSigmoid(ModelTensorRef input, ModelTensorRef output) : ModelOpMath("Sigmoid", input, output) {} -Tensor Model::sigmoid(Tensor input, Tensor output, const std::string &name) { - return impl_->create_op(name, input.ref_, output.ref_) +Tensor Model::sigmoid(Tensor input, Tensor output, const std::string &config, + const std::string &name) { + return impl_ + ->create_op(config, name, input.ref_, output.ref_) ->result_tensors()[0]; } ModelOpSqrt::ModelOpSqrt(ModelTensorRef input, ModelTensorRef output) : ModelOpMath("Sqrt", input, output) {} -Tensor Model::sqrt(Tensor input, Tensor output, const std::string &name) { - return impl_->create_op(name, input.ref_, output.ref_) +Tensor Model::sqrt(Tensor input, Tensor output, const std::string &config, + const std::string &name) { + return impl_->create_op(config, name, input.ref_, output.ref_) ->result_tensors()[0]; } diff --git a/ark/ops/ops_matmul.cpp b/ark/ops/ops_matmul.cpp index a24b95d72..1976699a1 100644 --- a/ark/ops/ops_matmul.cpp +++ b/ark/ops/ops_matmul.cpp @@ -255,10 +255,10 @@ Json ModelOpMatmul::default_config(const ArchRef arch) const { Tensor Model::matmul(Tensor input, Tensor other, Tensor output, bool trans_input, bool trans_other, - const std::string &name) { + const std::string &config, const std::string &name) { return impl_ - ->create_op(name, input.ref(), other.ref(), output.ref(), - trans_input, trans_other) + ->create_op(config, name, input.ref(), other.ref(), + output.ref(), trans_input, trans_other) ->result_tensors()[0]; } diff --git a/ark/ops/ops_noop.cpp b/ark/ops/ops_noop.cpp index 894ab29be..42fe5fdf5 100644 --- a/ark/ops/ops_noop.cpp +++ b/ark/ops/ops_noop.cpp @@ -30,7 +30,7 @@ Json ModelOpNoop::default_config([[maybe_unused]] const ArchRef arch) const { } void Model::noop(Tensor input, const std::string &name) { - impl_->create_op(name, input.ref_); + impl_->create_op("", name, input.ref_); } } // namespace ark diff --git a/ark/ops/ops_reduce.cpp 
b/ark/ops/ops_reduce.cpp index 1c91a2f0b..dadd049d2 100644 --- a/ark/ops/ops_reduce.cpp +++ b/ark/ops/ops_reduce.cpp @@ -128,25 +128,25 @@ Json ModelOpReduce::default_config([[maybe_unused]] const ArchRef arch) const { } Tensor Model::reduce_max(Tensor input, int axis, bool keepdims, Tensor output, - const std::string &name) { + const std::string &config, const std::string &name) { return impl_ - ->create_op(name, input.ref_, axis, keepdims, + ->create_op(config, name, input.ref_, axis, keepdims, output.ref_) ->result_tensors()[0]; } Tensor Model::reduce_mean(Tensor input, int axis, bool keepdims, Tensor output, - const std::string &name) { + const std::string &config, const std::string &name) { return impl_ - ->create_op(name, input.ref_, axis, keepdims, + ->create_op(config, name, input.ref_, axis, keepdims, output.ref_) ->result_tensors()[0]; } Tensor Model::reduce_sum(Tensor input, int axis, bool keepdims, Tensor output, - const std::string &name) { + const std::string &config, const std::string &name) { return impl_ - ->create_op(name, input.ref_, axis, keepdims, + ->create_op(config, name, input.ref_, axis, keepdims, output.ref_) ->result_tensors()[0]; } diff --git a/ark/ops/ops_refer.cpp b/ark/ops/ops_refer.cpp index 782d6708c..68c61b30f 100644 --- a/ark/ops/ops_refer.cpp +++ b/ark/ops/ops_refer.cpp @@ -20,7 +20,7 @@ Tensor Model::refer(Tensor input, const Dims &shape, const Dims &strides, const Dims &offsets, const Dims &padded_shape, const std::string &name) { return impl_ - ->create_op(name, input.ref_, shape, strides, offsets, + ->create_op("", name, input.ref_, shape, strides, offsets, padded_shape) ->result_tensors()[0]; } diff --git a/ark/ops/ops_reshape.cpp b/ark/ops/ops_reshape.cpp index c4e192908..6ecbba466 100644 --- a/ark/ops/ops_reshape.cpp +++ b/ark/ops/ops_reshape.cpp @@ -199,8 +199,8 @@ Tensor Model::reshape(Tensor input, const Dims &shape, bool allowzero, reshape_helper(input.ref_, Dims{inferred_shape}, allowzero, new_shape, new_strides, 
new_offs); return impl_ - ->create_op(name, input.ref_, new_shape, new_strides, - new_offs) + ->create_op("", name, input.ref_, new_shape, + new_strides, new_offs) ->result_tensors()[0]; } diff --git a/ark/ops/ops_rope.cpp b/ark/ops/ops_rope.cpp index 06c1c915e..36015aae5 100644 --- a/ark/ops/ops_rope.cpp +++ b/ark/ops/ops_rope.cpp @@ -12,9 +12,10 @@ ModelOpRope::ModelOpRope(ModelTensorRef input, ModelTensorRef other, : ModelOpBroadcast2("Rope", input, other, output) {} Tensor Model::rope(Tensor input, Tensor other, Tensor output, - const std::string &name) { + const std::string &config, const std::string &name) { return impl_ - ->create_op(name, input.ref_, other.ref_, output.ref_) + ->create_op(config, name, input.ref_, other.ref_, + output.ref_) ->result_tensors()[0]; } diff --git a/ark/ops/ops_scalar.cpp b/ark/ops/ops_scalar.cpp index 944a7247c..b5c10f1c3 100644 --- a/ark/ops/ops_scalar.cpp +++ b/ark/ops/ops_scalar.cpp @@ -115,20 +115,21 @@ std::vector ModelOpScalarMul::impl_args([ Tensor Model::constant(float val, const Dims &shape, DataType data_type, const std::string &name) { return impl_ - ->create_op(name, val, shape, data_type.ref(), + ->create_op("", name, val, shape, data_type.ref(), nullptr) ->result_tensors()[0]; } -Tensor Model::copy(float val, Tensor output, const std::string &name) { +Tensor Model::copy(float val, Tensor output, const std::string &config, + const std::string &name) { if (output == NullTensor) { return impl_ - ->create_op(name, val, Dims{1}, FP32.ref(), - output.ref()) + ->create_op(config, name, val, Dims{1}, + FP32.ref(), output.ref()) ->result_tensors()[0]; } else { return impl_ - ->create_op(name, val, output.shape(), + ->create_op(config, name, val, output.shape(), output.data_type().ref(), output.ref()) ->result_tensors()[0]; @@ -136,30 +137,34 @@ Tensor Model::copy(float val, Tensor output, const std::string &name) { } Tensor Model::add(Tensor input, float value, Tensor output, - const std::string &name) { + const 
std::string &config, const std::string &name) { return impl_ - ->create_op(name, input.ref_, value, output.ref_) + ->create_op(config, name, input.ref_, value, + output.ref_) ->result_tensors()[0]; } Tensor Model::sub(Tensor input, float value, Tensor output, - const std::string &name) { + const std::string &config, const std::string &name) { return impl_ - ->create_op(name, input.ref_, -value, output.ref_) + ->create_op(config, name, input.ref_, -value, + output.ref_) ->result_tensors()[0]; } Tensor Model::mul(Tensor input, float value, Tensor output, - const std::string &name) { + const std::string &config, const std::string &name) { return impl_ - ->create_op(name, input.ref_, value, output.ref_) + ->create_op(config, name, input.ref_, value, + output.ref_) ->result_tensors()[0]; } Tensor Model::div(Tensor input, float value, Tensor output, - const std::string &name) { + const std::string &config, const std::string &name) { return impl_ - ->create_op(name, input.ref_, 1 / value, output.ref_) + ->create_op(config, name, input.ref_, 1 / value, + output.ref_) ->result_tensors()[0]; } diff --git a/ark/ops/ops_tensor.cpp b/ark/ops/ops_tensor.cpp index 0279ab311..77091fa57 100644 --- a/ark/ops/ops_tensor.cpp +++ b/ark/ops/ops_tensor.cpp @@ -27,7 +27,7 @@ Tensor Model::tensor(const Dims &shape, const DataType &data_type, const Dims &strides, const Dims &offsets, const Dims &padded_shape, const std::string &name) { return impl_ - ->create_op(name, nullptr, shape, data_type.ref(), + ->create_op("", name, nullptr, shape, data_type.ref(), strides, offsets, padded_shape) ->result_tensors()[0]; } diff --git a/ark/ops/ops_transpose.cpp b/ark/ops/ops_transpose.cpp index 3f0ed0131..f099c7fb7 100644 --- a/ark/ops/ops_transpose.cpp +++ b/ark/ops/ops_transpose.cpp @@ -124,9 +124,10 @@ Json ModelOpTranspose::default_config([ } Tensor Model::transpose(Tensor input, const std::vector &permutation, - Tensor output, const std::string &name) { + Tensor output, const std::string &config, 
+ const std::string &name) { return impl_ - ->create_op(name, input.ref_, permutation, + ->create_op(config, name, input.ref_, permutation, output.ref_) ->result_tensors()[0]; } diff --git a/arkprof.py b/arkprof.py index 9e67c2dfc..5fb62e118 100644 --- a/arkprof.py +++ b/arkprof.py @@ -1,6 +1,7 @@ import ark import sys +ark.init() ark.Profiler(ark.Plan.from_file(sys.argv[1])).run( iter=1000, profile_processor_groups=False ) diff --git a/examples/tutorial/context_tutorial.py b/examples/tutorial/context_tutorial.py new file mode 100644 index 000000000..fb01f0a0c --- /dev/null +++ b/examples/tutorial/context_tutorial.py @@ -0,0 +1,117 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import ark +import time +import torch +import torch.nn.functional as F + + +class VanillaSoftmax(ark.Module): + def __init__(self): + super(VanillaSoftmax, self).__init__() + + def forward(self, input): + max = ark.reduce_max(input, axis=-1) + output = ark.sub(input, max) + output = ark.exp(output) + sum = ark.reduce_sum(output, axis=-1) + output = ark.div(output, sum) + return output + + +class Softmax(ark.Module): + def __init__(self): + super(Softmax, self).__init__() + + def forward(self, input): + with ark.ContextManager( + processor_range=[0, 304], + warp_range=[0, 8], + sram_range=[0, 0], + task_id=0, + ): + max = ark.reduce_max( + input, + axis=-1, + config={ + "NumWarps": 1, + "ImplType": "WarpWise", + "SramBytes": 0, + "NumTasks": 65536, + }, + ) + output = ark.sub( + input, + max, + config={ + "NumWarps": 1, + "SramBytes": 0, + "Tile": [1, 2048], +
"NumTasks": 65536, + }, + ) + return output + + +def eval(tensor: ark.Tensor): + with ark.Runtime() as rt: + rt.launch() + rt.run() + return tensor.to_torch() + + +def perf(): + with ark.Runtime() as rt: + rt.launch() + + start = time.time() + rt.run(iter=1000) + end = time.time() + return (end - start) / 1000 + + +if __name__ == "__main__": + ark.init() + + shape = (32, 2048, 2048) + + input = torch.randn(*shape).to("cuda:0") + + output = Softmax()(ark.Tensor.from_torch(input)) + + if torch.allclose(eval(output), F.softmax(input, dim=-1), atol=1e-5): + print("Correct result") + else: + print("Incorrect result") + + print(f"Performance: {(perf() * 1e3):.3f} ms/iter") diff --git a/python/ark/__init__.py b/python/ark/__init__.py index e96972906..00370e683 100644 --- a/python/ark/__init__.py +++ b/python/ark/__init__.py @@ -1,7 +1,6 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -import sys import os if os.environ.get("ARK_ROOT", None) is None: @@ -102,3 +101,4 @@ def set_world_size(world_size): ) from .planner import DefaultPlanner, Plan from .profiler import Profiler +from .context_manager import ContextManager diff --git a/python/ark/context_manager.py b/python/ark/context_manager.py new file mode 100644 index 000000000..443e1ca5d --- /dev/null +++ b/python/ark/context_manager.py @@ -0,0 +1,24 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import json +from .model import Model +from ._ark_core import _ContextManager + + +class ContextManager(_ContextManager): + def __init__(self, **kwargs): + context_map = {key: json.dumps(value) for key, value in kwargs.items()} + super().__init__(Model.get_model(), context_map) + + def __enter__(self) -> "ContextManager": + """ + Enter the context manager. + """ + return self + + def __exit__(self, exc_type, exc_value, exc_tb): + """ + Exit the context manager. 
+ """ + del self diff --git a/python/ark/ops.py b/python/ark/ops.py index 86b021aef..509e3c891 100644 --- a/python/ark/ops.py +++ b/python/ark/ops.py @@ -1,7 +1,8 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -from typing import List, Iterable, Union +import json +from typing import Any, Dict, List, Iterable, Union from .tensor import Dims, Tensor, Parameter, NullTensor from .data_type import DataType, fp32 @@ -12,6 +13,12 @@ def _is_list_or_tuple(obj): return isinstance(obj, list) or isinstance(obj, tuple) +def _config_to_str(config: Union[str, Dict[str, Any]]) -> str: + if isinstance(config, str): + return config + return json.dumps(config) + + def _tensor( shape: Iterable[int], dtype: DataType = fp32, @@ -50,6 +57,7 @@ def add( input: Union[Tensor, float], other: Union[Tensor, float], output: Tensor = NullTensor, + config: Union[str, Dict[str, Any]] = "", name: str = "add", ) -> Union[Tensor, float]: """ @@ -73,12 +81,15 @@ def add( return input + other else: return Tensor( - Model.get_model().copy(input + other, output._tensor, name) + Model.get_model().copy( + input + other, output._tensor, _config_to_str(config), name + ) ) if output is not NullTensor: output = output._tensor return Tensor( - Model.get_model().add(a, b, output, name), runtime_id=input.runtime_id + Model.get_model().add(a, b, output, _config_to_str(config), name), + runtime_id=input.runtime_id, ) @@ -86,13 +97,16 @@ def cast( input: Tensor, dtype: DataType, output: Tensor = NullTensor, + config: Union[str, Dict[str, Any]] = "", name: str = "cast", ) -> Tensor: """Type casting.""" if output is not NullTensor: output = output._tensor return Tensor( - Model.get_model().cast(input._tensor, dtype.ctype(), output, name), + Model.get_model().cast( + input._tensor, dtype.ctype(), output, _config_to_str(config), name + ), runtime_id=input.runtime_id, ) @@ -112,7 +126,10 @@ def constant( def copy( - input: Union[Tensor, float], output: Tensor = NullTensor, name: str = 
"copy" + input: Union[Tensor, float], + output: Tensor = NullTensor, + config: Union[str, Dict[str, Any]] = "", + name: str = "copy", ) -> Tensor: """Data caopy.""" if output is not NullTensor: @@ -120,7 +137,7 @@ def copy( if isinstance(input, Tensor): intput = intput._tensor return Tensor( - Model.get_model().copy(intput, output, name), + Model.get_model().copy(intput, output, _config_to_str(config), name), runtime_id=input.runtime_id, ) @@ -129,6 +146,7 @@ def div( input: Tensor, other: Union[Tensor, float], output: Tensor = NullTensor, + config: Union[str, Dict[str, Any]] = "", name: str = "div", ) -> Tensor: """ @@ -144,7 +162,9 @@ def div( raise ValueError("Tensors must be on the same runtime") other = other._tensor return Tensor( - Model.get_model().div(input._tensor, other, output, name), + Model.get_model().div( + input._tensor, other, output, _config_to_str(config), name + ), runtime_id=input.runtime_id, ) @@ -153,6 +173,7 @@ def embedding( input: Tensor, weight: Tensor, output: Tensor = NullTensor, + config: Union[str, Dict[str, Any]] = "", name: str = "embedding", ) -> Tensor: """Embedding layer.""" @@ -162,14 +183,17 @@ def embedding( output = output._tensor return Tensor( Model.get_model().embedding( - input._tensor, weight._tensor, output, name + input._tensor, weight._tensor, output, _config_to_str(config), name ), runtime_id=input.runtime_id, ) def exp( - input: Tensor, output: Tensor = NullTensor, name: str = "exp" + input: Tensor, + output: Tensor = NullTensor, + config: Union[str, Dict[str, Any]] = "", + name: str = "exp", ) -> Tensor: """ Calculates the exponential of the `input` tensor, element-wise. 
@@ -179,13 +203,18 @@ def exp( if output is not NullTensor: output = output._tensor return Tensor( - Model.get_model().exp(input._tensor, output, name), + Model.get_model().exp( + input._tensor, output, _config_to_str(config), name + ), runtime_id=input.runtime_id, ) def gelu( - input: Tensor, output: Tensor = NullTensor, name: str = "gelu" + input: Tensor, + output: Tensor = NullTensor, + config: Union[str, Dict[str, Any]] = "", + name: str = "gelu", ) -> Tensor: """ Applies the Gaussian Error Linear Unit (GELU) activation @@ -198,7 +227,9 @@ def gelu( if output is not NullTensor: output = output._tensor return Tensor( - Model.get_model().gelu(input._tensor, output, name), + Model.get_model().gelu( + input._tensor, output, _config_to_str(config), name + ), runtime_id=input.runtime_id, ) @@ -230,6 +261,7 @@ def matmul( output: Tensor = NullTensor, transpose_input: bool = False, transpose_other: bool = False, + config: Union[str, Dict[str, Any]] = "", name: str = "matmul", ) -> Tensor: """ @@ -252,6 +284,7 @@ def matmul( output, transpose_input, transpose_other, + _config_to_str(config), name, ), runtime_id=input.runtime_id, @@ -262,6 +295,7 @@ def mul( input: Tensor, other: Union[Tensor, float], output: Tensor = NullTensor, + config: Union[str, Dict[str, Any]] = "", name: str = "mul", ) -> Tensor: """ @@ -277,7 +311,9 @@ def mul( raise ValueError("Tensors must be on the same runtime") other = other._tensor return Tensor( - Model.get_model().mul(input._tensor, other, output, name), + Model.get_model().mul( + input._tensor, other, output, _config_to_str(config), name + ), runtime_id=input.runtime_id, ) @@ -294,6 +330,7 @@ def reduce_max( axis: int, keepdims: bool = True, output: Tensor = NullTensor, + config: Union[str, Dict[str, Any]] = "", name: str = "reduce_max", ) -> Tensor: """ @@ -306,7 +343,7 @@ def reduce_max( output = output._tensor return Tensor( Model.get_model().reduce_max( - input._tensor, axis, keepdims, output, name + input._tensor, axis, keepdims, 
output, _config_to_str(config), name ), runtime_id=input.runtime_id, ) @@ -317,6 +354,7 @@ def reduce_mean( axis: int, keepdims: bool = True, output: Tensor = NullTensor, + config: Union[str, Dict[str, Any]] = "", name: str = "reduce_mean", ) -> Tensor: """ @@ -329,7 +367,7 @@ def reduce_mean( output = output._tensor return Tensor( Model.get_model().reduce_mean( - input._tensor, axis, keepdims, output, name + input._tensor, axis, keepdims, output, _config_to_str(config), name ), runtime_id=input.runtime_id, ) @@ -340,6 +378,7 @@ def reduce_sum( axis: int, keepdims: bool = True, output: Tensor = NullTensor, + config: Union[str, Dict[str, Any]] = "", name: str = "reduce_sum", ) -> Tensor: """ @@ -354,14 +393,17 @@ def reduce_sum( output = output._tensor return Tensor( Model.get_model().reduce_sum( - input._tensor, axis, keepdims, output, name + input._tensor, axis, keepdims, output, _config_to_str(config), name ), runtime_id=input.runtime_id, ) def relu( - input: Tensor, output: Tensor = NullTensor, name: str = "relu" + input: Tensor, + output: Tensor = NullTensor, + config: Union[str, Dict[str, Any]] = "", + name: str = "relu", ) -> Tensor: """ Applies the ReLU activation function to the `input` tensor, @@ -372,7 +414,9 @@ def relu( if output is not NullTensor: output = output._tensor return Tensor( - Model.get_model().relu(input._tensor, output, name), + Model.get_model().relu( + input._tensor, output, _config_to_str(config), name + ), runtime_id=input.runtime_id, ) @@ -411,6 +455,7 @@ def rope( input: Tensor, other: Tensor, output: Tensor = NullTensor, + config: Union[str, Dict[str, Any]] = "", name: str = "rope", ) -> Tensor: """ @@ -423,13 +468,18 @@ def rope( if input.runtime_id != other.runtime_id: raise ValueError("Tensors must be on the same runtime") return Tensor( - Model.get_model().rope(input._tensor, other._tensor, output, name), + Model.get_model().rope( + input._tensor, other._tensor, output, _config_to_str(config), name + ), 
runtime_id=input.runtime_id, ) def rsqrt( - input: Tensor, output: Tensor = NullTensor, name: str = "rsqrt" + input: Tensor, + output: Tensor = NullTensor, + config: Union[str, Dict[str, Any]] = "", + name: str = "rsqrt", ) -> Tensor: """ Calculates the square root of the `input` tensor, element-wise. @@ -439,7 +489,9 @@ def rsqrt( if output is not NullTensor: output = output._tensor return Tensor( - Model.get_model().rsqrt(input._tensor, output, name), + Model.get_model().rsqrt( + input._tensor, output, _config_to_str(config), name + ), runtime_id=input.runtime_id, ) @@ -465,7 +517,10 @@ def sharding( def sigmoid( - input: Tensor, output: Tensor = NullTensor, name: str = "sigmoid" + input: Tensor, + output: Tensor = NullTensor, + config: Union[str, Dict[str, Any]] = "", + name: str = "sigmoid", ) -> Tensor: """ Applies the Sigmoid activation function to the `input` tensor, @@ -476,13 +531,18 @@ def sigmoid( if output is not NullTensor: output = output._tensor return Tensor( - Model.get_model().sigmoid(input._tensor, output, name), + Model.get_model().sigmoid( + input._tensor, output, _config_to_str(config), name + ), runtime_id=input.runtime_id, ) def sqrt( - input: Tensor, output: Tensor = NullTensor, name: str = "sqrt" + input: Tensor, + output: Tensor = NullTensor, + config: Union[str, Dict[str, Any]] = "", + name: str = "sqrt", ) -> Tensor: """ Calculates the square root of the `input` tensor, element-wise. 
@@ -492,7 +552,9 @@ def sqrt( if output is not NullTensor: output = output._tensor return Tensor( - Model.get_model().sqrt(input._tensor, output, name), + Model.get_model().sqrt( + input._tensor, output, _config_to_str(config), name + ), runtime_id=input.runtime_id, ) @@ -501,6 +563,7 @@ def sub( input: Tensor, other: Union[Tensor, float], output: Tensor = NullTensor, + config: Union[str, Dict[str, Any]] = "", name: str = "sub", ) -> Tensor: """ @@ -516,7 +579,9 @@ def sub( raise ValueError("Tensors must be on the same runtime") other = other._tensor return Tensor( - Model.get_model().sub(input._tensor, other, output, name), + Model.get_model().sub( + input._tensor, other, output, _config_to_str(config), name + ), runtime_id=input.runtime_id, ) @@ -546,6 +611,7 @@ def transpose( input: Tensor, perm: Iterable[int], output: Tensor = NullTensor, + config: Union[str, Dict[str, Any]] = "", name: str = "transpose", ) -> Tensor: """ @@ -565,7 +631,9 @@ def transpose( if len(perm) > 4: raise ValueError("Only support perm up to 4 dimensions") return Tensor( - Model.get_model().transpose(input._tensor, perm, output, name), + Model.get_model().transpose( + input._tensor, perm, output, _config_to_str(config), name + ), runtime_id=input.runtime_id, ) @@ -578,10 +646,11 @@ def mean( axis: int, keepdims: bool = True, output: Tensor = NullTensor, + config: Union[str, Dict[str, Any]] = "", name: str = "mean", ) -> Tensor: """Alias of reduce_mean.""" - return reduce_mean(input, axis, keepdims, output, name) + return reduce_mean(input, axis, keepdims, output, config, name) def ones( diff --git a/python/ark/profiler.py b/python/ark/profiler.py index c161b24e6..e47f5b7aa 100644 --- a/python/ark/profiler.py +++ b/python/ark/profiler.py @@ -21,8 +21,15 @@ class Profiler: def __init__(self, plan: Plan): self.plan = plan - def run(self, iter: int = 1000, loop_mode: bool = True, profile_processor_groups: bool = False): - sys.stderr.write(f"End-to-end: {timeit(self.plan, iter, 
loop_mode):.6f} seconds/iter\n") + def run( + self, + iter: int = 1000, + loop_mode: bool = True, + profile_processor_groups: bool = False, + ): + sys.stderr.write( + f"End-to-end: {timeit(self.plan, iter, loop_mode):.6f} seconds/iter\n" + ) if not profile_processor_groups: return diff --git a/python/ark_py.cpp b/python/ark_py.cpp index 1bc4255d6..7acd4ad1a 100644 --- a/python/ark_py.cpp +++ b/python/ark_py.cpp @@ -7,6 +7,7 @@ namespace py = pybind11; +extern void register_context_manager(py::module &m); extern void register_data_type(py::module &m); extern void register_dims(py::module &m); extern void register_error(py::module &m); @@ -22,6 +23,7 @@ extern void register_version(py::module &m); PYBIND11_MODULE(_ark_core, m) { m.doc() = "Bind ARK C++ APIs to Python"; + register_context_manager(m); register_data_type(m); register_dims(m); register_error(m); diff --git a/python/context_manager_py.cpp b/python/context_manager_py.cpp new file mode 100644 index 000000000..3d703a4bc --- /dev/null +++ b/python/context_manager_py.cpp @@ -0,0 +1,15 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+ +#include +#include +#include + +#include + +namespace py = pybind11; + +void register_context_manager(py::module &m) { + py::class_(m, "_ContextManager") + .def(py::init&>()); +} diff --git a/python/model_py.cpp b/python/model_py.cpp index 2d1e5f634..ba17251d8 100644 --- a/python/model_py.cpp +++ b/python/model_py.cpp @@ -15,97 +15,109 @@ void register_model(py::module &m) { .def(py::init(), py::arg("rank"), py::arg("world_size")) .def("rank", &ark::Model::rank) .def("world_size", &ark::Model::world_size) - .def("compress", &ark::Model::compress) + .def("compress", &ark::Model::compress, py::arg("merge_nodes") = false) .def("add", py::overload_cast(&ark::Model::add), + const std::string &, const std::string &>( + &ark::Model::add), py::arg("input"), py::arg("other"), py::arg("output"), - py::arg("name")) + py::arg("config"), py::arg("name")) .def("add", py::overload_cast(&ark::Model::add), + const std::string &, const std::string &>( + &ark::Model::add), py::arg("input"), py::arg("other"), py::arg("output"), - py::arg("name")) + py::arg("config"), py::arg("name")) .def("cast", &ark::Model::cast, py::arg("input"), py::arg("data_type"), - py::arg("output"), py::arg("name")) + py::arg("output"), py::arg("config"), py::arg("name")) .def("constant", &ark::Model::constant, py::arg("value"), py::arg("shape"), py::arg("data_type"), py::arg("name")) .def("copy", - py::overload_cast( - &ark::Model::copy), - py::arg("input"), py::arg("output"), py::arg("name")) + py::overload_cast(&ark::Model::copy), + py::arg("input"), py::arg("output"), py::arg("config"), + py::arg("name")) .def("copy", - py::overload_cast( - &ark::Model::copy), - py::arg("input"), py::arg("output"), py::arg("name")) + py::overload_cast(&ark::Model::copy), + py::arg("input"), py::arg("output"), py::arg("config"), + py::arg("name")) .def("div", py::overload_cast(&ark::Model::div), + const std::string &, const std::string &>( + &ark::Model::div), py::arg("input"), py::arg("other"), py::arg("output"), - 
py::arg("name")) + py::arg("config"), py::arg("name")) .def("div", py::overload_cast(&ark::Model::div), + const std::string &, const std::string &>( + &ark::Model::div), py::arg("input"), py::arg("other"), py::arg("output"), - py::arg("name")) + py::arg("config"), py::arg("name")) .def("embedding", &ark::Model::embedding, py::arg("input"), - py::arg("weight"), py::arg("output"), py::arg("name")) - .def("exp", &ark::Model::exp, py::arg("input"), py::arg("output"), + py::arg("weight"), py::arg("output"), py::arg("config"), py::arg("name")) + .def("exp", &ark::Model::exp, py::arg("input"), py::arg("output"), + py::arg("config"), py::arg("name")) .def("gelu", &ark::Model::gelu, py::arg("input"), py::arg("output"), - py::arg("name")) + py::arg("config"), py::arg("name")) .def("identity", &ark::Model::identity, py::arg("input"), py::arg("deps"), py::arg("name")) .def("matmul", &ark::Model::matmul, py::arg("input"), py::arg("other"), py::arg("output"), py::arg("trans_input"), py::arg("trans_other"), - py::arg("name")) + py::arg("config"), py::arg("name")) .def("mul", py::overload_cast(&ark::Model::mul), + const std::string &, const std::string &>( + &ark::Model::mul), py::arg("input"), py::arg("other"), py::arg("output"), - py::arg("name")) + py::arg("config"), py::arg("name")) .def("mul", py::overload_cast(&ark::Model::mul), + const std::string &, const std::string &>( + &ark::Model::mul), py::arg("input"), py::arg("other"), py::arg("output"), - py::arg("name")) + py::arg("config"), py::arg("name")) .def("noop", &ark::Model::noop, py::arg("input"), py::arg("name")) .def("reduce_max", &ark::Model::reduce_max, py::arg("input"), py::arg("axis"), py::arg("keepdims"), py::arg("output"), - py::arg("name")) + py::arg("config"), py::arg("name")) .def("reduce_mean", &ark::Model::reduce_mean, py::arg("input"), py::arg("axis"), py::arg("keepdims"), py::arg("output"), - py::arg("name")) + py::arg("config"), py::arg("name")) .def("reduce_sum", &ark::Model::reduce_sum, 
py::arg("input"), py::arg("axis"), py::arg("keepdims"), py::arg("output"), - py::arg("name")) + py::arg("config"), py::arg("name")) .def("relu", &ark::Model::relu, py::arg("input"), py::arg("output"), - py::arg("name")) + py::arg("config"), py::arg("name")) .def("reshape", &ark::Model::reshape, py::arg("input"), py::arg("shape"), py::arg("allowzero"), py::arg("name")) .def("rope", &ark::Model::rope, py::arg("input"), py::arg("other"), - py::arg("output"), py::arg("name")) + py::arg("output"), py::arg("config"), py::arg("name")) .def("rsqrt", &ark::Model::rsqrt, py::arg("input"), py::arg("output"), - py::arg("name")) + py::arg("config"), py::arg("name")) .def("sharding", &ark::Model::sharding, py::arg("input"), py::arg("axis"), py::arg("dim_per_shard"), py::arg("name")) .def("sigmoid", &ark::Model::sigmoid, py::arg("input"), - py::arg("output"), py::arg("name")) + py::arg("output"), py::arg("config"), py::arg("name")) .def("sqrt", &ark::Model::sqrt, py::arg("input"), py::arg("output"), - py::arg("name")) + py::arg("config"), py::arg("name")) .def("sub", py::overload_cast(&ark::Model::sub), + const std::string &, const std::string &>( + &ark::Model::sub), py::arg("input"), py::arg("other"), py::arg("output"), - py::arg("name")) + py::arg("config"), py::arg("name")) .def("sub", py::overload_cast(&ark::Model::sub), + const std::string &, const std::string &>( + &ark::Model::sub), py::arg("input"), py::arg("other"), py::arg("output"), - py::arg("name")) + py::arg("config"), py::arg("name")) .def("tensor", &ark::Model::tensor, py::arg("shape"), py::arg("data_type"), py::arg("strides"), py::arg("offsets"), py::arg("padded_shape"), py::arg("name")) .def("transpose", &ark::Model::transpose, py::arg("input"), - py::arg("permutation"), py::arg("output"), py::arg("name")); + py::arg("permutation"), py::arg("output"), py::arg("config"), + py::arg("name")); } From ef3bb84e8ebb3bb86e256767802401e39d617a85 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Mon, 29 Jul 2024 
20:31:14 +0000 Subject: [PATCH 41/61] plan manager --- ark/api/context_manager_test.cpp | 1 - ark/api/model.cpp | 9 ++ ark/api/plan_manager.cpp | 97 ++++++++++++++++ ark/api/plan_manager_test.cpp | 58 ++++++++++ ark/api/planner.cpp | 125 +++++++++++++++------ ark/include/ark/model.hpp | 9 +- ark/include/ark/model_graph.hpp | 1 + ark/include/ark/plan_manager.hpp | 25 +++++ ark/model/model_graph_impl.cpp | 16 ++- ark/model/model_graph_impl.hpp | 6 +- examples/tutorial/context_tutorial.py | 117 ------------------- examples/tutorial/plan_manager_tutorial.py | 82 ++++++++++++++ python/ark/__init__.py | 2 +- python/ark/context_manager.py | 24 ---- python/ark/plan_manager.py | 34 ++++++ python/ark_py.cpp | 4 +- python/context_manager_py.cpp | 15 --- python/plan_manager_py.cpp | 15 +++ 18 files changed, 440 insertions(+), 200 deletions(-) create mode 100644 ark/api/plan_manager.cpp create mode 100644 ark/api/plan_manager_test.cpp create mode 100644 ark/include/ark/plan_manager.hpp delete mode 100644 examples/tutorial/context_tutorial.py create mode 100644 examples/tutorial/plan_manager_tutorial.py delete mode 100644 python/ark/context_manager.py create mode 100644 python/ark/plan_manager.py delete mode 100644 python/context_manager_py.cpp create mode 100644 python/plan_manager_py.cpp diff --git a/ark/api/context_manager_test.cpp b/ark/api/context_manager_test.cpp index ff60b43bf..5fff94f34 100644 --- a/ark/api/context_manager_test.cpp +++ b/ark/api/context_manager_test.cpp @@ -1,7 +1,6 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. 
-#include "ark/model.hpp" #include "ark/context_manager.hpp" #include "model/model_node.hpp" diff --git a/ark/api/model.cpp b/ark/api/model.cpp index a5a258f71..e9604c341 100644 --- a/ark/api/model.cpp +++ b/ark/api/model.cpp @@ -9,6 +9,15 @@ namespace ark { +Model::Model(int rank, int world_size) : ModelGraph(rank, world_size) { + static size_t next_id = 0; + id_ = next_id++; +} + +Model::Model(const Model &other) : ModelGraph(other), id_(other.id()) {} + +size_t Model::id() const { return id_; } + Model Model::compress(bool merge_nodes) const { Model model(*this); model.compress_nodes(merge_nodes); diff --git a/ark/api/plan_manager.cpp b/ark/api/plan_manager.cpp new file mode 100644 index 000000000..aee8d4f7b --- /dev/null +++ b/ark/api/plan_manager.cpp @@ -0,0 +1,97 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#include "ark/plan_manager.hpp" + +#include "logging.h" +#include "model/model_json.hpp" +#include "model/model_graph_impl.hpp" + +namespace ark { + +class PlanManagerState { + public: + PlanManagerState() : sync(true) {} + bool sync; +}; + +static std::map gPlanManagerStates; + +PlanManager::PlanManager(Model& model, const std::string& plan_context) : model_id_(model.id()), stop_sync_(false) { + auto ctx = Json::parse(plan_context); + if (!ctx.is_object()) { + ERR(ModelError, "plan context must be a JSON object"); + } + if (gPlanManagerStates.find(model_id_) == gPlanManagerStates.end()) { + gPlanManagerStates.emplace(model_id_, PlanManagerState()); + } + auto& state = gPlanManagerStates[model_id_]; + bool async = !state.sync; + std::map context_map; + for (const auto& [key, value] : ctx.items()) { + if (key == "sync") { + if (!value.is_boolean()) { + ERR(ModelError, "sync must be a boolean"); + } + if (state.sync && !value.get()) { + stop_sync_ = true; + state.sync = false; + context_map["AppendTask"] = "true"; + } else if (!state.sync) { + context_map["AppendTask"] = "true"; + } + } else if (key == "processor_range") 
{ + if (!value.is_array()) { + ERR(ModelError, "processor_range must be an array"); + } + if (async) { + LOG(WARN, "Ignoring processor_range under sync=false context"); + continue; + } + context_map["ProcessorRange"] = value.dump(); + } else if (key == "warp_range") { + if (!value.is_array()) { + ERR(ModelError, "warp_range must be an array"); + } + if (async) { + LOG(WARN, "Ignoring warp_range under sync=false context"); + continue; + } + context_map["WarpRange"] = value.dump(); + } else if (key == "sram_range") { + if (!value.is_array()) { + ERR(ModelError, "sram_range must be an array"); + } + if (async) { + LOG(WARN, "Ignoring sram_range under sync=false context"); + continue; + } + context_map["SramRange"] = value.dump(); + } else if (key == "config") { + if (!value.is_object()) { + ERR(ModelError, "config must be an object"); + } + auto cfg = model.impl_->get_context("Config"); + if (cfg.empty()) { + context_map["Config"] = value.dump(); + } else { + auto cfg_obj = Json::parse(cfg); + for (const auto& [k, v] : value.items()) { + cfg_obj[k] = v; + } + context_map["Config"] = cfg_obj.dump(); + } + } else { + LOG(WARN, "Ignoring unknown plan context key: ", key); + } + } + context_manager_ = std::make_shared(model, context_map); +} + +PlanManager::~PlanManager() { + if (stop_sync_) { + gPlanManagerStates[model_id_].sync = true; + } +} + +} // namespace ark diff --git a/ark/api/plan_manager_test.cpp b/ark/api/plan_manager_test.cpp new file mode 100644 index 000000000..78f5d4cb8 --- /dev/null +++ b/ark/api/plan_manager_test.cpp @@ -0,0 +1,58 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+ +#include "ark/plan_manager.hpp" +#include "ark/planner.hpp" + +#include "model/model_json.hpp" +#include "unittest/unittest_utils.h" + +ark::unittest::State test_plan_manager() { + ark::Model model; + ark::Tensor t0 = model.tensor({1}, ark::FP32); + ark::Tensor t1 = model.tensor({1}, ark::FP32); + ark::Tensor t2 = model.add(t0, t1); + + ark::Tensor t3; + ark::Tensor t4; + ark::Tensor t5; + ark::Tensor t6; + { + ark::PlanManager pm_0(model, ark::Json({ + {"processor_range", {0, 2}}, + {"warp_range", {0, 4}}, + {"sram_range", {0, 0}}, + {"sync", false} + }).dump()); + t3 = model.relu(t2); + t4 = model.sqrt(t3); + } + { + ark::PlanManager pm_0(model, ark::Json({ + {"processor_range", {2, 4}}, + {"warp_range", {0, 4}}, + {"sram_range", {0, 0}} + }).dump()); + t5 = model.exp(t2); + + ark::PlanManager pm_1(model, ark::Json({ + {"processor_range", {2, 3}} + }).dump()); + t6 = model.rsqrt(t5); + } + + UNITTEST_TRUE(model.verify()); + + ark::DefaultPlanner planner(model, 0); + auto plan_str = planner.plan(); + ark::Json plan = ark::Json::parse(plan_str); + + UNITTEST_LOG(plan_str); + + return ark::unittest::SUCCESS; +} + +int main() { + UNITTEST(test_plan_manager); + return 0; +} diff --git a/ark/api/planner.cpp b/ark/api/planner.cpp index dba149a1e..1c40e5301 100644 --- a/ark/api/planner.cpp +++ b/ark/api/planner.cpp @@ -58,19 +58,35 @@ std::string DefaultPlanner::Impl::plan(bool pretty) const { size_t num_sm = gpu_info.num_sm; Json task_infos = Json::array(); Json processor_groups = Json::array(); - size_t max_num_warps = 1; - size_t max_num_processors = 1; - size_t next_node_id = 0; + size_t max_processor_id = 1; + size_t max_warp_id = 1; + size_t next_task_id = 0; + bool prev_append_task = false; + bool first_op = true; + + auto get_context = [&](const ModelNodeRef &node, + const std::string &key) -> Json { + if (node->context.find(key) != node->context.end()) { + return Json::parse(node->context.at(key)); + } + return Json(); + }; + for (const auto &node : 
model_.nodes()) { + std::string context = ""; + for (const auto &[key, value] : node->context) { + context += key + "=" + value + ","; + } + context += "prev_append_task=" + std::to_string(prev_append_task); + LOG(INFO, context); + for (const auto &op : node->ops) { if (op->is_virtual()) continue; - Json task_info; - task_info["Id"] = next_node_id++; - + auto ctx_config = get_context(node, "Config"); Json config; - if (!op->config().empty()) { - config = op->config(); + if (!ctx_config.empty()) { + config = ctx_config; } else if (!config_rules_.empty()) { const std::string op_str = op->serialize().dump(); for (auto &rule : config_rules_) { @@ -90,31 +106,70 @@ std::string DefaultPlanner::Impl::plan(bool pretty) const { size_t num_warps = config["NumWarps"]; size_t num_tasks = config["NumTasks"]; size_t sram_bytes = config["SramBytes"]; - task_info["NumWarps"] = num_warps; - task_info["SramBytes"] = sram_bytes; - - max_num_warps = std::max(max_num_warps, num_warps); - - task_info["Ops"] = Json::array(); - task_info["Ops"].push_back(op->serialize()); - task_info["Ops"][0]["Config"] = config; - task_infos.push_back(task_info); - - Json resource_group; - size_t num_processors = std::min(num_sm, num_tasks); - max_num_processors = std::max(max_num_processors, num_processors); - resource_group["ProcessorRange"] = {0, num_processors}; - resource_group["WarpRange"] = {0, num_warps}; - resource_group["SramRange"] = {0, sram_bytes}; - resource_group["TaskGroups"] = {{{"TaskId", task_info["Id"]}, - {"TaskRange", {0, num_tasks}}, - {"Granularity", 1}}}; - - Json processor_group; - processor_group["ProcessorRange"] = {0, num_processors}; - processor_group["ResourceGroups"] = Json::array(); - processor_group["ResourceGroups"].push_back(resource_group); - processor_groups.push_back(processor_group); + + auto ctx_append_task = get_context(node, "AppendTask"); + if (!ctx_append_task.empty() && ctx_append_task.get() && + prev_append_task) { + auto &task_info = task_infos.back(); + 
task_info["NumWarps"] = + std::max(task_info["NumWarps"].get(), num_warps); + task_info["SramBytes"] = + std::max(task_info["SramBytes"].get(), sram_bytes); + task_info["Ops"].push_back(op->serialize()); + task_info["Ops"].back()["Config"] = config; + } else { + Json task_info; + task_info["Id"] = first_op ? next_task_id : ++next_task_id; + task_info["NumWarps"] = num_warps; + task_info["SramBytes"] = sram_bytes; + task_info["Ops"] = Json::array(); + task_info["Ops"].push_back(op->serialize()); + task_info["Ops"][0]["Config"] = config; + task_infos.push_back(task_info); + + auto ctx_processor_range = get_context(node, "ProcessorRange"); + auto ctx_warp_range = get_context(node, "WarpRange"); + auto ctx_sram_range = get_context(node, "SramRange"); + + Json processor_group; + if (!ctx_processor_range.empty()) { + processor_group["ProcessorRange"] = ctx_processor_range; + max_processor_id = std::max( + max_processor_id, ctx_processor_range[1].get()); + } else { + size_t num_processors = std::min(num_sm, num_tasks); + processor_group["ProcessorRange"] = {0, num_processors}; + max_processor_id = + std::max(max_processor_id, num_processors); + } + + Json resource_group; + resource_group["ProcessorRange"] = + processor_group["ProcessorRange"]; + if (!ctx_warp_range.empty()) { + resource_group["WarpRange"] = ctx_warp_range; + max_warp_id = + std::max(max_warp_id, ctx_warp_range[1].get()); + } else { + resource_group["WarpRange"] = {0, num_warps}; + max_warp_id = std::max(max_warp_id, num_warps); + } + if (!ctx_sram_range.empty()) { + resource_group["SramRange"] = ctx_sram_range; + } else { + resource_group["SramRange"] = {0, sram_bytes}; + } + resource_group["TaskGroups"] = {{{"TaskId", task_info["Id"]}, + {"TaskRange", {0, num_tasks}}, + {"Granularity", 1}}}; + + processor_group["ResourceGroups"] = Json::array(); + processor_group["ResourceGroups"].push_back(resource_group); + processor_groups.push_back(processor_group); + } + prev_append_task = + !ctx_append_task.empty() 
&& ctx_append_task.get(); + first_op = false; } } @@ -122,8 +177,8 @@ std::string DefaultPlanner::Impl::plan(bool pretty) const { plan["Rank"] = model_.rank(); plan["WorldSize"] = model_.world_size(); plan["Architecture"] = gpu_info.arch->name(); - plan["NumProcessors"] = max_num_processors; - plan["NumWarpsPerProcessor"] = max_num_warps; + plan["NumProcessors"] = max_processor_id; + plan["NumWarpsPerProcessor"] = max_warp_id; plan["TaskInfos"] = task_infos; plan["ProcessorGroups"] = processor_groups; diff --git a/ark/include/ark/model.hpp b/ark/include/ark/model.hpp index 35efe53d5..e0b17be52 100644 --- a/ark/include/ark/model.hpp +++ b/ark/include/ark/model.hpp @@ -17,15 +17,20 @@ namespace ark { class Model : public ModelGraph { private: + size_t id_; std::set tags_; public: - Model(int rank = 0, int world_size = 1) : ModelGraph(rank, world_size) {} - Model(const Model &other) : ModelGraph(other) {} + Model(int rank = 0, int world_size = 1); + + Model(const Model &other); + ~Model() {} Model &operator=(const Model &other) = default; + size_t id() const; + Model compress(bool merge_nodes = false) const; int unique_tag(); diff --git a/ark/include/ark/model_graph.hpp b/ark/include/ark/model_graph.hpp index f6390a2a9..c53c98c3a 100644 --- a/ark/include/ark/model_graph.hpp +++ b/ark/include/ark/model_graph.hpp @@ -38,6 +38,7 @@ class ModelGraph { protected: friend class Model; + friend class PlanManager; friend class ContextManager; class Impl; diff --git a/ark/include/ark/plan_manager.hpp b/ark/include/ark/plan_manager.hpp new file mode 100644 index 000000000..3952a1c06 --- /dev/null +++ b/ark/include/ark/plan_manager.hpp @@ -0,0 +1,25 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+ +#ifndef ARK_PLAN_MANAGER_HPP +#define ARK_PLAN_MANAGER_HPP + +#include + +namespace ark { + +class PlanManager { + public: + PlanManager(Model& model, const std::string& plan_context); + + ~PlanManager(); + + private: + size_t model_id_; + bool stop_sync_; + std::shared_ptr context_manager_; +}; + +} // namespace ark + +#endif // ARK_PLAN_MANAGER_HPP diff --git a/ark/model/model_graph_impl.cpp b/ark/model/model_graph_impl.cpp index 53a7fa851..385424e57 100644 --- a/ark/model/model_graph_impl.cpp +++ b/ark/model/model_graph_impl.cpp @@ -37,7 +37,15 @@ void ModelGraphContextStack::pop(const std::string &key) { it->second.pop_back(); } -std::map ModelGraphContextStack::current_context() const { +std::string ModelGraphContextStack::get_context(const std::string &key) const { + if (this->storage_.find(key) == this->storage_.end() || + this->storage_.at(key).empty()) { + return ""; + } + return *this->storage_.at(key).back(); +} + +std::map ModelGraphContextStack::get_context_all() const { std::map cur; for (const auto &pair : this->storage_) { if (!pair.second.empty()) { @@ -167,6 +175,10 @@ bool ModelGraph::Impl::verify() const { return true; } +std::string ModelGraph::Impl::get_context(const std::string &key) const { + return context_stack_->get_context(key); +} + ModelNodeRef ModelGraph::Impl::add_op(ModelOpRef op) { for (auto &tns : op->input_tensors()) { if (tensor_to_producer_op_.find(tns) == tensor_to_producer_op_.end()) { @@ -205,7 +217,7 @@ ModelNodeRef ModelGraph::Impl::add_op(ModelOpRef op) { producer->consumers.push_back(node); } - node->context = context_stack_->current_context(); + node->context = context_stack_->get_context_all(); nodes_.push_back(node); return node; diff --git a/ark/model/model_graph_impl.hpp b/ark/model/model_graph_impl.hpp index fbfc54c7e..ec255423e 100644 --- a/ark/model/model_graph_impl.hpp +++ b/ark/model/model_graph_impl.hpp @@ -34,7 +34,9 @@ class ModelGraphContextStack { void pop(const std::string &key); - std::map 
current_context() const; + std::string get_context(const std::string &key) const; + + std::map get_context_all() const; }; class ModelGraph::Impl { @@ -80,6 +82,8 @@ class ModelGraph::Impl { bool verify() const; + std::string get_context(const std::string &key) const; + std::string serialize(bool pretty = true) const; std::vector nodes() const; diff --git a/examples/tutorial/context_tutorial.py b/examples/tutorial/context_tutorial.py deleted file mode 100644 index fb01f0a0c..000000000 --- a/examples/tutorial/context_tutorial.py +++ /dev/null @@ -1,117 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import ark -import time -import torch -import torch.nn.functional as F - - -class VanillaSoftmax(ark.Module): - def __init__(self): - super(Softmax, self).__init__() - - def forward(self, input): - max = ark.reduce_max(input, axis=-1) - output = ark.sub(input, max) - output = ark.exp(output) - sum = ark.reduce_sum(output, axis=-1) - output = ark.div(output, sum) - return output - - -class Softmax(ark.Module): - def __init__(self): - super(Softmax, self).__init__() - - def forward(self, input): - with ark.ContextManager( - processor_range=[0, 304], - warp_range=[0, 8], - sram_range=[0, 0], - task_id=0, - ): - max = ark.reduce_max( - input, - axis=-1, - config={ - "NumWarps": 1, - "ImplType": "WarpWise", - "SramBytes": 0, - "NumTasks": 65536, - }, - ) - output = ark.sub( - input, - max, - config={ - "NumWarps": 1, - "SramBytes": 0, - "Tile": [1, 2048], - "NumTasks": 65536, - }, - ) - output = ark.exp( - output, - config={ - "NumWarps": 1, - "SramBytes": 0, - "Tile": [1, 2048], - "NumTasks": 65536, - }, - ) - sum = ark.reduce_sum( - output, - axis=-1, - config={ - "NumWarps": 1, - "ImplType": "WarpWise", - "SramBytes": 0, - "NumTasks": 65536, - }, - ) - output = ark.div( - output, - sum, - config={ - "NumWarps": 1, - "SramBytes": 0, - "Tile": [1, 2048], - "NumTasks": 65536, - }, - ) - return output - - -def eval(tensor: ark.Tensor): - 
with ark.Runtime() as rt: - rt.launch() - rt.run() - return tensor.to_torch() - - -def perf(): - with ark.Runtime() as rt: - rt.launch() - - start = time.time() - rt.run(iter=1000) - end = time.time() - return (end - start) / 1000 - - -if __name__ == "__main__": - ark.init() - - shape = (32, 2048, 2048) - - input = torch.randn(*shape).to("cuda:0") - - output = Softmax()(ark.Tensor.from_torch(input)) - - if torch.allclose(eval(output), F.softmax(input, dim=-1), atol=1e-5): - print("Correct result") - else: - print("Incorrect result") - - print(f"Performance: {(perf() * 1e3):.3f} ms/iter") diff --git a/examples/tutorial/plan_manager_tutorial.py b/examples/tutorial/plan_manager_tutorial.py new file mode 100644 index 000000000..25aca7af6 --- /dev/null +++ b/examples/tutorial/plan_manager_tutorial.py @@ -0,0 +1,82 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import ark +import time +import torch +import torch.nn.functional as F + + +class VanillaSoftmax(ark.Module): + def __init__(self): + super(Softmax, self).__init__() + + def forward(self, input): + max = ark.reduce_max(input, axis=-1) + output = ark.sub(input, max) + output = ark.exp(output) + sum = ark.reduce_sum(output, axis=-1) + output = ark.div(output, sum) + return output + + +class Softmax(ark.Module): + def __init__(self): + super(Softmax, self).__init__() + + def forward(self, input): + with ark.PlanManager( + processor_range=[0, 304], + warp_range=[0, 8], + sram_range=[0, 0], + sync=False, + config={ + "NumWarps": 1, + "SramBytes": 0, + "NumTasks": 65536, + } + ): + with ark.PlanManager(config={"ImplType": "WarpWise"}): + max = ark.reduce_max(input, axis=-1) + with ark.PlanManager(config={"Tile": [1, 2048]}): + output = ark.sub(input, max) + output = ark.exp(output) + with ark.PlanManager(config={"ImplType": "WarpWise"}): + sum = ark.reduce_sum(output, axis=-1) + with ark.PlanManager(config={"Tile": [1, 2048]}): + output = ark.div(output, sum) + return output + + +def 
eval(tensor: ark.Tensor): + with ark.Runtime() as rt: + rt.launch() + rt.run() + return tensor.to_torch() + + +def perf(): + with ark.Runtime() as rt: + rt.launch() + + start = time.time() + rt.run(iter=1000) + end = time.time() + return (end - start) / 1000 + + +if __name__ == "__main__": + ark.init() + + shape = (32, 2048, 2048) + + input = torch.randn(*shape).to("cuda:0") + + output = Softmax()(ark.Tensor.from_torch(input)) + + if torch.allclose(eval(output), F.softmax(input, dim=-1), atol=1e-5): + print("Correct result") + else: + print("Incorrect result") + + print(f"Performance: {(perf() * 1e3):.3f} ms/iter") diff --git a/python/ark/__init__.py b/python/ark/__init__.py index 00370e683..db19b59d4 100644 --- a/python/ark/__init__.py +++ b/python/ark/__init__.py @@ -101,4 +101,4 @@ def set_world_size(world_size): ) from .planner import DefaultPlanner, Plan from .profiler import Profiler -from .context_manager import ContextManager +from .plan_manager import PlanManager diff --git a/python/ark/context_manager.py b/python/ark/context_manager.py deleted file mode 100644 index 443e1ca5d..000000000 --- a/python/ark/context_manager.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import json -from .model import Model -from ._ark_core import _ContextManager - - -class ContextManager(_ContextManager): - def __init__(self, **kwargs): - context_map = {key: json.dumps(value) for key, value in kwargs.items()} - super().__init__(Model.get_model(), context_map) - - def __enter__(self) -> "ContextManager": - """ - Enter the context manager. - """ - return self - - def __exit__(self, exc_type, exc_value, exc_tb): - """ - Exit the context manager. - """ - del self diff --git a/python/ark/plan_manager.py b/python/ark/plan_manager.py new file mode 100644 index 000000000..80e615ab8 --- /dev/null +++ b/python/ark/plan_manager.py @@ -0,0 +1,34 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +import json +from typing import List, Dict, Any +from .model import Model +from ._ark_core import _PlanManager + + +class PlanManager(_PlanManager): + def __init__(self, **kwargs): + """ + Plan manager for specifying the parallelization and tiling configuration of the operators in the context. + + Args: + processor_range (List[int], optional): The range of processors to be used. Defaults to None. + warp_range (List[int], optional): The range of warps to be used. Defaults to None. + sram_range (List[int], optional): The range of SRAMs to be used. Defaults to None. + sync (bool, optional): Whether to synchronize the execution. Defaults to True. + config (Dict[str, Any], optional): The configuration for the operators. Defaults to None. + """ + super().__init__(Model.get_model(), json.dumps(kwargs)) + + def __enter__(self) -> "PlanManager": + """ + Enter the plan manager. + """ + return self + + def __exit__(self, exc_type, exc_value, exc_tb): + """ + Exit the plan manager. + """ + del self diff --git a/python/ark_py.cpp b/python/ark_py.cpp index 7acd4ad1a..75788ba55 100644 --- a/python/ark_py.cpp +++ b/python/ark_py.cpp @@ -7,7 +7,7 @@ namespace py = pybind11; -extern void register_context_manager(py::module &m); +extern void register_plan_manager(py::module &m); extern void register_data_type(py::module &m); extern void register_dims(py::module &m); extern void register_error(py::module &m); @@ -23,7 +23,7 @@ extern void register_version(py::module &m); PYBIND11_MODULE(_ark_core, m) { m.doc() = "Bind ARK C++ APIs to Python"; - register_context_manager(m); + register_plan_manager(m); register_data_type(m); register_dims(m); register_error(m); diff --git a/python/context_manager_py.cpp b/python/context_manager_py.cpp deleted file mode 100644 index 3d703a4bc..000000000 --- a/python/context_manager_py.cpp +++ /dev/null @@ -1,15 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. 
- -#include -#include -#include - -#include - -namespace py = pybind11; - -void register_context_manager(py::module &m) { - py::class_(m, "_ContextManager") - .def(py::init&>()); -} diff --git a/python/plan_manager_py.cpp b/python/plan_manager_py.cpp new file mode 100644 index 000000000..34aa0b77c --- /dev/null +++ b/python/plan_manager_py.cpp @@ -0,0 +1,15 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#include +#include +#include + +#include + +namespace py = pybind11; + +void register_plan_manager(py::module &m) { + py::class_(m, "_PlanManager") + .def(py::init()); +} From 7a7f70e43d3e6e327abf5fe835fad1902c803ca0 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 30 Jul 2024 04:45:27 +0000 Subject: [PATCH 42/61] fix --- ark/api/plan_manager.cpp | 8 ++++---- ark/api/planner.cpp | 22 ++++++++-------------- examples/tutorial/plan_manager_tutorial.py | 3 +-- python/ark/tensor.py | 7 +++++-- 4 files changed, 18 insertions(+), 22 deletions(-) diff --git a/ark/api/plan_manager.cpp b/ark/api/plan_manager.cpp index aee8d4f7b..8cb1940b1 100644 --- a/ark/api/plan_manager.cpp +++ b/ark/api/plan_manager.cpp @@ -17,7 +17,9 @@ class PlanManagerState { static std::map gPlanManagerStates; -PlanManager::PlanManager(Model& model, const std::string& plan_context) : model_id_(model.id()), stop_sync_(false) { +PlanManager::PlanManager(Model& model, const std::string& plan_context) + : model_id_(model.id()), stop_sync_(false) { + static int task_group_id = 0; auto ctx = Json::parse(plan_context); if (!ctx.is_object()) { ERR(ModelError, "plan context must be a JSON object"); @@ -36,9 +38,7 @@ PlanManager::PlanManager(Model& model, const std::string& plan_context) : model_ if (state.sync && !value.get()) { stop_sync_ = true; state.sync = false; - context_map["AppendTask"] = "true"; - } else if (!state.sync) { - context_map["AppendTask"] = "true"; + context_map["TaskGroupId"] = std::to_string(task_group_id++); } } else if (key == 
"processor_range") { if (!value.is_array()) { diff --git a/ark/api/planner.cpp b/ark/api/planner.cpp index 1c40e5301..032be0d6f 100644 --- a/ark/api/planner.cpp +++ b/ark/api/planner.cpp @@ -61,7 +61,7 @@ std::string DefaultPlanner::Impl::plan(bool pretty) const { size_t max_processor_id = 1; size_t max_warp_id = 1; size_t next_task_id = 0; - bool prev_append_task = false; + int prev_task_group_id = -1; bool first_op = true; auto get_context = [&](const ModelNodeRef &node, @@ -73,13 +73,6 @@ std::string DefaultPlanner::Impl::plan(bool pretty) const { }; for (const auto &node : model_.nodes()) { - std::string context = ""; - for (const auto &[key, value] : node->context) { - context += key + "=" + value + ","; - } - context += "prev_append_task=" + std::to_string(prev_append_task); - LOG(INFO, context); - for (const auto &op : node->ops) { if (op->is_virtual()) continue; @@ -106,10 +99,12 @@ std::string DefaultPlanner::Impl::plan(bool pretty) const { size_t num_warps = config["NumWarps"]; size_t num_tasks = config["NumTasks"]; size_t sram_bytes = config["SramBytes"]; + size_t granularity = config.value("Granularity", 1); - auto ctx_append_task = get_context(node, "AppendTask"); - if (!ctx_append_task.empty() && ctx_append_task.get() && - prev_append_task) { + auto ctx_task_group_id = get_context(node, "TaskGroupId"); + int task_group_id = + ctx_task_group_id.empty() ? 
-1 : ctx_task_group_id.get(); + if (task_group_id != -1 && task_group_id == prev_task_group_id) { auto &task_info = task_infos.back(); task_info["NumWarps"] = std::max(task_info["NumWarps"].get(), num_warps); @@ -161,14 +156,13 @@ std::string DefaultPlanner::Impl::plan(bool pretty) const { } resource_group["TaskGroups"] = {{{"TaskId", task_info["Id"]}, {"TaskRange", {0, num_tasks}}, - {"Granularity", 1}}}; + {"Granularity", granularity}}}; processor_group["ResourceGroups"] = Json::array(); processor_group["ResourceGroups"].push_back(resource_group); processor_groups.push_back(processor_group); } - prev_append_task = - !ctx_append_task.empty() && ctx_append_task.get(); + prev_task_group_id = task_group_id; first_op = false; } } diff --git a/examples/tutorial/plan_manager_tutorial.py b/examples/tutorial/plan_manager_tutorial.py index 25aca7af6..c840ce0c0 100644 --- a/examples/tutorial/plan_manager_tutorial.py +++ b/examples/tutorial/plan_manager_tutorial.py @@ -26,7 +26,6 @@ def __init__(self): def forward(self, input): with ark.PlanManager( - processor_range=[0, 304], warp_range=[0, 8], sram_range=[0, 0], sync=False, @@ -34,7 +33,7 @@ def forward(self, input): "NumWarps": 1, "SramBytes": 0, "NumTasks": 65536, - } + }, ): with ark.PlanManager(config={"ImplType": "WarpWise"}): max = ark.reduce_max(input, axis=-1) diff --git a/python/ark/tensor.py b/python/ark/tensor.py index 657da1065..eed7a4259 100644 --- a/python/ark/tensor.py +++ b/python/ark/tensor.py @@ -193,7 +193,9 @@ def from_torch(tensor: torch.Tensor, runtime_id: int = -1) -> "Tensor": ark_tensor = _Tensor(dl_capsule, ark_dtype.ctype()) return Tensor(ark_tensor, runtime_id=runtime_id) - def copy(self, data: Union[np.ndarray, torch.Tensor]) -> "Tensor": + def copy( + self, data: Union[np.ndarray, torch.Tensor], stream: int = 0 + ) -> "Tensor": """ Copies data into this tensor. The data type may differ, but the size must match. 
@@ -214,6 +216,7 @@ def copy(self, data: Union[np.ndarray, torch.Tensor]) -> "Tensor": self._tensor, data.data_ptr(), tensor_bytes, + stream, data.device.type == "cuda", ) elif isinstance(data, np.ndarray): @@ -221,7 +224,7 @@ def copy(self, data: Union[np.ndarray, torch.Tensor]) -> "Tensor": data = np.ascontiguousarray(data) if data.nbytes != tensor_bytes: raise ValueError("data size does not match the tensor") - rt.executor.tensor_write(self._tensor, data) + rt.executor.tensor_write(self._tensor, data, stream) else: raise ValueError("data must be a numpy array or a torch tensor") return self From a77a2ea6b864562f4e916dbaaf30f82e080aad93 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 30 Jul 2024 05:48:00 +0000 Subject: [PATCH 43/61] llama example --- examples/llama/model_7b_b1_s2048.py | 704 ++++++++++++++++++++++++++++ examples/llama/model_test.py | 6 +- 2 files changed, 708 insertions(+), 2 deletions(-) create mode 100644 examples/llama/model_7b_b1_s2048.py diff --git a/examples/llama/model_7b_b1_s2048.py b/examples/llama/model_7b_b1_s2048.py new file mode 100644 index 000000000..f41304e85 --- /dev/null +++ b/examples/llama/model_7b_b1_s2048.py @@ -0,0 +1,704 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +"""LLaMA 2 Transformer model. 
+ Correspond to https://github.com/facebookresearch/llama/blob/main/llama/model.py +""" + +import ark +import math +from dataclasses import dataclass +from typing import Optional +import os + + +@dataclass +class ModelArgs: + dim: int = 4096 + n_layers: int = 32 + n_heads: int = 32 + n_kv_heads: Optional[int] = None + vocab_size: int = -1 # defined later by tokenizer + multiple_of: int = ( + 256 # make SwiGLU hidden layer size multiple of large power of 2 + ) + ffn_dim_multiplier: Optional[float] = None + norm_eps: float = 1e-5 + max_batch_size: int = 32 + max_seq_len: int = 2048 + + +@dataclass +class ModelArgs7B(ModelArgs): + dim: int = 4096 + n_layers: int = 32 + n_heads: int = 32 + n_kv_heads: Optional[int] = None + vocab_size: int = -1 # defined later by tokenizer + multiple_of: int = ( + 256 # make SwiGLU hidden layer size multiple of large power of 2 + ) + ffn_dim_multiplier: Optional[float] = None + norm_eps: float = 1e-5 + max_batch_size: int = 32 + max_seq_len: int = 2048 + + +@dataclass +class ModelArgs13B(ModelArgs): + dim: int = 5120 + n_layers: int = 40 + n_heads: int = 40 + n_kv_heads: Optional[int] = None + vocab_size: int = -1 # defined later by tokenizer + multiple_of: int = ( + 256 # make SwiGLU hidden layer size multiple of large power of 2 + ) + ffn_dim_multiplier: Optional[float] = None + norm_eps: float = 1e-5 + max_batch_size: int = 32 + max_seq_len: int = 2048 + + +@dataclass +class ModelArgs70B(ModelArgs): + dim: int = 8192 + n_layers: int = 80 + n_heads: int = 64 + n_kv_heads: Optional[int] = 8 + vocab_size: int = -1 + multiple_of: int = ( + 4096 # make SwiGLU hidden layer size multiple of large power of 2 + ) + ffn_dim_multiplier: Optional[float] = 1.3 + norm_eps: float = 1e-5 + max_batch_size: int = 32 + max_seq_len: int = 4096 + + +class RMSNorm(ark.Module): + """ + Root mean square layer normalization (RMSNorm). 
+ """ + + def __init__( + self, dim: int, eps: float = 1e-6, dtype: ark.DataType = ark.fp16 + ): + super().__init__() + self.eps = eps + self.dtype = dtype + self.weight = ark.parameter([1, 1, dim], ark.fp32) + + def forward(self, x): + with ark.PlanManager( + warp_range=[0, 8], + sync=False, + config={ + "NumWarps": 1, + "SramBytes": 0, + "NumTasks": 2048, + "Granularity": 7, + }, + ): + with ark.PlanManager(config={"Tile": [1, 4096]}): + x = ark.cast(x, ark.fp32) + x2 = ark.mul(x, x) + with ark.PlanManager(config={"ImplType": "WarpWise"}): + mean = ark.reduce_mean(x2, axis=-1) + with ark.PlanManager( + config={ + "NumWarps": 1, + "SramBytes": 0, + "Tile": [64, 1], + "NumTasks": 32, + } + ): + rrms = ark.rsqrt(mean) + with ark.PlanManager( + warp_range=[0, 8], + sync=False, + config={ + "NumWarps": 1, + "SramBytes": 0, + "NumTasks": 2048, + "Tile": [1, 4096], + "Granularity": 7, + }, + ): + x = ark.mul(x, rrms) + x = ark.mul(x, self.weight, x) + return ark.cast(x, self.dtype) + + +class ColumnParallelLinear(ark.Module): + """Linear layer with column parallelism. + + The linear layer is defined as Y = XA + b. A is parallelized along + its second dimension as A = [A_1, ..., A_p]. + Here the weight = A^T, so we need to partition the weight matrix along + its first dimension. 
+ + """ + + def __init__( + self, + in_dim: int, + out_dim: int, + dtype: ark.DataType = ark.fp16, + gather_output: bool = True, + local_rank: int = 0, + world_size: int = 1, + ): + super().__init__() + self.in_dim = in_dim + self.out_dim = out_dim + self.dtype = dtype + self.local_rank = local_rank + self.world_size = world_size + self.gather_output = gather_output + + self.weight = ark.parameter([out_dim // world_size, in_dim], dtype) + + def forward(self, x): + if self.world_size == 1 or self.gather_output == False: + return ark.matmul(x, self.weight, transpose_other=True) + # We need to concat the output_tensor_shards along the last dimension + output_tensor = ark.tensor( + [x.shape()[0], x.shape()[1], self.out_dim], self.dtype + ) + output_tensor_shards = ark.sharding( + output_tensor, + axis=2, + dim_per_shard=self.out_dim // self.world_size, + ) + local_result = ark.identity( + output_tensor_shards[self.local_rank], deps=output_tensor_shards + ) + # (batch_size, seq_len, out_dim // world_size) + local_result = ark.matmul( + x, self.weight, local_result, transpose_other=True + ) + gather_input = ark.identity(output_tensor, deps=[local_result]) + # return gather_input + gather_reshape = ark.reshape( + gather_input, [x.shape()[0] * x.shape()[1], self.out_dim] + ) + gather_out = ark.local_all_gather( + gather_reshape, self.local_rank, self.world_size, 1 + ) + return ark.reshape( + gather_out, [x.shape()[0], x.shape()[1], self.out_dim] + ) + + +class RowParallelLinear(ark.Module): + """Linear layer with row parallelism. + + The linear layer is defined as Y = XA + b. A is parallelized along + its first dimension and X along its second dimension as: + - - + | A_1 | + | . | + A = | . | X = [X_1, ..., X_p] + | . | + | A_p | + - - + + Here the weight = A^T, so we need to partition the weight matrix along + its second dimension. 
+ """ + + def __init__( + self, + in_dim: int, + out_dim: int, + dtype: ark.DataType = ark.fp16, + input_is_parallel: bool = False, + local_rank: int = 0, + world_size: int = 1, + ): + super().__init__() + self.in_dim = in_dim + self.out_dim = out_dim + self.dtype = dtype + self.local_rank = local_rank + self.world_size = world_size + self.input_is_parallel = input_is_parallel + + self.weight = ark.parameter([out_dim, in_dim // world_size], dtype) + + def forward(self, x): + if self.world_size == 1: + return ark.matmul(x, self.weight, transpose_other=True) + x_ndims = len(x.shape()) + if self.input_is_parallel: + input_parallel = x + else: + x_shards = ark.sharding( + x, x_ndims - 1, self.in_dim // self.world_size + ) + input_parallel = x_shards[self.local_rank] + local_result = ark.matmul( + input_parallel, self.weight, transpose_other=True + ) + reduced_result = ark.local_all_reduce( + local_result, self.local_rank, self.world_size + ) + return reduced_result + + +class ParallelEmbedding(ark.Module): + """Embedding layer.""" + + def __init__( + self, + vocab_size: int, + dim: int, + dtype: ark.DataType, + local_rank: int = 0, + world_size: int = 1, + ): + super().__init__() + self.vocab_size = vocab_size + self.dim = dim + self.weight = ark.parameter([vocab_size, dim // world_size], dtype) + self.out_dim = dim + self.dtype = dtype + self.world_size = world_size + self.local_rank = local_rank + + def forward(self, x): + if self.world_size == 1: + return ark.embedding(x, self.weight) + + output_tensor = ark.tensor( + [x.shape()[0], x.shape()[1], self.out_dim], self.dtype + ) + output_tensor_shards = ark.sharding( + output_tensor, axis=2, dim_per_shard=self.out_dim // self.world_size + ) + local_result = ark.identity( + output_tensor_shards[self.local_rank], deps=output_tensor_shards + ) + local_result = ark.embedding(x, self.weight, local_result) + gather_input = ark.identity(output_tensor, deps=[local_result]) + gather_reshape = ark.reshape( + gather_input, 
[x.shape()[0] * x.shape()[1], self.out_dim] + ) + gather_out = ark.local_all_gather( + gather_reshape, self.local_rank, self.world_size, 1 + ) + return ark.reshape( + gather_out, [x.shape()[0], x.shape()[1], self.out_dim] + ) + + +class Linear(ark.Module): + """ + Linear layer module with weights and no bias. + """ + + def __init__( + self, in_dim: int, out_dim: int, dtype: ark.DataType = ark.fp16 + ): + super().__init__() + self.dtype = dtype + self.weight = ark.parameter([out_dim, in_dim], dtype) + + def forward(self, x): + return ark.matmul(x, self.weight, transpose_other=True) + + +class Silu(ark.Module): + """ + Silu activation function, silu(x) = x * sigmoid(x) + """ + + def __init__(self): + super().__init__() + + def forward(self, x: ark.Tensor): + # We need to specify output tensor so that the sigmoid op will not be an in-place operator + output = ark.tensor(x.shape(), x.dtype()) + x1 = ark.sigmoid(x, output) + return ark.mul(x, x1) + + +class FeedForward(ark.Module): + def __init__( + self, + dim: int, + hidden_dim: int, + multiple_of: int, + ffn_dim_multiplier: Optional[float], + dtype: ark.DataType = ark.fp16, + local_rank: int = 0, + world_size: int = 1, + ): + super().__init__() + hidden_dim = int(2 * hidden_dim / 3) + # custom dim factor multiplier + if ffn_dim_multiplier is not None: + hidden_dim = int(ffn_dim_multiplier * hidden_dim) + hidden_dim = multiple_of * ( + (hidden_dim + multiple_of - 1) // multiple_of + ) + + self.w1 = ColumnParallelLinear( + dim, hidden_dim, dtype, False, local_rank, world_size + ) + self.w2 = RowParallelLinear( + hidden_dim, dim, dtype, True, local_rank, world_size + ) + self.w3 = ColumnParallelLinear( + dim, hidden_dim, dtype, False, local_rank, world_size + ) + + def forward(self, x): + # self.w2(F.silu(self.w1(x)) * self.w3(x)) + with ark.PlanManager( + warp_range=[0, 8], + sram_range=[0, 49344], + sync=False, + config={ + "NumWarps": 4, + "NumTasks": 688, + }, + ): + with ark.PlanManager( + config={"SramBytes": 
24672, "TileShapeMNK": [256, 128, 32]} + ): + x1 = self.w1(x) + with ark.PlanManager(config={"SramBytes": 0, "Tile": [256, 128]}): + x1 = Silu()(x1) + with ark.PlanManager( + warp_range=[0, 8], + sram_range=[0, 49344], + sync=False, + config={ + "NumWarps": 4, + "NumTasks": 688, + }, + ): + with ark.PlanManager( + config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]} + ): + x2 = self.w3(x) + with ark.PlanManager(config={"SramBytes": 0, "Tile": [256, 128]}): + x3 = ark.mul(x1, x2) + x4 = self.w2(x3) + return x4 + + +def apply_rotary_emb(xq, xk, freqs_cis): + """ + Apply rotary embeddings to xq and xk. + """ + xq_out = ark.rope(xq, freqs_cis) + xk_out = ark.rope(xk, freqs_cis) + return xq_out, xk_out + + +class Softmax(ark.Module): + def __init__(self): + super(Softmax, self).__init__() + + def forward(self, input): + with ark.PlanManager( + warp_range=[0, 8], + sram_range=[0, 0], + sync=False, + config={ + "NumWarps": 1, + "SramBytes": 0, + "NumTasks": 65536, + }, + ): + with ark.PlanManager(config={"ImplType": "WarpWise"}): + max = ark.reduce_max(input, axis=-1) + with ark.PlanManager(config={"Tile": [1, 2048]}): + output = ark.sub(input, max) + output = ark.exp(output) + with ark.PlanManager(config={"ImplType": "WarpWise"}): + sum = ark.reduce_sum(output, axis=-1) + with ark.PlanManager(config={"Tile": [1, 2048]}): + output = ark.div(output, sum) + return output + + +class Attention(ark.Module): + def __init__( + self, + args: ModelArgs, + dtype: ark.DataType = ark.fp16, + local_rank: int = 0, + world_size: int = 1, + ): + super().__init__() + self.n_kv_heads = ( + args.n_heads if args.n_kv_heads is None else args.n_kv_heads + ) + model_parallel_size = world_size + self.dtype = dtype + self.n_local_heads = args.n_heads // model_parallel_size + self.n_local_kv_heads = self.n_kv_heads // model_parallel_size + self.n_rep = self.n_local_heads // self.n_local_kv_heads + self.head_dim = args.dim // args.n_heads + self.wq = ColumnParallelLinear( + args.dim, + 
args.n_heads * self.head_dim, + dtype, + False, + local_rank, + world_size, + ) + self.wk = ColumnParallelLinear( + args.dim, + self.n_kv_heads * self.head_dim, + dtype, + False, + local_rank, + world_size, + ) + self.wv = ColumnParallelLinear( + args.dim, + self.n_kv_heads * self.head_dim, + dtype, + False, + local_rank, + world_size, + ) + self.wo = RowParallelLinear( + args.n_heads * self.head_dim, + args.dim, + dtype, + True, + local_rank, + world_size, + ) + + def forward( + self, + x: ark.Tensor, + start_pos: int, + freqs_cis: ark.Tensor, + mask: Optional[ark.Tensor], + ): + bsz, seqlen, _ = x.shape() + + with ark.PlanManager( + warp_range=[0, 4], + sram_range=[0, 24672], + sync=False, + config={"NumWarps": 4, "NumTasks": 256}, + ): + with ark.PlanManager( + config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]} + ): + xq = self.wq(x) + xq = ark.reshape( + xq, [bsz, seqlen, self.n_local_heads, self.head_dim] + ) + with ark.PlanManager( + config={"SramBytes": 0, "Tile": [256, 1, 128]} + ): + if freqs_cis is not None: + xq = ark.rope(xq, freqs_cis) + with ark.PlanManager(config={"SramBytes": 0, "Tile": [256, 128]}): + xq = ark.transpose(xq, [0, 2, 1, 3]) + + with ark.PlanManager( + warp_range=[0, 4], + sram_range=[0, 24672], + sync=False, + config={"NumWarps": 4, "NumTasks": 256}, + ): + with ark.PlanManager( + config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]} + ): + xk = self.wk(x) + xk = ark.reshape( + xk, [bsz, seqlen, self.n_local_kv_heads, self.head_dim] + ) + with ark.PlanManager( + config={"SramBytes": 0, "Tile": [256, 1, 128]} + ): + if freqs_cis is not None: + xk = ark.rope(xk, freqs_cis) + keys = xk + with ark.PlanManager(config={"SramBytes": 0, "Tile": [256, 128]}): + keys = ark.transpose(keys, [0, 2, 1, 3]) + + with ark.PlanManager( + warp_range=[0, 4], + sram_range=[0, 24672], + sync=False, + config={ + "NumWarps": 4, + "NumTasks": 256, + "SramBytes": 24672, + "TileShapeMNK": [256, 128, 32], + }, + ): + with ark.PlanManager( + 
config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]} + ): + xv = self.wv(x) + xv = ark.reshape( + xv, [bsz, seqlen, self.n_local_kv_heads, self.head_dim] + ) + values = xv + with ark.PlanManager( + config={"SramBytes": 0, "Tile": [256, 1, 128]} + ): + values = ark.transpose(values, [0, 2, 1, 3]) + + with ark.PlanManager( + warp_range=[0, 8], + sram_range=[0, 49344], + sync=False, + config={ + "NumWarps": 4, + "NumTasks": 4096, + "Granularity": 2, + }, + ): + with ark.PlanManager( + config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]} + ): + scores = ark.matmul(xq, keys, transpose_other=True) + with ark.PlanManager(config={"SramBytes": 0, "Tile": [256, 128]}): + scores = ark.mul(scores, 1.0 / math.sqrt(self.head_dim)) + + if mask is not None: + scores = ark.add(scores, mask) + + scores = Softmax()(scores) + + with ark.PlanManager( + warp_range=[0, 4], + sram_range=[0, 24672], + sync=False, + config={ + "NumWarps": 4, + "NumTasks": 256, + }, + ): + with ark.PlanManager( + config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]} + ): + output = ark.matmul(scores, values) + with ark.PlanManager( + config={"SramBytes": 0, "Tile": [256, 1, 128]} + ): + output = ark.transpose(output, [0, 2, 1, 3]) + output = ark.reshape( + output, [bsz, seqlen, self.head_dim * self.n_local_heads] + ) + return self.wo(output) + + +class TransformerBlock(ark.Module): + def __init__( + self, + layer_id: int, + args: ModelArgs, + dtype: ark.DataType = ark.fp16, + local_rank: int = 0, + world_size: int = 1, + ): + super().__init__() + self.n_heads = args.n_heads + self.dim = args.dim + self.head_dim = args.dim // args.n_heads + self.attention = Attention(args, dtype, local_rank, world_size) + self.feed_forward = FeedForward( + dim=args.dim, + hidden_dim=4 * args.dim, + multiple_of=args.multiple_of, + ffn_dim_multiplier=args.ffn_dim_multiplier, + dtype=dtype, + local_rank=local_rank, + world_size=world_size, + ) + self.layer_id = layer_id + self.attention_norm = 
RMSNorm(args.dim, eps=args.norm_eps, dtype=dtype) + self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps, dtype=dtype) + + def forward( + self, + x: ark.Tensor, + start_pos: int, + freqs_cis: ark.Tensor, + mask: Optional[ark.Tensor], + ): + attention_norm_x = self.attention_norm(x) + h = self.attention.forward(attention_norm_x, start_pos, freqs_cis, mask) + with ark.PlanManager( + warp_range=[0, 4], + config={ + "NumWarps": 4, + "Tile": [256, 128], + "NumTasks": 256, + "SramBytes": 0, + }, + ): + h = ark.add(x, h) + ff = self.feed_forward(self.ffn_norm(h)) + with ark.PlanManager( + warp_range=[0, 4], + config={ + "NumWarps": 4, + "Tile": [256, 128], + "NumTasks": 256, + "SramBytes": 0, + }, + ): + out = ark.add(h, ff) + return out + + +class Transformer(ark.Module): + def __init__( + self, + params: ModelArgs, + dtype: ark.DataType = ark.fp16, + local_rank: int = 0, + world_size: int = 1, + ): + super().__init__() + self.params = params + self.vocab_size = params.vocab_size + self.n_layers = params.n_layers + + self.tok_embeddings = ParallelEmbedding( + params.vocab_size, params.dim, dtype, local_rank, world_size + ) + + self.layers = [] + for layer_id in range(self.n_layers): + self.layers.append( + TransformerBlock( + layer_id, params, dtype, local_rank, world_size + ) + ) + self.register_module(f"layers.{layer_id}", self.layers[layer_id]) + self.norm = RMSNorm(params.dim, eps=params.norm_eps, dtype=dtype) + self.output = ColumnParallelLinear( + params.dim, params.vocab_size, dtype, True, local_rank, world_size + ) + + def forward( + self, + tokens: ark.Tensor, + start_pos: int, + freqs_cis: ark.Tensor, + mask: Optional[ark.Tensor], + ): + h = self.tok_embeddings(tokens) + + for layer in self.layers: + h = layer(h, start_pos, freqs_cis, mask) + h = self.norm(h) + output = self.output(h) + return output diff --git a/examples/llama/model_test.py b/examples/llama/model_test.py index 19c680854..f559a826b 100644 --- a/examples/llama/model_test.py +++ 
b/examples/llama/model_test.py @@ -59,8 +59,10 @@ def run_ark( output = module(*module_inputs) with ark.Runtime() as rt: - plan = ark.Plan.from_file("plan_llama2_7b_b1_s2048.json") - rt.launch(plan) + plan = ark.DefaultPlanner().plan() + with open("plan.json", "w") as f: + f.write(str(plan)) + rt.launch(plan=plan) # Load model parameters if state_dict: From 78ac0dacb70e26ef5dc8704c0bb69c7c47240cbd Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 6 Aug 2024 08:06:32 +0000 Subject: [PATCH 44/61] fix merge --- ark/include/ark/executor.hpp | 2 +- ark/ops/ops_test_common.cpp | 2 +- ark/ops/ops_test_common.hpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ark/include/ark/executor.hpp b/ark/include/ark/executor.hpp index f0a108a1f..3744c33db 100644 --- a/ark/include/ark/executor.hpp +++ b/ark/include/ark/executor.hpp @@ -95,7 +95,7 @@ class DefaultExecutor : public Executor { public: DefaultExecutor( const Model &model, int device_id = -1, Stream stream = nullptr, - const std::vector &config_rules = {}, + const std::vector &config_rules = {}, const std::string &name = "DefaultExecutor", bool loop_mode = true); }; diff --git a/ark/ops/ops_test_common.cpp b/ark/ops/ops_test_common.cpp index 2bd9ce2e7..4e94d06a7 100644 --- a/ark/ops/ops_test_common.cpp +++ b/ark/ops/ops_test_common.cpp @@ -35,7 +35,7 @@ OpsTestResult op_test( const std::string &test_name_prefix, const Model &model, const std::vector &inputs, const std::vector &outputs, OpsTestBaseline baseline, const std::vector &inputs_data, - const std::vector &config_rules, + const std::vector &config_rules, bool print_on_error) { DefaultExecutor exe(model, -1, nullptr, config_rules); exe.compile(); diff --git a/ark/ops/ops_test_common.hpp b/ark/ops/ops_test_common.hpp index c5d640f3b..3848773e6 100644 --- a/ark/ops/ops_test_common.hpp +++ b/ark/ops/ops_test_common.hpp @@ -171,7 +171,7 @@ OpsTestResult op_test( const std::string &test_name_prefix, const Model &model, const std::vector 
&inputs, const std::vector &outputs, OpsTestBaseline baseline, const std::vector &inputs_data = {}, - const std::vector &config_rules = {}, + const std::vector &config_rules = {}, bool print_on_error = false); OpsTestGpuMem to_gpu(void *host_ptr, size_t size); From afb518a7622363b000e9fc1d21c4cf8178c3461d Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 6 Aug 2024 08:09:48 +0000 Subject: [PATCH 45/61] fix merge --- ark/api/executor.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp index 58d058d25..42ed45128 100644 --- a/ark/api/executor.cpp +++ b/ark/api/executor.cpp @@ -233,8 +233,6 @@ void Executor::Impl::init(const PlanJson &plan_json) { } auto gpu_manager = GpuManager::get_instance(device_id_); - - auto gpu_manager = GpuManager::get_instance(gpu_id_); if (!gpu_manager->info().arch->belongs_to( Arch::from_name(plan_json.at("Architecture")))) { LOG(WARN, "Architecture name of the plan `", @@ -779,7 +777,7 @@ void Executor::Impl::barrier() { uintptr_t Executor::Impl::tensor_address(const Tensor tensor) const { size_t buffer_id = tensor.ref()->buffer()->id(); if (buffer_id_to_offset_.find(buffer_id) == buffer_id_to_offset_.end()) { - ERR(NotFoundError, "Invalid buffer ID: ", buffer_id); + ERR(InternalError, "Invalid buffer ID: ", buffer_id); } size_t offset = buffer_id_to_offset_.at(buffer_id); return reinterpret_cast(buffer_->ref(offset)); From 762bf4aa439510dbc04e4f9ee83da84c7a32a03a Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 6 Aug 2024 16:30:57 +0000 Subject: [PATCH 46/61] fix merge --- ark/ops/ops_all_reduce_test.cpp | 15 +++++++-------- ark/ops/ops_communication_test.cpp | 2 +- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/ark/ops/ops_all_reduce_test.cpp b/ark/ops/ops_all_reduce_test.cpp index 90814d036..8cf68b085 100644 --- a/ark/ops/ops_all_reduce_test.cpp +++ b/ark/ops/ops_all_reduce_test.cpp @@ -125,10 +125,9 @@ void 
test_all_reduce_packet_internal(ark::DimType nelem) { std::vector ones_vec(ones.shape().nelems(), ark::half_t(1.0f)); - auto result = - ark::op_test("all_reduce_packet", m, {ones}, {output}, - baseline_all_reduce, - {ones_vec.data()}, false, gpu_id, NumGpus); + auto result = ark::op_test( + "all_reduce_packet", m, {ones}, {output}, + baseline_all_reduce, {ones_vec.data()}); UNITTEST_LOG(result); UNITTEST_EQ(result.max_diff[0], 0.0f); return ark::unittest::SUCCESS; @@ -232,10 +231,10 @@ void test_all_reduce_sm_internal(ark::DimType nelem) { std::vector ones_vec(ones.shape().nelems(), ark::half_t(1.0f)); - auto result = ark::op_test( - "all_reduce_sm", m, {ones}, {output}, - baseline_all_reduce, {ones_vec.data()}, - false, gpu_id, NumGpus, config_rule); + auto result = + ark::op_test("all_reduce_sm", m, {ones}, {output}, + baseline_all_reduce, + {ones_vec.data()}, {config_rule}); UNITTEST_LOG(result); UNITTEST_EQ(result.max_diff[0], 0.0f); return ark::unittest::SUCCESS; diff --git a/ark/ops/ops_communication_test.cpp b/ark/ops/ops_communication_test.cpp index db384c1f4..8cdad41b2 100644 --- a/ark/ops/ops_communication_test.cpp +++ b/ark/ops/ops_communication_test.cpp @@ -433,7 +433,7 @@ ark::unittest::State test_communication_send_recv_reduce() { ark::Planner planner(model, gpu_id); planner.install_config_rule(config_rule); - ark::Executor exe(gpu_id, 2, gpu_id, "Executor", planner.plan()); + ark::Executor exe(gpu_id, nullptr, "Executor", planner.plan()); exe.compile(); std::vector data(1024); From f654f0b08d48931acd5645c16300c1a6f3ebe88e Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 6 Aug 2024 16:34:21 +0000 Subject: [PATCH 47/61] add a python method --- python/executor_py.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/python/executor_py.cpp b/python/executor_py.cpp index e782a99fe..a3f2a078b 100644 --- a/python/executor_py.cpp +++ b/python/executor_py.cpp @@ -63,6 +63,7 @@ void register_executor(py::module &m) { .def("barrier", 
&ark::Executor::barrier) .def("destroy", &ark::Executor::destroy) .def("destroyed", &ark::Executor::destroyed) + .def("tensor_address", &ark::Executor::tensor_address) .def("tensor_read", py::overload_cast(&tensor_read), From 498926c6242a35a38ffd6a8c406b4f3cf1ff84c6 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 6 Aug 2024 16:35:28 +0000 Subject: [PATCH 48/61] submodule update --- third_party/mscclpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/mscclpp b/third_party/mscclpp index cddffbc8b..40cb19655 160000 --- a/third_party/mscclpp +++ b/third_party/mscclpp @@ -1 +1 @@ -Subproject commit cddffbc8b6dfa6facf7c64c1b7d73acf30e600b3 +Subproject commit 40cb1965538ab98fea3cc9fe004f730e23e84829 From 3e331a2e2f5487502daccc32890ef49c5d86eb12 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 6 Aug 2024 17:12:15 +0000 Subject: [PATCH 49/61] fix --- ark/model/model_json.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/ark/model/model_json.cpp b/ark/model/model_json.cpp index b82f9e484..c2099e2c9 100644 --- a/ark/model/model_json.cpp +++ b/ark/model/model_json.cpp @@ -287,6 +287,7 @@ PlanJson::PlanJson(const Json &json) : Json((json != nullptr) ? 
json : Json{{"Rank", 0}, {"WorldSize", 1}, + {"Architecture", "ANY"}, {"NumProcessors", 1}, {"NumWarpsPerProcessor", 1}, {"TaskInfos", Json::array()}, From 10bfa75dbd40a96ffca69fb22e89127e1839b940 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 6 Aug 2024 17:14:47 +0000 Subject: [PATCH 50/61] Rename CMake environments --- .github/workflows/codeql.yml | 4 ++-- .github/workflows/ut-cuda.yml | 2 +- CMakeLists.txt | 32 ++++++++++++++++---------------- ark/CMakeLists.txt | 10 +++++----- pyproject.toml | 2 +- third_party/CMakeLists.txt | 9 +++++++-- third_party/mscclpp | 2 +- 7 files changed, 33 insertions(+), 28 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 0d7094c36..272cb8ebe 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -49,7 +49,7 @@ jobs: - name: Build run: | mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Debug -DBUILD_PYTHON=ON -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON -DBUILD_TESTS=OFF .. + cmake -DCMAKE_BUILD_TYPE=Debug -DARK_BUILD_PYTHON=ON -DARK_BYPASS_GPU_CHECK=ON -DARK_USE_CUDA=ON -DARK_BUILD_TESTS=OFF .. make -j build ark_py - name: Perform CodeQL Analysis @@ -95,7 +95,7 @@ jobs: - name: Build run: | mkdir build && cd build - CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Debug -DBUILD_PYTHON=ON -DBYPASS_GPU_CHECK=ON -DUSE_ROCM=ON -DBUILD_TESTS=OFF .. + CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Debug -DARK_BUILD_PYTHON=ON -DARK_BYPASS_GPU_CHECK=ON -DARK_USE_ROCM=ON -DARK_BUILD_TESTS=OFF .. make -j build ark_py - name: Perform CodeQL Analysis diff --git a/.github/workflows/ut-cuda.yml b/.github/workflows/ut-cuda.yml index 4e573adfb..c2e8e7c50 100644 --- a/.github/workflows/ut-cuda.yml +++ b/.github/workflows/ut-cuda.yml @@ -44,7 +44,7 @@ jobs: - name: Build run: | mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Debug -DBUILD_PYTHON=ON .. + cmake -DCMAKE_BUILD_TYPE=Debug -DARK_BUILD_PYTHON=ON .. 
make -j ut ark_py - name: Run C++ UT diff --git a/CMakeLists.txt b/CMakeLists.txt index ee1e3566e..2e80ea1e8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,16 +13,16 @@ enable_language(CXX) list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") -option(USE_CUDA "Use NVIDIA/CUDA." OFF) -option(USE_ROCM "Use AMD/ROCm." OFF) -option(BYPASS_GPU_CHECK "Bypass GPU check." OFF) -option(BUILD_TESTS "Build unit tests." ON) +option(ARK_USE_CUDA "Use NVIDIA/CUDA." OFF) +option(ARK_USE_ROCM "Use AMD/ROCm." OFF) +option(ARK_BYPASS_GPU_CHECK "Bypass GPU check." OFF) +option(ARK_BUILD_TESTS "Build unit tests." ON) -if(BYPASS_GPU_CHECK) - if(USE_CUDA) +if(ARK_BYPASS_GPU_CHECK) + if(ARK_USE_CUDA) message("Bypassing GPU check: using NVIDIA/CUDA.") find_package(CUDAToolkit REQUIRED) - elseif(USE_ROCM) + elseif(ARK_USE_ROCM) message("Bypassing GPU check: using AMD/ROCm.") set(CMAKE_PREFIX_PATH "/opt/rocm;${CMAKE_PREFIX_PATH}") find_package(hip REQUIRED) @@ -35,16 +35,16 @@ else() include(CheckAmdGpu) if(NVIDIA_FOUND AND AMD_FOUND) message("Detected NVIDIA/CUDA and AMD/ROCm: prioritizing NVIDIA/CUDA.") - set(USE_CUDA ON) - set(USE_ROCM OFF) + set(ARK_USE_CUDA ON) + set(ARK_USE_ROCM OFF) elseif(NVIDIA_FOUND) message("Detected NVIDIA/CUDA.") - set(USE_CUDA ON) - set(USE_ROCM OFF) + set(ARK_USE_CUDA ON) + set(ARK_USE_ROCM OFF) elseif(AMD_FOUND) message("Detected AMD/ROCm.") - set(USE_CUDA OFF) - set(USE_ROCM ON) + set(ARK_USE_CUDA OFF) + set(ARK_USE_ROCM ON) else() message(FATAL_ERROR "Neither NVIDIA/CUDA nor AMD/ROCm is found.") endif() @@ -53,7 +53,7 @@ endif() # Declare project set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wno-deprecated-declarations") -if(USE_CUDA) +if(ARK_USE_CUDA) set(CMAKE_CUDA_STANDARD 17) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall,-Wextra") project(ark LANGUAGES CXX CUDA) @@ -72,7 +72,7 @@ if(USE_CUDA) if(CUDAToolkit_VERSION_MAJOR GREATER_EQUAL 12) set(CMAKE_CUDA_ARCHITECTURES 
${CMAKE_CUDA_ARCHITECTURES} 90) endif() -else() # USE_ROCM +else() # ARK_USE_ROCM set(CMAKE_HIP_STANDARD 17) set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Wall -Wextra") project(ark LANGUAGES CXX HIP) @@ -145,7 +145,7 @@ add_custom_target(ut) # Details add_subdirectory(ark) -if(BUILD_PYTHON) +if(ARK_BUILD_PYTHON) # Install Python module add_subdirectory(python) add_dependencies(ark_py build) diff --git a/ark/CMakeLists.txt b/ark/CMakeLists.txt index 4457d3c0b..208d9f9cb 100644 --- a/ark/CMakeLists.txt +++ b/ark/CMakeLists.txt @@ -6,7 +6,7 @@ file(GLOB_RECURSE UT_SOURCES CONFIGURE_DEPENDS *_test.cpp) file(GLOB_RECURSE UT_COMMON_SOURCES CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/unittest/*.cpp) list(REMOVE_ITEM SOURCES ${UT_SOURCES} ${UT_COMMON_SOURCES}) -if(USE_ROCM) +if(ARK_USE_ROCM) file(GLOB_RECURSE CU_SOURCES CONFIGURE_DEPENDS *.cu) set_source_files_properties(${CU_SOURCES} PROPERTIES LANGUAGE CXX) endif() @@ -23,7 +23,7 @@ target_include_directories(ark_obj SYSTEM PRIVATE ${NUMA_INCLUDE_DIRS} ) -if(USE_CUDA) +if(ARK_USE_CUDA) list(APPEND COMMON_LIBS CUDA::cuda_driver) target_include_directories(ark_obj SYSTEM PRIVATE ${PROJECT_SOURCE_DIR}/third_party/cutlass/include @@ -32,7 +32,7 @@ if(USE_CUDA) target_compile_definitions(ark_obj PUBLIC ARK_CUDA) endif() -if(USE_ROCM) +if(ARK_USE_ROCM) list(APPEND COMMON_LIBS hip::host) target_include_directories(ark_obj SYSTEM PRIVATE ${PROJECT_SOURCE_DIR}/third_party/cutlass/include @@ -45,7 +45,7 @@ target_sources(ark_obj PRIVATE ${SOURCES}) target_link_libraries(ark_obj PUBLIC mscclpp_static PRIVATE ${COMMON_LIBS}) # ARK unit tests -if(BUILD_TESTS) +if(ARK_BUILD_TESTS) foreach(ut_source IN ITEMS ${UT_SOURCES}) get_filename_component(exe_name ${ut_source} NAME_WE) add_executable(${exe_name} ${ut_source} ${UT_COMMON_SOURCES}) @@ -58,7 +58,7 @@ if(BUILD_TESTS) ${NUMA_INCLUDE_DIRS} ) - if(USE_CUDA) + if(ARK_USE_CUDA) target_link_libraries(${exe_name} PRIVATE ark_obj ${COMMON_LIBS} CUDA::cudart CUDA::cublas) 
target_include_directories(${exe_name} SYSTEM PRIVATE ${CUDAToolkit_INCLUDE_DIRS} diff --git a/pyproject.toml b/pyproject.toml index 1f9386c73..d9fb4502e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ install.strip = true build-dir = "build/{wheel_tag}" [tool.scikit-build.cmake.define] -BUILD_PYTHON = "ON" +ARK_BUILD_PYTHON = "ON" [tool.black] line-length = 80 diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt index 12ae74298..96e442289 100644 --- a/third_party/CMakeLists.txt +++ b/third_party/CMakeLists.txt @@ -14,7 +14,12 @@ FetchContent_Declare( GIT_TAG v0.5.2 SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/mscclpp ) +set(BUILD_TESTS OFF CACHE BOOL "" FORCE) set(BUILD_PYTHON_BINDINGS OFF CACHE BOOL "" FORCE) +set(BUILD_APPS_NCCL OFF CACHE BOOL "" FORCE) +set(USE_CUDA ${ARK_USE_CUDA} CACHE BOOL "" FORCE) +set(USE_ROCM ${ARK_USE_ROCM} CACHE BOOL "" FORCE) +set(BYPASS_GPU_CHECK ON CACHE BOOL "" FORCE) set(INSTALL_PREFIX "ark") FetchContent_GetProperties(mscclpp) if (NOT mscclpp_POPULATED) @@ -35,7 +40,7 @@ if (NOT json_POPULATED) endif() set(JSON_INCLUDE_DIRS ${json_SOURCE_DIR}/include PARENT_SCOPE) -if(USE_CUDA) +if(ARK_USE_CUDA) # Configure CUTLASS FetchContent_Declare( cutlass @@ -58,7 +63,7 @@ if(USE_CUDA) endif() -if(USE_ROCM) +if(ARK_USE_ROCM) # Configure CK FetchContent_Declare( ck diff --git a/third_party/mscclpp b/third_party/mscclpp index cddffbc8b..40cb19655 160000 --- a/third_party/mscclpp +++ b/third_party/mscclpp @@ -1 +1 @@ -Subproject commit cddffbc8b6dfa6facf7c64c1b7d73acf30e600b3 +Subproject commit 40cb1965538ab98fea3cc9fe004f730e23e84829 From 3dda44a8dc310560333de0cf9090d7da0013e21f Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 6 Aug 2024 18:15:09 +0000 Subject: [PATCH 51/61] A few fixes & improved coverage --- ark/api/executor.cpp | 21 +++-- ark/api/executor_test.cpp | 150 +++++++++++++++++++++++++++++++++++ ark/include/ark/executor.hpp | 2 +- python/executor_py.cpp | 2 +- 4 files changed, 161 
insertions(+), 14 deletions(-) create mode 100644 ark/api/executor_test.cpp diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp index 42ed45128..16d369bc8 100644 --- a/ark/api/executor.cpp +++ b/ark/api/executor.cpp @@ -141,7 +141,7 @@ static size_t tensor_stride_bytes(const Json &tensor) { class Executor::Impl { public: Impl(int device_id, Stream stream, const std::string &name, bool loop_mode); - ~Impl() = default; + ~Impl(); void init(const PlanJson& plan); @@ -152,7 +152,7 @@ class Executor::Impl { std::string plan() const { return plan_json_.dump_pretty(); } void compile(); - void launch(int64_t max_spin_count); + void launch(); void run(int iter); void wait(int64_t max_spin_count); float stop(int64_t max_spin_count); @@ -219,6 +219,10 @@ Executor::Impl::Impl(int device_id, Stream stream, const std::string &name, } } +Executor::Impl::~Impl() { + if (is_launched_) stop(-1); +} + void Executor::Impl::init(const PlanJson &plan_json) { plan_json_ = plan_json; rank_ = plan_json_["Rank"].get(); @@ -620,13 +624,12 @@ void Executor::Impl::init_channels(const std::set &remote_ranks) { void Executor::Impl::compile() { kernel_->compile(); } -void Executor::Impl::launch(int64_t max_spin_count) { +void Executor::Impl::launch() { if (!kernel_->is_compiled()) { ERR(InvalidUsageError, "Need to compile first before initialization."); } if (is_launched_) { - // Wait until previous works finish. 
- this->wait(max_spin_count); + LOG(WARN, "Ignore launching twice."); return; } auto get_global_rt = [&](const std::string &symbol) { @@ -674,12 +677,6 @@ void Executor::Impl::launch(int64_t max_spin_count) { } elapsed_msec_ = -1; - if (!kernel_->is_compiled()) { - ERR(InvalidUsageError, "Need to compile first before initialization."); - } else if (is_launched_) { - LOG(WARN, "Ignore launching twice."); - return; - } timer_begin_->record(stream_raw_); if (world_size_ > 1) { @@ -911,7 +908,7 @@ std::string Executor::plan() const { return impl_->plan(); } void Executor::compile() { impl_->compile(); } -void Executor::launch(int64_t max_spin_count) { impl_->launch(max_spin_count); } +void Executor::launch() { impl_->launch(); } void Executor::run(int iter) { impl_->run(iter); } diff --git a/ark/api/executor_test.cpp b/ark/api/executor_test.cpp new file mode 100644 index 000000000..b0b398ac9 --- /dev/null +++ b/ark/api/executor_test.cpp @@ -0,0 +1,150 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+ +#include "ark/executor.hpp" + +#include "gpu/gpu.hpp" +#include "model/model_json.hpp" +#include "unittest/unittest_utils.h" + +template +ark::unittest::State test_executor() { + ark::gpuStream stream; + UNITTEST_EQ( + ark::gpuStreamCreateWithFlags(&stream, ark::gpuStreamNonBlocking), + ark::gpuSuccess); + + ark::Model empty; + { + ark::DefaultExecutor executor(empty, 0, stream, {}, "test", LoopMode); + UNITTEST_EQ(executor.device_id(), 0); + UNITTEST_EQ(executor.stream(), stream); + + executor.compile(); + executor.launch(); + executor.run(1); + executor.wait(); + executor.stop(); + executor.destroy(); + } + { + ark::DefaultExecutor executor(empty, 0, stream, {}, "test", LoopMode); + executor.compile(); + executor.launch(); + executor.run(1); + executor.wait(); + executor.stop(); + + executor.launch(); + executor.run(1); + executor.wait(); + executor.stop(); + + executor.destroy(); + } + { + ark::DefaultExecutor executor(empty, 0, stream, {}, "test", LoopMode); + UNITTEST_THROW(executor.launch(), ark::InvalidUsageError); + + executor.compile(); + executor.launch(); + executor.launch(); // Will be ignored with a warning. + executor.run(1); + executor.wait(); + executor.wait(); // nothing to do + + // Stop & destroy automatically. 
+ } + + UNITTEST_EQ(ark::gpuStreamDestroy(stream), ark::gpuSuccess); + return ark::unittest::SUCCESS; +} + +ark::unittest::State test_executor_loop() { return test_executor(); } + +ark::unittest::State test_executor_no_loop() { return test_executor(); } + +ark::unittest::State test_executor_tensor_read_write() { + // Alloc CPU array + std::vector host_data(1024); + void *host_ptr = host_data.data(); + for (size_t i = 0; i < host_data.size(); ++i) { + host_data[i] = static_cast(i); + } + + // Alloc GPU array + void *dev_ptr; + UNITTEST_EQ(ark::gpuMalloc(&dev_ptr, 1024 * sizeof(float)), + ark::gpuSuccess); + + // Create an ARK tensor + ark::Model m; + auto tensor = m.tensor({1024}, ark::FP32); + m.noop(tensor); + + ark::DefaultExecutor executor(m, 0); + executor.compile(); + executor.launch(); + + // Copy data from CPU array to ARK tensor + executor.tensor_write(tensor, host_ptr, 1024 * sizeof(float)); + + // Copy data from ARK tensor to GPU array + executor.tensor_read(tensor, dev_ptr, 1024 * sizeof(float), nullptr, true); + + // Check the data + std::vector dev_data(1024); + executor.tensor_read(tensor, dev_data.data(), 1024 * sizeof(float)); + for (size_t i = 0; i < dev_data.size(); ++i) { + UNITTEST_EQ(dev_data[i], static_cast(i)); + dev_data[i] = -1; + } + + UNITTEST_EQ(ark::gpuMemcpy(dev_data.data(), dev_ptr, 1024 * sizeof(float), + ark::gpuMemcpyDeviceToHost), + ark::gpuSuccess); + for (size_t i = 0; i < dev_data.size(); ++i) { + UNITTEST_EQ(dev_data[i], static_cast(i)); + dev_data[i] = -1; + } + + // Copy -1s back to GPU array + UNITTEST_EQ(ark::gpuMemcpy(dev_ptr, dev_data.data(), 1024 * sizeof(float), + ark::gpuMemcpyHostToDevice), + ark::gpuSuccess); + + // Copy data from GPU array to ARK tensor + executor.tensor_write(tensor, dev_ptr, 1024 * sizeof(float), nullptr, true); + + // Copy data from ARK tensor to CPU array + executor.tensor_read(tensor, host_ptr, 1024 * sizeof(float)); + + // Check the data + for (size_t i = 0; i < host_data.size(); ++i) { + 
UNITTEST_EQ(host_data[i], -1); + } + + return ark::unittest::SUCCESS; +} + +ark::unittest::State test_executor_invalid() { + // Invalid device ID. + UNITTEST_THROW(ark::Executor(-1, nullptr, "test", ""), + ark::InvalidUsageError); + + // Invalid rank. + ark::PlanJson plan; + plan["Rank"] = 1; + UNITTEST_THROW(ark::Executor(0, nullptr, "test", plan.dump(), true), + ark::InvalidUsageError); + + return ark::unittest::SUCCESS; +} + +int main() { + UNITTEST(test_executor_loop); + UNITTEST(test_executor_no_loop); + UNITTEST(test_executor_tensor_read_write); + UNITTEST(test_executor_invalid); + return 0; +} diff --git a/ark/include/ark/executor.hpp b/ark/include/ark/executor.hpp index 3744c33db..7f30f39ed 100644 --- a/ark/include/ark/executor.hpp +++ b/ark/include/ark/executor.hpp @@ -39,7 +39,7 @@ class Executor { /// Launch the model (not running yet). This must be called after /// `compile()`. - void launch(int64_t max_spin_count = -1); + void launch(); /// Run the model for `iter` iterations. 
void run(int iter); diff --git a/python/executor_py.cpp b/python/executor_py.cpp index a3f2a078b..36e1c435e 100644 --- a/python/executor_py.cpp +++ b/python/executor_py.cpp @@ -56,7 +56,7 @@ void register_executor(py::module &m) { }) .def("plan", &ark::Executor::plan) .def("compile", &ark::Executor::compile) - .def("launch", &ark::Executor::launch, py::arg("max_spin_count") = -1) + .def("launch", &ark::Executor::launch) .def("run", &ark::Executor::run, py::arg("iter")) .def("wait", &ark::Executor::wait, py::arg("max_spin_count") = -1) .def("stop", &ark::Executor::stop, py::arg("max_spin_count") = -1) From 4971601b09880e29adc85ab305a739edf55ccbb0 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 6 Aug 2024 19:03:08 +0000 Subject: [PATCH 52/61] fix merge --- ark/api/context_manager.cpp | 42 ---------- ark/api/context_manager_test.cpp | 53 ------------ ark/api/executor.cpp | 8 -- ark/api/model.cpp | 2 +- ark/api/model_graph.cpp | 4 +- ark/api/plan_manager.cpp | 97 ---------------------- ark/api/plan_manager_test.cpp | 58 ------------- ark/codegen.cpp | 1 - ark/include/ark/context_manager.hpp | 24 ------ ark/include/ark/error.hpp | 15 +++- ark/include/ark/model.hpp | 57 +++++-------- ark/include/ark/model_graph.hpp | 2 +- ark/include/ark/plan_manager.hpp | 25 ------ ark/model/model_graph_impl.cpp | 6 +- ark/model/model_graph_impl.hpp | 8 +- ark/model/model_op.cpp | 11 --- ark/model/model_op.hpp | 9 +- ark/ops/ops_arithmetic.cpp | 20 ++--- ark/ops/ops_arithmetic_test.cpp | 48 ++++------- ark/ops/ops_cast.cpp | 10 +-- ark/ops/ops_communication.cpp | 14 ++-- ark/ops/ops_copy.cpp | 5 +- ark/ops/ops_embedding.cpp | 4 +- ark/ops/ops_identity.cpp | 2 +- ark/ops/ops_math.cpp | 31 +++---- ark/ops/ops_matmul.cpp | 6 +- ark/ops/ops_noop.cpp | 2 +- ark/ops/ops_reduce.cpp | 12 +-- ark/ops/ops_refer.cpp | 2 +- ark/ops/ops_reshape.cpp | 4 +- ark/ops/ops_rope.cpp | 5 +- ark/ops/ops_scalar.cpp | 31 +++---- ark/ops/ops_transpose.cpp | 5 +- examples/llama/model_7b_b1_s2048.py | 70 
++++++++-------- examples/tutorial/plan_manager_tutorial.py | 81 ------------------ python/ark/plan_manager.py | 34 -------- python/ark/runtime.py | 1 + python/model_py.cpp | 79 ++++++++---------- python/plan_manager_py.cpp | 15 ---- 39 files changed, 195 insertions(+), 708 deletions(-) delete mode 100644 ark/api/context_manager.cpp delete mode 100644 ark/api/context_manager_test.cpp delete mode 100644 ark/api/plan_manager.cpp delete mode 100644 ark/api/plan_manager_test.cpp delete mode 100644 ark/include/ark/context_manager.hpp delete mode 100644 ark/include/ark/plan_manager.hpp delete mode 100644 examples/tutorial/plan_manager_tutorial.py delete mode 100644 python/ark/plan_manager.py delete mode 100644 python/plan_manager_py.cpp diff --git a/ark/api/context_manager.cpp b/ark/api/context_manager.cpp deleted file mode 100644 index 6d16d9e79..000000000 --- a/ark/api/context_manager.cpp +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. 
- -#include "ark/context_manager.hpp" - -#include "model/model_graph_impl.hpp" - -namespace ark { - -class ContextManager::Impl { - public: - Impl(std::shared_ptr context_stack, - const std::map& context_map); - - ~Impl(); - - private: - std::shared_ptr context_stack_; - std::vector keys_; -}; - -ContextManager::Impl::Impl( - std::shared_ptr context_stack, - const std::map& context_map) - : context_stack_(context_stack) { - for (const auto& [key, value] : context_map) { - context_stack_->push(key, value); - keys_.push_back(key); - } -} - -ContextManager::Impl::~Impl() { - for (auto it = keys_.rbegin(); it != keys_.rend(); ++it) { - context_stack_->pop(*it); - } -} - -ContextManager::ContextManager( - Model& model, const std::map& context_map) - : impl_(std::make_shared(model.impl_->context_stack_, context_map)) {} - -} // namespace ark diff --git a/ark/api/context_manager_test.cpp b/ark/api/context_manager_test.cpp deleted file mode 100644 index 5fff94f34..000000000 --- a/ark/api/context_manager_test.cpp +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. 
- -#include "ark/context_manager.hpp" - -#include "model/model_node.hpp" -#include "unittest/unittest_utils.h" - -ark::unittest::State test_context_manager() { - ark::Model model; - ark::Tensor t0 = model.tensor({1}, ark::FP32); - ark::Tensor t1 = model.tensor({1}, ark::FP32); - ark::Tensor t2 = model.add(t0, t1); - - ark::Tensor t3; - ark::Tensor t4; - ark::Tensor t5; - { - ark::ContextManager cm0_1(model, {{"key0", "val1"}}); - t3 = model.relu(t2); - - ark::ContextManager cm1_1(model, {{"key1", "val2"}}); - t4 = model.sqrt(t3); - } - { - ark::ContextManager cm0_2(model, {{"key0", "val3"}}); - t5 = model.exp(t2); - } - - UNITTEST_TRUE(model.verify()); - - auto compressed = model.compress(false); - UNITTEST_TRUE(compressed.verify()); - - auto nodes = compressed.nodes(); - UNITTEST_EQ(nodes.size(), 4); - - UNITTEST_EQ(nodes[0]->context.size(), 0); - UNITTEST_EQ(nodes[1]->context.size(), 1); - UNITTEST_EQ(nodes[1]->context.at("key0"), "val1"); - UNITTEST_EQ(nodes[2]->context.size(), 2); - UNITTEST_EQ(nodes[2]->context.at("key0"), "val1"); - UNITTEST_EQ(nodes[2]->context.at("key1"), "val2"); - UNITTEST_EQ(nodes[3]->context.size(), 1); - UNITTEST_EQ(nodes[3]->context.at("key0"), "val3"); - - return ark::unittest::SUCCESS; -} - -int main() { - UNITTEST(test_context_manager); - return 0; -} diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp index 6fb2b5f2e..17d579763 100644 --- a/ark/api/executor.cpp +++ b/ark/api/executor.cpp @@ -250,14 +250,6 @@ void Executor::Impl::init(const PlanJson &plan_json) { gpu_manager->info().arch->name(), "`."); } - if (!gpu_manager->info().arch->belongs_to( - Arch::from_name(plan_json_.at("Architecture")))) { - LOG(WARN, "Architecture name of the plan `", - plan_json_.at("Architecture").get(), - "` is not compatible with the GPU architecture `", - gpu_manager->info().arch->name(), "`."); - } - buffer_id_to_offset_ = init_buffers(plan_json_); std::string buffer_id_to_offset_str; diff --git a/ark/api/model.cpp b/ark/api/model.cpp index 
8227ea848..dcbd4940e 100644 --- a/ark/api/model.cpp +++ b/ark/api/model.cpp @@ -20,7 +20,7 @@ size_t Model::id() const { return id_; } Model Model::compress() const { Model model(*this); - model.compress_nodes(merge_nodes); + model.compress_nodes(); return model; } diff --git a/ark/api/model_graph.cpp b/ark/api/model_graph.cpp index a4477b8e6..e07565141 100644 --- a/ark/api/model_graph.cpp +++ b/ark/api/model_graph.cpp @@ -33,9 +33,7 @@ int ModelGraph::rank() const { return impl_->rank(); } int ModelGraph::world_size() const { return impl_->world_size(); } -void ModelGraph::compress_nodes(bool merge_nodes) { - impl_->compress_nodes(merge_nodes); -} +void ModelGraph::compress_nodes() { impl_->compress_nodes(); } bool ModelGraph::compressed() const { return impl_->compressed(); } diff --git a/ark/api/plan_manager.cpp b/ark/api/plan_manager.cpp deleted file mode 100644 index 8cb1940b1..000000000 --- a/ark/api/plan_manager.cpp +++ /dev/null @@ -1,97 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. 
- -#include "ark/plan_manager.hpp" - -#include "logging.h" -#include "model/model_json.hpp" -#include "model/model_graph_impl.hpp" - -namespace ark { - -class PlanManagerState { - public: - PlanManagerState() : sync(true) {} - bool sync; -}; - -static std::map gPlanManagerStates; - -PlanManager::PlanManager(Model& model, const std::string& plan_context) - : model_id_(model.id()), stop_sync_(false) { - static int task_group_id = 0; - auto ctx = Json::parse(plan_context); - if (!ctx.is_object()) { - ERR(ModelError, "plan context must be a JSON object"); - } - if (gPlanManagerStates.find(model_id_) == gPlanManagerStates.end()) { - gPlanManagerStates.emplace(model_id_, PlanManagerState()); - } - auto& state = gPlanManagerStates[model_id_]; - bool async = !state.sync; - std::map context_map; - for (const auto& [key, value] : ctx.items()) { - if (key == "sync") { - if (!value.is_boolean()) { - ERR(ModelError, "sync must be a boolean"); - } - if (state.sync && !value.get()) { - stop_sync_ = true; - state.sync = false; - context_map["TaskGroupId"] = std::to_string(task_group_id++); - } - } else if (key == "processor_range") { - if (!value.is_array()) { - ERR(ModelError, "processor_range must be an array"); - } - if (async) { - LOG(WARN, "Ignoring processor_range under sync=false context"); - continue; - } - context_map["ProcessorRange"] = value.dump(); - } else if (key == "warp_range") { - if (!value.is_array()) { - ERR(ModelError, "warp_range must be an array"); - } - if (async) { - LOG(WARN, "Ignoring warp_range under sync=false context"); - continue; - } - context_map["WarpRange"] = value.dump(); - } else if (key == "sram_range") { - if (!value.is_array()) { - ERR(ModelError, "sram_range must be an array"); - } - if (async) { - LOG(WARN, "Ignoring sram_range under sync=false context"); - continue; - } - context_map["SramRange"] = value.dump(); - } else if (key == "config") { - if (!value.is_object()) { - ERR(ModelError, "config must be an object"); - } - auto cfg = 
model.impl_->get_context("Config"); - if (cfg.empty()) { - context_map["Config"] = value.dump(); - } else { - auto cfg_obj = Json::parse(cfg); - for (const auto& [k, v] : value.items()) { - cfg_obj[k] = v; - } - context_map["Config"] = cfg_obj.dump(); - } - } else { - LOG(WARN, "Ignoring unknown plan context key: ", key); - } - } - context_manager_ = std::make_shared(model, context_map); -} - -PlanManager::~PlanManager() { - if (stop_sync_) { - gPlanManagerStates[model_id_].sync = true; - } -} - -} // namespace ark diff --git a/ark/api/plan_manager_test.cpp b/ark/api/plan_manager_test.cpp deleted file mode 100644 index 78f5d4cb8..000000000 --- a/ark/api/plan_manager_test.cpp +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. - -#include "ark/plan_manager.hpp" -#include "ark/planner.hpp" - -#include "model/model_json.hpp" -#include "unittest/unittest_utils.h" - -ark::unittest::State test_plan_manager() { - ark::Model model; - ark::Tensor t0 = model.tensor({1}, ark::FP32); - ark::Tensor t1 = model.tensor({1}, ark::FP32); - ark::Tensor t2 = model.add(t0, t1); - - ark::Tensor t3; - ark::Tensor t4; - ark::Tensor t5; - ark::Tensor t6; - { - ark::PlanManager pm_0(model, ark::Json({ - {"processor_range", {0, 2}}, - {"warp_range", {0, 4}}, - {"sram_range", {0, 0}}, - {"sync", false} - }).dump()); - t3 = model.relu(t2); - t4 = model.sqrt(t3); - } - { - ark::PlanManager pm_0(model, ark::Json({ - {"processor_range", {2, 4}}, - {"warp_range", {0, 4}}, - {"sram_range", {0, 0}} - }).dump()); - t5 = model.exp(t2); - - ark::PlanManager pm_1(model, ark::Json({ - {"processor_range", {2, 3}} - }).dump()); - t6 = model.rsqrt(t5); - } - - UNITTEST_TRUE(model.verify()); - - ark::DefaultPlanner planner(model, 0); - auto plan_str = planner.plan(); - ark::Json plan = ark::Json::parse(plan_str); - - UNITTEST_LOG(plan_str); - - return ark::unittest::SUCCESS; -} - -int main() { - UNITTEST(test_plan_manager); - return 0; -} diff --git 
a/ark/codegen.cpp b/ark/codegen.cpp index bc43584cb..1619b863f 100644 --- a/ark/codegen.cpp +++ b/ark/codegen.cpp @@ -87,7 +87,6 @@ CodeGenerator::Impl::Impl(const PlanJson &plan, num_warps_per_proc_ = plan.at("NumWarpsPerProcessor"); std::stringstream definitions_ss; - for (auto &task_json : plan.at("TaskInfos")) { definitions_ss << this->def_task(task_json); } diff --git a/ark/include/ark/context_manager.hpp b/ark/include/ark/context_manager.hpp deleted file mode 100644 index 58271ea8c..000000000 --- a/ark/include/ark/context_manager.hpp +++ /dev/null @@ -1,24 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. - -#ifndef ARK_CONTEXT_MANAGER_HPP -#define ARK_CONTEXT_MANAGER_HPP - -#include -#include - -namespace ark { - -class ContextManager { - public: - ContextManager(Model& model, - const std::map& context_map); - - private: - class Impl; - std::shared_ptr impl_; -}; - -} // namespace ark - -#endif // ARK_CONTEXT_MANAGER_HPP diff --git a/ark/include/ark/error.hpp b/ark/include/ark/error.hpp index 78d02cab3..965b1c0bc 100644 --- a/ark/include/ark/error.hpp +++ b/ark/include/ark/error.hpp @@ -9,6 +9,7 @@ namespace ark { +/// Base class for all ARK errors. class BaseError : public std::exception { private: std::string msg_; @@ -24,15 +25,21 @@ class BaseError : public std::exception { _name(const std::string &msg) : BaseError(msg) {} \ }; +/// Internal error in ARK, likely a bug. REGISTER_ERROR_TYPE(InternalError) +/// Invalid usage of ARK API. REGISTER_ERROR_TYPE(InvalidUsageError) -REGISTER_ERROR_TYPE(NotFoundError) +/// Invalid ARK model definition or usage. REGISTER_ERROR_TYPE(ModelError) -REGISTER_ERROR_TYPE(SchedulerError) -REGISTER_ERROR_TYPE(ExecutorError) +/// Invalid ARK plan definition or usage. +REGISTER_ERROR_TYPE(PlanError) +/// Unsupported feature triggered. +REGISTER_ERROR_TYPE(UnsupportedError) +/// Error from invalid system state such as a system call failure. 
REGISTER_ERROR_TYPE(SystemError) +/// Error from a CUDA/HIP API call. REGISTER_ERROR_TYPE(GpuError) -REGISTER_ERROR_TYPE(RuntimeError) +/// Error from a unit test. REGISTER_ERROR_TYPE(UnitTestError) } // namespace ark diff --git a/ark/include/ark/model.hpp b/ark/include/ark/model.hpp index cbbff7f95..3c4f22e22 100644 --- a/ark/include/ark/model.hpp +++ b/ark/include/ark/model.hpp @@ -103,29 +103,23 @@ class Model : public ModelGraph { // result in `output`. // Currently, only reduction along the last dimension is supported. Tensor reduce_sum(Tensor input, int axis, bool keepdims = true, - Tensor output = NullTensor, - const std::string &config = "", - const std::string &name = ""); + Tensor output = NullTensor, const std::string &name = ""); Tensor reduce_mean(Tensor input, int axis, bool keepdims = true, Tensor output = NullTensor, - const std::string &config = "", const std::string &name = ""); Tensor reduce_max(Tensor input, int axis, bool keepdims = true, - Tensor output = NullTensor, - const std::string &config = "", - const std::string &name = ""); + Tensor output = NullTensor, const std::string &name = ""); // Transposes the `input` tensor according to the given `permutation`. // For example, transpose(input, {0, 1 ,3, 2}) will swap the last two // dimensions of the input tensor. Currently, only 4D tensors are supported. Tensor transpose(Tensor input, const std::vector &permutation, - Tensor output = NullTensor, const std::string &config = "", - const std::string &name = ""); + Tensor output = NullTensor, const std::string &name = ""); // Performs matrix multiplication between the `input` tensor and another // `other` tensor, storing the result in `output`. 
Tensor matmul(Tensor input, Tensor other, Tensor output = NullTensor, bool trans_input = false, bool trans_other = false, - const std::string &config = "", const std::string &name = ""); + const std::string &name = ""); // Implements the 'im2col' method for 2D convolution layers, which takes an // `input` tensor and reshapes it to a 2D matrix by extracting image patches // from the input tensor based on the provided parameters. @@ -142,66 +136,63 @@ class Model : public ModelGraph { Tensor output = NullTensor, const std::string &name = ""); // Calculates the exponential of the `input` tensor, element-wise. Tensor exp(Tensor input, Tensor output = NullTensor, - const std::string &config = "", const std::string &name = ""); + const std::string &name = ""); // Calculates the square root of the `input` tensor, element-wise. Tensor sqrt(Tensor input, Tensor output = NullTensor, - const std::string &config = "", const std::string &name = ""); + const std::string &name = ""); // Calculates the reverse square root of the `input` tensor, element-wise. Tensor rsqrt(Tensor input, Tensor output = NullTensor, - const std::string &config = "", const std::string &name = ""); + const std::string &name = ""); // ReLU activation Tensor relu(Tensor input, Tensor output = NullTensor, - const std::string &config = "", const std::string &name = ""); + const std::string &name = ""); // Copy the `input` tensor to `output` tensor Tensor copy(Tensor input, Tensor output = NullTensor, - const std::string &config = "", const std::string &name = ""); + const std::string &name = ""); Tensor copy(float val, Tensor output = NullTensor, - const std::string &config = "", const std::string &name = ""); + const std::string &name = ""); // Applies the Gaussian Error Linear Unit (GELU) activation function to the // `input` tensor, element-wise. GELU is a smooth approximation of the // rectifier function and is widely used in deep learning models. 
Tensor gelu(Tensor input, Tensor output = NullTensor, - const std::string &config = "", const std::string &name = ""); + const std::string &name = ""); // Sigmoid activation Tensor sigmoid(Tensor input, Tensor output = NullTensor, - const std::string &config = "", const std::string &name = ""); // Performs rotary position embedding (RoPE) on the `input` tensor Tensor rope(Tensor input, Tensor other, Tensor output = NullTensor, - const std::string &config = "", const std::string &name = ""); + const std::string &name = ""); // Performs an element-wise addition operator between the `input` tensor // and the `other` tensor Tensor add(Tensor input, Tensor other, Tensor output = NullTensor, - const std::string &config = "", const std::string &name = ""); + const std::string &name = ""); Tensor add(Tensor input, float value, Tensor output = NullTensor, - const std::string &config = "", const std::string &name = ""); + const std::string &name = ""); // Performs an element-wise subtraction operator between the `input` tensor // and the `other` tensor Tensor sub(Tensor input, Tensor other, Tensor output = NullTensor, - const std::string &config = "", const std::string &name = ""); + const std::string &name = ""); Tensor sub(Tensor input, float value, Tensor output = NullTensor, - const std::string &config = "", const std::string &name = ""); + const std::string &name = ""); // Performs an element-wise multiplication operator between the `input` // tensor and the `other` tensor, Tensor mul(Tensor input, Tensor other, Tensor output = NullTensor, - const std::string &config = "", const std::string &name = ""); + const std::string &name = ""); Tensor mul(Tensor input, float value, Tensor output = NullTensor, - const std::string &config = "", const std::string &name = ""); + const std::string &name = ""); // Performs an element-wise division operator between the `input` // tensor and the `other` tensor, Tensor div(Tensor input, Tensor other, Tensor output = NullTensor, - const 
std::string &config = "", const std::string &name = ""); + const std::string &name = ""); Tensor div(Tensor input, float value, Tensor output = NullTensor, - const std::string &config = "", const std::string &name = ""); + const std::string &name = ""); Tensor send(Tensor input, int remote_rank, int tag, - Tensor output = NullTensor, const std::string &config = "", - const std::string &name = ""); + Tensor output = NullTensor, const std::string &name = ""); // Blocks the execution until the corresponding 'send' operator with the // specified `id` is completed. - Tensor send_done(Tensor input, const std::string &config = "", - const std::string &name = ""); + Tensor send_done(Tensor input, const std::string &name = ""); // Receives a tensor from a source rank (@p src_rank), identified by the // `id` parameter. Blocks the execution until the corresponding 'recv' // operator is completed. @@ -238,12 +229,10 @@ class Model : public ModelGraph { const std::string &name = ""); /// Embedding layer. Tensor embedding(Tensor input, Tensor weight, Tensor output = NullTensor, - const std::string &config = "", const std::string &name = ""); /// Tensor type casting. 
Tensor cast(Tensor input, const DataType &data_type, - Tensor output = NullTensor, const std::string &config = "", - const std::string &name = ""); + Tensor output = NullTensor, const std::string &name = ""); // sync across multi devices Tensor device_sync(Tensor input, int rank, int rank_num, diff --git a/ark/include/ark/model_graph.hpp b/ark/include/ark/model_graph.hpp index 598bf343a..29074630c 100644 --- a/ark/include/ark/model_graph.hpp +++ b/ark/include/ark/model_graph.hpp @@ -25,7 +25,7 @@ class ModelGraph { int world_size() const; - void compress_nodes(bool merge_nodes = false); + void compress_nodes(); bool compressed() const; diff --git a/ark/include/ark/plan_manager.hpp b/ark/include/ark/plan_manager.hpp deleted file mode 100644 index 3952a1c06..000000000 --- a/ark/include/ark/plan_manager.hpp +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. - -#ifndef ARK_PLAN_MANAGER_HPP -#define ARK_PLAN_MANAGER_HPP - -#include - -namespace ark { - -class PlanManager { - public: - PlanManager(Model& model, const std::string& plan_context); - - ~PlanManager(); - - private: - size_t model_id_; - bool stop_sync_; - std::shared_ptr context_manager_; -}; - -} // namespace ark - -#endif // ARK_PLAN_MANAGER_HPP diff --git a/ark/model/model_graph_impl.cpp b/ark/model/model_graph_impl.cpp index 81359439a..7c1ea3fb5 100644 --- a/ark/model/model_graph_impl.cpp +++ b/ark/model/model_graph_impl.cpp @@ -112,7 +112,7 @@ ModelGraph::Impl &ModelGraph::Impl::operator=(const ModelGraph::Impl &other) { return *this; } -void ModelGraph::Impl::compress_nodes(bool merge_nodes) { +void ModelGraph::Impl::compress_nodes() { if (!compressed_) { this->recursive_remove_virtual_nodes(); compressed_ = true; @@ -178,10 +178,6 @@ bool ModelGraph::Impl::verify() const { return true; } -std::string ModelGraph::Impl::get_context(const std::string &key) const { - return context_stack_->get_context(key); -} - ModelNodeRef 
ModelGraph::Impl::add_op(ModelOpRef op) { for (auto &tns : op->input_tensors()) { if (tensor_to_producer_op_.find(tns) == tensor_to_producer_op_.end()) { diff --git a/ark/model/model_graph_impl.hpp b/ark/model/model_graph_impl.hpp index c7080ab73..62944f999 100644 --- a/ark/model/model_graph_impl.hpp +++ b/ark/model/model_graph_impl.hpp @@ -54,8 +54,7 @@ class ModelGraph::Impl { Impl &operator=(const Impl &other); template - ModelOpRef create_op(const std::string &config, const std::string &name, - Args &&...args) { + ModelOpRef create_op(const std::string &name, Args &&... args) { ModelOpRef op = std::make_shared(std::forward(args)...); std::string name_copy; if (name.empty()) { @@ -68,7 +67,6 @@ class ModelGraph::Impl { if (count > 0) { name_copy += "_" + std::to_string(count); } - op->set_config(config); op->set_name(name_copy); add_op(op); return op; @@ -78,14 +76,12 @@ class ModelGraph::Impl { int world_size() const { return world_size_; } - void compress_nodes(bool merge_nodes = false); + void compress_nodes(); bool compressed() const { return compressed_; } bool verify() const; - std::string get_context(const std::string &key) const; - std::string serialize(bool pretty = true) const; std::vector nodes() const; diff --git a/ark/model/model_op.cpp b/ark/model/model_op.cpp index dc4906235..5db8576e8 100644 --- a/ark/model/model_op.cpp +++ b/ark/model/model_op.cpp @@ -92,14 +92,6 @@ const ModelOpType ModelOpT::from_name(const std::string &type_name) { return it->second; } -void ModelOp::set_config(const std::string &config) { - if (!config.empty()) { - config_ = Json::parse(config); - } else { - config_.clear(); - } -} - std::vector ModelOp::input_tensors() const { // input_tensors = read_tensors || write_tensors std::set input_tensors; @@ -192,9 +184,6 @@ Json ModelOp::serialize() const { for (auto &arg : args_) { j["Args"][arg.first] = arg.second.serialize(); } - if (!config_.empty()) { - j["Config"] = config_; - } return j; } diff --git 
a/ark/model/model_op.hpp b/ark/model/model_op.hpp index d048375c2..f7323d6c0 100644 --- a/ark/model/model_op.hpp +++ b/ark/model/model_op.hpp @@ -50,8 +50,8 @@ class ModelOp { return ""; } - virtual std::vector impl_args( - [[maybe_unused]] const Json &config) const { + virtual std::vector impl_args([ + [maybe_unused]] const Json &config) const { return {}; } @@ -60,14 +60,10 @@ class ModelOp { return {{"NumTasks", 0}, {"NumWarps", 0}, {"SramBytes", 0}}; } - void set_config(const std::string &config); - void set_name(const std::string &name) { name_ = name; } ModelOpType type() const { return type_; } - const Json &config() const { return config_; } - const std::string &name() const { return name_; } bool is_virtual() const { return is_virtual_; } @@ -104,7 +100,6 @@ class ModelOp { const std::vector &template_args = {}); ModelOpType type_; - Json config_; std::string name_; bool is_virtual_; std::vector read_tensors_; diff --git a/ark/ops/ops_arithmetic.cpp b/ark/ops/ops_arithmetic.cpp index ef85b5d22..aeece0d77 100644 --- a/ark/ops/ops_arithmetic.cpp +++ b/ark/ops/ops_arithmetic.cpp @@ -12,10 +12,9 @@ ModelOpAdd::ModelOpAdd(ModelTensorRef input, ModelTensorRef other, : ModelOpBroadcast2("Add", input, other, output) {} Tensor Model::add(Tensor input, Tensor other, Tensor output, - const std::string &config, const std::string &name) { + const std::string &name) { return impl_ - ->create_op(config, name, input.ref_, other.ref_, - output.ref_) + ->create_op(name, input.ref_, other.ref_, output.ref_) ->result_tensors()[0]; } @@ -24,10 +23,9 @@ ModelOpMul::ModelOpMul(ModelTensorRef input, ModelTensorRef other, : ModelOpBroadcast2("Mul", input, other, output) {} Tensor Model::mul(Tensor input, Tensor other, Tensor output, - const std::string &config, const std::string &name) { + const std::string &name) { return impl_ - ->create_op(config, name, input.ref_, other.ref_, - output.ref_) + ->create_op(name, input.ref_, other.ref_, output.ref_) ->result_tensors()[0]; } @@ 
-36,10 +34,9 @@ ModelOpSub::ModelOpSub(ModelTensorRef input, ModelTensorRef other, : ModelOpBroadcast2("Sub", input, other, output) {} Tensor Model::sub(Tensor input, Tensor other, Tensor output, - const std::string &config, const std::string &name) { + const std::string &name) { return impl_ - ->create_op(config, name, input.ref_, other.ref_, - output.ref_) + ->create_op(name, input.ref_, other.ref_, output.ref_) ->result_tensors()[0]; } @@ -48,10 +45,9 @@ ModelOpDiv::ModelOpDiv(ModelTensorRef input, ModelTensorRef other, : ModelOpBroadcast2("Div", input, other, output) {} Tensor Model::div(Tensor input, Tensor other, Tensor output, - const std::string &config, const std::string &name) { + const std::string &name) { return impl_ - ->create_op(config, name, input.ref_, other.ref_, - output.ref_) + ->create_op(name, input.ref_, other.ref_, output.ref_) ->result_tensors()[0]; } diff --git a/ark/ops/ops_arithmetic_test.cpp b/ark/ops/ops_arithmetic_test.cpp index fd6a05b1a..772da3276 100644 --- a/ark/ops/ops_arithmetic_test.cpp +++ b/ark/ops/ops_arithmetic_test.cpp @@ -2,7 +2,6 @@ // Licensed under the MIT license. 
#include "ops_test_common.hpp" -#include "model/model_json.hpp" template void baseline_add(std::vector &outputs, @@ -143,25 +142,12 @@ ark::unittest::State test_add_fp32() { ark::unittest::State test_add_fp16() { ark::Model m; - ark::Tensor t0 = m.tensor({32, 2048, 2048}, ark::FP16); - ark::Tensor t1 = m.tensor({32, 2048, 2048}, ark::FP16); + ark::Tensor t0 = m.tensor({8192}, ark::FP16); + ark::Tensor t1 = m.tensor({8192}, ark::FP16); ark::Tensor out = m.add(t0, t1); auto result = - ark::op_test("add_fp16", m, {t0, t1}, {out}, baseline_add, {}, - { - ark::DefaultPlanner::ConfigRule([](const std::string op_str, const std::string) { - auto op = ark::Json::parse(op_str); - ark::Json config; - if (op.at("Type") == "Add") { - config["NumWarps"] = 4; - config["SramBytes"] = 0; - config["Tile"] = {128, 256}; - config["NumTasks"] = 4096; - } - return config.dump(); - }) - }); + ark::op_test("add_fp16", m, {t0, t1}, {out}, baseline_add); UNITTEST_LOG(result); UNITTEST_EQ(result.max_diff[0], 0.0f); return ark::unittest::SUCCESS; @@ -430,20 +416,20 @@ ark::unittest::State test_div_invalid() { int main() { ark::init(); - // UNITTEST(test_add_fp32); + UNITTEST(test_add_fp32); UNITTEST(test_add_fp16); - // UNITTEST(test_add_bf16); - // UNITTEST(test_add_overwrite); - // UNITTEST(test_add_broadcast); - // UNITTEST(test_add_invalid); - // UNITTEST(test_sub_fp32); - // UNITTEST(test_sub_invalid); - // UNITTEST(test_mul_fp32); - // UNITTEST(test_mul_fp16); - // UNITTEST(test_mul_overwrite); - // UNITTEST(test_mul_broadcast); - // UNITTEST(test_mul_invalid); - // UNITTEST(test_div_fp32); - // UNITTEST(test_div_invalid); + UNITTEST(test_add_bf16); + UNITTEST(test_add_overwrite); + UNITTEST(test_add_broadcast); + UNITTEST(test_add_invalid); + UNITTEST(test_sub_fp32); + UNITTEST(test_sub_invalid); + UNITTEST(test_mul_fp32); + UNITTEST(test_mul_fp16); + UNITTEST(test_mul_overwrite); + UNITTEST(test_mul_broadcast); + UNITTEST(test_mul_invalid); + UNITTEST(test_div_fp32); + 
UNITTEST(test_div_invalid); return ark::unittest::SUCCESS; } diff --git a/ark/ops/ops_cast.cpp b/ark/ops/ops_cast.cpp index 96146217e..e94fec989 100644 --- a/ark/ops/ops_cast.cpp +++ b/ark/ops/ops_cast.cpp @@ -105,7 +105,7 @@ ModelOpByteCast::ModelOpByteCast(ModelTensorRef input, ModelDataType data_type, } Tensor Model::cast(Tensor input, const DataType &data_type, Tensor output, - const std::string &config, const std::string &name) { + const std::string &name) { check_null(input.ref()); if (output.is_null()) { if (input.data_type() == data_type) { @@ -119,14 +119,14 @@ Tensor Model::cast(Tensor input, const DataType &data_type, Tensor output, byte_cast_helper(input.ref(), data_type.ref(), new_shape, new_strides, new_offsets, new_padded_shape); return impl_ - ->create_op( - config, name, input.ref(), data_type.ref(), new_shape, - new_strides, new_offsets, new_padded_shape) + ->create_op(name, input.ref(), data_type.ref(), + new_shape, new_strides, + new_offsets, new_padded_shape) ->result_tensors()[0]; } } return impl_ - ->create_op(config, name, input.ref(), data_type.ref(), + ->create_op(name, input.ref(), data_type.ref(), output.ref()) ->result_tensors()[0]; } diff --git a/ark/ops/ops_communication.cpp b/ark/ops/ops_communication.cpp index e42c96d9c..baf7aafa2 100644 --- a/ark/ops/ops_communication.cpp +++ b/ark/ops/ops_communication.cpp @@ -589,25 +589,23 @@ Json ModelOpDeviceSync::default_config([[maybe_unused]] const ArchRef arch) cons } Tensor Model::send(Tensor input, int remote_rank, int tag, Tensor output, - const std::string &config, const std::string &name) { + const std::string &name) { tags_.insert(tag); return impl_ - ->create_op(config, name, input.ref(), remote_rank, tag, + ->create_op(name, input.ref(), remote_rank, tag, output.ref()) ->result_tensors()[0]; } -Tensor Model::send_done(Tensor input, const std::string &config, - const std::string &name) { - return impl_->create_op(config, name, input.ref()) +Tensor Model::send_done(Tensor input, 
const std::string &name) { + return impl_->create_op(name, input.ref()) ->result_tensors()[0]; } Tensor Model::recv(Tensor output, int remote_rank, int tag, - const std::string &config, const std::string &name) { + const std::string &name) { tags_.insert(tag); - return impl_ - ->create_op(config, name, output.ref(), remote_rank, tag) + return impl_->create_op(name, output.ref(), remote_rank, tag) ->result_tensors()[0]; } diff --git a/ark/ops/ops_copy.cpp b/ark/ops/ops_copy.cpp index 4914c34a4..4f32966b8 100644 --- a/ark/ops/ops_copy.cpp +++ b/ark/ops/ops_copy.cpp @@ -20,9 +20,8 @@ ModelOpCopy::ModelOpCopy(ModelTensorRef input, ModelTensorRef output) verify(); } -Tensor Model::copy(Tensor input, Tensor output, const std::string &config, - const std::string &name) { - return impl_->create_op(config, name, input.ref_, output.ref_) +Tensor Model::copy(Tensor input, Tensor output, const std::string &name) { + return impl_->create_op(name, input.ref_, output.ref_) ->result_tensors()[0]; } diff --git a/ark/ops/ops_embedding.cpp b/ark/ops/ops_embedding.cpp index 1169c47c3..2e2626d4c 100644 --- a/ark/ops/ops_embedding.cpp +++ b/ark/ops/ops_embedding.cpp @@ -70,9 +70,9 @@ Json ModelOpEmbedding::default_config([ } Tensor Model::embedding(Tensor input, Tensor weight, Tensor output, - const std::string &config, const std::string &name) { + const std::string &name) { return impl_ - ->create_op(config, name, input.ref_, weight.ref_, + ->create_op(name, input.ref_, weight.ref_, output.ref_) ->result_tensors()[0]; } diff --git a/ark/ops/ops_identity.cpp b/ark/ops/ops_identity.cpp index dd398d8a5..065cd9a52 100644 --- a/ark/ops/ops_identity.cpp +++ b/ark/ops/ops_identity.cpp @@ -31,7 +31,7 @@ Tensor Model::identity(Tensor input, const std::vector &deps, for (auto &dep : deps) { deps_ref.emplace_back(dep.ref_); } - return impl_->create_op("", name, input.ref_, deps_ref) + return impl_->create_op(name, input.ref_, deps_ref) ->result_tensors()[0]; } diff --git a/ark/ops/ops_math.cpp 
b/ark/ops/ops_math.cpp index b2833dcca..1067c561a 100644 --- a/ark/ops/ops_math.cpp +++ b/ark/ops/ops_math.cpp @@ -24,55 +24,48 @@ ModelOpMath::ModelOpMath(const std::string &type_name, ModelTensorRef input, ModelOpExp::ModelOpExp(ModelTensorRef input, ModelTensorRef output) : ModelOpMath("Exp", input, output) {} -Tensor Model::exp(Tensor input, Tensor output, const std::string &config, - const std::string &name) { - return impl_->create_op(config, name, input.ref_, output.ref_) +Tensor Model::exp(Tensor input, Tensor output, const std::string &name) { + return impl_->create_op(name, input.ref_, output.ref_) ->result_tensors()[0]; } ModelOpGelu::ModelOpGelu(ModelTensorRef input, ModelTensorRef output) : ModelOpMath("Gelu", input, output) {} -Tensor Model::gelu(Tensor input, Tensor output, const std::string &config, - const std::string &name) { - return impl_->create_op(config, name, input.ref_, output.ref_) +Tensor Model::gelu(Tensor input, Tensor output, const std::string &name) { + return impl_->create_op(name, input.ref_, output.ref_) ->result_tensors()[0]; } ModelOpRelu::ModelOpRelu(ModelTensorRef input, ModelTensorRef output) : ModelOpMath("Relu", input, output) {} -Tensor Model::relu(Tensor input, Tensor output, const std::string &config, - const std::string &name) { - return impl_->create_op(config, name, input.ref_, output.ref_) +Tensor Model::relu(Tensor input, Tensor output, const std::string &name) { + return impl_->create_op(name, input.ref_, output.ref_) ->result_tensors()[0]; } ModelOpRsqrt::ModelOpRsqrt(ModelTensorRef input, ModelTensorRef output) : ModelOpMath("Rsqrt", input, output) {} -Tensor Model::rsqrt(Tensor input, Tensor output, const std::string &config, - const std::string &name) { - return impl_->create_op(config, name, input.ref_, output.ref_) +Tensor Model::rsqrt(Tensor input, Tensor output, const std::string &name) { + return impl_->create_op(name, input.ref_, output.ref_) ->result_tensors()[0]; } 
ModelOpSigmoid::ModelOpSigmoid(ModelTensorRef input, ModelTensorRef output) : ModelOpMath("Sigmoid", input, output) {} -Tensor Model::sigmoid(Tensor input, Tensor output, const std::string &config, - const std::string &name) { - return impl_ - ->create_op(config, name, input.ref_, output.ref_) +Tensor Model::sigmoid(Tensor input, Tensor output, const std::string &name) { + return impl_->create_op(name, input.ref_, output.ref_) ->result_tensors()[0]; } ModelOpSqrt::ModelOpSqrt(ModelTensorRef input, ModelTensorRef output) : ModelOpMath("Sqrt", input, output) {} -Tensor Model::sqrt(Tensor input, Tensor output, const std::string &config, - const std::string &name) { - return impl_->create_op(config, name, input.ref_, output.ref_) +Tensor Model::sqrt(Tensor input, Tensor output, const std::string &name) { + return impl_->create_op(name, input.ref_, output.ref_) ->result_tensors()[0]; } diff --git a/ark/ops/ops_matmul.cpp b/ark/ops/ops_matmul.cpp index bc94922fc..dca349f44 100644 --- a/ark/ops/ops_matmul.cpp +++ b/ark/ops/ops_matmul.cpp @@ -244,10 +244,10 @@ Json ModelOpMatmul::default_config(const ArchRef arch) const { Tensor Model::matmul(Tensor input, Tensor other, Tensor output, bool trans_input, bool trans_other, - const std::string &config, const std::string &name) { + const std::string &name) { return impl_ - ->create_op(config, name, input.ref(), other.ref(), - output.ref(), trans_input, trans_other) + ->create_op(name, input.ref(), other.ref(), output.ref(), + trans_input, trans_other) ->result_tensors()[0]; } diff --git a/ark/ops/ops_noop.cpp b/ark/ops/ops_noop.cpp index 42fe5fdf5..894ab29be 100644 --- a/ark/ops/ops_noop.cpp +++ b/ark/ops/ops_noop.cpp @@ -30,7 +30,7 @@ Json ModelOpNoop::default_config([[maybe_unused]] const ArchRef arch) const { } void Model::noop(Tensor input, const std::string &name) { - impl_->create_op("", name, input.ref_); + impl_->create_op(name, input.ref_); } } // namespace ark diff --git a/ark/ops/ops_reduce.cpp 
b/ark/ops/ops_reduce.cpp index 19f70385b..78dd9d7e6 100644 --- a/ark/ops/ops_reduce.cpp +++ b/ark/ops/ops_reduce.cpp @@ -127,25 +127,25 @@ Json ModelOpReduce::default_config([[maybe_unused]] const ArchRef arch) const { } Tensor Model::reduce_max(Tensor input, int axis, bool keepdims, Tensor output, - const std::string &config, const std::string &name) { + const std::string &name) { return impl_ - ->create_op(config, name, input.ref_, axis, keepdims, + ->create_op(name, input.ref_, axis, keepdims, output.ref_) ->result_tensors()[0]; } Tensor Model::reduce_mean(Tensor input, int axis, bool keepdims, Tensor output, - const std::string &config, const std::string &name) { + const std::string &name) { return impl_ - ->create_op(config, name, input.ref_, axis, keepdims, + ->create_op(name, input.ref_, axis, keepdims, output.ref_) ->result_tensors()[0]; } Tensor Model::reduce_sum(Tensor input, int axis, bool keepdims, Tensor output, - const std::string &config, const std::string &name) { + const std::string &name) { return impl_ - ->create_op(config, name, input.ref_, axis, keepdims, + ->create_op(name, input.ref_, axis, keepdims, output.ref_) ->result_tensors()[0]; } diff --git a/ark/ops/ops_refer.cpp b/ark/ops/ops_refer.cpp index 68c61b30f..782d6708c 100644 --- a/ark/ops/ops_refer.cpp +++ b/ark/ops/ops_refer.cpp @@ -20,7 +20,7 @@ Tensor Model::refer(Tensor input, const Dims &shape, const Dims &strides, const Dims &offsets, const Dims &padded_shape, const std::string &name) { return impl_ - ->create_op("", name, input.ref_, shape, strides, offsets, + ->create_op(name, input.ref_, shape, strides, offsets, padded_shape) ->result_tensors()[0]; } diff --git a/ark/ops/ops_reshape.cpp b/ark/ops/ops_reshape.cpp index 8ed3ac247..aac22b71a 100644 --- a/ark/ops/ops_reshape.cpp +++ b/ark/ops/ops_reshape.cpp @@ -199,8 +199,8 @@ Tensor Model::reshape(Tensor input, const Dims &shape, bool allowzero, reshape_helper(input.ref_, Dims{inferred_shape}, allowzero, new_shape, new_strides, 
new_offs); return impl_ - ->create_op("", name, input.ref_, new_shape, - new_strides, new_offs) + ->create_op(name, input.ref_, new_shape, new_strides, + new_offs) ->result_tensors()[0]; } diff --git a/ark/ops/ops_rope.cpp b/ark/ops/ops_rope.cpp index 36015aae5..06c1c915e 100644 --- a/ark/ops/ops_rope.cpp +++ b/ark/ops/ops_rope.cpp @@ -12,10 +12,9 @@ ModelOpRope::ModelOpRope(ModelTensorRef input, ModelTensorRef other, : ModelOpBroadcast2("Rope", input, other, output) {} Tensor Model::rope(Tensor input, Tensor other, Tensor output, - const std::string &config, const std::string &name) { + const std::string &name) { return impl_ - ->create_op(config, name, input.ref_, other.ref_, - output.ref_) + ->create_op(name, input.ref_, other.ref_, output.ref_) ->result_tensors()[0]; } diff --git a/ark/ops/ops_scalar.cpp b/ark/ops/ops_scalar.cpp index b5c10f1c3..944a7247c 100644 --- a/ark/ops/ops_scalar.cpp +++ b/ark/ops/ops_scalar.cpp @@ -115,21 +115,20 @@ std::vector ModelOpScalarMul::impl_args([ Tensor Model::constant(float val, const Dims &shape, DataType data_type, const std::string &name) { return impl_ - ->create_op("", name, val, shape, data_type.ref(), + ->create_op(name, val, shape, data_type.ref(), nullptr) ->result_tensors()[0]; } -Tensor Model::copy(float val, Tensor output, const std::string &config, - const std::string &name) { +Tensor Model::copy(float val, Tensor output, const std::string &name) { if (output == NullTensor) { return impl_ - ->create_op(config, name, val, Dims{1}, - FP32.ref(), output.ref()) + ->create_op(name, val, Dims{1}, FP32.ref(), + output.ref()) ->result_tensors()[0]; } else { return impl_ - ->create_op(config, name, val, output.shape(), + ->create_op(name, val, output.shape(), output.data_type().ref(), output.ref()) ->result_tensors()[0]; @@ -137,34 +136,30 @@ Tensor Model::copy(float val, Tensor output, const std::string &config, } Tensor Model::add(Tensor input, float value, Tensor output, - const std::string &config, const std::string 
&name) { + const std::string &name) { return impl_ - ->create_op(config, name, input.ref_, value, - output.ref_) + ->create_op(name, input.ref_, value, output.ref_) ->result_tensors()[0]; } Tensor Model::sub(Tensor input, float value, Tensor output, - const std::string &config, const std::string &name) { + const std::string &name) { return impl_ - ->create_op(config, name, input.ref_, -value, - output.ref_) + ->create_op(name, input.ref_, -value, output.ref_) ->result_tensors()[0]; } Tensor Model::mul(Tensor input, float value, Tensor output, - const std::string &config, const std::string &name) { + const std::string &name) { return impl_ - ->create_op(config, name, input.ref_, value, - output.ref_) + ->create_op(name, input.ref_, value, output.ref_) ->result_tensors()[0]; } Tensor Model::div(Tensor input, float value, Tensor output, - const std::string &config, const std::string &name) { + const std::string &name) { return impl_ - ->create_op(config, name, input.ref_, 1 / value, - output.ref_) + ->create_op(name, input.ref_, 1 / value, output.ref_) ->result_tensors()[0]; } diff --git a/ark/ops/ops_transpose.cpp b/ark/ops/ops_transpose.cpp index c659761d9..d0f7581cc 100644 --- a/ark/ops/ops_transpose.cpp +++ b/ark/ops/ops_transpose.cpp @@ -124,10 +124,9 @@ Json ModelOpTranspose::default_config([ } Tensor Model::transpose(Tensor input, const std::vector &permutation, - Tensor output, const std::string &config, - const std::string &name) { + Tensor output, const std::string &name) { return impl_ - ->create_op(config, name, input.ref_, permutation, + ->create_op(name, input.ref_, permutation, output.ref_) ->result_tensors()[0]; } diff --git a/examples/llama/model_7b_b1_s2048.py b/examples/llama/model_7b_b1_s2048.py index f41304e85..d4a080c84 100644 --- a/examples/llama/model_7b_b1_s2048.py +++ b/examples/llama/model_7b_b1_s2048.py @@ -90,7 +90,7 @@ def __init__( self.weight = ark.parameter([1, 1, dim], ark.fp32) def forward(self, x): - with ark.PlanManager( + with 
ark.PlannerContext( warp_range=[0, 8], sync=False, config={ @@ -100,12 +100,12 @@ def forward(self, x): "Granularity": 7, }, ): - with ark.PlanManager(config={"Tile": [1, 4096]}): + with ark.PlannerContext(config={"Tile": [1, 4096]}): x = ark.cast(x, ark.fp32) x2 = ark.mul(x, x) - with ark.PlanManager(config={"ImplType": "WarpWise"}): + with ark.PlannerContext(config={"ImplType": "WarpWise"}): mean = ark.reduce_mean(x2, axis=-1) - with ark.PlanManager( + with ark.PlannerContext( config={ "NumWarps": 1, "SramBytes": 0, @@ -114,7 +114,7 @@ def forward(self, x): } ): rrms = ark.rsqrt(mean) - with ark.PlanManager( + with ark.PlannerContext( warp_range=[0, 8], sync=False, config={ @@ -356,7 +356,7 @@ def __init__( def forward(self, x): # self.w2(F.silu(self.w1(x)) * self.w3(x)) - with ark.PlanManager( + with ark.PlannerContext( warp_range=[0, 8], sram_range=[0, 49344], sync=False, @@ -365,13 +365,13 @@ def forward(self, x): "NumTasks": 688, }, ): - with ark.PlanManager( + with ark.PlannerContext( config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]} ): x1 = self.w1(x) - with ark.PlanManager(config={"SramBytes": 0, "Tile": [256, 128]}): + with ark.PlannerContext(config={"SramBytes": 0, "Tile": [256, 128]}): x1 = Silu()(x1) - with ark.PlanManager( + with ark.PlannerContext( warp_range=[0, 8], sram_range=[0, 49344], sync=False, @@ -380,11 +380,11 @@ def forward(self, x): "NumTasks": 688, }, ): - with ark.PlanManager( + with ark.PlannerContext( config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]} ): x2 = self.w3(x) - with ark.PlanManager(config={"SramBytes": 0, "Tile": [256, 128]}): + with ark.PlannerContext(config={"SramBytes": 0, "Tile": [256, 128]}): x3 = ark.mul(x1, x2) x4 = self.w2(x3) return x4 @@ -404,7 +404,7 @@ def __init__(self): super(Softmax, self).__init__() def forward(self, input): - with ark.PlanManager( + with ark.PlannerContext( warp_range=[0, 8], sram_range=[0, 0], sync=False, @@ -414,14 +414,14 @@ def forward(self, input): "NumTasks": 65536, 
}, ): - with ark.PlanManager(config={"ImplType": "WarpWise"}): + with ark.PlannerContext(config={"ImplType": "WarpWise"}): max = ark.reduce_max(input, axis=-1) - with ark.PlanManager(config={"Tile": [1, 2048]}): + with ark.PlannerContext(config={"Tile": [1, 2048]}): output = ark.sub(input, max) output = ark.exp(output) - with ark.PlanManager(config={"ImplType": "WarpWise"}): + with ark.PlannerContext(config={"ImplType": "WarpWise"}): sum = ark.reduce_sum(output, axis=-1) - with ark.PlanManager(config={"Tile": [1, 2048]}): + with ark.PlannerContext(config={"Tile": [1, 2048]}): output = ark.div(output, sum) return output @@ -486,50 +486,50 @@ def forward( ): bsz, seqlen, _ = x.shape() - with ark.PlanManager( + with ark.PlannerContext( warp_range=[0, 4], sram_range=[0, 24672], sync=False, config={"NumWarps": 4, "NumTasks": 256}, ): - with ark.PlanManager( + with ark.PlannerContext( config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]} ): xq = self.wq(x) xq = ark.reshape( xq, [bsz, seqlen, self.n_local_heads, self.head_dim] ) - with ark.PlanManager( + with ark.PlannerContext( config={"SramBytes": 0, "Tile": [256, 1, 128]} ): if freqs_cis is not None: xq = ark.rope(xq, freqs_cis) - with ark.PlanManager(config={"SramBytes": 0, "Tile": [256, 128]}): + with ark.PlannerContext(config={"SramBytes": 0, "Tile": [256, 128]}): xq = ark.transpose(xq, [0, 2, 1, 3]) - with ark.PlanManager( + with ark.PlannerContext( warp_range=[0, 4], sram_range=[0, 24672], sync=False, config={"NumWarps": 4, "NumTasks": 256}, ): - with ark.PlanManager( + with ark.PlannerContext( config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]} ): xk = self.wk(x) xk = ark.reshape( xk, [bsz, seqlen, self.n_local_kv_heads, self.head_dim] ) - with ark.PlanManager( + with ark.PlannerContext( config={"SramBytes": 0, "Tile": [256, 1, 128]} ): if freqs_cis is not None: xk = ark.rope(xk, freqs_cis) keys = xk - with ark.PlanManager(config={"SramBytes": 0, "Tile": [256, 128]}): + with 
ark.PlannerContext(config={"SramBytes": 0, "Tile": [256, 128]}): keys = ark.transpose(keys, [0, 2, 1, 3]) - with ark.PlanManager( + with ark.PlannerContext( warp_range=[0, 4], sram_range=[0, 24672], sync=False, @@ -540,7 +540,7 @@ def forward( "TileShapeMNK": [256, 128, 32], }, ): - with ark.PlanManager( + with ark.PlannerContext( config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]} ): xv = self.wv(x) @@ -548,12 +548,12 @@ def forward( xv, [bsz, seqlen, self.n_local_kv_heads, self.head_dim] ) values = xv - with ark.PlanManager( + with ark.PlannerContext( config={"SramBytes": 0, "Tile": [256, 1, 128]} ): values = ark.transpose(values, [0, 2, 1, 3]) - with ark.PlanManager( + with ark.PlannerContext( warp_range=[0, 8], sram_range=[0, 49344], sync=False, @@ -563,11 +563,11 @@ def forward( "Granularity": 2, }, ): - with ark.PlanManager( + with ark.PlannerContext( config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]} ): scores = ark.matmul(xq, keys, transpose_other=True) - with ark.PlanManager(config={"SramBytes": 0, "Tile": [256, 128]}): + with ark.PlannerContext(config={"SramBytes": 0, "Tile": [256, 128]}): scores = ark.mul(scores, 1.0 / math.sqrt(self.head_dim)) if mask is not None: @@ -575,7 +575,7 @@ def forward( scores = Softmax()(scores) - with ark.PlanManager( + with ark.PlannerContext( warp_range=[0, 4], sram_range=[0, 24672], sync=False, @@ -584,11 +584,11 @@ def forward( "NumTasks": 256, }, ): - with ark.PlanManager( + with ark.PlannerContext( config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]} ): output = ark.matmul(scores, values) - with ark.PlanManager( + with ark.PlannerContext( config={"SramBytes": 0, "Tile": [256, 1, 128]} ): output = ark.transpose(output, [0, 2, 1, 3]) @@ -634,7 +634,7 @@ def forward( ): attention_norm_x = self.attention_norm(x) h = self.attention.forward(attention_norm_x, start_pos, freqs_cis, mask) - with ark.PlanManager( + with ark.PlannerContext( warp_range=[0, 4], config={ "NumWarps": 4, @@ -645,7 +645,7 @@ def 
forward( ): h = ark.add(x, h) ff = self.feed_forward(self.ffn_norm(h)) - with ark.PlanManager( + with ark.PlannerContext( warp_range=[0, 4], config={ "NumWarps": 4, diff --git a/examples/tutorial/plan_manager_tutorial.py b/examples/tutorial/plan_manager_tutorial.py deleted file mode 100644 index c840ce0c0..000000000 --- a/examples/tutorial/plan_manager_tutorial.py +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import ark -import time -import torch -import torch.nn.functional as F - - -class VanillaSoftmax(ark.Module): - def __init__(self): - super(Softmax, self).__init__() - - def forward(self, input): - max = ark.reduce_max(input, axis=-1) - output = ark.sub(input, max) - output = ark.exp(output) - sum = ark.reduce_sum(output, axis=-1) - output = ark.div(output, sum) - return output - - -class Softmax(ark.Module): - def __init__(self): - super(Softmax, self).__init__() - - def forward(self, input): - with ark.PlanManager( - warp_range=[0, 8], - sram_range=[0, 0], - sync=False, - config={ - "NumWarps": 1, - "SramBytes": 0, - "NumTasks": 65536, - }, - ): - with ark.PlanManager(config={"ImplType": "WarpWise"}): - max = ark.reduce_max(input, axis=-1) - with ark.PlanManager(config={"Tile": [1, 2048]}): - output = ark.sub(input, max) - output = ark.exp(output) - with ark.PlanManager(config={"ImplType": "WarpWise"}): - sum = ark.reduce_sum(output, axis=-1) - with ark.PlanManager(config={"Tile": [1, 2048]}): - output = ark.div(output, sum) - return output - - -def eval(tensor: ark.Tensor): - with ark.Runtime() as rt: - rt.launch() - rt.run() - return tensor.to_torch() - - -def perf(): - with ark.Runtime() as rt: - rt.launch() - - start = time.time() - rt.run(iter=1000) - end = time.time() - return (end - start) / 1000 - - -if __name__ == "__main__": - ark.init() - - shape = (32, 2048, 2048) - - input = torch.randn(*shape).to("cuda:0") - - output = Softmax()(ark.Tensor.from_torch(input)) - - if 
torch.allclose(eval(output), F.softmax(input, dim=-1), atol=1e-5): - print("Correct result") - else: - print("Incorrect result") - - print(f"Performance: {(perf() * 1e3):.3f} ms/iter") diff --git a/python/ark/plan_manager.py b/python/ark/plan_manager.py deleted file mode 100644 index 80e615ab8..000000000 --- a/python/ark/plan_manager.py +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import json -from typing import List, Dict, Any -from .model import Model -from ._ark_core import _PlanManager - - -class PlanManager(_PlanManager): - def __init__(self, **kwargs): - """ - Plan manager for specifying the parallelization and tiling configuration of the operators in the context. - - Args: - processor_range (List[int], optional): The range of processors to be used. Defaults to None. - warp_range (List[int], optional): The range of warps to be used. Defaults to None. - sram_range (List[int], optional): The range of SRAMs to be used. Defaults to None. - sync (bool, optional): Whether to synchronize the execution. Defaults to True. - config (Dict[str, Any], optional): The configuration for the operators. Defaults to None. - """ - super().__init__(Model.get_model(), json.dumps(kwargs)) - - def __enter__(self) -> "PlanManager": - """ - Enter the plan manager. - """ - return self - - def __exit__(self, exc_type, exc_value, exc_tb): - """ - Exit the plan manager. 
- """ - del self diff --git a/python/ark/runtime.py b/python/ark/runtime.py index f064a5988..960223c64 100644 --- a/python/ark/runtime.py +++ b/python/ark/runtime.py @@ -3,6 +3,7 @@ import logging from enum import Enum +from typing import Dict, List from _ark_core import _Executor from .planner import Planner, Plan diff --git a/python/model_py.cpp b/python/model_py.cpp index 5a22d6a18..c224a3d5b 100644 --- a/python/model_py.cpp +++ b/python/model_py.cpp @@ -19,100 +19,89 @@ void register_model(py::module &m) { .def("compress", &ark::Model::compress) .def("add", py::overload_cast( - &ark::Model::add), + const std::string &>(&ark::Model::add), py::arg("input"), py::arg("other"), py::arg("output"), - py::arg("config"), py::arg("name")) + py::arg("name")) .def("add", py::overload_cast( - &ark::Model::add), + const std::string &>(&ark::Model::add), py::arg("input"), py::arg("other"), py::arg("output"), - py::arg("config"), py::arg("name")) + py::arg("name")) .def("cast", &ark::Model::cast, py::arg("input"), py::arg("data_type"), - py::arg("output"), py::arg("config"), py::arg("name")) + py::arg("output"), py::arg("name")) .def("constant", &ark::Model::constant, py::arg("value"), py::arg("shape"), py::arg("data_type"), py::arg("name")) .def("copy", - py::overload_cast(&ark::Model::copy), - py::arg("input"), py::arg("output"), py::arg("config"), - py::arg("name")) + py::overload_cast( + &ark::Model::copy), + py::arg("input"), py::arg("output"), py::arg("name")) .def("copy", - py::overload_cast(&ark::Model::copy), - py::arg("input"), py::arg("output"), py::arg("config"), - py::arg("name")) + py::overload_cast( + &ark::Model::copy), + py::arg("input"), py::arg("output"), py::arg("name")) .def("div", py::overload_cast( - &ark::Model::div), + const std::string &>(&ark::Model::div), py::arg("input"), py::arg("other"), py::arg("output"), - py::arg("config"), py::arg("name")) + py::arg("name")) .def("div", py::overload_cast( - &ark::Model::div), + const std::string 
&>(&ark::Model::div), py::arg("input"), py::arg("other"), py::arg("output"), - py::arg("config"), py::arg("name")) - .def("embedding", &ark::Model::embedding, py::arg("input"), - py::arg("weight"), py::arg("output"), py::arg("config"), py::arg("name")) + .def("embedding", &ark::Model::embedding, py::arg("input"), + py::arg("weight"), py::arg("output"), py::arg("name")) .def("exp", &ark::Model::exp, py::arg("input"), py::arg("output"), - py::arg("config"), py::arg("name")) + py::arg("name")) .def("gelu", &ark::Model::gelu, py::arg("input"), py::arg("output"), - py::arg("config"), py::arg("name")) + py::arg("name")) .def("identity", &ark::Model::identity, py::arg("input"), py::arg("deps"), py::arg("name")) .def("matmul", &ark::Model::matmul, py::arg("input"), py::arg("other"), py::arg("output"), py::arg("trans_input"), py::arg("trans_other"), - py::arg("config"), py::arg("name")) + py::arg("name")) .def("mul", py::overload_cast( - &ark::Model::mul), + const std::string &>(&ark::Model::mul), py::arg("input"), py::arg("other"), py::arg("output"), - py::arg("config"), py::arg("name")) + py::arg("name")) .def("mul", py::overload_cast( - &ark::Model::mul), + const std::string &>(&ark::Model::mul), py::arg("input"), py::arg("other"), py::arg("output"), - py::arg("config"), py::arg("name")) + py::arg("name")) .def("noop", &ark::Model::noop, py::arg("input"), py::arg("name")) .def("reduce_max", &ark::Model::reduce_max, py::arg("input"), py::arg("axis"), py::arg("keepdims"), py::arg("output"), - py::arg("config"), py::arg("name")) + py::arg("name")) .def("reduce_mean", &ark::Model::reduce_mean, py::arg("input"), py::arg("axis"), py::arg("keepdims"), py::arg("output"), - py::arg("config"), py::arg("name")) + py::arg("name")) .def("reduce_sum", &ark::Model::reduce_sum, py::arg("input"), py::arg("axis"), py::arg("keepdims"), py::arg("output"), - py::arg("config"), py::arg("name")) + py::arg("name")) .def("relu", &ark::Model::relu, py::arg("input"), py::arg("output"), - 
py::arg("config"), py::arg("name")) + py::arg("name")) .def("reshape", &ark::Model::reshape, py::arg("input"), py::arg("shape"), py::arg("allowzero"), py::arg("name")) .def("rope", &ark::Model::rope, py::arg("input"), py::arg("other"), - py::arg("output"), py::arg("config"), py::arg("name")) + py::arg("output"), py::arg("name")) .def("rsqrt", &ark::Model::rsqrt, py::arg("input"), py::arg("output"), - py::arg("config"), py::arg("name")) + py::arg("name")) .def("sharding", &ark::Model::sharding, py::arg("input"), py::arg("axis"), py::arg("dim_per_shard"), py::arg("name")) .def("sigmoid", &ark::Model::sigmoid, py::arg("input"), - py::arg("output"), py::arg("config"), py::arg("name")) + py::arg("output"), py::arg("name")) .def("sqrt", &ark::Model::sqrt, py::arg("input"), py::arg("output"), - py::arg("config"), py::arg("name")) + py::arg("name")) .def("sub", py::overload_cast( - &ark::Model::sub), + const std::string &>(&ark::Model::sub), py::arg("input"), py::arg("other"), py::arg("output"), - py::arg("config"), py::arg("name")) + py::arg("name")) .def("sub", py::overload_cast( - &ark::Model::sub), + const std::string &>(&ark::Model::sub), py::arg("input"), py::arg("other"), py::arg("output"), py::arg("name")) .def("tensor", diff --git a/python/plan_manager_py.cpp b/python/plan_manager_py.cpp deleted file mode 100644 index 34aa0b77c..000000000 --- a/python/plan_manager_py.cpp +++ /dev/null @@ -1,15 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. 
- -#include -#include -#include - -#include - -namespace py = pybind11; - -void register_plan_manager(py::module &m) { - py::class_(m, "_PlanManager") - .def(py::init()); -} From 28b83953ae26b8554fc8b822df8e96dd8bf04091 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 6 Aug 2024 14:33:23 -0700 Subject: [PATCH 53/61] Update runtime.py --- python/ark/runtime.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ark/runtime.py b/python/ark/runtime.py index 96c6f470a..e40750260 100644 --- a/python/ark/runtime.py +++ b/python/ark/runtime.py @@ -98,7 +98,7 @@ def launch( _RuntimeState.executor.destroy() _RuntimeState.executor = Executor( - gpu_id, + device_id, stream, "ArkRuntime", plan, From 11901c4a3f49469ede51e992b8b1d2fc1f2c1e3b Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Wed, 7 Aug 2024 09:36:45 +0000 Subject: [PATCH 54/61] fix --- python/ark/runtime.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ark/runtime.py b/python/ark/runtime.py index e40750260..495fc1c24 100644 --- a/python/ark/runtime.py +++ b/python/ark/runtime.py @@ -101,7 +101,7 @@ def launch( device_id, stream, "ArkRuntime", - plan, + str(plan), loop_mode, ) self.executor = _RuntimeState.executor From 76e23c90c7afea129dd8f122cc7fdfc19673efd4 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Sat, 4 Apr 2026 04:32:19 +0000 Subject: [PATCH 55/61] Add Tensor.eval() & reduce bug fix --- ark/include/kernels/reduce.h | 24 +- ark/ops/ops_arithmetic_test.cpp | 451 ---------------- ark/ops/ops_cast_test.cpp | 322 ----------- ark/ops/ops_embedding_test.cpp | 122 ----- ark/ops/ops_math_test.cpp | 366 ------------- ark/ops/ops_matmul_test.cpp | 589 --------------------- ark/ops/ops_reduce.cpp | 14 +- ark/ops/ops_reduce_test.cpp | 472 ----------------- ark/ops/ops_rope_test.cpp | 103 ---- ark/ops/ops_scalar_test.cpp | 345 ------------ ark/ops/ops_transpose_test.cpp | 281 ---------- examples/multi_head_attention/mha.py | 169 ++++++ 
examples/multi_head_attention/test_mha.py | 371 +++++++++++++ examples/tutorial/module_tutorial.py | 157 ++---- examples/tutorial/planner_tutorial.py | 13 +- examples/tutorial/planner_tutorial_2.py | 35 +- examples/tutorial/quickstart_tutorial.py | 35 +- examples/tutorial/torch_tutorial.py | 36 +- python/ark/__init__.py | 2 +- python/ark/data_type.py | 1 + python/ark/model.py | 56 +- python/ark/ops.py | 97 ++-- python/ark/planner.py | 6 +- python/ark/tensor.py | 33 ++ python/unittest/ops/conftest.py | 32 ++ python/unittest/ops/test_arithmetic.py | 115 ++++ python/unittest/ops/test_cast.py | 27 + python/unittest/ops/test_composite.py | 33 ++ python/unittest/ops/test_embedding_rope.py | 45 ++ python/unittest/ops/test_math.py | 46 ++ python/unittest/ops/test_matmul.py | 65 +++ python/unittest/ops/test_reduce.py | 46 ++ python/unittest/ops/test_transpose.py | 26 + python/unittest/test_eval.py | 128 +++++ 34 files changed, 1365 insertions(+), 3298 deletions(-) delete mode 100644 ark/ops/ops_arithmetic_test.cpp delete mode 100644 ark/ops/ops_cast_test.cpp delete mode 100644 ark/ops/ops_embedding_test.cpp delete mode 100644 ark/ops/ops_math_test.cpp delete mode 100644 ark/ops/ops_matmul_test.cpp delete mode 100644 ark/ops/ops_reduce_test.cpp delete mode 100644 ark/ops/ops_rope_test.cpp delete mode 100644 ark/ops/ops_scalar_test.cpp delete mode 100644 ark/ops/ops_transpose_test.cpp create mode 100644 examples/multi_head_attention/mha.py create mode 100644 examples/multi_head_attention/test_mha.py create mode 100644 python/unittest/ops/conftest.py create mode 100644 python/unittest/ops/test_arithmetic.py create mode 100644 python/unittest/ops/test_cast.py create mode 100644 python/unittest/ops/test_composite.py create mode 100644 python/unittest/ops/test_embedding_rope.py create mode 100644 python/unittest/ops/test_math.py create mode 100644 python/unittest/ops/test_matmul.py create mode 100644 python/unittest/ops/test_reduce.py create mode 100644 
python/unittest/ops/test_transpose.py create mode 100644 python/unittest/test_eval.py diff --git a/ark/include/kernels/reduce.h b/ark/include/kernels/reduce.h index 62af5840b..a25e1ccf4 100644 --- a/ark/include/kernels/reduce.h +++ b/ark/include/kernels/reduce.h @@ -431,12 +431,26 @@ struct WwiseReduce { UnitOp::sync_threads(); } - // final reduction on shared memory using warp shuffle. - reduced[0] = warpsReduce( - reduced[0], tid, smem_per_warp); + // final reduction using warp shuffle. + // PhysicalThreadsPerRow = actual number of HW threads per row. + constexpr int PhysicalThreadsPerRow = + UnitOp::NumThreads / NonReduceDimLength; + static_assert(PhysicalThreadsPerRow > 0, + "Not enough threads for the tile dimensions. " + "Increase NumWarps or decrease Tile H dimension."); + if constexpr (PhysicalThreadsPerRow <= Arch::ThreadsPerWarp) { + // All threads for one row are within a single warp. + reduced[0] = + warpReduce(reduced[0]); + } else { + // Threads for one row span multiple warps — need shared memory. + reduced[0] = warpsReduce( + reduced[0], tid % PhysicalThreadsPerRow, smem_per_warp); + } - // write the result to output. - if (tid % ThreadsPerRow == 0) { + // write the result to output — first thread of each row group. + if (tid % PhysicalThreadsPerRow == 0) { ReduceType::template postReduce<1>(&out[idx_out], &reduced[0], InShape::W); } diff --git a/ark/ops/ops_arithmetic_test.cpp b/ark/ops/ops_arithmetic_test.cpp deleted file mode 100644 index 6a878c667..000000000 --- a/ark/ops/ops_arithmetic_test.cpp +++ /dev/null @@ -1,451 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. 
- -#include "ops_test_common.hpp" - -template -void baseline_add(std::vector &outputs, - const std::vector &output_shapes, - const std::vector &inputs, - const std::vector &input_shapes, int) { - T *out = static_cast(outputs[0]); - T *t0 = static_cast(inputs[0]); - T *t1 = static_cast(inputs[1]); - - // NumPy-style broadcasted addition - ark::Dims osh = output_shapes[0].dims4(); - ark::Dims ish0 = input_shapes[0].dims4(); - ark::Dims ish1 = input_shapes[1].dims4(); - for (ark::DimType n = 0; n < osh[0]; ++n) { - for (ark::DimType c = 0; c < osh[1]; ++c) { - for (ark::DimType h = 0; h < osh[2]; ++h) { - for (ark::DimType w = 0; w < osh[3]; ++w) { - out[w + h * osh[3] + c * osh[2] * osh[3] + - n * osh[1] * osh[2] * osh[3]] = - t0[(w % ish0[3]) + (h % ish0[2]) * ish0[3] + - (c % ish0[1]) * ish0[2] * ish0[3] + - (n % ish0[0]) * ish0[1] * ish0[2] * ish0[3]] + - t1[(w % ish1[3]) + (h % ish1[2]) * ish1[3] + - (c % ish1[1]) * ish1[2] * ish1[3] + - (n % ish1[0]) * ish1[1] * ish1[2] * ish1[3]]; - } - } - } - } -}; - -template -void baseline_sub(std::vector &outputs, - const std::vector &output_shapes, - const std::vector &inputs, - const std::vector &input_shapes, int) { - T *out = static_cast(outputs[0]); - T *t0 = static_cast(inputs[0]); - T *t1 = static_cast(inputs[1]); - - // NumPy-style broadcasted addition - ark::Dims osh = output_shapes[0].dims4(); - ark::Dims ish0 = input_shapes[0].dims4(); - ark::Dims ish1 = input_shapes[1].dims4(); - for (ark::DimType n = 0; n < osh[0]; ++n) { - for (ark::DimType c = 0; c < osh[1]; ++c) { - for (ark::DimType h = 0; h < osh[2]; ++h) { - for (ark::DimType w = 0; w < osh[3]; ++w) { - out[w + h * osh[3] + c * osh[2] * osh[3] + - n * osh[1] * osh[2] * osh[3]] = - t0[(w % ish0[3]) + (h % ish0[2]) * ish0[3] + - (c % ish0[1]) * ish0[2] * ish0[3] + - (n % ish0[0]) * ish0[1] * ish0[2] * ish0[3]] - - t1[(w % ish1[3]) + (h % ish1[2]) * ish1[3] + - (c % ish1[1]) * ish1[2] * ish1[3] + - (n % ish1[0]) * ish1[1] * ish1[2] * ish1[3]]; - } - } - } - 
} -}; - -template -void baseline_mul(std::vector &outputs, - const std::vector &output_shapes, - const std::vector &inputs, - const std::vector &input_shapes, int) { - T *out = static_cast(outputs[0]); - T *t0 = static_cast(inputs[0]); - T *t1 = static_cast(inputs[1]); - - // NumPy-style broadcasted multiplication - ark::Dims osh = output_shapes[0].dims4(); - ark::Dims ish0 = input_shapes[0].dims4(); - ark::Dims ish1 = input_shapes[1].dims4(); - for (ark::DimType n = 0; n < osh[0]; ++n) { - for (ark::DimType c = 0; c < osh[1]; ++c) { - for (ark::DimType h = 0; h < osh[2]; ++h) { - for (ark::DimType w = 0; w < osh[3]; ++w) { - out[w + h * osh[3] + c * osh[2] * osh[3] + - n * osh[1] * osh[2] * osh[3]] = - t0[(w % ish0[3]) + (h % ish0[2]) * ish0[3] + - (c % ish0[1]) * ish0[2] * ish0[3] + - (n % ish0[0]) * ish0[1] * ish0[2] * ish0[3]] * - t1[(w % ish1[3]) + (h % ish1[2]) * ish1[3] + - (c % ish1[1]) * ish1[2] * ish1[3] + - (n % ish1[0]) * ish1[1] * ish1[2] * ish1[3]]; - } - } - } - } -}; - -template -void baseline_div(std::vector &outputs, - const std::vector &output_shapes, - const std::vector &inputs, - const std::vector &input_shapes, int) { - T *out = static_cast(outputs[0]); - T *t0 = static_cast(inputs[0]); - T *t1 = static_cast(inputs[1]); - - // NumPy-style broadcasted division - ark::Dims osh = output_shapes[0].dims4(); - ark::Dims ish0 = input_shapes[0].dims4(); - ark::Dims ish1 = input_shapes[1].dims4(); - for (ark::DimType n = 0; n < osh[0]; ++n) { - for (ark::DimType c = 0; c < osh[1]; ++c) { - for (ark::DimType h = 0; h < osh[2]; ++h) { - for (ark::DimType w = 0; w < osh[3]; ++w) { - out[w + h * osh[3] + c * osh[2] * osh[3] + - n * osh[1] * osh[2] * osh[3]] = - t0[(w % ish0[3]) + (h % ish0[2]) * ish0[3] + - (c % ish0[1]) * ish0[2] * ish0[3] + - (n % ish0[0]) * ish0[1] * ish0[2] * ish0[3]] / - t1[(w % ish1[3]) + (h % ish1[2]) * ish1[3] + - (c % ish1[1]) * ish1[2] * ish1[3] + - (n % ish1[0]) * ish1[1] * ish1[2] * ish1[3]]; - } - } - } - } -}; - 
-ark::unittest::State test_add_fp32() { - ark::Model m; - ark::Tensor t0 = m.tensor({8192}, ark::FP32); - ark::Tensor t1 = m.tensor({8192}, ark::FP32); - ark::Tensor out = m.add(t0, t1); - - auto result = - ark::op_test("add_fp32", m, {t0, t1}, {out}, baseline_add); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_add_fp16() { - ark::Model m; - ark::Tensor t0 = m.tensor({8192}, ark::FP16); - ark::Tensor t1 = m.tensor({8192}, ark::FP16); - ark::Tensor out = m.add(t0, t1); - - auto result = - ark::op_test("add_fp16", m, {t0, t1}, {out}, baseline_add); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_add_bf16() { - ark::Model m; - ark::Tensor t0 = m.tensor({8192}, ark::BF16); - ark::Tensor t1 = m.tensor({8192}, ark::BF16); - ark::Tensor out = m.add(t0, t1); - - auto result = ark::op_test("add_bf16", m, {t0, t1}, {out}, - baseline_add); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_add_overwrite() { - ark::Model m; - ark::Tensor t0 = m.tensor({8192}, ark::FP16); - ark::Tensor t1 = m.tensor({8192}, ark::FP16); - ark::Tensor out = m.add(t0, t1, t1); - - auto result = ark::op_test("add_overwrite", m, {t0, t1}, {out}, - baseline_add); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_add_broadcast() { - { - ark::Model m; - ark::Tensor t0 = m.tensor({4, 1024}, ark::FP16); - ark::Tensor t1 = m.tensor({1, 1024}, ark::FP16); - ark::Tensor out = m.add(t0, t1); - - auto result = ark::op_test("add_broadcast", m, {t0, t1}, {out}, - baseline_add); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - } - { - ark::Model m; - ark::Tensor t0 = m.tensor({4, 64}, ark::FP16); - ark::Tensor t1 = m.tensor({4, 1}, ark::FP16, {4, 2}); - ark::Tensor out = 
m.add(t0, t1); - - auto result = ark::op_test("add_broadcast", m, {t0, t1}, {out}, - baseline_add); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - } - { - ark::Model m; - ark::Tensor t0 = m.tensor({3, 1, 1024}, ark::FP16); - ark::Tensor t1 = m.tensor({1, 4, 1}, ark::FP16, {1, 4, 2}); - ark::Tensor out = m.add(t0, t1); - - auto result = ark::op_test("add_broadcast", m, {t0, t1}, {out}, - baseline_add); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - } - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_add_offset() { - { - ark::Model m; - ark::Tensor t0 = m.tensor({2, 64}, ark::FP16, {4, 128}, {2, 64}); - ark::Tensor t1 = m.tensor({2, 64}, ark::FP16); - ark::Tensor out = m.add(t0, t1); - - auto result = ark::op_test("add_offset", m, {t0, t1}, {out}, - baseline_add); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - } - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_add_invalid() { - { - ark::Model m; - ark::Tensor t0 = m.tensor({1024}, ark::FP16); - ark::Tensor t1 = m.tensor({1024}, ark::FP32); - UNITTEST_THROW(m.add(t0, t1), ark::ModelError); - } - { - ark::Model m; - ark::Tensor t0 = m.tensor({8192}, ark::FP16); - ark::Tensor t1 = m.tensor({8192}, ark::FP16); - ark::Tensor out = m.tensor({8192}, ark::FP32); - UNITTEST_THROW(m.add(t0, t1, out), ark::ModelError); - } - { - ark::Model m; - ark::Tensor t0 = m.tensor({8192}, ark::FP16); - ark::Tensor t1 = m.tensor({8192}, ark::FP16); - ark::Tensor out = m.tensor({1024}, ark::FP16); - UNITTEST_THROW(m.add(t0, t1, out), ark::ModelError); - } - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_sub_fp32() { - ark::Model m; - ark::Tensor t0 = m.tensor({8192}, ark::FP32); - ark::Tensor t1 = m.tensor({8192}, ark::FP32); - ark::Tensor out = m.sub(t0, t1); - - auto result = - ark::op_test("sub_fp32", m, {t0, t1}, {out}, baseline_sub); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - return ark::unittest::SUCCESS; 
-} - -ark::unittest::State test_sub_invalid() { - { - ark::Model m; - ark::Tensor t0 = m.tensor({1024}, ark::FP16); - ark::Tensor t1 = m.tensor({1024}, ark::FP32); - UNITTEST_THROW(m.sub(t0, t1), ark::ModelError); - } - { - ark::Model m; - ark::Tensor t0 = m.tensor({8192}, ark::FP16); - ark::Tensor t1 = m.tensor({8192}, ark::FP16); - ark::Tensor out = m.tensor({8192}, ark::FP32); - UNITTEST_THROW(m.sub(t0, t1, out), ark::ModelError); - } - { - ark::Model m; - ark::Tensor t0 = m.tensor({8192}, ark::FP16); - ark::Tensor t1 = m.tensor({8192}, ark::FP16); - ark::Tensor out = m.tensor({1024}, ark::FP16); - UNITTEST_THROW(m.sub(t0, t1, out), ark::ModelError); - } - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_mul_fp32() { - ark::Model m; - ark::Tensor t0 = m.tensor({8192}, ark::FP32); - ark::Tensor t1 = m.tensor({8192}, ark::FP32); - ark::Tensor out = m.mul(t0, t1); - - auto result = - ark::op_test("mul_fp32", m, {t0, t1}, {out}, baseline_mul); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_mul_fp16() { - ark::Model m; - ark::Tensor t0 = m.tensor({8192}, ark::FP16); - ark::Tensor t1 = m.tensor({8192}, ark::FP16); - ark::Tensor out = m.mul(t0, t1); - - auto result = - ark::op_test("mul_fp16", m, {t0, t1}, {out}, baseline_mul); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_mul_overwrite() { - ark::Model m; - ark::Tensor t0 = m.tensor({8192}, ark::FP16); - ark::Tensor t1 = m.tensor({8192}, ark::FP16); - ark::Tensor out = m.mul(t0, t1, t1); - - auto result = ark::op_test("mul_overwrite", m, {t0, t1}, {out}, - baseline_mul); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_mul_broadcast() { - { - ark::Model m; - ark::Tensor t0 = m.tensor({4, 1024}, ark::FP16); - ark::Tensor t1 = m.tensor({1, 1024}, ark::FP16); - 
ark::Tensor out = m.mul(t0, t1); - - auto result = ark::op_test("mul_broadcast", m, {t0, t1}, {out}, - baseline_mul); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - } - { - ark::Model m; - ark::Tensor t0 = m.tensor({4, 1024}, ark::FP16); - ark::Tensor t1 = m.tensor({4, 1}, ark::FP16, {4, 2}); - ark::Tensor out = m.mul(t0, t1); - - auto result = ark::op_test("mul_broadcast", m, {t0, t1}, {out}, - baseline_mul); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - } - { - ark::Model m; - ark::Tensor t0 = m.tensor({3, 1, 1024}, ark::FP16); - ark::Tensor t1 = m.tensor({1, 4, 1}, ark::FP16, {1, 4, 2}); - ark::Tensor out = m.mul(t0, t1); - - auto result = ark::op_test("mul_broadcast", m, {t0, t1}, {out}, - baseline_mul); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - } - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_mul_invalid() { - { - ark::Model m; - ark::Tensor t0 = m.tensor({1024}, ark::FP16); - ark::Tensor t1 = m.tensor({1024}, ark::FP32); - UNITTEST_THROW(m.mul(t0, t1), ark::ModelError); - } - { - ark::Model m; - ark::Tensor t0 = m.tensor({8192}, ark::FP16); - ark::Tensor t1 = m.tensor({8192}, ark::FP16); - ark::Tensor out = m.tensor({8192}, ark::FP32); - UNITTEST_THROW(m.mul(t0, t1, out), ark::ModelError); - } - { - ark::Model m; - ark::Tensor t0 = m.tensor({8192}, ark::FP16); - ark::Tensor t1 = m.tensor({8192}, ark::FP16); - ark::Tensor out = m.tensor({1024}, ark::FP16); - UNITTEST_THROW(m.mul(t0, t1, out), ark::ModelError); - } - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_div_fp32() { - ark::Model m; - ark::Tensor t0 = m.tensor({8192}, ark::FP32); - ark::Tensor t1 = m.tensor({8192}, ark::FP32); - ark::Tensor out = m.div(t0, t1); - - auto result = - ark::op_test("div_fp32", m, {t0, t1}, {out}, baseline_div); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_div_invalid() { - { - ark::Model m; - 
ark::Tensor t0 = m.tensor({8192}, ark::FP32); - ark::Tensor t1 = m.tensor({8192}, ark::FP16); - UNITTEST_THROW(m.div(t0, t1), ark::ModelError); - } - { - ark::Model m; - ark::Tensor t0 = m.tensor({8192}, ark::FP32); - ark::Tensor t1 = m.tensor({8192}, ark::FP32); - ark::Tensor out = m.tensor({8192}, ark::FP16); - UNITTEST_THROW(m.div(t0, t1, out), ark::ModelError); - } - { - ark::Model m; - ark::Tensor t0 = m.tensor({8192}, ark::FP32); - ark::Tensor t1 = m.tensor({8192}, ark::FP32); - ark::Tensor out = m.tensor({1024}, ark::FP16); - UNITTEST_THROW(m.div(t0, t1, out), ark::ModelError); - } - return ark::unittest::SUCCESS; -} - -int main() { - ark::init(); - UNITTEST(test_add_fp32); - UNITTEST(test_add_fp16); - UNITTEST(test_add_bf16); - UNITTEST(test_add_overwrite); - UNITTEST(test_add_broadcast); - UNITTEST(test_add_offset); - UNITTEST(test_add_invalid); - UNITTEST(test_sub_fp32); - UNITTEST(test_sub_invalid); - UNITTEST(test_mul_fp32); - UNITTEST(test_mul_fp16); - UNITTEST(test_mul_overwrite); - UNITTEST(test_mul_broadcast); - UNITTEST(test_mul_invalid); - UNITTEST(test_div_fp32); - UNITTEST(test_div_invalid); - return ark::unittest::SUCCESS; -} diff --git a/ark/ops/ops_cast_test.cpp b/ark/ops/ops_cast_test.cpp deleted file mode 100644 index 8404e07f5..000000000 --- a/ark/ops/ops_cast_test.cpp +++ /dev/null @@ -1,322 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. 
- -#include "ops_test_common.hpp" - -template -void baseline_cast(std::vector &outputs, - const std::vector &output_shapes, - const std::vector &inputs, - const std::vector &, int) { - ToType *out = static_cast(outputs[0]); - FromType *input = static_cast(inputs[0]); - ark::Dims osh = output_shapes[0]; - for (ark::DimType i = 0; i < osh.nelems(); ++i) { - out[i] = ToType(input[i]); - } -}; - -template -void baseline_cast_from_byte(std::vector &outputs, - const std::vector &output_shapes, - const std::vector &inputs, - const std::vector &, int) { - ToType *out = static_cast(outputs[0]); - // input is a byte array, but force read it as ToType. - ToType *input = reinterpret_cast(inputs[0]); - ark::Dims osh = output_shapes[0]; - for (ark::DimType i = 0; i < osh.nelems(); ++i) { - out[i] = input[i]; - } -}; - -template -void baseline_cast_to_byte(std::vector &outputs, - const std::vector &, - const std::vector &inputs, - const std::vector &input_shapes, int) { - // output is a byte array, but force write it as FromType. 
- FromType *out = reinterpret_cast(outputs[0]); - FromType *input = static_cast(inputs[0]); - ark::Dims ish = input_shapes[0]; - for (ark::DimType i = 0; i < ish.nelems(); ++i) { - out[i] = input[i]; - } -}; - -ark::unittest::State test_cast_fp16_to_fp32() { - ark::Model m; - ark::Tensor t = m.tensor(ark::Dims(4, 2, 1024), ark::FP16); - ark::Tensor out = m.cast(t, ark::FP32); - - auto result = ark::op_test("cast_fp16_to_fp32", m, {t}, {out}, - baseline_cast); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_cast_fp16_to_int32() { - ark::Model m; - ark::Tensor t = m.tensor(ark::Dims(4, 2, 1024), ark::FP16); - ark::Tensor out = m.cast(t, ark::INT32); - - std::vector input_data(t.shape().nelems()); - for (size_t i = 0; i < input_data.size(); ++i) { - input_data[i] = ark::half_t(int((i + 1) % 1000)); - } - - auto result = - ark::op_test("cast_fp16_to_int32", m, {t}, {out}, - baseline_cast, {input_data.data()}); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_cast_fp32_to_fp16() { - ark::Model m; - ark::Tensor t = m.tensor(ark::Dims(4, 2, 1024), ark::FP32); - ark::Tensor out = m.cast(t, ark::FP16); - - auto result = ark::op_test("cast_fp32_to_fp16", m, {t}, {out}, - baseline_cast); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_cast_fp32_to_int32() { - ark::Model m; - ark::Tensor t = m.tensor(ark::Dims(4, 2, 1024), ark::FP32); - ark::Tensor out = m.cast(t, ark::INT32); - - std::vector input_data(t.shape().nelems()); - for (size_t i = 0; i < input_data.size(); ++i) { - input_data[i] = float((i + 1) % 1000); - } - - auto result = ark::op_test("cast_fp32_to_int32", m, {t}, {out}, - baseline_cast, {input_data.data()}); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - return ark::unittest::SUCCESS; -} - 
-ark::unittest::State test_cast_int32_to_fp32() { - ark::Model m; - ark::Tensor t = m.tensor(ark::Dims(4, 2, 1024), ark::INT32); - ark::Tensor out = m.cast(t, ark::FP32); - - std::vector input_data(t.shape().nelems()); - for (size_t i = 0; i < input_data.size(); ++i) { - input_data[i] = (i + 1) % 1000; - } - - auto result = ark::op_test("cast_int32_to_fp32", m, {t}, {out}, - baseline_cast, {input_data.data()}); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_cast_int32_to_fp16() { - ark::Model m; - ark::Tensor t = m.tensor(ark::Dims(4, 2, 1024), ark::INT32); - ark::Tensor out = m.cast(t, ark::FP16); - - std::vector input_data(t.shape().nelems()); - for (size_t i = 0; i < input_data.size(); ++i) { - input_data[i] = (i + 1) % 1000; - } - - auto result = - ark::op_test("cast_int32_to_fp16", m, {t}, {out}, - baseline_cast, {input_data.data()}); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_cast_byte_to_fp32() { - ark::Model m; - ark::Tensor t = m.tensor(ark::Dims(4, 2, 1024), ark::BYTE); - ark::Tensor out = m.cast(t, ark::FP32); - - // For preventing optimize-out - m.noop(t); - m.noop(out); - - auto result = ark::op_test("cast_byte_to_fp32", m, {t}, {out}, - baseline_cast_from_byte); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_cast_byte_to_fp16() { - ark::Model m; - ark::Tensor t = m.tensor(ark::Dims(4, 2, 1024), ark::BYTE); - ark::Tensor out = m.cast(t, ark::FP16); - - // For preventing optimize-out - m.noop(t); - m.noop(out); - - auto result = ark::op_test("cast_byte_to_fp16", m, {t}, {out}, - baseline_cast_from_byte); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_cast_byte_to_int32() { - ark::Model m; - ark::Tensor t = 
m.tensor(ark::Dims(4, 2, 1024), ark::BYTE); - ark::Tensor out = m.cast(t, ark::INT32); - - // For preventing optimize-out - m.noop(t); - m.noop(out); - - auto result = ark::op_test("cast_byte_to_int32", m, {t}, {out}, - baseline_cast_from_byte); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_cast_fp32_to_byte() { - ark::Model m; - ark::Tensor t = m.tensor(ark::Dims(4, 2, 1024), ark::FP32); - ark::Tensor out = m.cast(t, ark::BYTE); - - // For preventing optimize-out - m.noop(t); - m.noop(out); - - auto result = ark::op_test("cast_fp32_to_byte", m, {t}, {out}, - baseline_cast_to_byte); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_cast_fp16_to_byte() { - ark::Model m; - ark::Tensor t = m.tensor(ark::Dims(4, 2, 1024), ark::FP16); - ark::Tensor out = m.cast(t, ark::BYTE); - - // For preventing optimize-out - m.noop(t); - m.noop(out); - - auto result = ark::op_test("cast_fp16_to_byte", m, {t}, {out}, - baseline_cast_to_byte); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_cast_int32_to_byte() { - ark::Model m; - ark::Tensor t = m.tensor(ark::Dims(4, 2, 1024), ark::INT32); - ark::Tensor out = m.cast(t, ark::BYTE); - - // For preventing optimize-out - m.noop(t); - m.noop(out); - - auto result = ark::op_test("cast_int32_to_byte", m, {t}, {out}, - baseline_cast_to_byte); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_cast_bf16_to_float() { - ark::Model m; - ark::Tensor t = m.tensor(ark::Dims(4, 2, 1024), ark::BF16); - ark::Tensor out = m.cast(t, ark::FP32); - - auto result = ark::op_test("cast_bf16_to_float", m, {t}, {out}, - baseline_cast); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - return ark::unittest::SUCCESS; -} - 
-ark::unittest::State test_cast_float_to_bf16() { - ark::Model m; - ark::Tensor t = m.tensor(ark::Dims(4, 2, 1024), ark::FP32); - ark::Tensor out = m.cast(t, ark::BF16); - - auto result = ark::op_test("cast_float_to_bf16", m, {t}, {out}, - baseline_cast); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_cast_invalid() { - { - ark::Model m; - ark::Tensor t = m.tensor(ark::Dims(1), ark::BYTE); - UNITTEST_THROW(m.cast(t, ark::FP32), ark::ModelError); - } - { - ark::Model m; - ark::Tensor t0 = m.tensor(ark::Dims(4, 1), ark::BYTE); - m.cast(t0, ark::FP32); // ok - ark::Tensor t1 = m.tensor(ark::Dims(4, 1, 1), ark::BYTE); - m.cast(t1, ark::FP32); // ok - ark::Tensor t2 = m.tensor(ark::Dims(4, 1, 1, 1), ark::BYTE); - m.cast(t2, ark::FP32); // ok - ark::Tensor t3 = m.tensor(ark::Dims(7, 1), ark::BYTE); - UNITTEST_THROW(m.cast(t3, ark::FP32), ark::ModelError); - ark::Tensor t4 = m.tensor(ark::Dims(7, 1, 1), ark::BYTE); - UNITTEST_THROW(m.cast(t4, ark::FP32), ark::ModelError); - ark::Tensor t5 = m.tensor(ark::Dims(7, 1, 1, 1), ark::BYTE); - UNITTEST_THROW(m.cast(t5, ark::FP32), ark::ModelError); - } - { - ark::Model m; - ark::Tensor t0 = m.tensor({8, 1}, ark::BYTE); - m.cast(t0, ark::FP32); // ok - ark::Tensor t1 = m.tensor({8, 1}, ark::BYTE, {13, 1}, {0, 0}, {9, 1}); - UNITTEST_THROW(m.cast(t1, ark::FP32), ark::ModelError); - } - { - ark::Model m; - ark::Tensor t0 = m.tensor({8, 1}, ark::FP16); - ark::Tensor out = m.tensor({8, 1}, ark::INT32); - UNITTEST_THROW(m.cast(t0, ark::FP32, out), ark::ModelError); - } - { - ark::Model m; - ark::Tensor t0 = m.tensor({8, 1}, ark::FP16); - ark::Tensor out = m.tensor({4, 1}, ark::FP32); - UNITTEST_THROW(m.cast(t0, ark::FP32, out), ark::ModelError); - } - return ark::unittest::SUCCESS; -} - -int main() { - ark::init(); - UNITTEST(test_cast_fp16_to_fp32); - UNITTEST(test_cast_fp16_to_int32); - UNITTEST(test_cast_fp32_to_fp16); - 
UNITTEST(test_cast_fp32_to_int32); - UNITTEST(test_cast_int32_to_fp32); - UNITTEST(test_cast_int32_to_fp16); - UNITTEST(test_cast_byte_to_fp32); - UNITTEST(test_cast_byte_to_fp16); - UNITTEST(test_cast_byte_to_int32); - UNITTEST(test_cast_fp32_to_byte); - UNITTEST(test_cast_fp16_to_byte); - UNITTEST(test_cast_int32_to_byte); - UNITTEST(test_cast_bf16_to_float); - UNITTEST(test_cast_float_to_bf16); - UNITTEST(test_cast_invalid); - return ark::unittest::SUCCESS; -} diff --git a/ark/ops/ops_embedding_test.cpp b/ark/ops/ops_embedding_test.cpp deleted file mode 100644 index 222605296..000000000 --- a/ark/ops/ops_embedding_test.cpp +++ /dev/null @@ -1,122 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. - -#include -#include - -#include "ark/random.hpp" -#include "ops_test_common.hpp" - -template -void baseline_embedding(std::vector &outputs, - const std::vector &output_shapes, - const std::vector &inputs, - const std::vector &input_shapes, int) { - T *out = static_cast(outputs[0]); - int *in = static_cast(inputs[0]); - T *weight = static_cast(inputs[1]); - - ark::Dims osh = output_shapes[0].dims4(); - ark::Dims wsh = input_shapes[1].dims4(); - - assert(osh[3] == wsh[3]); - - int in_idx = 0; - for (ark::DimType n = 0; n < osh[0]; ++n) { - for (ark::DimType c = 0; c < osh[1]; ++c) { - for (ark::DimType h = 0; h < osh[2]; ++h) { - int weight_idx = in[in_idx++]; - if (weight_idx < 0) { - weight_idx += wsh[2]; - } - T *ptr = &weight[weight_idx * wsh[3]]; - for (ark::DimType w = 0; w < osh[3]; ++w) { - out[n * osh[1] * osh[2] * osh[3] + c * osh[2] * osh[3] + - h * osh[3] + w] = ptr[w]; - } - } - } - } -}; - -template -ark::unittest::State test_embedding() { - const int num_emb = 100; - const int emb_dim = 4096; - - ark::DataType weight_type; - if (std::is_same::value) { - weight_type = ark::FP32; - } else { - weight_type = ark::FP16; - } - - ark::Model m; - ark::Tensor ti = m.tensor(ark::Dims(8, 3, 64), ark::INT32); - ark::Tensor tw = 
m.tensor(ark::Dims(num_emb, emb_dim), weight_type); - ark::Tensor to = m.embedding(ti, tw); - - std::vector ti_data; - for (auto i = 0; i < ti.shape().nelems(); ++i) { - // Random indices in [0, num_emb) - int rand_idx = ark::rand() % num_emb; - if (i % 9 == 0) { - // test negative tokens (padding) - rand_idx = -rand_idx; - } - ti_data.push_back(rand_idx); - } - std::vector tw_data(tw.shape().nelems()); - for (auto i = 0; i < tw.shape().nelems(); ++i) { - tw_data[i] = ark::random(-1.0, 1.0); - } - std::string type_str = ""; - if (std::is_same::value) { - type_str = "fp32"; - } else if (std::is_same::value) { - type_str = "fp16"; - } else if (std::is_same::value) { - type_str = "bf16"; - } - auto result = - ark::op_test("embedding_" + type_str, m, {ti, tw}, {to}, - baseline_embedding, {ti_data.data(), tw_data.data()}); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_embedding_fp32() { return test_embedding(); } - -ark::unittest::State test_embedding_fp16() { - return test_embedding(); -} - -ark::unittest::State test_embedding_bf16() { - return test_embedding(); -} - -ark::unittest::State test_embedding_invalid() { - { - ark::Model m; - ark::Tensor ti = m.tensor(ark::Dims(4, 8, 3, 64), ark::INT32); - ark::Tensor tw = m.tensor(ark::Dims(100, 1024), ark::FP32); - UNITTEST_THROW(m.embedding(ti, tw), ark::ModelError); - } - { - ark::Model m; - ark::Tensor ti = m.tensor(ark::Dims(8, 3, 64), ark::INT32); - ark::Tensor tw = m.tensor(ark::Dims(2, 100, 1024), ark::FP32); - UNITTEST_THROW(m.embedding(ti, tw), ark::ModelError); - } - return ark::unittest::SUCCESS; -} - -int main() { - ark::init(); - UNITTEST(test_embedding_fp32); - UNITTEST(test_embedding_fp16); - UNITTEST(test_embedding_bf16); - UNITTEST(test_embedding_invalid); - return ark::unittest::SUCCESS; -} diff --git a/ark/ops/ops_math_test.cpp b/ark/ops/ops_math_test.cpp deleted file mode 100644 index f5774ab8e..000000000 --- 
a/ark/ops/ops_math_test.cpp +++ /dev/null @@ -1,366 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. - -#include - -#include "ark/model.hpp" -#include "ops_test_common.hpp" -#include "unittest/unittest_utils.h" - -float gelu(float x) { - return 0.5 * x * (1 + tanh(sqrt(2 / M_PI) * (x + 0.044715 * pow(x, 3)))); -} - -template -void baseline_gelu(std::vector &outputs, - const std::vector &output_shapes, - const std::vector &inputs, - const std::vector &, int) { - T *out = static_cast(outputs[0]); - T *input = static_cast(inputs[0]); - ark::Dims osh = output_shapes[0]; - for (ark::DimType i = 0; i < osh.nelems(); ++i) { - out[i] = gelu(input[i]); - } -}; - -template -void baseline_exp(std::vector &outputs, - const std::vector &output_shapes, - const std::vector &inputs, - const std::vector &, int) { - T *out = static_cast(outputs[0]); - T *input = static_cast(inputs[0]); - ark::Dims osh = output_shapes[0]; - for (ark::DimType i = 0; i < osh.nelems(); ++i) { - out[i] = std::exp(input[i]); - } -}; - -float relu(float x) { return x > 0 ? 
x : 0; } - -template -void baseline_relu(std::vector &outputs, - const std::vector &output_shapes, - const std::vector &inputs, - const std::vector &, int) { - T *out = static_cast(outputs[0]); - T *input = static_cast(inputs[0]); - ark::Dims osh = output_shapes[0]; - for (ark::DimType i = 0; i < osh.nelems(); ++i) { - out[i] = relu(input[i]); - } -}; - -template -void baseline_rsqrt(std::vector &outputs, - const std::vector &output_shapes, - const std::vector &inputs, - const std::vector &, int) { - T *out = static_cast(outputs[0]); - T *input = static_cast(inputs[0]); - ark::Dims osh = output_shapes[0]; - for (ark::DimType i = 0; i < osh.nelems(); ++i) { - out[i] = 1.0f / std::sqrt(input[i]); - } -}; - -float sigmoid(float x) { return 1 / (1 + std::exp(-x)); } - -template -void baseline_sigmoid(std::vector &outputs, - const std::vector &output_shapes, - const std::vector &inputs, - const std::vector &, int) { - T *out = static_cast(outputs[0]); - T *input = static_cast(inputs[0]); - ark::Dims osh = output_shapes[0]; - for (ark::DimType i = 0; i < osh.nelems(); ++i) { - out[i] = sigmoid(input[i]); - } -}; - -template -void baseline_sqrt(std::vector &outputs, - const std::vector &output_shapes, - const std::vector &inputs, - const std::vector &, int) { - T *out = static_cast(outputs[0]); - T *input = static_cast(inputs[0]); - ark::Dims osh = output_shapes[0]; - for (ark::DimType i = 0; i < osh.nelems(); ++i) { - out[i] = std::sqrt(input[i]); - } -}; - -ark::unittest::State test_gelu_fp32() { - ark::Model m; - ark::Tensor t = m.tensor({4, 2, 1024}, ark::FP32); - ark::Tensor out = m.gelu(t); - - auto result = - ark::op_test("gelu_fp32", m, {t}, {out}, baseline_gelu); - UNITTEST_LOG(result); - UNITTEST_TRUE(result.max_diff[0] < 1e-6f); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_gelu_bf16() { - ark::Model m; - ark::Tensor t = m.tensor({4, 2, 1024}, ark::BF16); - ark::Tensor out = m.gelu(t); - - auto result = ark::op_test("gelu_bf16", m, {t}, 
{out}, - baseline_gelu); - UNITTEST_LOG(result); - UNITTEST_TRUE(result.max_diff[0] < 1e-6f); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_gelu_invalid() { - { - ark::Model m; - ark::Tensor t = m.tensor({4, 2, 1024}, ark::BF16); - ark::Tensor out = m.tensor({4, 2, 1024}, ark::FP32); - UNITTEST_THROW(m.gelu(t, out), ark::ModelError); - } - { - ark::Model m; - ark::Tensor t = m.tensor({4, 2, 1024}, ark::BF16); - ark::Tensor out = m.tensor({4, 4, 1024}, ark::BF16); - UNITTEST_THROW(m.gelu(t, out), ark::ModelError); - } - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_exp_fp32() { - ark::Model m; - ark::Tensor t = m.tensor({4, 2, 1024}, ark::FP32); - ark::Tensor out = m.exp(t); - - auto result = ark::op_test("exp_fp32", m, {t}, {out}, baseline_exp); - UNITTEST_LOG(result); - UNITTEST_TRUE(result.max_diff[0] < 1e-5f); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_exp_fp16() { - ark::Model m; - ark::Tensor t = m.tensor({4, 2, 1024}, ark::FP16); - ark::Tensor out = m.exp(t); - - auto result = - ark::op_test("exp_fp16", m, {t}, {out}, baseline_exp); - UNITTEST_LOG(result); - UNITTEST_TRUE(result.max_diff[0] < 1e-2f); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_exp_bf16() { - ark::Model m; - ark::Tensor t = m.tensor({4, 2, 1024}, ark::BF16); - ark::Tensor out = m.exp(t); - - auto result = - ark::op_test("exp_bf16", m, {t}, {out}, baseline_exp); - UNITTEST_LOG(result); - UNITTEST_TRUE(result.max_diff[0] < 1e-2f); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_exp_invalid() { - { - ark::Model m; - ark::Tensor t = m.tensor({4, 2, 1024}, ark::BF16); - ark::Tensor out = m.tensor({4, 2, 1024}, ark::FP32); - UNITTEST_THROW(m.exp(t, out), ark::ModelError); - } - { - ark::Model m; - ark::Tensor t = m.tensor({4, 2, 1024}, ark::BF16); - ark::Tensor out = m.tensor({4, 4, 1024}, ark::BF16); - UNITTEST_THROW(m.exp(t, out), ark::ModelError); - } - return ark::unittest::SUCCESS; -} - 
-ark::unittest::State test_relu_fp32() { - ark::Model m; - ark::Tensor t = m.tensor({4, 2, 1024}, ark::FP32); - ark::Tensor out = m.relu(t); - - auto result = - ark::op_test("relu_fp32", m, {t}, {out}, baseline_relu); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_relu_fp16() { - ark::Model m; - ark::Tensor t = m.tensor({4, 2, 1024}, ark::FP16); - ark::Tensor out = m.relu(t); - - auto result = - ark::op_test("relu_fp16", m, {t}, {out}, baseline_relu); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_relu_bf16() { - ark::Model m; - ark::Tensor t = m.tensor({4, 2, 1024}, ark::BF16); - ark::Tensor out = m.relu(t); - - auto result = ark::op_test("relu_bf16", m, {t}, {out}, - baseline_relu); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_relu_invalid() { - { - ark::Model m; - ark::Tensor t = m.tensor({4, 2, 1024}, ark::BF16); - ark::Tensor out = m.tensor({4, 2, 1024}, ark::FP32); - UNITTEST_THROW(m.relu(t, out), ark::ModelError); - } - { - ark::Model m; - ark::Tensor t = m.tensor({4, 2, 1024}, ark::BF16); - ark::Tensor out = m.tensor({4, 4, 1024}, ark::BF16); - UNITTEST_THROW(m.relu(t, out), ark::ModelError); - } - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_math_rsqrt_fp32() { - ark::Model m; - ark::Tensor t = m.tensor({4, 2, 1024}, ark::FP32); - ark::Tensor out = m.rsqrt(t); - - auto result = - ark::op_test("math_rsqrt_fp32", m, {t}, {out}, baseline_rsqrt); - UNITTEST_LOG(result); - UNITTEST_TRUE(result.max_diff[0] < 1e-4f); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_math_rsqrt_fp16() { - ark::Model m; - ark::Tensor t = m.tensor({1, 64, 1}, ark::FP16); - ark::Tensor out = m.rsqrt(t); - - std::vector data(64, 4); - - auto result = ark::op_test("math_rsqrt_fp16", m, {t}, {out}, - 
baseline_rsqrt, {data.data()}); - UNITTEST_LOG(result); - UNITTEST_TRUE(result.max_diff[0] < 1e-4f); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_sigmoid_fp32() { - ark::Model m; - ark::Tensor t = m.tensor({4, 2, 1024}, ark::FP32); - ark::Tensor out = m.sigmoid(t); - - auto result = - ark::op_test("sigmoid_fp32", m, {t}, {out}, baseline_sigmoid); - UNITTEST_LOG(result); - UNITTEST_TRUE(result.max_diff[0] < 1e-5f); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_sigmoid_bf16() { - ark::Model m; - ark::Tensor t = m.tensor({4, 2, 1024}, ark::BF16); - ark::Tensor out = m.sigmoid(t); - - auto result = ark::op_test("sigmoid_bf16", m, {t}, {out}, - baseline_sigmoid); - UNITTEST_LOG(result); - UNITTEST_TRUE(result.max_diff[0] < 1e-2f); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_sigmoid_invalid() { - { - ark::Model m; - ark::Tensor t = m.tensor({4, 2, 1024}, ark::BF16); - ark::Tensor out = m.tensor({4, 2, 1024}, ark::FP32); - UNITTEST_THROW(m.sigmoid(t, out), ark::ModelError); - } - { - ark::Model m; - ark::Tensor t = m.tensor({4, 2, 1024}, ark::BF16); - ark::Tensor out = m.tensor({4, 4, 1024}, ark::BF16); - UNITTEST_THROW(m.sigmoid(t, out), ark::ModelError); - } - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_math_sqrt_fp32() { - ark::Model m; - ark::Tensor t = m.tensor({4, 2, 1024}, ark::FP32); - ark::Tensor out = m.sqrt(t); - - auto result = - ark::op_test("math_sqrt_fp32", m, {t}, {out}, baseline_sqrt); - UNITTEST_LOG(result); - UNITTEST_TRUE(result.max_diff[0] < 1e-6f); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_math_sqrt_fp16_small_last_dim() { - ark::Model m; - ark::Tensor t = m.tensor({4, 1024, 1}, ark::FP16, {4, 1024, 2}); - ark::Tensor out = m.sqrt(t); - - auto result = ark::op_test("math_sqrt_fp16_small_last_dim", m, {t}, {out}, - baseline_sqrt); - UNITTEST_LOG(result); - UNITTEST_TRUE(result.max_diff[0] < 1e-4f); - return ark::unittest::SUCCESS; -} - 
-ark::unittest::State test_math_sqrt_invalid() { - { - ark::Model model; - ark::Tensor input = model.tensor({1, 3, 16, 8192}, ark::FP32); - ark::Tensor output = model.tensor({1, 3, 16, 8192}, ark::FP16); - UNITTEST_THROW(model.sqrt(input, output), ark::ModelError); - } - { - ark::Model model; - ark::Tensor input = model.tensor({1, 3, 16, 8192}, ark::FP32); - ark::Tensor output = model.tensor({1, 3, 16, 1024}, ark::FP32); - UNITTEST_THROW(model.sqrt(input, output), ark::ModelError); - } - return ark::unittest::SUCCESS; -} - -int main() { - ark::init(); - UNITTEST(test_gelu_fp32); - UNITTEST(test_gelu_bf16); - UNITTEST(test_gelu_invalid); - UNITTEST(test_exp_fp32); - UNITTEST(test_exp_fp16); - UNITTEST(test_exp_invalid); - UNITTEST(test_relu_fp32); - UNITTEST(test_relu_fp16); - UNITTEST(test_relu_bf16); - UNITTEST(test_relu_invalid); - UNITTEST(test_math_rsqrt_fp32); - UNITTEST(test_math_rsqrt_fp16); - UNITTEST(test_sigmoid_fp32); - UNITTEST(test_sigmoid_bf16); - UNITTEST(test_sigmoid_invalid); - UNITTEST(test_math_sqrt_fp32); - UNITTEST(test_math_sqrt_fp16_small_last_dim); - UNITTEST(test_math_sqrt_invalid); - return ark::unittest::SUCCESS; -} diff --git a/ark/ops/ops_matmul_test.cpp b/ark/ops/ops_matmul_test.cpp deleted file mode 100644 index 11682ca49..000000000 --- a/ark/ops/ops_matmul_test.cpp +++ /dev/null @@ -1,589 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. 
- -#include - -#include "gpu/gpu.hpp" -#include "logging.hpp" -#include "model/model_node.hpp" -#include "model/model_op.hpp" -#include "ops_test_common.hpp" - -#if defined(ARK_CUDA) - -#include - -typedef cublasHandle_t blasHandle; -typedef cublasStatus_t blasStatus; -typedef cublasOperation_t blasOperation; -typedef cudaDataType blasDataType; -typedef cublasComputeType_t blasComputeType; -constexpr auto blasSuccess = CUBLAS_STATUS_SUCCESS; -constexpr auto BLAS_OP_N = CUBLAS_OP_N; -constexpr auto BLAS_OP_T = CUBLAS_OP_T; -constexpr auto BLAS_R_32F = CUDA_R_32F; -constexpr auto BLAS_R_16F = CUDA_R_16F; -constexpr auto BLAS_R_16BF = CUDA_R_16BF; -constexpr auto BLAS_COMPUTE_32F = CUBLAS_COMPUTE_32F; -constexpr auto BLAS_COMPUTE_32F_FAST_TF32 = CUBLAS_COMPUTE_32F_FAST_TF32; -constexpr auto BLAS_COMPUTE_16F = CUBLAS_COMPUTE_16F; - -inline auto blasGemmEx(blasHandle handle, blasOperation transA, - blasOperation transB, int m, int n, int k, - const void *alpha, const void *A, blasDataType Atype, - int lda, const void *B, blasDataType Btype, int ldb, - const void *beta, void *C, blasDataType Ctype, int ldc, - blasComputeType computeType) { - return cublasGemmEx(handle, transA, transB, m, n, k, alpha, A, Atype, lda, - B, Btype, ldb, beta, C, Ctype, ldc, computeType, - CUBLAS_GEMM_DEFAULT); -} - -inline auto blasGemmStridedBatchedEx( - blasHandle handle, blasOperation transA, blasOperation transB, int m, int n, - int k, const void *alpha, const void *A, blasDataType Atype, int lda, - int strideA, const void *B, blasDataType Btype, int ldb, int strideB, - const void *beta, void *C, blasDataType Ctype, int ldc, int strideC, - int batchCount, blasComputeType computeType) { - return cublasGemmStridedBatchedEx( - handle, transA, transB, m, n, k, alpha, A, Atype, lda, strideA, B, - Btype, ldb, strideB, beta, C, Ctype, ldc, strideC, batchCount, - computeType, CUBLAS_GEMM_DEFAULT); -} - -#elif defined(ARK_ROCM) - -#include - -typedef rocblas_handle blasHandle; -typedef 
rocblas_status blasStatus; -typedef rocblas_operation blasOperation; -typedef rocblas_datatype blasDataType; -typedef rocblas_datatype blasComputeType; -constexpr auto blasSuccess = rocblas_status_success; -constexpr auto BLAS_OP_N = rocblas_operation_none; -constexpr auto BLAS_OP_T = rocblas_operation_transpose; -constexpr auto BLAS_R_32F = rocblas_datatype_f32_r; -constexpr auto BLAS_R_16F = rocblas_datatype_f16_r; -constexpr auto BLAS_R_16BF = rocblas_datatype_bf16_r; -constexpr auto BLAS_COMPUTE_32F = rocblas_datatype_f32_r; -[[maybe_unused]] constexpr auto BLAS_COMPUTE_32F_FAST_TF32 = - rocblas_datatype_f32_r; -[[maybe_unused]] constexpr auto BLAS_COMPUTE_16F = rocblas_datatype_f16_r; - -inline auto blasGemmEx(blasHandle handle, blasOperation transA, - blasOperation transB, int m, int n, int k, - const void *alpha, const void *A, blasDataType Atype, - int lda, const void *B, blasDataType Btype, int ldb, - const void *beta, void *C, blasDataType Ctype, int ldc, - blasComputeType computeType) { - return rocblas_gemm_ex(handle, transA, transB, m, n, k, alpha, A, Atype, - lda, B, Btype, ldb, beta, C, Ctype, ldc, C, Ctype, - ldc, computeType, rocblas_gemm_algo_standard, 0, 0); -} - -inline auto blasGemmStridedBatchedEx( - blasHandle handle, blasOperation transA, blasOperation transB, int m, int n, - int k, const void *alpha, const void *A, blasDataType Atype, int lda, - int strideA, const void *B, blasDataType Btype, int ldb, int strideB, - const void *beta, void *C, blasDataType Ctype, int ldc, int strideC, - int batchCount, blasComputeType computeType) { - return rocblas_gemm_strided_batched_ex( - handle, transA, transB, m, n, k, alpha, A, Atype, lda, strideA, B, - Btype, ldb, strideB, beta, C, Ctype, ldc, strideC, C, Ctype, ldc, - strideC, batchCount, computeType, rocblas_gemm_algo_standard, 0, 0); -} - -#endif - -ARK_GPU_DEFINE_FUNC_ALIAS(blasCreate, cublasCreate, rocblas_create_handle); -ARK_GPU_DEFINE_FUNC_ALIAS(blasDestroy, cublasDestroy, 
rocblas_destroy_handle); - -class BlasHandle { - public: - BlasHandle() { - if (blasCreate(&handle_) != blasSuccess) { - throw std::runtime_error("Failed to create blas handle"); - } - } - - ~BlasHandle() { - if (blasDestroy(handle_) != blasSuccess) { - // do nothing. - } - } - - blasHandle get() const { return handle_; } - - private: - blasHandle handle_; -}; - -static BlasHandle globalBlasHandle; - -template -void blas_matmul(int m, int n, int k, const DataType *a, int lda, - const DataType *b, int ldb, DataType *c, int ldc, - int batch_size = 1) { - static_assert(std::is_same_v || - std::is_same_v || - std::is_same_v, - "Unsupported data type"); - - auto blasH = globalBlasHandle.get(); - blasStatus status; - blasOperation optypeA = (blasOperation)BlasOpTypeA; - blasOperation optypeB = (blasOperation)BlasOpTypeB; - -#if defined(ARK_CUDA) - using CompType = - typename std::conditional_t, - ark::half_t, float>; - blasComputeType ctype = - std::is_same_v - ? BLAS_COMPUTE_32F_FAST_TF32 - : (std::is_same_v ? BLAS_COMPUTE_16F - : BLAS_COMPUTE_32F); -#elif defined(ARK_ROCM) - // CK uses only fp32 compute type for fp16/bf16 - using CompType = float; - blasComputeType ctype = BLAS_COMPUTE_32F; -#endif - CompType alpha = 1; - CompType beta = 0; - - blasDataType dtype = - std::is_same_v - ? BLAS_R_32F - : (std::is_same_v ? 
BLAS_R_16F - : BLAS_R_16BF); - if (batch_size == 1) { - status = blasGemmEx(blasH, optypeB, optypeA, n, m, k, &alpha, b, dtype, - ldb, a, dtype, lda, &beta, c, dtype, ldc, ctype); - if (status != blasSuccess) { - throw std::runtime_error("Failed to call blasGemmEx"); - } - } else { - status = blasGemmStridedBatchedEx( - blasH, optypeB, optypeA, n, m, k, &alpha, b, dtype, ldb, n * k, a, - dtype, lda, k * m, &beta, c, dtype, ldc, n * m, batch_size, ctype); - if (status != blasSuccess) { - throw std::runtime_error("Failed to call blasGemmStridedBatchedEx"); - } - } -} - -template -void baseline_matmul_nn(std::vector &outputs, - const std::vector &output_shapes, - const std::vector &inputs, - const std::vector &input_shapes, int) { - auto out_shape_dims4 = output_shapes[0].dims4(); - - // baseline inputs & outputs have no padding - int m = out_shape_dims4[2]; - int n = out_shape_dims4[3]; - int k = input_shapes[0].dims4()[3]; - int lda = k; - int ldb = n; - int ldc = n; - - int batch_size = out_shape_dims4[0] * out_shape_dims4[1]; - - auto memA = ark::to_gpu(inputs[0], input_shapes[0].nelems() * sizeof(T)); - auto memB = ark::to_gpu(inputs[1], input_shapes[1].nelems() * sizeof(T)); - auto memC = ark::to_gpu(outputs[0], output_shapes[0].nelems() * sizeof(T)); - - T *devA = static_cast(memA.get()); - T *devB = static_cast(memB.get()); - T *devC = static_cast(memC.get()); - - blas_matmul(m, n, k, devA, lda, devB, ldb, devC, ldc, - batch_size); - ark::sync_gpu(); - - // copy back to host - ark::from_gpu(memC, outputs[0]); -} - -template -void baseline_matmul_nt(std::vector &outputs, - const std::vector &output_shapes, - const std::vector &inputs, - const std::vector &input_shapes, int) { - auto out_shape_dims4 = output_shapes[0].dims4(); - - // baseline inputs & outputs have no padding - int m = out_shape_dims4[2]; - int n = out_shape_dims4[3]; - int k = input_shapes[0].dims4()[3]; - int lda = k; - int ldb = k; - int ldc = n; - - int batch_size = out_shape_dims4[0] * 
out_shape_dims4[1]; - - auto memA = ark::to_gpu(inputs[0], input_shapes[0].nelems() * sizeof(T)); - auto memB = ark::to_gpu(inputs[1], input_shapes[1].nelems() * sizeof(T)); - auto memC = ark::to_gpu(outputs[0], output_shapes[0].nelems() * sizeof(T)); - - T *devA = static_cast(memA.get()); - T *devB = static_cast(memB.get()); - T *devC = static_cast(memC.get()); - - blas_matmul(m, n, k, devA, lda, devB, ldb, devC, ldc, - batch_size); - ark::sync_gpu(); - - // copy back to host - ark::from_gpu(memC, outputs[0]); -} - -template -void baseline_matmul_tn(std::vector &outputs, - const std::vector &output_shapes, - const std::vector &inputs, - const std::vector &input_shapes, int) { - auto out_shape_dims4 = output_shapes[0].dims4(); - - // baseline inputs & outputs have no padding - int m = out_shape_dims4[2]; - int n = out_shape_dims4[3]; - int k = input_shapes[0].dims4()[2]; - int lda = m; - int ldb = n; - int ldc = n; - - int batch_size = out_shape_dims4[0] * out_shape_dims4[1]; - - auto memA = ark::to_gpu(inputs[0], input_shapes[0].nelems() * sizeof(T)); - auto memB = ark::to_gpu(inputs[1], input_shapes[1].nelems() * sizeof(T)); - auto memC = ark::to_gpu(outputs[0], output_shapes[0].nelems() * sizeof(T)); - - T *devA = static_cast(memA.get()); - T *devB = static_cast(memB.get()); - T *devC = static_cast(memC.get()); - - blas_matmul(m, n, k, devA, lda, devB, ldb, devC, ldc, - batch_size); - ark::sync_gpu(); - - // copy back to host - ark::from_gpu(memC, outputs[0]); -} - -template -void baseline_matmul_tt(std::vector &outputs, - const std::vector &output_shapes, - const std::vector &inputs, - const std::vector &input_shapes, int) { - auto out_shape_dims4 = output_shapes[0].dims4(); - - // baseline inputs & outputs have no padding - int m = out_shape_dims4[2]; - int n = out_shape_dims4[3]; - int k = input_shapes[0].dims4()[2]; - int lda = m; - int ldb = k; - int ldc = n; - - int batch_size = out_shape_dims4[0] * out_shape_dims4[1]; - - auto memA = 
ark::to_gpu(inputs[0], input_shapes[0].nelems() * sizeof(T)); - auto memB = ark::to_gpu(inputs[1], input_shapes[1].nelems() * sizeof(T)); - auto memC = ark::to_gpu(outputs[0], output_shapes[0].nelems() * sizeof(T)); - - T *devA = static_cast(memA.get()); - T *devB = static_cast(memB.get()); - T *devC = static_cast(memC.get()); - - blas_matmul(m, n, k, devA, lda, devB, ldb, devC, ldc, - batch_size); - ark::sync_gpu(); - - // copy back to host - ark::from_gpu(memC, outputs[0]); -} - -ark::unittest::State test_matmul_model() { - // Hidden dimension of the dense layer. - unsigned int units = 1024; - // Input dimension of the dense layer. - unsigned int in_dim = 1024; - // Extra dimension of the input. CHANNEL=1 for 2D inputs. - unsigned int channel = 128; - // Batch size of the input. - unsigned int batch_size = 1; - - ark::Model m; - ark::Tensor input = m.tensor({batch_size, channel, in_dim}, ark::FP16); - ark::Tensor weight = m.tensor({in_dim, units}, ark::FP16); - m.matmul(input, weight); - - UNITTEST_TRUE(m.verify()); - auto compressed = m.compress(); - UNITTEST_TRUE(compressed.verify()); - - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_matmul_fp16() { - { - ark::Model m; - ark::Tensor a = m.tensor(ark::Dims(128, 64), ark::FP16); - ark::Tensor b = m.tensor(ark::Dims(64, 256), ark::FP16); - ark::Tensor c = m.matmul(a, b); - - auto result = ark::op_test("matmul_fp16", m, {a, b}, {c}, - baseline_matmul_nn); - UNITTEST_LOG(result); - UNITTEST_TRUE(result.max_diff[0] < - ark::reduction_abs_error_bound(0.1f, 64)); - } - { - ark::Model m; - ark::Tensor a = m.tensor(ark::Dims(4096, 2048), ark::FP16); - ark::Tensor b = m.tensor(ark::Dims(2048, 16384), ark::FP16); - ark::Tensor c = m.matmul(a, b); - - std::vector p_ones_a(a.shape().nelems(), - ark::half_t(0.1f)); - std::vector p_ones_b(b.shape().nelems(), - ark::half_t(0.1f)); - - auto result = ark::op_test("matmul_fp16", m, {a, b}, {c}, - baseline_matmul_nn, - {p_ones_a.data(), p_ones_b.data()}); - 
UNITTEST_LOG(result); - UNITTEST_TRUE(result.max_diff[0] < - ark::reduction_abs_error_bound(0.1f, 2048)); - } - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_matmul_fp32() { - { - ark::Model m; - ark::Tensor a = m.tensor(ark::Dims(128, 64), ark::FP32); - ark::Tensor b = m.tensor(ark::Dims(64, 256), ark::FP32); - ark::Tensor c = m.matmul(a, b); - - auto result = ark::op_test("matmul_fp32", m, {a, b}, {c}, - baseline_matmul_nn); - UNITTEST_LOG(result); - UNITTEST_TRUE(result.max_diff[0] < - ark::reduction_abs_error_bound(0.1f, 64)); - } - { - ark::Model m; - ark::Tensor a = m.tensor(ark::Dims(4096, 8192), ark::FP32); - ark::Tensor b = m.tensor(ark::Dims(8192, 16384), ark::FP32); - ark::Tensor c = m.matmul(a, b); - - std::vector p_ones_a(a.shape().nelems(), float(0.1f)); - std::vector p_ones_b(b.shape().nelems(), float(0.1f)); - - auto result = ark::op_test("matmul_fp32", m, {a, b}, {c}, - baseline_matmul_nn, - {p_ones_a.data(), p_ones_b.data()}); - UNITTEST_LOG(result); - // TODO: #199 -#if defined(ARK_CUDA) - UNITTEST_TRUE(result.max_diff[0] < - ark::reduction_abs_error_bound(0.1f, 8192)); -#endif // defined(ARK_CUDA) - } - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_matmul_bf16() { - { - ark::Model m; - ark::Tensor a = m.tensor(ark::Dims(128, 64), ark::BF16); - ark::Tensor b = m.tensor(ark::Dims(64, 256), ark::BF16); - ark::Tensor c = m.matmul(a, b); - - auto result = ark::op_test("matmul_bf16", m, {a, b}, {c}, - baseline_matmul_nn); - UNITTEST_LOG(result); - UNITTEST_TRUE( - result.max_diff[0] < - ark::reduction_abs_error_bound(0.1f, 64)); - } - { - ark::Model m; - ark::Tensor a = m.tensor(ark::Dims(4096, 256), ark::BF16); - ark::Tensor b = m.tensor(ark::Dims(256, 16384), ark::BF16); - ark::Tensor c = m.matmul(a, b); - - auto result = ark::op_test("matmul_bf16", m, {a, b}, {c}, - baseline_matmul_nn); - UNITTEST_LOG(result); - UNITTEST_TRUE( - result.max_diff[0] < - ark::reduction_abs_error_bound(0.1f, 256)); - } - return 
ark::unittest::SUCCESS; -} - -ark::unittest::State test_matmul_fp16_nt() { - { - ark::Model m; - ark::Tensor a = m.tensor(ark::Dims(128, 64), ark::FP16); - ark::Tensor b = m.tensor(ark::Dims(256, 64), ark::FP16); - ark::Tensor c = m.matmul(a, b, ark::NullTensor, false, true); - - auto result = ark::op_test("matmul_fp16_nt", m, {a, b}, {c}, - baseline_matmul_nt); - UNITTEST_LOG(result); - UNITTEST_TRUE(result.max_diff[0] < - ark::reduction_abs_error_bound(0.1f, 64)); - } - { - ark::Model m; - ark::Tensor a = m.tensor(ark::Dims(4096, 2048), ark::FP16); - ark::Tensor b = m.tensor(ark::Dims(16384, 2048), ark::FP16); - ark::Tensor c = m.matmul(a, b, ark::NullTensor, false, true); - - auto result = ark::op_test("matmul_fp16_nt", m, {a, b}, {c}, - baseline_matmul_nt); - UNITTEST_LOG(result); - UNITTEST_TRUE(result.max_diff[0] < - ark::reduction_abs_error_bound(0.1f, 2048)); - } - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_matmul_fp16_tn() { - { - ark::Model m; - ark::Tensor a = m.tensor(ark::Dims(64, 128), ark::FP16); - ark::Tensor b = m.tensor(ark::Dims(64, 256), ark::FP16); - ark::Tensor c = m.matmul(a, b, ark::NullTensor, true, false); - - auto result = ark::op_test("matmul_fp16_tn", m, {a, b}, {c}, - baseline_matmul_tn); - UNITTEST_LOG(result); - UNITTEST_TRUE(result.max_diff[0] < - ark::reduction_abs_error_bound(0.1f, 64)); - } - { - ark::Model m; - ark::Tensor a = m.tensor(ark::Dims(2048, 4096), ark::FP16); - ark::Tensor b = m.tensor(ark::Dims(2048, 16384), ark::FP16); - ark::Tensor c = m.matmul(a, b, ark::NullTensor, true, false); - - auto result = ark::op_test("matmul_fp16_tn", m, {a, b}, {c}, - baseline_matmul_tn); - UNITTEST_LOG(result); - UNITTEST_TRUE(result.max_diff[0] < - ark::reduction_abs_error_bound(0.1f, 2048)); - } - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_matmul_fp16_tt() { - { - ark::Model m; - ark::Tensor a = m.tensor(ark::Dims(64, 128), ark::FP16); - ark::Tensor b = m.tensor(ark::Dims(256, 64), ark::FP16); 
- ark::Tensor c = m.matmul(a, b, ark::NullTensor, true, true); - - auto result = ark::op_test("matmul_fp16_tt", m, {a, b}, {c}, - baseline_matmul_tt); - UNITTEST_LOG(result); - UNITTEST_TRUE(result.max_diff[0] < - ark::reduction_abs_error_bound(0.1f, 64)); - } - { - ark::Model m; - ark::Tensor a = m.tensor(ark::Dims(2048, 4096), ark::FP16); - ark::Tensor b = m.tensor(ark::Dims(16384, 2048), ark::FP16); - ark::Tensor c = m.matmul(a, b, ark::NullTensor, true, true); - - auto result = ark::op_test("matmul_fp16_tt", m, {a, b}, {c}, - baseline_matmul_tt); - UNITTEST_LOG(result); - UNITTEST_TRUE(result.max_diff[0] < - ark::reduction_abs_error_bound(0.1f, 2048)); - } - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_matmul_fp16_batched() { - ark::Model m; - ark::Tensor a = m.tensor(ark::Dims(3, 7, 128, 128), ark::FP16); - ark::Tensor b = m.tensor(ark::Dims(3, 7, 128, 256), ark::FP16); - ark::Tensor c = m.matmul(a, b); - - auto result = ark::op_test("matmul_fp16_batched", m, {a, b}, {c}, - baseline_matmul_nn); - UNITTEST_LOG(result); - UNITTEST_TRUE(result.max_diff[0] < - ark::reduction_abs_error_bound(0.1f, 128)); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_matmul_fp16_batched_padded() { - ark::Model m; - ark::Tensor a = - m.tensor({3, 7, 2, 9}, ark::FP16, {3, 7, 128, 64}, {}, {3, 7, 128, 64}); - ark::Tensor b = - m.tensor({3, 7, 9, 2}, ark::FP16, {3, 7, 64, 256}, {}, {3, 7, 64, 256}); - ark::Tensor c = m.tensor({3, 7, 2, 2}, ark::FP16, {3, 7, 128, 256}, {}, - {3, 7, 128, 256}); - m.matmul(a, b, c); - - auto result = ark::op_test("matmul_fp16_batched_padded", m, {a, b}, {c}, - baseline_matmul_nn); - UNITTEST_LOG(result); - UNITTEST_TRUE(result.max_diff[0] < - ark::reduction_abs_error_bound(0.1f, 9)); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_matmul_fp16_offset() { - ark::Model m; - ark::Tensor a = - m.tensor({1, 128, 64}, ark::FP16, {1, 128, 256}, {0, 0, 64}); - ark::Tensor b = m.tensor({1, 64, 128}, ark::FP16, 
{1, 128, 256}, {0, 64, 0}, - {1, 64, 256}); - ark::Tensor c = m.tensor({1, 128, 128}, ark::FP16, {2, 256, 384}, - {1, 64, 128}, {1, 128, 256}); - m.matmul(a, b, c); - - auto result = ark::op_test("matmul_fp16_offset", m, {a, b}, {c}, - baseline_matmul_nn); - UNITTEST_LOG(result); - UNITTEST_TRUE(result.max_diff[0] < - ark::reduction_abs_error_bound(0.1f, 64)); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_matmul_invalid() { - ark::Model m; - ark::Tensor a = m.tensor(ark::Dims(128, 64), ark::FP16); - ark::Tensor b = m.tensor(ark::Dims(128, 256), ark::FP16); - UNITTEST_THROW(m.matmul(a, b), ark::ModelError); - - ark::Tensor c = m.tensor(ark::Dims(3, 3, 128, 128), ark::FP16); - ark::Tensor d = m.tensor(ark::Dims(2, 3, 128, 128), ark::FP16); - UNITTEST_THROW(m.matmul(c, d), ark::ModelError); - return ark::unittest::SUCCESS; -} - -int main() { - ark::init(); - UNITTEST(test_matmul_model); - UNITTEST(test_matmul_fp16); - UNITTEST(test_matmul_fp32); - UNITTEST(test_matmul_bf16); - UNITTEST(test_matmul_fp16_nt); - UNITTEST(test_matmul_fp16_tn); - UNITTEST(test_matmul_fp16_tt); - UNITTEST(test_matmul_fp16_batched); - UNITTEST(test_matmul_fp16_batched_padded); - UNITTEST(test_matmul_fp16_offset); - UNITTEST(test_matmul_invalid); - return ark::unittest::SUCCESS; -} diff --git a/ark/ops/ops_reduce.cpp b/ark/ops/ops_reduce.cpp index 78dd9d7e6..55c87aa1e 100644 --- a/ark/ops/ops_reduce.cpp +++ b/ark/ops/ops_reduce.cpp @@ -49,7 +49,7 @@ ModelOpReduce::ModelOpReduce(const std::string &type_name, ModelTensorRef input, } std::string ModelOpReduce::impl_name(const Json &config) const { - check_fields_config(config, {"NumWarps", "SramBytes", "ImplType"}); + check_fields_config(config, {"NumWarps", "SramBytes", "ImplType", "Tile"}); check_fields_args(args_, {"Axis", "KeepDim"}); std::string red_type; @@ -92,6 +92,15 @@ std::string ModelOpReduce::impl_name(const Json &config) const { output_shape.insert(axis, 1); } + Dims unit_out_dims( + config.at("Tile").get>()); + 
auto udims4 = unit_out_dims.dims4(); + if (udims4[axis] != 1) { + ERR(PlanError, + "Tile dimension along reduce axis (", axis, + ") must be 1, got ", udims4[axis]); + } + return function_name_string( "reduce_" + impl_type + "_" + red_type, { @@ -99,7 +108,7 @@ std::string ModelOpReduce::impl_name(const Json &config) const { vec_string(read_tensors_[0]->shape().dims4()), vec_string(output_strides.dims4()), vec_string(output_shape.dims4()), - vec_string(Dims(1, 1, 1, 1)), + vec_string(udims4), std::to_string(num_warps), std::to_string(sram_bytes), std::to_string(axis), @@ -122,6 +131,7 @@ Json ModelOpReduce::default_config([[maybe_unused]] const ArchRef arch) const { config["ImplType"] = "ElementWise"; config["SramBytes"] = 0; } + config["Tile"] = {1, 1, 1, 1}; config["NumTasks"] = result_tensors_[0]->shape().nelems(); return config; } diff --git a/ark/ops/ops_reduce_test.cpp b/ark/ops/ops_reduce_test.cpp deleted file mode 100644 index 637c8daec..000000000 --- a/ark/ops/ops_reduce_test.cpp +++ /dev/null @@ -1,472 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. 
- -#include -#include -#include - -#include "ops_test_common.hpp" - -template -void baseline_reduce_sum_axis0(std::vector &outputs, - const std::vector &output_shapes, - const std::vector &inputs, - const std::vector &input_shapes, - int) { - T *out = static_cast(outputs[0]); - T *input = static_cast(inputs[0]); - - ark::Dims osh = output_shapes[0]; - ark::Dims ish = input_shapes[0].dims4(); - - if (KeepDim) { - assert(osh[0] == 1); - } else { - osh.insert(0, 1); - } - osh = osh.dims4(); - - for (ark::DimType c = 0; c < ish[1]; ++c) { - for (ark::DimType h = 0; h < ish[2]; ++h) { - for (ark::DimType w = 0; w < ish[3]; ++w) { - float sum = 0; - for (ark::DimType n = 0; n < ish[0]; ++n) { - sum += float(input[n * ish[1] * ish[2] * ish[3] + - c * ish[2] * ish[3] + h * ish[3] + w]); - } - out[c * osh[2] * osh[3] + h * osh[3] + w] = T(sum); - } - } - } -} - -template -void baseline_reduce_sum_axis1(std::vector &outputs, - const std::vector &output_shapes, - const std::vector &inputs, - const std::vector &input_shapes, - int) { - T *out = static_cast(outputs[0]); - T *input = static_cast(inputs[0]); - - ark::Dims osh = output_shapes[0]; - ark::Dims ish = input_shapes[0].dims4(); - - if (KeepDim) { - assert(osh[1] == 1); - } else { - osh.insert(1, 1); - } - osh = osh.dims4(); - - for (ark::DimType n = 0; n < ish[0]; ++n) { - for (ark::DimType h = 0; h < ish[2]; ++h) { - for (ark::DimType w = 0; w < ish[3]; ++w) { - float sum = 0; - for (ark::DimType c = 0; c < ish[1]; ++c) { - sum += float(input[n * ish[1] * ish[2] * ish[3] + - c * ish[2] * ish[3] + h * ish[3] + w]); - } - out[n * osh[1] * osh[2] * osh[3] + h * osh[3] + w] = T(sum); - } - } - } -} - -template -void baseline_reduce_sum_axis2(std::vector &outputs, - const std::vector &output_shapes, - const std::vector &inputs, - const std::vector &input_shapes, - int) { - T *out = static_cast(outputs[0]); - T *input = static_cast(inputs[0]); - - ark::Dims osh = output_shapes[0]; - ark::Dims ish = input_shapes[0].dims4(); - 
- if (KeepDim) { - assert(osh[2] == 1); - } else { - osh.insert(2, 1); - } - osh = osh.dims4(); - - for (ark::DimType n = 0; n < ish[0]; ++n) { - for (ark::DimType c = 0; c < ish[1]; ++c) { - for (ark::DimType w = 0; w < ish[3]; ++w) { - float sum = 0; - for (ark::DimType h = 0; h < ish[2]; ++h) { - sum += float(input[n * ish[1] * ish[2] * ish[3] + - c * ish[2] * ish[3] + h * ish[3] + w]); - } - out[n * osh[1] * osh[2] * osh[3] + c * osh[2] * osh[3] + w] = - T(sum); - } - } - } -}; - -template -void baseline_reduce_sum_axis3(std::vector &outputs, - const std::vector &output_shapes, - const std::vector &inputs, - const std::vector &input_shapes, - int) { - T *out = static_cast(outputs[0]); - T *input = static_cast(inputs[0]); - - ark::Dims osh = output_shapes[0]; - ark::Dims ish = input_shapes[0].dims4(); - - if (KeepDim) { - assert(osh[3] == 1); - } else { - osh.insert(3, 1); - } - osh = osh.dims4(); - - for (ark::DimType n = 0; n < ish[0]; ++n) { - for (ark::DimType c = 0; c < ish[1]; ++c) { - for (ark::DimType h = 0; h < ish[2]; ++h) { - float sum = 0; - for (ark::DimType w = 0; w < ish[3]; ++w) { - sum += float(input[n * ish[1] * ish[2] * ish[3] + - c * ish[2] * ish[3] + h * ish[3] + w]); - } - out[n * osh[1] * osh[2] * osh[3] + c * osh[2] * osh[3] + - h * osh[3]] = T(sum); - } - } - } -}; - -template -void baseline_reduce_max_axis3(std::vector &outputs, - const std::vector &output_shapes, - const std::vector &inputs, - const std::vector &input_shapes, - int) { - T *out = static_cast(outputs[0]); - T *input = static_cast(inputs[0]); - - ark::Dims osh = output_shapes[0]; - ark::Dims ish = input_shapes[0].dims4(); - - if (KeepDim) { - assert(osh[3] == 1); - } else { - osh.insert(3, 1); - } - osh = osh.dims4(); - - for (ark::DimType n = 0; n < ish[0]; ++n) { - for (ark::DimType c = 0; c < ish[1]; ++c) { - for (ark::DimType h = 0; h < ish[2]; ++h) { - float max_val = std::numeric_limits::lowest(); - for (ark::DimType w = 0; w < ish[3]; ++w) { - float val = - 
float(input[n * ish[1] * ish[2] * ish[3] + - c * ish[2] * ish[3] + h * ish[3] + w]); - max_val = std::max(max_val, val); - } - out[n * osh[1] * osh[2] * osh[3] + c * osh[2] * osh[3] + - h * osh[3]] = T(max_val); - } - } - } -}; - -template -void baseline_reduce_mean_axis3(std::vector &outputs, - const std::vector &output_shapes, - const std::vector &inputs, - const std::vector &input_shapes, - int) { - T *out = static_cast(outputs[0]); - T *input = static_cast(inputs[0]); - - ark::Dims osh = output_shapes[0]; - ark::Dims ish = input_shapes[0].dims4(); - - if (KeepDim) { - assert(osh[3] == 1); - } else { - osh.insert(3, 1); - } - osh = osh.dims4(); - - for (ark::DimType n = 0; n < ish[0]; ++n) { - for (ark::DimType c = 0; c < ish[1]; ++c) { - for (ark::DimType h = 0; h < ish[2]; ++h) { - float mean = 0; - for (ark::DimType w = 0; w < ish[3]; ++w) { - mean += float(input[n * ish[1] * ish[2] * ish[3] + - c * ish[2] * ish[3] + h * ish[3] + w]); - } - mean /= ish[3]; - out[n * osh[1] * osh[2] * osh[3] + c * osh[2] * osh[3] + - h * osh[3]] = T(mean); - } - } - } -}; - -ark::unittest::State test_reduce_sum_axis0() { - ark::Model m; - ark::Tensor t = m.tensor(ark::Dims(7, 2, 4, 1024), ark::FP32); - ark::Tensor out = m.reduce_sum(t, /*axis=*/0); - - auto result = ark::op_test("reduce_sum_axis0", m, {t}, {out}, - baseline_reduce_sum_axis0); - UNITTEST_LOG(result); - UNITTEST_TRUE(result.max_diff[0] < - ark::reduction_abs_error_bound(0.1, t.shape()[0])); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_reduce_sum_axis1() { - ark::Model m; - ark::Tensor t = m.tensor(ark::Dims(1, 2, 4, 1024), ark::FP32); - ark::Tensor out = m.reduce_sum(t, /*axis=*/1); - - auto result = ark::op_test("reduce_sum_axis1", m, {t}, {out}, - baseline_reduce_sum_axis1); - UNITTEST_LOG(result); - UNITTEST_TRUE(result.max_diff[0] < - ark::reduction_abs_error_bound(0.1, t.shape()[1])); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_reduce_sum_axis2() { - ark::Model m; - 
ark::Tensor t = m.tensor(ark::Dims(1, 1, 7, 8192), ark::FP32); - ark::Tensor out = m.reduce_sum(t, /*axis=*/2); - - auto result = ark::op_test("reduce_sum_axis2", m, {t}, {out}, - baseline_reduce_sum_axis2); - UNITTEST_LOG(result); - UNITTEST_TRUE(result.max_diff[0] < - ark::reduction_abs_error_bound(0.1, t.shape()[2])); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_reduce_sum_axis3() { - ark::Model m; - ark::Tensor t = m.tensor(ark::Dims(1, 1, 2, 8192), ark::FP32); - ark::Tensor out = m.reduce_sum(t, /*axis=*/3); - - auto result = ark::op_test("reduce_sum_axis3", m, {t}, {out}, - baseline_reduce_sum_axis3); - UNITTEST_LOG(result); - UNITTEST_TRUE(result.max_diff[0] < - ark::reduction_abs_error_bound(0.1, t.shape()[3])); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_reduce_sum_axis3_padded() { - ark::Model m; - ark::Tensor t = m.tensor(ark::Dims(1, 1, 2, 8192), ark::FP32); - ark::Tensor out = - m.tensor(ark::Dims(1, 1, 2, 1), ark::FP32, ark::Dims(1, 1, 2, 32)); - out = m.reduce_sum(t, /*axis=*/3, true, out); - - auto result = ark::op_test("reduce_sum_axis3_padded", m, {t}, {out}, - baseline_reduce_sum_axis3); - UNITTEST_LOG(result); - UNITTEST_TRUE(result.max_diff[0] < - ark::reduction_abs_error_bound(0.1, t.shape()[3])); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_reduce_sum_fp16() { - { - ark::Model m; - ark::Tensor t = m.tensor(ark::Dims(7, 2, 4, 1024), ark::FP16); - ark::Tensor out = m.reduce_sum(t, /*axis=*/0); - - auto result = ark::op_test("reduce_sum_fp16_axis0", m, {t}, {out}, - baseline_reduce_sum_axis0); - UNITTEST_LOG(result); - UNITTEST_TRUE( - result.max_diff[0] < - ark::reduction_abs_error_bound(0.1, t.shape()[0])); - } - { - ark::Model m; - ark::Tensor t = m.tensor(ark::Dims(7, 2, 4, 1024), ark::FP16); - ark::Tensor out = m.reduce_sum(t, /*axis=*/3); - - auto result = ark::op_test("reduce_sum_fp16_axis3", m, {t}, {out}, - baseline_reduce_sum_axis3); - UNITTEST_LOG(result); - UNITTEST_TRUE( - 
result.max_diff[0] < - ark::reduction_abs_error_bound(0.1, t.shape()[3])); - } - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_reduce_sum_bf16() { - std::vector data_vec(7 * 2 * 4 * 256); - for (size_t i = 0; i < data_vec.size(); ++i) { - data_vec[i] = ark::bf16((i % 1000) * 1e-4f); - } - - { - ark::Model m; - ark::Tensor t = m.tensor(ark::Dims(7, 2, 4, 256), ark::BF16); - ark::Tensor out = m.reduce_sum(t, /*axis=*/0); - - auto result = ark::op_test("reduce_sum_bf16_axis0", m, {t}, {out}, - baseline_reduce_sum_axis0, - {data_vec.data()}); - UNITTEST_LOG(result); - UNITTEST_TRUE( - result.max_diff[0] < - ark::reduction_abs_error_bound(0.1, t.shape()[0])); - } - { - ark::Model m; - ark::Tensor t = m.tensor(ark::Dims(7, 2, 4, 256), ark::BF16); - ark::Tensor out = m.reduce_sum(t, /*axis=*/3); - - auto result = ark::op_test("reduce_sum_bf16_axis3", m, {t}, {out}, - baseline_reduce_sum_axis3, - {data_vec.data()}); - UNITTEST_LOG(result); - UNITTEST_TRUE( - result.max_diff[0] < - ark::reduction_abs_error_bound(0.1, t.shape()[3])); - } - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_reduce_sum_fp16_no_keepdims() { - { - ark::Model m; - ark::Tensor t = m.tensor(ark::Dims(7, 2, 4, 1024), ark::FP16); - ark::Tensor out = m.reduce_sum(t, /*axis=*/0, false); - - UNITTEST_EQ(out.shape(), ark::Dims(2, 4, 1024)); - - auto result = - ark::op_test("reduce_sum_fp16_axis0", m, {t}, {out}, - baseline_reduce_sum_axis0); - UNITTEST_LOG(result); - UNITTEST_TRUE( - result.max_diff[0] < - ark::reduction_abs_error_bound(0.1, t.shape()[0])); - } - { - ark::Model m; - ark::Tensor t = m.tensor(ark::Dims(7, 2, 4, 1024), ark::FP16); - ark::Tensor out = m.reduce_sum(t, /*axis=*/3, false); - - UNITTEST_EQ(out.shape(), ark::Dims(7, 2, 4)); - - auto result = - ark::op_test("reduce_sum_fp16_axis3", m, {t}, {out}, - baseline_reduce_sum_axis3); - UNITTEST_LOG(result); - UNITTEST_TRUE( - result.max_diff[0] < - ark::reduction_abs_error_bound(0.1, t.shape()[3])); - } - 
return ark::unittest::SUCCESS; -} - -ark::unittest::State test_reduce_sum_invalid() { - { - ark::Model m; - ark::Tensor t = m.tensor(ark::Dims(7, 2, 4, 1024), ark::BF16); - ark::Tensor out = m.tensor(ark::Dims(1, 2, 4, 1024), ark::FP32); - UNITTEST_THROW(m.reduce_sum(t, /*axis=*/0, true, out), ark::ModelError); - } - { - ark::Model m; - ark::Tensor t = m.tensor(ark::Dims(7, 2, 4, 1024), ark::BF16); - ark::Tensor out = m.tensor(ark::Dims(7, 2, 4, 1), ark::FP32); - UNITTEST_THROW(m.reduce_sum(t, /*axis=*/3, true, out), ark::ModelError); - } - { - ark::Model m; - ark::Tensor t = m.tensor(ark::Dims(7, 2, 4, 1024), ark::BF16); - ark::Tensor out = m.tensor(ark::Dims(1, 2, 4, 512), ark::BF16); - UNITTEST_THROW(m.reduce_sum(t, /*axis=*/0, true, out), ark::ModelError); - } - { - ark::Model m; - ark::Tensor t = m.tensor(ark::Dims(7, 2, 4, 1024), ark::BF16); - ark::Tensor out = m.tensor(ark::Dims(7, 1, 4, 1), ark::BF16); - UNITTEST_THROW(m.reduce_sum(t, /*axis=*/3, true, out), ark::ModelError); - } - { - ark::Model m; - ark::Tensor t = m.tensor(ark::Dims(7, 2, 4, 1024), ark::BF16); - ark::Tensor out = m.tensor(ark::Dims(3, 2, 4, 1024), ark::BF16); - UNITTEST_THROW(m.reduce_sum(t, /*axis=*/0, true, out), ark::ModelError); - } - { - ark::Model m; - ark::Tensor t = m.tensor(ark::Dims(7, 2, 4, 1024), ark::BF16); - ark::Tensor out = m.tensor(ark::Dims(7, 2, 4, 3), ark::BF16); - UNITTEST_THROW(m.reduce_sum(t, /*axis=*/3, true, out), ark::ModelError); - } - { - ark::Model m; - ark::Tensor t = m.tensor(ark::Dims(7, 2, 4, 1), ark::BF16); - UNITTEST_THROW(m.reduce_sum(t, /*axis=*/3, true, t), ark::ModelError); - } - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_reduce_max_axis3() { - ark::Model m; - ark::Tensor t = m.tensor(ark::Dims(1, 1, 2, 8192), ark::FP32); - ark::Tensor out = m.reduce_max(t, /*axis=*/3); - - auto result = ark::op_test("reduce_max_axis3", m, {t}, {out}, - baseline_reduce_max_axis3); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0); - 
return ark::unittest::SUCCESS; -} - -ark::unittest::State test_reduce_mean_axis3() { - ark::Model m; - ark::Tensor t = m.tensor(ark::Dims(1, 1, 2, 8192), ark::FP32); - ark::Tensor out = m.reduce_mean(t, /*axis=*/3); - - auto result = ark::op_test("reduce_mean_axis3", m, {t}, {out}, - baseline_reduce_mean_axis3); - UNITTEST_LOG(result); - UNITTEST_TRUE(result.max_diff[0] < - ark::reduction_abs_error_bound(0.1, t.shape()[3]) / - t.shape()[3]); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_reduce_invalid() { - ark::Model m; - ark::Tensor t = m.tensor(ark::Dims(7, 2, 4, 1024), ark::FP32); - UNITTEST_THROW(m.reduce_max(t, /*axis=*/-10), ark::ModelError); - return ark::unittest::SUCCESS; -} - -int main() { - ark::init(); - UNITTEST(test_reduce_sum_axis0); - UNITTEST(test_reduce_sum_axis1); - UNITTEST(test_reduce_sum_axis2); - UNITTEST(test_reduce_sum_axis3); - UNITTEST(test_reduce_sum_axis3_padded); - UNITTEST(test_reduce_sum_fp16); - UNITTEST(test_reduce_sum_bf16); - UNITTEST(test_reduce_sum_fp16_no_keepdims); - UNITTEST(test_reduce_sum_invalid); - UNITTEST(test_reduce_max_axis3); - UNITTEST(test_reduce_mean_axis3); - UNITTEST(test_reduce_invalid); - return ark::unittest::SUCCESS; -} diff --git a/ark/ops/ops_rope_test.cpp b/ark/ops/ops_rope_test.cpp deleted file mode 100644 index b0812faed..000000000 --- a/ark/ops/ops_rope_test.cpp +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. 
- -#include "ark/model.hpp" -#include "ops_test_common.hpp" -#include "unittest/unittest_utils.h" - -template -void baseline_rope(std::vector &outputs, const std::vector &, - const std::vector &inputs, - const std::vector &input_shapes, int) { - T *out = static_cast(outputs[0]); - T *input = static_cast(inputs[0]); - T *other = static_cast(inputs[1]); - - ark::Dims ish = input_shapes[0].dims4(); - - for (ark::DimType n = 0; n < ish[0]; ++n) { - for (ark::DimType c = 0; c < ish[1]; ++c) { - for (ark::DimType h = 0; h < ish[2]; ++h) { - for (ark::DimType w = 0; w < ish[3]; w += 2) { - int idx = n * ish[1] * ish[2] * ish[3] + - c * ish[2] * ish[3] + h * ish[3] + w; - T input0 = input[idx]; - T input1 = input[idx + 1]; - T other0 = other[idx]; - T other1 = other[idx + 1]; - out[idx] = input0 * other0 - input1 * other1; - out[idx + 1] = input0 * other1 + input1 * other0; - } - } - } - } -} - -ark::unittest::State test_rope_fp32() { - ark::Model model; - ark::Tensor input = model.tensor(ark::Dims(1, 32, 32, 256), ark::FP32); - ark::Tensor other = model.tensor(ark::Dims(1, 32, 32, 256), ark::FP32); - ark::Tensor out = model.rope(input, other); - auto result = ark::op_test("rope", model, {input, other}, {out}, - baseline_rope); - UNITTEST_LOG(result); - UNITTEST_TRUE(result.max_diff[0] < 1e-6f); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_rope_fp16() { - ark::Model model; - ark::Tensor input = model.tensor(ark::Dims(1, 32, 32, 256), ark::FP16); - ark::Tensor other = model.tensor(ark::Dims(1, 32, 32, 256), ark::FP16); - ark::Tensor out = model.rope(input, other); - auto result = ark::op_test("rope", model, {input, other}, {out}, - baseline_rope); - UNITTEST_LOG(result); - UNITTEST_TRUE(result.max_diff[0] < 1e-3f); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_rope_bf16() { - ark::Model model; - ark::Tensor input = model.tensor(ark::Dims(1, 32, 32, 256), ark::BF16); - ark::Tensor other = model.tensor(ark::Dims(1, 32, 32, 256), 
ark::BF16); - ark::Tensor out = model.rope(input, other); - auto result = ark::op_test("rope", model, {input, other}, {out}, - baseline_rope); - UNITTEST_LOG(result); - UNITTEST_TRUE(result.max_diff[0] < 1e-3f); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_rope_invalid() { - { - ark::Model model; - ark::Tensor input = model.tensor(ark::Dims(1, 32, 32, 256), ark::BF16); - ark::Tensor other = model.tensor(ark::Dims(1, 32, 32, 256), ark::FP32); - UNITTEST_THROW(model.rope(input, other), ark::ModelError); - } - { - ark::Model model; - ark::Tensor input = model.tensor(ark::Dims(1, 32, 32, 256), ark::FP32); - ark::Tensor other = model.tensor(ark::Dims(1, 32, 32, 256), ark::FP32); - ark::Tensor output = model.tensor(ark::Dims(1, 32, 32, 256), ark::FP16); - UNITTEST_THROW(model.rope(input, other, output), ark::ModelError); - } - { - ark::Model model; - ark::Tensor input = model.tensor(ark::Dims(1, 32, 32, 256), ark::FP32); - ark::Tensor other = model.tensor(ark::Dims(1, 32, 32, 256), ark::FP32); - ark::Tensor output = model.tensor(ark::Dims(1, 32, 32, 32), ark::FP32); - UNITTEST_THROW(model.rope(input, other, output), ark::ModelError); - } - return ark::unittest::SUCCESS; -} - -int main() { - ark::init(); - UNITTEST(test_rope_fp32); - UNITTEST(test_rope_fp16); - UNITTEST(test_rope_bf16); - UNITTEST(test_rope_invalid); - return ark::unittest::SUCCESS; -} diff --git a/ark/ops/ops_scalar_test.cpp b/ark/ops/ops_scalar_test.cpp deleted file mode 100644 index 47a5b40bd..000000000 --- a/ark/ops/ops_scalar_test.cpp +++ /dev/null @@ -1,345 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. 
- -#include "ark/executor.hpp" -#include "ark/model.hpp" -#include "ops_test_common.hpp" -#include "unittest/unittest_utils.h" - -#define FACTOR 0.7 - -template -void baseline_scalar_add(std::vector &outputs, - const std::vector &output_shapes, - const std::vector &inputs, - const std::vector &, int) { - T *out = static_cast(outputs[0]); - T *input = static_cast(inputs[0]); - ark::Dims osh = output_shapes[0]; - for (ark::DimType i = 0; i < osh.nelems(); ++i) { - out[i] = input[i] + T(FACTOR); - } -}; - -template -void baseline_scalar_sub(std::vector &outputs, - const std::vector &output_shapes, - const std::vector &inputs, - const std::vector &, int) { - T *out = static_cast(outputs[0]); - T *input = static_cast(inputs[0]); - ark::Dims osh = output_shapes[0]; - for (ark::DimType i = 0; i < osh.nelems(); ++i) { - out[i] = input[i] - T(FACTOR); - } -}; - -template -void baseline_scalar_mul(std::vector &outputs, - const std::vector &output_shapes, - const std::vector &inputs, - const std::vector &, int) { - T *out = static_cast(outputs[0]); - T *input = static_cast(inputs[0]); - ark::Dims osh = output_shapes[0]; - for (ark::DimType i = 0; i < osh.nelems(); ++i) { - out[i] = input[i] * T(FACTOR); - } -}; - -template -void baseline_scalar_div(std::vector &outputs, - const std::vector &output_shapes, - const std::vector &inputs, - const std::vector &, int) { - T *out = static_cast(outputs[0]); - T *input = static_cast(inputs[0]); - ark::Dims osh = output_shapes[0]; - for (ark::DimType i = 0; i < osh.nelems(); ++i) { - out[i] = input[i] / T(FACTOR); - } -}; - -ark::unittest::State test_scalar_assign_fp16() { - { - ark::Model m; - ark::Tensor t = m.constant(7, ark::Dims(4, 2, 50), ark::FP16); - - ark::DefaultExecutor exe(m); - - exe.launch(); - exe.run(1); - exe.stop(); - - std::vector data(4 * 2 * 50); - exe.tensor_read(t, data); - for (auto v : data) { - UNITTEST_EQ(v, ark::half_t(7)); - } - } - { - ark::Model m; - ark::Tensor t = m.tensor(ark::Dims(4, 2, 50), 
ark::FP16); - ark::Tensor out = m.copy(7, t); - - ark::DefaultExecutor exe(m); - - std::vector data(4 * 2 * 50, 3); - exe.tensor_write(t, data); - - exe.launch(); - exe.run(1); - exe.stop(); - - data.clear(); - data.resize(4 * 2 * 50); - exe.tensor_read(t, data); - for (auto v : data) { - UNITTEST_EQ(v, ark::half_t(7)); - } - } - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_scalar_assign_fp32() { - { - ark::Model m; - ark::Tensor out = m.copy(7); - - ark::DefaultExecutor exe(m); - - exe.launch(); - exe.run(1); - exe.stop(); - - std::vector data(1); - exe.tensor_read(out, data); - for (auto v : data) { - UNITTEST_EQ(v, 7); - } - } - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_scalar_add_fp16() { - { - ark::Model m; - ark::Tensor t = m.tensor(ark::Dims(4, 2, 1), ark::FP16); - ark::Tensor out = m.add(t, FACTOR); - - auto result = ark::op_test("scalar_add_fp16_small", m, {t}, {out}, - baseline_scalar_add); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - } - { - ark::Model m; - ark::Tensor t = m.tensor(ark::Dims(4, 2, 1024), ark::FP16); - ark::Tensor out = m.add(t, FACTOR); - - auto result = ark::op_test("scalar_add_fp16", m, {t}, {out}, - baseline_scalar_add); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - } - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_scalar_sub_fp16() { - { - ark::Model m; - ark::Tensor t = m.tensor(ark::Dims(4, 2, 1), ark::FP16); - ark::Tensor out = m.sub(t, FACTOR); - - auto result = ark::op_test("scalar_sub_fp16_small", m, {t}, {out}, - baseline_scalar_sub); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - } - { - ark::Model m; - ark::Tensor t = m.tensor(ark::Dims(4, 2, 1024), ark::FP16); - ark::Tensor out = m.sub(t, FACTOR); - - auto result = ark::op_test("scalar_sub_fp16", m, {t}, {out}, - baseline_scalar_sub); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - } - return ark::unittest::SUCCESS; -} - 
-ark::unittest::State test_scalar_mul_fp32() { - { - ark::Model m; - ark::Tensor t = m.tensor(ark::Dims(4, 2, 1), ark::FP32); - ark::Tensor out = m.mul(t, FACTOR); - - auto result = ark::op_test("scalar_mul_fp32_small", m, {t}, {out}, - baseline_scalar_mul); - UNITTEST_LOG(result); - } - { - ark::Model m; - ark::Tensor t = m.tensor(ark::Dims(4, 2, 1024), ark::FP32); - ark::Tensor out = m.mul(t, FACTOR); - - auto result = ark::op_test("scalar_mul_fp32", m, {t}, {out}, - baseline_scalar_mul); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - } - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_scalar_mul_fp16() { - { - ark::Model m; - ark::Tensor t = m.tensor(ark::Dims(4, 2, 1), ark::FP16); - ark::Tensor out = m.mul(t, FACTOR); - - auto result = ark::op_test("scalar_mul_fp16_small", m, {t}, {out}, - baseline_scalar_mul); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - } - { - ark::Model m; - ark::Tensor t = m.tensor(ark::Dims(4, 2, 1024), ark::FP16); - ark::Tensor out = m.mul(t, FACTOR); - - auto result = ark::op_test("scalar_mul_fp16", m, {t}, {out}, - baseline_scalar_mul); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - } - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_scalar_mul_bf16() { - { - ark::Model m; - ark::Tensor t = m.tensor(ark::Dims(4, 2, 1), ark::BF16); - ark::Tensor out = m.mul(t, FACTOR); - - auto result = ark::op_test("scalar_mul_bf16_small", m, {t}, {out}, - baseline_scalar_mul); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - } - { - ark::Model m; - ark::Tensor t = m.tensor(ark::Dims(4, 2, 1024), ark::BF16); - ark::Tensor out = m.mul(t, FACTOR); - - auto result = ark::op_test("scalar_mul_bf16", m, {t}, {out}, - baseline_scalar_mul); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - } - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_scalar_mul_invalid() { - { - ark::Model m; - ark::Tensor t = 
m.tensor(ark::Dims(4, 2, 1024), ark::BF16); - ark::Tensor out = m.tensor(ark::Dims(4, 2, 1024), ark::FP32); - UNITTEST_THROW(m.mul(t, 3, out), ark::ModelError); - } - { - ark::Model m; - ark::Tensor t = m.tensor(ark::Dims(4, 2, 1024), ark::BF16); - ark::Tensor out = m.tensor(ark::Dims(4, 4, 1024), ark::BF16); - UNITTEST_THROW(m.mul(t, 3, out), ark::ModelError); - } - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_scalar_mul_fp16_offset() { - { - ark::Model m; - ark::Tensor buf = m.tensor({1024}, ark::FP16); - ark::Tensor tns = m.refer(buf, {2}, {1024}, {6}); - ark::Tensor doubled = m.mul(tns, 2, tns); - ark::Tensor out = m.identity(buf, {doubled}); - - std::vector data(1024, ark::half_t(2)); - auto result = ark::op_test( - "scalar_mul_fp16_offset", m, {buf}, {out}, - [](std::vector &outputs, const std::vector &, - const std::vector &, const std::vector &, - int) { - ark::half_t *out = static_cast(outputs[0]); - for (size_t i = 0; i < 1024; ++i) { - if (i == 6 || i == 7) { - out[i] = 4; - } else { - out[i] = 2; - } - } - }, - {data.data()}); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - } - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_scalar_mul_perf() { - ark::DimType nelem = 8 * 1024 * 1024; - - ark::Model m; - ark::Tensor t = m.tensor({nelem}, ark::FP32); - ark::Tensor out = m.mul(t, 0.7); - - auto result = ark::op_test("scalar_mul_perf", m, {t}, {out}, - baseline_scalar_mul); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - - float gbps = nelem * sizeof(float) / result.msec_per_iter * 1e-6; - UNITTEST_LOG(gbps, " GB/s"); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_scalar_div_fp16() { - float rel_err_bound = ark::division_rel_error_bound(FACTOR); - { - ark::Model m; - ark::Tensor t = m.tensor(ark::Dims(4, 2, 1), ark::FP16); - ark::Tensor out = m.div(t, FACTOR); - - auto result = ark::op_test("scalar_div_fp16_small", m, {t}, {out}, - baseline_scalar_div); - 
UNITTEST_LOG(result); - UNITTEST_LT(result.max_err_rate[0], rel_err_bound); - } - { - ark::Model m; - ark::Tensor t = m.tensor(ark::Dims(4, 2, 1024), ark::FP16); - ark::Tensor out = m.div(t, FACTOR); - - auto result = ark::op_test("scalar_div_fp16", m, {t}, {out}, - baseline_scalar_div); - UNITTEST_LOG(result); - UNITTEST_LT(result.max_err_rate[0], rel_err_bound); - } - return ark::unittest::SUCCESS; -} - -int main() { - ark::init(); - UNITTEST(test_scalar_assign_fp16); - UNITTEST(test_scalar_assign_fp32); - UNITTEST(test_scalar_add_fp16); - UNITTEST(test_scalar_sub_fp16); - UNITTEST(test_scalar_mul_fp32); - UNITTEST(test_scalar_mul_fp16); - UNITTEST(test_scalar_mul_bf16); - UNITTEST(test_scalar_mul_invalid); - UNITTEST(test_scalar_mul_fp16_offset); - UNITTEST(test_scalar_mul_perf); - UNITTEST(test_scalar_div_fp16); - return ark::unittest::SUCCESS; -} diff --git a/ark/ops/ops_transpose_test.cpp b/ark/ops/ops_transpose_test.cpp deleted file mode 100644 index 139e1ee66..000000000 --- a/ark/ops/ops_transpose_test.cpp +++ /dev/null @@ -1,281 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. 
- -#include "ark/model.hpp" -#include "ark/planner.hpp" -#include "model/model_json.hpp" -#include "ops_test_common.hpp" -#include "unittest/unittest_utils.h" - -#define SYNC_TEST 0 - -template -void baseline_transpose_0132(std::vector &outputs, - const std::vector &output_shapes, - const std::vector &inputs, - const std::vector &input_shapes, int) { - T *out = static_cast(outputs[0]); - T *in = static_cast(inputs[0]); - ark::Dims osh = output_shapes[0].dims4(); - ark::Dims ish = input_shapes[0].dims4(); - for (ark::DimType n = 0; n < ish[0]; ++n) { - for (ark::DimType c = 0; c < ish[1]; ++c) { - for (ark::DimType h = 0; h < ish[2]; ++h) { - for (ark::DimType w = 0; w < ish[3]; ++w) { - // out[n][c][w][h] = in[n][c][h][w] - out[h + w * osh[3] + c * osh[2] * osh[3] + - n * osh[1] * osh[2] * osh[3]] = - in[w + h * ish[3] + c * ish[3] * ish[2] + - n * ish[3] * ish[2] * ish[1]]; - } - } - } - } -}; - -template -void baseline_transpose_0231(std::vector &outputs, - const std::vector &output_shapes, - const std::vector &inputs, - const std::vector &input_shapes, int) { - T *out = static_cast(outputs[0]); - T *in = static_cast(inputs[0]); - ark::Dims osh = output_shapes[0].dims4(); - ark::Dims ish = input_shapes[0].dims4(); - for (ark::DimType n = 0; n < ish[0]; ++n) { - for (ark::DimType c = 0; c < ish[1]; ++c) { - for (ark::DimType h = 0; h < ish[2]; ++h) { - for (ark::DimType w = 0; w < ish[3]; ++w) { - // out[n][h][w][c] = in[n][c][h][w] - out[c + w * osh[3] + h * osh[2] * osh[3] + - n * osh[1] * osh[2] * osh[3]] = - in[w + h * ish[3] + c * ish[3] * ish[2] + - n * ish[3] * ish[2] * ish[1]]; - } - } - } - } -}; - -template -void baseline_transpose_0213(std::vector &outputs, - const std::vector &output_shapes, - const std::vector &inputs, - const std::vector &input_shapes, int) { - T *out = static_cast(outputs[0]); - T *in = static_cast(inputs[0]); - ark::Dims osh = output_shapes[0].dims4(); - ark::Dims ish = input_shapes[0].dims4(); - for (ark::DimType n = 0; n < 
ish[0]; ++n) { - for (ark::DimType c = 0; c < ish[1]; ++c) { - for (ark::DimType h = 0; h < ish[2]; ++h) { - for (ark::DimType w = 0; w < ish[3]; ++w) { - // out[n][h][c][w] = in[n][c][h][w] - out[w + c * osh[3] + h * osh[2] * osh[3] + - n * osh[1] * osh[2] * osh[3]] = - in[w + h * ish[3] + c * ish[3] * ish[2] + - n * ish[3] * ish[2] * ish[1]]; - } - } - } - } -}; - -template -void baseline_transpose_sync_test(std::vector &outputs, - const std::vector &, - const std::vector &inputs, - const std::vector &input_shapes, - int) { - T *out = static_cast(outputs[0]); - T *in = static_cast(inputs[0]); - ::memcpy(out, in, sizeof(T) * input_shapes[0].nelems()); -}; - -ark::unittest::State test_transpose_0132_fp32() { - ark::Model m; - ark::Tensor t = m.tensor({5, 3, 32, 128}, ark::FP32); - ark::Tensor out = m.transpose(t, {0, 1, 3, 2}); - - auto result = ark::op_test("transpose_0132_fp32", m, {t}, {out}, - baseline_transpose_0132); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_transpose_0132_fp16() { - ark::Model m; - ark::Tensor t = m.tensor({5, 3, 32, 128}, ark::FP16); - ark::Tensor out = m.transpose(t, {0, 1, 3, 2}); - - auto result = ark::op_test("transpose_0132_fp16", m, {t}, {out}, - baseline_transpose_0132); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_transpose_0132_bf16() { - ark::Model m; - ark::Tensor t = m.tensor({5, 3, 32, 128}, ark::BF16); - ark::Tensor out = m.transpose(t, {0, 1, 3, 2}); - - auto result = ark::op_test("transpose_0132_bf16", m, {t}, {out}, - baseline_transpose_0132); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_transpose_0231_fp32() { - ark::Model m; - ark::Tensor t = m.tensor({5, 3, 32, 128}, ark::FP32); - ark::Tensor out = m.transpose(t, {0, 2, 3, 1}); - - auto result = 
ark::op_test("transpose_0231_fp32", m, {t}, {out}, - baseline_transpose_0231); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_transpose_0231_fp16() { - ark::Model m; - ark::Tensor t = m.tensor({5, 3, 32, 128}, ark::FP16); - ark::Tensor out = m.transpose(t, {0, 2, 3, 1}); - - auto result = ark::op_test("transpose_0231_fp16", m, {t}, {out}, - baseline_transpose_0231); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_transpose_0231_bf16() { - ark::Model m; - ark::Tensor t = m.tensor({5, 3, 32, 128}, ark::BF16); - ark::Tensor out = m.transpose(t, {0, 2, 3, 1}); - - auto result = ark::op_test("transpose_0231_bf16", m, {t}, {out}, - baseline_transpose_0231); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_transpose_0213_fp32() { - ark::Model m; - ark::Tensor t = m.tensor({5, 3, 32, 128}, ark::FP32); - ark::Tensor out = m.transpose(t, {0, 2, 1, 3}); - - auto result = ark::op_test("transpose_0213_fp32", m, {t}, {out}, - baseline_transpose_0213); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_transpose_0213_fp16() { - ark::Model m; - ark::PlannerContext ctx(m); - ctx.warp_range(0, 4); - ctx.sram_range(0, 0); - ctx.sync(false); - ctx.config(ark::Json({{"NumWarps", 4}, {"SramBytes", 0}, {"Tile", {8, 64}}}) - .dump()); - - ark::Tensor t = m.tensor({5, 256, 32, 128}, ark::FP16); - ark::Tensor out = m.transpose(t, {0, 2, 1, 3}); - - auto result = ark::op_test("transpose_0213_fp16", m, {t}, {out}, - baseline_transpose_0213); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_transpose_0213_bf16() { - ark::Model m; - ark::Tensor t = m.tensor({5, 3, 32, 128}, ark::BF16); - 
ark::Tensor out = m.transpose(t, {0, 2, 1, 3}); - - auto result = ark::op_test("transpose_0213_bf16", m, {t}, {out}, - baseline_transpose_0213); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_transpose_sync_test() { - ark::Model m; - ark::PlannerContext shared_ctx(m); - shared_ctx.warp_range(0, 4); - shared_ctx.sram_range(0, 0); - shared_ctx.sync(false); - - ark::Tensor in, t, out; - in = m.tensor({1, 16, 2, 64}, ark::FP16); - { - ark::PlannerContext ctx(m); - ctx.config( - ark::Json({{"NumWarps", 4}, {"SramBytes", 0}, {"Tile", {8, 64}}}) - .dump()); - t = m.transpose(in, {0, 2, 1, 3}); - } - { - ark::PlannerContext ctx(m); - ctx.config( - ark::Json({{"NumWarps", 4}, {"SramBytes", 0}, {"Tile", {8, 1, 64}}}) - .dump()); - out = m.transpose(t, {0, 2, 1, 3}); - } - - auto result = ark::op_test("transpose_sync_test", m, {in}, {out}, - baseline_transpose_sync_test); - UNITTEST_LOG(result); - UNITTEST_EQ(result.max_diff[0], 0.0f); - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_transpose_invalid() { - { - ark::Model m; - ark::Tensor t = m.tensor({5}, ark::FP32); - UNITTEST_THROW(m.transpose(t, {0, 2, 3, 1}), ark::ModelError); - } - { - ark::Model m; - ark::Tensor t = m.tensor({5, 128}, ark::FP32); - UNITTEST_THROW(m.transpose(t, {0, 2, 3, 1}), ark::ModelError); - } - { - ark::Model m; - ark::Tensor t = m.tensor({5, 128}, ark::FP32); - UNITTEST_THROW(m.transpose(t, {0, 2}), ark::ModelError); - } - { - ark::Model m; - ark::Tensor t = m.tensor({5, 128}, ark::FP32); - UNITTEST_THROW(m.transpose(t, {1, 1}), ark::ModelError); - } - return ark::unittest::SUCCESS; -} - -int main() { - ark::init(); - UNITTEST(test_transpose_0132_fp32); - UNITTEST(test_transpose_0132_fp16); - UNITTEST(test_transpose_0132_bf16); - UNITTEST(test_transpose_0231_fp32); - UNITTEST(test_transpose_0231_fp16); - UNITTEST(test_transpose_0231_bf16); - UNITTEST(test_transpose_0213_fp32); - 
UNITTEST(test_transpose_0213_fp16); - UNITTEST(test_transpose_0213_bf16); -#if (SYNC_TEST) - UNITTEST(test_transpose_sync_test); -#endif - UNITTEST(test_transpose_invalid); - return ark::unittest::SUCCESS; -} diff --git a/examples/multi_head_attention/mha.py b/examples/multi_head_attention/mha.py new file mode 100644 index 000000000..26a45855a --- /dev/null +++ b/examples/multi_head_attention/mha.py @@ -0,0 +1,169 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +""" +Multi-Head Attention implemented as an ARK Module using composed small ops. + +Two versions: +1. MultiHeadAttention — standard (non-flash) attention for correctness baseline +2. FlashMultiHeadAttention — online softmax (flash attention) algorithm + +Both use ARK's operator composition with PlannerContext for scheduling. +""" + +import ark +import math + + +class MultiHeadAttention(ark.Module): + """Standard multi-head attention: O = softmax(Q @ K^T / sqrt(d)) @ V + + Args: + head_dim: dimension per head (used for scaling). + causal: whether to apply causal masking (not yet supported). 
+ """ + + def __init__(self, head_dim: int, causal: bool = False): + super().__init__() + self.scale = 1.0 / math.sqrt(head_dim) + self.causal = causal + + def forward(self, q, r_k, v): + """ + Args: + q: (batch, heads, seq_len, head_dim) — query + r_k: (batch, heads, head_dim, seq_len) — key, already transposed + v: (batch, heads, seq_len, head_dim) — value + + Returns: + o: (batch, heads, seq_len, head_dim) + """ + # S = Q @ K^T -> (batch, heads, seq_len, seq_len) + s = ark.matmul(q, r_k) + + # Scale: S = S * (1 / sqrt(d)) + s = ark.mul(s, self.scale) + + # Softmax along last axis + # max + m = ark.reduce_max(s, axis=-1) + s = ark.sub(s, m) + s = ark.exp(s) + l = ark.reduce_sum(s, axis=-1) + p = ark.div(s, l) + + # O = P @ V -> (batch, heads, seq_len, head_dim) + o = ark.matmul(p, v) + return o + + +class MultiHeadAttentionOptimized(ark.Module): + """Tile-fused MHA: merges matmul, softmax, and output matmul into + aligned tile tasks using PlannerContext. + + Key insight: ARK's matmul is tile-based (e.g., [128, N] per task). + By configuring softmax ops to use the same tile grid — each task + processes the same row-block — all ops can be fused into one task + with sync=False. This eliminates ALL inter-op sync barriers. + + The tile alignment is: + - matmul(Q, K^T): [TileM, N] tiles of S matrix + - softmax(S): [TileM, N] tiles (full-row reduction per tile) + - matmul(P, V): [TileM, D] tiles of output + + All ops share the same number of tasks = batch*heads * ceil(N/TileM). + + Args: + head_dim: dimension per head. + seq_len: sequence length. + tile_m: row-block size for tiling (must divide seq_len). 
+ """ + + def __init__(self, head_dim: int, seq_len: int = 256, tile_m: int = 128): + super().__init__() + self.scale = 1.0 / math.sqrt(head_dim) + self.seq_len = seq_len + self.tile_m = tile_m + + def forward(self, q, r_k, v): + shape = q.shape() + N = shape[-2] + D = shape[-1] + batch_heads = 1 + for d in shape[:-2]: + batch_heads *= d + TM = self.tile_m + S = self.seq_len # = N for self-attention + num_tasks = batch_heads * (N // TM) + + # Fuse matmul(Q,K^T) + softmax into one task per row-block. + # All ops use NumWarps=8 and tile height=TM to produce matching + # task counts. The key fix: reduce ops now use Tile=[TM,1] to + # match the matmul's tile grid. + with ark.PlannerContext( + sync=False, + warp_range=[0, 8], + sram_range=[0, 147456], + ): + # Matmul Q[TM,D] @ K^T[D,S] -> S[TM,S] + with ark.PlannerContext( + config={ + "NumWarps": 8, + "SramBytes": 147456, + "Tile": [TM, S], + }, + ): + s = ark.matmul(q, r_k) + + # scale — element-wise, tile matches matmul + with ark.PlannerContext( + config={ + "NumWarps": 8, "SramBytes": 0, + "Tile": [TM, S], "NumTasks": num_tasks, + }, + ): + s = ark.mul(s, self.scale) + + # reduce_max — NOW with Tile=[TM,1] to match task count + with ark.PlannerContext( + config={ + "NumWarps": 8, "SramBytes": 256, + "ImplType": "WarpWise", + "Tile": [TM, 1], + }, + ): + m = ark.reduce_max(s, axis=-1) + + # sub + exp + with ark.PlannerContext( + config={ + "NumWarps": 8, "SramBytes": 0, + "Tile": [TM, S], "NumTasks": num_tasks, + }, + ): + s = ark.sub(s, m) + s = ark.exp(s) + + # reduce_sum — Tile=[TM,1] + with ark.PlannerContext( + config={ + "NumWarps": 8, "SramBytes": 256, + "ImplType": "WarpWise", + "Tile": [TM, 1], + }, + ): + l = ark.reduce_sum(s, axis=-1) + + # div + with ark.PlannerContext( + config={ + "NumWarps": 8, "SramBytes": 0, + "Tile": [TM, S], "NumTasks": num_tasks, + }, + ): + p = ark.div(s, l) + + # Matmul P @ V — separate task (different SRAM requirement) + o = ark.matmul(p, v) + + return o diff --git 
a/examples/multi_head_attention/test_mha.py b/examples/multi_head_attention/test_mha.py new file mode 100644 index 000000000..98ed0aeab --- /dev/null +++ b/examples/multi_head_attention/test_mha.py @@ -0,0 +1,371 @@ +#!/usr/bin/env python3 +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +""" +Test and benchmark ARK MultiHeadAttention against FlashAttention-2. + +Correctness: uses Tensor.eval() for concise graph execution. +Benchmark: follows gpu-kernel-perf-bench methodology — + - L2 cache pollution via rotated input buffers + - Pilot-driven iteration count (target 0.1-0.3s total) + - torch.profiler for FlashAttention timing + - ARK native rt.run(iter=N) for ARK timing (persistent loop kernel) +""" + +import sys +import os +import math +import time + +import torch +import torch.nn.functional as F + +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) +from mha import MultiHeadAttention, MultiHeadAttentionOptimized + +try: + from flash_attn import flash_attn_func + + _has_flash = True +except ImportError: + _has_flash = False + +import ark + +DEVICE = "cuda:0" + + +# ─── Correctness ──────────────────────────────────────────────────────────── + + +def test_correctness(B, H, N, D, dtype=torch.float16): + """Compare ARK MHA output against FlashAttention-2 using eval().""" + scale = 1.0 / math.sqrt(D) + q = torch.randn(B, H, N, D, dtype=dtype, device=DEVICE) + k = torch.randn(B, H, N, D, dtype=dtype, device=DEVICE) + v = torch.randn(B, H, N, D, dtype=dtype, device=DEVICE) + k_t = k.transpose(-2, -1).contiguous() + + # ARK vanilla — uses eval() + result = MultiHeadAttention(D)( + ark.Tensor.from_torch(q), ark.Tensor.from_torch(k_t), ark.Tensor.from_torch(v) + ).eval() + + # Reference + if _has_flash: + q_fa = q.transpose(1, 2).contiguous() + k_fa = k.transpose(1, 2).contiguous() + v_fa = v.transpose(1, 2).contiguous() + ref = flash_attn_func(q_fa, k_fa, v_fa, softmax_scale=scale) + ref = ref.transpose(1, 2).contiguous() + label = 
"FA2" + else: + ref = F.scaled_dot_product_attention(q, k, v, scale=scale) + label = "SDPA" + + diff = (result - ref).abs().max().item() + atol = 5e-2 if dtype == torch.float16 else 1e-1 + ok = diff < atol + print(f" B={B} H={H} N={N:4d} D={D} diff={diff:.4f} vs {label} {'PASS' if ok else 'FAIL'}") + return ok + + +# ─── Benchmark helpers ────────────────────────────────────────────────────── + +# L2 cache size for H200 ≈ 50 MB. Use 2× = 100 MB worth of buffers. +L2_CACHE_BYTES = 50 * 1024 * 1024 + + +def _make_rotated_inputs(B, H, N, D, dtype, num_bufs): + """Create multiple input buffer sets for L2 cache pollution.""" + bufs = [] + for _ in range(num_bufs): + q = torch.randn(B, H, N, D, dtype=dtype, device=DEVICE) + k = torch.randn(B, H, N, D, dtype=dtype, device=DEVICE) + v = torch.randn(B, H, N, D, dtype=dtype, device=DEVICE) + bufs.append((q, k, v)) + return bufs + + +def _pilot_iters(run_once_fn, target_sec=0.2): + """Determine iteration count to reach target_sec total time.""" + # Single pilot + torch.cuda.synchronize() + t0 = time.perf_counter() + run_once_fn() + torch.cuda.synchronize() + t1 = time.perf_counter() + per_iter = max(t1 - t0, 1e-6) + iters = max(1, int(target_sec / per_iter)) + return iters + + +def bench_flash_attn(B, H, N, D, dtype=torch.float16): + """Benchmark FlashAttention-2 with L2 pollution and torch.profiler.""" + if not _has_flash: + return float("nan") + scale = 1.0 / math.sqrt(D) + elem_bytes = N * D * torch.finfo(dtype).bits // 8 + num_bufs = max(4, (2 * L2_CACHE_BYTES) // (3 * B * H * elem_bytes) + 1) + bufs = _make_rotated_inputs(B, H, N, D, dtype, num_bufs) + + def run_one(i): + q, k, v = bufs[i % num_bufs] + q_fa = q.transpose(1, 2).contiguous() + k_fa = k.transpose(1, 2).contiguous() + v_fa = v.transpose(1, 2).contiguous() + flash_attn_func(q_fa, k_fa, v_fa, softmax_scale=scale) + + iters = _pilot_iters(lambda: run_one(0)) + + # Warmup + for i in range(min(3, iters)): + run_one(i) + + # Timed + torch.cuda.synchronize() + t0 = 
time.perf_counter() + for i in range(iters): + run_one(i) + torch.cuda.synchronize() + elapsed = time.perf_counter() - t0 + return elapsed / iters * 1000 # ms + + +def bench_ark(B, H, N, D, mha_cls, mha_args, dtype=torch.float16): + """Benchmark an ARK MHA module using the persistent loop kernel.""" + q = torch.randn(B, H, N, D, dtype=dtype, device=DEVICE) + k = torch.randn(B, H, N, D, dtype=dtype, device=DEVICE) + v = torch.randn(B, H, N, D, dtype=dtype, device=DEVICE) + k_t = k.transpose(-2, -1).contiguous() + + ark.init() + mha = mha_cls(*mha_args) + out = mha( + ark.Tensor.from_torch(q), ark.Tensor.from_torch(k_t), ark.Tensor.from_torch(v) + ) + + with ark.Runtime() as rt: + rt.launch() + # Pilot: single iteration + iters = _pilot_iters(lambda: rt.run(iter=1), target_sec=0.2) + + # Warmup + rt.run(iter=min(3, iters)) + + # Timed + torch.cuda.synchronize() + t0 = time.perf_counter() + rt.run(iter=iters) + elapsed = time.perf_counter() - t0 + + return elapsed / iters * 1000 # ms + + +def run_benchmark(B, H, N, D, dtype=torch.float16): + fa_ms = bench_flash_attn(B, H, N, D, dtype) + vanilla_ms = bench_ark(B, H, N, D, MultiHeadAttention, (D,), dtype) + opt_ms = bench_ark(B, H, N, D, MultiHeadAttentionOptimized, (D, N), dtype) + ratio = opt_ms / fa_ms if fa_ms > 0 else float("nan") + print( + f" B={B} H={H:2d} N={N:4d} D={D} " + f"FA2={fa_ms:.3f}ms ARK={vanilla_ms:.3f}ms ARK-Opt={opt_ms:.3f}ms " + f"(Opt/FA2={ratio:.2f}x)" + ) + + +# ─── Main ─────────────────────────────────────────────────────────────────── + +if __name__ == "__main__": + print("=" * 70) + print("Correctness: ARK MHA vs FlashAttention-2") + print("=" * 70) + all_pass = True + for B, H, N, D in [ + (1, 1, 256, 128), + (1, 4, 256, 128), + (2, 8, 256, 128), + (1, 1, 512, 128), + ]: + all_pass &= test_correctness(B, H, N, D) + + if not all_pass: + print("\nSome tests FAILED!") + sys.exit(1) + print("\nAll correctness tests PASSED!") + + print() + print("=" * 70) + print("Performance: ARK vs 
FlashAttention-2") + print("=" * 70) + for B, H, N, D in [ + (1, 1, 256, 128), + (1, 4, 256, 128), + (1, 8, 256, 128), + (1, 1, 512, 128), + (1, 4, 512, 128), + ]: + run_benchmark(B, H, N, D) +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +""" +Test and benchmark ARK MultiHeadAttention against: + - flash_attn (Tri Dao's FlashAttention-2, flash_attn_func) + - PyTorch SDPA (F.scaled_dot_product_attention, which dispatches to + flash/mem-efficient/math backends automatically) +""" + +import ark +import torch +import torch.nn.functional as F +import time +import math +import sys + +from flash_attn import flash_attn_func + +sys.path.insert(0, ".") +from mha import MultiHeadAttention, MultiHeadAttentionOptimized + + +def flash_attn_reference(q, k, v, scale): + """Run Tri Dao's FlashAttention-2. + + flash_attn_func expects (batch, seq_len, heads, head_dim). + Our tensors are (batch, heads, seq_len, head_dim), so we transpose. + """ + q_fa = q.transpose(1, 2).contiguous() # (B, N, H, D) + k_fa = k.transpose(1, 2).contiguous() + v_fa = v.transpose(1, 2).contiguous() + o_fa = flash_attn_func(q_fa, k_fa, v_fa, softmax_scale=scale) + return o_fa.transpose(1, 2).contiguous() # back to (B, H, N, D) + + +def torch_sdpa_reference(q, k, v, scale): + """PyTorch's scaled_dot_product_attention (auto backend selection).""" + return F.scaled_dot_product_attention(q, k, v, scale=scale) + + +def test_correctness(batch, heads, seq_len, head_dim, dtype=torch.float16): + print(f" B={batch}, H={heads}, N={seq_len}, D={head_dim}", end="") + scale = 1.0 / math.sqrt(head_dim) + + q = torch.randn(batch, heads, seq_len, head_dim, dtype=dtype, device="cuda:0") + k = torch.randn(batch, heads, seq_len, head_dim, dtype=dtype, device="cuda:0") + v = torch.randn(batch, heads, seq_len, head_dim, dtype=dtype, device="cuda:0") + + # Reference: FlashAttention-2 + ref = flash_attn_reference(q, k, v, scale) + + # ARK standard MHA + ark.init() + k_t = k.transpose(-2, 
-1).contiguous() + mha = MultiHeadAttention(head_dim) + ark_out = mha(ark.Tensor.from_torch(q), ark.Tensor.from_torch(k_t), ark.Tensor.from_torch(v)) + with ark.Runtime() as rt: + rt.launch() + rt.run() + result = ark_out.to_torch() + + diff = (result - ref).abs().max().item() + atol = 5e-2 if dtype == torch.float16 else 1e-1 + ok = diff < atol + print(f" diff={diff:.4f} {'PASS' if ok else 'FAIL'}") + return ok + + +def bench_one(label, run_fn, num_warmup=10, num_iter=50): + """Benchmark helper: warmup, then time num_iter iterations.""" + for _ in range(num_warmup): + run_fn() + torch.cuda.synchronize() + start = time.time() + for _ in range(num_iter): + run_fn() + torch.cuda.synchronize() + ms = (time.time() - start) / num_iter * 1000 + return ms + + +def run_benchmark(batch, heads, seq_len, head_dim, dtype=torch.float16): + scale = 1.0 / math.sqrt(head_dim) + + q = torch.randn(batch, heads, seq_len, head_dim, dtype=dtype, device="cuda:0") + k = torch.randn(batch, heads, seq_len, head_dim, dtype=dtype, device="cuda:0") + v = torch.randn(batch, heads, seq_len, head_dim, dtype=dtype, device="cuda:0") + k_t = k.transpose(-2, -1).contiguous() + + # --- FlashAttention-2 (Tri Dao) --- + q_fa = q.transpose(1, 2).contiguous() + k_fa = k.transpose(1, 2).contiguous() + v_fa = v.transpose(1, 2).contiguous() + + flash_ms = bench_one( + "FlashAttn2", + lambda: flash_attn_func(q_fa, k_fa, v_fa, softmax_scale=scale), + ) + + # --- PyTorch SDPA --- + sdpa_ms = bench_one( + "SDPA", + lambda: F.scaled_dot_product_attention(q, k, v, scale=scale), + ) + + # --- ARK Vanilla --- + ark.init() + mha = MultiHeadAttention(head_dim) + ark_out = mha(ark.Tensor.from_torch(q), ark.Tensor.from_torch(k_t), ark.Tensor.from_torch(v)) + with ark.Runtime() as rt: + rt.launch() + vanilla_ms = bench_one("ARK", lambda: rt.run(iter=1), num_warmup=5) + + # --- ARK Optimized (fused softmax) --- + ark.init() + mha_opt = MultiHeadAttentionOptimized(head_dim, seq_len) + ark_out2 = 
mha_opt(ark.Tensor.from_torch(q), ark.Tensor.from_torch(k_t), ark.Tensor.from_torch(v)) + with ark.Runtime() as rt: + rt.launch() + opt_ms = bench_one("ARK-Opt", lambda: rt.run(iter=1), num_warmup=5) + + print( + f" B={batch} H={heads:2d} N={seq_len:4d} D={head_dim:3d} " + f"FlashAttn2={flash_ms:.3f}ms SDPA={sdpa_ms:.3f}ms " + f"ARK={vanilla_ms:.3f}ms ARK-Opt={opt_ms:.3f}ms " + f"(Opt/Flash={opt_ms/flash_ms:.2f}x)" + ) + return flash_ms, sdpa_ms, vanilla_ms, opt_ms + + +if __name__ == "__main__": + print("=" * 70) + print("Correctness: ARK MHA vs FlashAttention-2") + print("=" * 70) + all_pass = True + for B, H, N, D in [ + (1, 1, 256, 128), + (1, 4, 256, 128), + (2, 8, 256, 128), + (1, 1, 512, 128), + ]: + all_pass &= test_correctness(B, H, N, D) + + if not all_pass: + print("\nSome tests FAILED!") + sys.exit(1) + print("\nAll correctness tests PASSED!") + + print() + print("=" * 70) + print("Performance: ARK vs FlashAttention-2 vs PyTorch SDPA") + print("=" * 70) + for B, H, N, D in [ + (1, 1, 256, 128), + (1, 4, 256, 128), + (1, 8, 256, 128), + (1, 1, 512, 128), + (1, 4, 512, 128), + (1, 8, 512, 128), + ]: + run_benchmark(B, H, N, D) diff --git a/examples/tutorial/module_tutorial.py b/examples/tutorial/module_tutorial.py index b3bac67ea..b18804063 100644 --- a/examples/tutorial/module_tutorial.py +++ b/examples/tutorial/module_tutorial.py @@ -2,52 +2,32 @@ # Licensed under the MIT license. 
import torch -import numpy as np import torch.nn as nn import ark # Define the parameters of the model batch_size = 1 -seq_len = 64 +seq_len = 128 d_model = 512 d_ff = 2048 -def convert_state_dict(state_dict: dict, type="numpy"): - """ - Convert the state_dict of a module to np.ndarray or torch.Tensor type - """ - new_state_dict = {} - for key in state_dict: - if type == "torch": - new_state_dict[key] = torch.from_numpy(state_dict[key]) - elif type == "numpy": - new_state_dict[key] = state_dict[key].numpy() - return new_state_dict - - class SubModuleARK(ark.Module): - def __init__(self): + def __init__(self, weight_2): super(SubModuleARK, self).__init__() - # Define the parameters of the submodule - self.weight_2 = ark.parameter([d_ff, d_model], ark.fp32) + self.weight_2 = ark.Tensor.from_torch(weight_2) def forward(self, inputs): - # Perform the forward pass of the submodule - middle_result1 = ark.matmul(inputs, self.weight_2) - return middle_result1 + return ark.matmul(inputs, self.weight_2) class TestModelARK(ark.Module): - def __init__(self): + def __init__(self, weight_1, weight_2): super(TestModelARK, self).__init__() - # Define the parameters of the module - self.weight_1 = ark.parameter([d_model, d_ff], ark.fp32) - # Create a submodule of the module - self.submodule = SubModuleARK() + self.weight_1 = ark.Tensor.from_torch(weight_1) + self.submodule = SubModuleARK(weight_2) def forward(self, inputs): - # Perform the forward pass of the model output = ark.matmul(inputs, self.weight_1) output = ark.relu(output) output = self.submodule(output) @@ -56,116 +36,53 @@ def forward(self, inputs): return output -# Use pytorch to define the same model -class SubModulePytorch(nn.Module): - def __init__(self): - super(SubModulePytorch, self).__init__() - self.weight_2 = nn.Parameter(torch.ones(d_ff, d_model)) - - def forward(self, inputs): - middle_result1 = torch.matmul(inputs, self.weight_2) - return middle_result1 - - class TestModelPytorch(nn.Module): def 
__init__(self): super(TestModelPytorch, self).__init__() - # Define the parameters of the module - self.weight_1 = nn.Parameter(torch.ones(d_model, d_ff)) - # Create a submodule of the module - self.submodule = SubModulePytorch() + self.weight_1 = nn.Parameter(torch.ones(d_model, d_ff, device="cuda:0")) + self.submodule_weight_2 = nn.Parameter(torch.ones(d_ff, d_model, device="cuda:0")) + self.layernorm = nn.LayerNorm(d_model, device="cuda:0") def forward(self, inputs): - # Perform the forward pass of the model output = torch.matmul(inputs, self.weight_1) output = nn.ReLU()(output) - output = self.submodule(output) - output = nn.LayerNorm(d_model)(output + inputs) + output = torch.matmul(output, self.submodule_weight_2) + output = self.layernorm(output + inputs) return output -# An example of using the ARK module def module_test(): - # Create an input tensor - input_tensor = ark.tensor([batch_size, seq_len, d_model], ark.fp32) - - # Create an ARK module - ark_model = TestModelARK() - - # Perform the forward pass - output_tensor = ark_model(input_tensor) - - # Initialize the ARK runtime - runtime = ark.Runtime() - - # Launch the ARK runtime - runtime.launch() - - # Initialize the input tensor - input_tensor_host = ( - (np.random.rand(batch_size, seq_len, d_model) - 0.5) * 0.1 - ).astype(np.float32) - input_tensor.from_numpy(input_tensor_host) - - # Initialize the parameters of the ARK module using numpy state_dict - weight_1_host = ((np.random.rand(d_model, d_ff) - 0.5) * 0.1).astype( - np.float32 - ) - weight_2_host = ((np.random.rand(d_ff, d_model) - 0.5) * 0.1).astype( - np.float32 - ) - state_dict = { - "weight_1": weight_1_host, - "submodule.weight_2": weight_2_host, - } - - # Load model parameters - ark_model.load_state_dict(state_dict) - - # Run the ARK model - runtime.run() - - # Copy the ARK module output tensor from device to host - output_tensor_host = output_tensor.to_numpy() - - # For simplicity, we use float32 to compute the ground truth using pytorch 
- input_tensor_host_float32 = input_tensor_host.astype(np.float32) - torch_input = torch.from_numpy(input_tensor_host_float32) - + # Create torch tensors for input and weights + input_tensor = torch.randn( + batch_size, seq_len, d_model, dtype=torch.float32, device="cuda:0" + ) * 0.1 + weight_1 = torch.randn(d_model, d_ff, dtype=torch.float32, device="cuda:0") * 0.1 + weight_2 = torch.randn(d_ff, d_model, dtype=torch.float32, device="cuda:0") * 0.1 + + # Build and evaluate the ARK model + ark_model = TestModelARK(weight_1, weight_2) + output = ark_model(input_tensor).eval() + + # Compute PyTorch ground truth torch_model = TestModelPytorch() + torch_model.load_state_dict( + {"weight_1": weight_1, "submodule_weight_2": weight_2}, + strict=False, + ) + gt = torch_model(input_tensor) - # Convert the numpy.ndarray type state_dict to torch.Tensor type state_dict - torch_state_dict = convert_state_dict(state_dict, "torch") - # Load model parameters - torch_model.load_state_dict(torch_state_dict) - - # Run the pytorch model to compute the ground truth - gt = torch_model(torch_input).detach().numpy() - - # Test if the result is correct - max_error = np.max(np.abs(output_tensor_host - gt)) - avg_error = np.mean(np.abs(output_tensor_host - gt)) - - # Use ark_model.state_dict() to get the state_dict of the ARK module - # Note that the state_dict of the ARK module might be modified at the ARK kernel launch time - ark_state_dict = ark_model.state_dict() - - # Test if the parameters are the same - for k, v in state_dict.items(): - np.testing.assert_allclose(v, ark_state_dict[k]) + # Compare results + max_error = (output - gt).abs().max().item() + avg_error = (output - gt).abs().mean().item() print("ARK module test") print( - "batch_size:", - batch_size, - "seq_len:", - seq_len, - "d_model:", - d_model, - "d_ff:", - d_ff, + "batch_size:", batch_size, + "seq_len:", seq_len, + "d_model:", d_model, + "d_ff:", d_ff, ) - print("max error: ", max_error, "avg error: ", avg_error) + 
print("max error:", max_error, "avg error:", avg_error) if __name__ == "__main__": diff --git a/examples/tutorial/planner_tutorial.py b/examples/tutorial/planner_tutorial.py index 8702f8929..0701cf775 100644 --- a/examples/tutorial/planner_tutorial.py +++ b/examples/tutorial/planner_tutorial.py @@ -35,25 +35,18 @@ def forward(self, input): "NumTasks": 65536, }, ): - with ark.PlannerContext(config={"ImplType": "WarpWise"}): + with ark.PlannerContext(config={"ImplType": "WarpWise", "Tile": [1, 1]}): max = ark.reduce_max(input, axis=-1) with ark.PlannerContext(config={"Tile": [1, 2048]}): output = ark.sub(input, max) output = ark.exp(output) - with ark.PlannerContext(config={"ImplType": "WarpWise"}): + with ark.PlannerContext(config={"ImplType": "WarpWise", "Tile": [1, 1]}): sum = ark.reduce_sum(output, axis=-1) with ark.PlannerContext(config={"Tile": [1, 2048]}): output = ark.div(output, sum) return output -def eval(tensor: ark.Tensor): - with ark.Runtime() as rt: - rt.launch() - rt.run() - return tensor.to_torch() - - def perf(num_iter: int = 1000): with ark.Runtime() as rt: rt.launch() @@ -73,7 +66,7 @@ def perf(num_iter: int = 1000): output = Softmax()(ark.Tensor.from_torch(input)) - if torch.allclose(eval(output), F.softmax(input, dim=-1), atol=1e-5): + if torch.allclose(output.eval(), F.softmax(input, dim=-1), atol=1e-5): print("Correct result") else: print("Incorrect result") diff --git a/examples/tutorial/planner_tutorial_2.py b/examples/tutorial/planner_tutorial_2.py index eb9998541..e949eeb56 100644 --- a/examples/tutorial/planner_tutorial_2.py +++ b/examples/tutorial/planner_tutorial_2.py @@ -1,8 +1,8 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
-import numpy as np import ark +import torch def quickstart_tutorial(): @@ -10,9 +10,8 @@ def quickstart_tutorial(): ark.init() M, N, K = 1024, 1024, 1024 - m0 = ark.tensor([M, K], ark.fp16) - m1 = ark.tensor([N, K], ark.fp16) - m2 = ark.tensor([M, K], ark.fp16) + m0 = torch.randn(M, K, dtype=torch.float16, device="cuda:0") * 0.01 + m1 = torch.randn(N, K, dtype=torch.float16, device="cuda:0") * 0.01 # stage 1: matmul with ark.PlannerContext(processor_range=[0, 108]): @@ -20,6 +19,7 @@ def quickstart_tutorial(): t0 = ark.matmul(m0, m1, transpose_other=True) # stage 2: parallel copy and matmul + m2 = ark.tensor([M, K], ark.fp16) with ark.PlannerContext(processor_range=[0, 54]): # Use SMs 0~53 t1 = ark.matmul(t0, m1) @@ -27,27 +27,20 @@ def quickstart_tutorial(): # Use SMs 54~107 t2 = ark.copy(input=t0, output=m2) - # Initialize the ARK runtime - runtime = ark.Runtime() - - # Launch the ARK runtime - runtime.launch() - - # Initialize - m0_host = np.random.rand(M, K).astype(np.float16) * 0.01 - m0.from_numpy(m0_host) - m1_host = np.random.rand(N, K).astype(np.float16) * 0.01 - m1.from_numpy(m1_host) - - # Run the ARK program - runtime.run() + # Evaluate and check results + with ark.Runtime() as rt: + rt.launch() + rt.run() + t0_result = t0.to_torch() + t1_result = t1.to_torch() + t2_result = t2.to_torch() # Check the matmul result - res_host = np.matmul(np.matmul(m0_host, m1_host.T), m1_host) - np.testing.assert_allclose(t1.to_numpy(), res_host, rtol=1e-3, atol=1e-3) + expected = torch.matmul(torch.matmul(m0, m1.T), m1) + torch.testing.assert_close(t1_result, expected, rtol=1e-3, atol=1e-3) # Check the copy result - np.testing.assert_equal(t2.to_numpy(), t0.to_numpy()) + torch.testing.assert_close(t2_result, t0_result, atol=0, rtol=0) print("Successful!") diff --git a/examples/tutorial/quickstart_tutorial.py b/examples/tutorial/quickstart_tutorial.py index 1fce51452..d0803f917 100644 --- a/examples/tutorial/quickstart_tutorial.py +++ 
b/examples/tutorial/quickstart_tutorial.py @@ -1,7 +1,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -import numpy as np +import torch import ark @@ -10,36 +10,15 @@ def quickstart_tutorial(): ark.init() M, N = 64, 64 - # Create an input tensor - input_tensor = ark.tensor([M, N], ark.fp16) - # Create another tensor - other_tensor = ark.tensor([M, N], ark.fp16) + # Create input tensors on GPU + input_tensor = torch.randn(M, N, dtype=torch.float16, device="cuda:0") + other_tensor = torch.randn(M, N, dtype=torch.float16, device="cuda:0") - # Add the two tensors - output_tensor = ark.add(input_tensor, other_tensor) + # Add the two tensors using ARK and evaluate + output = ark.add(input_tensor, other_tensor).eval() - # Initialize the ARK runtime - runtime = ark.Runtime() - - # Launch the ARK runtime - runtime.launch() - - # Initialize the input and other tensor with random values - input_tensor_host = np.random.rand(M, N).astype(np.float16) - input_tensor.from_numpy(input_tensor_host) - other_tensor_host = np.random.rand(M, N).astype(np.float16) - other_tensor.from_numpy(other_tensor_host) - - # Run the ARK program - runtime.run() - - # Copy the output tensor from device memory to host memory, if dst is - # None, a new numpy array of the same shape as the src tensor will be returned - output_tensor_host = output_tensor.to_numpy() # Check if the output tensor is equal to the sum of the input and other tensor - np.testing.assert_allclose( - output_tensor_host, input_tensor_host + other_tensor_host - ) + torch.testing.assert_close(output, input_tensor + other_tensor, atol=0, rtol=0) print("Quickstart tutorial is successful!") diff --git a/examples/tutorial/torch_tutorial.py b/examples/tutorial/torch_tutorial.py index 38929d0b5..3f1f89d1b 100644 --- a/examples/tutorial/torch_tutorial.py +++ b/examples/tutorial/torch_tutorial.py @@ -4,10 +4,8 @@ """ Tutorial: Using ARK with PyTorch tensors. -Shows how to: -1. 
Create ARK placeholder tensors backed by PyTorch memory -2. Run ARK computation on torch-owned GPU memory -3. Read results back as PyTorch tensors +Shows how to use eval() to run ARK computation on torch tensors +and get torch tensor results directly. """ import ark @@ -19,26 +17,12 @@ x = torch.ones(64, dtype=torch.float32, device="cuda:0") * 2 y = torch.ones(64, dtype=torch.float32, device="cuda:0") * 3 -# Create ARK placeholders backed by torch memory -a = ark.placeholder([64], ark.fp32, data=x) -b = ark.placeholder([64], ark.fp32, data=y) +# Run ARK computation and get result as a torch tensor +result = ark.add(x, y).eval() +print(f"x + y = {result}") # tensor([5., 5., ...]) -# Define ARK computation -z = ark.add(a, b) - -# Launch and run -with ark.Runtime() as rt: - rt.launch() - rt.run() - - # Read result back as a torch tensor (zero-copy via DLPack) - result = z.to_torch() - print(f"x + y = {result}") # tensor([5., 5., ...]) - - # Modify torch inputs and re-run - x.fill_(10) - y.fill_(20) - rt.run() - - result = z.to_torch() - print(f"10 + 20 = {result}") # tensor([30., 30., ...]) +# Run again with different values +x = torch.ones(64, dtype=torch.float32, device="cuda:0") * 10 +y = torch.ones(64, dtype=torch.float32, device="cuda:0") * 20 +result = ark.add(x, y).eval() +print(f"10 + 20 = {result}") # tensor([30., 30., ...]) diff --git a/python/ark/__init__.py b/python/ark/__init__.py index 63480262c..31aaa5cef 100644 --- a/python/ark/__init__.py +++ b/python/ark/__init__.py @@ -7,7 +7,7 @@ os.environ["ARK_ROOT"] = os.path.abspath(os.path.dirname(__file__)) from .core import version -from .model import Model +from .model import Model, set_model, current_model, use_model __version__ = version() diff --git a/python/ark/data_type.py b/python/ark/data_type.py index fa2b2c064..0caac294a 100644 --- a/python/ark/data_type.py +++ b/python/ark/data_type.py @@ -10,6 +10,7 @@ "DataType", "fp16", "fp32", + "bf16", "int32", "uint32", "int8", diff --git 
a/python/ark/model.py b/python/ark/model.py index e103d4083..a1fd37c49 100644 --- a/python/ark/model.py +++ b/python/ark/model.py @@ -2,18 +2,19 @@ # Licensed under the MIT license. from typing import NewType +from contextlib import contextmanager from . import log from .core import CoreModel -__all__ = ["Model"] +__all__ = ["Model", "set_model", "current_model", "use_model"] ModelState = NewType("ModelState", None) class Model(CoreModel): @staticmethod - def get_model(): + def get_model() -> "Model": """ Get the underlying model. """ @@ -115,3 +116,54 @@ class ModelState: rank: int = 0 world_size: int = 1 device_id: int = 0 + + +def set_model(model: Model) -> None: + """Set the current active model. All subsequent ARK ops will be added to this model. + + Similar to ``torch.cuda.set_stream()``. + + Args: + model: The model to set as the current active model. + """ + ModelState.model = model + + +def current_model() -> Model: + """Return the current active model, creating one if none exists. + + Similar to ``torch.cuda.current_stream()``. + + Returns: + The current active model. + """ + return Model.get_model() + + +@contextmanager +def use_model(model: Model): + """Context manager to temporarily switch the active model. + + All ARK ops within the ``with`` block are added to the given model. + On exit, the previous model is restored. + + Similar to ``torch.cuda.stream()``. + + Example:: + + m1 = ark.Model() + m2 = ark.Model() + with ark.use_model(m1): + a = ark.add(x, y) # added to m1 + with ark.use_model(m2): + b = ark.mul(x, y) # added to m2 + + Args: + model: The model to use within the context. + """ + prev = ModelState.model + ModelState.model = model + try: + yield model + finally: + ModelState.model = prev diff --git a/python/ark/ops.py b/python/ark/ops.py index c0eefa2e0..3dbba5115 100644 --- a/python/ark/ops.py +++ b/python/ark/ops.py @@ -10,6 +10,13 @@ from . 
import log +def _ensure_ark(x): + """Convert a torch.Tensor to an ARK Tensor if needed.""" + if not _no_torch and isinstance(x, torch.Tensor): + return Tensor.from_torch(x) + return x + + __all__ = [ "tensor", "parameter", @@ -54,12 +61,14 @@ def is_list_or_tuple(obj): def add( - input: Union[Tensor, float], - other: Union[Tensor, float], + input: Union[Tensor, float, "torch.Tensor"], + other: Union[Tensor, float, "torch.Tensor"], output: Tensor = NullTensor, name: str = "add", ) -> Union[Tensor, float]: """ """ + input = _ensure_ark(input) + other = _ensure_ark(other) if isinstance(input, Tensor) and isinstance(other, Tensor): a = input._tensor b = other._tensor @@ -81,12 +90,13 @@ def add( def cast( - input: Tensor, + input: Union[Tensor, "torch.Tensor"], dtype: DataType, output: Tensor = NullTensor, name: str = "cast", ) -> Tensor: """ """ + input = _ensure_ark(input) if output is not NullTensor: output = output._tensor return Tensor( @@ -107,11 +117,12 @@ def constant( def copy( - input: Union[Tensor, float], + input: Union[Tensor, float, "torch.Tensor"], output: Tensor = NullTensor, name: str = "copy", ) -> Tensor: """ """ + input = _ensure_ark(input) if output is not NullTensor: output = output._tensor if isinstance(input, Tensor): @@ -120,12 +131,14 @@ def copy( def div( - input: Tensor, - other: Union[Tensor, float], + input: Union[Tensor, "torch.Tensor"], + other: Union[Tensor, float, "torch.Tensor"], output: Tensor = NullTensor, name: str = "div", ) -> Tensor: """ """ + input = _ensure_ark(input) + other = _ensure_ark(other) if output is not NullTensor: output = output._tensor if isinstance(other, Tensor): @@ -134,12 +147,14 @@ def div( def embedding( - input: Tensor, - weight: Tensor, + input: Union[Tensor, "torch.Tensor"], + weight: Union[Tensor, "torch.Tensor"], output: Tensor = NullTensor, name: str = "embedding", ) -> Tensor: """ """ + input = _ensure_ark(input) + weight = _ensure_ark(weight) if output is not NullTensor: output = output._tensor 
return Tensor( @@ -148,31 +163,34 @@ def embedding( def exp( - input: Tensor, + input: Union[Tensor, "torch.Tensor"], output: Tensor = NullTensor, name: str = "exp", ) -> Tensor: """ """ + input = _ensure_ark(input) if output is not NullTensor: output = output._tensor return Tensor(Model.get_model().exp(input._tensor, output, name)) def gelu( - input: Tensor, + input: Union[Tensor, "torch.Tensor"], output: Tensor = NullTensor, name: str = "gelu", ) -> Tensor: """ """ + input = _ensure_ark(input) if output is not NullTensor: output = output._tensor return Tensor(Model.get_model().gelu(input._tensor, output, name)) def identity( - input: Tensor, deps: List[Tensor] = [], name: str = "identity" + input: Union[Tensor, "torch.Tensor"], deps: List[Tensor] = [], name: str = "identity" ) -> Tensor: """ """ + input = _ensure_ark(input) dep_tensors = [] for dep in deps: if not isinstance(dep, Tensor): @@ -182,14 +200,16 @@ def identity( def matmul( - input: Tensor, - other: Tensor, + input: Union[Tensor, "torch.Tensor"], + other: Union[Tensor, "torch.Tensor"], output: Tensor = NullTensor, transpose_input: bool = False, transpose_other: bool = False, name: str = "matmul", ) -> Tensor: """ """ + input = _ensure_ark(input) + other = _ensure_ark(other) if output is not NullTensor: output = output._tensor return Tensor( @@ -205,12 +225,14 @@ def matmul( def mul( - input: Tensor, - other: Union[Tensor, float], + input: Union[Tensor, "torch.Tensor"], + other: Union[Tensor, float, "torch.Tensor"], output: Tensor = NullTensor, name: str = "mul", ) -> Tensor: """ """ + input = _ensure_ark(input) + other = _ensure_ark(other) if output is not NullTensor: output = output._tensor if isinstance(other, Tensor): @@ -218,8 +240,9 @@ def mul( return Tensor(Model.get_model().mul(input._tensor, other, output, name)) -def noop(input: Tensor, name: str = "noop"): +def noop(input: Union[Tensor, "torch.Tensor"], name: str = "noop"): """ """ + input = _ensure_ark(input) 
Model.get_model().noop(input._tensor, name) @@ -253,13 +276,14 @@ def placeholder( def reduce_max( - input: Tensor, + input: Union[Tensor, "torch.Tensor"], axis: int, keepdims: bool = True, output: Tensor = NullTensor, name: str = "reduce_max", ) -> Tensor: """ """ + input = _ensure_ark(input) if output is not NullTensor: output = output._tensor return Tensor( @@ -270,13 +294,14 @@ def reduce_max( def reduce_mean( - input: Tensor, + input: Union[Tensor, "torch.Tensor"], axis: int, keepdims: bool = True, output: Tensor = NullTensor, name: str = "reduce_mean", ) -> Tensor: """ """ + input = _ensure_ark(input) if output is not NullTensor: output = output._tensor return Tensor( @@ -287,13 +312,14 @@ def reduce_mean( def reduce_sum( - input: Tensor, + input: Union[Tensor, "torch.Tensor"], axis: int, keepdims: bool = True, output: Tensor = NullTensor, name: str = "reduce_sum", ) -> Tensor: """ """ + input = _ensure_ark(input) if output is not NullTensor: output = output._tensor return Tensor( @@ -304,18 +330,19 @@ def reduce_sum( def relu( - input: Tensor, + input: Union[Tensor, "torch.Tensor"], output: Tensor = NullTensor, name: str = "relu", ) -> Tensor: """ """ + input = _ensure_ark(input) if output is not NullTensor: output = output._tensor return Tensor(Model.get_model().relu(input._tensor, output, name)) def reshape( - input: Tensor, + input: Union[Tensor, "torch.Tensor"], shape: Iterable[int], allowzero: bool = False, name: str = "reshape", @@ -338,6 +365,7 @@ def reshape( "shape should be a list or tuple of integers" ) # only support tensors with up to 4 dimensions + input = _ensure_ark(input) if len(shape) > 4: raise log.InvalidUsageError( "Only support tensors with up to 4 dimensions" @@ -348,12 +376,14 @@ def reshape( def rope( - input: Tensor, - other: Tensor, + input: Union[Tensor, "torch.Tensor"], + other: Union[Tensor, "torch.Tensor"], output: Tensor = NullTensor, name: str = "rope", ) -> Tensor: """ """ + input = _ensure_ark(input) + other = 
_ensure_ark(other) if output is not NullTensor: output = output._tensor return Tensor( @@ -362,20 +392,22 @@ def rope( def rsqrt( - input: Tensor, + input: Union[Tensor, "torch.Tensor"], output: Tensor = NullTensor, name: str = "rsqrt", ) -> Tensor: """ """ + input = _ensure_ark(input) if output is not NullTensor: output = output._tensor return Tensor(Model.get_model().rsqrt(input._tensor, output, name)) def sharding( - input: Tensor, axis: int, dim_per_shard: int, name: str = "sharding" + input: Union[Tensor, "torch.Tensor"], axis: int, dim_per_shard: int, name: str = "sharding" ) -> List[Tensor]: """ """ + input = _ensure_ark(input) _tensor_list = Model.get_model().sharding( input._tensor, axis, dim_per_shard, name ) @@ -383,34 +415,38 @@ def sharding( def sigmoid( - input: Tensor, + input: Union[Tensor, "torch.Tensor"], output: Tensor = NullTensor, name: str = "sigmoid", ) -> Tensor: """ """ + input = _ensure_ark(input) if output is not NullTensor: output = output._tensor return Tensor(Model.get_model().sigmoid(input._tensor, output, name)) def sqrt( - input: Tensor, + input: Union[Tensor, "torch.Tensor"], output: Tensor = NullTensor, name: str = "sqrt", ) -> Tensor: """ """ + input = _ensure_ark(input) if output is not NullTensor: output = output._tensor return Tensor(Model.get_model().sqrt(input._tensor, output, name)) def sub( - input: Tensor, - other: Union[Tensor, float], + input: Union[Tensor, "torch.Tensor"], + other: Union[Tensor, float, "torch.Tensor"], output: Tensor = NullTensor, name: str = "sub", ) -> Tensor: """ """ + input = _ensure_ark(input) + other = _ensure_ark(other) if output is not NullTensor: output = output._tensor if isinstance(other, Tensor): @@ -436,12 +472,13 @@ def tensor( def transpose( - input: Tensor, + input: Union[Tensor, "torch.Tensor"], perm: Iterable[int], output: Tensor = NullTensor, name: str = "transpose", ) -> Tensor: """ """ + input = _ensure_ark(input) if output is not NullTensor: output = output._tensor if not 
is_list_or_tuple(perm): diff --git a/python/ark/planner.py b/python/ark/planner.py index 0ed9113e1..609282182 100644 --- a/python/ark/planner.py +++ b/python/ark/planner.py @@ -233,8 +233,10 @@ def __exit__(self, exc_type, exc_value, exc_tb): class Planner(CorePlanner): - def __init__(self, device_id: int = 0): - compressed = Model.get_model().compress() + def __init__(self, device_id: int = 0, model: "Model" = None): + if model is None: + model = Model.get_model() + compressed = model.compress() super().__init__(compressed, device_id) def install_config_rule(self, rule: Callable[[str, str], str]): diff --git a/python/ark/tensor.py b/python/ark/tensor.py index 216318b27..cd83429fc 100644 --- a/python/ark/tensor.py +++ b/python/ark/tensor.py @@ -38,6 +38,8 @@ def __init__( self._tensor: CoreTensor = _tensor self.initializer: Initializer = initializer self.requires_grad: bool = requires_grad + # Track which model this tensor belongs to for eval() + self._model: Model = Model.get_model() def __hash__(self): return self._tensor.id() @@ -283,6 +285,37 @@ def from_torch(tensor: torch.Tensor) -> "Tensor": ark_tensor.__torch_buffer__ = tensor return ark_tensor + def eval(self, stream: "torch.cuda.Stream" = None) -> torch.Tensor: + """ + Evaluate the ARK graph that produces this tensor and return the result + as a torch tensor. Creates a runtime, compiles the graph, runs it, + and returns the output via DLPack (zero-copy). + + Multiple independent ARK graphs can coexist — each tensor tracks + which model it belongs to, and eval() only runs that model. + The executor skips GPU recompilation if the plan hasn't changed. + + Args: + stream: Optional torch CUDA stream to run on. + + Returns: + torch.Tensor: The result tensor on the same device. 
+ """ + if _no_torch: + raise log.SystemError("torch is not available") + from .runtime import Runtime + from .planner import Planner + + plan = Planner(model=self._model).plan() + cuda_stream = stream.cuda_stream if stream is not None else 0 + + with Runtime() as rt: + rt.launch(plan=plan, stream=cuda_stream, loop_mode=False) + rt.run() + result = self.to_torch() + + return result + def copy( self, data: Union[np.ndarray, torch.Tensor], stream: int = 0 ) -> "Tensor": diff --git a/python/unittest/ops/conftest.py b/python/unittest/ops/conftest.py new file mode 100644 index 000000000..5073ab08c --- /dev/null +++ b/python/unittest/ops/conftest.py @@ -0,0 +1,32 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +""" +Shared fixtures and helpers for ARK op numerical tests. +""" + +import sys +import os +import pytest + +# Add parent directory to path so `common` is importable +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from common import ark + +try: + import torch + + _no_torch = False +except ImportError: + _no_torch = True + +# Skip entire ops/ directory if torch is unavailable +pytestmark = pytest.mark.skipif(_no_torch, reason="torch not available") + +DEVICE = "cuda:0" + + +@pytest.fixture(autouse=True) +def _ark_init(): + """Reset ARK state before each test so tests don't share models.""" + ark.init() diff --git a/python/unittest/ops/test_arithmetic.py b/python/unittest/ops/test_arithmetic.py new file mode 100644 index 000000000..fcdc4c37a --- /dev/null +++ b/python/unittest/ops/test_arithmetic.py @@ -0,0 +1,115 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +"""Numerical tests for arithmetic ops: add, sub, mul, div (tensor and scalar).""" + +import pytest +import torch +from conftest import ark, DEVICE + + +@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16]) +def test_add(dtype): + a = torch.randn(8192, dtype=dtype, device=DEVICE) + b = torch.randn(8192, dtype=dtype, device=DEVICE) + assert torch.allclose(ark.add(a, b).eval(), a + b, atol=0, rtol=0) + + +def test_add_broadcast(): + a = torch.randn(4, 1024, dtype=torch.float16, device=DEVICE) + b = torch.randn(1, 1024, dtype=torch.float16, device=DEVICE) + assert torch.allclose(ark.add(a, b).eval(), a + b, atol=0, rtol=0) + + +def test_add_broadcast_3d(): + a = torch.randn(3, 1, 1024, dtype=torch.float16, device=DEVICE) + b = torch.randn(1, 4, 1, dtype=torch.float16, device=DEVICE) + assert torch.allclose(ark.add(a, b).eval(), a + b, atol=0, rtol=0) + + +@pytest.mark.parametrize("dtype", [torch.float32]) +def test_sub(dtype): + a = torch.randn(8192, dtype=dtype, device=DEVICE) + b = torch.randn(8192, dtype=dtype, device=DEVICE) + assert torch.allclose(ark.sub(a, b).eval(), a - b, atol=0, rtol=0) + + +@pytest.mark.parametrize("dtype", [torch.float32, torch.float16]) +def test_mul(dtype): + a = torch.randn(8192, dtype=dtype, device=DEVICE) + b = torch.randn(8192, dtype=dtype, device=DEVICE) + assert torch.allclose(ark.mul(a, b).eval(), a * b, atol=0, rtol=0) + + +def test_mul_broadcast(): + a = torch.randn(4, 1024, dtype=torch.float16, device=DEVICE) + b = torch.randn(1, 1024, dtype=torch.float16, device=DEVICE) + assert torch.allclose(ark.mul(a, b).eval(), a * b, atol=0, rtol=0) + + +def test_mul_broadcast_3d(): + a = torch.randn(3, 1, 1024, dtype=torch.float16, device=DEVICE) + b = torch.randn(1, 4, 1, dtype=torch.float16, device=DEVICE) + assert torch.allclose(ark.mul(a, b).eval(), a * b, atol=0, rtol=0) + + +def test_div_fp32(): + a = torch.randn(8192, dtype=torch.float32, device=DEVICE) + b = torch.randn(8192, dtype=torch.float32, 
device=DEVICE).abs() + 0.01 + assert torch.allclose(ark.div(a, b).eval(), a / b, atol=0, rtol=0) + + +# ─── Scalar operations ────────────────────────────────────────────────────── + +FACTOR = 0.75 + + +@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("shape", [(4, 2, 1), (4, 2, 1024)]) +def test_scalar_mul(dtype, shape): + a = torch.randn(shape, dtype=dtype, device=DEVICE) + assert torch.allclose(ark.mul(a, FACTOR).eval(), a * FACTOR, atol=0, rtol=0) + + +@pytest.mark.parametrize("shape", [(4, 2, 1), (4, 2, 1024)]) +def test_scalar_add(shape): + a = torch.randn(shape, dtype=torch.float16, device=DEVICE) + assert torch.allclose(ark.add(a, FACTOR).eval(), a + FACTOR, atol=0, rtol=0) + + +@pytest.mark.parametrize("shape", [(4, 2, 1), (4, 2, 1024)]) +def test_scalar_sub(shape): + a = torch.randn(shape, dtype=torch.float16, device=DEVICE) + assert torch.allclose(ark.sub(a, FACTOR).eval(), a - FACTOR, atol=0, rtol=0) + + +@pytest.mark.parametrize("shape", [(4, 2, 1), (4, 2, 1024)]) +def test_scalar_div(shape): + a = torch.randn(shape, dtype=torch.float16, device=DEVICE) + assert torch.allclose( + ark.div(a, FACTOR).eval(), a / FACTOR, atol=1e-3, rtol=1e-3 + ) + + +# ─── Constant & scalar copy ───────────────────────────────────────────────── + + +def test_constant_fp16(): + out = ark.constant(7, (4, 2, 50), ark.fp16).eval() + assert (out == 7).all() + + +def test_constant_fp32(): + out = ark.constant(7, (1,), ark.fp32).eval() + assert out.item() == 7.0 + + +def test_copy_scalar_fp16(): + t = torch.zeros(4, 2, 50, dtype=torch.float16, device=DEVICE) + out = ark.copy(7.0, ark.Tensor.from_torch(t)).eval() + assert (out == 7).all() + + +def test_copy_scalar_fp32(): + out = ark.copy(7.0).eval() + assert out.item() == 7.0 diff --git a/python/unittest/ops/test_cast.py b/python/unittest/ops/test_cast.py new file mode 100644 index 000000000..fe84ff755 --- /dev/null +++ b/python/unittest/ops/test_cast.py @@ -0,0 +1,27 @@ 
+# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +"""Numerical tests for cast op.""" + +import pytest +import torch +from conftest import ark, DEVICE + + +@pytest.mark.parametrize( + "src_dtype, dst_dtype, ark_dst", + [ + (torch.float16, torch.float32, ark.fp32), + (torch.float32, torch.float16, ark.fp16), + (torch.float32, torch.int32, ark.int32), + (torch.int32, torch.float32, ark.fp32), + (torch.bfloat16, torch.float32, ark.fp32), + (torch.float32, torch.bfloat16, ark.bf16), + ], +) +def test_cast(src_dtype, dst_dtype, ark_dst): + a = torch.randn(4, 2, 1024, dtype=torch.float32, device=DEVICE).to(src_dtype) + result = ark.cast(a, ark_dst).eval() + expected = a.to(dst_dtype) + assert result.dtype == dst_dtype + assert torch.allclose(result, expected, atol=0, rtol=0) diff --git a/python/unittest/ops/test_composite.py b/python/unittest/ops/test_composite.py new file mode 100644 index 000000000..a40ba913f --- /dev/null +++ b/python/unittest/ops/test_composite.py @@ -0,0 +1,33 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +"""Numerical tests for composite ops: softmax, layernorm.""" + +import pytest +import torch +import torch.nn.functional as F +from conftest import ark, DEVICE + + +@pytest.mark.parametrize("dtype", [torch.float32, torch.float16]) +def test_softmax(dtype): + shape = (4, 8, 256) + a = torch.randn(shape, dtype=dtype, device=DEVICE) + result = ark.softmax(a).eval() + expected = F.softmax(a, dim=-1) + atol = 1e-5 if dtype == torch.float32 else 1e-3 + assert torch.allclose(result, expected, atol=atol, rtol=1e-3), ( + f"max_diff={(result - expected).abs().max()}" + ) + + +def test_layernorm(): + shape = (4, 8, 256) + a = torch.randn(shape, dtype=torch.float32, device=DEVICE) + result = ark.layernorm(a, eps=1e-6).eval() + mean = a.mean(dim=-1, keepdim=True) + var = ((a - mean) ** 2).mean(dim=-1, keepdim=True) + expected = (a - mean) / torch.sqrt(var + 1e-6) + assert torch.allclose(result, expected, atol=1e-4, rtol=1e-4), ( + f"max_diff={(result - expected).abs().max()}" + ) diff --git a/python/unittest/ops/test_embedding_rope.py b/python/unittest/ops/test_embedding_rope.py new file mode 100644 index 000000000..f8c9ea701 --- /dev/null +++ b/python/unittest/ops/test_embedding_rope.py @@ -0,0 +1,45 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +"""Numerical tests for embedding and rope ops.""" + +import pytest +import torch +import torch.nn.functional as F +from conftest import ark, DEVICE + + +@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16]) +def test_embedding(dtype): + vocab_size, embed_dim = 100, 64 + indices = torch.randint(0, vocab_size, (4, 8), device=DEVICE).to(torch.int32) + weight = torch.randn(vocab_size, embed_dim, dtype=dtype, device=DEVICE) + result = ark.embedding(indices, weight).eval() + expected = F.embedding(indices, weight) + assert torch.allclose(result, expected, atol=0, rtol=0), ( + f"max_diff={(result - expected).abs().max()}" + ) + + +@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16]) +def test_rope(dtype): + """Test rotary positional embedding against PyTorch complex-multiply reference. + ARK's rope computes element-wise complex multiplication on consecutive pairs: + c[2k] = a[2k]*b[2k] - a[2k+1]*b[2k+1] + c[2k+1] = a[2k]*b[2k+1] + a[2k+1]*b[2k] + """ + shape = (1, 1, 8, 64) + x = torch.randn(shape, dtype=dtype, device=DEVICE) + other = torch.randn(shape, dtype=dtype, device=DEVICE) + result = ark.rope(x, other).eval() + # PyTorch reference: complex multiply on paired elements + a = x.reshape(*shape[:-1], -1, 2) + b = other.reshape(*shape[:-1], -1, 2) + expected = torch.stack([ + a[..., 0] * b[..., 0] - a[..., 1] * b[..., 1], + a[..., 0] * b[..., 1] + a[..., 1] * b[..., 0], + ], dim=-1).reshape(shape) + atol = 1e-5 if dtype == torch.float32 else 5e-2 + assert torch.allclose(result, expected, atol=atol, rtol=1e-3), ( + f"max_diff={(result - expected).abs().max()}" + ) diff --git a/python/unittest/ops/test_math.py b/python/unittest/ops/test_math.py new file mode 100644 index 000000000..000845858 --- /dev/null +++ b/python/unittest/ops/test_math.py @@ -0,0 +1,46 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +"""Numerical tests for unary math ops: exp, gelu, relu, sigmoid, sqrt, rsqrt.""" + +import pytest +import torch +import torch.nn.functional as F +from conftest import ark, DEVICE + + +@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16]) +def test_exp(dtype): + a = torch.randn(4, 2, 1024, dtype=dtype, device=DEVICE) + atol = 1e-5 if dtype == torch.float32 else 1e-2 + assert torch.allclose(ark.exp(a).eval(), torch.exp(a), atol=atol, rtol=0) + + +@pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16]) +def test_gelu(dtype): + a = torch.randn(4, 2, 1024, dtype=dtype, device=DEVICE) + atol = 1e-5 if dtype == torch.float32 else 1e-2 + assert torch.allclose(ark.gelu(a).eval(), F.gelu(a, approximate="tanh"), atol=atol, rtol=0) + + +@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16]) +def test_relu(dtype): + a = torch.randn(4, 2, 1024, dtype=dtype, device=DEVICE) + assert torch.allclose(ark.relu(a).eval(), F.relu(a), atol=0, rtol=0) + + +@pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16]) +def test_sigmoid(dtype): + a = torch.randn(4, 2, 1024, dtype=dtype, device=DEVICE) + atol = 1e-5 if dtype == torch.float32 else 1e-2 + assert torch.allclose(ark.sigmoid(a).eval(), torch.sigmoid(a), atol=atol, rtol=0) + + +def test_sqrt_fp32(): + a = torch.rand(4, 2, 1024, dtype=torch.float32, device=DEVICE) + 0.01 + assert torch.allclose(ark.sqrt(a).eval(), torch.sqrt(a), atol=1e-6, rtol=0) + + +def test_rsqrt_fp32(): + a = torch.rand(4, 2, 1024, dtype=torch.float32, device=DEVICE) + 0.01 + assert torch.allclose(ark.rsqrt(a).eval(), torch.rsqrt(a), atol=1e-4, rtol=0) diff --git a/python/unittest/ops/test_matmul.py b/python/unittest/ops/test_matmul.py new file mode 100644 index 000000000..1855f7636 --- /dev/null +++ b/python/unittest/ops/test_matmul.py @@ -0,0 +1,65 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +"""Numerical tests for matmul: NN, NT, TN, TT, batched.""" + +import pytest +import torch +from conftest import ark, DEVICE + + +@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16]) +def test_matmul_nn(dtype): + M, N, K = 256, 256, 512 + a = torch.randn(M, K, dtype=dtype, device=DEVICE) + b = torch.randn(K, N, dtype=dtype, device=DEVICE) + result = ark.matmul(a, b).eval() + expected = a @ b + atol = 1e-3 if dtype == torch.float32 else 1e-1 + assert torch.allclose(result, expected, atol=atol, rtol=1e-2), ( + f"max_diff={(result - expected).abs().max()}" + ) + + +def test_matmul_nt(): + M, N, K = 256, 256, 512 + a = torch.randn(M, K, dtype=torch.float16, device=DEVICE) + b = torch.randn(N, K, dtype=torch.float16, device=DEVICE) + result = ark.matmul(a, b, transpose_other=True).eval() + expected = a @ b.t() + assert torch.allclose(result, expected, atol=1e-1, rtol=1e-2), ( + f"max_diff={(result - expected).abs().max()}" + ) + + +def test_matmul_tn(): + M, N, K = 256, 256, 512 + a = torch.randn(K, M, dtype=torch.float16, device=DEVICE) + b = torch.randn(K, N, dtype=torch.float16, device=DEVICE) + result = ark.matmul(a, b, transpose_input=True).eval() + expected = a.t() @ b + assert torch.allclose(result, expected, atol=1e-1, rtol=1e-2), ( + f"max_diff={(result - expected).abs().max()}" + ) + + +def test_matmul_tt(): + M, N, K = 256, 256, 512 + a = torch.randn(K, M, dtype=torch.float16, device=DEVICE) + b = torch.randn(N, K, dtype=torch.float16, device=DEVICE) + result = ark.matmul(a, b, transpose_input=True, transpose_other=True).eval() + expected = a.t() @ b.t() + assert torch.allclose(result, expected, atol=1e-1, rtol=1e-2), ( + f"max_diff={(result - expected).abs().max()}" + ) + + +def test_matmul_batched(): + B, M, N, K = 4, 256, 256, 512 + a = torch.randn(B, M, K, dtype=torch.float16, device=DEVICE) + b = torch.randn(B, K, N, dtype=torch.float16, device=DEVICE) + result = ark.matmul(a, b).eval() + expected = a @ b + assert 
torch.allclose(result, expected, atol=3e-1, rtol=1e-2), ( + f"max_diff={(result - expected).abs().max()}" + ) diff --git a/python/unittest/ops/test_reduce.py b/python/unittest/ops/test_reduce.py new file mode 100644 index 000000000..5ab8efae8 --- /dev/null +++ b/python/unittest/ops/test_reduce.py @@ -0,0 +1,46 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +"""Numerical tests for reduce ops: reduce_sum, reduce_max, reduce_mean.""" + +import pytest +import torch +from conftest import ark, DEVICE + + +@pytest.mark.parametrize("axis", [0, 1, 2, 3]) +def test_reduce_sum_fp32(axis): + shape = [7, 2, 4, 1024] + a = torch.randn(shape, dtype=torch.float32, device=DEVICE) * 0.1 + result = ark.reduce_sum(a, axis=axis).eval() + expected = torch.sum(a, dim=axis, keepdim=True) + atol = shape[axis] * 1e-5 + assert torch.allclose(result, expected, atol=atol, rtol=1e-4), ( + f"axis={axis}, max_diff={(result - expected).abs().max()}" + ) + + +@pytest.mark.parametrize("axis", [0, 3]) +def test_reduce_sum_fp16(axis): + shape = [7, 2, 4, 1024] + a = torch.randn(shape, dtype=torch.float16, device=DEVICE) * 0.1 + result = ark.reduce_sum(a, axis=axis).eval() + expected = torch.sum(a, dim=axis, keepdim=True) + atol = shape[axis] * 2e-2 + assert torch.allclose(result, expected, atol=atol, rtol=1e-2), ( + f"axis={axis}, max_diff={(result - expected).abs().max()}" + ) + + +def test_reduce_max_fp32(): + a = torch.randn(1, 1, 2, 8192, dtype=torch.float32, device=DEVICE) + result = ark.reduce_max(a, axis=-1).eval() + expected = torch.max(a, dim=-1, keepdim=True).values + assert torch.allclose(result, expected, atol=0, rtol=0) + + +def test_reduce_mean_fp32(): + a = torch.randn(1, 1, 2, 8192, dtype=torch.float32, device=DEVICE) * 0.1 + result = ark.reduce_mean(a, axis=-1).eval() + expected = torch.mean(a, dim=-1, keepdim=True) + assert torch.allclose(result, expected, atol=1e-4, rtol=1e-4) diff --git a/python/unittest/ops/test_transpose.py 
b/python/unittest/ops/test_transpose.py new file mode 100644 index 000000000..156c4ee9b --- /dev/null +++ b/python/unittest/ops/test_transpose.py @@ -0,0 +1,26 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +"""Numerical tests for transpose op.""" + +import pytest +import torch +from conftest import ark, DEVICE + + +@pytest.mark.parametrize( + "perm, shape", + [ + ([0, 1, 3, 2], [2, 3, 64, 128]), + ([0, 2, 3, 1], [2, 3, 64, 128]), + ([0, 2, 1, 3], [2, 3, 64, 128]), + ], +) +@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16]) +def test_transpose(perm, shape, dtype): + a = torch.randn(shape, dtype=dtype, device=DEVICE) + result = ark.transpose(a, perm).eval() + expected = a.permute(perm).contiguous() + assert torch.allclose(result, expected, atol=0, rtol=0), ( + f"max_diff={(result - expected).abs().max()}" + ) diff --git a/python/unittest/test_eval.py b/python/unittest/test_eval.py new file mode 100644 index 000000000..7adf5126e --- /dev/null +++ b/python/unittest/test_eval.py @@ -0,0 +1,128 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +"""Test that Tensor.eval() correctly reuses compiled plans and recompiles +when the model graph changes.""" + +import pytest +import torch +import ark + +DEVICE = "cuda:0" + + +@pytest.fixture(autouse=True) +def _ark_init(): + """Reset ARK state before each test.""" + ark.init() + + +def _get_compiled_plan(): + """Return the plan string currently compiled in the executor.""" + from ark.executor import Executor + return Executor.get().plan() + + +def test_eval_same_structure_produces_correct_results(): + """Two eval() calls on same-shaped graphs should both produce correct results. 
+ Note: the plan strings may differ (different tensor IDs), but the executor's + file-level compile cache avoids redundant nvcc invocations.""" + a = torch.ones(64, dtype=torch.float32, device=DEVICE) * 3.0 + b = torch.ones(64, dtype=torch.float32, device=DEVICE) * 4.0 + + r1 = ark.add(a, b).eval() + assert torch.allclose(r1, a + b) + + r2 = ark.add(a, b).eval() + assert torch.allclose(r2, a + b) + + +def test_eval_recompile_on_different_graph(): + """A different graph should produce a different plan → triggers recompile.""" + a = torch.ones(64, dtype=torch.float32, device=DEVICE) * 2.0 + b = torch.ones(64, dtype=torch.float32, device=DEVICE) * 3.0 + + # Graph 1: add + r1 = ark.add(a, b).eval() + plan1 = _get_compiled_plan() + assert torch.allclose(r1, a + b) + + # Graph 2: mul (different op → different plan) + r2 = ark.mul(a, b).eval() + plan2 = _get_compiled_plan() + assert torch.allclose(r2, a * b) + + assert plan1 != plan2, ( + "Different graph structure should produce a different plan" + ) + + +def test_eval_recompile_on_graph_update(): + """Building more ops on top of a previously eval'd graph should + recompile and produce correct results.""" + ark.init() + a = torch.ones(64, dtype=torch.float32, device=DEVICE) * 2.0 + b = torch.ones(64, dtype=torch.float32, device=DEVICE) * 3.0 + + # Step 1: build c = a + b, eval + c = ark.add(a, b) + r1 = c.eval() + plan1 = _get_compiled_plan() + assert torch.allclose(r1, a + b) + + # Step 2: extend the SAME graph with d = c + a, eval + # c is still a valid ARK tensor in the same model + d = ark.add(c, a) + r2 = d.eval() + plan2 = _get_compiled_plan() + assert torch.allclose(r2, (a + b) + a) + + # The plan must have changed (graph grew from 1 op to 2 ops) + assert plan1 != plan2, ( + "Extending the graph should produce a different plan and recompile" + ) + + +def test_eval_with_torch_stream(): + """eval() with a torch.cuda.Stream should correctly interleave with + torch operations on the same stream across multiple 
iterations.""" + s = torch.cuda.Stream() + x = torch.ones(64, dtype=torch.float32, device=DEVICE) + + for i in range(5): + # torch op on the stream: x = x * 2 + with torch.cuda.stream(s): + x = x * 2 + # ARK op on the same stream: x = x + 1 + x = ark.add(x, 1.0).eval(stream=s) + + s.synchronize() + # Each iteration: x = x * 2 + 1 + # Starting from 1: 3, 7, 15, 31, 63 + expected = torch.full((64,), 63.0, dtype=torch.float32, device=DEVICE) + assert torch.allclose(x, expected) + + +def test_eval_chain_with_intermediate_read(): + """Build a chain of dependent ARK ops, eval() the final tensor, + then verify an intermediate tensor also has the correct value.""" + a = torch.ones(64, dtype=torch.float32, device=DEVICE) * 2.0 + + # Chain: b = a + 3 -> c = b * 4 -> d = c - 1 + b = ark.add(a, 3.0) + c = ark.mul(b, 4.0) + d = ark.sub(c, 1.0) + + # Only eval the final tensor + result = d.eval() + + # Final: (2+3)*4 - 1 = 19 + assert torch.allclose(result, torch.full((64,), 19.0, device=DEVICE)) + + # Intermediate b should also be materialized: 2+3 = 5 + b_val = b.to_torch() + assert torch.allclose(b_val, torch.full((64,), 5.0, device=DEVICE)) + + # Intermediate c should also be materialized: 5*4 = 20 + c_val = c.to_torch() + assert torch.allclose(c_val, torch.full((64,), 20.0, device=DEVICE)) From b99cffc06bfff7ec4c0e117fb085edf8840edf41 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Sat, 4 Apr 2026 04:40:13 +0000 Subject: [PATCH 56/61] update workflow --- .github/workflows/ut-cuda.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ut-cuda.yml b/.github/workflows/ut-cuda.yml index 10b0679da..b3a3993ef 100644 --- a/.github/workflows/ut-cuda.yml +++ b/.github/workflows/ut-cuda.yml @@ -75,7 +75,7 @@ jobs: --cov=python/ark \ --cov-report lcov:py_coverage.info \ --verbose \ - ../python/unittest/test.py + ../python/unittest/ - name: Report Coverage env: From 135fef2c578a95dd61d98d5a5a6e80f82d9457f7 Mon Sep 17 00:00:00 2001 From: Changho Hwang 
Date: Sat, 4 Apr 2026 05:20:50 +0000 Subject: [PATCH 57/61] update workflow --- .github/workflows/ut-cuda.yml | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/.github/workflows/ut-cuda.yml b/.github/workflows/ut-cuda.yml index b3a3993ef..0b6759de0 100644 --- a/.github/workflows/ut-cuda.yml +++ b/.github/workflows/ut-cuda.yml @@ -11,38 +11,34 @@ on: jobs: UnitTest: - runs-on: [ self-hosted, A100 ] + runs-on: [ self-hosted ] defaults: run: shell: bash - timeout-minutes: 30 + timeout-minutes: 60 strategy: matrix: - cuda: [ cuda11.8, cuda12.2 ] + include: + - cuda: cuda12.8 + container: nvcr.io/nvidia/pytorch:25.01-py3 concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.cuda }} cancel-in-progress: true container: - image: "ghcr.io/microsoft/ark/ark:base-dev-${{ matrix.cuda }}" + image: ${{ matrix.container }} options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1 steps: - name: Checkout uses: actions/checkout@v4 - - name: LockGPUClock - run: | - sudo nvidia-smi -pm 1 - for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do - sudo nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i - done - - name: Dubious ownership exception run: | git config --global --add safe.directory /__w/ark/ark - name: Build run: | + apt-get update && apt-get install -y lcov mkdir build && cd build cmake -DCMAKE_BUILD_TYPE=Debug .. 
make -j ut ark_py From a3d90d0f9fe0bfb881c54e3d9d0a2ecb5a923c7d Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Sat, 4 Apr 2026 08:09:24 +0000 Subject: [PATCH 58/61] lint --- .github/workflows/lint.yml | 15 ++-- ark/api/context.cpp | 4 +- ark/api/planner.cpp | 20 ++--- ark/api/planner_test.cpp | 34 ++++----- ark/context_impl.cpp | 4 +- ark/context_impl.hpp | 3 +- ark/gpu/gpu.hpp | 6 +- ark/include/ark.hpp | 2 +- ark/include/ark/executor.hpp | 5 +- ark/include/ark/planner.hpp | 4 +- ark/include/ark/tensor.hpp | 4 +- ark/include/kernels/comm.h | 3 +- ark/include/kernels/common/arch.h | 4 +- ark/include/kernels/common/broadcast.h | 79 ++++++++------------ ark/include/kernels/common/vector_type.h | 28 +++---- ark/include/kernels/gemm_ck.h | 95 +++++++++++++----------- ark/include/kernels/reduce.h | 45 ++++++----- ark/model/model_buffer.cpp | 3 +- ark/model/model_context_manager.cpp | 4 +- ark/model/model_graph_impl.hpp | 2 +- ark/model/model_op.hpp | 4 +- ark/model/model_tensor.cpp | 8 +- ark/ops/ops_all_reduce_test.cpp | 13 +++- ark/ops/ops_broadcast.cpp | 16 ++-- ark/ops/ops_communication.cpp | 69 +++++++++-------- ark/ops/ops_communication.hpp | 1 - ark/ops/ops_communication_test.cpp | 6 +- ark/ops/ops_embedding.cpp | 8 +- ark/ops/ops_noop.cpp | 4 +- ark/ops/ops_reduce.cpp | 10 +-- ark/ops/ops_scalar.cpp | 16 ++-- ark/ops/ops_test_common.cpp | 13 ++-- ark/ops/ops_test_common.hpp | 13 ++-- ark/ops/ops_transpose.cpp | 8 +- ark/unittest/unittest_utils.cpp | 8 ++ ark/unittest/unittest_utils.h | 11 +++ python/tensor_py.cpp | 2 +- tools/lint.sh | 74 ++++++++++++++++++ 38 files changed, 367 insertions(+), 281 deletions(-) create mode 100755 tools/lint.sh diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index c799e86c6..0fe0cf826 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -7,7 +7,7 @@ on: jobs: linters: - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest steps: - name: Check out Git repository @@ -16,22 +16,19 @@ jobs: - 
name: Install ClangFormat run: sudo apt-get install -y clang-format - - name: Run git-clang-format - run: git clang-format --style=file --diff - - name: Set up Python uses: actions/setup-python@v4 with: - python-version: 3.8 + python-version: '3.12' - name: Install Python dependencies - run: python3.8 -m pip install black + run: pip install black - - name: Run black - run: python3.8 -m black --check --config pyproject.toml . + - name: Run lint + run: bash tools/lint.sh dry spelling: - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest steps: - name: Check out Git repository diff --git a/ark/api/context.cpp b/ark/api/context.cpp index 702247ddf..087e0e7c9 100644 --- a/ark/api/context.cpp +++ b/ark/api/context.cpp @@ -29,8 +29,6 @@ void Context::set(const std::string& key, const std::string& value, this->impl_->set(key, value_json, type); } -std::string Context::dump() const { - return this->impl_->dump().dump(); -} +std::string Context::dump() const { return this->impl_->dump().dump(); } } // namespace ark diff --git a/ark/api/planner.cpp b/ark/api/planner.cpp index d7e96e957..c48e19c50 100644 --- a/ark/api/planner.cpp +++ b/ark/api/planner.cpp @@ -211,8 +211,8 @@ std::string Planner::Impl::plan(bool pretty) const { Dims tile(trim_leading_ones); std::stringstream ss; - ss << "Result shape is not divided by tile " - << tile << ". Op: " << op->serialize().dump(); + ss << "Result shape is not divided by tile " << tile + << ". Op: " << op->serialize().dump(); auto not_divided_error = ss.str(); auto &result_shape = result_tensors[0]->padded_shape(); @@ -224,11 +224,10 @@ std::string Planner::Impl::plan(bool pretty) const { max_num_tasks = 1; for (int i = 0; i < tile4.ndims(); i++) { if (tile4[i] == 0) { - ERR(PlanError, "Tile dimension is zero. Op: ", - op->serialize().dump()); + ERR(PlanError, + "Tile dimension is zero. 
Op: ", op->serialize().dump()); } - max_num_tasks *= - (result_shape4[i] + tile4[i] - 1) / tile4[i]; + max_num_tasks *= (result_shape4[i] + tile4[i] - 1) / tile4[i]; } if (max_num_tasks == 0) ERR(InternalError, "max_num_tasks == 0"); } @@ -328,10 +327,13 @@ std::string Planner::Impl::plan(bool pretty) const { max_processor_id = std::max(max_processor_id, num_processors); } else if (processor_group_root == -1) { processor_group_root = ctx_processor_range_list.front()[0]; - processor_group["ProcessorRange"] = ctx_processor_range_list.front()[1]; - resource_group["ProcessorRange"] = ctx_processor_range_list.back()[1]; + processor_group["ProcessorRange"] = + ctx_processor_range_list.front()[1]; + resource_group["ProcessorRange"] = + ctx_processor_range_list.back()[1]; max_processor_id = std::max( - max_processor_id, ctx_processor_range_list.front()[1][1].get()); + max_processor_id, + ctx_processor_range_list.front()[1][1].get()); } else { new_processor_group = false; resource_group["ProcessorRange"] = diff --git a/ark/api/planner_test.cpp b/ark/api/planner_test.cpp index 7507ea023..e557ee307 100644 --- a/ark/api/planner_test.cpp +++ b/ark/api/planner_test.cpp @@ -87,8 +87,9 @@ ark::unittest::State test_planner_context_processor_range() { auto t = model.add(t0, t1); tensors.push_back(t); - UNITTEST_EQ(ctx.get("ProcessorRange"), - ark::Json({subctx.id(), {0 * (int)i, 2 * (int)i}}).dump()); + UNITTEST_EQ( + ctx.get("ProcessorRange"), + ark::Json({subctx.id(), {0 * (int)i, 2 * (int)i}}).dump()); } UNITTEST_TRUE(model.verify()); @@ -131,15 +132,13 @@ ark::unittest::State test_planner_context_warp_range() { ctx.warp_range(0, 4); t3 = model.relu(t2); - UNITTEST_EQ(ctx.get("WarpRange"), - ark::Json({ctx.id(), {0, 4}}).dump()); + UNITTEST_EQ(ctx.get("WarpRange"), ark::Json({ctx.id(), {0, 4}}).dump()); // node 2 ctx.warp_range(2, 4); t4 = model.sqrt(t3); - UNITTEST_EQ(ctx.get("WarpRange"), - ark::Json({ctx.id(), {2, 4}}).dump()); + UNITTEST_EQ(ctx.get("WarpRange"), 
ark::Json({ctx.id(), {2, 4}}).dump()); // Invalid usage: range (0, 4) is out of previous range (2, 4) UNITTEST_THROW(ctx.warp_range(0, 4), ark::PlanError); @@ -197,15 +196,13 @@ ark::unittest::State test_planner_context_sram_range() { ctx.sram_range(0, 4); t3 = model.relu(t2); - UNITTEST_EQ(ctx.get("SramRange"), - ark::Json({ctx.id(), {0, 4}}).dump()); + UNITTEST_EQ(ctx.get("SramRange"), ark::Json({ctx.id(), {0, 4}}).dump()); // node 2 ctx.sram_range(2, 4); t4 = model.sqrt(t3); - UNITTEST_EQ(ctx.get("SramRange"), - ark::Json({ctx.id(), {2, 4}}).dump()); + UNITTEST_EQ(ctx.get("SramRange"), ark::Json({ctx.id(), {2, 4}}).dump()); // Invalid usage: range (0, 4) is out of previous range (2, 4) UNITTEST_THROW(ctx.sram_range(0, 4), ark::PlanError); @@ -263,15 +260,13 @@ ark::unittest::State test_planner_context_sync() { ctx.sync(false); t3 = model.relu(t2); - UNITTEST_EQ(ctx.get("Sync"), - ark::Json({ctx.id(), false}).dump()); + UNITTEST_EQ(ctx.get("Sync"), ark::Json({ctx.id(), false}).dump()); // node 2 ctx.sync(true); t4 = model.sqrt(t3); - UNITTEST_EQ(ctx.get("Sync"), - ark::Json({ctx.id(), true}).dump()); + UNITTEST_EQ(ctx.get("Sync"), ark::Json({ctx.id(), true}).dump()); } { // node 3 @@ -280,8 +275,7 @@ ark::unittest::State test_planner_context_sync() { ctx.sync(true); t5 = model.exp(t2); - UNITTEST_EQ(ctx.get("Sync"), - ark::Json({ctx.id(), true}).dump()); + UNITTEST_EQ(ctx.get("Sync"), ark::Json({ctx.id(), true}).dump()); } UNITTEST_TRUE(model.verify()); @@ -297,8 +291,9 @@ ark::unittest::State test_planner_context_sync() { UNITTEST_EQ(nodes[1]->context.at("Sync"), ark::Json({{sync_id_1, true}, {sync_id_1, false}})); UNITTEST_GE(nodes[2]->context.size(), 1); - UNITTEST_EQ(nodes[2]->context.at("Sync"), - ark::Json({{sync_id_1, true}, {sync_id_1, false}, {sync_id_1, true}})); + UNITTEST_EQ( + nodes[2]->context.at("Sync"), + ark::Json({{sync_id_1, true}, {sync_id_1, false}, {sync_id_1, true}})); UNITTEST_GE(nodes[3]->context.size(), 1); 
UNITTEST_EQ(nodes[3]->context.at("Sync"), ark::Json({{sync_id_2, true}, {sync_id_2, true}})); @@ -361,7 +356,8 @@ ark::unittest::State test_planner_context_config() { ark::Json({{cfg_id_1, {{"key0", "val1"}}}})); UNITTEST_GE(nodes[2]->context.size(), 1); UNITTEST_EQ(nodes[2]->context.at("Config"), - ark::Json({{cfg_id_1, {{"key0", "val1"}}}, {cfg_id_1, {{"key1", "val2"}}}})); + ark::Json({{cfg_id_1, {{"key0", "val1"}}}, + {cfg_id_1, {{"key1", "val2"}}}})); UNITTEST_GE(nodes[3]->context.size(), 1); UNITTEST_EQ(nodes[3]->context.at("Config"), ark::Json({{cfg_id_2, {{"key2", "val3"}}}})); diff --git a/ark/context_impl.cpp b/ark/context_impl.cpp index c4f95f2c3..0eca1bf0e 100644 --- a/ark/context_impl.cpp +++ b/ark/context_impl.cpp @@ -52,8 +52,6 @@ bool Context::Impl::has(const std::string& key) const { return context_manager_->has(key); } -Json Context::Impl::dump() const { - return context_manager_->dump(); -} +Json Context::Impl::dump() const { return context_manager_->dump(); } } // namespace ark diff --git a/ark/context_impl.hpp b/ark/context_impl.hpp index b79353296..cf1509167 100644 --- a/ark/context_impl.hpp +++ b/ark/context_impl.hpp @@ -17,7 +17,8 @@ class Context::Impl { Json get(const std::string& key) const; - void set(const std::string& key, const Json& value_json, ContextType type = ContextType::Overwrite); + void set(const std::string& key, const Json& value_json, + ContextType type = ContextType::Overwrite); bool has(const std::string& key) const; diff --git a/ark/gpu/gpu.hpp b/ark/gpu/gpu.hpp index dbcd50f3e..fe1bf07bb 100644 --- a/ark/gpu/gpu.hpp +++ b/ark/gpu/gpu.hpp @@ -21,7 +21,7 @@ constexpr auto alias = cuda_const; #define ARK_GPU_DEFINE_FUNC_ALIAS(alias, cuda_func, rocm_func) \ template \ - inline auto alias(Args &&... 
args) { \ + inline auto alias(Args &&...args) { \ return cuda_func(std::forward(args)...); \ } @@ -35,7 +35,7 @@ constexpr auto alias = rocm_const; #define ARK_GPU_DEFINE_FUNC_ALIAS(alias, cuda_func, rocm_func) \ template \ - inline auto alias(Args &&... args) { \ + inline auto alias(Args &&...args) { \ return rocm_func(std::forward(args)...); \ } @@ -148,6 +148,8 @@ ARK_GPU_DEFINE_FUNC_ALIAS(gpuMemcpy, cudaMemcpy, hipMemcpy); ARK_GPU_DEFINE_FUNC_ALIAS(gpuMemcpyAsync, cudaMemcpyAsync, hipMemcpyAsync); ARK_GPU_DEFINE_FUNC_ALIAS(gpuMemsetAsync, cudaMemsetAsync, hipMemsetAsync); ARK_GPU_DEFINE_FUNC_ALIAS(gpuSetDevice, cudaSetDevice, hipSetDevice); +ARK_GPU_DEFINE_FUNC_ALIAS(gpuGetDeviceCount, cudaGetDeviceCount, + hipGetDeviceCount); ARK_GPU_DEFINE_FUNC_ALIAS(gpuStreamCreateWithFlags, cudaStreamCreateWithFlags, hipStreamCreateWithFlags); ARK_GPU_DEFINE_FUNC_ALIAS(gpuStreamDestroy, cudaStreamDestroy, diff --git a/ark/include/ark.hpp b/ark/include/ark.hpp index b1955bf9c..90f23b2f1 100644 --- a/ark/include/ark.hpp +++ b/ark/include/ark.hpp @@ -14,9 +14,9 @@ #include #include #include +#include #include #include -#include #include #include #include diff --git a/ark/include/ark/executor.hpp b/ark/include/ark/executor.hpp index 2e97ffe78..765cd0f27 100644 --- a/ark/include/ark/executor.hpp +++ b/ark/include/ark/executor.hpp @@ -52,9 +52,8 @@ class Executor { bool record = false); /// Run the executor for `iter` iterations. - void run( - int iter, - const std::unordered_map &placeholder_data = {}); + void run(int iter, + const std::unordered_map &placeholder_data = {}); /// Wait for the previous run to finish. 
void wait(int64_t max_spin_count = -1); diff --git a/ark/include/ark/planner.hpp b/ark/include/ark/planner.hpp index 9547848b9..b34acbc39 100644 --- a/ark/include/ark/planner.hpp +++ b/ark/include/ark/planner.hpp @@ -38,8 +38,8 @@ class Planner { ~Planner(); - using ConfigRule = std::function; + using ConfigRule = std::function; void install_config_rule(ConfigRule rule); diff --git a/ark/include/ark/tensor.hpp b/ark/include/ark/tensor.hpp index aa8dcaa68..67eda64ae 100644 --- a/ark/include/ark/tensor.hpp +++ b/ark/include/ark/tensor.hpp @@ -69,9 +69,7 @@ std::ostream &operator<<(std::ostream &os, const Tensor &tensor); namespace std { template <> struct hash { - size_t operator()(const ark::Tensor &t) const noexcept { - return t.id(); - } + size_t operator()(const ark::Tensor &t) const noexcept { return t.id(); } }; } // namespace std diff --git a/ark/include/kernels/comm.h b/ark/include/kernels/comm.h index 9075bb728..4a2deca80 100644 --- a/ark/include/kernels/comm.h +++ b/ark/include/kernels/comm.h @@ -414,8 +414,7 @@ DEVICE void read_reduce_and_write( DataType, NelemPerThread, Rank, NPeers, nelems_per_rank>>::run(dst, src, scratch, peer_offsets, uop_idx); - } - else { + } else { PacketType *scratch = reinterpret_cast(scratch_base); comm::PacketReduce< OutDims, OutShape, UnitOutDims, NumWarps, SmemBytes, PacketType, diff --git a/ark/include/kernels/common/arch.h b/ark/include/kernels/common/arch.h index e268ad78c..7eff95c7b 100644 --- a/ark/include/kernels/common/arch.h +++ b/ark/include/kernels/common/arch.h @@ -32,13 +32,13 @@ DEVICE int warp_id() { #if defined(ARK_TARGET_CUDA_ARCH) #define ARCH_ALIAS_FUNC(alias, cuda_func, hip_func) \ template \ - inline auto alias(Args &&... args) { \ + inline auto alias(Args &&...args) { \ return cuda_func(std::forward(args)...); \ } #elif defined(ARK_TARGET_ROCM_ARCH) #define ARCH_ALIAS_FUNC(alias, cuda_func, hip_func) \ template \ - inline auto alias(Args &&... 
args) { \ + inline auto alias(Args &&...args) { \ return hip_func(std::forward(args)...); \ } #endif diff --git a/ark/include/kernels/common/broadcast.h b/ark/include/kernels/common/broadcast.h index 86e84e5d0..d64a31fd5 100644 --- a/ark/include/kernels/common/broadcast.h +++ b/ark/include/kernels/common/broadcast.h @@ -41,22 +41,17 @@ struct Broadcast1Intrinsic { static constexpr int InConsecBytes = InConsecLen * sizeof(InputType); static constexpr int OutNelemPerThread = - (OutConsecBytes % 16 == 0) - ? 16 / sizeof(OutputType) - : (OutConsecBytes % 8 == 0) - ? 8 / sizeof(OutputType) - : (OutConsecBytes % 4 == 0) - ? 4 / sizeof(OutputType) - : (OutConsecBytes % 2 == 0) ? 2 / sizeof(OutputType) - : 1; + (OutConsecBytes % 16 == 0) ? 16 / sizeof(OutputType) + : (OutConsecBytes % 8 == 0) ? 8 / sizeof(OutputType) + : (OutConsecBytes % 4 == 0) ? 4 / sizeof(OutputType) + : (OutConsecBytes % 2 == 0) ? 2 / sizeof(OutputType) + : 1; static constexpr int InNelemPerThread = - (InConsecBytes % 16 == 0) - ? 16 / sizeof(InputType) - : (InConsecBytes % 8 == 0) - ? 8 / sizeof(InputType) - : (InConsecBytes % 4 == 0) - ? 4 / sizeof(InputType) - : (InConsecBytes % 2 == 0) ? 2 / sizeof(InputType) : 1; + (InConsecBytes % 16 == 0) ? 16 / sizeof(InputType) + : (InConsecBytes % 8 == 0) ? 8 / sizeof(InputType) + : (InConsecBytes % 4 == 0) ? 4 / sizeof(InputType) + : (InConsecBytes % 2 == 0) ? 2 / sizeof(InputType) + : 1; static constexpr int NelemPerThread = BroadcastInput ? OutNelemPerThread @@ -155,43 +150,35 @@ struct Broadcast2Intrinsic { static constexpr int In1ConsecBytes = In1ConsecLen * sizeof(InputType); static constexpr int OutNelemPerThread = - (OutConsecBytes % 16 == 0) - ? 16 / sizeof(OutputType) - : (OutConsecBytes % 8 == 0) - ? 8 / sizeof(OutputType) - : (OutConsecBytes % 4 == 0) - ? 4 / sizeof(OutputType) - : (OutConsecBytes % 2 == 0) ? 2 / sizeof(OutputType) - : 1; + (OutConsecBytes % 16 == 0) ? 16 / sizeof(OutputType) + : (OutConsecBytes % 8 == 0) ? 
8 / sizeof(OutputType) + : (OutConsecBytes % 4 == 0) ? 4 / sizeof(OutputType) + : (OutConsecBytes % 2 == 0) ? 2 / sizeof(OutputType) + : 1; static constexpr int In0NelemPerThread = - (In0ConsecBytes % 16 == 0) - ? 16 / sizeof(InputType) - : (In0ConsecBytes % 8 == 0) - ? 8 / sizeof(InputType) - : (In0ConsecBytes % 4 == 0) - ? 4 / sizeof(InputType) - : (In0ConsecBytes % 2 == 0) ? 2 / sizeof(InputType) : 1; + (In0ConsecBytes % 16 == 0) ? 16 / sizeof(InputType) + : (In0ConsecBytes % 8 == 0) ? 8 / sizeof(InputType) + : (In0ConsecBytes % 4 == 0) ? 4 / sizeof(InputType) + : (In0ConsecBytes % 2 == 0) ? 2 / sizeof(InputType) + : 1; static constexpr int In1NelemPerThread = - (In1ConsecBytes % 16 == 0) - ? 16 / sizeof(InputType) - : (In1ConsecBytes % 8 == 0) - ? 8 / sizeof(InputType) - : (In1ConsecBytes % 4 == 0) - ? 4 / sizeof(InputType) - : (In1ConsecBytes % 2 == 0) ? 2 / sizeof(InputType) : 1; + (In1ConsecBytes % 16 == 0) ? 16 / sizeof(InputType) + : (In1ConsecBytes % 8 == 0) ? 8 / sizeof(InputType) + : (In1ConsecBytes % 4 == 0) ? 4 / sizeof(InputType) + : (In1ConsecBytes % 2 == 0) ? 2 / sizeof(InputType) + : 1; static constexpr int NelemPerThread = - (BroadcastInput0 && BroadcastInput1) - ? OutNelemPerThread - : BroadcastInput0 - ? math::gcd::value - : BroadcastInput1 - ? math::gcd::value - : math::gcd::value>::value; + (BroadcastInput0 && BroadcastInput1) ? OutNelemPerThread + : BroadcastInput0 + ? math::gcd::value + : BroadcastInput1 + ? 
math::gcd::value + : math::gcd::value>::value; static_assert(math::is_pow2::value, "NelemPerThread must be power of 2"); diff --git a/ark/include/kernels/common/vector_type.h b/ark/include/kernels/common/vector_type.h index 1e5316e20..f247c53ee 100644 --- a/ark/include/kernels/common/vector_type.h +++ b/ark/include/kernels/common/vector_type.h @@ -71,28 +71,29 @@ struct Constant { template struct IntrinsicCompute1Exists { template - static auto test(const InputVtype &) - -> decltype(&U::compute, std::true_type{}); + static auto test(const InputVtype &) -> decltype(&U::compute, + std::true_type{}); template static auto test(...) -> std::false_type; - static constexpr bool value = decltype( - test(type::Constant::zero()))::value; + static constexpr bool value = decltype(test( + type::Constant::zero()))::value; }; template struct IntrinsicCompute2Exists { template - static auto test(const InputVtype &, const InputVtype &) - -> decltype(&U::compute, std::true_type{}); + static auto test(const InputVtype &, + const InputVtype &) -> decltype(&U::compute, + std::true_type{}); template static auto test(...) -> std::false_type; - static constexpr bool value = decltype( - test(type::Constant::zero(), - type::Constant::zero()))::value; + static constexpr bool value = decltype(test( + type::Constant::zero(), + type::Constant::zero()))::value; }; template @@ -198,11 +199,10 @@ struct DefaultNelemPerThread { : math::min::value; static const int value = - (sizeof(OutDataType) <= 2 && ConsecutiveDimLen % 8 == 0) - ? 8 - : (ConsecutiveDimLen % 4 == 0) - ? 4 - : (ConsecutiveDimLen % 2 == 0) ? 2 : 1; + (sizeof(OutDataType) <= 2 && ConsecutiveDimLen % 8 == 0) ? 8 + : (ConsecutiveDimLen % 4 == 0) ? 4 + : (ConsecutiveDimLen % 2 == 0) ? 
2 + : 1; }; } // namespace ark diff --git a/ark/include/kernels/gemm_ck.h b/ark/include/kernels/gemm_ck.h index 478419691..a15cf49e0 100644 --- a/ark/include/kernels/gemm_ck.h +++ b/ark/include/kernels/gemm_ck.h @@ -90,13 +90,15 @@ struct CkGemmConfig::value; static constexpr auto MXdlPerWave = (TileSizeM == 16) ? 1 - : (TileSizeM < TileSizeN) - ? 1 << (LogMNXdlPerWave / 2) - : 1 << (LogMNXdlPerWave - LogMNXdlPerWave / 2); + : (TileSizeM < TileSizeN) + ? 1 << (LogMNXdlPerWave / 2) + : 1 << (LogMNXdlPerWave - LogMNXdlPerWave / 2); static constexpr auto NXdlPerWave = MNXdlPerWave / MXdlPerWave; static constexpr bool Is_256x256x128 = @@ -197,13 +199,15 @@ struct CkGemmConfig, typename std::conditional, S<1, 0, 2>>::type, typename std::conditional, S<1, 0, 2>>::type, - (IsColA ? 1 : 2), (!IsColA ? 8 : Is_128x128x64 ? 4 : MXdlPerWave), 8, - true, S<4, NumThreads / 4, 1>, + (IsColA ? 1 : 2), + (!IsColA ? 8 + : Is_128x128x64 ? 4 + : MXdlPerWave), + 8, true, S<4, NumThreads / 4, 1>, typename std::conditional, S<0, 2, 1>>::type, typename std::conditional, S<0, 2, 1>>::type, (IsColB ? 2 : 1), - (IsColB ? 8 - : Is_128x32x256 - ? 8 - : (Is_128x32x128 || Is_128x64x128 || Is_128x128x128) - ? 4 - : (Is_128x32x64 || Is_64x32x32) ? 2 : NXdlPerWave), + (IsColB ? 8 + : Is_128x32x256 ? 8 + : (Is_128x32x128 || Is_128x64x128 || Is_128x128x128) ? 4 + : (Is_128x32x64 || Is_64x32x32) ? 2 + : NXdlPerWave), 8, true, 7, 1, 1, LoopSched, PipelineVer>; using ImplXdlCShuffle = @@ -234,16 +240,17 @@ struct CkGemmConfig, S<1, 0, 2>>::type, typename std::conditional, S<1, 0, 2>>::type, (IsColA ? 1 : 2), - (!IsColA ? 8 : (AK1 == 2 || Is_128x128x64) ? 4 : MXdlPerWave), AK1, - (AK1 == 8), S, + (!IsColA ? 8 + : (AK1 == 2 || Is_128x128x64) ? 4 + : MXdlPerWave), + AK1, (AK1 == 8), S, typename std::conditional, S<0, 2, 1>>::type, typename std::conditional, S<0, 2, 1>>::type, (IsColB ? 2 : 1), (IsColB ? 8 - : (BK1 == 2 || Is_256x128x256 || Is_128x128x128 || - Is_128x64x128) - ? 
4 - : NXdlPerWave), + : (BK1 == 2 || Is_256x128x256 || Is_128x128x128 || Is_128x64x128) + ? 4 + : NXdlPerWave), BK1, (BK1 == 8), 1, 1, S<1, (Is_128x128x128 || Is_128x64x128 || Is_128x32x128 || @@ -255,16 +262,17 @@ struct CkGemmConfig; #if (DEBUG_CK != 0) - PrintDeviceGemmXdlCShuffle< - NumThreads, TileSizeM, TileSizeN, 32, AK1, BK1, 32, 32, MXdlPerWave, - NXdlPerWave, - (!IsColA ? 8 : (AK1 == 2 || Is_128x128x64) ? 4 : MXdlPerWave), - (IsColB - ? 8 - : (BK1 == 2 || Is_256x128x256 || Is_128x128x128 || Is_128x64x128) - ? 4 - : NXdlPerWave), - 1, 1> + PrintDeviceGemmXdlCShuffle p; #endif // (DEBUG_CK != 0) }; @@ -286,9 +294,9 @@ struct CkGemmConfig::value; static constexpr auto MXdlPerWave = (TileSizeM == 16) ? 1 - : (TileSizeM < TileSizeN) - ? 1 << (LogMNXdlPerWave / 2) - : 1 << (LogMNXdlPerWave - LogMNXdlPerWave / 2); + : (TileSizeM < TileSizeN) + ? 1 << (LogMNXdlPerWave / 2) + : 1 << (LogMNXdlPerWave - LogMNXdlPerWave / 2); static constexpr auto NXdlPerWave = MNXdlPerWave / MXdlPerWave; static constexpr bool Is_256x256x128 = @@ -307,7 +315,8 @@ struct CkGemmConfig, S<1, 0, 2>>::type, typename std::conditional, S<1, 0, 2>>::type, (IsColA ? 1 : 2), - (!IsColA ? 8 : (AK1 == 2 || Is_128x128x64) ? 4 : MXdlPerWave), AK1, - (AK1 == 8), S, + (!IsColA ? 8 + : (AK1 == 2 || Is_128x128x64) ? 4 + : MXdlPerWave), + AK1, (AK1 == 8), S, typename std::conditional, S<0, 2, 1>>::type, typename std::conditional, S<0, 2, 1>>::type, (IsColB ? 2 : 1), (IsColB ? 8 - : (BK1 == 2 || Is_256x128x256 || Is_128x128x128 || - Is_128x64x128) - ? 4 - : NXdlPerWave), + : (BK1 == 2 || Is_256x128x256 || Is_128x128x128 || Is_128x64x128) + ? 
4 + : NXdlPerWave), BK1, (BK1 == 8), 1, 1, S<1, (Is_128x128x128 || Is_128x64x128 || Is_128x32x128 || diff --git a/ark/include/kernels/reduce.h b/ark/include/kernels/reduce.h index a25e1ccf4..587674b91 100644 --- a/ark/include/kernels/reduce.h +++ b/ark/include/kernels/reduce.h @@ -357,13 +357,11 @@ struct WwiseReduce { ReduceShapeChecker; constexpr int InConsecBytes = sizeof(DataType) * InShape::W; constexpr int NelemPerThread = - (InConsecBytes % 16 == 0) - ? 16 / sizeof(DataType) - : (InConsecBytes % 8 == 0) - ? 8 / sizeof(DataType) - : (InConsecBytes % 4 == 0) - ? 4 / sizeof(DataType) - : (InConsecBytes % 2 == 0) ? 2 / sizeof(DataType) : 1; + (InConsecBytes % 16 == 0) ? 16 / sizeof(DataType) + : (InConsecBytes % 8 == 0) ? 8 / sizeof(DataType) + : (InConsecBytes % 4 == 0) ? 4 / sizeof(DataType) + : (InConsecBytes % 2 == 0) ? 2 / sizeof(DataType) + : 1; constexpr int NonReduceDimLength = UnitOutDims::N * UnitOutDims::C * UnitOutDims::H; @@ -411,20 +409,30 @@ struct WwiseReduce { if constexpr (NelemPerThread > 8) { #pragma unroll for (int i = 8; i < NelemPerThread; i += 8) { - ReduceType::template reduce<8>(&reduced[0], &reduced[0], &reduced[i]); + ReduceType::template reduce<8>(&reduced[0], &reduced[0], + &reduced[i]); } - ReduceType::template reduce<4>(&reduced[0], &reduced[0], &reduced[4]); - ReduceType::template reduce<2>(&reduced[0], &reduced[0], &reduced[2]); - ReduceType::template reduce<1>(&reduced[0], &reduced[0], &reduced[1]); + ReduceType::template reduce<4>(&reduced[0], &reduced[0], + &reduced[4]); + ReduceType::template reduce<2>(&reduced[0], &reduced[0], + &reduced[2]); + ReduceType::template reduce<1>(&reduced[0], &reduced[0], + &reduced[1]); } else if constexpr (NelemPerThread == 8) { - ReduceType::template reduce<4>(&reduced[0], &reduced[0], &reduced[4]); - ReduceType::template reduce<2>(&reduced[0], &reduced[0], &reduced[2]); - ReduceType::template reduce<1>(&reduced[0], &reduced[0], &reduced[1]); + ReduceType::template reduce<4>(&reduced[0], 
&reduced[0], + &reduced[4]); + ReduceType::template reduce<2>(&reduced[0], &reduced[0], + &reduced[2]); + ReduceType::template reduce<1>(&reduced[0], &reduced[0], + &reduced[1]); } else if constexpr (NelemPerThread == 4) { - ReduceType::template reduce<2>(&reduced[0], &reduced[0], &reduced[2]); - ReduceType::template reduce<1>(&reduced[0], &reduced[0], &reduced[1]); + ReduceType::template reduce<2>(&reduced[0], &reduced[0], + &reduced[2]); + ReduceType::template reduce<1>(&reduced[0], &reduced[0], + &reduced[1]); } else if constexpr (NelemPerThread == 2) { - ReduceType::template reduce<1>(&reduced[0], &reduced[0], &reduced[1]); + ReduceType::template reduce<1>(&reduced[0], &reduced[0], + &reduced[1]); } if constexpr (InShape::W % ThreadsPerRow != 0) { @@ -444,8 +452,7 @@ struct WwiseReduce { warpReduce(reduced[0]); } else { // Threads for one row span multiple warps — need shared memory. - reduced[0] = warpsReduce( + reduced[0] = warpsReduce( reduced[0], tid % PhysicalThreadsPerRow, smem_per_warp); } diff --git a/ark/model/model_buffer.cpp b/ark/model/model_buffer.cpp index a54b6e81f..3778190d1 100644 --- a/ark/model/model_buffer.cpp +++ b/ark/model/model_buffer.cpp @@ -80,8 +80,7 @@ std::shared_ptr ModelBuffer::deserialize(const Json &serialized) { } else if (!serialized.contains("SendTags")) { ERR(ModelError, "ModelBuffer deserialization failed: missing SendTags"); } else if (!serialized.contains("RecvTags")) { - ERR(ModelError, - "ModelBuffer deserialization failed: missing RecvTags"); + ERR(ModelError, "ModelBuffer deserialization failed: missing RecvTags"); } else if (!serialized.contains("IsExternal")) { ERR(ModelError, "ModelBuffer deserialization failed: missing IsExternal"); diff --git a/ark/model/model_context_manager.cpp b/ark/model/model_context_manager.cpp index 799cce785..e3be664f9 100644 --- a/ark/model/model_context_manager.cpp +++ b/ark/model/model_context_manager.cpp @@ -27,8 +27,6 @@ Json ModelContextManager::get(const std::string& key) const { 
return context_stack_->get(key); } -Json ModelContextManager::dump() const { - return context_stack_->dump(); -} +Json ModelContextManager::dump() const { return context_stack_->dump(); } } // namespace ark diff --git a/ark/model/model_graph_impl.hpp b/ark/model/model_graph_impl.hpp index b9646d057..18c33f28a 100644 --- a/ark/model/model_graph_impl.hpp +++ b/ark/model/model_graph_impl.hpp @@ -54,7 +54,7 @@ class ModelGraph::Impl { Impl &operator=(const Impl &other); template - ModelOpRef create_op(const std::string &name, Args &&... args) { + ModelOpRef create_op(const std::string &name, Args &&...args) { ModelOpRef op = std::make_shared(std::forward(args)...); std::string name_copy; if (name.empty()) { diff --git a/ark/model/model_op.hpp b/ark/model/model_op.hpp index ab261eb20..6c5bbbbfd 100644 --- a/ark/model/model_op.hpp +++ b/ark/model/model_op.hpp @@ -50,8 +50,8 @@ class ModelOp { return ""; } - virtual std::vector impl_args([ - [maybe_unused]] const Json &config) const { + virtual std::vector impl_args( + [[maybe_unused]] const Json &config) const { return {}; } diff --git a/ark/model/model_tensor.cpp b/ark/model/model_tensor.cpp index 068783045..405faa4e2 100644 --- a/ark/model/model_tensor.cpp +++ b/ark/model/model_tensor.cpp @@ -92,13 +92,9 @@ size_t ModelTensor::shape_bytes() const { return shape_.nelems() * data_type_->bytes(); } -void *ModelTensor::data() const { - return buffer_->data(); -} +void *ModelTensor::data() const { return buffer_->data(); } -void *ModelTensor::data(void *data) { - return buffer_->data(data); -} +void *ModelTensor::data(void *data) { return buffer_->data(data); } bool ModelTensor::is_external() const { return buffer_->is_external(); } diff --git a/ark/ops/ops_all_reduce_test.cpp b/ark/ops/ops_all_reduce_test.cpp index 8cf68b085..e4fe4dac0 100644 --- a/ark/ops/ops_all_reduce_test.cpp +++ b/ark/ops/ops_all_reduce_test.cpp @@ -91,7 +91,8 @@ ark::Tensor all_reduce_packet(ark::Model &m, ark::Tensor input, int rank, std::vector 
outputs; size_t out_off = flag % 2 == 0 ? 0 : nbytes_per_rank * 2; ark::Dims out_shape = {nbytes_per_rank * 2}; - ark::Dims out_strides = {nbytes_per_rank * 2 * 2}; // packet + double buffer + ark::Dims out_strides = {nbytes_per_rank * 2 * + 2}; // packet + double buffer for (int i = 0; i < rank_num; i++) { if (i != rank) { outputs.push_back(m.tensor(out_shape, ark::UINT8, out_strides, @@ -121,7 +122,8 @@ void test_all_reduce_packet_internal(ark::DimType nelem) { ark::Model m(gpu_id, NumGpus); ark::Tensor ones = m.tensor({nelem}, ark::FP16); ark::Tensor data = m.mul(ones, float(gpu_id + 1)); - ark::Tensor output = all_reduce_packet(m, data, gpu_id, NumGpus, 1, data); + ark::Tensor output = + all_reduce_packet(m, data, gpu_id, NumGpus, 1, data); std::vector ones_vec(ones.shape().nelems(), ark::half_t(1.0f)); @@ -186,7 +188,6 @@ ark::Tensor all_reduce_sm(ark::Model &m, ark::Tensor input, int rank, return res; } - template void test_all_reduce_sm_internal(ark::DimType nelem) { auto config_rule = [nelem](const std::string op_str, const std::string) { @@ -244,36 +245,42 @@ void test_all_reduce_sm_internal(ark::DimType nelem) { } ark::unittest::State test_all_reduce_4gpus() { + UNITTEST_SKIP(ark::unittest::get_gpu_count() < 4); test_all_reduce_internal<4>(64); test_all_reduce_internal<4>(8192); return ark::unittest::SUCCESS; } ark::unittest::State test_all_reduce_8gpus() { + UNITTEST_SKIP(ark::unittest::get_gpu_count() < 8); test_all_reduce_internal<8>(64); test_all_reduce_internal<8>(8192); return ark::unittest::SUCCESS; } ark::unittest::State test_all_reduce_packet_4gpus() { + UNITTEST_SKIP(ark::unittest::get_gpu_count() < 4); test_all_reduce_packet_internal<4>(2048); test_all_reduce_packet_internal<4>(8192); return ark::unittest::SUCCESS; } ark::unittest::State test_all_reduce_packet_8gpus() { + UNITTEST_SKIP(ark::unittest::get_gpu_count() < 8); test_all_reduce_packet_internal<8>(2048); test_all_reduce_packet_internal<8>(8192); return ark::unittest::SUCCESS; } 
ark::unittest::State test_all_reduce_sm_4gpus() { + UNITTEST_SKIP(ark::unittest::get_gpu_count() < 4); test_all_reduce_sm_internal<4>(2048 * 1024); test_all_reduce_sm_internal<4>(8192 * 1024); return ark::unittest::SUCCESS; } ark::unittest::State test_all_reduce_sm_8gpus() { + UNITTEST_SKIP(ark::unittest::get_gpu_count() < 8); test_all_reduce_sm_internal<8>(2048 * 1024); test_all_reduce_sm_internal<8>(8192 * 1024); return ark::unittest::SUCCESS; diff --git a/ark/ops/ops_broadcast.cpp b/ark/ops/ops_broadcast.cpp index 2fd02b801..8642feefd 100644 --- a/ark/ops/ops_broadcast.cpp +++ b/ark/ops/ops_broadcast.cpp @@ -39,13 +39,13 @@ std::string ModelOpBroadcast1::impl_name(const Json &config) const { std::to_string(0)}); } -std::vector ModelOpBroadcast1::impl_args([ - [maybe_unused]] const Json &config) const { +std::vector ModelOpBroadcast1::impl_args( + [[maybe_unused]] const Json &config) const { return {result_tensors_[0], read_tensors_[0]}; } -Json ModelOpBroadcast1::default_config([ - [maybe_unused]] const ArchRef arch) const { +Json ModelOpBroadcast1::default_config( + [[maybe_unused]] const ArchRef arch) const { Json config; config["NumWarps"] = 1; config["SramBytes"] = 0; @@ -108,8 +108,8 @@ std::string ModelOpBroadcast2::impl_name(const Json &config) const { std::to_string(0)}); } -std::vector ModelOpBroadcast2::impl_args([ - [maybe_unused]] const Json &config) const { +std::vector ModelOpBroadcast2::impl_args( + [[maybe_unused]] const Json &config) const { std::vector args; args.emplace_back(result_tensors_[0]); args.emplace_back(read_tensors_[0]); @@ -117,8 +117,8 @@ std::vector ModelOpBroadcast2::impl_args([ return args; } -Json ModelOpBroadcast2::default_config([ - [maybe_unused]] const ArchRef arch) const { +Json ModelOpBroadcast2::default_config( + [[maybe_unused]] const ArchRef arch) const { Json config; config["NumWarps"] = 1; config["SramBytes"] = 0; diff --git a/ark/ops/ops_communication.cpp b/ark/ops/ops_communication.cpp index c5be1ca65..4e221e173 
100644 --- a/ark/ops/ops_communication.cpp +++ b/ark/ops/ops_communication.cpp @@ -71,8 +71,8 @@ std::string ModelOpSend::impl_name(const Json &config) const { output->data_type()->type_str()}); } -std::vector ModelOpSend::impl_args([ - [maybe_unused]] const Json &config) const { +std::vector ModelOpSend::impl_args( + [[maybe_unused]] const Json &config) const { return {ModelOffset(write_tensors_[0]), ModelOffset(read_tensors_[0])}; } @@ -107,13 +107,13 @@ std::string ModelOpSendDone::impl_name(const Json &config) const { std::to_string(remote_rank)}); } -std::vector ModelOpSendDone::impl_args([ - [maybe_unused]] const Json &config) const { +std::vector ModelOpSendDone::impl_args( + [[maybe_unused]] const Json &config) const { return {}; } -Json ModelOpSendDone::default_config([ - [maybe_unused]] const ArchRef arch) const { +Json ModelOpSendDone::default_config( + [[maybe_unused]] const ArchRef arch) const { return {{"ChannelType", "Proxy"}, {"NumTasks", 1}, {"NumWarps", 1}, @@ -138,8 +138,8 @@ ModelOpRecv::ModelOpRecv(ModelTensorRef output, int remote_rank, int tag) } std::string ModelOpRecv::impl_name(const Json &config) const { - check_fields_config(config, - {"ChannelType", "NumTasks", "NumWarps", "SramBytes", "Wait"}); + check_fields_config( + config, {"ChannelType", "NumTasks", "NumWarps", "SramBytes", "Wait"}); std::string channel_type = config["ChannelType"]; bool wait = config["Wait"]; if (channel_type != "Proxy" && channel_type != "SecondaryProxy" && @@ -155,8 +155,8 @@ std::string ModelOpRecv::impl_name(const Json &config) const { std::to_string(max_spin_cnt), std::to_string(wait)}); } -std::vector ModelOpRecv::impl_args([ - [maybe_unused]] const Json &config) const { +std::vector ModelOpRecv::impl_args( + [[maybe_unused]] const Json &config) const { return {}; } @@ -231,13 +231,13 @@ std::string ModelOpSendPacket::impl_name(const Json &config) const { packet_type, std::to_string(flag)}); } -std::vector ModelOpSendPacket::impl_args([ - [maybe_unused]] 
const Json &config) const { +std::vector ModelOpSendPacket::impl_args( + [[maybe_unused]] const Json &config) const { return {ModelOffset(write_tensors_[0]), ModelOffset(read_tensors_[0])}; } -Json ModelOpSendPacket::default_config([ - [maybe_unused]] const ArchRef arch) const { +Json ModelOpSendPacket::default_config( + [[maybe_unused]] const ArchRef arch) const { Json config; if (arch->belongs_to(ARCH_ROCM)) { config["PacketType"] = "mscclpp::LL8Packet"; @@ -324,13 +324,13 @@ std::string ModelOpRecvPacket::impl_name(const Json &config) const { packet_type, std::to_string(flag)}); } -std::vector ModelOpRecvPacket::impl_args([ - [maybe_unused]] const Json &config) const { +std::vector ModelOpRecvPacket::impl_args( + [[maybe_unused]] const Json &config) const { return {ModelOffset(write_tensors_[0]), ModelOffset(read_tensors_[1])}; } -Json ModelOpRecvPacket::default_config([ - [maybe_unused]] const ArchRef arch) const { +Json ModelOpRecvPacket::default_config( + [[maybe_unused]] const ArchRef arch) const { Json config; if (arch->belongs_to(ARCH_ROCM)) { config["PacketType"] = "mscclpp::LL8Packet"; @@ -418,8 +418,8 @@ std::string ModelOpRecvReduceSendPacket::impl_name(const Json &config) const { input->data_type()->type_str(), std::to_string(flag)}); } -std::vector ModelOpRecvReduceSendPacket::impl_args([ - [maybe_unused]] const Json &config) const { +std::vector ModelOpRecvReduceSendPacket::impl_args( + [[maybe_unused]] const Json &config) const { std::vector args = {write_tensors_[0], read_tensors_[0], read_tensors_[1]}; for (size_t i = 1; i < write_tensors_.size(); ++i) { @@ -431,8 +431,8 @@ std::vector ModelOpRecvReduceSendPacket::impl_args([ return args; } -Json ModelOpRecvReduceSendPacket::default_config([ - [maybe_unused]] const ArchRef arch) const { +Json ModelOpRecvReduceSendPacket::default_config( + [[maybe_unused]] const ArchRef arch) const { Json config; if (arch->belongs_to(ARCH_ROCM)) { config["PacketType"] = "mscclpp::LL8Packet"; @@ -452,12 +452,10 @@ 
Json ModelOpRecvReduceSendPacket::default_config([ return config; } -ModelOpRecvReduceSend::ModelOpRecvReduceSend(ModelTensorRef input, - ModelTensorRef output, int rank, - const std::vector &remote_ranks, - int recv_tag, int output_tag, - std::vector &peer_output_refs, - ModelTensorRef scratch) +ModelOpRecvReduceSend::ModelOpRecvReduceSend( + ModelTensorRef input, ModelTensorRef output, int rank, + const std::vector &remote_ranks, int recv_tag, int output_tag, + std::vector &peer_output_refs, ModelTensorRef scratch) : ModelOp("RecvReduceSend") { check_null(input); uint32_t n_remote_ranks = remote_ranks.size(); @@ -519,8 +517,8 @@ std::string ModelOpRecvReduceSend::impl_name(const Json &config) const { input->data_type()->type_str(), input->data_type()->type_str()}); } -std::vector ModelOpRecvReduceSend::impl_args([ - [maybe_unused]] const Json &config) const { +std::vector ModelOpRecvReduceSend::impl_args( + [[maybe_unused]] const Json &config) const { std::vector args = {write_tensors_[0], read_tensors_[0], read_tensors_[1]}; for (size_t i = 1; i < write_tensors_.size(); ++i) { @@ -532,8 +530,8 @@ std::vector ModelOpRecvReduceSend::impl_args([ return args; } -Json ModelOpRecvReduceSend::default_config([ - [maybe_unused]] const ArchRef arch) const { +Json ModelOpRecvReduceSend::default_config( + [[maybe_unused]] const ArchRef arch) const { Json config; config["NumWarps"] = 1; config["SramBytes"] = 0; @@ -576,12 +574,13 @@ std::string ModelOpDeviceSync::impl_name(const Json &config) const { std::to_string(peer_num), std::to_string(rank)}); } -std::vector ModelOpDeviceSync::impl_args([ - [maybe_unused]] const Json &config) const { +std::vector ModelOpDeviceSync::impl_args( + [[maybe_unused]] const Json &config) const { return {}; } -Json ModelOpDeviceSync::default_config([[maybe_unused]] const ArchRef arch) const { +Json ModelOpDeviceSync::default_config( + [[maybe_unused]] const ArchRef arch) const { return {{"ChannelType", "Proxy"}, {"NumTasks", 1}, {"NumWarps", 
1}, diff --git a/ark/ops/ops_communication.hpp b/ark/ops/ops_communication.hpp index 23f3b84af..f0c0134f2 100644 --- a/ark/ops/ops_communication.hpp +++ b/ark/ops/ops_communication.hpp @@ -103,7 +103,6 @@ class ModelOpRecvReduceSend : public ModelOp { Json default_config(const ArchRef arch = ARCH_ANY) const override; }; - class ModelOpDeviceSync : public ModelOp { public: ModelOpDeviceSync() = default; diff --git a/ark/ops/ops_communication_test.cpp b/ark/ops/ops_communication_test.cpp index de7c42833..e5ffc8804 100644 --- a/ark/ops/ops_communication_test.cpp +++ b/ark/ops/ops_communication_test.cpp @@ -346,7 +346,8 @@ ark::unittest::State test_communication_send_recv_reduce_packet() { ark::unittest::spawn_process([gpu_id]() { ark::Model model(gpu_id, 2); ark::Tensor tns_data = model.tensor({1024}, ark::FP16); - std::vector shard_tensors = model.sharding(tns_data, 0, 512); + std::vector shard_tensors = + model.sharding(tns_data, 0, 512); int peer_gpu_id = (gpu_id + 1) % 2; model.send_packet(shard_tensors[peer_gpu_id], peer_gpu_id, 0, 1); @@ -389,8 +390,7 @@ ark::unittest::State test_communication_send_recv_reduce() { config["NumTasks"] = 4; config["NumWarps"] = 4; config["SramBytes"] = 0; - } - else if (op.at("Type") == "DeviceSync") { + } else if (op.at("Type") == "DeviceSync") { config["ChannelType"] = "Sm"; config["NumTasks"] = 1; config["NumWarps"] = 1; diff --git a/ark/ops/ops_embedding.cpp b/ark/ops/ops_embedding.cpp index 2d6b63720..8f29aba9a 100644 --- a/ark/ops/ops_embedding.cpp +++ b/ark/ops/ops_embedding.cpp @@ -54,13 +54,13 @@ std::string ModelOpEmbedding::impl_name(const Json &config) const { }); } -std::vector ModelOpEmbedding::impl_args([ - [maybe_unused]] const Json &config) const { +std::vector ModelOpEmbedding::impl_args( + [[maybe_unused]] const Json &config) const { return {result_tensors_[0], read_tensors_[0], read_tensors_[1]}; } -Json ModelOpEmbedding::default_config([ - [maybe_unused]] const ArchRef arch) const { +Json 
ModelOpEmbedding::default_config( + [[maybe_unused]] const ArchRef arch) const { Json config; config["NumWarps"] = 1; config["SramBytes"] = 0; diff --git a/ark/ops/ops_noop.cpp b/ark/ops/ops_noop.cpp index 894ab29be..50d1c2640 100644 --- a/ark/ops/ops_noop.cpp +++ b/ark/ops/ops_noop.cpp @@ -16,8 +16,8 @@ std::string ModelOpNoop::impl_name([[maybe_unused]] const Json &config) const { return function_name_string("noop"); } -std::vector ModelOpNoop::impl_args([ - [maybe_unused]] const Json &config) const { +std::vector ModelOpNoop::impl_args( + [[maybe_unused]] const Json &config) const { return {}; } diff --git a/ark/ops/ops_reduce.cpp b/ark/ops/ops_reduce.cpp index 55c87aa1e..02d8b5c96 100644 --- a/ark/ops/ops_reduce.cpp +++ b/ark/ops/ops_reduce.cpp @@ -92,12 +92,10 @@ std::string ModelOpReduce::impl_name(const Json &config) const { output_shape.insert(axis, 1); } - Dims unit_out_dims( - config.at("Tile").get>()); + Dims unit_out_dims(config.at("Tile").get>()); auto udims4 = unit_out_dims.dims4(); if (udims4[axis] != 1) { - ERR(PlanError, - "Tile dimension along reduce axis (", axis, + ERR(PlanError, "Tile dimension along reduce axis (", axis, ") must be 1, got ", udims4[axis]); } @@ -115,8 +113,8 @@ std::string ModelOpReduce::impl_name(const Json &config) const { }); } -std::vector ModelOpReduce::impl_args([ - [maybe_unused]] const Json &config) const { +std::vector ModelOpReduce::impl_args( + [[maybe_unused]] const Json &config) const { return {result_tensors_[0], read_tensors_[0]}; } diff --git a/ark/ops/ops_scalar.cpp b/ark/ops/ops_scalar.cpp index 944a7247c..c65bc93de 100644 --- a/ark/ops/ops_scalar.cpp +++ b/ark/ops/ops_scalar.cpp @@ -39,14 +39,14 @@ std::string ModelOpScalarAssign::impl_name(const Json &config) const { std::to_string(num_warps), std::to_string(0)}); } -std::vector ModelOpScalarAssign::impl_args([ - [maybe_unused]] const Json &config) const { +std::vector ModelOpScalarAssign::impl_args( + [[maybe_unused]] const Json &config) const { float val 
= args_.at("Value").value(); return {result_tensors_[0], val}; } -Json ModelOpScalarAssign::default_config([ - [maybe_unused]] const ArchRef arch) const { +Json ModelOpScalarAssign::default_config( + [[maybe_unused]] const ArchRef arch) const { Json config; config["NumWarps"] = 1; config["SramBytes"] = 0; @@ -84,8 +84,8 @@ ModelOpScalarAdd::ModelOpScalarAdd(ModelTensorRef input, float factor, verify(); } -std::vector ModelOpScalarAdd::impl_args([ - [maybe_unused]] const Json &config) const { +std::vector ModelOpScalarAdd::impl_args( + [[maybe_unused]] const Json &config) const { float factor = args_.at("Factor").value(); return {result_tensors_[0], read_tensors_[0], factor}; } @@ -106,8 +106,8 @@ ModelOpScalarMul::ModelOpScalarMul(ModelTensorRef input, float factor, verify(); } -std::vector ModelOpScalarMul::impl_args([ - [maybe_unused]] const Json &config) const { +std::vector ModelOpScalarMul::impl_args( + [[maybe_unused]] const Json &config) const { float factor = args_.at("Factor").value(); return {result_tensors_[0], read_tensors_[0], factor}; } diff --git a/ark/ops/ops_test_common.cpp b/ark/ops/ops_test_common.cpp index bfbe79a70..f902e626d 100644 --- a/ark/ops/ops_test_common.cpp +++ b/ark/ops/ops_test_common.cpp @@ -32,12 +32,13 @@ std::ostream &operator<<(std::ostream &os, const OpsTestResult &result) { return os; } -OpsTestResult op_test( - const std::string &test_name_prefix, const Model &model, - const std::vector &inputs, const std::vector &outputs, - OpsTestBaseline baseline, const std::vector &inputs_data, - const std::vector &config_rules, - bool print_on_error) { +OpsTestResult op_test(const std::string &test_name_prefix, const Model &model, + const std::vector &inputs, + const std::vector &outputs, + OpsTestBaseline baseline, + const std::vector &inputs_data, + const std::vector &config_rules, + bool print_on_error) { DefaultExecutor exe(model, -1, nullptr, config_rules); std::vector>> inputs_data_storages; diff --git a/ark/ops/ops_test_common.hpp 
b/ark/ops/ops_test_common.hpp index 12fb88a7b..cd3f0b7f6 100644 --- a/ark/ops/ops_test_common.hpp +++ b/ark/ops/ops_test_common.hpp @@ -167,12 +167,13 @@ using OpsTestBaseline = std::function &inputs, const std::vector &outputs, - OpsTestBaseline baseline, const std::vector &inputs_data = {}, - const std::vector &config_rules = {}, - bool print_on_error = false); +OpsTestResult op_test(const std::string &test_name_prefix, const Model &model, + const std::vector &inputs, + const std::vector &outputs, + OpsTestBaseline baseline, + const std::vector &inputs_data = {}, + const std::vector &config_rules = {}, + bool print_on_error = false); OpsTestGpuMem to_gpu(void *host_ptr, size_t size); diff --git a/ark/ops/ops_transpose.cpp b/ark/ops/ops_transpose.cpp index b7a67c8c0..f1b079c2d 100644 --- a/ark/ops/ops_transpose.cpp +++ b/ark/ops/ops_transpose.cpp @@ -112,13 +112,13 @@ std::string ModelOpTranspose::impl_name(const Json &config) const { }); } -std::vector ModelOpTranspose::impl_args([ - [maybe_unused]] const Json &config) const { +std::vector ModelOpTranspose::impl_args( + [[maybe_unused]] const Json &config) const { return {result_tensors_[0], read_tensors_[0]}; } -Json ModelOpTranspose::default_config([ - [maybe_unused]] const ArchRef arch) const { +Json ModelOpTranspose::default_config( + [[maybe_unused]] const ArchRef arch) const { Json config; config["NumWarps"] = 1; config["SramBytes"] = 0; diff --git a/ark/unittest/unittest_utils.cpp b/ark/unittest/unittest_utils.cpp index 4b74f9513..1b2aa029b 100644 --- a/ark/unittest/unittest_utils.cpp +++ b/ark/unittest/unittest_utils.cpp @@ -11,6 +11,7 @@ #include #include "file_io.h" +#include "gpu/gpu.hpp" #include "logging.hpp" // Grep SIGALRM and exit. @@ -96,6 +97,13 @@ void wait_all_processes() { // Run the given test function. State test(std::function test_func) { return test_func(); } +// Get the number of available GPUs. 
+int get_gpu_count() { + int count = 0; + if (gpuGetDeviceCount(&count) != gpuSuccess) return 0; + return count; +} + // std::string get_kernel_code(const std::string &name) { return ark::read_file(ark::get_dir(std::string{__FILE__}) + diff --git a/ark/unittest/unittest_utils.h b/ark/unittest/unittest_utils.h index 383f49b6d..e994bf80c 100644 --- a/ark/unittest/unittest_utils.h +++ b/ark/unittest/unittest_utils.h @@ -42,6 +42,8 @@ void wait_all_processes(); State test(std::function test_func); // +int get_gpu_count(); +// std::string get_kernel_code(const std::string &name); } // namespace unittest @@ -86,6 +88,15 @@ std::string get_kernel_code(const std::string &name); #define UNITTEST_UNEXPECTED(...) \ UNITTEST_EXIT(ark::unittest::UNEXPECTED, __VA_ARGS__) +// Skip the test if the condition is true. +#define UNITTEST_SKIP(cond) \ + do { \ + if (cond) { \ + UNITTEST_LOG("Skip: " #cond); \ + return ark::unittest::SUCCESS; \ + } \ + } while (0) + // Success. #define UNITTEST_SUCCESS() UNITTEST_EXIT(ark::unittest::SUCCESS, "") diff --git a/python/tensor_py.cpp b/python/tensor_py.cpp index c6fde978e..d8d687546 100644 --- a/python/tensor_py.cpp +++ b/python/tensor_py.cpp @@ -9,7 +9,7 @@ namespace py = pybind11; -void register_tensor(py::module &m) { +void register_tensor(py::module& m) { py::class_(m, "CoreTensor") .def("id", &ark::Tensor::id) .def("shape", &ark::Tensor::shape) diff --git a/tools/lint.sh b/tools/lint.sh new file mode 100755 index 000000000..5c97626b0 --- /dev/null +++ b/tools/lint.sh @@ -0,0 +1,74 @@ +#!/usr/bin/env bash + +PROJECT_ROOT=$(dirname "$(realpath "$0")")/.. 
+LINT_CPP=false +LINT_PYTHON=false +DRY_RUN=false +EXIT_CODE=0 + +usage() { + echo "Usage: $0 [cpp] [py] [dry]" + echo " cpp Lint C++ code" + echo " py Lint Python code" + echo " dry Dry run mode (no changes made)" +} + +# Parse arguments +for arg in "$@"; do + case "$arg" in + cpp) + LINT_CPP=true + ;; + py) + LINT_PYTHON=true + ;; + dry) + DRY_RUN=true + ;; + *) + echo "Error: Unknown argument '$arg'" + usage + exit 1 + ;; + esac +done + +# If no cpp or py specified, default to both +if [ "$LINT_CPP" = false ] && [ "$LINT_PYTHON" = false ]; then + LINT_CPP=true + LINT_PYTHON=true +fi + +if $LINT_CPP; then + echo "Linting C++ code..." + # Find all git-tracked files with .c/.h/.cpp/.hpp/.cc/.cu/.cuh extensions + files=$(git -C "$PROJECT_ROOT" ls-files --cached | grep -E '\.(c|h|cpp|hpp|cc|cu|cuh)$' | sed "s|^|$PROJECT_ROOT/|") + if [ -n "$files" ]; then + if $DRY_RUN; then + clang-format -style=file --dry-run --Werror $files + else + clang-format -style=file -i $files + fi + if [ $? -ne 0 ]; then + EXIT_CODE=1 + fi + fi +fi + +if $LINT_PYTHON; then + echo "Linting Python code..." + # Find all git-tracked files with .py extension + files=$(git -C "$PROJECT_ROOT" ls-files --cached | grep -E '\.py$' | sed "s|^|$PROJECT_ROOT/|") + if [ -n "$files" ]; then + if $DRY_RUN; then + python3 -m black --check --diff $files + else + python3 -m black $files + fi + if [ $? 
-ne 0 ]; then + EXIT_CODE=1 + fi + fi +fi + +exit $EXIT_CODE From 293c96ba93597dce04da516b9ff3fa786f2b00fb Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Sat, 4 Apr 2026 08:39:03 +0000 Subject: [PATCH 59/61] update workflows --- .github/workflows/codeql.yml | 104 ---------------------- .github/workflows/ut-rocm.yml | 64 ------------- .github/workflows/{ut-cuda.yml => ut.yml} | 37 ++++++-- 3 files changed, 31 insertions(+), 174 deletions(-) delete mode 100644 .github/workflows/codeql.yml delete mode 100644 .github/workflows/ut-rocm.yml rename .github/workflows/{ut-cuda.yml => ut.yml} (66%) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml deleted file mode 100644 index 7ac2f1649..000000000 --- a/.github/workflows/codeql.yml +++ /dev/null @@ -1,104 +0,0 @@ -name: CodeQL - -on: - push: - branches: - - main - pull_request: - branches: - - main - schedule: - - cron: '42 20 * * 4' - -jobs: - analyze-cuda: - name: Analyze (CUDA) - strategy: - fail-fast: false - matrix: - language: [ 'cpp' ] - concurrency: - group: ${{ github.workflow }}-cuda-${{ github.ref }} - cancel-in-progress: true - runs-on: ubuntu-latest - container: - image: ghcr.io/microsoft/ark/ark:base-dev-cuda12.2 - permissions: - actions: read - contents: read - security-events: write - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Check disk space - run: | - df -h - - # Initializes the CodeQL tools for scanning. - - name: Initialize CodeQL - uses: github/codeql-action/init@v3 - with: - languages: ${{ matrix.language }} - - - name: Dubious ownership exception - run: | - git config --global --add safe.directory /__w/ark/ark - - - name: Build - run: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Debug -DARK_BYPASS_GPU_CHECK=ON -DARK_USE_CUDA=ON -DARK_BUILD_TESTS=OFF .. 
- make build ark_py - - - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v3 - with: - category: "/language:${{matrix.language}}" - - analyze-rocm: - name: Analyze (ROCM) - strategy: - fail-fast: false - matrix: - language: [ 'cpp' ] - concurrency: - group: ${{ github.workflow }}-rocm-${{ github.ref }} - cancel-in-progress: true - runs-on: ubuntu-latest - container: - image: ghcr.io/microsoft/ark/ark:build-rocm6.1 - permissions: - actions: read - contents: read - security-events: write - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Check disk space - run: | - df -h - - # Initializes the CodeQL tools for scanning. - - name: Initialize CodeQL - uses: github/codeql-action/init@v3 - with: - languages: ${{ matrix.language }} - - - name: Dubious ownership exception - run: | - git config --global --add safe.directory /__w/ark/ark - - - name: Build - run: | - mkdir build && cd build - CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Debug -DARK_BYPASS_GPU_CHECK=ON -DARK_USE_ROCM=ON -DARK_BUILD_TESTS=OFF .. 
- make -j build ark_py - - - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v3 - with: - category: "/language:${{matrix.language}}" diff --git a/.github/workflows/ut-rocm.yml b/.github/workflows/ut-rocm.yml deleted file mode 100644 index ac8ed0e90..000000000 --- a/.github/workflows/ut-rocm.yml +++ /dev/null @@ -1,64 +0,0 @@ -name: "Unit Tests (ROCm)" - -on: - push: - branches: - - main - pull_request: - branches: - - main - -jobs: - UnitTest: - runs-on: [ self-hosted, AMD ] - defaults: - run: - shell: bash - strategy: - matrix: - rocm: [ rocm6.0 ] - concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.rocm }} - cancel-in-progress: true - # container: - # image: "ghcr.io/microsoft/ark/ark:base-dev-${{ matrix.rocm }}" - # options: --privileged --ipc=host --security-opt seccomp=unconfined --group-add video --ulimit memlock=-1:-1 - - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Dubious ownership exception - run: | - git config --global --add safe.directory /__w/ark/ark - - - name: Build - run: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Debug .. - make -j ut - - - name: RunUT - run: | - cd build && ARK_ROOT=$PWD ARK_IGNORE_BINARY_CACHE=1 ctest --stop-on-failure --verbose --schedule-random - - - name: ReportCoverage - run: | - cd build - lcov --capture --directory . --output-file coverage.info - lcov --remove coverage.info \ - '/usr/*' \ - '/tmp/*' \ - '*/third_party/*' \ - '*/ark/*_test.*' \ - '*/examples/*' \ - '*/python/*' \ - '*/ark/unittest/unittest_utils.cc' \ - --output-file coverage.info - lcov --list coverage.info - bash <(curl -s https://codecov.io/bash) -f coverage.info || echo "Codecov did not collect coverage reports" - - - name: BuildPython - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install . 
diff --git a/.github/workflows/ut-cuda.yml b/.github/workflows/ut.yml similarity index 66% rename from .github/workflows/ut-cuda.yml rename to .github/workflows/ut.yml index 0b6759de0..e7d295be9 100644 --- a/.github/workflows/ut-cuda.yml +++ b/.github/workflows/ut.yml @@ -1,4 +1,4 @@ -name: "Unit Tests (CUDA)" +name: "Unit Tests" on: push: @@ -11,22 +11,33 @@ on: jobs: UnitTest: - runs-on: [ self-hosted ] defaults: run: shell: bash timeout-minutes: 60 + permissions: + actions: read + contents: read + security-events: write strategy: + fail-fast: false matrix: include: - - cuda: cuda12.8 - container: nvcr.io/nvidia/pytorch:25.01-py3 + - platform: cuda + runner: [self-hosted, CUDA] + container: nvcr.io/nvidia/pytorch:26.03-py3 + container_options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1 + - platform: rocm + runner: [self-hosted, ROCM] + container: rocm/pytorch:rocm7.2.1_ubuntu24.04_py3.12_pytorch_release_2.9.1 + container_options: --privileged --ipc=host --security-opt seccomp=unconfined --group-add video --ulimit memlock=-1:-1 + runs-on: ${{ matrix.runner }} concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.cuda }} + group: ${{ github.workflow }}-${{ matrix.platform }}-${{ github.ref }} cancel-in-progress: true container: image: ${{ matrix.container }} - options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1 + options: ${{ matrix.container_options }} steps: - name: Checkout @@ -36,6 +47,11 @@ jobs: run: | git config --global --add safe.directory /__w/ark/ark + - name: Initialize CodeQL + uses: github/codeql-action/init@v3 + with: + languages: cpp + - name: Build run: | apt-get update && apt-get install -y lcov @@ -47,6 +63,10 @@ jobs: run: | cd build ARK_ROOT=$PWD ctest --stop-on-failure --verbose --schedule-random + + - name: C++ Coverage + run: | + cd build lcov --capture --directory . 
--output-file cpp_coverage.info lcov --remove cpp_coverage.info \ '/usr/*' \ @@ -88,3 +108,8 @@ jobs: - name: Run Tutorials run: | python3 ./examples/tutorial/quickstart_tutorial.py + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v3 + with: + category: "/language:cpp-${{ matrix.platform }}" From d0973eb19c1c747758e2f592dc746d4b4c028bd6 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Sat, 4 Apr 2026 08:55:09 +0000 Subject: [PATCH 60/61] lint --- examples/llama/model.py | 2 +- examples/multi_head_attention/mha.py | 24 ++++++---- examples/multi_head_attention/test_mha.py | 54 +++++++++++++++++----- examples/tutorial/module_tutorial.py | 33 +++++++++---- examples/tutorial/planner_tutorial.py | 8 +++- examples/tutorial/quickstart_tutorial.py | 4 +- python/ark/ops.py | 10 +++- python/tensor_py.cpp | 2 +- python/unittest/ops/test_arithmetic.py | 8 +++- python/unittest/ops/test_cast.py | 4 +- python/unittest/ops/test_composite.py | 12 ++--- python/unittest/ops/test_embedding_rope.py | 35 ++++++++------ python/unittest/ops/test_math.py | 20 ++++++-- python/unittest/ops/test_matmul.py | 34 +++++++------- python/unittest/ops/test_reduce.py | 12 ++--- python/unittest/ops/test_transpose.py | 10 ++-- python/unittest/test_eval.py | 13 +++--- python/unittest/test_placeholder.py | 50 ++++++++++---------- 18 files changed, 214 insertions(+), 121 deletions(-) diff --git a/examples/llama/model.py b/examples/llama/model.py index ebd424612..ad3c2f0b9 100644 --- a/examples/llama/model.py +++ b/examples/llama/model.py @@ -2,7 +2,7 @@ # Licensed under the MIT license. """LLaMA 2 Transformer model. 
- Correspond to https://github.com/facebookresearch/llama/blob/main/llama/model.py +Correspond to https://github.com/facebookresearch/llama/blob/main/llama/model.py """ import ark diff --git a/examples/multi_head_attention/mha.py b/examples/multi_head_attention/mha.py index 26a45855a..1cc3711b0 100644 --- a/examples/multi_head_attention/mha.py +++ b/examples/multi_head_attention/mha.py @@ -118,8 +118,10 @@ def forward(self, q, r_k, v): # scale — element-wise, tile matches matmul with ark.PlannerContext( config={ - "NumWarps": 8, "SramBytes": 0, - "Tile": [TM, S], "NumTasks": num_tasks, + "NumWarps": 8, + "SramBytes": 0, + "Tile": [TM, S], + "NumTasks": num_tasks, }, ): s = ark.mul(s, self.scale) @@ -127,7 +129,8 @@ def forward(self, q, r_k, v): # reduce_max — NOW with Tile=[TM,1] to match task count with ark.PlannerContext( config={ - "NumWarps": 8, "SramBytes": 256, + "NumWarps": 8, + "SramBytes": 256, "ImplType": "WarpWise", "Tile": [TM, 1], }, @@ -137,8 +140,10 @@ def forward(self, q, r_k, v): # sub + exp with ark.PlannerContext( config={ - "NumWarps": 8, "SramBytes": 0, - "Tile": [TM, S], "NumTasks": num_tasks, + "NumWarps": 8, + "SramBytes": 0, + "Tile": [TM, S], + "NumTasks": num_tasks, }, ): s = ark.sub(s, m) @@ -147,7 +152,8 @@ def forward(self, q, r_k, v): # reduce_sum — Tile=[TM,1] with ark.PlannerContext( config={ - "NumWarps": 8, "SramBytes": 256, + "NumWarps": 8, + "SramBytes": 256, "ImplType": "WarpWise", "Tile": [TM, 1], }, @@ -157,8 +163,10 @@ def forward(self, q, r_k, v): # div with ark.PlannerContext( config={ - "NumWarps": 8, "SramBytes": 0, - "Tile": [TM, S], "NumTasks": num_tasks, + "NumWarps": 8, + "SramBytes": 0, + "Tile": [TM, S], + "NumTasks": num_tasks, }, ): p = ark.div(s, l) diff --git a/examples/multi_head_attention/test_mha.py b/examples/multi_head_attention/test_mha.py index 98ed0aeab..6654ec3ef 100644 --- a/examples/multi_head_attention/test_mha.py +++ b/examples/multi_head_attention/test_mha.py @@ -49,7 +49,9 @@ def 
test_correctness(B, H, N, D, dtype=torch.float16): # ARK vanilla — uses eval() result = MultiHeadAttention(D)( - ark.Tensor.from_torch(q), ark.Tensor.from_torch(k_t), ark.Tensor.from_torch(v) + ark.Tensor.from_torch(q), + ark.Tensor.from_torch(k_t), + ark.Tensor.from_torch(v), ).eval() # Reference @@ -67,7 +69,9 @@ def test_correctness(B, H, N, D, dtype=torch.float16): diff = (result - ref).abs().max().item() atol = 5e-2 if dtype == torch.float16 else 1e-1 ok = diff < atol - print(f" B={B} H={H} N={N:4d} D={D} diff={diff:.4f} vs {label} {'PASS' if ok else 'FAIL'}") + print( + f" B={B} H={H} N={N:4d} D={D} diff={diff:.4f} vs {label} {'PASS' if ok else 'FAIL'}" + ) return ok @@ -143,7 +147,9 @@ def bench_ark(B, H, N, D, mha_cls, mha_args, dtype=torch.float16): ark.init() mha = mha_cls(*mha_args) out = mha( - ark.Tensor.from_torch(q), ark.Tensor.from_torch(k_t), ark.Tensor.from_torch(v) + ark.Tensor.from_torch(q), + ark.Tensor.from_torch(k_t), + ark.Tensor.from_torch(v), ) with ark.Runtime() as rt: @@ -252,9 +258,15 @@ def test_correctness(batch, heads, seq_len, head_dim, dtype=torch.float16): print(f" B={batch}, H={heads}, N={seq_len}, D={head_dim}", end="") scale = 1.0 / math.sqrt(head_dim) - q = torch.randn(batch, heads, seq_len, head_dim, dtype=dtype, device="cuda:0") - k = torch.randn(batch, heads, seq_len, head_dim, dtype=dtype, device="cuda:0") - v = torch.randn(batch, heads, seq_len, head_dim, dtype=dtype, device="cuda:0") + q = torch.randn( + batch, heads, seq_len, head_dim, dtype=dtype, device="cuda:0" + ) + k = torch.randn( + batch, heads, seq_len, head_dim, dtype=dtype, device="cuda:0" + ) + v = torch.randn( + batch, heads, seq_len, head_dim, dtype=dtype, device="cuda:0" + ) # Reference: FlashAttention-2 ref = flash_attn_reference(q, k, v, scale) @@ -263,7 +275,11 @@ def test_correctness(batch, heads, seq_len, head_dim, dtype=torch.float16): ark.init() k_t = k.transpose(-2, -1).contiguous() mha = MultiHeadAttention(head_dim) - ark_out = 
mha(ark.Tensor.from_torch(q), ark.Tensor.from_torch(k_t), ark.Tensor.from_torch(v)) + ark_out = mha( + ark.Tensor.from_torch(q), + ark.Tensor.from_torch(k_t), + ark.Tensor.from_torch(v), + ) with ark.Runtime() as rt: rt.launch() rt.run() @@ -292,9 +308,15 @@ def bench_one(label, run_fn, num_warmup=10, num_iter=50): def run_benchmark(batch, heads, seq_len, head_dim, dtype=torch.float16): scale = 1.0 / math.sqrt(head_dim) - q = torch.randn(batch, heads, seq_len, head_dim, dtype=dtype, device="cuda:0") - k = torch.randn(batch, heads, seq_len, head_dim, dtype=dtype, device="cuda:0") - v = torch.randn(batch, heads, seq_len, head_dim, dtype=dtype, device="cuda:0") + q = torch.randn( + batch, heads, seq_len, head_dim, dtype=dtype, device="cuda:0" + ) + k = torch.randn( + batch, heads, seq_len, head_dim, dtype=dtype, device="cuda:0" + ) + v = torch.randn( + batch, heads, seq_len, head_dim, dtype=dtype, device="cuda:0" + ) k_t = k.transpose(-2, -1).contiguous() # --- FlashAttention-2 (Tri Dao) --- @@ -316,7 +338,11 @@ def run_benchmark(batch, heads, seq_len, head_dim, dtype=torch.float16): # --- ARK Vanilla --- ark.init() mha = MultiHeadAttention(head_dim) - ark_out = mha(ark.Tensor.from_torch(q), ark.Tensor.from_torch(k_t), ark.Tensor.from_torch(v)) + ark_out = mha( + ark.Tensor.from_torch(q), + ark.Tensor.from_torch(k_t), + ark.Tensor.from_torch(v), + ) with ark.Runtime() as rt: rt.launch() vanilla_ms = bench_one("ARK", lambda: rt.run(iter=1), num_warmup=5) @@ -324,7 +350,11 @@ def run_benchmark(batch, heads, seq_len, head_dim, dtype=torch.float16): # --- ARK Optimized (fused softmax) --- ark.init() mha_opt = MultiHeadAttentionOptimized(head_dim, seq_len) - ark_out2 = mha_opt(ark.Tensor.from_torch(q), ark.Tensor.from_torch(k_t), ark.Tensor.from_torch(v)) + ark_out2 = mha_opt( + ark.Tensor.from_torch(q), + ark.Tensor.from_torch(k_t), + ark.Tensor.from_torch(v), + ) with ark.Runtime() as rt: rt.launch() opt_ms = bench_one("ARK-Opt", lambda: rt.run(iter=1), num_warmup=5) 
diff --git a/examples/tutorial/module_tutorial.py b/examples/tutorial/module_tutorial.py index b18804063..af395869e 100644 --- a/examples/tutorial/module_tutorial.py +++ b/examples/tutorial/module_tutorial.py @@ -40,7 +40,9 @@ class TestModelPytorch(nn.Module): def __init__(self): super(TestModelPytorch, self).__init__() self.weight_1 = nn.Parameter(torch.ones(d_model, d_ff, device="cuda:0")) - self.submodule_weight_2 = nn.Parameter(torch.ones(d_ff, d_model, device="cuda:0")) + self.submodule_weight_2 = nn.Parameter( + torch.ones(d_ff, d_model, device="cuda:0") + ) self.layernorm = nn.LayerNorm(d_model, device="cuda:0") def forward(self, inputs): @@ -53,11 +55,18 @@ def forward(self, inputs): def module_test(): # Create torch tensors for input and weights - input_tensor = torch.randn( - batch_size, seq_len, d_model, dtype=torch.float32, device="cuda:0" - ) * 0.1 - weight_1 = torch.randn(d_model, d_ff, dtype=torch.float32, device="cuda:0") * 0.1 - weight_2 = torch.randn(d_ff, d_model, dtype=torch.float32, device="cuda:0") * 0.1 + input_tensor = ( + torch.randn( + batch_size, seq_len, d_model, dtype=torch.float32, device="cuda:0" + ) + * 0.1 + ) + weight_1 = ( + torch.randn(d_model, d_ff, dtype=torch.float32, device="cuda:0") * 0.1 + ) + weight_2 = ( + torch.randn(d_ff, d_model, dtype=torch.float32, device="cuda:0") * 0.1 + ) # Build and evaluate the ARK model ark_model = TestModelARK(weight_1, weight_2) @@ -77,10 +86,14 @@ def module_test(): print("ARK module test") print( - "batch_size:", batch_size, - "seq_len:", seq_len, - "d_model:", d_model, - "d_ff:", d_ff, + "batch_size:", + batch_size, + "seq_len:", + seq_len, + "d_model:", + d_model, + "d_ff:", + d_ff, ) print("max error:", max_error, "avg error:", avg_error) diff --git a/examples/tutorial/planner_tutorial.py b/examples/tutorial/planner_tutorial.py index 0701cf775..a0a88462a 100644 --- a/examples/tutorial/planner_tutorial.py +++ b/examples/tutorial/planner_tutorial.py @@ -35,12 +35,16 @@ def forward(self, 
input): "NumTasks": 65536, }, ): - with ark.PlannerContext(config={"ImplType": "WarpWise", "Tile": [1, 1]}): + with ark.PlannerContext( + config={"ImplType": "WarpWise", "Tile": [1, 1]} + ): max = ark.reduce_max(input, axis=-1) with ark.PlannerContext(config={"Tile": [1, 2048]}): output = ark.sub(input, max) output = ark.exp(output) - with ark.PlannerContext(config={"ImplType": "WarpWise", "Tile": [1, 1]}): + with ark.PlannerContext( + config={"ImplType": "WarpWise", "Tile": [1, 1]} + ): sum = ark.reduce_sum(output, axis=-1) with ark.PlannerContext(config={"Tile": [1, 2048]}): output = ark.div(output, sum) diff --git a/examples/tutorial/quickstart_tutorial.py b/examples/tutorial/quickstart_tutorial.py index d0803f917..f36d31498 100644 --- a/examples/tutorial/quickstart_tutorial.py +++ b/examples/tutorial/quickstart_tutorial.py @@ -18,7 +18,9 @@ def quickstart_tutorial(): output = ark.add(input_tensor, other_tensor).eval() # Check if the output tensor is equal to the sum of the input and other tensor - torch.testing.assert_close(output, input_tensor + other_tensor, atol=0, rtol=0) + torch.testing.assert_close( + output, input_tensor + other_tensor, atol=0, rtol=0 + ) print("Quickstart tutorial is successful!") diff --git a/python/ark/ops.py b/python/ark/ops.py index 3dbba5115..68c3846c1 100644 --- a/python/ark/ops.py +++ b/python/ark/ops.py @@ -187,7 +187,9 @@ def gelu( def identity( - input: Union[Tensor, "torch.Tensor"], deps: List[Tensor] = [], name: str = "identity" + input: Union[Tensor, "torch.Tensor"], + deps: List[Tensor] = [], + name: str = "identity", ) -> Tensor: """ """ input = _ensure_ark(input) @@ -404,7 +406,10 @@ def rsqrt( def sharding( - input: Union[Tensor, "torch.Tensor"], axis: int, dim_per_shard: int, name: str = "sharding" + input: Union[Tensor, "torch.Tensor"], + axis: int, + dim_per_shard: int, + name: str = "sharding", ) -> List[Tensor]: """ """ input = _ensure_ark(input) @@ -530,6 +535,7 @@ def recv( Model.get_model().recv(output._tensor, 
remote_rank, tag, name) ) + ################################################################################ diff --git a/python/tensor_py.cpp b/python/tensor_py.cpp index d8d687546..c6fde978e 100644 --- a/python/tensor_py.cpp +++ b/python/tensor_py.cpp @@ -9,7 +9,7 @@ namespace py = pybind11; -void register_tensor(py::module& m) { +void register_tensor(py::module &m) { py::class_(m, "CoreTensor") .def("id", &ark::Tensor::id) .def("shape", &ark::Tensor::shape) diff --git a/python/unittest/ops/test_arithmetic.py b/python/unittest/ops/test_arithmetic.py index fcdc4c37a..ac917747f 100644 --- a/python/unittest/ops/test_arithmetic.py +++ b/python/unittest/ops/test_arithmetic.py @@ -8,7 +8,9 @@ from conftest import ark, DEVICE -@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16]) +@pytest.mark.parametrize( + "dtype", [torch.float32, torch.float16, torch.bfloat16] +) def test_add(dtype): a = torch.randn(8192, dtype=dtype, device=DEVICE) b = torch.randn(8192, dtype=dtype, device=DEVICE) @@ -64,7 +66,9 @@ def test_div_fp32(): FACTOR = 0.75 -@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16]) +@pytest.mark.parametrize( + "dtype", [torch.float32, torch.float16, torch.bfloat16] +) @pytest.mark.parametrize("shape", [(4, 2, 1), (4, 2, 1024)]) def test_scalar_mul(dtype, shape): a = torch.randn(shape, dtype=dtype, device=DEVICE) diff --git a/python/unittest/ops/test_cast.py b/python/unittest/ops/test_cast.py index fe84ff755..8587cdd2f 100644 --- a/python/unittest/ops/test_cast.py +++ b/python/unittest/ops/test_cast.py @@ -20,7 +20,9 @@ ], ) def test_cast(src_dtype, dst_dtype, ark_dst): - a = torch.randn(4, 2, 1024, dtype=torch.float32, device=DEVICE).to(src_dtype) + a = torch.randn(4, 2, 1024, dtype=torch.float32, device=DEVICE).to( + src_dtype + ) result = ark.cast(a, ark_dst).eval() expected = a.to(dst_dtype) assert result.dtype == dst_dtype diff --git a/python/unittest/ops/test_composite.py 
b/python/unittest/ops/test_composite.py index a40ba913f..f12194a56 100644 --- a/python/unittest/ops/test_composite.py +++ b/python/unittest/ops/test_composite.py @@ -16,9 +16,9 @@ def test_softmax(dtype): result = ark.softmax(a).eval() expected = F.softmax(a, dim=-1) atol = 1e-5 if dtype == torch.float32 else 1e-3 - assert torch.allclose(result, expected, atol=atol, rtol=1e-3), ( - f"max_diff={(result - expected).abs().max()}" - ) + assert torch.allclose( + result, expected, atol=atol, rtol=1e-3 + ), f"max_diff={(result - expected).abs().max()}" def test_layernorm(): @@ -28,6 +28,6 @@ def test_layernorm(): mean = a.mean(dim=-1, keepdim=True) var = ((a - mean) ** 2).mean(dim=-1, keepdim=True) expected = (a - mean) / torch.sqrt(var + 1e-6) - assert torch.allclose(result, expected, atol=1e-4, rtol=1e-4), ( - f"max_diff={(result - expected).abs().max()}" - ) + assert torch.allclose( + result, expected, atol=1e-4, rtol=1e-4 + ), f"max_diff={(result - expected).abs().max()}" diff --git a/python/unittest/ops/test_embedding_rope.py b/python/unittest/ops/test_embedding_rope.py index f8c9ea701..6c2ede3e4 100644 --- a/python/unittest/ops/test_embedding_rope.py +++ b/python/unittest/ops/test_embedding_rope.py @@ -9,19 +9,25 @@ from conftest import ark, DEVICE -@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16]) +@pytest.mark.parametrize( + "dtype", [torch.float32, torch.float16, torch.bfloat16] +) def test_embedding(dtype): vocab_size, embed_dim = 100, 64 - indices = torch.randint(0, vocab_size, (4, 8), device=DEVICE).to(torch.int32) + indices = torch.randint(0, vocab_size, (4, 8), device=DEVICE).to( + torch.int32 + ) weight = torch.randn(vocab_size, embed_dim, dtype=dtype, device=DEVICE) result = ark.embedding(indices, weight).eval() expected = F.embedding(indices, weight) - assert torch.allclose(result, expected, atol=0, rtol=0), ( - f"max_diff={(result - expected).abs().max()}" - ) + assert torch.allclose( + result, expected, atol=0, rtol=0 + 
), f"max_diff={(result - expected).abs().max()}" -@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16]) +@pytest.mark.parametrize( + "dtype", [torch.float32, torch.float16, torch.bfloat16] +) def test_rope(dtype): """Test rotary positional embedding against PyTorch complex-multiply reference. ARK's rope computes element-wise complex multiplication on consecutive pairs: @@ -35,11 +41,14 @@ def test_rope(dtype): # PyTorch reference: complex multiply on paired elements a = x.reshape(*shape[:-1], -1, 2) b = other.reshape(*shape[:-1], -1, 2) - expected = torch.stack([ - a[..., 0] * b[..., 0] - a[..., 1] * b[..., 1], - a[..., 0] * b[..., 1] + a[..., 1] * b[..., 0], - ], dim=-1).reshape(shape) + expected = torch.stack( + [ + a[..., 0] * b[..., 0] - a[..., 1] * b[..., 1], + a[..., 0] * b[..., 1] + a[..., 1] * b[..., 0], + ], + dim=-1, + ).reshape(shape) atol = 1e-5 if dtype == torch.float32 else 5e-2 - assert torch.allclose(result, expected, atol=atol, rtol=1e-3), ( - f"max_diff={(result - expected).abs().max()}" - ) + assert torch.allclose( + result, expected, atol=atol, rtol=1e-3 + ), f"max_diff={(result - expected).abs().max()}" diff --git a/python/unittest/ops/test_math.py b/python/unittest/ops/test_math.py index 000845858..835a1b15c 100644 --- a/python/unittest/ops/test_math.py +++ b/python/unittest/ops/test_math.py @@ -9,7 +9,9 @@ from conftest import ark, DEVICE -@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16]) +@pytest.mark.parametrize( + "dtype", [torch.float32, torch.float16, torch.bfloat16] +) def test_exp(dtype): a = torch.randn(4, 2, 1024, dtype=dtype, device=DEVICE) atol = 1e-5 if dtype == torch.float32 else 1e-2 @@ -20,10 +22,14 @@ def test_exp(dtype): def test_gelu(dtype): a = torch.randn(4, 2, 1024, dtype=dtype, device=DEVICE) atol = 1e-5 if dtype == torch.float32 else 1e-2 - assert torch.allclose(ark.gelu(a).eval(), F.gelu(a, approximate="tanh"), atol=atol, rtol=0) + assert torch.allclose( + 
ark.gelu(a).eval(), F.gelu(a, approximate="tanh"), atol=atol, rtol=0 + ) -@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16]) +@pytest.mark.parametrize( + "dtype", [torch.float32, torch.float16, torch.bfloat16] +) def test_relu(dtype): a = torch.randn(4, 2, 1024, dtype=dtype, device=DEVICE) assert torch.allclose(ark.relu(a).eval(), F.relu(a), atol=0, rtol=0) @@ -33,7 +39,9 @@ def test_relu(dtype): def test_sigmoid(dtype): a = torch.randn(4, 2, 1024, dtype=dtype, device=DEVICE) atol = 1e-5 if dtype == torch.float32 else 1e-2 - assert torch.allclose(ark.sigmoid(a).eval(), torch.sigmoid(a), atol=atol, rtol=0) + assert torch.allclose( + ark.sigmoid(a).eval(), torch.sigmoid(a), atol=atol, rtol=0 + ) def test_sqrt_fp32(): @@ -43,4 +51,6 @@ def test_sqrt_fp32(): def test_rsqrt_fp32(): a = torch.rand(4, 2, 1024, dtype=torch.float32, device=DEVICE) + 0.01 - assert torch.allclose(ark.rsqrt(a).eval(), torch.rsqrt(a), atol=1e-4, rtol=0) + assert torch.allclose( + ark.rsqrt(a).eval(), torch.rsqrt(a), atol=1e-4, rtol=0 + ) diff --git a/python/unittest/ops/test_matmul.py b/python/unittest/ops/test_matmul.py index 1855f7636..dfa26e988 100644 --- a/python/unittest/ops/test_matmul.py +++ b/python/unittest/ops/test_matmul.py @@ -8,7 +8,9 @@ from conftest import ark, DEVICE -@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16]) +@pytest.mark.parametrize( + "dtype", [torch.float32, torch.float16, torch.bfloat16] +) def test_matmul_nn(dtype): M, N, K = 256, 256, 512 a = torch.randn(M, K, dtype=dtype, device=DEVICE) @@ -16,9 +18,9 @@ def test_matmul_nn(dtype): result = ark.matmul(a, b).eval() expected = a @ b atol = 1e-3 if dtype == torch.float32 else 1e-1 - assert torch.allclose(result, expected, atol=atol, rtol=1e-2), ( - f"max_diff={(result - expected).abs().max()}" - ) + assert torch.allclose( + result, expected, atol=atol, rtol=1e-2 + ), f"max_diff={(result - expected).abs().max()}" def test_matmul_nt(): @@ -27,9 +29,9 @@ def 
test_matmul_nt(): b = torch.randn(N, K, dtype=torch.float16, device=DEVICE) result = ark.matmul(a, b, transpose_other=True).eval() expected = a @ b.t() - assert torch.allclose(result, expected, atol=1e-1, rtol=1e-2), ( - f"max_diff={(result - expected).abs().max()}" - ) + assert torch.allclose( + result, expected, atol=1e-1, rtol=1e-2 + ), f"max_diff={(result - expected).abs().max()}" def test_matmul_tn(): @@ -38,9 +40,9 @@ def test_matmul_tn(): b = torch.randn(K, N, dtype=torch.float16, device=DEVICE) result = ark.matmul(a, b, transpose_input=True).eval() expected = a.t() @ b - assert torch.allclose(result, expected, atol=1e-1, rtol=1e-2), ( - f"max_diff={(result - expected).abs().max()}" - ) + assert torch.allclose( + result, expected, atol=1e-1, rtol=1e-2 + ), f"max_diff={(result - expected).abs().max()}" def test_matmul_tt(): @@ -49,9 +51,9 @@ def test_matmul_tt(): b = torch.randn(N, K, dtype=torch.float16, device=DEVICE) result = ark.matmul(a, b, transpose_input=True, transpose_other=True).eval() expected = a.t() @ b.t() - assert torch.allclose(result, expected, atol=1e-1, rtol=1e-2), ( - f"max_diff={(result - expected).abs().max()}" - ) + assert torch.allclose( + result, expected, atol=1e-1, rtol=1e-2 + ), f"max_diff={(result - expected).abs().max()}" def test_matmul_batched(): @@ -60,6 +62,6 @@ def test_matmul_batched(): b = torch.randn(B, K, N, dtype=torch.float16, device=DEVICE) result = ark.matmul(a, b).eval() expected = a @ b - assert torch.allclose(result, expected, atol=3e-1, rtol=1e-2), ( - f"max_diff={(result - expected).abs().max()}" - ) + assert torch.allclose( + result, expected, atol=3e-1, rtol=1e-2 + ), f"max_diff={(result - expected).abs().max()}" diff --git a/python/unittest/ops/test_reduce.py b/python/unittest/ops/test_reduce.py index 5ab8efae8..e1b4f9ee6 100644 --- a/python/unittest/ops/test_reduce.py +++ b/python/unittest/ops/test_reduce.py @@ -15,9 +15,9 @@ def test_reduce_sum_fp32(axis): result = ark.reduce_sum(a, axis=axis).eval() 
expected = torch.sum(a, dim=axis, keepdim=True) atol = shape[axis] * 1e-5 - assert torch.allclose(result, expected, atol=atol, rtol=1e-4), ( - f"axis={axis}, max_diff={(result - expected).abs().max()}" - ) + assert torch.allclose( + result, expected, atol=atol, rtol=1e-4 + ), f"axis={axis}, max_diff={(result - expected).abs().max()}" @pytest.mark.parametrize("axis", [0, 3]) @@ -27,9 +27,9 @@ def test_reduce_sum_fp16(axis): result = ark.reduce_sum(a, axis=axis).eval() expected = torch.sum(a, dim=axis, keepdim=True) atol = shape[axis] * 2e-2 - assert torch.allclose(result, expected, atol=atol, rtol=1e-2), ( - f"axis={axis}, max_diff={(result - expected).abs().max()}" - ) + assert torch.allclose( + result, expected, atol=atol, rtol=1e-2 + ), f"axis={axis}, max_diff={(result - expected).abs().max()}" def test_reduce_max_fp32(): diff --git a/python/unittest/ops/test_transpose.py b/python/unittest/ops/test_transpose.py index 156c4ee9b..d042b67de 100644 --- a/python/unittest/ops/test_transpose.py +++ b/python/unittest/ops/test_transpose.py @@ -16,11 +16,13 @@ ([0, 2, 1, 3], [2, 3, 64, 128]), ], ) -@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16]) +@pytest.mark.parametrize( + "dtype", [torch.float32, torch.float16, torch.bfloat16] +) def test_transpose(perm, shape, dtype): a = torch.randn(shape, dtype=dtype, device=DEVICE) result = ark.transpose(a, perm).eval() expected = a.permute(perm).contiguous() - assert torch.allclose(result, expected, atol=0, rtol=0), ( - f"max_diff={(result - expected).abs().max()}" - ) + assert torch.allclose( + result, expected, atol=0, rtol=0 + ), f"max_diff={(result - expected).abs().max()}" diff --git a/python/unittest/test_eval.py b/python/unittest/test_eval.py index 7adf5126e..ed19d8b2e 100644 --- a/python/unittest/test_eval.py +++ b/python/unittest/test_eval.py @@ -20,6 +20,7 @@ def _ark_init(): def _get_compiled_plan(): """Return the plan string currently compiled in the executor.""" from ark.executor import 
Executor + return Executor.get().plan() @@ -52,9 +53,9 @@ def test_eval_recompile_on_different_graph(): plan2 = _get_compiled_plan() assert torch.allclose(r2, a * b) - assert plan1 != plan2, ( - "Different graph structure should produce a different plan" - ) + assert ( + plan1 != plan2 + ), "Different graph structure should produce a different plan" def test_eval_recompile_on_graph_update(): @@ -78,9 +79,9 @@ def test_eval_recompile_on_graph_update(): assert torch.allclose(r2, (a + b) + a) # The plan must have changed (graph grew from 1 op to 2 ops) - assert plan1 != plan2, ( - "Extending the graph should produce a different plan and recompile" - ) + assert ( + plan1 != plan2 + ), "Extending the graph should produce a different plan and recompile" def test_eval_with_torch_stream(): diff --git a/python/unittest/test_placeholder.py b/python/unittest/test_placeholder.py index 640cc0e3c..74744853e 100644 --- a/python/unittest/test_placeholder.py +++ b/python/unittest/test_placeholder.py @@ -13,9 +13,7 @@ def test_placeholder_is_external(): assert t_placeholder.is_external(), "Placeholder tensor should be external" t_regular = ark.tensor([64], ark.fp32) - assert not t_regular.is_external(), ( - "Regular tensor should not be external" - ) + assert not t_regular.is_external(), "Regular tensor should not be external" @pytest_ark(need_torch=True) @@ -34,9 +32,9 @@ def test_placeholder_immediate_binding(): result = out.to_numpy() expected = torch_data.cpu().numpy() + 1.0 - assert np.allclose(result, expected), ( - f"max diff: {np.max(np.abs(result - expected))}" - ) + assert np.allclose( + result, expected + ), f"max diff: {np.max(np.abs(result - expected))}" @pytest_ark(need_torch=True) @@ -44,7 +42,9 @@ def test_placeholder_scalar_add(): """Test placeholder with scalar addition on non-aligned shape.""" import torch - torch_data = torch.arange(10, dtype=torch.float32, device="cuda:0").reshape(10, 1) + torch_data = torch.arange(10, dtype=torch.float32, 
device="cuda:0").reshape( + 10, 1 + ) t = ark.placeholder([10, 1], ark.fp32, data=torch_data) out = ark.add(t, 5.0) @@ -55,9 +55,9 @@ def test_placeholder_scalar_add(): result = out.to_numpy() expected = torch_data.cpu().numpy() + 5.0 - assert np.allclose(result, expected), ( - f"max diff: {np.max(np.abs(result - expected))}" - ) + assert np.allclose( + result, expected + ), f"max diff: {np.max(np.abs(result - expected))}" @pytest_ark(need_torch=True) @@ -79,9 +79,9 @@ def test_placeholder_multiple(): result = out.to_numpy() expected = torch_a.cpu().numpy() + torch_b.cpu().numpy() - assert np.allclose(result, expected), ( - f"max diff: {np.max(np.abs(result - expected))}" - ) + assert np.allclose( + result, expected + ), f"max diff: {np.max(np.abs(result - expected))}" @pytest_ark(need_torch=True) @@ -100,9 +100,9 @@ def test_placeholder_fp16(): result = out.to_numpy() expected = torch_data.cpu().numpy() * 0.5 - assert np.allclose(result, expected, atol=1e-2), ( - f"max diff: {np.max(np.abs(result - expected))}" - ) + assert np.allclose( + result, expected, atol=1e-2 + ), f"max diff: {np.max(np.abs(result - expected))}" @pytest_ark(need_torch=True) @@ -122,9 +122,9 @@ def test_placeholder_from_torch(): result = out.to_numpy() expected = torch_tensor.cpu().numpy() + 10.0 - assert np.allclose(result, expected), ( - f"max diff: {np.max(np.abs(result - expected))}" - ) + assert np.allclose( + result, expected + ), f"max diff: {np.max(np.abs(result - expected))}" @pytest_ark(need_torch=True) @@ -144,9 +144,9 @@ def test_placeholder_tensor_mappings_launch(): result = out.to_numpy() expected = torch_input.cpu().numpy() * 3.0 - assert np.allclose(result, expected), ( - f"max diff: {np.max(np.abs(result - expected))}" - ) + assert np.allclose( + result, expected + ), f"max diff: {np.max(np.abs(result - expected))}" @pytest_ark(need_torch=True) @@ -169,9 +169,9 @@ def test_placeholder_runtime_rebinding(): result2 = out.to_numpy() assert np.allclose(result1, 6.0), f"Run 1: 
expected 6.0, got {result1[:5]}" - assert np.allclose(result2, 11.0), ( - f"Run 2: expected 11.0, got {result2[:5]}" - ) + assert np.allclose( + result2, 11.0 + ), f"Run 2: expected 11.0, got {result2[:5]}" @pytest_ark(need_torch=True) From 78721b1d36ba81b355506e4147435b4f63af3e47 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 7 Apr 2026 08:08:08 +0000 Subject: [PATCH 61/61] Fixes --- .github/workflows/ut.yml | 8 ++++++-- CMakeLists.txt | 19 +++++++++++++++---- cmake/CheckNvidiaGpu.cmake | 3 ++- python/unittest/common.py | 2 ++ python/unittest/test_eval.py | 2 ++ 5 files changed, 27 insertions(+), 7 deletions(-) diff --git a/.github/workflows/ut.yml b/.github/workflows/ut.yml index e7d295be9..0929c75e2 100644 --- a/.github/workflows/ut.yml +++ b/.github/workflows/ut.yml @@ -29,7 +29,7 @@ jobs: container_options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1 - platform: rocm runner: [self-hosted, ROCM] - container: rocm/pytorch:rocm7.2.1_ubuntu24.04_py3.12_pytorch_release_2.9.1 + container: rocm/pytorch:rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0 container_options: --privileged --ipc=host --security-opt seccomp=unconfined --group-add video --ulimit memlock=-1:-1 runs-on: ${{ matrix.runner }} concurrency: @@ -56,7 +56,11 @@ jobs: run: | apt-get update && apt-get install -y lcov mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Debug .. + CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Debug" + if [ "${{ matrix.platform }}" = "rocm" ]; then + CMAKE_ARGS="$CMAKE_ARGS -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc" + fi + cmake $CMAKE_ARGS .. 
make -j ut ark_py - name: Run C++ UT diff --git a/CMakeLists.txt b/CMakeLists.txt index 8d5de19d1..437746888 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -65,13 +65,24 @@ if(ARK_USE_CUDA) endif() # Set CUDA architectures - if(CUDAToolkit_VERSION_MAJOR GREATER_EQUAL 11) + if(CUDAToolkit_VERSION_MAJOR GREATER_EQUAL 13) + # CUDA 13+ dropped sm_60 and sm_70 + set(CMAKE_CUDA_ARCHITECTURES 80 90) + elseif(CUDAToolkit_VERSION_MAJOR GREATER_EQUAL 12) + set(CMAKE_CUDA_ARCHITECTURES 60 70 80 90) + elseif(CUDAToolkit_VERSION_MAJOR GREATER_EQUAL 11) set(CMAKE_CUDA_ARCHITECTURES 60 70 80) endif() - # Hopper architecture - if(CUDAToolkit_VERSION_MAJOR GREATER_EQUAL 12) - set(CMAKE_CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES} 90) + # CUDA 13+ moved CCCL headers into a cccl/ subdirectory. + # Add it to the include path so third-party code (e.g. MSCCL++) + # that includes can still find the headers. + if(CUDAToolkit_VERSION_MAJOR GREATER_EQUAL 13) + set(CCCL_INCLUDE_DIR "${CUDAToolkit_INCLUDE_DIRS}/cccl") + if(EXISTS "${CCCL_INCLUDE_DIR}") + include_directories(SYSTEM "${CCCL_INCLUDE_DIR}") + message(STATUS "CUDA 13+: added CCCL include dir ${CCCL_INCLUDE_DIR}") + endif() endif() else() # ARK_USE_ROCM set(CMAKE_HIP_STANDARD 17) diff --git a/cmake/CheckNvidiaGpu.cmake b/cmake/CheckNvidiaGpu.cmake index 79f8589c4..ed445e5db 100644 --- a/cmake/CheckNvidiaGpu.cmake +++ b/cmake/CheckNvidiaGpu.cmake @@ -9,7 +9,8 @@ if(NOT CUDAToolkit_FOUND) return() endif() -set(CMAKE_CUDA_ARCHITECTURES "60") +# Use sm_80 as minimum for the detection check. +set(CMAKE_CUDA_ARCHITECTURES "80") if(NOT CMAKE_CUDA_COMPILER) # In case the CUDA Toolkit directory is not in the PATH find_program(CUDA_COMPILER diff --git a/python/unittest/common.py b/python/unittest/common.py index 0c385e89a..0bb866d3b 100644 --- a/python/unittest/common.py +++ b/python/unittest/common.py @@ -2,6 +2,7 @@ # Licensed under the MIT license. 
import pytest +import functools import ark @@ -19,6 +20,7 @@ def decorator(test_func): test_func ) + @functools.wraps(test_func) def wrapper(*args, **kwargs): ark.init() test_func(*args, **kwargs) diff --git a/python/unittest/test_eval.py b/python/unittest/test_eval.py index ed19d8b2e..6986e03a0 100644 --- a/python/unittest/test_eval.py +++ b/python/unittest/test_eval.py @@ -91,6 +91,8 @@ def test_eval_with_torch_stream(): x = torch.ones(64, dtype=torch.float32, device=DEVICE) for i in range(5): + # Reset ARK model each iteration so eval() only runs the single add op + ark.init() # torch op on the stream: x = x * 2 with torch.cuda.stream(s): x = x * 2