diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..43e9b37 --- /dev/null +++ b/.clang-format @@ -0,0 +1,141 @@ +--- +Language: Cpp +Standard: c++20 + +# Indentation +TabWidth: 4 +IndentWidth: 4 +UseTab: Never +IndentPPDirectives: None +IndentWrappedFunctionNames: false +NamespaceIndentation: None + +# Empty lines +KeepEmptyLinesAtTheStartOfBlocks: false +MaxEmptyLinesToKeep: 1 + +# Line length +ColumnLimit: 100 + +# Line endings +DeriveLineEnding: false +LineEnding: LF + +# Breaking and Penalties +BreakBeforeBinaryOperators: None +BreakBeforeBraces: Attach +BreakBeforeTernaryOperators: true +BreakConstructorInitializersBeforeComma: false +BreakInheritanceList: BeforeColon +BreakStringLiterals: false + +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 19 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 60 + +# Spacing and padding +SpaceAfterCStyleCast: false +SpaceAfterTemplateKeyword: true +SpaceAfterLogicalNot: false +SpaceAfterControlStatementKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeParens: ControlStatements +SpaceBeforeRangeBasedForLoopColon: false +SpaceInEmptyParentheses: false +SpacesInAngles: false +SpacesInContainerLiterals: false +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesBeforeTrailingComments: 1 +SpaceBeforeCpp11BracedList: false +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeSquareBrackets: false +SpacesInSquareBrackets: false +SpaceBeforeCaseColon: false + +# Brace placement +BraceWrapping: + AfterClass: false + AfterControlStatement: false + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterStruct: false + AfterUnion: false + BeforeCatch: false + BeforeElse: false + IndentBraces: false + SplitEmptyFunction: false + SplitEmptyRecord: false + SplitEmptyNamespace: false +Cpp11BracedListStyle: true + +# 
Function definitions +# BreakAfterReturnType: AllDefinitions +AlwaysBreakAfterDefinitionReturnType: All + +AttributeMacros: + - __host__ + - __device__ + - __hostdev__ + - __global__ + - __forceinline__ + - __shared__ + - __launch_bounds__ + +# Alignment +AlignConsecutiveAssignments: true +AlignConsecutiveBitFields: true +AlignConsecutiveDeclarations: false +AlignConsecutiveMacros: true +AlignEscapedNewlines: Left +AlignOperands: true +AlignTrailingComments: + Kind: Always + OverEmptyLines: 2 + +# Single line allowances +BinPackParameters: false +BinPackArguments: false +AllowAllArgumentsOnNextLine: true +AllowAllConstructorInitializersOnNextLine: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortBlocksOnASingleLine: true +AllowShortCaseLabelsOnASingleLine: true +AllowShortEnumsOnASingleLine: true +AllowShortFunctionsOnASingleLine: Inline +AllowShortIfStatementsOnASingleLine: false +AllowShortLambdasOnASingleLine: true +AllowShortLoopsOnASingleLine: false + +# Sorting +IncludeBlocks: Regroup +IncludeIsMainRegex: "$" +IncludeCategories: + - Regex: '^ int: + """Simple Python hash - fastest but less robust""" + return hash(tuple(tensor.detach().cpu().flatten().tolist())) + + +def create_grid_from_points( + grid_coord: torch.Tensor, + feat: torch.Tensor, + offset: torch.Tensor, + voxel_size: float, + device: str = "cuda", +) -> tuple[fvdb.GridBatch, fvdb.JaggedTensor, fvdb.JaggedTensor]: + """Create FVDB tensor from ScanNet-like point data with proper batching. 
+ + Args: + grid_coord: Batched grid coordinates [N, 3] + feat: Batched features [N, C] + offset: Tensor indicating batch boundaries [B] + voxel_size: Voxel size for grid creation + device: Device for tensor operations + + Returns: + grid: fvdb.GridBatch + jfeats: fvdb.JaggedTensor with features + original_coord_to_voxel_idx: Mapping from original coords to voxel indices + """ + + offset_list = list(offset.cpu().numpy()) + # Convert offset to individual sample boundaries + if len(offset_list) == 1: + # Single sample case + coords_list = [grid_coord.to(device=device, dtype=torch.int32)] + feats_list = [feat.to(device=device, dtype=torch.float32)] + else: + # Multiple samples case - split using offset + coords_list = [] + feats_list = [] + prev_offset = 0 + for curr_offset in offset_list: + coords_list.append(grid_coord[prev_offset:curr_offset].to(device=device, dtype=torch.int32)) + feats_list.append(feat[prev_offset:curr_offset].to(device=device, dtype=torch.float32)) + prev_offset = curr_offset + + coords_jagged = fvdb.JaggedTensor(coords_list) + + grid = fvdb.GridBatch.from_ijk( + coords_jagged, + voxel_sizes=[[voxel_size, voxel_size, voxel_size]] * len(coords_list), + origins=[0.0] * 3, + ) + + feats_jagged = fvdb.JaggedTensor(feats_list) + feats_vdb_order = grid.inject_from_ijk(coords_jagged, feats_jagged) # + original_coord_to_voxel_idx = grid.ijk_to_index(coords_jagged, cumulative=True) + + return grid, feats_vdb_order, original_coord_to_voxel_idx + + +@MODELS.register_module("PT-v3fvdb") +class PointTransformerV3(PointModule): + def __init__( + self, + in_channels: int = 6, + enc_depths: tuple[int, ...] = (2, 2, 2, 2), + enc_channels: tuple[int, ...] = (32, 64, 128, 256), + enc_num_heads: tuple[int, ...] = (1, 1, 1, 1), + dec_depths: tuple[int, ...] = (2, 2, 2), + dec_channels: tuple[int, ...] = (128, 64, 32), + dec_num_heads: tuple[int, ...] 
= (1, 1, 1), + patch_size: int = 1024, + drop_path: float = 0.3, + proj_drop: float = 0.0, + qk_scale: float = 1.0, + enable_batch_norm: bool = False, + embedding_mode: str = "linear", + no_conv_in_cpe: bool = False, + cross_patch_attention: bool = False, + cross_patch_pooling: str = "mean", + sliding_window_attention: bool = False, + pipelined_batch: bool = False, + order_type: str | tuple[str, ...] = ("z", "z-trans"), + shuffle_orders: bool = True, + ): + super().__init__() + + self.pipelined_batch = pipelined_batch + self.order_type = order_type + + self.fvdb_ptv3_model = PTV3( + num_classes=-1, + input_dim=in_channels, + enc_depths=enc_depths, + enc_channels=enc_channels, + enc_num_heads=enc_num_heads, + dec_depths=dec_depths, + dec_channels=dec_channels, + dec_num_heads=dec_num_heads, + patch_size=patch_size, + drop_path=drop_path, + proj_drop=proj_drop, + qk_scale=qk_scale, + enable_batch_norm=enable_batch_norm, + embedding_mode=embedding_mode, + no_conv_in_cpe=no_conv_in_cpe, + # cross_patch_attention=cross_patch_attention, + # cross_patch_pooling=cross_patch_pooling, + sliding_window_attention=sliding_window_attention, + order_type=order_type, + shuffle_orders=shuffle_orders, + ) + + def forward(self, data_dict): + + grid_coord = data_dict["grid_coord"] + feat = data_dict["feat"] + offset = data_dict["offset"] + # import pdb; pdb.set_trace() + # print(f"grid_coord.shape: {grid_coord.shape}, feat.shape: {feat.shape}, offset.shape: {offset.shape}") + # exit() + + if self.pipelined_batch and len(offset) > 1: + # Pipelined batch mode: process each point cloud individually + # This mode splits the batch into individual point clouds, processes each + # one separately through the FVDB model, and concatenates the results. + # This can be useful for: + # 1. Memory efficiency when individual processing uses less memory + # 2. Debugging to isolate issues with specific point clouds + # 3. 
Different processing requirements per sample + outputs = [] + prev_offset = 0 + # catted_input_grid_ijk = [] + # catted_input_feat = [] + # catted_original_coord_to_voxel_idx = [] + for curr_offset in offset: + # Extract data for current point cloud + curr_grid_coord = grid_coord[prev_offset:curr_offset] + curr_feat = feat[prev_offset:curr_offset] + curr_num_points = curr_offset - prev_offset + curr_offset_tensor = torch.tensor([curr_num_points], dtype=offset.dtype, device=offset.device) + + # Process single point cloud + grid, jfeats, original_coord_to_voxel_idx = create_grid_from_points( + curr_grid_coord, curr_feat, curr_offset_tensor, voxel_size=0.02 + ) + assert ( + grid.ijk.jdata.shape == curr_grid_coord.shape + ), f"curr_grid_coord.shape: {curr_grid_coord.shape}, grid.ijk.jdata.shape: {grid.ijk.jdata.shape}" # + + # catted_input_grid_ijk.append(grid.ijk.jdata) + # catted_input_feat.append(jfeats.jdata) + # catted_original_coord_to_voxel_idx.append(original_coord_to_voxel_idx.jdata) + # grid shape and feats values match here. + grid, jfeats = self.fvdb_ptv3_model(grid, jfeats) + # feats values does not match here. + + # Get output for this point cloud. + curr_output = jfeats.jdata[original_coord_to_voxel_idx.jdata] + outputs.append(curr_output) + + prev_offset = curr_offset + + # Concatenate all outputs + output = torch.cat(outputs, dim=0) + # import pdb; pdb.set_trace() + + # catted_input_grid_ijk = torch.cat(catted_input_grid_ijk, dim=0) + # catted_input_feat = torch.cat(catted_input_feat, dim=0) + # catted_original_coord_to_voxel_idx = torch.cat(catted_original_coord_to_voxel_idx, dim=0) + + else: + # Standard batch mode (original implementation) + grid, jfeats, original_coord_to_voxel_idx = create_grid_from_points( + grid_coord, feat, offset, voxel_size=0.02 + ) + # import pdb; pdb.set_trace() + # TODO: check the downsampling behavior is the same or not? 
+ assert ( + grid_coord.shape == grid.ijk.jdata.shape + ), f"grid_coord.shape: {grid_coord.shape}, grid.ijk.jdata.shape: {grid.ijk.jdata.shape}" # this is not always true, because mix-prob may duplicate points with the same coordinate. + assert ( + grid_coord.shape[0] == original_coord_to_voxel_idx.jdata.shape[0] + ), f"grid_coord.shape: {grid_coord.shape}, original_coord_to_voxel_idx.jdata.shape: {original_coord_to_voxel_idx.jdata.shape}" + + # import pdb; pdb.set_trace() + if torch.is_autocast_enabled(): + with torch.autocast(device_type="cuda", enabled=False): + grid, jfeats = self.fvdb_ptv3_model(grid, jfeats) + else: + grid, jfeats = self.fvdb_ptv3_model(grid, jfeats) + + output = jfeats.jdata[original_coord_to_voxel_idx.jdata] + # import pdb; pdb.set_trace() + + return output # return logits in torch.tensor format diff --git a/point_transformer_v3/model.py b/point_transformer_v3/fvdb_extensions/models/ptv3_fvdb.py similarity index 88% rename from point_transformer_v3/model.py rename to point_transformer_v3/fvdb_extensions/models/ptv3_fvdb.py index 7295bea..f19a29c 100644 --- a/point_transformer_v3/model.py +++ b/point_transformer_v3/fvdb_extensions/models/ptv3_fvdb.py @@ -1,18 +1,31 @@ # Copyright Contributors to the OpenVDB Project # SPDX-License-Identifier: Apache-2.0 -from typing import Dict, Tuple, Union, List +""" +PTV3 FVDB Implementation -# Add NVTX import for profiling -import flash_attn +This module contains the core Point Transformer V3 implementation using FVDB. +It works directly with FVDB GridBatch and JaggedTensor types. 
+ +For pointcept framework integration, see point_transformer_v3m1_fvdb.py +""" + +from typing import Any, Callable, cast + +try: + import flash_attn +except ImportError: + flash_attn = None + +from functools import partial + +import fvdb import torch import torch.nn import torch.nn.functional as F from timm.layers import DropPath -from functools import partial - -import fvdb +# Add NVTX import for profiling try: import torch.cuda.nvtx as nvtx @@ -39,21 +52,21 @@ def __init__( self, in_channels, embed_channels, - norm_layer_module: torch.nn.Module = torch.nn.LayerNorm, + norm_layer_module: type[torch.nn.Module] | Callable = torch.nn.LayerNorm, embedding_mode: str = "linear", - shared_plan_cache: Dict = None, + shared_plan_cache: dict | None = None, ): """ Args: in_channels (int): Number of channels in the input features. embed_channels (int): Number of channels in the output features. - norm_layer_module (torch.nn.Module): Normalization layer module. + norm_layer_module (type[torch.nn.Module] | Callable): Normalization layer module. embedding_mode (str): The type of embedding layer, "linear" or "conv3x3", "conv5x5". - shared_plan_cache (Dict): Shared cache for ConvolutionPlans across all layers. + shared_plan_cache (dict | None): Shared cache for ConvolutionPlans across all layers. 
""" super().__init__() self.embedding_mode = embedding_mode - self.shared_plan_cache = shared_plan_cache + self.shared_plan_cache = shared_plan_cache if shared_plan_cache is not None else {} if embedding_mode == "linear": self.embed = torch.nn.Linear(in_channels, embed_channels) @@ -65,7 +78,7 @@ def __init__( elif embedding_mode == "conv5x5": ## Implementation Option 1: Cascaded 3x3 convolutions # This approach uses two 3x3 convs to achieve a 5x5 receptive field with fewer parameters - # Parameters: (27 × in_channels × embed_channels) + (27 × embed_channels²) + # Parameters: (27 x in_channels x embed_channels) + (27 x embed_channels^2) self.embed_conv3x3_1 = fvdb.nn.SparseConv3d( in_channels, embed_channels, kernel_size=3, stride=1, bias=False ) @@ -75,7 +88,7 @@ def __init__( ## Implementation Option 2: Direct 5x5 convolution # TODO: Implementation pending - requires additional sparse convolution support from fVDB-core. - # Expected parameters: 125 × in_channels × embed_channels + # Expected parameters: 125 x in_channels x embed_channels # self.embed_conv5x5_1 = fvdb.nn.SparseConv3d(in_channels, embed_channels, kernel_size=5, stride=1) else: raise ValueError(f"Unsupported embedding mode: {embedding_mode}") @@ -127,7 +140,7 @@ def __init__( kernel_size: int = 2, in_channels: int = 64, out_channels: int = 64, - norm_layer_module: torch.nn.Module = torch.nn.LayerNorm, + norm_layer_module: type[torch.nn.Module] | Callable = torch.nn.LayerNorm, ): """ Args: @@ -162,7 +175,7 @@ def __init__( in_channels: int = 64, out_channels: int = 64, skip_channels: int = 64, - norm_layer_module: torch.nn.Module = torch.nn.LayerNorm, + norm_layer_module: type[torch.nn.Module] | Callable = torch.nn.LayerNorm, ): """ Args: @@ -238,7 +251,7 @@ def __init__( num_heads: int, proj_drop: float = 0.0, patch_size: int = 0, - qk_scale: float = None, + qk_scale: float | None = None, sliding_window_attention: bool = False, order_index: int = 0, order_types: tuple = ("vdb",), @@ -249,7 +262,7 @@ 
def __init__( num_heads (int): Number of attention heads in each block. proj_drop (float): Dropout rate for MLP layers. patch_size (int): Patch size for patch attention. - qk_scale (float): Scale factor for query-key dot product. If None, uses 1/sqrt(head_dim). + qk_scale (float | None): Scale factor for query-key dot product. If None, uses 1/sqrt(head_dim). sliding_window_attention (bool): Whether to use sliding window attention (uses patch_size as window size). order_index (int): Index into order_types to select which order to use for this block. order_types (tuple): Tuple of order type strings (e.g., ("z", "z-trans")). @@ -377,6 +390,9 @@ def forward(self, grid, feats): if self.sliding_window_attention and self.patch_size > 0: # Perform sliding window attention per-grid using flash attention + assert ( + flash_attn is not None + ), "flash_attn is required for sliding_window_attention. Install with: pip install flash-attn" num_voxels = feats_j.shape[0] H = self.num_heads D = self.head_dim @@ -390,8 +406,11 @@ def forward(self, grid, feats): continue qkv_b = qkv[start:end].view(1, Li, 3, H, D) window_size = (self.patch_size // 2, self.patch_size // 2) - out_b = flash_attn.flash_attn_qkvpacked_func( - qkv_b.half(), dropout_p=0.0, softmax_scale=self.scale, window_size=window_size + out_b = cast( + Any, + flash_attn.flash_attn_qkvpacked_func( + qkv_b.half(), dropout_p=0.0, softmax_scale=self.scale, window_size=window_size + ), ).reshape( Li, self.hidden_size ) # dtype: float16 @@ -405,6 +424,9 @@ def forward(self, grid, feats): elif self.patch_size > 0: # Perform attention within each patch_size window per-grid using varlen API + assert ( + flash_attn is not None + ), "flash_attn is required when patch_size > 0. 
Install with: pip install flash-attn" num_voxels = feats_j.shape[0] H = self.num_heads D = self.head_dim @@ -431,12 +453,15 @@ def forward(self, grid, feats): cu_seqlens = torch.zeros(len(lengths) + 1, device=qkv.device, dtype=torch.int32) cu_seqlens[1:] = torch.as_tensor(lengths, device=qkv.device, dtype=torch.int32).cumsum(dim=0) - feats_out_j = flash_attn.flash_attn_varlen_qkvpacked_func( - qkv.half(), - cu_seqlens, - max_seqlen=self.patch_size, - dropout_p=0.0, # TODO: implement attention dropout in the future. By default, it is 0. - softmax_scale=self.scale, + feats_out_j = cast( + Any, + flash_attn.flash_attn_varlen_qkvpacked_func( + qkv.half(), + cu_seqlens, + max_seqlen=self.patch_size, + dropout_p=0.0, # TODO: implement attention dropout in the future. By default, it is 0. + softmax_scale=self.scale, + ), ).reshape( num_voxels, self.hidden_size ) # dtype: float16 @@ -461,17 +486,17 @@ def forward(self, grid, feats): class PTV3_CPE(torch.nn.Module): - def __init__(self, hidden_size: int, no_conv_in_cpe: bool = False, shared_plan_cache: Dict = None): + def __init__(self, hidden_size: int, no_conv_in_cpe: bool = False, shared_plan_cache: dict | None = None): """ Args: hidden_size (int): Number of channels in the input features. no_conv_in_cpe (bool): Whether to disable convolution in CPE. - shared_plan_cache (Dict): Shared cache for ConvolutionPlans across all layers. + shared_plan_cache (dict | None): Shared cache for ConvolutionPlans across all layers. 
""" super().__init__() self.hidden_size = hidden_size self.no_conv_in_cpe = no_conv_in_cpe - self.shared_plan_cache = shared_plan_cache + self.shared_plan_cache = shared_plan_cache if shared_plan_cache is not None else {} self.cpe = torch.nn.ModuleList( [ ( @@ -521,12 +546,12 @@ def __init__( drop_path: float, proj_drop: float = 0.0, patch_size: int = 0, - qk_scale: float = None, + qk_scale: float | None = None, no_conv_in_cpe: bool = False, sliding_window_attention: bool = False, order_index: int = 0, order_types: tuple = ("vdb",), - shared_plan_cache: Dict = None, + shared_plan_cache: dict | None = None, ): """ Args: @@ -535,12 +560,12 @@ def __init__( drop_path (float): Drop path rate for regularization. proj_drop (float): Dropout rate for MLP layers. patch_size (int): Patch size for patch attention. - qk_scale (float): Scale factor for query-key dot product. If None, uses 1/sqrt(head_dim). + qk_scale (float | None): Scale factor for query-key dot product. If None, uses 1/sqrt(head_dim). no_conv_in_cpe (bool): Whether to disable convolution in CPE. sliding_window_attention (bool): Whether to use sliding window attention (uses patch_size as window size). order_index (int): Index into order_types to select which order to use for this block. order_types (tuple): Tuple of order type strings (e.g., ("z", "z-trans")). - shared_plan_cache (Dict): Shared cache for ConvolutionPlans across all layers. + shared_plan_cache (dict | None): Shared cache for ConvolutionPlans across all layers. """ super().__init__() @@ -599,11 +624,11 @@ def __init__( drop_path, # drop_path is a list of drop path rates for each block. 
proj_drop: float = 0.0, patch_size: int = 0, - qk_scale: float = None, + qk_scale: float | None = None, no_conv_in_cpe: bool = False, sliding_window_attention: bool = False, order_types: tuple = ("vdb",), - shared_plan_cache: Dict = None, + shared_plan_cache: dict | None = None, ): """ Args: @@ -613,11 +638,11 @@ def __init__( drop_path (list): Drop path rates for each block. proj_drop (float): Dropout rate for MLP layers. patch_size (int): Patch size for patch attention. - qk_scale (float): Scale factor for query-key dot product. If None, uses 1/sqrt(head_dim). + qk_scale (float | None): Scale factor for query-key dot product. If None, uses 1/sqrt(head_dim). no_conv_in_cpe (bool): Whether to disable convolution in CPE. sliding_window_attention (bool): Whether to use sliding window attention (uses patch_size as window size). order_types (tuple): Tuple of order type strings (e.g., ("z", "z-trans")). - shared_plan_cache (Dict): Shared cache for ConvolutionPlans across all layers. + shared_plan_cache (dict | None): Shared cache for ConvolutionPlans across all layers. """ super().__init__() self.depth = depth @@ -653,27 +678,27 @@ def __init__( self, num_classes: int, input_dim: int = 6, # xyz + intensity/reflectance + additional features - enc_depths: Tuple[int, ...] = ( + enc_depths: tuple[int, ...] = ( 2, 2, 2, 2, ), # default hyper-parameters to align with sonata ptv3's default hyper-parameters. - enc_channels: Tuple[int, ...] = (32, 64, 128, 256, 512), - enc_num_heads: Tuple[int, ...] = (2, 4, 8, 16, 32), - # enc_patch_size: Tuple[int, ...] = (4096), - dec_depths: Tuple[int, ...] = (), # by default, no decoder. - dec_channels: Tuple[int, ...] = (), - dec_num_heads: Tuple[int, ...] = (), + enc_channels: tuple[int, ...] = (32, 64, 128, 256, 512), + enc_num_heads: tuple[int, ...] = (2, 4, 8, 16, 32), + # enc_patch_size: tuple[int, ...] = (4096), + dec_depths: tuple[int, ...] = (), # by default, no decoder. + dec_channels: tuple[int, ...] 
= (), + dec_num_heads: tuple[int, ...] = (), patch_size: int = 0, drop_path: float = 0.3, proj_drop: float = 0.0, - qk_scale: float = None, + qk_scale: float | None = None, enable_batch_norm: bool = False, embedding_mode: str = "linear", no_conv_in_cpe: bool = False, sliding_window_attention: bool = False, - order_type: Union[str, tuple] = ("z", "z-trans"), + order_type: str | tuple = ("z", "z-trans"), shuffle_orders: bool = True, ) -> None: """ @@ -682,22 +707,22 @@ def __init__( Args: num_classes (int): Number of classes for segmentation. input_dim (int): Input feature dimension (default: 4 for xyz + intensity). - hidden_dims (Tuple[int, ...]): Hidden layer dimensions (not used in simplified version). - enc_depths (Tuple[int, ...]): Number of encoder blocks for each stage. - enc_channels (Tuple[int, ...]): Number of channels for each stage. - enc_num_heads (Tuple[int, ...]): Number of attention heads for each stage. - dec_depths (Tuple[int, ...]): Number of decoder blocks for each stage. - dec_channels (Tuple[int, ...]): Number of channels for each stage. - dec_num_heads (Tuple[int, ...]): Number of attention heads for each stage. + hidden_dims (tuple[int, ...]): Hidden layer dimensions (not used in simplified version). + enc_depths (tuple[int, ...]): Number of encoder blocks for each stage. + enc_channels (tuple[int, ...]): Number of channels for each stage. + enc_num_heads (tuple[int, ...]): Number of attention heads for each stage. + dec_depths (tuple[int, ...]): Number of decoder blocks for each stage. + dec_channels (tuple[int, ...]): Number of channels for each stage. + dec_num_heads (tuple[int, ...]): Number of attention heads for each stage. patch_size (int): Patch size for patch attention. drop_path (float): Drop path rate for regularization. proj_drop (float): Dropout rate for MLP layers. - qk_scale (float): Scale factor for query-key dot product. If None, uses 1/sqrt(head_dim). + qk_scale (float | None): Scale factor for query-key dot product. 
If None, uses 1/sqrt(head_dim). enable_batch_norm (bool): Whether to use batch normalization for the embedding, down pooling, and up pooling. embedding_mode (bool): the mode for the embedding layer, "linear" or "conv3x3", "conv5x5". no_conv_in_cpe (bool): Whether to disable convolution in CPE. sliding_window_attention (bool): Whether to use sliding window attention (uses patch_size as window size). - order (Union[str, tuple]): The type(s) of point ordering. Can be a single string ("vdb", "z", "z-trans", "hilbert", "hilbert-trans") + order_type (str | tuple): The type(s) of point ordering. Can be a single string ("vdb", "z", "z-trans", "hilbert", "hilbert-trans") or a tuple of strings (e.g., ("z", "z-trans")). Each block within a stage cycles through the order types. shuffle_orders (bool): Whether to shuffle the order of order types at the beginning of each forward pass and after each pooling. """ diff --git a/point_transformer_v3/requirements.txt b/point_transformer_v3/requirements.txt index 2c18427..c4b1285 100644 --- a/point_transformer_v3/requirements.txt +++ b/point_transformer_v3/requirements.txt @@ -1,2 +1,23 @@ -flash-attn==2.7.4.post1 +# Core dependencies for PT-v3 FVDB implementation timm +requests + +# flash-attn is only needed when patch_size > 0 (default config uses patch_size=1024) +# While PyTorch 2.8+ has built-in flash attention, flash-attn provides optimized varlen functions +# that are faster for variable-length sequences. The build is slow but worth it for performance. 
+# +# If pip install freezes or is very slow, try installing separately with: +# MAX_JOBS=4 pip install flash-attn==2.7.4.post1 --no-build-isolation +# Or check for pre-built wheels at: https://github.com/Dao-AILab/flash-attention/releases +flash-attn==2.7.4.post1 + +# Pointcept framework dependencies (only needed when using point_transformer_v3m1_fvdb.py) +# Install from PyG wheels for PyTorch 2.8.0 + CUDA 12.9 +--find-links https://data.pyg.org/whl/torch-2.8.0+cu129.html +torch-cluster +# Sparse convolution - spconv-cu129 not available, try cu124 (usually compatible with 12.9) +# If this fails, install from source: https://github.com/traveller59/spconv +spconv-cu124 + +# Development +black~=24.0 diff --git a/point_transformer_v3/scripts/README.md b/point_transformer_v3/scripts/README.md new file mode 100644 index 0000000..0f89317 --- /dev/null +++ b/point_transformer_v3/scripts/README.md @@ -0,0 +1,32 @@ +# Scripts Directory + +This directory contains utility scripts organized by purpose. 
+ +## `data/` - Data Management Scripts + +Scripts for downloading and preprocessing datasets: + +- **`download_example_data.py`**: Downloads preprocessed test data from remote repository +- **`prepare_scannet_dataset.py`**: Prepares ScanNet dataset samples from raw data + +## `test/` - Testing and Validation Scripts + +Scripts for running inference and validating results: + +- **`minimal_inference.py`**: Runs PT-v3 model inference on point cloud data +- **`compute_difference.py`**: Compares inference outputs between different implementations + +## Usage + +All scripts should be run from the `point_transformer_v3/` directory: + +```bash +# Data scripts +python scripts/data/download_example_data.py +python scripts/data/prepare_scannet_dataset.py --data-root /path/to/scannet --output data/samples.json + +# Test scripts +python scripts/test/minimal_inference.py --data-path data/scannet_samples.json +python scripts/test/compute_difference.py --stats_path_1 data/output1.json --stats_path_2 data/output2.json +``` + diff --git a/point_transformer_v3/scripts/apply_formatting.py b/point_transformer_v3/scripts/apply_formatting.py new file mode 100755 index 0000000..e8130a5 --- /dev/null +++ b/point_transformer_v3/scripts/apply_formatting.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python3 + +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 + + +""" +Apply code formatting to point_transformer_v3 project. + +This script applies black formatting to: +- scripts directory +- fvdb_extensions directory +- setup_env.py + +It ignores the external directory. 
+""" + +from __future__ import annotations + +import subprocess +import sys +from pathlib import Path + +# Get the directory containing this script +SCRIPT_DIR = Path(__file__).parent.resolve() +PROJECT_ROOT = SCRIPT_DIR.parent.resolve() + + +def main(): + """Apply formatting using black.""" + # Directories and files to format + targets = [ + str(PROJECT_ROOT / "scripts"), + str(PROJECT_ROOT / "fvdb_extensions"), + str(PROJECT_ROOT / "setup_env.py"), + ] + + # Black options matching codestyle.yml + black_options = [ + "--target-version=py311", + "--line-length=120", + "--verbose", + ] + + # Run black via python -m for better portability + cmd = [sys.executable, "-m", "black"] + black_options + targets + + print(f"Running: {' '.join(cmd)}") + print(f"Formatting targets:") + for target in targets: + print(f" - {target}") + print() + + try: + result = subprocess.run(cmd, check=True) + print("\n[OK] Formatting applied successfully!") + return 0 + except subprocess.CalledProcessError as e: + print(f"\n[FAIL] Formatting failed with exit code {e.returncode}") + return e.returncode + except FileNotFoundError: + print("\n[FAIL] Error: black not found. Please install it:") + print(" pip install black~=24.0") + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/point_transformer_v3/scripts/check_spdx.py b/point_transformer_v3/scripts/check_spdx.py new file mode 100755 index 0000000..adea244 --- /dev/null +++ b/point_transformer_v3/scripts/check_spdx.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python3 +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 + +""" +Check for SPDX identifiers in source files. +Skips the directories named in EXCLUDES (e.g. external, __pycache__, .git). 
+""" + +import os +import sys +from pathlib import Path + +# Extensions to check +EXTENSIONS = {".py", ".cpp", ".h", ".cu", ".cuh", ".sh"} + +# Directories to exclude +EXCLUDES = {"external", "__pycache__", ".git", ".github", ".vscode", ".idea"} + + +def check_file(filepath): + """Check if file contains SPDX-License-Identifier.""" + try: + with open(filepath, "r", encoding="utf-8") as f: + # Read first 20 lines + for _ in range(20): + line = f.readline() + if not line: + break + if "SPDX-License-Identifier" in line: + return True + except Exception as e: + print(f"Error reading {filepath}: {e}") + return False + return False + + +def main(): + script_dir = Path(__file__).parent.resolve() + project_root = script_dir.parent + + print(f"Checking for SPDX identifiers in {project_root}...") + print(f"Excluding: {', '.join(EXCLUDES)}") + + failed_files = [] + checked_count = 0 + + for root, dirs, files in os.walk(project_root): + # Modify dirs in-place to skip excluded directories + dirs[:] = [d for d in dirs if d not in EXCLUDES] + + for file in files: + file_path = Path(root) / file + if file_path.suffix in EXTENSIONS: + checked_count += 1 + if not check_file(file_path): + failed_files.append(str(file_path.relative_to(project_root))) + + print(f"Checked {checked_count} files.") + + if failed_files: + print("\nMissing SPDX-License-Identifier in:") + for f in failed_files: + print(f" - {f}") + return 1 + + print("\nAll files have SPDX identifiers.") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/point_transformer_v3/scripts/data/__pycache__/download_example_data.cpython-312.pyc b/point_transformer_v3/scripts/data/__pycache__/download_example_data.cpython-312.pyc new file mode 100644 index 0000000..10026d6 Binary files /dev/null and b/point_transformer_v3/scripts/data/__pycache__/download_example_data.cpython-312.pyc differ diff --git a/point_transformer_v3/scripts/data/__pycache__/prepare_scannet_dataset.cpython-312.pyc 
b/point_transformer_v3/scripts/data/__pycache__/prepare_scannet_dataset.cpython-312.pyc new file mode 100644 index 0000000..0c24169 Binary files /dev/null and b/point_transformer_v3/scripts/data/__pycache__/prepare_scannet_dataset.cpython-312.pyc differ diff --git a/point_transformer_v3/download_example_data.py b/point_transformer_v3/scripts/data/download_example_data.py similarity index 89% rename from point_transformer_v3/download_example_data.py rename to point_transformer_v3/scripts/data/download_example_data.py index 47db6e6..4ea12c5 100644 --- a/point_transformer_v3/download_example_data.py +++ b/point_transformer_v3/scripts/data/download_example_data.py @@ -1,6 +1,8 @@ # Copyright Contributors to the OpenVDB Project # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import json import logging import os @@ -15,7 +17,9 @@ def download_example_data(file_name: str, logger: logging.Logger): """ raw_url = f"https://raw.githubusercontent.com/voxel-foundation/fvdb-test-data/scannet/unit_tests/ptv3/{file_name}" - data_dir = Path("data") + # Script is in scripts/data/, so go up one level to get project root + project_root = Path(__file__).parent.parent.parent.resolve() + data_dir = project_root / "data" data_dir.mkdir(exist_ok=True) output_file = data_dir / file_name diff --git a/point_transformer_v3/prepare_scannet_dataset.py b/point_transformer_v3/scripts/data/prepare_scannet_dataset.py similarity index 93% rename from point_transformer_v3/prepare_scannet_dataset.py rename to point_transformer_v3/scripts/data/prepare_scannet_dataset.py index 09d0ebd..7561f93 100644 --- a/point_transformer_v3/prepare_scannet_dataset.py +++ b/point_transformer_v3/scripts/data/prepare_scannet_dataset.py @@ -1,4 +1,6 @@ +# Copyright Contributors to the OpenVDB Project # SPDX-License-Identifier: Apache-2.0 + """ Modified from https://github.com/Pointcept/Pointcept.git @@ -9,13 +11,15 @@ ensures consistent point counts per sample. 
""" +from __future__ import annotations + import argparse import glob import json import logging import os from pathlib import Path -from typing import Any, Dict, List +from typing import Any import numpy as np from torch.utils.data import Dataset @@ -192,7 +196,8 @@ def export_scannet_samples( # Randomly sample scenes np.random.seed(42) # create a permutation of the scene paths - selected_paths = np.random.permutation(scene_paths) + selected_paths = np.array(scene_paths) + selected_paths = selected_paths[np.random.permutation(len(selected_paths))] # Initialize dataset dataset = ScanNetDataset(data_root=data_root, split=split) @@ -304,8 +309,6 @@ def main(): if __name__ == "__main__": main() -# Create scannet_samples_small.json -# python prepare_scannet_dataset.py --data-root /home/hexuz/openvdb/fvdb/projects/sparse_attention/Pointcept/data/scannet --output data/scannet_samples_small.json --num-samples 8 --split train --min-points 2048 --max-points 4096 --voxel-size 0.1 --patch-size 1024 - -# Create scannet_samples_large.json -# python prepare_scannet_dataset.py --data-root /home/hexuz/openvdb/fvdb/projects/sparse_attention/Pointcept/data/scannet --output data/scannet_samples_large.json --num-samples 4 --split train --min-points 50000 --max-points 100000 --voxel-size 0.02 --patch-size 1024 +# Run from point_transformer_v3/ directory: +# python scripts/data/prepare_scannet_dataset.py --data-root /path/to/scannet --output data/scannet_samples_small.json --num-samples 8 --split train --min-points 2048 --max-points 4096 --voxel-size 0.1 --patch-size 1024 +# python scripts/data/prepare_scannet_dataset.py --data-root /path/to/scannet --output data/scannet_samples_large.json --num-samples 4 --split train --min-points 50000 --max-points 100000 --voxel-size 0.02 --patch-size 1024 diff --git a/point_transformer_v3/scripts/fix_formatting.sh b/point_transformer_v3/scripts/fix_formatting.sh new file mode 100755 index 0000000..d8313d5 --- /dev/null +++ 
b/point_transformer_v3/scripts/fix_formatting.sh @@ -0,0 +1,23 @@ +#!/bin/bash +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 + +# Exit on error +set -e + +# Determine the directory of this script +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + +# Determine the root of point_transformer_v3 (parent of scripts) +PTV3_ROOT="$(dirname "$DIR")" + +# Change to the project root +cd "$PTV3_ROOT" + +echo "Running black formatting on $(pwd)..." + +# Run black, excluding the submodule +# The pattern "external/pointcept" will match the directory relative to the root +black --target-version=py311 --line-length=120 --extend-exclude "external/pointcept" . + +echo "Formatting complete." diff --git a/point_transformer_v3/scripts/test/__pycache__/compute_difference.cpython-312.pyc b/point_transformer_v3/scripts/test/__pycache__/compute_difference.cpython-312.pyc new file mode 100644 index 0000000..47b0445 Binary files /dev/null and b/point_transformer_v3/scripts/test/__pycache__/compute_difference.cpython-312.pyc differ diff --git a/point_transformer_v3/scripts/test/__pycache__/minimal_inference.cpython-312.pyc b/point_transformer_v3/scripts/test/__pycache__/minimal_inference.cpython-312.pyc new file mode 100644 index 0000000..c8c7bb7 Binary files /dev/null and b/point_transformer_v3/scripts/test/__pycache__/minimal_inference.cpython-312.pyc differ diff --git a/point_transformer_v3/compute_difference.py b/point_transformer_v3/scripts/test/compute_difference.py similarity index 93% rename from point_transformer_v3/compute_difference.py rename to point_transformer_v3/scripts/test/compute_difference.py index df75b1e..d48d3e7 100644 --- a/point_transformer_v3/compute_difference.py +++ b/point_transformer_v3/scripts/test/compute_difference.py @@ -6,17 +6,19 @@ Usage: python compute_difference.py file1.json file2.json """ +from __future__ import annotations + import argparse import json import logging import os import sys -from typing 
import Any, Dict, List +from typing import Any import numpy as np -def load_stats_file(filepath: str, logger: logging.Logger) -> tuple[List[Dict[str, Any]], Dict[str, Any]]: +def load_stats_file(filepath: str, logger: logging.Logger) -> tuple[list[dict[str, Any]], dict[str, Any]]: """Load and parse a minimal_inference_stats.json file. Args: @@ -58,8 +60,8 @@ def load_stats_file(filepath: str, logger: logging.Logger) -> tuple[List[Dict[st def compute_deviations( - stats1: List[Dict[str, Any]], stats2: List[Dict[str, Any]], logger: logging.Logger -) -> Dict[str, Dict[str, float]]: + stats1: list[dict[str, Any]], stats2: list[dict[str, Any]], logger: logging.Logger +) -> dict[str, dict[str, float]]: """Compute deviations between corresponding entries in two stats files. Args: @@ -120,8 +122,8 @@ def compute_deviations( def compute_global_deviations( - global_stats1: Dict[str, Any], global_stats2: Dict[str, Any], logger: logging.Logger -) -> Dict[str, Dict[str, float]]: + global_stats1: dict[str, Any], global_stats2: dict[str, Any], logger: logging.Logger +) -> dict[str, dict[str, float]]: """Compute deviations between global statistics from two files. 
Args: @@ -251,8 +253,6 @@ def main(): if __name__ == "__main__": main() -# scannet_samples_large.json -# python compute_difference.py --stats_path_1 data/scannet_samples_large_output.json --stats_path_2 data/scannet_samples_large_output_gt.json - -# scannet_samples_small.json -# python compute_difference.py --stats_path_1 data/scannet_samples_small_output.json --stats_path_2 data/scannet_samples_small_output_gt.json +# Run from point_transformer_v3/ directory: +# python scripts/test/compute_difference.py --stats_path_1 data/scannet_samples_large_output.json --stats_path_2 data/scannet_samples_large_output_gt.json +# python scripts/test/compute_difference.py --stats_path_1 data/scannet_samples_small_output.json --stats_path_2 data/scannet_samples_small_output_gt.json diff --git a/point_transformer_v3/minimal_inference.py b/point_transformer_v3/scripts/test/minimal_inference.py similarity index 91% rename from point_transformer_v3/minimal_inference.py rename to point_transformer_v3/scripts/test/minimal_inference.py index 216da66..42e4a0c 100644 --- a/point_transformer_v3/minimal_inference.py +++ b/point_transformer_v3/scripts/test/minimal_inference.py @@ -8,15 +8,26 @@ 2. 
Load and run the PT-v3 model """ +from __future__ import annotations + import argparse import gc import json import logging import os +import sys +from pathlib import Path +from typing import Any + +# Setup paths for imports +# Script is in scripts/test/, so go up two levels to get project root +_project_root = Path(__file__).parent.parent.parent.resolve() +sys.path.insert(0, str(_project_root)) +sys.path.insert(0, str(_project_root / "external" / "pointcept")) import numpy as np import torch -from model import PTV3 +from fvdb_extensions.models.ptv3_fvdb import PTV3 import fvdb @@ -37,7 +48,7 @@ def range_pop(self): nvtx = DummyNVTX() -def create_ptv3_model(args, device, num_classes): +def create_ptv3_model(args: argparse.Namespace, device: torch.device | str, num_classes: int) -> torch.nn.Module: """Create a PT-v3 model. Args: @@ -129,7 +140,9 @@ def create_ptv3_model(args, device, num_classes): return model -def prepare_batched_inputs_from_scannet_points(batch_samples, voxel_size=0.1, device="cuda"): +def prepare_batched_inputs_from_scannet_points( + batch_samples: list[dict[str, Any]], voxel_size: float = 0.1, device: torch.device | str = "cuda" +) -> tuple[fvdb.GridBatch, fvdb.JaggedTensor]: """Prepare batched inputs from a list of ScanNet-like samples. 
Args: @@ -165,7 +178,7 @@ def main(): parser = argparse.ArgumentParser(description="Minimal inference script for PT-v3 on ScanNet point cloud data") parser.add_argument( - "--data-path", type=str, default="scannet_samples.json", help="Path to the scannet samples json file" + "--data-path", type=str, default="data/scannet_samples.json", help="Path to the scannet samples json file" ) parser.add_argument("--voxel-size", type=float, default=0.02, help="Voxel size for grid sampling") parser.add_argument("--patch-size", type=int, default=1024, help="Maximum points per sample") @@ -342,8 +355,6 @@ def main(): main() ## Example commands: -# scannet_samples_small.json -# python minimal_inference.py --data-path data/scannet_samples_small.json --voxel-size 0.1 --patch-size 1024 --batch-size 1 - -# scannet_samples_large.json -# python minimal_inference.py --data-path data/scannet_samples_large.json --voxel-size 0.02 --patch-size 1024 --batch-size 1 +# Run from point_transformer_v3/ directory: +# python scripts/test/minimal_inference.py --data-path data/scannet_samples_small.json --voxel-size 0.1 --patch-size 1024 --batch-size 1 +# python scripts/test/minimal_inference.py --data-path data/scannet_samples_large.json --voxel-size 0.02 --patch-size 1024 --batch-size 1 diff --git a/point_transformer_v3/setup_env.py b/point_transformer_v3/setup_env.py new file mode 100644 index 0000000..f61300e --- /dev/null +++ b/point_transformer_v3/setup_env.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 + +""" +Setup script for point_transformer_v3 project. 
+ +This script sets up the Python path to allow imports from: +- fvdb_extensions (local extensions) +- external.pointcept.pointcept (pointcept submodule) + +Usage: + python setup_env.py + # or source the shell equivalent: + source setup_env.sh # This will export PYTHONPATH + +Or import it in your scripts: + import setup_env # This will add paths to sys.path +""" + +import os +import sys +from pathlib import Path + +# Get the directory containing this script (point_transformer_v3) +PROJECT_ROOT = Path(__file__).parent.resolve() + + +def setup_paths(): + """Insert the project root and external/pointcept at the front of sys.path if absent; return both paths.""" + paths_to_add = [ + str(PROJECT_ROOT), # For importing fvdb_extensions + str(PROJECT_ROOT / "external" / "pointcept"), # For importing pointcept + ] + + for path in paths_to_add: + if path not in sys.path: + sys.path.insert(0, path) + + return paths_to_add + + +def get_pythonpath(): + """Return the project root and external/pointcept joined with os.pathsep, for a shell export.""" + paths = [ + str(PROJECT_ROOT), + str(PROJECT_ROOT / "external" / "pointcept"), + ] + return os.pathsep.join(paths) + + +if __name__ == "__main__": + # When run as script, print export command + pythonpath = get_pythonpath() + print(f"export PYTHONPATH={pythonpath}:$PYTHONPATH") + print("\n# Or run this script in Python to set up paths:") + print("import setup_env") +else: + # When imported, automatically set up paths + setup_paths() diff --git a/point_transformer_v3/setup_env.sh b/point_transformer_v3/setup_env.sh new file mode 100755 index 0000000..98dc0e1 --- /dev/null +++ b/point_transformer_v3/setup_env.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 +# +# Setup script for point_transformer_v3 +# Source this file (source setup_env.sh) so the exported PYTHONPATH persists in your shell + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +export PYTHONPATH="${SCRIPT_DIR}:${SCRIPT_DIR}/external/pointcept:${PYTHONPATH}" + +echo "PYTHONPATH set to:" +echo "$PYTHONPATH" +echo "" +echo "You can now run 
scripts from this directory." +echo "Example: python scripts/test/minimal_inference.py --help"