mivertowski
diff --git a/‎docs/paper/main.pdf‎
166 KB b/‎docs/paper/main.pdf‎
166 KB
diff --git a/‎docs/paper/main.tex‎
Lines changed: 114 additions & 44 deletions b/‎docs/paper/main.tex‎
Lines changed: 114 additions & 44 deletions
diff --git a/‎docs/paper/references.bib‎
Lines changed: 34 additions & 0 deletions b/‎docs/paper/references.bib‎
Lines changed: 34 additions & 0 deletions
diff --git a/‎docs/paper/sections/00-abstract.tex‎
Lines changed: 16 additions & 14 deletions b/‎docs/paper/sections/00-abstract.tex‎
Lines changed: 16 additions & 14 deletions
diff --git a/‎docs/paper/sections/01-introduction.tex‎
Lines changed: 38 additions & 10 deletions b/‎docs/paper/sections/01-introduction.tex‎
Lines changed: 38 additions & 10 deletions
@@ -1,65 +1,109 @@
-% RingKernel: A GPU-Native Persistent Actor Model
-% Technical Paper - Main Document
+% The GPU-Native Persistent Actor Model
+% Technical Paper - arXiv Preprint Style
 %
-% Target venues: arXiv (cs.DC, cs.PL), ASPLOS, EuroSys, PLDI, PPoPP
+% Describes a paradigm for treating GPU compute units as actors with
+% persistent kernels, lock-free message passing, and causal ordering.
 %
-\documentclass[11pt,a4paper]{article}
-
-% Page geometry
-\usepackage[margin=1in]{geometry}
+% Implementations: RingKernel (Rust), DotCompute (.NET), Orleans.GpuBridge, RustGraph
+%
+% Target: arXiv (cs.DC, cs.PL, cs.AR)
+%
+\documentclass[11pt,letterpaper]{article}
+
+%% ============================================================================
+%% arXiv-style formatting
+%% ============================================================================
+
+% Page geometry - generous margins for readability
+\usepackage[
+    letterpaper,
+    top=1in,
+    bottom=1in,
+    left=1.25in,
+    right=1.25in
+]{geometry}
+
+% Typography
+\usepackage[T1]{fontenc}
+\usepackage{lmodern}              % Latin Modern fonts
+\usepackage{microtype}            % Improved typography
+\usepackage{setspace}
+\setstretch{1.15}                 % Slightly increased line spacing
+
+% Math
+\usepackage{amsmath}
+\usepackage{amssymb}
+\usepackage{amsthm}
 
-% Packages
+% Tables and figures
 \usepackage{booktabs}
+\usepackage{multirow}
 \usepackage{subcaption}
+\usepackage{graphicx}
+\usepackage{float}
+
+% Code listings
 \usepackage{listings}
 \usepackage{xcolor}
+
+% Graphics
 \usepackage{tikz}
 \usepackage{pgfplots}
-% algorithm/algpseudocode removed - use lstlisting for pseudocode
+\pgfplotsset{compat=1.16}
+\usetikzlibrary{shapes,arrows,positioning,fit,calc}
+
+% References and links
 \usepackage{hyperref}
-\usepackage{amsmath}
-\usepackage{amssymb}
-\usepackage{graphicx}
-\usepackage{multirow}
 \usepackage{url}
-\usepackage{natbib}
+\usepackage[numbers,sort&compress]{natbib}
 
-\pgfplotsset{compat=1.16}
-\usetikzlibrary{shapes,arrows,positioning,fit,calc}
+% Author handling
+\usepackage{authblk}
+\usepackage{orcidlink}            % ORCID icons
 
-% Hyperref setup
+%% ============================================================================
+%% Hyperref setup
+%% ============================================================================
 \hypersetup{
     colorlinks=true,
-    linkcolor=blue,
+    linkcolor=blue!70!black,
     filecolor=magenta,
-    urlcolor=cyan,
-    citecolor=blue,
+    urlcolor=blue!70!black,
+    citecolor=green!50!black,
+    pdftitle={The GPU-Native Persistent Actor Model},
+    pdfauthor={Michael Ivertowski},
+    pdfsubject={GPU Computing, Actor Model, Distributed Systems},
+    pdfkeywords={Actor Model, GPU, CUDA, Persistent Kernels, HLC}
 }
 
-% Code listing style for Rust
+%% ============================================================================
+%% Code listing styles
+%% ============================================================================
+
+% Rust
 \lstdefinelanguage{Rust}{
   keywords={fn, let, mut, if, else, match, for, while, loop, return, struct, enum, impl, trait, pub, use, mod, async, await, self, Self, where, type, const, static, unsafe, extern, crate, super},
-  keywordstyle=\color{blue}\bfseries,
-  keywords=[2]{i32, i64, u32, u64, f32, f64, bool, usize, isize, String, Vec, Option, Result, Arc, Box},
-  keywordstyle=[2]\color{teal},
+  keywordstyle=\color{blue!80!black}\bfseries,
+  keywords=[2]{i32, i64, u32, u64, f32, f64, bool, usize, isize, String, Vec, Option, Result, Arc, Box, AtomicU32, AtomicU64},
+  keywordstyle=[2]\color{teal!80!black},
   comment=[l]{//},
   morecomment=[s]{/*}{*/},
   commentstyle=\color{gray}\itshape,
-  stringstyle=\color{red},
+  stringstyle=\color{red!70!black},
   morestring=[b]",
   basicstyle=\ttfamily\small,
   breaklines=true,
   showstringspaces=false,
   tabsize=2,
 }
 
-% Code listing style for CUDA
+% CUDA
 \lstdefinelanguage{CUDA}{
   language=C++,
-  morekeywords={__global__, __device__, __shared__, __host__, threadIdx, blockIdx, blockDim, gridDim, atomicAdd, atomicCAS, __syncthreads},
-  keywordstyle=\color{blue}\bfseries,
+  morekeywords={__global__, __device__, __shared__, __host__, threadIdx, blockIdx, blockDim, gridDim, atomicAdd, atomicCAS, __syncthreads, __threadfence},
+  keywordstyle=\color{blue!80!black}\bfseries,
   commentstyle=\color{gray}\itshape,
-  stringstyle=\color{red},
+  stringstyle=\color{red!70!black},
   basicstyle=\ttfamily\small,
   breaklines=true,
   showstringspaces=false,
@@ -68,39 +112,65 @@
 \lstset{
   language=Rust,
   frame=single,
+  framerule=0.5pt,
+  rulecolor=\color{gray!50},
   numbers=left,
   numberstyle=\tiny\color{gray},
   xleftmargin=2em,
   framexleftmargin=1.5em,
+  backgroundcolor=\color{gray!5},
+  captionpos=b,
 }
 
-% Document metadata
-\title{\textbf{RingKernel: A GPU-Native Persistent Actor Model for\\High-Performance Concurrent Computing}}
+%% ============================================================================
+%% Custom commands
+%% ============================================================================
+\newcommand{\arxiv}[1]{\href{https://arxiv.org/abs/#1}{arXiv:#1}}
+\newcommand{\github}[1]{\href{https://github.com/#1}{\texttt{github.com/#1}}}
 
-\author{
-  Michael Ivertowski\\
-  \textit{Independent Researcher}\\
-  Zurich, Switzerland\\
-  \texttt{mivertowski@outlook.com}
+%% ============================================================================
+%% Document metadata
+%% ============================================================================
+
+\title{%
+    \LARGE\textbf{The GPU-Native Persistent Actor Model:}\\[0.3em]
+    \Large\textbf{Bringing Actor Semantics to Massively Parallel Hardware}
+}
+
+\author[1]{Michael Ivertowski~\orcidlink{0009-0008-7829-2249}}
+\affil[1]{%
+    Ernst \& Young AG\\
+    Zurich, Switzerland\\
+    \texttt{michael.ivertowski@ch.ey.com}
 }
 
-\date{\today}
+\date{%
+    January 2026\\[1em]
+    \small\textit{Preprint. Under review.}
+}
 
+%% ============================================================================
+%% Document
+%% ============================================================================
 \begin{document}
 
 \maketitle
 
-% Abstract
+%% Abstract
 \begin{abstract}
+\noindent
 \input{sections/00-abstract}
 \end{abstract}
 
 \vspace{1em}
-\noindent\textbf{Keywords:} Actor Model, GPU Computing, Persistent Kernels, Message Passing, CUDA, Hybrid Logical Clocks, Lock-Free Algorithms
+\noindent\textbf{Keywords:} Actor Model, GPU Computing, Persistent Kernels, Message Passing, Hybrid Logical Clocks, Lock-Free Algorithms, CUDA, WebGPU, Distributed Systems, Graph Analytics
 
-\vspace{1em}
+\vspace{0.5em}
+\noindent\textbf{ACM CCS:} Computer systems organization $\rightarrow$ Parallel architectures; Software and its engineering $\rightarrow$ Concurrent programming structures
+
+\vspace{1.5em}
 
-% Main content sections
+%% Main content sections
 \input{sections/01-introduction}
 \input{sections/02-background}
 \input{sections/03-related-work}
@@ -110,17 +180,17 @@
 \input{sections/07-discussion}
 \input{sections/08-conclusion}
 
-% Acknowledgments
+%% Acknowledgments
 \section*{Acknowledgments}
 We thank the open-source community for their contributions to the CUDA ecosystem,
 particularly the cudarc project for Rust CUDA bindings. We also acknowledge the
 foundational work on the actor model by Carl Hewitt and colleagues.
 
-% Bibliography
+%% Bibliography
 \bibliographystyle{plainnat}
 \bibliography{references}
 
-% Appendix
+%% Appendix
 \appendix
 \input{sections/09-appendix}
 
 
@@ -266,3 +266,37 @@ @misc{nvidia2023cooperative
   year         = {2023},
   howpublished = {\url{https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#cooperative-groups}},
 }
+
+%% ===== GPU-Native Actor Ecosystem =====
+
+@misc{ringkernel2025,
+  author       = {Ivertowski, Michael},
+  title        = {{RingKernel}: {GPU}-Native Persistent Actor Model for {Rust}},
+  year         = {2025},
+  howpublished = {\url{https://github.com/mivertowski/RustCompute}},
+  note         = {Rust implementation with CUDA/WebGPU backends},
+}
+
+@misc{dotcompute2025,
+  author       = {Ivertowski, Michael},
+  title        = {{DotCompute}: Universal Compute Acceleration for {.NET}},
+  year         = {2025},
+  howpublished = {\url{https://github.com/mivertowski/DotCompute}},
+  note         = {.NET 9 implementation with CUDA/OpenCL/Metal backends},
+}
+
+@misc{orleansgpubridge2025,
+  author       = {Ivertowski, Michael},
+  title        = {{Orleans.GpuBridge}: {GPU} Acceleration for {Microsoft} {Orleans}},
+  year         = {2025},
+  howpublished = {\url{https://github.com/mivertowski/Orleans.GpuBridge}},
+  note         = {Integration of GPU-native actors with Orleans virtual actors},
+}
+
+@misc{rustgraph2025,
+  author       = {Ivertowski, Michael},
+  title        = {{RustGraph}: Living Graph Database with {GPU}-Native Actors},
+  year         = {2025},
+  howpublished = {\url{https://github.com/mivertowski/RustGraph}},
+  note         = {Graph nodes as persistent GPU actors with 64+ living analytics},
+}
@@ -4,20 +4,22 @@
 The actor model, introduced by Hewitt in 1973, has become foundational for building
 concurrent and distributed systems. However, existing implementations target CPU
 architectures, leaving GPU parallelism largely unexplored for actor-based computation.
-We present \textbf{RingKernel}, a GPU-native persistent actor model that treats GPU
+We present the \textbf{GPU-Native Persistent Actor Model}, a paradigm that treats GPU
 compute units as long-running actors with lock-free message passing and causal ordering.
 
-Our key contributions are: (1) a formal extension of the actor model for GPU execution
-with Host-to-Kernel (H2K), Kernel-to-Host (K2H), and Kernel-to-Kernel (K2K) messaging
-channels; (2) a 128-byte \texttt{ControlBlock} structure for GPU-resident actor lifecycle
-management; (3) integration of Hybrid Logical Clocks (HLC) for causal ordering across
-thousands of concurrent GPU actors; and (4) a Rust-to-CUDA transpiler that generates
-persistent kernel code from high-level actor definitions.
+This paper describes \textbf{RingKernel}, the Rust implementation of this paradigm,
+alongside three companion frameworks: \textbf{DotCompute} (.NET), \textbf{Orleans.GpuBridge}
+(Microsoft Orleans integration), and \textbf{RustGraph} (living graph database). Together,
+these systems demonstrate the broad applicability of GPU-native actors.
 
-We evaluate RingKernel on NVIDIA RTX Ada GPUs, demonstrating that persistent GPU actors
-achieve \textbf{11,327$\times$ lower latency} for interactive commands compared to
-traditional kernel launches (0.03$\mu$s vs 317$\mu$s). For mixed workloads combining
-computation with interactive commands, RingKernel achieves \textbf{2.7$\times$ higher
-throughput} than the traditional launch-per-operation model. Our system bridges the
-gap between high-level actor semantics and GPU hardware capabilities, enabling new
-classes of interactive GPU applications.
+Our key contributions are: (1) formalization of GPU actor semantics with Host-to-Kernel (H2K),
+Kernel-to-Host (K2H), and Kernel-to-Kernel (K2K) messaging channels; (2) a 128-byte
+\texttt{ControlBlock} structure for GPU-resident actor lifecycle management; (3) integration
+of Hybrid Logical Clocks (HLC) for causal ordering across thousands of concurrent GPU actors;
+and (4) cross-language implementations demonstrating the paradigm's language independence.
+
+We evaluate on NVIDIA RTX Ada GPUs, demonstrating that persistent GPU actors achieve
+\textbf{11,327$\times$ lower latency} for interactive commands compared to traditional
+kernel launches (0.03\,$\mu$s vs.\ 317\,$\mu$s). For mixed workloads, GPU-native actors achieve
+\textbf{2.7$\times$ higher throughput}, enabling new classes of interactive GPU applications
+including real-time fraud detection, living graph analytics, and distributed digital twins.
@@ -52,12 +52,35 @@ \subsection{Persistent Kernels: A Partial Solution}
 
 \subsection{Our Contribution: GPU-Native Actors}
 
-We present \textbf{RingKernel}, a system that applies actor model semantics to GPU
-computing. Our key insight is that GPU threads (or thread blocks) can be viewed as
-actors: they have private state (registers, shared memory), communicate via messages
-(through lock-free queues), and run persistently.
+We present the \textbf{GPU-Native Persistent Actor Model}, a paradigm that applies
+actor semantics to GPU computing. Our key insight is that GPU threads (or thread blocks)
+can be viewed as actors: they have private state (registers, shared memory), communicate
+via messages (through lock-free queues), and run persistently.
 
-RingKernel makes the following contributions:
+This paradigm is realized through four complementary implementations:
+
+\begin{itemize}
+    \item \textbf{RingKernel} (Rust): The reference implementation described in this paper,
+    featuring a Rust-to-CUDA transpiler and comprehensive runtime.
+
+    \item \textbf{DotCompute} (.NET 9/C\#): A production-grade framework with multi-backend
+    support (CUDA, OpenCL, Metal), LINQ-to-GPU compilation, and Native AOT compatibility.
+
+    \item \textbf{Orleans.GpuBridge} (.NET/Orleans): Integration with Microsoft Orleans'
+    virtual actor model, enabling distributed GPU actors across Orleans clusters with
+    hypergraph support and temporal causality.
+
+    \item \textbf{RustGraph} (Rust): A living graph database where nodes and edges are
+    persistent GPU actors, maintaining 64+ analytics algorithms via continuous message
+    propagation with $O(1)$ query latency.
+\end{itemize}
+
+Together, these systems demonstrate that GPU-native actors are a universal paradigm
+applicable across languages, frameworks, and domains.
+
+\subsection{Contributions}
+
+This paper makes the following contributions:
 
 \begin{enumerate}
     \item \textbf{Formalization of GPU Actor Semantics} (\S\ref{sec:design}): We extend
@@ -73,9 +96,13 @@ \subsection{Our Contribution: GPU-Native Actors}
     HLC~\cite{kulkarni2014hlc} for causal ordering of messages across GPU actors,
     enabling distributed systems semantics on massively parallel hardware.
 
-    \item \textbf{Rust-to-CUDA Transpilation} (\S\ref{sec:implementation}): We provide
-    a DSL and transpiler that generates persistent kernel CUDA code from high-level
-    Rust actor definitions, including automatic message envelope handling.
+    \item \textbf{Cross-Language Implementations} (\S\ref{sec:implementation}): We provide
+    implementations in Rust and .NET, with transpilers generating CUDA, WGSL, and MSL,
+    demonstrating the paradigm's language independence.
+
+    \item \textbf{Domain-Specific Applications}: We apply GPU-native actors to FDTD
+    simulation (RingKernel), enterprise accounting (DotCompute), distributed virtual
+    actors (Orleans.GpuBridge), and living graph analytics (RustGraph).
 
     \item \textbf{Comprehensive Evaluation} (\S\ref{sec:evaluation}): We demonstrate
     11,327$\times$ lower command latency and 2.7$\times$ higher mixed-workload
@@ -86,7 +113,8 @@ \subsection{Paper Organization}
 
 The remainder of this paper is organized as follows. Section~\ref{sec:background}
 provides background on the actor model and GPU programming. Section~\ref{sec:related}
-discusses related work. Section~\ref{sec:design} presents the RingKernel system design.
-Section~\ref{sec:implementation} details the implementation. Section~\ref{sec:evaluation}
+discusses related work including our companion implementations. Section~\ref{sec:design}
+presents the GPU-native actor system design. Section~\ref{sec:implementation} details
+the RingKernel implementation and cross-language ecosystem. Section~\ref{sec:evaluation}
 evaluates performance. Section~\ref{sec:discussion} discusses limitations and future
 work. Section~\ref{sec:conclusion} concludes.