{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/ratulb/mojo_programming/blob/main/gpu_puzzles/broadcast_add_layout.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
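    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "GPU puzzle: broadcast addition with `LayoutTensor`. A 1×3 row vector `a` and a 3×1 column vector `b` are combined into a 3×3 matrix with `out[i, j] = a[i] + b[j]`, using one GPU thread per output element. The host fills the inputs, computes the same result on the CPU, and asserts that the two agree."
      ]
    },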
    {
      "cell_type": "code",
      "source": [
16+ " !curl -ssL https://magic.modular.com/ | bash"
      ],
      "metadata": {
        "id": "oghVhc-plDnV"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
27+ " import os\n " ,
28+ " os.environ['PATH'] += ':/root/.modular/bin'"
      ],
      "metadata": {
        "id": "bo0LqVellMRb"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
39+ " !magic init gpu_puzzles --format mojoproject"
      ],
      "metadata": {
        "id": "orDFbNYOlmVj"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
50+ " %cd gpu_puzzles/"
      ],
      "metadata": {
        "id": "I_II8JVmluuj"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "%%writefile broadcast_add_layout.mojo\n",
        "\n",
63+ " ### Broadcast Addiotion\n " ,
64+ " ### Add 2 vectors\n " ,
65+ " \n " ,
66+ " from gpu import thread_idx\n " ,
67+ " from gpu.host import DeviceContext\n " ,
68+ " from layout import Layout, LayoutTensor\n " ,
69+ " from testing import assert_equal\n " ,
70+ " \n " ,
71+ " \n " ,
72+ " alias SIZE = 3\n " ,
73+ " alias dtype = DType.float32\n " ,
74+ " alias BLOCKS_PER_GRID = 1\n " ,
75+ " alias THREADS_PER_BLOCK = (3, 3)\n " ,
76+ " \n " ,
77+ " alias layout_out = Layout.row_major(SIZE, SIZE)\n " ,
78+ " alias layout_a = Layout.row_major(1, SIZE)\n " ,
79+ " alias layout_b = Layout.row_major(SIZE, 1)\n " ,
80+ " \n " ,
81+ " \n " ,
82+ " \n " ,
83+ " fn broadcast_add_layout[layout_out: Layout, layout_a: Layout, layout_b: Layout](\n " ,
84+ " out: LayoutTensor[mut=True, dtype, layout_out],\n " ,
85+ " a: LayoutTensor[mut=True, dtype, layout_a],\n " ,
86+ " b: LayoutTensor[mut=True, dtype, layout_b],\n " ,
87+ " ):\n " ,
88+ " row = thread_idx.y\n " ,
89+ " col = thread_idx.x\n " ,
90+ " if row < SIZE and col < SIZE:\n " ,
91+ " out[row, col] = a[0, row] + b[col, 0]\n " ,
92+ " \n " ,
93+ " \n " ,
94+ " fn main() raises:\n " ,
95+ " with DeviceContext() as ctx:\n " ,
96+ " out_buffer = ctx.enqueue_create_buffer[dtype](SIZE * SIZE).enqueue_fill(0)\n " ,
97+ " expected_buffer = ctx.enqueue_create_host_buffer[dtype](\n " ,
98+ " SIZE * SIZE\n " ,
99+ " ).enqueue_fill(0)\n " ,
100+ " a_buffer = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(0)\n " ,
101+ " b_buffer = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(0)\n " ,
102+ " \n " ,
103+ " with a_buffer.map_to_host() as a_buffer_host, b_buffer.map_to_host() as b_buffer_host:\n " ,
104+ " for i in range(SIZE):\n " ,
105+ " a_buffer_host[i] = i\n " ,
106+ " b_buffer_host[i] = i\n " ,
107+ " print(a_buffer)\n " ,
108+ " print(b_buffer)\n " ,
109+ " for i in range(SIZE):\n " ,
110+ " for j in range(SIZE):\n " ,
111+ " expected_buffer[i * SIZE + j] = a_buffer_host[i] + b_buffer_host[j]\n " ,
112+ " print(expected_buffer)\n " ,
113+ " \n " ,
114+ " out = LayoutTensor[mut=True, dtype, layout_out](out_buffer.unsafe_ptr())\n " ,
115+ " a = LayoutTensor[mut=True, dtype, layout_a](a_buffer.unsafe_ptr())\n " ,
116+ " b = LayoutTensor[mut=True, dtype, layout_b](b_buffer.unsafe_ptr())\n " ,
117+ " expected = LayoutTensor[mut=True, dtype, layout_out](expected_buffer.unsafe_ptr())\n " ,
118+ " \n " ,
119+ " ctx.enqueue_function[broadcast_add_layout[layout_out, layout_a, layout_b]](\n " ,
120+ " out,\n " ,
121+ " a,\n " ,
122+ " b,\n " ,
123+ " SIZE,\n " ,
124+ " grid_dim=BLOCKS_PER_GRID,\n " ,
125+ " block_dim=THREADS_PER_BLOCK,\n " ,
126+ " )\n " ,
127+ " ctx.synchronize()\n " ,
128+ " \n " ,
129+ " with out_buffer.map_to_host() as out_buffer_host:\n " ,
130+ " print(out_buffer_host)\n " ,
131+ " for i in range(SIZE):\n " ,
132+ " for j in range(SIZE):\n " ,
133+ " assert_equal(out_buffer_host[i * SIZE + j], expected_buffer[i * SIZE + j])\n "
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "r8TtOuGcmo7L",
        "outputId": "80443e36-bc03-42ef-a22a-4d7472efc586"
      },
      "execution_count": 14,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Overwriting broadcast_add_layout.mojo\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
156+ " !magic run mojo broadcast_add_layout.mojo"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "2heIJSH7lxPj",
        "outputId": "a963ec38-d02b-4869-c069-f120c4b370ad"
      },
      "execution_count": 15,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
171+ " \u001b [32m⠁\u001b [0m \r \u001b [2K\u001b [32m⠁\u001b [0m activating environment \r \u001b [2K\u001b [32m⠁\u001b [0m activating environment \r \u001b [2KDeviceBuffer([0.0, 0.0, 0.0])\n " ,
172+ " DeviceBuffer([0.0, 0.0, 0.0])\n " ,
173+ " HostBuffer([0.0, 1.0, 2.0, 1.0, 2.0, 3.0, 2.0, 3.0, 4.0])\n " ,
174+ " HostBuffer([0.0, 1.0, 2.0, 1.0, 2.0, 3.0, 2.0, 3.0, 4.0])\n "
175+ ]
176+ }
177+ ]
178+ },
179+ {
180+ "cell_type" : " code" ,
181+ "source" : [
182+ " !magic run mojo format broadcast_add_layout.mojo"
      ],
      "metadata": {
        "id": "2KeEPNK2GYKV"
      },
      "execution_count": null,
      "outputs": []
    }
  ],
  "metadata": {
    "colab": {
      "name": "Welcome To Colab",
      "provenance": [],
      "gpuType": "T4",
      "include_colab_link": true
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "accelerator": "GPU"
  },
  "nbformat": 4,
  "nbformat_minor": 0
}