Skip to content

Commit f9f5f78

Browse files
committed
Gemm update
1 parent 277a1f7 commit f9f5f78

4 files changed

Lines changed: 86 additions & 66 deletions

File tree

README.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,17 @@ pip install ".[torch]"
7878

7979
On macOS or other PEP 668-enforced environments, activate a virtualenv before running `pip install ".[torch]"` (or use `python3 -m pip install --user ".[torch]" --break-system-packages` if you understand the risks) so pip can install the extra dependencies without hitting the “externally managed environment” error.
8080

81+
### 2a. CLI-friendly Pipx install
82+
83+
If you prefer shell-level access to `t81-convert`, `t81-gguf`, `t81-qat`, and `t81-dequant`, pipx can install the repo and then inject the torch extras:
84+
85+
```bash
86+
pipx install --python python3 /path/to/your/t81lib/checkout
87+
pipx inject t81lib torch transformers accelerate datasets safetensors
88+
```
89+
90+
Pipx doesn’t understand `.[torch]` when pointing at a local directory, so we first install the package from source and then inject the optional dependencies you need (torch, transformers, accelerate, datasets, safetensors). Once that completes, the CLI helpers will run from `~/.local/bin` with the same requirements as `pip install ".[torch]"`. If you later upgrade the repo checkout, run `pipx uninstall t81lib`, then repeat the install and inject steps.
91+
8192
### 3. Consume as a subproject
8293

8394
```cmake

include/t81/linalg/gemm.hpp

Lines changed: 0 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -63,71 +63,6 @@ namespace t81::linalg {
6363
return low_value + high_value * radix;
6464
}
6565

66-
inline void gemm_ternary_cpu_impl(std::span<const core::limb> A,
67-
std::span<const core::limb> B,
68-
std::span<float> C,
69-
int M,
70-
int N,
71-
int K,
72-
int K_limbs,
73-
float alpha,
74-
float beta) {
75-
if (M == 0 || N == 0) {
76-
return;
77-
}
78-
79-
constexpr int BlockM = 8;
80-
constexpr int BlockN = 8;
81-
constexpr int BlockK = 4;
82-
const std::size_t N_size = static_cast<std::size_t>(N);
83-
const auto *const a_data = A.data();
84-
const auto *const b_data = B.data();
85-
auto *const c_data = C.data();
86-
87-
for (int ib = 0; ib < M; ib += BlockM) {
88-
const int i_end = std::min(M, ib + BlockM);
89-
for (int jb = 0; jb < N; jb += BlockN) {
90-
const int j_end = std::min(N, jb + BlockN);
91-
std::array<std::array<double, BlockN>, BlockM> accum{};
92-
for (int i = ib; i < i_end; ++i) {
93-
const std::size_t row = static_cast<std::size_t>(i) * N_size;
94-
for (int j = jb; j < j_end; ++j) {
95-
const float existing = c_data[row + static_cast<std::size_t>(j)];
96-
accum[i - ib][j - jb] = static_cast<double>(existing) * beta;
97-
}
98-
}
99-
100-
for (int kb = 0; kb < K_limbs; kb += BlockK) {
101-
const int k_end = std::min(K_limbs, kb + BlockK);
102-
for (int k = kb; k < k_end; ++k) {
103-
const std::size_t b_row = static_cast<std::size_t>(k) * N_size;
104-
for (int j = jb; j < j_end; ++j) {
105-
const core::limb b_value = b_data[b_row + static_cast<std::size_t>(j)];
106-
detail::prefetch_read(b_data + b_row + static_cast<std::size_t>(j) + 1);
107-
for (int i = ib; i < i_end; ++i) {
108-
const std::size_t a_index = static_cast<std::size_t>(i) *
109-
static_cast<std::size_t>(K_limbs) +
110-
static_cast<std::size_t>(k);
111-
const core::limb a_value = a_data[a_index];
112-
const double product = detail::multiply_to_double(a_value, b_value);
113-
accum[i - ib][j - jb] += product * static_cast<double>(alpha);
114-
detail::prefetch_read(a_data + a_index + 1);
115-
}
116-
}
117-
}
118-
}
119-
120-
for (int i = ib; i < i_end; ++i) {
121-
const std::size_t row = static_cast<std::size_t>(i) * N_size;
122-
for (int j = jb; j < j_end; ++j) {
123-
c_data[row + static_cast<std::size_t>(j)] =
124-
static_cast<float>(accum[i - ib][j - jb]);
125-
}
126-
}
127-
}
128-
}
129-
}
130-
13166
} // namespace detail
13267

13368
inline void gemm_ternary(std::span<const core::limb> A,

python/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,8 @@ pybind11_add_module(t81lib_python MODULE bindings.cpp)
2323

2424
target_sources(t81lib_python PRIVATE
2525
${CMAKE_CURRENT_SOURCE_DIR}/../src/t81/core/gguf_quants.cpp
26-
${CMAKE_CURRENT_SOURCE_DIR}/../src/linalg/gemm_dispatch.cpp)
26+
${CMAKE_CURRENT_SOURCE_DIR}/../src/linalg/gemm_dispatch.cpp
27+
${CMAKE_CURRENT_SOURCE_DIR}/../src/linalg/gemm_cpu.cpp)
2728
target_compile_features(t81lib_python PRIVATE cxx_std_20)
2829

2930
#target_link_libraries(t81lib_python PRIVATE t81lib)

src/linalg/gemm_cpu.cpp

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
#include <algorithm>
2+
#include <array>
3+
4+
#include "t81/linalg/gemm.hpp"
5+
6+
namespace t81::linalg::detail {
7+
8+
void gemm_ternary_cpu_impl(std::span<const core::limb> A,
9+
std::span<const core::limb> B,
10+
std::span<float> C,
11+
int M,
12+
int N,
13+
int K,
14+
int K_limbs,
15+
float alpha,
16+
float beta) {
17+
if (M == 0 || N == 0) {
18+
return;
19+
}
20+
21+
constexpr int BlockM = 8;
22+
constexpr int BlockN = 8;
23+
constexpr int BlockK = 4;
24+
const std::size_t N_size = static_cast<std::size_t>(N);
25+
const auto *const a_data = A.data();
26+
const auto *const b_data = B.data();
27+
auto *const c_data = C.data();
28+
29+
for (int ib = 0; ib < M; ib += BlockM) {
30+
const int i_end = std::min(M, ib + BlockM);
31+
for (int jb = 0; jb < N; jb += BlockN) {
32+
const int j_end = std::min(N, jb + BlockN);
33+
std::array<std::array<double, BlockN>, BlockM> accum{};
34+
for (int i = ib; i < i_end; ++i) {
35+
const std::size_t row = static_cast<std::size_t>(i) * N_size;
36+
for (int j = jb; j < j_end; ++j) {
37+
const float existing = c_data[row + static_cast<std::size_t>(j)];
38+
accum[i - ib][j - jb] = static_cast<double>(existing) * beta;
39+
}
40+
}
41+
42+
for (int kb = 0; kb < K_limbs; kb += BlockK) {
43+
const int k_end = std::min(K_limbs, kb + BlockK);
44+
for (int k = kb; k < k_end; ++k) {
45+
const std::size_t b_row = static_cast<std::size_t>(k) * N_size;
46+
for (int j = jb; j < j_end; ++j) {
47+
const core::limb b_value = b_data[b_row + static_cast<std::size_t>(j)];
48+
detail::prefetch_read(b_data + b_row + static_cast<std::size_t>(j) + 1);
49+
for (int i = ib; i < i_end; ++i) {
50+
const std::size_t a_index = static_cast<std::size_t>(i) *
51+
static_cast<std::size_t>(K_limbs) +
52+
static_cast<std::size_t>(k);
53+
const core::limb a_value = a_data[a_index];
54+
const double product = detail::multiply_to_double(a_value, b_value);
55+
accum[i - ib][j - jb] += product * static_cast<double>(alpha);
56+
detail::prefetch_read(a_data + a_index + 1);
57+
}
58+
}
59+
}
60+
}
61+
62+
for (int i = ib; i < i_end; ++i) {
63+
const std::size_t row = static_cast<std::size_t>(i) * N_size;
64+
for (int j = jb; j < j_end; ++j) {
65+
c_data[row + static_cast<std::size_t>(j)] =
66+
static_cast<float>(accum[i - ib][j - jb]);
67+
}
68+
}
69+
}
70+
}
71+
}
72+
73+
} // namespace t81::linalg::detail

0 commit comments

Comments
 (0)