diff --git a/CMakeLists.txt b/CMakeLists.txt index 150ecde1c..b16a33e07 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -204,6 +204,12 @@ if(SD_WEBM) endif() endif() +if (SD_RPC) + message("-- Use RPC as backend stable-diffusion") + set(GGML_RPC ON) + add_definitions(-DSD_USE_RPC) +endif () + set(SD_LIB stable-diffusion) file(GLOB SD_LIB_SOURCES CONFIGURE_DEPENDS diff --git a/docs/rpc.md b/docs/rpc.md new file mode 100644 index 000000000..617a8b5a1 --- /dev/null +++ b/docs/rpc.md @@ -0,0 +1,220 @@ +# Building and Using the RPC Server with `stable-diffusion.cpp` + +This guide covers how to build a version of [the RPC server from `llama.cpp`](https://github.com/ggml-org/llama.cpp/blob/master/tools/rpc/README.md) that is compatible with your version of `stable-diffusion.cpp` to manage multi-backends setups. RPC allows you to offload specific model components to a remote server. + +> **Note on Model Location:** The model files (e.g., `.safetensors` or `.gguf`) remain on the **Client** machine. The client parses the file and transmits the necessary tensor data and computational graphs to the server. The server does not need to store the model files locally. + +## 1. Building `stable-diffusion.cpp` with RPC client + +First, you should build the client application from source. It requires `SD_RPC=ON` to include the RPC backend to your client. + +```bash +mkdir build +cd build +cmake .. \ + -DSD_RPC=ON \ + # Add other build flags here (e.g., -DSD_VULKAN=ON) +cmake --build . --config Release -j $(nproc) +``` + +> **Note:** Ensure you add the other flags you would normally use (e.g., `-DSD_VULKAN=ON`, `-DSD_CUDA=ON`, `-DSD_HIPBLAS=ON`, or `-DGGML_METAL=ON`), for more information about building `stable-diffusion.cpp` from source, please refer to the [build.md](build.md) documentation. + +## 2. 
Ensure `llama.cpp` is at the correct commit + +`stable-diffusion.cpp`'s RPC client is designed to work with a specific version of `llama.cpp` (compatible with the `ggml` submodule) to ensure API compatibility. The commit hash for `llama.cpp` is stored in `ggml/scripts/sync-llama.last`. + +> **Start from Root:** Perform these steps from the root of your `stable-diffusion.cpp` directory. + +1. Read the target commit hash from the submodule tracker: + + ```bash + # Linux / WSL / macOS + HASH=$(cat ggml/scripts/sync-llama.last) + + # Windows (PowerShell) + $HASH = Get-Content -Path "ggml\scripts\sync-llama.last" + ``` + +2. Clone `llama.cpp` at the target commit: + ```bash + git clone https://github.com/ggml-org/llama.cpp.git + cd llama.cpp + git checkout $HASH + ``` + To save on download time and storage, you can use a shallow clone to download only the target commit: + ```bash + mkdir -p llama.cpp + cd llama.cpp + git init + git remote add origin https://github.com/ggml-org/llama.cpp.git + git fetch --depth 1 origin $HASH + git checkout FETCH_HEAD + ``` + +## 3. Build `llama.cpp` (RPC Server) + +The RPC server acts as the worker. You must explicitly enable the **backend** (the hardware interface, such as CUDA for Nvidia, Metal for Apple Silicon, or Vulkan) when building, otherwise the server will default to using only the CPU. + +To find the correct flags for your system, refer to the official documentation for the [`llama.cpp`](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md) repository. + +> **Crucial:** You must include the compiler flags required to satisfy the API compatibility with `stable-diffusion.cpp` (`-DGGML_MAX_NAME=128`). Without this flag, `GGML_MAX_NAME` will default to `64` for the server, and data transfers between the client and server will fail. Of course, `-DGGML_RPC=ON` must also be enabled. 
+> +> I recommend disabling the `LLAMA_CURL` flag to avoid unnecessary dependencies, and disabling shared library builds to avoid potential conflicts. + +> **Build Target:** We are specifically building the `rpc-server` target. This prevents the build system from compiling the entire `llama.cpp` suite (like `llama-server`), making the build significantly faster. + +### Linux / WSL (Vulkan) + +```bash +mkdir build +cd build +cmake .. -DGGML_RPC=ON \ + -DGGML_VULKAN=ON \ + -DGGML_BUILD_SHARED_LIBS=OFF \ + -DLLAMA_CURL=OFF \ + -DCMAKE_C_FLAGS=-DGGML_MAX_NAME=128 \ + -DCMAKE_CXX_FLAGS=-DGGML_MAX_NAME=128 # -DGGML_VULKAN=ON above ensures the backend is enabled +cmake --build . --config Release --target rpc-server -j $(nproc) +``` + +### macOS (Metal) + +```bash +mkdir build +cd build +cmake .. -DGGML_RPC=ON \ + -DGGML_METAL=ON \ + -DGGML_BUILD_SHARED_LIBS=OFF \ + -DLLAMA_CURL=OFF \ + -DCMAKE_C_FLAGS=-DGGML_MAX_NAME=128 \ + -DCMAKE_CXX_FLAGS=-DGGML_MAX_NAME=128 +cmake --build . --config Release --target rpc-server +``` + +### Windows (Visual Studio 2022, Vulkan) + +```powershell +mkdir build +cd build +cmake .. -G "Visual Studio 17 2022" -A x64 ` + -DGGML_RPC=ON ` + -DGGML_VULKAN=ON ` + -DGGML_BUILD_SHARED_LIBS=OFF ` + -DLLAMA_CURL=OFF ` + -DCMAKE_C_FLAGS=-DGGML_MAX_NAME=128 ` + -DCMAKE_CXX_FLAGS=-DGGML_MAX_NAME=128 +cmake --build . --config Release --target rpc-server +``` + +## 4. Usage + +Once both applications are built, you can run the server and the client to manage your GPU allocation. + +### Step A: Run the RPC Server + +Start the server. It listens for connections on the default address (usually `localhost:50052`). If your server is on a different machine, ensure the server binds to the correct interface and your firewall allows the connection. 
+ +**On the Server:** +If running on the same machine, you can use the default address: + +```bash +./rpc-server +``` + +If you want to allow connections from other machines on the network: + +```bash +./rpc-server --host 0.0.0.0 +``` + +> **Security Warning:** The RPC server does not currently support authentication or encryption. **Only run the server on trusted local networks**. Never expose the RPC server directly to the open internet. + +> **Drivers & Hardware:** Ensure the Server machine has the necessary drivers installed and functional (e.g., Nvidia Drivers for CUDA, Vulkan SDK, or Metal). If no devices are found, the server will simply fall back to CPU usage. + + + +### Step B: Run with RPC device + +If everything is working correctly, you can now run the client while offloading some or all of the work to the RPC server. + +Example: Setting the main backend to the RPC0 device for doing all the work on the server. + +```bash +./sd-cli -m models/sd1.5.safetensors -p "A cat" --rpc-servers localhost:50052 --backend RPC0 +``` + +--- + +## 5. Scaling: Multiple RPC Servers + +You can connect the client to multiple RPC servers simultaneously to scale out your hardware usage. + +Example: A main machine (192.168.1.10) with 3 GPUs, with one GPU running CUDA and the other two running Vulkan, and a second machine (192.168.1.11) with only one GPU. 
+ +**On the first machine (Running two server instances):** + +**Terminal 1 (CUDA):** + +```bash +# Linux / WSL +export CUDA_VISIBLE_DEVICES=0 +cd ./build_cuda/bin/Release +./rpc-server --host 0.0.0.0 + +# Windows PowerShell +$env:CUDA_VISIBLE_DEVICES="0" +cd .\build_cuda\bin\Release +./rpc-server --host 0.0.0.0 +``` + +**Terminal 2 (Vulkan):** + +```bash +cd ./build_vulkan/bin/Release +# ignore the first GPU (used by CUDA server) +./rpc-server --host 0.0.0.0 --port 50053 -d Vulkan1,Vulkan2 +``` + +**On the second machine:** + +```bash +cd ./build/bin/Release +./rpc-server --host 0.0.0.0 +``` + +**On the Client:** +Pass multiple server addresses separated by commas. + +```bash +./sd-cli --rpc-servers 192.168.1.10:50052,192.168.1.10:50053,192.168.1.11:50052 [...] +``` + +The client will map these servers to sequential device IDs (e.g., RPC0 from the first server, RPC1 and RPC2 from the second, and RPC3 from the third). With this setup, you could for example use RPC0 for the main backend, RPC1 and RPC2 for the text encoders, and RPC3 for the VAE. + +--- + +## 6. Performance Considerations + +RPC performance is heavily dependent on network bandwidth, as large weights and activations must be transferred back and forth over the network, especially for large models, or when using high resolutions. For best results, ensure your network connection is stable and has sufficient bandwidth (>1Gbps recommended). This should not be a concern if you are running the server and client on the same machine, as the data transfer will happen over the loopback interface. \ No newline at end of file diff --git a/examples/common/common.cpp b/examples/common/common.cpp index f0742f62f..03a35c9a7 100644 --- a/examples/common/common.cpp +++ b/examples/common/common.cpp @@ -427,6 +427,10 @@ ArgOptions SDContextParams::get_options() { "--params-backend", "parameter backend assignment, e.g. 
disk, cpu, or diffusion=disk,clip=cpu", &params_backend}, + {"", + "--rpc-servers", + "comma-separated list of RPC servers to connect to for offloading, in the format host:port, e.g. localhost:50052,192.168.1.3:50052", + &rpc_servers}, }; options.int_options = { @@ -836,6 +840,7 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool taesd_preview) { sd_ctx_params.stream_layers = stream_layers; sd_ctx_params.backend = effective_backend.c_str(); sd_ctx_params.params_backend = effective_params_backend.c_str(); + sd_ctx_params.rpc_servers = rpc_servers.c_str(); return sd_ctx_params; } diff --git a/examples/common/common.h b/examples/common/common.h index 2ae54c2c7..86fcc1627 100644 --- a/examples/common/common.h +++ b/examples/common/common.h @@ -148,6 +148,7 @@ struct SDContextParams { bool stream_layers = false; std::string backend; std::string params_backend; + std::string rpc_servers; std::string effective_backend; std::string effective_params_backend; bool enable_mmap = false; diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h index ffefdaadf..674c9d63a 100644 --- a/include/stable-diffusion.h +++ b/include/stable-diffusion.h @@ -220,6 +220,7 @@ typedef struct { bool stream_layers; // Enable residency+prefetch streaming on top of --max-vram (no effect without --max-vram) const char* backend; const char* params_backend; + const char* rpc_servers; } sd_ctx_params_t; typedef struct { diff --git a/src/core/ggml_extend_backend.cpp b/src/core/ggml_extend_backend.cpp index 834a047e7..d8062fef2 100644 --- a/src/core/ggml_extend_backend.cpp +++ b/src/core/ggml_extend_backend.cpp @@ -204,6 +204,52 @@ void ggml_ext_im_set_f32_1d(const struct ggml_tensor* tensor, int i, float value } } +// Register the devices exposed by the RPC servers listed in `servers`, a +// comma-separated list of host:port endpoints. An empty or blank list is a no-op +// that returns true. Returns false if the list is malformed, the RPC backend or +// its ggml_backend_rpc_add_server entry point is unavailable, or any server +// could not be added; the remaining servers are still attempted and registered. +bool add_rpc_devices(const std::string& servers) { + const std::string in = trim_copy(servers); + if (in.empty()) { + return true; + } + auto rpc_servers = split_copy(in, ','); + if (rpc_servers.empty()) { + LOG_ERROR("invalid RPC servers specification: '%s'", 
servers.c_str()); + return false; + } + ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC"); + if (!rpc_reg) { + LOG_ERROR("RPC backend not found, cannot add RPC servers"); + return false; + } + // Resolve the add-server entry point by name through the RPC registry. + typedef ggml_backend_reg_t (*ggml_backend_rpc_add_server_t)(const char* endpoint); + ggml_backend_rpc_add_server_t ggml_backend_rpc_add_server_fn = (ggml_backend_rpc_add_server_t)ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_server"); + if (!ggml_backend_rpc_add_server_fn) { + LOG_ERROR("RPC backend does not have ggml_backend_rpc_add_server function, cannot add RPC servers"); + return false; + } + bool all_added = true; + for (const auto& entry : rpc_servers) { + const std::string server = trim_copy(entry); + if (server.empty()) { + continue; // tolerate blank entries from stray or trailing commas + } + LOG_INFO("Adding RPC server: %s", server.c_str()); + auto reg = ggml_backend_rpc_add_server_fn(server.c_str()); + if (!reg) { + // NOTE(review): a null registry is treated as a failed connection; confirm against the ggml RPC backend. + LOG_ERROR("failed to add RPC server: %s", server.c_str()); + all_added = false; + continue; + } + ggml_backend_register(reg); + } + return all_added; +} + static void ggml_backend_load_all_once() { // If the registry already has devices and the CPU backend is present, // assume either static registration or explicit host-side preloading has diff --git a/src/core/ggml_extend_backend.h b/src/core/ggml_extend_backend.h index 58d41ac44..4db543257 100644 --- a/src/core/ggml_extend_backend.h +++ b/src/core/ggml_extend_backend.h @@ -73,4 +73,5 @@ ggml_backend_t sd_backend_cpu_init(); bool sd_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads); const char* sd_backend_module_name(SDBackendModule module); void ggml_ext_im_set_f32_1d(const struct ggml_tensor* tensor, int i, float value); +bool add_rpc_devices(const std::string& servers); #endif // __SD_CORE_GGML_EXTEND_BACKEND_H__ diff --git a/src/model_loader.cpp b/src/model_loader.cpp index 8d37d39a2..587b01f4d 100644 --- a/src/model_loader.cpp +++ b/src/model_loader.cpp @@ -1002,6 +1002,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, std::atomic tensor_idx(0); std::atomic failed(false); std::vector 
workers; + std::mutex rpc_backend_mutex; for (int i = 0; i < n_threads; ++i) { workers.emplace_back([&, file_path, is_zip]() { @@ -1158,7 +1159,19 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, if (dst_tensor->buffer != nullptr && !ggml_backend_buffer_is_host(dst_tensor->buffer)) { t0 = ggml_time_ms(); - ggml_backend_tensor_set(dst_tensor, convert_buf, 0, ggml_nbytes(dst_tensor)); + + // RPC backends require serialized access to prevent concurrency issues + const char* buffer_type_name = ggml_backend_buft_name(ggml_backend_buffer_get_type(dst_tensor->buffer)); + bool is_rpc_buffer = buffer_type_name != nullptr && + std::string(buffer_type_name).find("RPC") != std::string::npos; + + if (is_rpc_buffer) { + std::lock_guard lock(rpc_backend_mutex); + ggml_backend_tensor_set(dst_tensor, convert_buf, 0, ggml_nbytes(dst_tensor)); + } else { + ggml_backend_tensor_set(dst_tensor, convert_buf, 0, ggml_nbytes(dst_tensor)); + } + t1 = ggml_time_ms(); copy_to_backend_time_ms.fetch_add(t1 - t0); } diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index a5fb0e54d..c74d73634 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -318,6 +318,10 @@ class StableDiffusionGGML { stream_layers = sd_ctx_params->stream_layers; backend_spec = SAFE_STR(sd_ctx_params->backend); params_backend_spec = SAFE_STR(sd_ctx_params->params_backend); + + std::string rpc_servers_spec = SAFE_STR(sd_ctx_params->rpc_servers); + add_rpc_devices(rpc_servers_spec); + if (stream_layers && max_vram == 0.f) { LOG_WARN("--stream-layers has no effect without --max-vram set; ignoring"); stream_layers = false;