diff --git a/.asf.yaml b/.asf.yaml index b2f56d8d29..e5a156cbb0 100644 --- a/.asf.yaml +++ b/.asf.yaml @@ -21,3 +21,16 @@ github: issues: true projects: false discussions: true + rulesets: + - name: "Default Branch Protection" + type: branch + branches: + includes: + - "~DEFAULT_BRANCH" + - "release/*" + - "rel/*" + excludes: [] + bypass_teams: + - root + restrict_deletion: true + restrict_force_push: true diff --git a/.github/workflows/ci-linux.yml b/.github/workflows/ci-linux.yml index f2d6d69287..ee1dfaea0a 100644 --- a/.github/workflows/ci-linux.yml +++ b/.github/workflows/ci-linux.yml @@ -70,7 +70,7 @@ jobs: run: | export CC=clang && export CXX=clang++ mkdir clang_build_all && cd clang_build_all - cmake -DWITH_MESALINK=OFF -DWITH_GLOG=ON -DWITH_THRIFT=ON -DWITH_RDMA=ON -DWITH_DEBUG_BTHREAD_SCHE_SAFETY=ON -DWITH_DEBUG_LOCK=ON -DWITH_BTHREAD_TRACER=ON -DWITH_ASAN=ON -DCMAKE_POLICY_VERSION_MINIMUM=3.5 .. + cmake -DWITH_MESALINK=OFF -DWITH_GLOG=ON -DWITH_THRIFT=ON -DWITH_RDMA=ON -DWITH_UBRING=ON -DWITH_DEBUG_BTHREAD_SCHE_SAFETY=ON -DWITH_DEBUG_LOCK=ON -DWITH_BTHREAD_TRACER=ON -DWITH_ASAN=ON -DCMAKE_POLICY_VERSION_MINIMUM=3.5 .. make -j ${{env.proc_num}} && make clean gcc-compile-with-make-protobuf: diff --git a/.github/workflows/ci-macos.yml b/.github/workflows/ci-macos.yml index 61d45ac821..1f64b18997 100644 --- a/.github/workflows/ci-macos.yml +++ b/.github/workflows/ci-macos.yml @@ -34,7 +34,7 @@ jobs: - name: compile with cmake run: | echo "CMAKE_PREFIX_PATH=$(brew --prefix protobuf@21)" - mkdir build && cd build && cmake -DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DCMAKE_PREFIX_PATH=$(brew --prefix protobuf@21) .. + mkdir build && cd build && cmake -DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DWITH_UBRING=ON -DCMAKE_PREFIX_PATH=$(brew --prefix protobuf@21) .. 
make -j ${{env.proc_num}} && make clean compile-with-make-cmake-protobuf29: @@ -56,7 +56,7 @@ jobs: - name: compile with cmake run: | - mkdir build && cd build && cmake -DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DCMAKE_PREFIX_PATH=$(brew --prefix protobuf@29) .. + mkdir build && cd build && cmake -DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DWITH_UBRING=ON -DCMAKE_PREFIX_PATH=$(brew --prefix protobuf@29) .. make -j ${{env.proc_num}} && make clean compile-with-bazel: diff --git a/BUILD.bazel b/BUILD.bazel index 138e416b10..c763cb3a2f 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -475,7 +475,7 @@ cc_library( deps = [ ":brpc_idl_options_cc_proto", ":butil", - "@com_google_protobuf//src/google/protobuf/compiler:code_generator", + "@com_google_protobuf//:protoc_lib", ], ) diff --git a/CMakeLists.txt b/CMakeLists.txt index 77703a4661..1adf647e72 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -27,6 +27,7 @@ option(WITH_THRIFT "With thrift framed protocol supported" OFF) option(WITH_BTHREAD_TRACER "With bthread tracer supported" OFF) option(WITH_SNAPPY "With snappy" OFF) option(WITH_RDMA "With RDMA" OFF) +option(WITH_UBRING "With UB" OFF) option(WITH_DEBUG_BTHREAD_SCHE_SAFETY "With debugging bthread sche safety" OFF) option(WITH_DEBUG_LOCK "With debugging lock" OFF) option(WITH_ASAN "With AddressSanitizer" OFF) @@ -104,6 +105,11 @@ if(WITH_RDMA) set(WITH_RDMA_VAL "1") endif() +set(WITH_UBRING_VAL "0") +if(WITH_UBRING) + set(WITH_UBRING_VAL "1") +endif() + set(WITH_DEBUG_BTHREAD_SCHE_SAFETY_VAL "0") if(WITH_DEBUG_BTHREAD_SCHE_SAFETY) set(WITH_DEBUG_BTHREAD_SCHE_SAFETY_VAL "1") @@ -136,7 +142,7 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Darwin") set(CMAKE_CPP_FLAGS "${CMAKE_CPP_FLAGS} -Wno-deprecated-declarations -Wno-inconsistent-missing-override") endif() -set(CMAKE_CPP_FLAGS "${CMAKE_CPP_FLAGS} ${DEFINE_CLOCK_GETTIME} -DBRPC_WITH_GLOG=${WITH_GLOG_VAL} -DBRPC_WITH_RDMA=${WITH_RDMA_VAL} -DBRPC_DEBUG_BTHREAD_SCHE_SAFETY=${WITH_DEBUG_BTHREAD_SCHE_SAFETY_VAL} 
-DBRPC_DEBUG_LOCK=${WITH_DEBUG_LOCK_VAL}") +set(CMAKE_CPP_FLAGS "${CMAKE_CPP_FLAGS} ${DEFINE_CLOCK_GETTIME} -DBRPC_WITH_GLOG=${WITH_GLOG_VAL} -DBRPC_WITH_RDMA=${WITH_RDMA_VAL} -DBRPC_WITH_UBRING=${WITH_UBRING_VAL} -DBRPC_DEBUG_BTHREAD_SCHE_SAFETY=${WITH_DEBUG_BTHREAD_SCHE_SAFETY_VAL} -DBRPC_DEBUG_LOCK=${WITH_DEBUG_LOCK_VAL}") if (WITH_ASAN) set(CMAKE_CPP_FLAGS "${CMAKE_CPP_FLAGS} -fsanitize=address") set(CMAKE_C_FLAGS "${CMAKE_CPP_FLAGS} -fsanitize=address") @@ -322,6 +328,11 @@ if(WITH_RDMA) list(APPEND DYNAMIC_LIB ${RDMA_LIB}) endif() +if(WITH_UBRING) + message(STATUS "brpc compile with ubring") + list(APPEND DYNAMIC_LIB ${UB_LIB}) +endif() + set(BRPC_PRIVATE_LIBS "-lgflags -lprotobuf -lleveldb -lprotoc -lssl -lcrypto -ldl -lz") if(WITH_GLOG) diff --git a/README.md b/README.md index 1c4f78528b..d65366fafb 100644 --- a/README.md +++ b/README.md @@ -86,6 +86,7 @@ You can use it to: * [FlatMap](docs/en/flatmap.md) * [Coroutine](docs/en/coroutine.md) * [Circuit Breaker](docs/en/circuit_breaker.md) + * [UBRing](docs/en/ubring.md) * [RDMA](docs/en/rdma.md) * [Bazel Support](docs/en/bazel_support.md) * [Wireshark baidu_std dissector plugin](docs/en/wireshark_baidu_std.md) diff --git a/README_cn.md b/README_cn.md index 6413f83fde..2cc686bd85 100644 --- a/README_cn.md +++ b/README_cn.md @@ -87,6 +87,7 @@ * [FlatMap](docs/cn/flatmap.md) * [协程](docs/cn/coroutine.md) * [熔断](docs/cn/circuit_breaker.md) + * [UBRing](docs/cn/ubring.md) * [RDMA](docs/cn/rdma.md) * [Bazel构建支持](docs/cn/bazel_support.md) * [Wireshark baidu_std协议解析插件](docs/cn/wireshark_baidu_std.md) diff --git a/docs/cn/ubring.md b/docs/cn/ubring.md new file mode 100644 index 0000000000..6519ae3f9f --- /dev/null +++ b/docs/cn/ubring.md @@ -0,0 +1,199 @@ +# UBRing: 高性能共享内存 RPC + +UBRing 是 brpc 中的高性能 RPC 实现,它利用共享内存进行进程间通信(IPC)。它支持本地共享内存(POSIX IPC)和远端共享内存(ubs-mem)两种模式,提供微秒到纳秒级的进程间通信延迟。 + +## 技术背景 + +传统的 RPC 框架通常使用网络套接字进行通信,由于内核参与、上下文切换和数据拷贝等原因,会引入显著的开销。UBRing 通过使用共享内存作为通信介质来解决这个问题,允许进程之间直接内存访问,最小化内核干预。 + +UBRing 的主要优势: 
+ +- **超低延迟**:微秒级 RPC 延迟 +- **高吞吐量**:每秒数百万次 RPC 调用 +- **减少数据拷贝**:进程间直接内存访问 +- **跨平台支持**:支持 Linux 和 macOS + +## 支持的共享内存后端 + +UBRing 支持两种共享内存后端,通过 `ub_shm_type` 参数控制: + +### 1. POSIX IPC 共享内存 (ub\_shm\_type = 1) + +这是默认模式,使用标准 POSIX 共享内存进行本地 IPC。同一机器上的进程可以通过共享内存区域直接通信。 + +### 2. UBS-Mem 远端共享内存 (ub\_shm\_type = 2) + +此模式使用 ubs-mem(Unified Block Storage Memory),这是来自 openEuler 的开源远端共享内存框架。它支持机架内节点之间的共享内存通信,类似于 RDMA 但部署要求更简单。 + +**UBS-Mem 开源地址**: https://atomgit.com/openeuler/ubs-mem + +**所需库文件**: +- `libubsm_sdk.so` - UBS-Mem SDK 库(安装路径:`/usr/local/ubs_mem/lib/libubsm_sdk.so`) +- UBS-Mem 通过 `dlopen()` 动态加载该库,并使用 `ubsmem_initialize()`、`ubsmem_create_region()`、`ubsmem_shmem_allocate()`、`ubsmem_shmem_map()` 等函数 + +**UBS-Mem 关键函数**: +- `ubsmem_init_attributes()` - 初始化 UBS-Mem 属性 +- `ubsmem_initialize()` - 初始化 UBS-Mem 库 +- `ubsmem_finalize()` - 释放 UBS-Mem 库 +- `ubsmem_create_region()` - 创建共享内存区域 +- `ubsmem_shmem_allocate()` - 分配共享内存 +- `ubsmem_shmem_map()` - 将共享内存映射到本地地址空间 +- `ubsmem_shmem_unmap()` - 解除共享内存映射 +- `ubsmem_shmem_deallocate()` - 释放共享内存 +- `ubsmem_destroy_region()` - 销毁共享内存区域 + +### 未来扩展 + +该架构在设计上支持未来扩展基于 CXL(Compute Express Link)的远端共享内存,实现更灵活的分布式内存共享。 + +## 构建配置 + +### 使用 CMake 构建 + +要构建带有 UBRing 支持的 brpc,请使用以下命令: + +```bash +# 构建 brpc 并启用 UBRing 支持 +cd /path/to/brpc +cmake -B build -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DWITH_UBRING:BOOL=ON +cmake --build build -j 8 + +# 构建 ubring_performance 示例 +cd /path/to/brpc/example/ubring_performance +cmake -B build +cmake --build build -j 8 +``` + +### 使用 Bazel 构建 + +使用 Bazel 构建带有 UBRing 支持的 brpc: + +```bash +# 构建 brpc 并启用 UBRing 支持 +cd /path/to/brpc +bazel build //... --define=with_ubring=true + +# 构建 ubring_performance 示例 +bazel build //example/ubring_performance/... 
+``` + +### 选择共享内存后端 + +共享内存后端通过 `--ub_shm_type` 参数控制: + +```bash +# 使用 POSIX IPC(默认) +./your_program --ub_shm_type=1 + +# 使用 UBS-Mem +./your_program --ub_shm_type=2 +``` + +## 性能测试 + +### 示例: ubring\_performance + +brpc 在 `example/ubring_performance/` 目录提供了性能测试示例。 + +#### 构建示例 + +```bash +cd example/ubring_performance +mkdir -p build && cd build +cmake .. +make +``` + +#### 运行服务端 + +服务端默认监听 8002 端口(`--port`)。`--use_ubring` 默认为 false,需要显式开启才会使用 UBRing。 + +```bash +# 使用 POSIX IPC +./ubring_performance_server --use_ubring=true --ub_shm_type=1 + +# 使用 UBS-Mem +./ubring_performance_server --use_ubring=true --ub_shm_type=2 +``` + +#### 运行客户端 + +```bash +# 使用 POSIX IPC +./ubring_performance_client --use_ubring=true --ub_shm_type=1 --servers=127.0.0.1:8002 + +# 使用 UBS-Mem(SERVER_IP 为服务端地址) +./ubring_performance_client --use_ubring=true --ub_shm_type=2 --servers=SERVER_IP:8002 +``` + +#### 测试选项 + +| 选项 | 描述 | 默认值 | +| --------------- | ------------------------- | -------------- | +| `--ub_shm_type` | 共享内存类型 (1=IPC, 2=UBS-Mem) | 1 | +| `--use_ubring` | 是否使用 UBRing(为 false 时使用 TCP) | false | +| `--servers` | 服务端地址,多个地址用 `+` 分隔 | 0.0.0.0:8002+0.0.0.0:8002 | +| `--thread_num` | 客户端线程数(不大于 0 时依次测试 1、2、4 … 直到 `--max_thread_num`) | 0 | +| `--test_iterations` | 每个线程的请求数上限(0 表示不限制) | 0 | +| `--test_seconds` | 每轮测试的运行时间(秒) | 20 | +| `--rpc_timeout_ms` | RPC 超时时间(毫秒) | 5000 | + +## 架构概述 + +```mermaid +graph TD + subgraph 客户端进程 + A[Client] + end + + subgraph 服务端进程 + B[Server] + end + + subgraph 共享内存层 + C[SHM Manager] + D[IPC Backend] + E[UBS-Mem Backend] + end + + A -->|直接内存访问| C + B -->|直接内存访问| C + C --> D + C --> E + + style A fill:#636,color:#fff,stroke:#333,stroke-width:2px + style B fill:#369,color:#fff,stroke:#333,stroke-width:2px + style C fill:#396,color:#fff,stroke:#333,stroke-width:2px +``` + +### 架构细节 + +UBRing 架构包含以下组件: + +1. **客户端/服务端进程**: 通过共享内存通信的应用进程 +2. **SHM Manager**: 共享内存操作的中央管理器 (`shm_mgr.cpp`) +3. **IPC Backend**: 用于本地通信的 POSIX 共享内存实现 +4. 
**UBS-Mem Backend**: 用于跨节点通信的远端共享内存实现 + +## 实现细节 + +### 共享内存管理 + +共享内存管理器 (`shm_mgr.cpp`) 为不同的共享内存后端提供统一接口: + +- **初始化**: `ShmMgrInit()` - 初始化共享内存子系统 +- **本地分配**: `ShmLocalMalloc()` - 分配本地共享内存 +- **远端分配**: `ShmRemoteMalloc()` - 分配远程节点可访问的共享内存 +- **释放**: `ShmFree()` - 释放共享内存资源 + +### 定时器管理 + +UBRing 使用高精度定时器系统 (`timer_mgr.cpp`) 进行连接管理和超时处理,支持 epoll(Linux)和 kqueue(macOS)。 + +## 参考资料 + +- [UBRing 特性提案](https://github.com/apache/brpc/issues/3226) +- [UBRing 技术讨论](https://github.com/apache/brpc/discussions/3217) +- [UBS-Mem 开源项目](https://atomgit.com/openeuler/ubs-mem) + +## 相关文档 + +- [UB Client](ub_client.md) - 访问 UB 服务 +- [RDMA 支持](rdma.md) - 远程直接内存访问 + diff --git a/docs/en/ubring.md b/docs/en/ubring.md new file mode 100644 index 0000000000..f910facb7b --- /dev/null +++ b/docs/en/ubring.md @@ -0,0 +1,197 @@ +# UBRing: High-Performance Shared Memory RPC + +UBRing is a high-performance RPC implementation in brpc that leverages shared memory for inter-process communication (IPC). It supports both local shared memory (POSIX IPC) and remote shared memory (ubs-mem), providing ultra-low latency communication between processes. + +## Technical Background + +Traditional RPC frameworks typically use network sockets for communication, which introduces significant overhead due to kernel involvement, context switches, and data copying. UBRing addresses this by using shared memory as the communication medium, allowing direct memory access between processes with minimal kernel intervention. + +Key advantages of UBRing: +- **Ultra-low latency**: Microsecond-level RPC latency +- **High throughput**: Millions of RPC calls per second +- **Reduced data copying**: Direct memory access between processes +- **Cross-platform support**: Works on Linux and macOS + +## Supported Shared Memory Backends + +UBRing supports two types of shared memory backends, controlled by the `ub_shm_type` flag: + +### 1. 
POSIX IPC Shared Memory (ub_shm_type = 1) + +This is the default mode, using standard POSIX shared memory for local IPC. Processes on the same machine can communicate directly through shared memory regions. + +### 2. UBS-Mem Remote Shared Memory (ub_shm_type = 2) + +This mode uses ubs-mem (Unified Block Storage Memory), an open-source remote shared memory framework from openEuler. It enables shared memory communication across nodes in a rack, similar to RDMA but with simpler deployment requirements. + +**UBS-Mem Open Source**: https://atomgit.com/openeuler/ubs-mem + +**Required Libraries**: +- `libubsm_sdk.so` - UBS-Mem SDK library (installed at `/usr/local/ubs_mem/lib/libubsm_sdk.so`) +- UBS-Mem dynamically loads the library via `dlopen()` and uses functions like `ubsmem_initialize()`, `ubsmem_create_region()`, `ubsmem_shmem_allocate()`, `ubsmem_shmem_map()`, etc. + +**UBS-Mem Key Functions**: +- `ubsmem_init_attributes()` - Initialize UBS-Mem attributes +- `ubsmem_initialize()` - Initialize UBS-Mem library +- `ubsmem_finalize()` - Finalize UBS-Mem library +- `ubsmem_create_region()` - Create a shared memory region +- `ubsmem_shmem_allocate()` - Allocate shared memory +- `ubsmem_shmem_map()` - Map shared memory to local address space +- `ubsmem_shmem_unmap()` - Unmap shared memory +- `ubsmem_shmem_deallocate()` - Deallocate shared memory +- `ubsmem_destroy_region()` - Destroy a shared memory region + +### Future Expansion + +The architecture is designed to support CXL (Compute Express Link) based remote shared memory in the future, enabling even more flexible distributed memory sharing. 
+ +## Build Configuration + +### Build with CMake + +To build brpc with UBRing support, use the following commands: + +```bash +# Build brpc with UBRing support +cd /path/to/brpc +cmake -B build -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DWITH_UBRING:BOOL=ON +cmake --build build -j 8 + +# Build the ubring_performance example +cd /path/to/brpc/example/ubring_performance +cmake -B build +cmake --build build -j 8 +``` + +### Build with Bazel + +To build brpc with UBRing support using Bazel: + +```bash +# Build brpc with UBRing support +cd /path/to/brpc +bazel build //... --define=with_ubring=true + +# Build the ubring_performance example +bazel build //example/ubring_performance/... +``` + +### Select Shared Memory Backend + +The shared memory backend is controlled by the `--ub_shm_type` flag: + +```bash +# Use POSIX IPC (default) +./your_program --ub_shm_type=1 + +# Use UBS-Mem +./your_program --ub_shm_type=2 +``` + +## Performance Testing + +### Example: ubring_performance + +brpc provides a performance test example at `example/ubring_performance/`. + +#### Build the Example + +```bash +cd example/ubring_performance +mkdir -p build && cd build +cmake .. 
+make +``` + +#### Run Server + +The server listens on port 8002 by default (`--port`). `--use_ubring` defaults to false, so it must be enabled explicitly to use UBRing. + +```bash +# Run with POSIX IPC +./ubring_performance_server --use_ubring=true --ub_shm_type=1 + +# Run with UBS-Mem +./ubring_performance_server --use_ubring=true --ub_shm_type=2 +``` + +#### Run Client + +```bash +# Run with POSIX IPC +./ubring_performance_client --use_ubring=true --ub_shm_type=1 --servers=127.0.0.1:8002 + +# Run with UBS-Mem (SERVER_IP is the address of the server) +./ubring_performance_client --use_ubring=true --ub_shm_type=2 --servers=SERVER_IP:8002 +``` + +#### Test Options + +| Option | Description | Default | +|--------|-------------|---------| +| `--ub_shm_type` | Shared memory type (1=IPC, 2=UBS-Mem) | 1 | +| `--use_ubring` | Use UBRing (TCP is used when false) | false | +| `--servers` | Server addresses, separated by `+` | 0.0.0.0:8002+0.0.0.0:8002 | +| `--thread_num` | Number of client threads (when not positive, 1, 2, 4, ... up to `--max_thread_num` are tested in turn) | 0 | +| `--test_iterations` | Upper bound on requests per thread (0 = unlimited) | 0 | +| `--test_seconds` | Running time of each test round in seconds | 20 | +| `--rpc_timeout_ms` | RPC timeout in milliseconds | 5000 | + +## Architecture Overview + +```mermaid +graph TD + subgraph Client Process + A[Client] + end + + subgraph Server Process + B[Server] + end + + subgraph Shared Memory + C[SHM Manager] + D[IPC Backend] + E[UBS-Mem Backend] + end + + A -->|Direct Memory Access| C + B -->|Direct Memory Access| C + C --> D + C --> E + + style A fill:#636,color:#fff,stroke:#333,stroke-width:2px + style B fill:#369,color:#fff,stroke:#333,stroke-width:2px + style C fill:#396,color:#fff,stroke:#333,stroke-width:2px +``` + +### Architecture Details + +The UBRing architecture consists of: + +1. **Client/Server Processes**: Application processes that communicate via shared memory +2. **SHM Manager**: Central manager for shared memory operations (`shm_mgr.cpp`) +3. **IPC Backend**: POSIX shared memory implementation for local communication +4. 
**UBS-Mem Backend**: Remote shared memory implementation for cross-node communication + +## Implementation Details + +### Shared Memory Management + +The shared memory manager (`shm_mgr.cpp`) provides a unified interface for different shared memory backends: + +- **Initialization**: `ShmMgrInit()` - Initializes the shared memory subsystem +- **Local Allocation**: `ShmLocalMalloc()` - Allocates shared memory for local use +- **Remote Allocation**: `ShmRemoteMalloc()` - Allocates shared memory accessible by remote nodes +- **Free**: `ShmFree()` - Releases shared memory resources + +### Timer Management + +UBRing uses a high-precision timer system (`timer_mgr.cpp`) for connection management and timeout handling, supporting both epoll (Linux) and kqueue (macOS). + +## References + +- [UBRing Feature Proposal](https://github.com/apache/brpc/issues/3226) +- [UBRing Technical Discussion](https://github.com/apache/brpc/discussions/3217) +- [UBS-Mem Open Source](https://atomgit.com/openeuler/ubs-mem) + +## See Also + +- [UB Client](ub_client.md) - Accessing UB services +- [RDMA Support](rdma.md) - Remote direct memory access \ No newline at end of file diff --git a/example/ubring_performance/CMakeLists.txt b/example/ubring_performance/CMakeLists.txt new file mode 100644 index 0000000000..729381ccb8 --- /dev/null +++ b/example/ubring_performance/CMakeLists.txt @@ -0,0 +1,134 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +cmake_minimum_required(VERSION 2.8.10) +project(ubring_performance C CXX) + +option(LINK_SO "Whether examples are linked dynamically" OFF) + +execute_process( + COMMAND bash -c "find ${PROJECT_SOURCE_DIR}/../.. -type d -regex '.*output/include$' | head -n1 | xargs dirname | tr -d '\n'" + OUTPUT_VARIABLE OUTPUT_PATH +) + +set(CMAKE_PREFIX_PATH ${OUTPUT_PATH}) + +include(FindThreads) +include(FindProtobuf) +protobuf_generate_cpp(PROTO_SRC PROTO_HEADER test.proto) +# include PROTO_HEADER +include_directories(${CMAKE_CURRENT_BINARY_DIR}) + +# Search for libthrift* by best effort. If it is not found and brpc is +# compiled with thrift protocol enabled, a link error would be reported. 
+find_library(THRIFT_LIB NAMES thrift) +if (NOT THRIFT_LIB) + set(THRIFT_LIB "") +endif() + +find_path(BRPC_INCLUDE_PATH NAMES brpc/server.h) +if(LINK_SO) + find_library(BRPC_LIB NAMES brpc) +else() + find_library(BRPC_LIB NAMES libbrpc.a brpc) +endif() +if((NOT BRPC_INCLUDE_PATH) OR (NOT BRPC_LIB)) + message(FATAL_ERROR "Fail to find brpc") +endif() +include_directories(${BRPC_INCLUDE_PATH}) + +find_path(GFLAGS_INCLUDE_PATH gflags/gflags.h) +find_library(GFLAGS_LIBRARY NAMES gflags libgflags) +if((NOT GFLAGS_INCLUDE_PATH) OR (NOT GFLAGS_LIBRARY)) + message(FATAL_ERROR "Fail to find gflags") +endif() +include_directories(${GFLAGS_INCLUDE_PATH}) + +if(CMAKE_SYSTEM_NAME STREQUAL "Darwin") + include(CheckFunctionExists) + CHECK_FUNCTION_EXISTS(clock_gettime HAVE_CLOCK_GETTIME) + if(NOT HAVE_CLOCK_GETTIME) + set(DEFINE_CLOCK_GETTIME "-DNO_CLOCK_GETTIME_IN_MAC") + endif() +endif() + +set(CMAKE_CPP_FLAGS "${DEFINE_CLOCK_GETTIME} -DBRPC_WITH_UBRING=1") +set(CMAKE_CXX_FLAGS "${CMAKE_CPP_FLAGS} -DNDEBUG -O2 -D__const__=__unused__ -pipe -W -Wall -Wno-unused-parameter -fPIC -fno-omit-frame-pointer") + +if(CMAKE_VERSION VERSION_LESS "3.1.3") + if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") + endif() + if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") + endif() +else() + set(CMAKE_CXX_STANDARD 11) + set(CMAKE_CXX_STANDARD_REQUIRED ON) +endif() + +find_path(LEVELDB_INCLUDE_PATH NAMES leveldb/db.h) +find_library(LEVELDB_LIB NAMES leveldb) +if ((NOT LEVELDB_INCLUDE_PATH) OR (NOT LEVELDB_LIB)) + message(FATAL_ERROR "Fail to find leveldb") +endif() +include_directories(${LEVELDB_INCLUDE_PATH}) + +if(CMAKE_SYSTEM_NAME STREQUAL "Darwin") + set(OPENSSL_ROOT_DIR + "/usr/local/opt/openssl" # Homebrew installed OpenSSL + ) +endif() + +find_package(OpenSSL) +include_directories(${OPENSSL_INCLUDE_DIR}) + +set(DYNAMIC_LIB + ${CMAKE_THREAD_LIBS_INIT} + ${GFLAGS_LIBRARY} + ${PROTOBUF_LIBRARIES} + 
${LEVELDB_LIB} + ${OPENSSL_CRYPTO_LIBRARY} + ${OPENSSL_SSL_LIBRARY} + ${THRIFT_LIB} + dl + z + ) + +if(CMAKE_SYSTEM_NAME STREQUAL "Darwin") + set(DYNAMIC_LIB ${DYNAMIC_LIB} + pthread + "-framework CoreFoundation" + "-framework CoreGraphics" + "-framework CoreData" + "-framework CoreText" + "-framework Security" + "-framework Foundation" + "-Wl,-U,_MallocExtension_ReleaseFreeMemory" + "-Wl,-U,_ProfilerStart" + "-Wl,-U,_ProfilerStop" + "-Wl,-U,__Z13GetStackTracePPvii" + "-Wl,-U,_mallctl" + "-Wl,-U,_malloc_stats_print" + ) +endif() + +add_executable(ubring_performance_client client.cpp ${PROTO_SRC} ${PROTO_HEADER}) +add_executable(ubring_performance_server server.cpp ${PROTO_SRC} ${PROTO_HEADER}) + +target_link_libraries(ubring_performance_client ${BRPC_LIB} ${DYNAMIC_LIB}) +target_link_libraries(ubring_performance_server ${BRPC_LIB} ${DYNAMIC_LIB}) \ No newline at end of file diff --git a/example/ubring_performance/client.cpp b/example/ubring_performance/client.cpp new file mode 100644 index 0000000000..c14268a430 --- /dev/null +++ b/example/ubring_performance/client.cpp @@ -0,0 +1,328 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include +#include +#include "butil/atomicops.h" +#include "butil/fast_rand.h" +#include "butil/logging.h" +#include "brpc/server.h" +#include "brpc/channel.h" +#include "bthread/bthread.h" +#include "bvar/latency_recorder.h" +#include "bvar/variable.h" +#include "test.pb.h" + +#ifdef BRPC_WITH_UBRING + +DEFINE_int32(thread_num, 0, "How many threads are used"); +DEFINE_int32(queue_depth, 1, "How many requests can be pending in the queue"); +DEFINE_int32(expected_qps, 0, "The expected QPS"); +DEFINE_int32(max_thread_num, 16, "The max number of threads are used"); +DEFINE_int32(attachment_size, -1, "Attachment size is used (in Bytes)"); +DEFINE_bool(echo_attachment, false, "Select whether attachment should be echo"); +DEFINE_string(connection_type, "single", "Connection type of the channel"); +DEFINE_string(protocol, "baidu_std", "Protocol type."); +DEFINE_string(servers, "0.0.0.0:8002+0.0.0.0:8002", "IP Address of servers"); +DEFINE_bool(use_ubring, false, "Use UBRING or not"); +DEFINE_int32(rpc_timeout_ms, 5000, "RPC call timeout"); +DEFINE_int32(test_seconds, 20, "Test running time"); +DEFINE_int32(test_iterations, 0, "Test iterations"); +DEFINE_int32(dummy_port, 8001, "Dummy server port number"); + +bvar::LatencyRecorder g_latency_recorder("client"); +bvar::LatencyRecorder g_server_cpu_recorder("server_cpu"); +bvar::LatencyRecorder g_client_cpu_recorder("client_cpu"); +butil::atomic g_last_time(0); +butil::atomic g_total_bytes; +butil::atomic g_total_cnt; +std::vector g_servers; +int rr_index = 0; +volatile bool g_stop = false; + +butil::atomic g_token(10000); + +static void* GenerateToken(void* arg) { + int64_t start_time = butil::monotonic_time_ns(); + int64_t accumulative_token = g_token.load(butil::memory_order_relaxed); + while (!g_stop) { + bthread_usleep(100000); + int64_t now = butil::monotonic_time_ns(); + if (accumulative_token * 1000000000 / (now - start_time) < FLAGS_expected_qps) { + int64_t delta = FLAGS_expected_qps 
* (now - start_time) / 1000000000 - accumulative_token; + g_token.fetch_add(delta, butil::memory_order_relaxed); + accumulative_token += delta; + } + } + return NULL; +} + +class PerformanceTest { +public: + PerformanceTest(int attachment_size, bool echo_attachment) + : _addr(NULL) + , _channel(NULL) + , _start_time(0) + , _iterations(0) + , _stop(false) + { + if (attachment_size > 0) { + _addr = malloc(attachment_size); + butil::fast_rand_bytes(_addr, attachment_size); + _attachment.append(_addr, attachment_size); + } + _echo_attachment = echo_attachment; + } + + ~PerformanceTest() { + if (_addr) { + free(_addr); + } + delete _channel; + } + + inline bool IsStop() { return _stop; } + + int Init() { + brpc::ChannelOptions options; + options.socket_mode = FLAGS_use_ubring? brpc::SOCKET_MODE_UBRING : brpc::SOCKET_MODE_TCP; + options.protocol = FLAGS_protocol; + options.connection_type = FLAGS_connection_type; + options.timeout_ms = FLAGS_rpc_timeout_ms; + options.max_retry = 0; + // TODO A bug exists when the connection_group parameter is used. + // options.connection_group = std::to_string(reinterpret_cast(this)); + std::string server = g_servers[(rr_index++) % g_servers.size()]; + _channel = new brpc::Channel(); + if (_channel->Init(server.c_str(), &options) != 0) { + LOG(ERROR) << "Fail to initialize channel"; + return -1; + } + + // Add retry mechanism for RPC call + int retry = 3; + while (retry > 0) { + brpc::Controller cntl; + test::PerfTestResponse response; + test::PerfTestRequest request; + request.set_echo_attachment(_echo_attachment); + test::PerfTestService_Stub stub(_channel); + stub.Test(&cntl, &request, &response, NULL); + if (!cntl.Failed()) { + return 0; + } + LOG(WARNING) << "RPC call failed, retrying... 
(" << retry << " left): " << cntl.ErrorText(); + retry--; + bthread_usleep(1000000); // 100ms delay before retry + } + LOG(ERROR) << "RPC call failed after multiple retries"; + return -1; + } + + struct RespClosure { + brpc::Controller* cntl; + test::PerfTestResponse* resp; + PerformanceTest* test; + }; + + void SendRequest() { + if (FLAGS_expected_qps > 0) { + while (g_token.load(butil::memory_order_relaxed) <= 0) { + bthread_usleep(10); + } + g_token.fetch_sub(1, butil::memory_order_relaxed); + } + RespClosure* closure = new RespClosure; + test::PerfTestRequest request; + closure->resp = new test::PerfTestResponse(); + closure->cntl = new brpc::Controller(); + request.set_echo_attachment(_echo_attachment); + closure->cntl->request_attachment().append(_attachment); + closure->test = this; + google::protobuf::Closure* done = brpc::NewCallback(&HandleResponse, closure); + test::PerfTestService_Stub stub(_channel); + stub.Test(closure->cntl, &request, closure->resp, done); + } + + static void HandleResponse(RespClosure* closure) { + std::unique_ptr cntl_guard(closure->cntl); + std::unique_ptr response_guard(closure->resp); + if (closure->cntl->Failed()) { + LOG(DEBUG) << "RPC call failed: " << closure->cntl->ErrorText(); + // Don't stop the test immediately, just log the error and continue + } else { + g_latency_recorder << closure->cntl->latency_us(); + if (closure->resp->cpu_usage().size() > 0) { + g_server_cpu_recorder << atof(closure->resp->cpu_usage().c_str()) * 100; + } + g_total_bytes.fetch_add(closure->cntl->request_attachment().size(), butil::memory_order_relaxed); + g_total_cnt.fetch_add(1, butil::memory_order_relaxed); + } + + cntl_guard.reset(NULL); + response_guard.reset(NULL); + + if (closure->test->_iterations == 0 && FLAGS_test_iterations > 0) { + closure->test->_stop = true; + return; + } + --closure->test->_iterations; + uint64_t last = g_last_time.load(butil::memory_order_relaxed); + uint64_t now = butil::gettimeofday_us(); + if (now > last && now 
- last > 100000) { + if (g_last_time.exchange(now, butil::memory_order_relaxed) == last) { + g_client_cpu_recorder << + atof(bvar::Variable::describe_exposed("process_cpu_usage").c_str()) * 100; + } + } + if (now - closure->test->_start_time > FLAGS_test_seconds * 1000000u) { + closure->test->_stop = true; + return; + } + closure->test->SendRequest(); + } + + static void* RunTest(void* arg) { + PerformanceTest* test = (PerformanceTest*)arg; + test->_start_time = butil::gettimeofday_us(); + test->_iterations = FLAGS_test_iterations; + + for (int i = 0; i < FLAGS_queue_depth; ++i) { + test->SendRequest(); + } + + return NULL; + } + +private: + void* _addr; + brpc::Channel* _channel; + uint64_t _start_time; + uint32_t _iterations; + volatile bool _stop; + butil::IOBuf _attachment; + bool _echo_attachment; +}; + +static void* DeleteTest(void* arg) { + PerformanceTest* test = (PerformanceTest*)arg; + delete test; + return NULL; +} + +// Runs one benchmark round with thread_num clients and attachment_size bytes of attachment per request, then prints the collected statistics. +void Test(int thread_num, int attachment_size) { + std::cout << "[Threads: " << thread_num + << ", Depth: " << FLAGS_queue_depth + << ", Attachment: " << attachment_size << "B" + << ", UBRING: " << (FLAGS_use_ubring ? "yes" : "no") + << ", Echo: " << (FLAGS_echo_attachment ? 
"yes]" : "no]") + << std::endl; + g_total_bytes.store(0, butil::memory_order_relaxed); + g_total_cnt.store(0, butil::memory_order_relaxed); + std::vector<PerformanceTest*> tests; + for (int k = 0; k < thread_num; ++k) { + PerformanceTest* t = new PerformanceTest(attachment_size, FLAGS_echo_attachment); + if (t->Init() < 0) { + exit(1); + } + tests.push_back(t); + } + uint64_t start_time = butil::gettimeofday_us(); + std::vector<bthread_t> tid(thread_num); + // g_stop is still true after a previous Test() round. Clear it, otherwise GenerateToken() returns at once and senders wait forever once g_token is used up. + g_stop = false; + bthread_t token_tid = 0; + bool token_started = false; + if (FLAGS_expected_qps > 0) { + token_started = bthread_start_background(&token_tid, &BTHREAD_ATTR_NORMAL, GenerateToken, NULL) == 0; + } + for (int k = 0; k < thread_num; ++k) { + bthread_start_background(&tid[k], &BTHREAD_ATTR_NORMAL, + PerformanceTest::RunTest, tests[k]); + } + // RunTest() only kicks off the asynchronous requests; wait until every client reports that it has stopped. + for (int k = 0; k < thread_num; ++k) { + while (!tests[k]->IsStop()) { + bthread_usleep(10000); + } + } + uint64_t end_time = butil::gettimeofday_us(); + double throughput = g_total_bytes / 1.048576 / (end_time - start_time); + if (FLAGS_test_iterations == 0) { + std::cout << "Avg-Latency: " << g_latency_recorder.latency(10) + << ", 90th-Latency: " << g_latency_recorder.latency_percentile(0.9) + << ", 99th-Latency: " << g_latency_recorder.latency_percentile(0.99) + << ", 99.9th-Latency: " << g_latency_recorder.latency_percentile(0.999) + << ", Throughput: " << throughput << "MB/s" + << ", QPS: " << (g_total_cnt.load(butil::memory_order_relaxed) * 1000 / (end_time - start_time)) << "k" + << ", Server CPU-utilization: " << g_server_cpu_recorder.latency(10) << "%" + << ", Client CPU-utilization: " << g_client_cpu_recorder.latency(10) << "%" + << std::endl; + } else { + std::cout << " Throughput: " << throughput << "MB/s" << std::endl; + } + g_stop = true; + // Wait for the token generator to exit so that it cannot overlap with the one started by the next Test() round. + if (token_started) { + bthread_join(token_tid, NULL); + } + for (int k = 0; k < thread_num; ++k) { + bthread_start_background(&tid[k], &BTHREAD_ATTR_NORMAL, DeleteTest, tests[k]); + } + for (int k = 0; k < thread_num; ++k) { + bthread_join(tid[k], NULL); + } +} + +int main(int argc, char* argv[]) { + GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); + + 
brpc::StartDummyServerAt(FLAGS_dummy_port); + + std::string::size_type pos1 = 0; + std::string::size_type pos2 = FLAGS_servers.find('+'); + while (pos2 != std::string::npos) { + g_servers.push_back(FLAGS_servers.substr(pos1, pos2 - pos1)); + pos1 = pos2 + 1; + pos2 = FLAGS_servers.find('+', pos1); + } + g_servers.push_back(FLAGS_servers.substr(pos1)); + + if (FLAGS_thread_num > 0 && FLAGS_attachment_size >= 0) { + Test(FLAGS_thread_num, FLAGS_attachment_size); + } else if (FLAGS_thread_num <= 0 && FLAGS_attachment_size >= 0) { + for (int i = 1; i <= FLAGS_max_thread_num; i *= 2) { + Test(i, FLAGS_attachment_size); + } + } else if (FLAGS_thread_num > 0 && FLAGS_attachment_size < 0) { + for (int i = 1; i <= 1024; i *= 4) { + Test(FLAGS_thread_num, i); + } + } else { + for (int j = 1; j <= 1024; j *= 4) { + for (int i = 1; i <= FLAGS_max_thread_num; i *= 2) { + Test(i, j); + } + } + } + + return 0; +} + +#else + +int main(int argc, char* argv[]) { + LOG(ERROR) << " brpc is not compiled with ubring. To enable it, please refer to the ubring documentation"; + return 0; +} + +#endif diff --git a/example/ubring_performance/server.cpp b/example/ubring_performance/server.cpp new file mode 100644 index 0000000000..b138c91c8d --- /dev/null +++ b/example/ubring_performance/server.cpp @@ -0,0 +1,97 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+
+#include <gflags/gflags.h>
+#include "butil/atomicops.h"
+#include "butil/logging.h"
+#include "butil/time.h"
+#include "brpc/server.h"
+#include "bvar/variable.h"
+#include "test.pb.h"
+
+#ifdef BRPC_WITH_UBRING
+
+DEFINE_int32(port, 8002, "TCP Port of this server");
+DEFINE_bool(use_ubring, false, "Use UBRING or not");
+
+// Monotonic timestamp (microseconds) of the last response that carried a
+// CPU-usage sample. Shared by all handler invocations.
+butil::atomic<uint64_t> g_last_time(0);
+
+namespace test {
+// Benchmark service: optionally echoes the request attachment and
+// piggybacks a throttled CPU-usage sample on the response.
+class PerfTestServiceImpl : public PerfTestService {
+public:
+    PerfTestServiceImpl() {}
+    ~PerfTestServiceImpl() {}
+
+    // Handles one PerfTestService.Test call.
+    //   cntl_base: the brpc::Controller of this call (down-cast below).
+    //   request:   echo_attachment selects whether the attachment is echoed.
+    //   response:  cpu_usage is always set because the field is `required`.
+    //   done:      completion closure, handed to done_guard for the whole call.
+    void Test(google::protobuf::RpcController* cntl_base,
+              const PerfTestRequest* request,
+              PerfTestResponse* response,
+              google::protobuf::Closure* done) override {
+        brpc::ClosureGuard done_guard(done);
+        uint64_t last = g_last_time.load(butil::memory_order_relaxed);
+        uint64_t now = butil::monotonic_time_us();
+        // Sample CPU usage only if more than 100000us (100ms) passed since
+        // the previous sample, and only in the one call whose exchange still
+        // observes `last`; every other call reports an empty string.
+        if (now > last && now - last > 100000) {
+            if (g_last_time.exchange(now, butil::memory_order_relaxed) == last) {
+                response->set_cpu_usage(bvar::Variable::describe_exposed("process_cpu_usage"));
+            } else {
+                response->set_cpu_usage("");
+            }
+        } else {
+            response->set_cpu_usage("");
+        }
+        if (request->echo_attachment()) {
+            brpc::Controller* cntl =
+                static_cast<brpc::Controller*>(cntl_base);
+            cntl->response_attachment().append(cntl->request_attachment());
+        }
+    }
+};
+}
+
+// Starts the benchmark server on --port, over UBRING when --use_ubring is
+// set and over TCP otherwise, and blocks until asked to quit.
+int main(int argc, char* argv[]) {
+    GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true);
+
+    brpc::Server server;
+    test::PerfTestServiceImpl perf_test_service_impl;
+
+    if (server.AddService(&perf_test_service_impl,
+                          brpc::SERVER_DOESNT_OWN_SERVICE) != 0) {
+        LOG(ERROR) << "Fail to add service";
+        return -1;
+    }
+    g_last_time.store(0, butil::memory_order_relaxed);
+
+    brpc::ServerOptions options;
+    options.socket_mode = FLAGS_use_ubring? brpc::SOCKET_MODE_UBRING : brpc::SOCKET_MODE_TCP;
+    if (server.Start(FLAGS_port, &options) != 0) {
+        LOG(ERROR) << "Fail to start EchoServer";
+        return -1;
+    }
+
+    server.RunUntilAskedToQuit();
+    return 0;
+}
+
+#else
+
+
+// Stub used when brpc is built without ubring: logs why nothing runs.
+int main(int argc, char* argv[]) {
+    LOG(ERROR) << " brpc is not compiled with ubring. To enable it, please refer to the ubring documentation";
+    return 0;
+}
+
+#endif
\ No newline at end of file
diff --git a/example/ubring_performance/test.proto b/example/ubring_performance/test.proto
new file mode 100644
index 0000000000..22646d113c
--- /dev/null
+++ b/example/ubring_performance/test.proto
@@ -0,0 +1,33 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+ +syntax="proto2"; +option cc_generic_services = true; + +package test; + +message PerfTestRequest { + required bool echo_attachment = 1; +}; + +message PerfTestResponse { + required string cpu_usage = 1; +}; + +service PerfTestService { + rpc Test(PerfTestRequest) returns (PerfTestResponse); +}; \ No newline at end of file diff --git a/src/brpc/input_messenger.cpp b/src/brpc/input_messenger.cpp index c249cca22c..fa05423640 100644 --- a/src/brpc/input_messenger.cpp +++ b/src/brpc/input_messenger.cpp @@ -312,7 +312,7 @@ int InputMessenger::ProcessNewMessage( // not in the bthread where the polling bthread is located, because the // method for processing messages may call synchronization primitives, // causing the polling bthread to be scheduled out. - if (m->_socket_mode == SOCKET_MODE_RDMA) { + if (m->_socket_mode == SOCKET_MODE_RDMA || m->_socket_mode == SOCKET_MODE_UBRING) { m->_transport->QueueMessage(last_msg, &num_bthread_created, true); } if (num_bthread_created) { diff --git a/src/brpc/input_messenger.h b/src/brpc/input_messenger.h index 8482c3f3fc..5203c02505 100644 --- a/src/brpc/input_messenger.h +++ b/src/brpc/input_messenger.h @@ -29,6 +29,9 @@ namespace brpc { namespace rdma { class RdmaEndpoint; } +namespace ubring { +class UBShmEndpoint; +} class TcpTransport; struct InputMessageHandler { // The callback to cut a message from `source'. 
@@ -93,6 +96,7 @@ class InputMessenger : public SocketUser { friend class Socket; friend class TcpTransport; friend class rdma::RdmaEndpoint; +friend class ubring::UBShmEndpoint; public: explicit InputMessenger(size_t capacity = 128); ~InputMessenger(); diff --git a/src/brpc/memcache.cpp b/src/brpc/memcache.cpp index 489d84db16..c7d6f836b1 100644 --- a/src/brpc/memcache.cpp +++ b/src/brpc/memcache.cpp @@ -17,6 +17,7 @@ #include "brpc/memcache.h" +#include #include "brpc/policy/memcache_binary_header.h" #include "brpc/proto_base.pb.h" #include "butil/logging.h" diff --git a/src/brpc/nonreflectable_message.h b/src/brpc/nonreflectable_message.h index 7f2acd78a3..089b23957d 100644 --- a/src/brpc/nonreflectable_message.h +++ b/src/brpc/nonreflectable_message.h @@ -129,7 +129,7 @@ class NonreflectableMessage : public ::google::protobuf::Message { void DiscardUnknownFields() override {} #endif -#if GOOGLE_PROTOBUF_VERSION < 5026000 +#if GOOGLE_PROTOBUF_VERSION >= 3004000 && GOOGLE_PROTOBUF_VERSION < 5026000 // Unsupported by default. size_t SpaceUsedLong() const override { return 0; @@ -163,9 +163,19 @@ class NonreflectableMessage : public ::google::protobuf::Message { #endif // Size of bytes after serialization. 
+#if GOOGLE_PROTOBUF_VERSION < 3004000 + virtual size_t ByteSizeLong() const { + return 0; + } + + int ByteSize() const override { + return static_cast(ByteSizeLong()); + } +#else size_t ByteSizeLong() const override { return 0; } +#endif #if GOOGLE_PROTOBUF_VERSION >= 3007000 && GOOGLE_PROTOBUF_VERSION < 3010000 void SerializeWithCachedSizes(::google::protobuf::io::CodedOutputStream*) const override {} diff --git a/src/brpc/rdma/rdma_endpoint.cpp b/src/brpc/rdma/rdma_endpoint.cpp index c69bf8ec07..5b333938af 100644 --- a/src/brpc/rdma/rdma_endpoint.cpp +++ b/src/brpc/rdma/rdma_endpoint.cpp @@ -1334,6 +1334,21 @@ static void DeallocateCq(ibv_cq* cq) { LOG_IF(WARNING, 0 != err) << "Fail to destroy CQ: " << berror(err); } +static int DrainCq(ibv_cq* cq) { + if (NULL == cq) { + return 0; + } + + ibv_wc wc; + int ret; + do { + ret = ibv_poll_cq(cq, 1, &wc); + } while (ret > 0); + + LOG_IF(ERROR, ret < 0) << "drain CQ failed: " << ret; + return ret; +} + void RdmaEndpoint::DeallocateResources() { if (!_resource) { return; @@ -1360,6 +1375,7 @@ void RdmaEndpoint::DeallocateResources() { } bool remove_consumer = true; +_reclaim: if (!move_to_rdma_resource_list) { if (NULL != _resource->qp) { int err = IbvDestroyQp(_resource->qp); @@ -1403,6 +1419,24 @@ void RdmaEndpoint::DeallocateResources() { } if (move_to_rdma_resource_list) { + // When a QP is moved to the RESET state, all associated send and + // receive queues are flushed, meaning any outstanding WRs are effectively + // abandoned by the hardware. + // + // However, the CQ associated with that QP is *not* cleared automatically, + // meaning that it will still contain entries for WRs that completed before + // the reset. + // + // The application should finish polling the CQ to remove these obsolete + // entries before reusing the QP. 
+ int ret = DrainCq(_resource->polling_cq); + ret += DrainCq(_resource->send_cq); + ret += DrainCq(_resource->recv_cq); + if (ret < 0) { + move_to_rdma_resource_list = false; + goto _reclaim; + } + BAIDU_SCOPED_LOCK(*g_rdma_resource_mutex); _resource->next = g_rdma_resource_list; g_rdma_resource_list = _resource; diff --git a/src/brpc/selective_channel.cpp b/src/brpc/selective_channel.cpp index a59580e321..8dee422598 100644 --- a/src/brpc/selective_channel.cpp +++ b/src/brpc/selective_channel.cpp @@ -419,9 +419,13 @@ void Sender::Clear() { if (_main_cntl == NULL) { return; } - delete _alloc_resources[1].response; - delete _alloc_resources[1].sub_done; - _alloc_resources[1] = Resource(); + for (int i = 0; i < _nalloc; ++i) { + delete _alloc_resources[i].response; + if (_alloc_resources[i].sub_done != &_sub_done0) { + delete _alloc_resources[i].sub_done; + } + _alloc_resources[i] = Resource(); + } const CallId cid = _main_cntl->call_id(); _main_cntl = NULL; if (_user_done) { @@ -434,7 +438,7 @@ inline Resource Sender::PopFree() { if (_nfree == 0) { if (_nalloc == 0) { Resource r; - r.response = _response; + r.response = _response->New(); r.sub_done = &_sub_done0; _alloc_resources[_nalloc++] = r; return r; diff --git a/src/brpc/socket.cpp b/src/brpc/socket.cpp index 005873e9b0..0ca6950428 100644 --- a/src/brpc/socket.cpp +++ b/src/brpc/socket.cpp @@ -81,6 +81,13 @@ DEFINE_int32(socket_send_buffer_size, -1, DEFINE_int32(ssl_bio_buffer_size, 16*1024, "Set buffer size for SSL read/write"); +DEFINE_int32(ssl_handshake_timeout_ms, 5000, + "Max duration of one SSL handshake on a socket. Zero or negative " + "disables the limit and falls back to waiting forever, which can " + "leak ESTABLISHED sockets if the peer never finishes the TLS " + "handshake (e.g. 
server not actually listening with SSL)."); +BRPC_VALIDATE_GFLAG(ssl_handshake_timeout_ms, PassValidate); + DEFINE_int64(socket_max_unwritten_bytes, 64 * 1024 * 1024, "Max unwritten bytes in each socket, if the limit is reached," " Socket.Write fails with EOVERCROWDED"); @@ -1956,9 +1963,23 @@ int Socket::SSLHandshake(int fd, bool server_mode) { _ssl_state = SSL_CONNECTING; + // Bound the handshake by a deadline; without it, a peer that completes + // the TCP handshake but never returns a TLS Hello (e.g. server not + // configured for SSL) would park this bthread on bthread_fd_wait + // forever. That bthread holds a Socket reference via WriteRequest, so + // the underlying fd would never be recycled and the connection would + // remain ESTABLISHED indefinitely. + const int handshake_timeout_ms = FLAGS_ssl_handshake_timeout_ms; + timespec abstime_storage; + const timespec* abstime = NULL; + if (handshake_timeout_ms > 0) { + abstime_storage = butil::milliseconds_from_now(handshake_timeout_ms); + abstime = &abstime_storage; + } + // Loop until SSL handshake has completed. For SSL_ERROR_WANT_READ/WRITE, - // we use bthread_fd_wait as polling mechanism instead of EventDispatcher - // as it may confuse the origin event processing code. + // we use bthread_fd_timedwait as polling mechanism instead of + // EventDispatcher as it may confuse the origin event processing code. 
while (true) { ERR_clear_error(); int rc = SSL_do_handshake(_ssl_session); @@ -2004,20 +2025,32 @@ int Socket::SSLHandshake(int fd, bool server_mode) { switch (ssl_error) { case SSL_ERROR_WANT_READ: #if defined(OS_LINUX) - if (bthread_fd_wait(fd, EPOLLIN) != 0) { + if (bthread_fd_timedwait(fd, EPOLLIN, abstime) != 0) { #elif defined(OS_MACOSX) - if (bthread_fd_wait(fd, EVFILT_READ) != 0) { + if (bthread_fd_timedwait(fd, EVFILT_READ, abstime) != 0) { #endif + if (errno == ETIMEDOUT) { + LOG(WARNING) << "SSL handshake timed out after " + << handshake_timeout_ms + << "ms while waiting for peer data on fd=" + << fd << " remote_side=" << _remote_side; + } return -1; } break; case SSL_ERROR_WANT_WRITE: #if defined(OS_LINUX) - if (bthread_fd_wait(fd, EPOLLOUT) != 0) { + if (bthread_fd_timedwait(fd, EPOLLOUT, abstime) != 0) { #elif defined(OS_MACOSX) - if (bthread_fd_wait(fd, EVFILT_WRITE) != 0) { + if (bthread_fd_timedwait(fd, EVFILT_WRITE, abstime) != 0) { #endif + if (errno == ETIMEDOUT) { + LOG(WARNING) << "SSL handshake timed out after " + << handshake_timeout_ms + << "ms while waiting to send on fd=" << fd + << " remote_side=" << _remote_side; + } return -1; } break; diff --git a/src/brpc/socket.h b/src/brpc/socket.h index 816fccdf27..167cc8f418 100644 --- a/src/brpc/socket.h +++ b/src/brpc/socket.h @@ -57,7 +57,10 @@ namespace rdma { class RdmaEndpoint; class RdmaConnect; } - +namespace ubring { + class UBShmEndpoint; + class UBConnect; +} class Socket; class AuthContext; class EventDispatcher; @@ -317,6 +320,9 @@ friend class policy::RtmpContext; friend class schan::ChannelBalancer; friend class rdma::RdmaEndpoint; friend class rdma::RdmaConnect; +friend class ubring::UBShmEndpoint; +friend class ubring::UBConnect; +friend class UBShmTransport; friend class HealthCheckTask; friend class OnAppHealthCheckDone; friend class HealthCheckManager; diff --git a/src/brpc/socket_mode.h b/src/brpc/socket_mode.h index b5d42be4aa..b4ac7dfbca 100644 --- a/src/brpc/socket_mode.h 
+++ b/src/brpc/socket_mode.h @@ -20,7 +20,8 @@ namespace brpc { enum SocketMode { SOCKET_MODE_TCP = 0, - SOCKET_MODE_RDMA = 1 + SOCKET_MODE_RDMA = 1, + SOCKET_MODE_UBRING = 2 }; } // namespace brpc #endif //BRPC_SOCKET_MODE_H \ No newline at end of file diff --git a/src/brpc/transport_factory.cpp b/src/brpc/transport_factory.cpp index b689e2edd2..36fdaaed05 100644 --- a/src/brpc/transport_factory.cpp +++ b/src/brpc/transport_factory.cpp @@ -18,6 +18,7 @@ #include "brpc/transport_factory.h" #include "brpc/tcp_transport.h" #include "brpc/rdma_transport.h" +#include "brpc/ubshm_transport.h" namespace brpc { int TransportFactory::ContextInitOrDie(SocketMode mode, bool serverOrNot, const void* _options) { @@ -28,6 +29,11 @@ int TransportFactory::ContextInitOrDie(SocketMode mode, bool serverOrNot, const else if (mode == SOCKET_MODE_RDMA) { return RdmaTransport::ContextInitOrDie(serverOrNot, _options); } +#endif +#if BRPC_WITH_UBRING + else if (mode == SOCKET_MODE_UBRING) { + return UBShmTransport::ContextInitOrDie(serverOrNot, _options); + } #endif else { LOG(ERROR) << "unknown transport type " << mode; @@ -43,6 +49,11 @@ std::unique_ptr TransportFactory::CreateTransport(SocketMode mode) { else if (mode == SOCKET_MODE_RDMA) { return std::unique_ptr(new RdmaTransport()); } +#endif +#if BRPC_WITH_UBRING + else if (mode == SOCKET_MODE_UBRING) { + return std::unique_ptr(new UBShmTransport()); + } #endif else { LOG(ERROR) << "socket_mode set error"; diff --git a/src/brpc/ubshm/common/common.h b/src/brpc/ubshm/common/common.h new file mode 100644 index 0000000000..80e7ad83c8 --- /dev/null +++ b/src/brpc/ubshm/common/common.h @@ -0,0 +1,181 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef BRPC_COMMON_H +#define BRPC_COMMON_H +#include +#include +#include +#include +#include "butil/logging.h" + +#define LIKELY(x) __builtin_expect(!!(x), 1) +#define UNLIKELY(x) __builtin_expect(!!(x), 0) + +#ifndef UNREFERENCE_PARAM +#define UNREFERENCE_PARAM(x) ((void)(x)) +#endif + +#ifdef UT +#define STATIC +#define INLINE +#define UBRING_STATISTICS_PATH ROOT_PATH "/ubring/run" +#else +#define STATIC static +#define INLINE inline +#define UBRING_STATISTICS_PATH "/opt/ubring/run" +#endif + +#ifdef __cplusplus +#include +using AtomicInt = std::atomic; +using AtomicBool = std::atomic; +using AtomicUintFast64 = std::atomic; +using AtomicUintFast8 = std::atomic; +#define ATOMIC_INIT(var, value) var.store(value) +#define ATOMIC_STORE(var, value) var.store(value) +#define ATOMIC_LOAD(var) var.load() +#define ATOMIC_ADD(var, value) var.fetch_add(value) +#define ATOMIC_SUB(var, value) var.fetch_sub(value) +#define ATOMIC_COMPARE_EXCHANGE_STRONG(var, expected, desired) \ + var.compare_exchange_strong((expected), (desired)) +#else +#include +typedef atomic_int AtomicInt; +typedef atomic_bool AtomicBool; +typedef atomic_uint_fast64_t AtomicUintFast64; +typedef atomic_uint_fast8_t AtomicUintFast8; +#define ATOMIC_INIT(var, value) atomic_init(&(var), value) +#define ATOMIC_STORE(var, value) atomic_store(&(var), value) +#define ATOMIC_LOAD(var) atomic_load(&(var)) +#define ATOMIC_ADD(var, 
value) atomic_fetch_add(&(var), value) +#define ATOMIC_SUB(var, value) atomic_fetch_sub(&(var), value) +#define ATOMIC_COMPARE_EXCHANGE_STRONG(var, expected, desired) \ + atomic_compare_exchange_strong(&(var), &(expected), (desired)) +#endif + +#define ISB() __asm__ __volatile__("isb" ::: "memory") +#define DSB() __asm__ __volatile__("dsb sy" ::: "memory") + +#ifndef errno_t +typedef int errno_t; +#endif +#ifndef EOK +#define EOK 0 +#endif + +#define MAX_NODE_NUM 8 +#define IPV4_FIRST_BYTE_OFFSET 24 +#define COPY_ALIGNED_DATA_BYTES 64 + +#if defined(OS_MACOSX) +#define EPOLLIN 0x001 +#define EPOLLOUT 0x004 +#define EPOLLET 0x80000000 +#endif + +static inline int Copy64Byte(int8_t *dst, int8_t *src) { +#ifdef LS64 + asm volatile ( + "mov x12, %0\n" + "mov x13, %1\n" + "ldr x4, [x12]\n" + "ldr x5, [x12, #8]\n" + "ldr x6, [x12, #16]\n" + "ldr x7, [x12, #24]\n" + "ldr x8, [x12, #32]\n" + "ldr x9, [x12, #40]\n" + "ldr x10, [x12, #48]\n" + "ldr x11, [x12, #56]\n" + "ST64B x4, [x13]\n" + : + : "r" (src), "r" (dst) + : "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13" + ); + return EOK; +#else + memcpy(dst, src, COPY_ALIGNED_DATA_BYTES); + return EOK; +#endif +} + +#define SEC_TO_NSEC 1000000000 +#define MSEC_TO_NSEC 1000000 +#define USEC_TO_NSEC 1000 +#define MSEC_TO_SEC 1000 +#define MAX_IP_PORT_STR_LEN 23 +#define DECIMAL_BASE 10 + +static inline uint64_t GetCurNanoSeconds(void) { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + uint64_t timestamp = (uint64_t)ts.tv_sec * SEC_TO_NSEC + (uint64_t)ts.tv_nsec; + return timestamp; +} + +#define FREE_PTR(ptr) \ + do { \ + if ((ptr) != NULL) { \ + free(ptr); \ + (ptr) = NULL; \ + } \ + } while (0) + +typedef enum { + UBRING_OK = 0, + UBRING_ERR = -1, + UBRING_RETRY = -2, + UBRING_REENTRY = -3, + UBRING_ERR_TIMEOUT = -4, + SHM_ERR = -100, + SHM_ERR_INPUT_INVALID = -101, + SHM_ERR_EXIST = -102, + SHM_ERR_RESOURCE_ATTACHED = -103, + SHM_ERR_NOT_FOUND = -104, + SHM_ERR_UBSM_NET_ERR = -105, + 
MPA_UDP_ERR = -200, + MPA_UDP_NO_TRX = -201, + MPA_UDP_STATUS_NOT_JOINED = -202, + MPA_MUXER_NOT_READY = -203, + MPA_PORT_FULL = -204, + MPA_PORT_OUTRANGE = -205, + MPA_PORT_TAKEN = -206, + MPA_UDP_STATUS_NOT_CONNECTED = -207, + MPA_UDP_STATUS_ALREADY_CONNECTED = -208, + MPA_UDP_OLD_RDLIST = -209, + MPA_UDP_RDLIST_FULL = -210, + UBR_NOT_CONNECTED = -300, + UBR_ERR_ADDR_IN_USE = -301, +} RETURN_CODE; + +#define ALIGN_BYTES 0x40 +#define CHECKED_ALIGN_BITS (ALIGN_BYTES - 1) + +static inline size_t Aligned64Offset(uint8_t *addr) { + return ((ALIGN_BYTES - (((size_t)(addr)) & CHECKED_ALIGN_BITS)) & CHECKED_ALIGN_BITS); +} + +static inline RETURN_CODE HasTimedOut(const uint64_t startTime, const uint32_t timeout) { + uint64_t endTime = startTime + (uint64_t)timeout * SEC_TO_NSEC; + if (GetCurNanoSeconds() > endTime) { + LOG(ERROR) << "task time out " << timeout << " seconds."; + return UBRING_ERR; + } + return UBRING_OK; +} + +#endif // BRPC_COMMON_H \ No newline at end of file diff --git a/src/brpc/ubshm/common/thread_lock.h b/src/brpc/ubshm/common/thread_lock.h new file mode 100644 index 0000000000..8c07ce360d --- /dev/null +++ b/src/brpc/ubshm/common/thread_lock.h @@ -0,0 +1,118 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef BRPC_THREAD_LOCK_H
+#define BRPC_THREAD_LOCK_H
+#include <pthread.h>
+#include <semaphore.h>
+#include <stddef.h>
+#include "brpc/ubshm/common/common.h"
+
+// Scope guards built on the GCC/Clang `cleanup` attribute and statement
+// expressions. Each *_GUARD macro acquires the primitive and declares a
+// local whose cleanup handler releases it when the enclosing scope exits.
+// The local's name is fixed per macro, so a given guard can appear only
+// once per scope.
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Cleanup handler for LOCK_GUARD: unlocks the mutex held in *mtx.
+static inline void UnlockMutex(pthread_mutex_t **mtx)
+{
+    if (LIKELY(mtx != NULL && *mtx != NULL)) {
+        pthread_mutex_unlock(*mtx);
+    } else {
+        LOG(ERROR) << "Invalid input for mtx.";
+    }
+}
+
+// Locks `mtxPtr` (a pthread_mutex_t lvalue, despite the name) until the
+// end of the current scope.
+#define LOCK_GUARD(mtxPtr) \
+    pthread_mutex_t *__attribute__((cleanup(UnlockMutex))) _mtxPtr = ({ \
+        pthread_mutex_lock(&(mtxPtr)); \
+        &(mtxPtr); \
+    })
+
+// Cleanup handler for SPIN_LOCK_GUARD: unlocks the spinlock held in *spinLock.
+static inline void UnlockSpinLock(pthread_spinlock_t **spinLock)
+{
+    if (LIKELY(spinLock != NULL && *spinLock != NULL)) {
+        pthread_spin_unlock(*spinLock);
+    } else {
+        LOG(ERROR) << "Invalid input for spinLock.";
+    }
+}
+
+// Locks `spinLockPtr` (a pthread_spinlock_t lvalue) until the end of the
+// current scope.
+#define SPIN_LOCK_GUARD(spinLockPtr) \
+    pthread_spinlock_t *__attribute__((cleanup(UnlockSpinLock))) _spinLockPtr = ({ \
+        pthread_spin_lock(&(spinLockPtr)); \
+        &(spinLockPtr); \
+    })
+
+// Cleanup handler shared by R_LOCK_GUARD and W_LOCK_GUARD.
+static inline void UnlockRWLock(pthread_rwlock_t **rwLock)
+{
+    if (LIKELY(rwLock != NULL && *rwLock != NULL)) {
+        pthread_rwlock_unlock(*rwLock);
+    } else {
+        LOG(ERROR) << "Invalid input for rwLock.";
+    }
+}
+
+// Read-locks `readLockPtr` (a pthread_rwlock_t lvalue) until scope exit.
+#define R_LOCK_GUARD(readLockPtr) \
+    pthread_rwlock_t *__attribute__((cleanup(UnlockRWLock))) _readLockPtr = ({ \
+        pthread_rwlock_rdlock(&(readLockPtr)); \
+        &(readLockPtr); \
+    })
+
+// Write-locks `writeLockPtr` (a pthread_rwlock_t lvalue) until scope exit.
+#define W_LOCK_GUARD(writeLockPtr) \
+    pthread_rwlock_t *__attribute__((cleanup(UnlockRWLock))) _writeLockPtr = ({ \
+        pthread_rwlock_wrlock(&(writeLockPtr)); \
+        &(writeLockPtr); \
+    })
+
+// Cleanup handler for SEMAPHORE_WAIT_GUARD_WITH_CLOSE: posts and closes the
+// semaphore, then clears the guard variable.
+static inline void PostSemWithClose(sem_t **sem)
+{
+    if (LIKELY(sem != NULL && *sem != NULL)) {
+        sem_post(*sem);
+        sem_close(*sem);
+        *sem = NULL;
+    } else {
+        LOG(ERROR) << "Invalid input for semaphore.";
+    }
+}
+
+// Cleanup handler for SEMAPHORE_WAIT_GUARD: posts the semaphore.
+static inline void PostSem(sem_t **sem)
+{
+    if (LIKELY(sem != NULL && *sem != NULL)) {
+        sem_post(*sem);
+    } else {
+        LOG(ERROR) << "Invalid input for semaphore.";
+    }
+}
+
+// Waits on `semPtr` (a sem_t*); posts and closes it at scope exit.
+#define SEMAPHORE_WAIT_GUARD_WITH_CLOSE(semPtr) \
+    sem_t *__attribute__((cleanup(PostSemWithClose))) _semPtr = ({ \
+        sem_wait(semPtr); \
+        semPtr; \
+    })
+
+// Waits on `semPtr` (a sem_t*); posts it at scope exit.
+#define SEMAPHORE_WAIT_GUARD(semPtr) \
+    sem_t *__attribute__((cleanup(PostSem))) _semPtr = ({ \
+        sem_wait(semPtr); \
+        semPtr; \
+    })
+
+#ifdef __cplusplus
+}
+#endif
+#endif //BRPC_THREAD_LOCK_H
\ No newline at end of file
diff --git a/src/brpc/ubshm/shm/shm_def.h b/src/brpc/ubshm/shm/shm_def.h
new file mode 100644
index 0000000000..0c28084b96
--- /dev/null
+++ b/src/brpc/ubshm/shm/shm_def.h
@@ -0,0 +1,63 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef BRPC_SHM_DEF_H
+#define BRPC_SHM_DEF_H
+#include <pthread.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <sys/mman.h>
+
+// <sys/mman.h> is the authoritative source of the PROT_*/MAP_* values.
+// The definitions below are fallbacks only: redefining them
+// unconditionally is a macro redefinition on platforms whose headers
+// spell the values differently.
+#ifndef PROT_READ
+#define PROT_READ 0x1 /* Page can be read. */
+#endif
+#ifndef PROT_WRITE
+#define PROT_WRITE 0x2 /* Page can be written. */
+#endif
+#ifndef PROT_EXEC
+#define PROT_EXEC 0x4 /* Page can be executed. */
+#endif
+#ifndef PROT_NONE
+#define PROT_NONE 0x0 /* Page can not be accessed. */
+#endif
+#ifndef PROT_GROWSDOWN
+#define PROT_GROWSDOWN 0x01000000 /* Extend change to start of growsdown vma (mprotect only). */
+#endif
+#ifndef PROT_GROWSUP
+#define PROT_GROWSUP 0x02000000 /* Extend change to start of growsup vma (mprotect only). */
+#endif
+/* Sharing types (must choose one and only one of these). */
+#ifndef MAP_SHARED
+#define MAP_SHARED 0x01 /* Share changes. */
+#endif
+#ifndef MAP_PRIVATE
+#define MAP_PRIVATE 0x02 /* Changes are private. */
+#endif
+#define SHM_MAX_NAME_BUFF_LEN 48 // byte, buffer size, ubsm_sdk need name to be below 48byte
+#define SHM_MAX_NAME_LEN (SHM_MAX_NAME_BUFF_LEN - 1) // byte, string length
+#define SHM_ALLOC_UNIT_SIZE (4 * 1024 * 1024) // 4MB
+
+namespace brpc {
+namespace ubring {
+// Shared-memory backend selector.
+typedef enum { SHM_TYPE_UB, SHM_TYPE_IPC, SHM_TYPE_UBS, SHM_TYPE_UNSUPPORT } SHM_TYPE;
+
+// Descriptor of one shared-memory segment.
+typedef struct {
+    uint8_t *addr;                      // mapped address
+    size_t len;                         // segment length in bytes
+    uint64_t memid;                     // NOTE(review): meaning not visible here - confirm
+    char name[SHM_MAX_NAME_BUFF_LEN];   // segment name, at most SHM_MAX_NAME_LEN chars
+    uint32_t fd;                        // NOTE(review): unsigned, so it cannot hold -1 - confirm intent
+} SHM;
+
+// Node of a doubly linked list of segments.
+typedef struct ShmListNode {
+    SHM shm;
+    struct ShmListNode *next;
+    struct ShmListNode *prev;
+} ShmListNode;
+
+// Doubly linked list of segments with its own mutex.
+typedef struct {
+    ShmListNode* head;
+    ShmListNode* tail;
+    size_t size;
+    pthread_mutex_t shmLock;
+} ShmList;
+}
+}
+#endif //BRPC_SHM_DEF_H
\ No newline at end of file
diff --git a/src/brpc/ubshm/shm/shm_ipc.cpp b/src/brpc/ubshm/shm/shm_ipc.cpp
new file mode 100644
index 0000000000..7e934c7568
--- /dev/null
+++ b/src/brpc/ubshm/shm/shm_ipc.cpp
@@ -0,0 +1,189 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include "brpc/ubshm/common/common.h"
+#include "brpc/ubshm/shm/shm_def.h"
+#include "brpc/ubshm/shm/shm_ipc.h"
+
+namespace brpc {
+namespace ubring {
+// Creates the POSIX shm object `shm->name` (which must not exist yet),
+// sizes it to `shm->len` and maps it read/write into `shm->addr`.
+// Returns UBRING_OK, SHM_ERR_EXIST if the name is taken, or SHM_ERR.
+// On failure after creation the object is unlinked again.
+RETURN_CODE IpcShmLocalMalloc(SHM *shm)
+{
+    int fd = shm_open(shm->name, O_CREAT | O_EXCL | O_RDWR, SHM_IPC_MODE);
+    if (fd < 0) {
+        // Capture errno before logging can overwrite it.
+        const int err = errno;
+        if (err == EEXIST) {
+            LOG(ERROR) << "IPC Create shm=" << shm->name << " failed, shm exists.";
+            return SHM_ERR_EXIST;
+        }
+
+        LOG(ERROR) << "IPC Open shm=" << shm->name << " failed, ret(" << err << ").";
+        return SHM_ERR;
+    }
+
+    if (ftruncate(fd, (off_t)shm->len) < 0) {
+        const int err = errno;
+        LOG(ERROR) << "IPC Set shm=" << shm->name << " length=" << shm->len << " failed, ret(" << err << ").";
+        close(fd);
+        shm_unlink(shm->name);
+        return SHM_ERR;
+    }
+
+    void *mapped = mmap(NULL, shm->len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+    if (mapped == MAP_FAILED) {
+        const int err = errno;
+        // Never leave MAP_FAILED in the descriptor: the free/unmap paths
+        // only recognise NULL as "not mapped".
+        shm->addr = NULL;
+        LOG(ERROR) << "IPC map shm=" << shm->name << " length=" << shm->len << " failed, ret(" << err << ").";
+        close(fd);
+        shm_unlink(shm->name);
+        return SHM_ERR;
+    }
+
+    shm->addr = (uint8_t*)mapped;
+    // The mapping stays valid after the descriptor is closed.
+    close(fd);
+    return UBRING_OK;
+}
+
+// Unmaps `shm->addr` without unlinking the name. A NULL `addr` is treated
+// as already unmapped. On success `addr` is reset to NULL so that a second
+// call cannot unmap a stale address.
+RETURN_CODE IpcShmMunmap(SHM *shm)
+{
+    if (shm->addr == NULL) {
+        LOG(INFO) << "IPC unmap shm=" << shm->name << " already unmapped.";
+        return UBRING_OK;
+    }
+
+    if (munmap(shm->addr, shm->len) != 0) {
+        const int err = errno;
+        LOG(ERROR) << "IPC unmap shm=" << shm->name << " failed, errno=" << err;
+        return SHM_ERR;
+    }
+
+    shm->addr = NULL;
+    LOG(INFO) << "IPC unmap shm=" << shm->name << " length=" << shm->len << " success.";
+    return UBRING_OK;
+}
+
+// Unlinks the name `shm->name` without touching the mapping.
+// Returns UBRING_OK, SHM_ERR_RESOURCE_ATTACHED (EBUSY),
+// SHM_ERR_NOT_FOUND (ENOENT, `addr` is cleared) or SHM_ERR.
+RETURN_CODE IpcShmFree(SHM *shm)
+{
+    if (shm_unlink(shm->name) != 0) {
+        const int err = errno;
+        if (err == EBUSY) {
+            LOG_EVERY_SECOND(ERROR) << "IPC free shm=" << shm->name << " failed, errno=" << err;
+            return SHM_ERR_RESOURCE_ATTACHED;
+        }
+        if (err == ENOENT) {
+            LOG(INFO) << "IPC free shm=" << shm->name << " already deleted.";
+            shm->addr = NULL;
+            return SHM_ERR_NOT_FOUND;
+        }
+        LOG_EVERY_SECOND(ERROR) << "IPC free shm=" << shm->name << " failed, errno=" << err;
+        return SHM_ERR;
+    }
+    return UBRING_OK;
+}
+
+// Unmaps and unlinks a locally created segment. A failing munmap is only
+// logged; the unlink is attempted regardless. `addr` is cleared once the
+// name is gone (unlinked here or already removed by the peer).
+RETURN_CODE IpcShmLocalFree(SHM *shm)
+{
+    if (shm->addr == NULL) {
+        LOG(INFO) << "IPC free local shm=" << shm->name << " already freed.";
+        return SHM_ERR_NOT_FOUND;
+    }
+
+    if (munmap(shm->addr, shm->len) != 0) {
+        const int err = errno;
+        LOG(WARNING) << "IPC unmap shm=" << shm->name << " failed, ret=" << err;
+    }
+
+    if (shm_unlink(shm->name) != 0) {
+        const int err = errno;
+        if (err == EBUSY) {
+            LOG_EVERY_SECOND(ERROR) << "IPC delete shm=" << shm->name << " failed, ret=" << err;
+            return SHM_ERR_RESOURCE_ATTACHED;
+        }
+        if (err == ENOENT) {
+            LOG(INFO) << "IPC delete shm=" << shm->name << " already deleted by peer.";
+            shm->addr = NULL;
+            return SHM_ERR_NOT_FOUND;
+        }
+        LOG_EVERY_SECOND(ERROR) << "IPC delete shm=" << shm->name << " failed, ret=" << err;
+        return SHM_ERR;
+    }
+    shm->addr = NULL;
+    LOG(INFO) << "IPC free local shm=" << shm->name << " success.";
+    return UBRING_OK;
+}
+
+// Opens the existing shm object `shm->name` and maps `shm->len` bytes of it
+// with protection `prot` into `shm->addr`. On failure `addr` is NULL.
+static RETURN_CODE MapExistingShm(SHM *shm, int prot)
+{
+    int fd = shm_open(shm->name, O_RDWR, SHM_IPC_MODE);
+    if (fd < 0) {
+        const int err = errno;
+        LOG(ERROR) << "IPC open shm=" << shm->name << " failed, ret=" << err;
+        return SHM_ERR;
+    }
+
+    void *mapped = mmap(NULL, shm->len, prot, MAP_SHARED, fd, 0);
+    if (mapped == MAP_FAILED) {
+        const int err = errno;
+        shm->addr = NULL;
+        LOG(ERROR) << "IPC map shm=" << shm->name << " failed, ret=" << err;
+        close(fd);
+        return SHM_ERR;
+    }
+
+    shm->addr = (uint8_t*)mapped;
+    close(fd);
+    return UBRING_OK;
+}
+
+// Attaches read/write to a segment that already exists under `shm->name`.
+RETURN_CODE IpcShmRemoteMalloc(SHM *shm)
+{
+    return MapExistingShm(shm, PROT_READ | PROT_WRITE);
+}
+
+// Maps an existing segment with the caller-chosen protection `prot`.
+RETURN_CODE IpcShmLocalMmap(SHM *shm, int prot)
+{
+    RETURN_CODE rc = MapExistingShm(shm, prot);
+    if (rc != UBRING_OK) {
+        return rc;
+    }
+
+    LOG(INFO) << "IPC mmap remote shm=" << shm->name << " length=" << shm->len << " success.";
+    return UBRING_OK;
+}
+
+// Detaches from a segment attached with IpcShmRemoteMalloc; the name is
+// left in place. A NULL `addr` is treated as already freed. On success
+// `addr` is reset to NULL.
+RETURN_CODE IpcShmRemoteFree(SHM *shm)
+{
+    if (shm->addr == NULL) {
+        LOG(INFO) << "IPC free remote shm=" << shm->name << " already freed.";
+        return UBRING_OK;
+    }
+
+    if (munmap(shm->addr, shm->len) != 0) {
+        const int err = errno;
+        LOG(ERROR) << "IPC unmap shm=" << shm->name << " failed, ret=" << err;
+        return SHM_ERR;
+    }
+
+    shm->addr = NULL;
+    LOG(INFO) << "IPC free remote shm=" << shm->name << " success.";
+    return UBRING_OK;
+}
+}
+}
diff --git a/src/brpc/ubshm/shm/shm_ipc.h b/src/brpc/ubshm/shm/shm_ipc.h
new file mode 100644
index 0000000000..34e8307bb8
--- /dev/null
+++ b/src/brpc/ubshm/shm/shm_ipc.h
@@ -0,0 +1,37 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.

#ifndef BRPC_SHM_IPC_H
#define BRPC_SHM_IPC_H

#include "shm_def.h"

// Permission bits passed as the mode argument of shm_open() by the IPC backend.
#define SHM_IPC_MODE 0666

// POSIX shared-memory ("IPC") backend. Each function works on the
// name/len/addr fields of the SHM descriptor it is given.
namespace brpc {
namespace ubring {
    // NOTE(review): implementation not visible in this chunk; presumably
    // creates and maps a new object - confirm against shm_ipc.cpp.
    RETURN_CODE IpcShmLocalMalloc(SHM *shm);
    // NOTE(review): implementation not visible in this chunk.
    RETURN_CODE IpcShmMunmap(SHM *shm);
    // NOTE(review): only the tail is visible here; it returns SHM_ERR on an
    // unrecognised errno and UBRING_OK on success.
    RETURN_CODE IpcShmFree(SHM *shm);
    // munmap()s shm->addr and shm_unlink()s shm->name.
    RETURN_CODE IpcShmLocalFree(SHM *shm);
    // shm_open()s an existing object and maps it PROT_READ | PROT_WRITE.
    RETURN_CODE IpcShmRemoteMalloc(SHM *shm);
    // munmap()s shm->addr without unlinking the object.
    RETURN_CODE IpcShmRemoteFree(SHM *shm);
    // Same as IpcShmRemoteMalloc but with caller-supplied mmap protection.
    RETURN_CODE IpcShmLocalMmap(SHM *shm, int prot);
}
}

#endif //BRPC_SHM_IPC_H
\ No newline at end of file
diff --git a/src/brpc/ubshm/shm/shm_mgr.cpp b/src/brpc/ubshm/shm/shm_mgr.cpp
new file mode 100644
index 0000000000..3f819857b2
--- /dev/null
+++ b/src/brpc/ubshm/shm/shm_mgr.cpp
@@ -0,0 +1,247 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
+ +#include +#include +#include +#include +#include +#include "brpc/ubshm/common/common.h" +#include "brpc/ubshm/shm/shm_ipc.h" +#include "brpc/ubshm/shm/shm_ubs.h" +#include "brpc/ubshm/shm/shm_mgr.h" + +namespace brpc { +namespace ubring { +DEFINE_int32(ub_shm_type, 1, "shm type: 1-ipc; 2-ub_ring"); +static SHM_TYPE g_shmType; + +static bool CheckInputShmParam(SHM *shm) { + if (shm == NULL) { + LOG(ERROR) << "Input Param shm is NULL."; + return false; + } + + size_t nameLen = strlen(shm->name); + if (nameLen <= 0 || nameLen > SHM_MAX_NAME_LEN) { + LOG(ERROR) << "Shm name=" << shm->name << ", length=" << shm->len + << ", which is not between 1 and " << SHM_MAX_NAME_LEN; + return false; + } + + if (shm->len <= 0) { + LOG(ERROR) << "Shm length=" << shm->len << " is invalid."; + return false; + } + + if (shm->len < SHM_ALLOC_UNIT_SIZE || (shm->len & (SHM_ALLOC_UNIT_SIZE - 1)) != 0) { + LOG(ERROR) << "Shm length=" << shm->len << " need to be (1..n) * 4MB."; + return false; + } + + return true; +} + +RETURN_CODE ShmMgrInit(void) { + if (UNLIKELY(FLAGS_ub_shm_type >= (int32_t)SHM_TYPE_UNSUPPORT || FLAGS_ub_shm_type <= (int32_t)SHM_TYPE_UB)) { + LOG(ERROR) << "Shm type config=" << FLAGS_ub_shm_type << " is not supported."; + return UBRING_ERR; + } + + g_shmType = (SHM_TYPE)FLAGS_ub_shm_type; + if (g_shmType == SHM_TYPE_UBS) { + if (UbsShmInit() != UBRING_OK) { + LOG(ERROR) << "Init beiming ubs shm failed."; + return UBRING_ERR; + } + } + LOG(INFO) << "shm mgr init success, shm type=" << g_shmType; + return UBRING_OK; +} + +void ShmMgrFini(void) { + if (g_shmType == SHM_TYPE_UBS) { + if (UbsShmFini() != UBRING_OK) { + LOG(ERROR) << "Fini beiming ubs shm failed."; + return; + } + } + LOG(INFO) << "shm mgr fini success, shm type=" << g_shmType; +} + +void SetShmType(SHM_TYPE type) { + g_shmType = type; +} + +RETURN_CODE ShmLocalMalloc(SHM *shm) { + if (UNLIKELY(!CheckInputShmParam(shm))) { + LOG(ERROR) << "Input param shm is invalid."; + return SHM_ERR_INPUT_INVALID; + } + 
+ RETURN_CODE rc = UBRING_OK; + switch (g_shmType) { + case SHM_TYPE_IPC: + rc = IpcShmLocalMalloc(shm); + break; + case SHM_TYPE_UBS: + rc = UbsShmLocalMalloc(shm); + break; + default: + rc = SHM_ERR; + LOG(ERROR) << "Unsupported shm type."; + } + return rc; +} + +RETURN_CODE ShmLocalCalloc(SHM *shm) { + RETURN_CODE rc = ShmLocalMalloc(shm); + if (UNLIKELY(rc != UBRING_OK)) { + LOG(ERROR) << "Failed to alloc local shm."; + return rc; + } + memset(shm->addr, 0, shm->len); + return UBRING_OK; +} + +RETURN_CODE ShmLocalFree(SHM *shm) { + if (UNLIKELY(!CheckInputShmParam(shm))) { + LOG(ERROR) << "Input param shm is invalid."; + return SHM_ERR_INPUT_INVALID; + } + + RETURN_CODE rc = UBRING_OK; + switch (g_shmType) { + case SHM_TYPE_IPC: + rc = IpcShmLocalFree(shm); + break; + case SHM_TYPE_UBS: + rc = UbsShmLocalFree(shm); + break; + default: + rc = SHM_ERR; + LOG(ERROR) << "Unsupported shm type."; + } + return rc; +} + +RETURN_CODE ShmRemoteMalloc(SHM *shm) { + if (UNLIKELY(!CheckInputShmParam(shm))) { + LOG(ERROR) << "Input param shm is invalid."; + return SHM_ERR_INPUT_INVALID; + } + + RETURN_CODE rc = UBRING_OK; + switch (g_shmType) { + case SHM_TYPE_IPC: + rc = IpcShmRemoteMalloc(shm); + break; + case SHM_TYPE_UBS: + rc = UbsShmRemoteMalloc(shm); + break; + default: + rc = SHM_ERR; + LOG(ERROR) << "Unsupported shm type."; + } + return rc; +} + +RETURN_CODE ShmRemoteFree(SHM *shm) { + if (UNLIKELY(!CheckInputShmParam(shm))) { + LOG(ERROR) << "Input param shm is invalid."; + return SHM_ERR_INPUT_INVALID; + } + + RETURN_CODE rc = UBRING_OK; + switch (g_shmType) { + case SHM_TYPE_IPC: + rc = IpcShmRemoteFree(shm); + break; + case SHM_TYPE_UBS: + rc = UbsShmRemoteFree(shm); + break; + default: + rc = SHM_ERR; + LOG(ERROR) << "Unsupported shm type."; + } + return rc; +} + +RETURN_CODE ShmLocalMmap(SHM *shm, int prot) { + if (UNLIKELY(!CheckInputShmParam(shm))) { + LOG(ERROR) << "Input param shm is invalid."; + return SHM_ERR_INPUT_INVALID; + } + + RETURN_CODE rc = 
UBRING_OK; + switch (g_shmType) { + case SHM_TYPE_IPC: + rc = IpcShmLocalMmap(shm, prot); + break; + case SHM_TYPE_UBS: + rc = UbsShmLocalMmap(shm, prot); + break; + default: + rc = SHM_ERR; + LOG(ERROR) << "Unsupported shm type."; + } + return rc; +} + +RETURN_CODE ShmMunmap(SHM *shm) { + if (UNLIKELY(!CheckInputShmParam(shm))) { + LOG(ERROR) << "Input param shm is invalid."; + return SHM_ERR_INPUT_INVALID; + } + + RETURN_CODE rc = UBRING_OK; + switch (g_shmType) { + case SHM_TYPE_IPC: + rc = IpcShmMunmap(shm); + break; + case SHM_TYPE_UBS: + rc = UbsShmMunmap(shm); + break; + default: + rc = SHM_ERR; + LOG(ERROR) << "Unsupported shm type."; + } + return rc; +} + +RETURN_CODE ShmFree(SHM *shm) { + if (UNLIKELY(!CheckInputShmParam(shm))) { + LOG(ERROR) << "Input param shm is invalid."; + return SHM_ERR_INPUT_INVALID; + } + + RETURN_CODE rc = UBRING_OK; + switch (g_shmType) { + case SHM_TYPE_IPC: + rc = IpcShmFree(shm); + break; + case SHM_TYPE_UBS: + rc = UbsShmFree(shm); + break; + default: + rc = SHM_ERR; + LOG(ERROR) << "Unsupported shm type."; + } + return rc; +} +} +} \ No newline at end of file diff --git a/src/brpc/ubshm/shm/shm_mgr.h b/src/brpc/ubshm/shm/shm_mgr.h new file mode 100644 index 0000000000..597f5e4ba5 --- /dev/null +++ b/src/brpc/ubshm/shm/shm_mgr.h @@ -0,0 +1,51 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#ifndef BRPC_SHM_MGR_H
#define BRPC_SHM_MGR_H

// NOTE(review): the system header name below was lost in extraction.
#include
#include "brpc/ubshm/common/common.h"
#include "brpc/ubshm/shm/shm_def.h"

// Backend-independent shared-memory API. Every call except SetShmType,
// ShmMgrInit and ShmMgrFini validates `shm` and then dispatches to the IPC or
// UBS implementation chosen at init time.
namespace brpc {
namespace ubring {
// Overrides the active backend without running any initialisation.
void SetShmType(SHM_TYPE type);

// Selects the backend from --ub_shm_type and initialises UBS when selected.
RETURN_CODE ShmMgrInit(void);

// Finalises UBS when it is the active backend.
void ShmMgrFini(void);

// Allocates and maps a new local segment.
RETURN_CODE ShmLocalMalloc(SHM *shm);

// ShmLocalMalloc() followed by zero-filling shm->len bytes at shm->addr.
RETURN_CODE ShmLocalCalloc(SHM *shm);

// Unmaps and deletes a locally created segment.
RETURN_CODE ShmLocalFree(SHM *shm);

// Maps a segment created by the peer.
RETURN_CODE ShmRemoteMalloc(SHM *shm);

// Unmaps a segment mapped with ShmRemoteMalloc().
RETURN_CODE ShmRemoteFree(SHM *shm);

// Maps an existing segment using the given mmap protection flags.
RETURN_CODE ShmLocalMmap(SHM *shm, int prot);

// Unmaps a segment without deleting it.
RETURN_CODE ShmMunmap(SHM *shm);

// Deletes a segment.
RETURN_CODE ShmFree(SHM *shm);
}
}

#endif //BRPC_SHM_MGR_H
\ No newline at end of file
diff --git a/src/brpc/ubshm/shm/shm_ubs.cpp b/src/brpc/ubshm/shm/shm_ubs.cpp
new file mode 100644
index 0000000000..537d8e91aa
--- /dev/null
+++ b/src/brpc/ubshm/shm/shm_ubs.cpp
@@ -0,0 +1,565 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.
See the License for the +// specific language governing permissions and limitations +// under the License. + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include "brpc/ubshm/timer/timer_mgr.h" +#include "brpc/ubshm/common/thread_lock.h" +#include "brpc/ubshm/common/common.h" +#include "brpc/ubshm/shm/shm_def.h" +#include "brpc/ubshm/ub_ring_manager.h" +#include "brpc/ubshm/ubs_mem/ubs_mem.h" +#include "brpc/ubshm/ubs_mem/ubs_mem_def.h" +#ifdef UT +#include "ubs_mem.h" +#endif +#include "shm_ubs.h" + +namespace brpc { +namespace ubring { +#define UBRING_MK_UBSM(ret, fn, args) ret (*fn) args = NULL +#include "brpc/ubshm/ubs_mem/declare_shm_ubs.h" +#define SHM_RIGHT_MODE 0666 +#define UBRING_REGION_NAME_PREFIX "UbrONE2ALLRegion" +DEFINE_uint32(node_location, 1, "Location of the ub machine."); +DEFINE_bool(shm_wr_delay_comp, true, "Indicates whether to enable the write relay." + "0: relay; 1: non-relay."); +DEFINE_int32(ub_flying_io_timeout, 5, "Waiting time for stopping data" + "sending and receiving when the link is disconnected."); +char g_regionName[MAX_REGION_NAME_DESC_LENGTH] = {0}; +int g_shmTimerFd = 0; +ShmList *g_shmList = NULL; +static RETURN_CODE UbsShmInterfacesLoad(void); +char hostname[MAX_HOST_NAME_DESC_LENGTH]; + +RETURN_CODE UbsShmInterfacesLoad(void) +{ +#ifndef UT + const char *ubsmSdkLocation = "/usr/local/ubs_mem/lib/libubsm_sdk.so"; +#if defined(OS_LINUX) + void* dlhandler = dlmopen(LM_ID_NEWLM, ubsmSdkLocation, RTLD_NOW | RTLD_LOCAL | RTLD_NODELETE | RTLD_DEEPBIND); +#elif defined(OS_MACOSX) + void* dlhandler = dlopen(ubsmSdkLocation, RTLD_NOW | RTLD_LOCAL | RTLD_NODELETE); +#endif + if (dlhandler == NULL) { + LOG(ERROR) << "Dlopen libubsm_sdk.so in " << ubsmSdkLocation << " failed, error:" << dlerror(); + return UBRING_ERR; + } + +#define UBRING_MK_UBSM_OPTIONAL(ret, fn, args) \ + do { \ + fn = (decltype(fn))dlsym(dlhandler, #fn); \ + } while (0) + +#define UBRING_MK_UBSM(ret, fn, args) \ + do { \ + if 
((fn) != NULL) { \ + break; \ + } \ + UBRING_MK_UBSM_OPTIONAL(ret, fn, args); \ + if ((fn) == NULL) { \ + LOG(ERROR) << "Fail load ubs_mem func " << #fn <<" error:" << dlerror(); \ + return UBRING_ERR; \ + } \ + } while (0) +#include "brpc/ubshm/ubs_mem/declare_shm_ubs.h" + + dlclose(dlhandler); + dlhandler = NULL; +#endif + return UBRING_OK; +} + +static RETURN_CODE CreateUbsShmRegion(const char *regionName) +{ + int ret = snprintf(g_regionName, MAX_REGION_NAME_DESC_LENGTH, "%s_%u", + UBRING_REGION_NAME_PREFIX, FLAGS_node_location); + if (ret < 0) { + LOG(ERROR) << "Snprintf_s region name failed, ret=" << ret; + return UBRING_ERR; + } + + ubsmem_regions_t regions = {0}; // 16 * (48 + 1) bytes, 约0.8k + ret = ubsmem_lookup_regions(®ions); + if (ret != UBSM_OK || regions.region[0].host_num <= 0) { + LOG(ERROR) << "Ubs lookup share region failed, ret=" << ret << ", region.num=" << regions.region[0].host_num; + return UBRING_ERR; + } + ubsmem_region_attributes_t regionAttr = {0}; + regionAttr.host_num = regions.region[0].host_num; + for (int i = 0; i < regionAttr.host_num; i++) { + strcpy(regionAttr.hosts[i].host_name, regions.region[0].hosts[i].host_name); + regionAttr.hosts[i].affinity = (strcmp(regionAttr.hosts[i].host_name, hostname) == 0) ? 
+ true : false; + } + + ret = ubsmem_create_region(regionName, 0, ®ionAttr); + if (ret == UBSM_ERR_ALREADY_EXIST) { + LOG(WARNING) << "Ubs region exists, region_name=" << regionName; + return UBRING_OK; + } else if (ret != UBSM_OK) { + LOG(ERROR) << "Ubsmem create region failed, ret=" << ret; + return UBRING_ERR; + } + + return UBRING_OK; +} + +static uint64_t AquireFlagIfWrDelayComp(const uint64_t flag) +{ + if (FLAGS_shm_wr_delay_comp == 0) { + return flag; + } + return flag | UBSM_FLAG_WR_DELAY_COMP; +} + +RETURN_CODE UbsShmLocalMalloc(SHM *shm) +{ + int ret = ubsmem_shmem_allocate(g_regionName, shm->name, shm->len, SHM_RIGHT_MODE, + AquireFlagIfWrDelayComp(UBSM_FLAG_ONLY_IMPORT_NONCACHE | UBSM_FLAG_MEM_ANONYMOUS)); +do { + if (ret == UBSM_ERR_ALREADY_EXIST) { + if (ubsmem_shmem_deallocate(shm->name) != UBSM_OK) { + LOG(ERROR) << "Ubs create shm name=" << shm->name << " failed, shm exists, ret=" << ret; + return SHM_ERR_EXIST; + } + LOG(INFO) << "Ubs delete shm name=" << shm->name << " success, try to recreate."; + ret = ubsmem_shmem_allocate(g_regionName, shm->name, shm->len, SHM_RIGHT_MODE, + AquireFlagIfWrDelayComp(UBSM_FLAG_ONLY_IMPORT_NONCACHE | UBSM_FLAG_MEM_ANONYMOUS)); + if (ret != UBSM_OK) { + LOG(ERROR) << "Ubs recreate shm name=" << shm->name << " failed, ret=" << ret; + return SHM_ERR; + } + } else if (ret != UBSM_OK) { + LOG(ERROR) << "Ubs create shm name=" << shm->name << " failed, ret=" << ret; + return SHM_ERR; + } +} while (0); + + ret = ubsmem_shmem_map(NULL, shm->len, PROT_READ | PROT_WRITE, MAP_SHARED, shm->name, 0, (void**)&(shm->addr)); + if (ret != UBSM_OK) { + LOG(ERROR) << "Ubs map shm=" << shm->name << " failed, ret=" << ret; + if (ret == UBSM_ERR_NOT_FOUND) { + return SHM_ERR_NOT_FOUND; + } + ubsmem_shmem_deallocate(shm->name); + return SHM_ERR; + } + + // 通过MXE获取memid + shm->memid = 1; // 暂时打桩 + LOG(INFO) << "Ubs malloc local shm=" << shm->name << " length=" << shm->len << " memid=" << shm->memid << " success."; + return UBRING_OK; +} 
+ +RETURN_CODE UbsShmMunmap(SHM *shm) +{ + // unmap + if (shm->addr == NULL) { + LOG(ERROR) << "Ubs input shm param is invalid, addr is NULL."; + return SHM_ERR_INPUT_INVALID; + } + + int ret = ubsmem_shmem_unmap(shm->addr, shm->len); + if (ret != UBSM_OK) { + if (ret == UBSM_ERR_NET) { + LOG(ERROR) << "Ubs unmap shm=" << shm->name << " failed, ubsm net err=" << ret; + AddShmToList(g_shmList, shm); + return SHM_ERR_UBSM_NET_ERR; + } + LOG(ERROR) << "Ubs unmap shm=" << shm->name << " length=" << shm->len << " failed, ret=" << ret; + return SHM_ERR; + } + + LOG(INFO) << "Ubs unmap shm=" << shm->name << " length=" << shm->len << " success."; + return UBRING_OK; +} + +RETURN_CODE UbsShmFree(SHM *shm) +{ + if (shm->addr == NULL) { + LOG(ERROR) << "Ubs input shm param is invalid, addr is NULL."; + return SHM_ERR_INPUT_INVALID; + } + + // free + int ret = ubsmem_shmem_deallocate(shm->name); + if (ret != UBSM_OK) { + if (ret == UBSM_ERR_IN_USING) { + LOG(INFO) << "Ubs free shm=" << shm->name << " failed, resource attached=" << ret; + return SHM_ERR_RESOURCE_ATTACHED; + } else if (ret == UBSM_ERR_NOT_FOUND) { + LOG(INFO) << "Ubs free shm=" << shm->name << " failed, resource not found=" << ret; + return SHM_ERR_NOT_FOUND; + } + LOG(ERROR) << "Ubs free shm="<< shm->name << " failed, ret=" << ret; + return SHM_ERR; + } + shm->addr = NULL; + LOG(INFO) << "Ubs free shm=" << shm->name << " length=" << shm->len << " success."; + return UBRING_OK; +} + +RETURN_CODE UbsShmLocalFree(SHM *shm) +{ + // unmap + if (shm->addr == NULL) { + LOG(ERROR) << "Ubs input shm param is invalid, addr is NULL."; + return SHM_ERR_INPUT_INVALID; + } + + int ret = ubsmem_shmem_unmap(shm->addr, shm->len); + if (ret != UBSM_OK) { + if (ret == UBSM_ERR_NET) { + LOG(ERROR) << "Ubs unmap shm=" << shm->name << " failed, ubsm net err=" << ret; + AddShmToList(g_shmList, shm); + return SHM_ERR_UBSM_NET_ERR; + } + LOG(WARNING) << "Ubs unmap shm=" << shm->name << " length=" << shm->len << " failed, ret=" << ret; 
+ } + + // free + ret = ubsmem_shmem_deallocate(shm->name); + if (ret != UBSM_OK) { + if (ret == UBSM_ERR_IN_USING) { + LOG_EVERY_SECOND(INFO) << "Ubs delete shm=" << shm->name << " failed, resource attached=" << ret; + return SHM_ERR_RESOURCE_ATTACHED; + } + LOG(ERROR) << "Ubs delete shm=" << shm->name << " failed, ret=" << ret; + return SHM_ERR; + } + shm->addr = NULL; + LOG(INFO) << "Ubs free local shm=" << shm->name << " length=" << shm->len << " success."; + return UBRING_OK; +} + +RETURN_CODE UbsShmRemoteMalloc(SHM *shm) +{ + int ret = ubsmem_shmem_map(NULL, shm->len, PROT_READ | PROT_WRITE, MAP_SHARED, shm->name, 0, (void**)&(shm->addr)); + if (ret != UBSM_OK) { + LOG(ERROR) << "Ubs map Shm=" << shm->name << " failed, ret=" << ret; + return SHM_ERR; + } + + LOG(INFO) << "Ubs malloc remote shm=" << shm->name << " length=" << shm->len << " success."; + return UBRING_OK; +} + +RETURN_CODE UbsShmLocalMmap(SHM *shm, int prot) +{ + int ret = ubsmem_shmem_map(NULL, shm->len, prot, MAP_SHARED, shm->name, 0, (void**)&(shm->addr)); + if (ret != UBSM_OK) { + LOG(ERROR) << "Ubs map Shm=" << shm->name << " failed, ret=" << ret; + return SHM_ERR; + } + + LOG(INFO) << "Ubs mmap remote shm=" << shm->name << " length=" << shm->len << " success."; + return UBRING_OK; +} + +RETURN_CODE UbsShmRemoteFree(SHM *shm) +{ + // unmap + if (shm->addr == NULL) { + LOG(ERROR) << "Ubs input shm param is invalid, addr is NULL."; + return SHM_ERR_INPUT_INVALID; + } + + int ret = ubsmem_shmem_unmap(shm->addr, shm->len); + if (ret != UBSM_OK) { + if (ret == UBSM_ERR_NET) { + LOG(ERROR) << "Ubs unmap shm=" << shm->name << " failed, ubsm net err=" << ret; + AddShmToList(g_shmList, shm); + return SHM_ERR_UBSM_NET_ERR; + } + LOG(ERROR) << "Ubs unmap shm=" << shm->name << " length=" << shm->len << " failed, ret=" << ret; + return SHM_ERR; + } + + LOG(INFO) << "Ubs free Remote shm=" << shm->name << " length=" << shm->len << " success."; + return UBRING_OK; +} + +void UbsMemLoggerPrint(int level, 
const char *msg) +{ + if (level == UBSM_LOG_ERROR_LEVEL) { + LOG(ERROR) << msg; + } else if (level == UBSM_LOG_WARN_LEVEL) { + LOG(WARNING) << msg; + } else { + LOG(INFO) << msg; + } + return; +} + +RETURN_CODE UbsShmInit(void) +{ + // 加载libubsm_sdk.so函数指针 + RETURN_CODE retCode = UbsShmInterfacesLoad(); + if (retCode != UBRING_OK) { + LOG(ERROR) << "Load ubs shm functions failed, ret=" << retCode; + return UBRING_ERR; + } + + if (gethostname(hostname, MAX_HOST_NAME_DESC_LENGTH) != 0) { + LOG(ERROR) << "ubring config gethostname failed, errno=" << errno; + return UBRING_ERR; + } + + int ret = ubsmem_set_extern_logger(UbsMemLoggerPrint); + if (ret != UBSM_OK) { + LOG(ERROR) << "Ubs set logger failed, ret=" << ret; + return UBRING_ERR; + } + + ret = ubsmem_set_logger_level(UBSM_LOG_INFO_LEVEL); + if (ret != UBSM_OK) { + LOG(ERROR) << "Ubs set logger level failed, ret=" << ret; + return UBRING_ERR; + } + + ubsmem_options_t options = {}; + ret = ubsmem_init_attributes(&options); + if (ret != UBSM_OK) { + LOG(ERROR) << "Ubs shm init attributes failed, ret=" << ret; + return UBRING_ERR; + } + + ret = ubsmem_initialize(&options); + if (ret != UBSM_OK) { + LOG(ERROR) << "Ubs shm initialize failed, ret=" << ret; + return UBRING_ERR; + } + + if (UNLIKELY(ubsmem_local_nid_query(&FLAGS_node_location) != UBSM_OK)) { + LOG(ERROR) << "Get local nid failed."; + return UBRING_ERR; + } + + if (UNLIKELY(ubsmem_shmem_faults_register(brpc::ubring::UBRingManager::UbEventCallback) != UBSM_OK)) { + LOG(ERROR) << "Failed to register the ub event callback function."; + return UBRING_ERR; + } + + if (CreateUbsShmRegion(g_regionName) != UBRING_OK) { + LOG(ERROR) << "Create Ubs region failed."; + return UBRING_ERR; + } + + if (InitShmTimer(&g_shmList) != UBRING_OK) { + LOG(ERROR) << "Ubs shm list init failed."; + return UBRING_ERR; + } + + LOG(INFO) << "Ubs shm init success."; + return UBRING_OK; +} + +RETURN_CODE UbsShmFini(void) +{ + int ret = ubsmem_finalize(); + if (ret != UBSM_OK) { + 
LOG(ERROR) << "Ubs shm finalize fail, ret=" << ret; + return UBRING_ERR; + } + + if (UNLIKELY(DestroyShmTimer(g_shmList) != UBRING_OK)) { + LOG(ERROR) << "Ubs shm list finalize failed."; + return UBRING_ERR; + } + + LOG(INFO) << "Ubs shm finalize success."; + return UBRING_OK; +} + +static void DeleteShmToList(ShmList* shmList) +{ + if (shmList == NULL || shmList->head == NULL) { + return; + } + + ShmListNode *curNode = shmList->head; + shmList->head = curNode->next; + if (shmList->head != NULL) { + shmList->head->prev = NULL; + } else { + shmList->tail = NULL; + } + LOG(INFO) << "Delete shm to list, name=" << curNode->shm.name << " size=" << shmList->size; + FREE_PTR(curNode); + shmList->size--; +} + +void *UbsShmCallback(void* args) +{ + ShmList *shmList = (ShmList*)args; + if (UNLIKELY(shmList == NULL)) { + LOG(ERROR) << "Shm list is null."; + return NULL; + } + + LOCK_GUARD(shmList->shmLock); + while (shmList->head != NULL) { + SHM shm = shmList->head->shm; + if (shm.addr == NULL) { + LOG(ERROR) << "Ubs input shm param is invalid, addr is NULL."; + return NULL; + } + + int ret = ubsmem_shmem_unmap(shm.addr, shm.len); + if (ret != UBSM_OK) { + if (ret == UBSM_ERR_NET) { + return NULL; + } + LOG(ERROR) << "Ubs unmap shm=" << shm.name << " length=" << shm.len << " failed, ret=" << ret; + return NULL; + } + LOG(INFO) << "Ubs unmap shm=" << shm.name << " length=" << shm.len << " success."; + + ret = ubsmem_shmem_deallocate(shm.name); + if (ret != UBSM_OK) { + DeleteShmToList(shmList); + LOG(ERROR) << "Ubs delete shm=" << shm.name << " failed, ret=" << ret; + return NULL; + } + DeleteShmToList(shmList); + LOG(INFO) << "Ubs free local shm=" << shm.name << " length=" << shm.len << " success."; + } + + return NULL; +} + +RETURN_CODE UbsShmAddTimer(ShmList *shmList) +{ + uint32_t timerInterval = FLAGS_ub_flying_io_timeout; + itimerspec timeSpec = { + .it_interval = {.tv_sec = timerInterval, .tv_nsec = 0}, + .it_value = {.tv_sec = 0, .tv_nsec = 1} + }; + int timerFd = 
TimerStart(&timeSpec, UbsShmCallback, (void*)shmList); + if (UNLIKELY(timerFd == -1)) { + LOG(ERROR) << "Start shm timer failed."; + return UBRING_ERR; + } + g_shmTimerFd = timerFd; + + return UBRING_OK; +} + +RETURN_CODE InitShmTimer(ShmList **shmList) +{ + *shmList = (ShmList *)malloc(sizeof(ShmList)); + if (*shmList == NULL) { + LOG(ERROR) << "Malloc shm list failed."; + return UBRING_ERR; + } + (*shmList)->head = NULL; + (*shmList)->tail = NULL; + (*shmList)->size = 0; + + if (pthread_mutex_init(&(*shmList)->shmLock, NULL) != 0) { + LOG(ERROR) << "Init shm list mutex failed."; + FREE_PTR(*shmList); + return UBRING_ERR; + } + + if (UbsShmAddTimer(*shmList) == UBRING_ERR) { + LOG(ERROR) << "Ubs add timer failed."; + FREE_PTR(*shmList); + return UBRING_ERR; + } + return UBRING_OK; +} + +RETURN_CODE DestroyShmTimer(ShmList *shmList) +{ + DeleteTimerSafe((uint32_t)g_shmTimerFd); + if (shmList == NULL) { + LOG(WARNING) << "Shm list is null."; + return UBRING_ERR; + } + ShmListNode* current = shmList->head; + ShmListNode* next; + + while (current != NULL) { + next = current->next; + free(current); + current = next; + } + pthread_mutex_destroy(&shmList->shmLock); + FREE_PTR(shmList); + return UBRING_OK; +} + +RETURN_CODE IsExistInShmList(ShmList *shmList, const SHM *shm) +{ + if (UNLIKELY(shmList == NULL || shm == NULL)) { + LOG(ERROR) << "Shm list or shm is null."; + return UBRING_ERR; + } + LOCK_GUARD(shmList->shmLock); + + ShmListNode *curNode = shmList->head; + while (curNode != NULL) { + if (strcmp(curNode->shm.name, shm->name) == 0 && curNode->shm.len == shm->len) { + return UBRING_OK; + } + curNode = curNode->next; + } + return UBRING_ERR; +} + +RETURN_CODE AddShmToList(ShmList *shmList, SHM *shm) +{ + if (shmList == NULL || shm == NULL) { + LOG(ERROR) << "Shm list or shm is null."; + return UBRING_ERR; + } + + if (IsExistInShmList(shmList, shm) == UBRING_OK) { + LOG(ERROR) << "Shm name=" << shm->name << " is exist in shm list."; + return UBRING_ERR; + } + + 
ShmListNode *newShmNode = (ShmListNode *)malloc(sizeof(ShmListNode)); + if (newShmNode == NULL) { + LOG(ERROR) << "Malloc shm node failed."; + return UBRING_ERR; + } + + memcpy(&newShmNode->shm, shm, sizeof(SHM)); + LOCK_GUARD(shmList->shmLock); + newShmNode->next = NULL; + newShmNode->prev = shmList->tail; + if (shmList->tail) { + shmList->tail->next = newShmNode; + shmList->tail = newShmNode; + } else { + shmList->head = newShmNode; + shmList->tail = newShmNode; + } + shmList->size++; + LOG(INFO) << "Add shm to list success, shm name=" << shm->name << " size=" << shmList->size; + return UBRING_OK; +} +} +} \ No newline at end of file diff --git a/src/brpc/ubshm/shm/shm_ubs.h b/src/brpc/ubshm/shm/shm_ubs.h new file mode 100644 index 0000000000..14b5916503 --- /dev/null +++ b/src/brpc/ubshm/shm/shm_ubs.h @@ -0,0 +1,51 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 

#ifndef BRPC_SHM_UBS_H
#define BRPC_SHM_UBS_H
// NOTE(review): this header includes nothing itself, yet uses RETURN_CODE,
// SHM, ShmList and DECLARE_int32 - it relies on the includer's include order.
namespace brpc {
namespace ubring {
// Period, in seconds, of the retry timer started by UbsShmAddTimer().
DECLARE_int32(ub_flying_io_timeout);

// Log levels used with the UBS library (see UbsMemLoggerPrint and the
// ubsmem_set_logger_level call in UbsShmInit).
typedef enum TagUbsLogLevel {
    UBSM_LOG_DEBUG_LEVEL = 0,
    UBSM_LOG_INFO_LEVEL = 1,
    UBSM_LOG_WARN_LEVEL = 2,
    UBSM_LOG_ERROR_LEVEL = 3,
    UBSM_LOG_CLOSED_LEVEL = 4
} UbsLogLevel;

// Allocates a UBS segment and maps it read/write.
RETURN_CODE UbsShmLocalMalloc(SHM *shm);
// Unmaps a segment; queues it for retry on a UBS network error.
RETURN_CODE UbsShmMunmap(SHM *shm);
// Deallocates a segment by name and clears shm->addr on success.
RETURN_CODE UbsShmFree(SHM *shm);
// Unmaps and then deallocates a locally created segment.
RETURN_CODE UbsShmLocalFree(SHM *shm);
// Maps an existing segment read/write.
RETURN_CODE UbsShmRemoteMalloc(SHM *shm);
// Unmaps a segment mapped with UbsShmRemoteMalloc().
RETURN_CODE UbsShmRemoteFree(SHM *shm);
// Loads the SDK, initialises the library, creates the region, starts the timer.
RETURN_CODE UbsShmInit(void);
// Finalises the library and destroys the retry timer/list.
RETURN_CODE UbsShmFini(void);
// Maps an existing segment with caller-supplied mmap protection.
RETURN_CODE UbsShmLocalMmap(SHM *shm, int prot);
// Logger hook registered with the UBS library.
void UbsMemLoggerPrint(int level, const char *msg);

// Timer callback that retries unmap + deallocate for queued segments.
void *UbsShmCallback(void* args);
// Starts the periodic retry timer for `shmList`.
RETURN_CODE UbsShmAddTimer(ShmList *shmList);
// Allocates the retry list, its mutex and its timer.
RETURN_CODE InitShmTimer(ShmList **shmList);
// Stops the timer and frees the list and all queued nodes.
RETURN_CODE DestroyShmTimer(ShmList *shmList);
// Appends a copy of `shm` to the retry list unless already present.
RETURN_CODE AddShmToList(ShmList *shmList, SHM *shm);
// Returns UBRING_OK when an entry with the same name and length is queued.
RETURN_CODE IsExistInShmList(ShmList *shmList, const SHM *shm);
}
}
#endif //BRPC_SHM_UBS_H
\ No newline at end of file
diff --git a/src/brpc/ubshm/timer/timer_mgr.cpp b/src/brpc/ubshm/timer/timer_mgr.cpp
new file mode 100644
index 0000000000..e53833f95e
--- /dev/null
+++ b/src/brpc/ubshm/timer/timer_mgr.cpp
@@ -0,0 +1,468 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.
See the License for the +// specific language governing permissions and limitations +// under the License. + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include "brpc/ubshm/timer/timer_mgr.h" + +namespace brpc { +namespace ubring { + +int32_t g_epollFd = -1; +std::atomic g_totalTimerNum; +TimerFdCtx *g_timerFdCtxMap = NULL; +uint32_t maxSystemFd; +static pthread_t g_epollExecuteThread; +static int32_t g_timerModuleInitialized; + +#if defined(OS_MACOSX) +static int timerfd_create_macosx(int clockid, int flags); +static int timerfd_settime_macosx(int fd, int flags, + const itimerspec *new_value, + itimerspec *old_value); +#endif + +static RETURN_CODE DeleteTimerInner(uint32_t fd) { + if (g_timerFdCtxMap == NULL) { + return UBRING_OK; + } + + if (pthread_spin_lock(&g_timerFdCtxMap[fd].spinLock) != 0) { + return UBRING_ERR; + } + + if (g_timerFdCtxMap[fd].status == TIMER_CONTEXT_NOT_USING) { + pthread_spin_unlock(&g_timerFdCtxMap[fd].spinLock); + return UBRING_OK; + } + + g_timerFdCtxMap[fd].status = TIMER_CONTEXT_NOT_USING; + g_timerFdCtxMap[fd].cb = NULL; + g_timerFdCtxMap[fd].args = NULL; + g_timerFdCtxMap[fd].periodical = 0; + g_timerFdCtxMap[fd].fd = 0; + + pthread_spin_unlock(&g_timerFdCtxMap[fd].spinLock); + +#if defined(OS_LINUX) + epoll_ctl(g_epollFd, EPOLL_CTL_DEL, (int)fd, NULL); +#elif defined(OS_MACOSX) + struct kevent evt; + EV_SET(&evt, fd, EVFILT_TIMER, EV_DELETE, 0, 0, NULL); + kevent(g_epollFd, &evt, 1, NULL, 0, NULL); +#endif + + uint64_t exp = 0; + read((int)fd, &exp, sizeof(exp)); + + close((int)fd); + atomic_fetch_sub(&g_totalTimerNum, 1); + return UBRING_OK; +} + +static RETURN_CODE StartTimeEpoll(void) { +#if defined(OS_LINUX) + g_epollFd = epoll_create1(0); +#elif defined(OS_MACOSX) + g_epollFd = kqueue(); +#endif + if (UNLIKELY(g_epollFd == -1)) { + LOG(ERROR) << "Failed to create epoll/kqueue. 
errno=" << errno; + return UBRING_ERR; + } + + int ret = pthread_create(&g_epollExecuteThread, NULL, TimerEpoll, NULL); + if (UNLIKELY(ret != 0)) { + LOG(ERROR) << "Failed to create thread err=" << ret; + return UBRING_ERR; + } + return UBRING_OK; +} + +static RETURN_CODE TimerSpinLocksInit(void) { + if (g_timerFdCtxMap == NULL) { + LOG(ERROR) << "Timer module is not fully initialized."; + return UBRING_ERR; + } + + for (uint32_t fd = 0; fd < maxSystemFd; fd++) { + int ret = pthread_spin_init(&g_timerFdCtxMap[fd].spinLock, + PTHREAD_PROCESS_PRIVATE); + if (ret != EOK) { + LOG(ERROR) << "Failed to initialize spin lock for fd=" << fd; + for (uint32_t cleanupFd = 0; cleanupFd < fd; cleanupFd++) { + pthread_spin_destroy(&g_timerFdCtxMap[cleanupFd].spinLock); + } + return UBRING_ERR; + } + } + return UBRING_OK; +} + +static RETURN_CODE ExecuteCallback(int32_t timerFd) { + UnifiedCallback((void *)(&g_timerFdCtxMap[timerFd])); + return UBRING_OK; +} + +static RETURN_CODE TimerCtxMapCompletion(void) { + memset(g_timerFdCtxMap, 0, sizeof(TimerFdCtx) * maxSystemFd); + + RETURN_CODE ret = TimerSpinLocksInit(); + if (ret != UBRING_OK) { + LOG(ERROR) << "Failed to init spin locks for timer module."; + return UBRING_ERR; + } + return UBRING_OK; +} + +RETURN_CODE TimerInit(void) { + if (g_timerModuleInitialized > 0) { + return UBRING_OK; + } + + g_totalTimerNum.store(0); + + struct rlimit rlim; + if (getrlimit(RLIMIT_NOFILE, &rlim) != UBRING_OK) { + LOG(ERROR) << "Failed to get fd"; + return UBRING_ERR; + } + maxSystemFd = (uint32_t)rlim.rlim_cur; + + if (g_timerFdCtxMap == NULL) { + g_timerFdCtxMap = (TimerFdCtx *)malloc(sizeof(TimerFdCtx) * maxSystemFd); + if (UNLIKELY(!g_timerFdCtxMap)) { + LOG(ERROR) << "Fail to malloc space for timer modules. errno=%d", errno; + return UBRING_ERR; + } + + RETURN_CODE ret = TimerCtxMapCompletion(); + if (ret != UBRING_OK) { + LOG(ERROR) << "Failed to init main data structure of Time Module. 
ret=" << ret; + free(g_timerFdCtxMap); + g_timerFdCtxMap = NULL; + return UBRING_ERR; + } + } + + RETURN_CODE ret = StartTimeEpoll(); + if (ret != UBRING_OK) { + LOG(ERROR) << "Failed to start Timer Epoll. ret=" << ret; + if (LIKELY(g_timerFdCtxMap != NULL)) { + FREE_PTR(g_timerFdCtxMap); + } + return UBRING_ERR; + } + g_timerModuleInitialized = 1; + return UBRING_OK; +} + +void *UnifiedCallback(void *args) { + TimerFdCtx *ctx = (TimerFdCtx *)args; + if (pthread_spin_lock(&ctx->spinLock) != 0) { + return NULL; + } + + if (ctx->status == TIMER_CONTEXT_NOT_USING) { + pthread_spin_unlock(&ctx->spinLock); + return NULL; + } + + void *(*cb)(void *) = ctx->cb; + void *cbArgs = ctx->args; + uint32_t fd = ctx->fd; + int isPeriodical = ctx->periodical; + ctx->status = TIMER_CONTEXT_CALLBACK_ONGOING; + + pthread_spin_unlock(&ctx->spinLock); + + cb(cbArgs); + + if (!isPeriodical) { + DeleteTimerInner(fd); + } + return NULL; +} + +void *TimerEpoll(void *args) { + UNREFERENCE_PARAM(args); +#if defined(OS_LINUX) + struct epoll_event readyEvents[MAX_TIMER]; +#elif defined(OS_MACOSX) + struct kevent readyEvents[MAX_TIMER]; +#endif + + while (1) { + if (g_timerModuleInitialized <= 0) { + LOG(ERROR) << "The Timer module is not initialized."; + break; + } + +#if defined(OS_LINUX) + int32_t readyNum = epoll_wait(g_epollFd, readyEvents, MAX_TIMER, + TIMER_EPOLL_WAIT_TIMEOUT); +#elif defined(OS_MACOSX) + struct timespec timeout = {0, TIMER_EPOLL_WAIT_TIMEOUT * 1000000}; + int32_t readyNum = kevent(g_epollFd, NULL, 0, readyEvents, MAX_TIMER, &timeout); +#endif + + if (UNLIKELY(readyNum == -1)) { + errno_t err = errno; + if (err == EINTR) { + LOG_EVERY_SECOND(WARNING) << "Epoll/Kqueue wait was interrupted. errno=" << err; + continue; + } else if (err == EBADF) { + LOG(WARNING) << "The Timer module is destroyed."; + break; + } + LOG(ERROR) << "Epoll/Kqueue wait internal error. 
errno=" << err; + break; + } + + for (int32_t i = 0; i < readyNum; i++) { +#if defined(OS_LINUX) + struct epoll_event *event = &readyEvents[i]; + int32_t timerFd = event->data.fd; +#elif defined(OS_MACOSX) + struct kevent *event = &readyEvents[i]; + int32_t timerFd = event->ident; +#endif + + uint64_t exp = 0; + if (read(timerFd, &exp, sizeof(exp)) < 0) { + if (errno != EBADF) { + LOG(ERROR) << "Failed to read timerfd=" << timerFd << " errno=" << errno; + } + continue; + } + if (TimerFdCtxValidate((uint32_t)timerFd) != UBRING_OK) { + continue; + } + + RETURN_CODE ret = ExecuteCallback(timerFd); + if (ret != UBRING_OK) { + LOG(ERROR) << "Failed execute callback ret=" << ret; + DeleteTimerInner((uint32_t)timerFd); + continue; + } + } + } + return NULL; +} + +void DeleteTimerSafe(uint32_t fd) { + if (g_timerFdCtxMap == NULL) { + return; + } + + if (pthread_spin_lock(&g_timerFdCtxMap[fd].spinLock) != 0) { + return; + } + + if (g_timerFdCtxMap[fd].status == TIMER_CONTEXT_NOT_USING) { + pthread_spin_unlock(&g_timerFdCtxMap[fd].spinLock); + return; + } + + g_timerFdCtxMap[fd].status = TIMER_CONTEXT_NOT_USING; + g_timerFdCtxMap[fd].cb = NULL; + g_timerFdCtxMap[fd].args = NULL; + g_timerFdCtxMap[fd].periodical = 0; + g_timerFdCtxMap[fd].fd = 0; + + pthread_spin_unlock(&g_timerFdCtxMap[fd].spinLock); + +#if defined(OS_LINUX) + epoll_ctl(g_epollFd, EPOLL_CTL_DEL, (int)fd, NULL); +#elif defined(OS_MACOSX) + struct kevent evt; + EV_SET(&evt, fd, EVFILT_TIMER, EV_DELETE, 0, 0, NULL); + kevent(g_epollFd, &evt, 1, NULL, 0, NULL); +#endif + + uint64_t exp = 0; + read((int)fd, &exp, sizeof(exp)); + + close((int)fd); + atomic_fetch_sub(&g_totalTimerNum, 1); +} + +void DeleteTimer(uint32_t fd) { + if (g_timerFdCtxMap == NULL) { + LOG(WARNING) << "The timer is not initialized."; + return; + } + + g_timerFdCtxMap[fd].periodical = 0; +} + +int32_t TimerStart(const itimerspec *time, void *(*cb)(void *), void *args) { + if (g_epollFd == -1) { + LOG(ERROR) << "Timer epoll/kqueue encountered 
internal error."; + return -1; + } + +#if defined(OS_LINUX) + int timerFd = timerfd_create(CLOCK_MONOTONIC, 0); +#elif defined(OS_MACOSX) + int timerFd = timerfd_create_macosx(CLOCK_MONOTONIC, 0); +#endif + + if (UNLIKELY(timerFd >= (int)maxSystemFd || timerFd == -1)) { + LOG(ERROR) << "Failed to create timerfd=" << timerFd << " errno=" << errno; + return -1; + } + + g_timerFdCtxMap[timerFd].status = TIMER_CONTEXT_EPOLL_WAITING; + g_timerFdCtxMap[timerFd].cb = cb; + g_timerFdCtxMap[timerFd].args = args; + g_timerFdCtxMap[timerFd].fd = (uint32_t)timerFd; + + if (LIKELY(time->it_interval.tv_sec > 0 || time->it_interval.tv_nsec > 0)) { + g_timerFdCtxMap[timerFd].periodical = 1; + } + +#if defined(OS_LINUX) + struct epoll_event event = { + .events = EPOLLIN, + .data = {.fd = timerFd} + }; + + int32_t ret = epoll_ctl(g_epollFd, EPOLL_CTL_ADD, timerFd, &event); +#elif defined(OS_MACOSX) + struct kevent event; + uint64_t timeout_nsec = time->it_value.tv_sec * 1000000000ULL + time->it_value.tv_nsec; + uint64_t interval_nsec = time->it_interval.tv_sec * 1000000000ULL + time->it_interval.tv_nsec; + EV_SET(&event, timerFd, EVFILT_TIMER, EV_ADD | EV_ENABLE, 0, + timeout_nsec / 1000000, NULL); + int32_t ret = kevent(g_epollFd, &event, 1, NULL, 0, NULL); +#endif + + if (UNLIKELY(ret != 0)) { + CloseTimerFd((uint32_t)timerFd); + LOG(ERROR) << "Failed to add event to epoll/kqueue. 
errno=" << errno; + return -1; + } + + atomic_fetch_add(&g_totalTimerNum, 1); + +#if defined(OS_LINUX) + ret = timerfd_settime(timerFd, 0, time, NULL); +#elif defined(OS_MACOSX) + ret = timerfd_settime_macosx(timerFd, 0, time, NULL); +#endif + + if (UNLIKELY(ret != 0)) { +#if defined(OS_LINUX) + if (epoll_ctl(g_epollFd, EPOLL_CTL_DEL, timerFd, NULL) != 0) { +#elif defined(OS_MACOSX) + struct kevent evt; + EV_SET(&evt, timerFd, EVFILT_TIMER, EV_DELETE, 0, 0, NULL); + if (kevent(g_epollFd, &evt, 1, NULL, 0, NULL) != 0) { +#endif + LOG(ERROR) << "Failed to delete the timer fd=" << timerFd << " with errno=" << errno; + } + CloseTimerFd((uint32_t)timerFd); + atomic_fetch_sub(&g_totalTimerNum, 1); + LOG(ERROR) << "Failed to set timer"; + return -1; + } + + return timerFd; +} + +uint32_t GetActiveTimerNum(void) { + return atomic_load(&g_totalTimerNum); +} + +void CloseTimerFd(uint32_t fd) { + g_timerFdCtxMap[fd].cb = NULL; + g_timerFdCtxMap[fd].args = NULL; + g_timerFdCtxMap[fd].status = TIMER_CONTEXT_NOT_USING; + g_timerFdCtxMap[fd].fd = 0; + g_timerFdCtxMap[fd].periodical = 0; + if (close((int)fd) != 0) { + LOG(ERROR) << "Failed to close timer fd=" << fd << " errno=" << errno; + return; + } +} + +void TimerModuleDestroy(void) { + uint32_t maxFd = maxSystemFd; + if (g_timerFdCtxMap) { + for (uint32_t fd = 0; fd < maxFd; fd++) { + if (g_timerFdCtxMap[fd].status != TIMER_CONTEXT_NOT_USING) { + DeleteTimerSafe(fd); + } + } + } + close(g_epollFd); + g_epollFd = -1; + g_totalTimerNum = 0; + g_timerModuleInitialized = 0; + int32_t ret = pthread_join(g_epollExecuteThread, NULL); + if (ret != EOK) { + LOG(ERROR) << "Failed to join pthread, during destroying timer module. 
ret=" << ret; + return; + } +} + +RETURN_CODE TimerFdCtxValidate(uint32_t fd) { + if (fd >= maxSystemFd) { + LOG(ERROR) << "TimerFd=" << fd << " is out of range=" << maxSystemFd; + return UBRING_ERR; + } + if (g_timerFdCtxMap[fd].status == TIMER_CONTEXT_NOT_USING) { + LOG(ERROR) << "TimerFd=" << fd << " has wrong status=" << g_timerFdCtxMap[fd].status; + return UBRING_ERR; + } + if (g_timerFdCtxMap[fd].cb == NULL) { + LOG(ERROR) << "The callback is not set."; + return UBRING_ERR; + } + + return UBRING_OK; +} + +#if defined(OS_MACOSX) +static int timerfd_create_macosx(int clockid, int flags) { + int pipefd[2]; + if (pipe(pipefd) == -1) { + return -1; + } + return pipefd[0]; +} + +static int timerfd_settime_macosx(int fd, int flags, + const itimerspec *new_value, + itimerspec *old_value) { + if (old_value != NULL) { + memset(old_value, 0, sizeof(itimerspec)); + } + return 0; +} +#endif + +} // namespace ubring +} // namespace brpc \ No newline at end of file diff --git a/src/brpc/ubshm/timer/timer_mgr.h b/src/brpc/ubshm/timer/timer_mgr.h new file mode 100644 index 0000000000..9630430a2c --- /dev/null +++ b/src/brpc/ubshm/timer/timer_mgr.h @@ -0,0 +1,73 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef BRPC_TIMER_MGR_H +#define BRPC_TIMER_MGR_H +#include +#include +#include "brpc/ubshm/common/common.h" + +#if defined(OS_LINUX) +#include +#include +#elif defined(OS_MACOSX) +#include +#include +#include +#endif + +#define MAX_TIMER 1024 +#define TIMER_EPOLL_WAIT_TIMEOUT 1000 + +#if defined(OS_MACOSX) +struct itimerspec +{ + struct timespec it_interval; + struct timespec it_value; +}; +#endif +namespace brpc { +namespace ubring { +typedef enum { + TIMER_CONTEXT_NOT_USING, + TIMER_CONTEXT_EPOLL_WAITING, + TIMER_CONTEXT_CALLBACK_ONGOING +} TimerFdCtxStatus; + +typedef struct { + void *(*cb)(void*); + void *args; + uint32_t fd; + TimerFdCtxStatus status; + uint32_t periodical; + pthread_spinlock_t spinLock; +} TimerFdCtx; + +RETURN_CODE TimerInit(void); +void TimerModuleDestroy(void); +void *UnifiedCallback(void *args); +void *TimerEpoll(void *args); +int32_t TimerStart(const itimerspec *time, void *(*cb)(void *), void *args); +uint32_t GetActiveTimerNum(void); +void CloseTimerFd(uint32_t fd); + +void DeleteTimerSafe(uint32_t fd); +void DeleteTimer(uint32_t fd); +RETURN_CODE TimerFdCtxValidate(uint32_t fd); +} +} +#endif //BRPC_TIMER_MGR_H \ No newline at end of file diff --git a/src/brpc/ubshm/ub_endpoint.cpp b/src/brpc/ubshm/ub_endpoint.cpp new file mode 100644 index 0000000000..b4c728c057 --- /dev/null +++ b/src/brpc/ubshm/ub_endpoint.cpp @@ -0,0 +1,936 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#if BRPC_WITH_UBRING + +#include +#include +#include "butil/fd_utility.h" +#include "butil/logging.h" // CHECK, LOG +#include "butil/sys_byteorder.h" // HostToNet,NetToHost +#include "bthread/bthread.h" +#include "brpc/errno.pb.h" +#include "brpc/event_dispatcher.h" +#include "brpc/input_messenger.h" +#include "brpc/socket.h" +#include "brpc/reloadable_flags.h" +#include "brpc/ubshm/ub_helper.h" +#include "brpc/ubshm/ub_endpoint.h" +#include "brpc/ubshm/shm/shm_def.h" +#include "brpc/ubshm/common/common.h" +#include "brpc/ubshm_transport.h" +#include "brpc/ubshm/ubr_trx.h" + +DECLARE_int32(task_group_ntags); + +namespace brpc { +DECLARE_bool(log_connection_close); +namespace ubring { + +extern bool g_skip_ub_init; +DEFINE_int32(data_queue_size, 4, "data queue size for UB"); +DEFINE_bool(ub_trace_verbose, false, "Print log message verbosely"); +BRPC_VALIDATE_GFLAG(ub_trace_verbose, brpc::PassValidate); +DEFINE_int32(ub_poller_num, 1, "Poller number in ub polling mode."); +DEFINE_bool(ub_poller_yield, false, "Yield thread in RDMA polling mode."); +DEFINE_bool(ub_edisp_unsched, false, "Disable event dispatcher schedule"); +DEFINE_bool(ub_disable_bthread, false, "Disable bthread in RDMA"); + +static const size_t MIN_ONCE_READ = 4096; +static const size_t MAX_ONCE_READ = 524288; +static const size_t IOBUF_IOV_MAX = 256; + +static const char* MAGIC_STR = "UB"; +static const size_t MAGIC_STR_LEN = 2; +static const size_t HELLO_MSG_LEN_MIN = 64; +static const size_t ACK_MSG_LEN = 4; +static uint16_t g_ub_hello_msg_len = 64; +static uint16_t 
g_ub_hello_version = 2; +static uint16_t g_ub_impl_version = 1; + +static const uint32_t ACK_MSG_UB_OK = 0x1; + +static butil::Mutex* g_ubring_resource_mutex = NULL; + +struct HelloMessage { + void Serialize(void* data) const; + void Deserialize(void* data); + std::string toString() const; + + uint16_t msg_len; + uint16_t hello_ver; + uint16_t impl_ver; + uint64_t len; + char shm_name[SHM_MAX_NAME_BUFF_LEN]; +}; + +void HelloMessage::Serialize(void* data) const { + char* current_pos = static_cast(data); + const uint16_t net_msg_len = butil::HostToNet16(msg_len); + memcpy(current_pos, &net_msg_len, sizeof(net_msg_len)); + current_pos += sizeof(net_msg_len); + const uint16_t net_hello_ver = butil::HostToNet16(hello_ver); + memcpy(current_pos, &net_hello_ver, sizeof(net_hello_ver)); + current_pos += sizeof(net_hello_ver); + const uint16_t net_impl_ver = butil::HostToNet16(impl_ver); + memcpy(current_pos, &net_impl_ver, sizeof(net_impl_ver)); + current_pos += sizeof(net_impl_ver); + const uint64_t net_len = butil::HostToNet64(len); + memcpy(current_pos, &net_len, sizeof(net_len)); + current_pos += sizeof(net_len); + memcpy(current_pos, shm_name, SHM_MAX_NAME_BUFF_LEN); +} + +void HelloMessage::Deserialize(void* data) { + char* current_pos = static_cast(data); + uint16_t net_msg_len; + memcpy(&net_msg_len, current_pos, sizeof(net_msg_len)); + msg_len = butil::NetToHost16(net_msg_len); + current_pos += sizeof(net_msg_len); + uint16_t net_hello_ver; + memcpy(&net_hello_ver, current_pos, sizeof(net_hello_ver)); + hello_ver = butil::NetToHost16(net_hello_ver); + current_pos += sizeof(net_hello_ver); + uint16_t net_impl_ver; + memcpy(&net_impl_ver, current_pos, sizeof(net_impl_ver)); + impl_ver = butil::NetToHost16(net_impl_ver); + current_pos += sizeof(net_impl_ver); + uint64_t net_len; + memcpy(&net_len, current_pos, sizeof(net_len)); + len = butil::NetToHost64(net_len); + current_pos += sizeof(net_len); + memcpy(shm_name, current_pos, SHM_MAX_NAME_BUFF_LEN); +} + 
+std::string HelloMessage::toString() const { + constexpr size_t MAX_LEN = 16 + 6 + 16 + 6 + 16 + 6 + 20 + 6 + SHM_MAX_NAME_BUFF_LEN + 32; + std::array buf; + int n = snprintf(buf.data(), buf.size(), + "msg_len=%u, hello_ver=%u, impl_ver=%u, len=%lu, shm_name=%.*s", + msg_len, + hello_ver, + impl_ver, + static_cast(len), // 兼容32/64位 + static_cast(SHM_MAX_NAME_BUFF_LEN), // 限制最大输出长度 + shm_name + ); + return std::string(buf.data(), static_cast(n)); +} + +UBShmEndpoint::UBShmEndpoint(Socket* s) + : _socket(s) + , _state(UNINIT) + , _ub_ring(nullptr) + , _cq_sid(INVALID_SOCKET_ID) +{ + _read_butex = bthread::butex_create_checked>(); +} + +UBShmEndpoint::~UBShmEndpoint() { + Reset(); + bthread::butex_destroy(_read_butex); +} + +void UBShmEndpoint::Reset() { + DeallocateResources(); + + delete _ub_ring; + _ub_ring = nullptr; + _cq_sid = INVALID_SOCKET_ID; + _state = UNINIT; +} + +void UBConnect::StartConnect(const Socket* socket, + void (*done)(int err, void* data), + void* data) { + auto* ub_transport = static_cast(socket->_transport.get()); + CHECK(ub_transport->_ub_ep != NULL); + SocketUniquePtr s; + if (Socket::Address(socket->id(), &s) != 0) { + return; + } + if (!IsUBAvailable()) { + ub_transport->_ub_ep->_state = UBShmEndpoint::FALLBACK_TCP; + ub_transport->_ub_state = UBShmTransport::UB_OFF; + done(0, data); + return; + } + _done = done; + _data = data; + bthread_t tid; + bthread_attr_t attr = BTHREAD_ATTR_NORMAL; + bthread_attr_set_name(&attr, "UBProcessHandshakeAtClient"); + if (bthread_start_background(&tid, &attr, + UBShmEndpoint::ProcessHandshakeAtClient, ub_transport->_ub_ep) < 0) { + LOG(FATAL) << "Fail to start handshake bthread"; + Run(); + } else { + s.release(); + } +} + +void UBConnect::StopConnect(Socket* socket) { } + +void UBConnect::Run() { + _done(errno, _data); +} + +static void TryReadOnTcpDuringRdmaEst(Socket* s) { + int progress = Socket::PROGRESS_INIT; + while (true) { + uint8_t tmp; + ssize_t nr = read(s->fd(), &tmp, 1); + if (nr < 0) { + 
if (errno != EAGAIN) { + const int saved_errno = errno; + PLOG(WARNING) << "Fail to read from " << s; + s->SetFailed(saved_errno, "Fail to read from %s: %s", + s->description().c_str(), berror(saved_errno)); + return; + } + if (!s->MoreReadEvents(&progress)) { + break; + } + } else if (nr == 0) { + s->SetEOF(); + return; + } else { + LOG(WARNING) << "Read unexpected data from " << s; + s->SetFailed(EPROTO, "Read unexpected data from %s", + s->description().c_str()); + return; + } + } +} + +void UBShmEndpoint::OnNewDataFromTcp(Socket* m) { + auto* ub_transport = static_cast(m->_transport.get()); + UBShmEndpoint* ep = ub_transport->GetUBShmEp(); + CHECK(ep != NULL); + + int progress = Socket::PROGRESS_INIT; + while (true) { + if (ep->_state == UNINIT) { + if (!m->CreatedByConnect()) { + if (!IsUBAvailable()) { + ep->_state = FALLBACK_TCP; + ub_transport->_ub_state = UBShmTransport::UB_OFF; + continue; + } + bthread_t tid; + ep->_state = S_HELLO_WAIT; + SocketUniquePtr s; + m->ReAddress(&s); + bthread_attr_t attr = BTHREAD_ATTR_NORMAL; + bthread_attr_set_name(&attr, "UBProcessHandshakeAtServer"); + if (bthread_start_background(&tid, &attr, + ProcessHandshakeAtServer, ep) < 0) { + ep->_state = UNINIT; + LOG(FATAL) << "Fail to start handshake bthread"; + } else { + s.release(); + } + } else { + // The connection may be closed or reset before the client + // starts handshake. This will be handled by client handshake. + // Ignore the exception here. 
+ } + } else if (ep->_state < ESTABLISHED) { // during handshake + ep->_read_butex->fetch_add(1, butil::memory_order_release); + bthread::butex_wake(ep->_read_butex); + } else if (ep->_state == FALLBACK_TCP){ // handshake finishes + InputMessenger::OnNewMessages(m); + return; + } else if (ep->_state == ESTABLISHED) { + TryReadOnTcpDuringRdmaEst(ep->_socket); + return; + } + if (!m->MoreReadEvents(&progress)) { + break; + } + } +} +bool HelloNegotiationValid(HelloMessage& msg) { + if (msg.hello_ver == g_ub_hello_version && + msg.impl_ver == g_ub_impl_version) { + // This can be modified for future compatibility + return true; + } + return false; +} + +static const int WAIT_TIMEOUT_MS = 50; + +int UBShmEndpoint::ReadFromFd(void* data, size_t len) { + CHECK(data != NULL); + int nr = 0; + size_t received = 0; + do { + const timespec duetime = butil::milliseconds_from_now(WAIT_TIMEOUT_MS); + nr = read(_socket->fd(), (uint8_t*)data + received, len - received); + if (nr < 0) { + if (errno == EAGAIN) { + const int expected_val = _read_butex->load(butil::memory_order_acquire); + if (bthread::butex_wait(_read_butex, expected_val, &duetime) < 0) { + if (errno != EWOULDBLOCK && errno != ETIMEDOUT) { + return -1; + } + } + } else { + return -1; + } + } else if (nr == 0) { + errno = EEOF; + return -1; + } else { + received += nr; + } + } while (received < len); + return 0; +} + +int UBShmEndpoint::WriteToFd(void* data, size_t len) { + CHECK(data != NULL); + int nw = 0; + size_t written = 0; + do { + const timespec duetime = butil::milliseconds_from_now(WAIT_TIMEOUT_MS); + nw = write(_socket->fd(), (uint8_t*)data + written, len - written); + if (nw < 0) { + if (errno == EAGAIN) { + if (_socket->WaitEpollOut(_socket->fd(), true, &duetime) < 0) { + if (errno != ETIMEDOUT) { + return -1; + } + } + } else { + return -1; + } + } else { + written += nw; + } + } while (written < len); + return 0; +} + +inline void UBShmEndpoint::TryReadOnTcp() { + if (_socket->_nevent.fetch_add(1, 
butil::memory_order_acq_rel) == 0) { + if (_state == FALLBACK_TCP) { + InputMessenger::OnNewMessages(_socket); + } else if (_state == ESTABLISHED) { + TryReadOnTcpDuringRdmaEst(_socket); + } + } +} + +void* UBShmEndpoint::ProcessHandshakeAtClient(void* arg) { + UBShmEndpoint* ep = static_cast(arg); + SocketUniquePtr s(ep->_socket); + UBConnect::RunGuard rg((UBConnect*)s->_app_connect.get()); + + LOG_IF(INFO, FLAGS_ub_trace_verbose) + << "Start handshake on " << s->_local_side; + + uint8_t data[g_ub_hello_msg_len]; + + ep->_state = C_ALLOC_SHM; + auto* ub_transport = static_cast(s->_transport.get()); + size_t local_shm_len = (size_t)(FLAGS_data_queue_size) * MB_TO_BYTE; + SHM local_trx_shm = {NULL, local_shm_len, 0, {0}, (uint32_t)s->fd()}; + auto shm_name_str = butil::endpoint2str(s->local_side()); + const char* shm_name = shm_name_str.c_str(); + if (ep->AllocateClientResources(&local_trx_shm, shm_name) < 0) { + LOG(WARNING) << "Fallback to tcp:" << s->description(); + ub_transport->_ub_state = UBShmTransport::UB_OFF; + ep->_state = FALLBACK_TCP; + return NULL; + } + + ep->_state = C_HELLO_SEND; + HelloMessage local_msg; + local_msg.msg_len = g_ub_hello_msg_len; + local_msg.hello_ver = g_ub_hello_version; + local_msg.impl_ver = g_ub_impl_version; + local_msg.len = local_shm_len; + memcpy(local_msg.shm_name, local_trx_shm.name, SHM_MAX_NAME_BUFF_LEN); + memcpy(data, MAGIC_STR, MAGIC_STR_LEN); + local_msg.Serialize((char*)data + MAGIC_STR_LEN); + if (ep->WriteToFd(data, g_ub_hello_msg_len) < 0) { + const int saved_errno = errno; + PLOG(WARNING) << "Fail to send hello message to server:" << s->description(); + s->SetFailed(saved_errno, "Fail to complete ubring handshake from %s: %s", + s->description().c_str(), berror(saved_errno)); + ep->_state = FAILED; + return NULL; + } + LOG_IF(INFO, FLAGS_ub_trace_verbose) << "client handshake message : " << local_msg.toString(); + + ep->_state = C_HELLO_WAIT; + if (ep->ReadFromFd(data, MAGIC_STR_LEN) < 0) { + const int 
saved_errno = errno; + PLOG(WARNING) << "Fail to get hello message from server:" << s->description(); + s->SetFailed(saved_errno, "Fail to complete ubring handshake from %s: %s", + s->description().c_str(), berror(saved_errno)); + ep->_state = FAILED; + return NULL; + } + if (memcmp(data, MAGIC_STR, MAGIC_STR_LEN) != 0) { + LOG(WARNING) << "Read unexpected data during handshake:" << s->description(); + s->SetFailed(EPROTO, "Fail to complete ubring handshake from %s: %s", + s->description().c_str(), berror(EPROTO)); + ep->_state = FAILED; + return NULL; + } + + if (ep->ReadFromFd(data, HELLO_MSG_LEN_MIN - MAGIC_STR_LEN) < 0) { + const int saved_errno = errno; + PLOG(WARNING) << "Fail to get Hello Message from server:" << s->description(); + s->SetFailed(saved_errno, "Fail to complete ubring handshake from %s: %s", + s->description().c_str(), berror(saved_errno)); + ep->_state = FAILED; + return NULL; + } + HelloMessage remote_msg; + remote_msg.Deserialize(data); + if (remote_msg.msg_len < HELLO_MSG_LEN_MIN) { + LOG(WARNING) << "Fail to parse Hello Message length from server:" + << s->description(); + s->SetFailed(EPROTO, "Fail to complete ubring handshake from %s: %s", + s->description().c_str(), berror(EPROTO)); + ep->_state = FAILED; + return NULL; + } + + if (remote_msg.msg_len > HELLO_MSG_LEN_MIN) { + // TODO: Read Hello Message customized data + // Just for future use, should not happen now + } + + if (!HelloNegotiationValid(remote_msg)) { + LOG(WARNING) << "Fail to negotiate with server, fallback to tcp:" + << s->description(); + ub_transport->_ub_state = UBShmTransport::UB_OFF; + } else { + ep->_state = C_MAP_REMOTE_SHM; + if (ep->_ub_ring->UbrMapRemoteShm(&local_trx_shm, shm_name) < 0) { + LOG(WARNING) << "Fail to map the remote shm, fallback to tcp:" << s->description(); + ub_transport->_ub_state = UBShmTransport::UB_OFF; + } else { + ub_transport->_ub_state = UBShmTransport::UB_ON; + } + } + + ep->_state = C_ACK_SEND; + uint32_t flags = 0; + if 
(ub_transport->_ub_state != UBShmTransport::UB_OFF) { + flags |= ACK_MSG_UB_OK; + } + uint32_t* tmp = (uint32_t*)data; + *tmp = butil::HostToNet32(flags); + if (ep->WriteToFd(data, ACK_MSG_LEN) < 0) { + const int saved_errno = errno; + PLOG(WARNING) << "Fail to send Ack Message to server:" << s->description(); + s->SetFailed(saved_errno, "Fail to complete ubring handshake from %s: %s", + s->description().c_str(), berror(saved_errno)); + ep->_state = FAILED; + return NULL; + } + + if (ub_transport->_ub_state == UBShmTransport::UB_ON) { + ep->_state = ESTABLISHED; + LOG_IF(INFO, FLAGS_ub_trace_verbose) + << "Client handshake ends (use ubring) on " << s->description(); + } else { + ep->_state = FALLBACK_TCP; + LOG_IF(INFO, FLAGS_ub_trace_verbose) + << "Client handshake ends (use tcp) on " << s->description(); + } + + errno = 0; + + return NULL; +} + +void* UBShmEndpoint::ProcessHandshakeAtServer(void* arg) { + UBShmEndpoint* ep = static_cast(arg); + SocketUniquePtr s(ep->_socket); + + LOG_IF(INFO, FLAGS_ub_trace_verbose) + << "Start handshake on " << s->description(); + + uint8_t data[g_ub_hello_msg_len]; + + ep->_state = S_HELLO_WAIT; + if (ep->ReadFromFd(data, MAGIC_STR_LEN) < 0) { + const int saved_errno = errno; + PLOG(WARNING) << "Fail to read Hello Message from client:" << s->description() << " " << s->_remote_side; + s->SetFailed(saved_errno, "Fail to complete ubring handshake from %s: %s", + s->description().c_str(), berror(saved_errno)); + ep->_state = FAILED; + return NULL; + } + auto* ub_transport = static_cast(s->_transport.get()); + if (memcmp(data, MAGIC_STR, MAGIC_STR_LEN) != 0) { + LOG_IF(INFO, FLAGS_ub_trace_verbose) << "It seems that the " + << "client does not use RDMA, fallback to TCP:" + << s->description(); + s->_read_buf.append(data, MAGIC_STR_LEN); + ep->_state = FALLBACK_TCP; + ub_transport->_ub_state = UBShmTransport::UB_OFF; + ep->TryReadOnTcp(); + return NULL; + } + + if (ep->ReadFromFd(data, g_ub_hello_msg_len - MAGIC_STR_LEN) < 0) { + 
const int saved_errno = errno; + PLOG(WARNING) << "Fail to read Hello Message from client:" << s->description(); + s->SetFailed(saved_errno, "Fail to complete ubring handshake from %s: %s", + s->description().c_str(), berror(saved_errno)); + ep->_state = FAILED; + return NULL; + } + + HelloMessage remote_msg; + remote_msg.Deserialize(data); + LOG_IF(INFO, FLAGS_ub_trace_verbose) << "server receive handshake message : " << remote_msg.toString(); + if (remote_msg.msg_len < HELLO_MSG_LEN_MIN) { + LOG(WARNING) << "Fail to parse Hello Message length from client:" + << s->description(); + s->SetFailed(EPROTO, "Fail to complete ubring handshake from %s: %s", + s->description().c_str(), berror(EPROTO)); + ep->_state = FAILED; + return NULL; + } + if (remote_msg.msg_len > HELLO_MSG_LEN_MIN) { + // TODO: Read Hello Message customized header + // Just for future use, should not happen now + } + + if (!HelloNegotiationValid(remote_msg)) { + LOG(WARNING) << "Fail to negotiate with client, fallback to tcp:" + << s->description(); + ub_transport->_ub_state = UBShmTransport::UB_OFF; + } else { + ep->_state = S_ALLOC_SHM; + ubring::SHM remote_trx_shm = {NULL, remote_msg.len, 0, {0}, (uint8_t)ep->_socket->fd()}; + strncpy(remote_trx_shm.name, remote_msg.shm_name, SHM_MAX_NAME_BUFF_LEN); + + size_t local_shm_len = (size_t)(FLAGS_data_queue_size) * MB_TO_BYTE; + // server端共享内存名称 + ubring::SHM local_trx_shm = {NULL, local_shm_len, 0, {0}, (uint8_t)ep->_socket->fd()}; + char clientName[SHM_MAX_NAME_BUFF_LEN]; + strncpy(clientName, remote_msg.shm_name, SHM_MAX_NAME_BUFF_LEN); + + char *clientIpPort = strrchr(clientName, '_'); + if (clientIpPort != NULL) { + *clientIpPort = '\0'; + } + int result = snprintf(local_trx_shm.name, SHM_MAX_NAME_BUFF_LEN, "%s_%s", + clientName, SERVER_SHM_NAME_SUFFIX); + if (UNLIKELY(result < 0)) { + LOG(WARNING) << "Copy client shared memory name failed, ret=" << result; + ub_transport->_ub_state = UBShmTransport::UB_OFF; + } + if (result >= 0 && 
ep->AllocateServerResources(&remote_trx_shm, &local_trx_shm) < 0) { + LOG(WARNING) << "Fail to allocate ub resources, fallback to tcp:" + << s->description(); + ub_transport->_ub_state = UBShmTransport::UB_OFF; + } + } + + ep->_state = S_HELLO_SEND; + HelloMessage local_msg; + local_msg.msg_len = g_ub_hello_msg_len; + if (ub_transport->_ub_state == UBShmTransport::UB_OFF) { + local_msg.impl_ver = 0; + local_msg.hello_ver = 0; + } else { + local_msg.hello_ver = g_ub_hello_version; + local_msg.impl_ver = g_ub_impl_version; + local_msg.len = (FLAGS_data_queue_size) * MB_TO_BYTE; + memcpy(local_msg.shm_name, remote_msg.shm_name, SHM_MAX_NAME_BUFF_LEN); + } + memcpy(data, MAGIC_STR, MAGIC_STR_LEN); + local_msg.Serialize((char*)data + MAGIC_STR_LEN); + if (ep->WriteToFd(data, g_ub_hello_msg_len) < 0) { + const int saved_errno = errno; + PLOG(WARNING) << "Fail to send Hello Message to client:" << s->description(); + s->SetFailed(saved_errno, "Fail to complete ub handshake from %s: %s", + s->description().c_str(), berror(saved_errno)); + ep->_state = FAILED; + return NULL; + } + + ep->_state = S_ACK_WAIT; + if (ep->ReadFromFd(data, ACK_MSG_LEN) < 0) { + const int saved_errno = errno; + PLOG(WARNING) << "Fail to read ack message from client:" << s->description(); + s->SetFailed(saved_errno, "Fail to complete ubring handshake from %s: %s", + s->description().c_str(), berror(saved_errno)); + ep->_state = FAILED; + return NULL; + } + + uint32_t* tmp = (uint32_t*)data; + uint32_t flags = butil::NetToHost32(*tmp); + if (flags & ACK_MSG_UB_OK) { + if (ub_transport->_ub_state == UBShmTransport::UB_OFF) { + LOG(WARNING) << "Fail to parse Hello Message length from client:" + << s->description(); + s->SetFailed(EPROTO, "Fail to complete ub handshake from %s: %s", + s->description().c_str(), berror(EPROTO)); + ep->_state = FAILED; + return NULL; + } else { + ub_transport->_ub_state = UBShmTransport::UB_ON; + ep->_state = ESTABLISHED; + LOG_IF(INFO, FLAGS_ub_trace_verbose) + << "Server 
handshake ends (use ubring) on " << s->description(); + } + } else { + ub_transport->_ub_state = UBShmTransport::UB_OFF; + ep->_state = FALLBACK_TCP; + LOG_IF(INFO, FLAGS_ub_trace_verbose) + << "Server handshake ends (use tcp) on " << s->description(); + } + ep->TryReadOnTcp(); + + return NULL; +} + +bool UBShmEndpoint::IsWritable() const { + if (BAIDU_UNLIKELY(g_skip_ub_init)) { + // Just for UT + return false; + } + auto ret = _ub_ring->IsUbrTrxWriteable(EPOLLET); + if (ret == 0) { + return true; + } + return false; +} + +ssize_t UBShmEndpoint::CutFromIOBufList(butil::IOBuf** from, size_t ndata) { + if (BAIDU_UNLIKELY(g_skip_ub_init)) { + // Just for UT + errno = EAGAIN; + return -1; + } + if (BAIDU_UNLIKELY(ndata == 0)) { + return 0; + } + struct iovec vec[IOBUF_IOV_MAX]; + size_t nvec = 0; + for (size_t i = 0; i < ndata; ++i) { + const butil::IOBuf* p = from[i]; + const size_t nref = p->backing_block_num(); + for (size_t j = 0; j < nref && nvec < IOBUF_IOV_MAX; ++j, ++nvec) { + butil::StringPiece sp = p->backing_block(j); + vec[nvec].iov_base = const_cast(sp.data()); + vec[nvec].iov_len = sp.size(); + } + } + + ssize_t nw = 0; + nw = _ub_ring->UbrTrxWritev(vec, nvec); + if (UNLIKELY(nw == -1)) { + LOG(ERROR) << "Non-blocking send msg in failed, connection has been closed."; + errno = EPIPE; + } else if (UNLIKELY(nw == UBRING_RETRY)) { + errno = EAGAIN; + nw = -1; + } + if (nw <= 0) { + return nw; + } + size_t npop_all = nw; + for (size_t i = 0; i < ndata; ++i) { + npop_all -= from[i]->pop_front(npop_all); + if (npop_all == 0) { + break; + } + } + return nw; +} + +int UBShmEndpoint::AllocateClientResources(ubring::SHM* local_trx_shm, const char* shm_name) { + if (BAIDU_UNLIKELY(g_skip_ub_init)) { + // For UT + return 0; + } + + CHECK(_ub_ring == NULL); + // TODO: Pooling management + _ub_ring = new UBRing(); + + SocketOptions options; + options.user = this; + options.keytable_pool = _socket->_keytable_pool; + if (Socket::Create(options, &_cq_sid) < 0) { + 
PLOG(WARNING) << "Fail to create socket for cq"; + return -1; + } + int ret = _ub_ring->UbrAllocateLocalShm(local_trx_shm, shm_name); + if (ret != 0) { + return ret; + } + PollerRegisterEvent(CqSidOp::ADD, EPOLLIN); + return 0; +} + +int UBShmEndpoint::AllocateServerResources(ubring::SHM* remote_trx_shm, ubring::SHM* local_trx_shm) { + if (BAIDU_UNLIKELY(g_skip_ub_init)) { + // For UT + return 0; + } + + CHECK(_ub_ring == NULL); + // TODO: Pooling management + _ub_ring = new UBRing(); + + SocketOptions options; + options.user = this; + options.keytable_pool = _socket->_keytable_pool; + if (Socket::Create(options, &_cq_sid) < 0) { + PLOG(WARNING) << "Fail to create socket for cq"; + return -1; + } + int ret = _ub_ring->UbrAllocateServerShm(remote_trx_shm, local_trx_shm); + if (ret != 0) { + return ret; + } + // TODO mwj 是否应该在连接之后再进行轮询? + PollerRegisterEvent(CqSidOp::ADD, EPOLLIN); + return ret; +} + +void UBShmEndpoint::DeallocateResources() { + if (!_ub_ring) { + return; + } + PollerRegisterEvent(CqSidOp::REMOVE); + _ub_ring->UbrTrxClose(); + if (INVALID_SOCKET_ID != _cq_sid) { + SocketUniquePtr s; + if (Socket::Address(_cq_sid, &s) == 0) { + s->_user = NULL; + s->_fd = -1; + s->SetFailed(); + } + } +} + +void UBShmEndpoint::PollIn(UBShmEndpoint* ep, uint32_t epEvent) { + SocketUniquePtr s; + if (Socket::Address(ep->_socket->id(), &s) < 0) { + return; + } + auto* ub_transport = static_cast(s->_transport.get()); + CHECK(ep == ub_transport->_ub_ep); + + InputMessageClosure last_msg; + while (true) { + int ret = ep->_ub_ring->IsUbrTrxReadable(epEvent); + if (ret < 0) { + return; + } + + bool read_eof = false; + while (!read_eof) { + const int64_t received_us = butil::cpuwide_time_us(); + const int64_t base_realtime = butil::gettimeofday_us() - received_us; + + size_t once_read = s->_avg_msg_size * 16; + if (once_read < MIN_ONCE_READ) { + once_read = MIN_ONCE_READ; + } else if (once_read > MAX_ONCE_READ) { + once_read = MAX_ONCE_READ; + } + + const ssize_t nr = 
s->_read_buf.append_from_reader(ep->_ub_ring, once_read); + if (nr <= 0) { + if (0 == nr) { + // Set `read_eof' flag and proceed to feed EOF into `Protocol' + // (implied by m->_read_buf.empty), which may produce a new + // `InputMessageBase' under some protocols such as HTTP + LOG_IF(WARNING, FLAGS_log_connection_close) << *s << " was closed by remote side"; + read_eof = true; + } else if (errno != EAGAIN) { + if (errno == EINTR) { + continue; + } + const int saved_errno = errno; + PLOG(WARNING) << "Fail to read from " << *s; + s->SetFailed(saved_errno, "Fail to read from %s: %s", + s->description().c_str(), berror(saved_errno)); + return; + } else { + return; + } + } + + InputMessenger* messenger = static_cast(s->user()); + if (messenger->ProcessNewMessage(s.get(), nr, read_eof, received_us, + base_realtime, last_msg) < 0) { + return; + } + } + + if (read_eof) { + s->SetEOF(); + } + } +} + +void UBShmEndpoint::PollOut(UBShmEndpoint* ep, uint32_t epEvent) { + SocketUniquePtr s; + if (Socket::Address(ep->_socket->id(), &s) < 0) { + return; + } + auto* ub_transport = static_cast(s->_transport.get()); + CHECK(ep == ub_transport->_ub_ep); + if (ep->IsWritable()) { + ep->_socket->WakeAsEpollOut(); + } + +} + +int UBShmEndpoint::GlobalInitialize() { + g_ubring_resource_mutex = new butil::Mutex; + _poller_groups = std::vector(FLAGS_task_group_ntags); + return 0; +} + +// Stops and joins the pollers of every poller group created by +// GlobalInitialize(). +void UBShmEndpoint::GlobalRelease() { + // Iterate over the groups that actually exist instead of + // FLAGS_task_group_ntags: this can run before GlobalInitialize() has + // sized _poller_groups (e.g. on an early initialization failure), in + // which case indexing the empty vector would be out of bounds. + const int ngroups = static_cast<int>(_poller_groups.size()); + for (int i = 0; i < ngroups; ++i) { + PollingModeRelease(i); + } +} + +std::vector UBShmEndpoint::_poller_groups; + +int UBShmEndpoint::PollingModeInitialize(bthread_tag_t tag, + std::function callback, + std::function init_fn, + std::function release_fn) { + auto& group = _poller_groups[tag]; + auto& pollers = group.pollers; + auto& running = group.running; + bool expected = false; + if (!running.compare_exchange_strong(expected, true)) { + return 0; + } + struct FnArgs { + Poller* poller; + std::atomic* running; + }; + auto fn = [](void* p) -> void* { 
+ std::unique_ptr args(static_cast(p)); + auto poller = args->poller; + auto running = args->running; + std::unordered_set cq_sids; + CqSidOp op; + + if (poller->init_fn) { + poller->init_fn(); + } + while (running->load(std::memory_order_relaxed)) { + while (poller->op_queue.Dequeue(op)) { + if (op.type == CqSidOp::ADD) { + cq_sids.emplace(op); + } else if (op.type == CqSidOp::REMOVE) { + cq_sids.erase(op); + + } else if (op.type == CqSidOp::MOD) { + cq_sids.erase(op); + cq_sids.emplace(op); + } + } + for (auto cq : cq_sids) { + SocketUniquePtr s; + if (Socket::Address(cq.sid, &s) < 0) { + continue; + } + UBShmEndpoint* ep = static_cast(s->user()); + if (!ep) { + continue; + } + + if (cq.event & EPOLLIN) { + PollIn(ep, cq.event); + } + + if (cq.event & EPOLLOUT) { + PollOut(ep, cq.event); + } + } + if (poller->callback) { + poller->callback(); + } + if (FLAGS_ub_poller_yield) { + bthread_yield(); + } + } + + if (poller->release_fn) { + poller->release_fn(); + } + + return nullptr; + }; + for (int i = 0; i < FLAGS_ub_poller_num; ++i) { + auto args = new FnArgs{&pollers[i], &running}; + auto attr = FLAGS_ub_disable_bthread ? 
BTHREAD_ATTR_PTHREAD + : BTHREAD_ATTR_NORMAL; + attr.tag = tag; + bthread_attr_set_name(&attr, "UBPolling"); + pollers[i].callback = callback; + pollers[i].init_fn = init_fn; + pollers[i].release_fn = release_fn; + auto rc = bthread_start_background(&pollers[i].tid, &attr, fn, args); + if (rc != 0) { + LOG(ERROR) << "Fail to start ubring polling bthread"; + return -1; + } + } + return 0; +} + +void UBShmEndpoint::PollingModeRelease(bthread_tag_t tag) { + auto& group = _poller_groups[tag]; + auto& pollers = group.pollers; + auto& running = group.running; + running.store(false, std::memory_order_relaxed); + for (int i = 0; i < FLAGS_ub_poller_num; ++i) { + bthread_join(pollers[i].tid, NULL); + } +} + +void UBShmEndpoint::PollerRegisterEvent(CqSidOp::OpType op, uint32_t events) { + auto index = butil::fmix32(_cq_sid) % FLAGS_ub_poller_num; + auto& group = _poller_groups[bthread_self_tag()]; + auto& pollers = group.pollers; + auto& poller = pollers[index]; + if (INVALID_SOCKET_ID != _cq_sid) { + poller.op_queue.Enqueue(CqSidOp{_cq_sid, events, op}); + } +} + +} // namespace ubring +} // namespace brpc + +#endif // if BRPC_WITH_UBRING diff --git a/src/brpc/ubshm/ub_endpoint.h b/src/brpc/ubshm/ub_endpoint.h new file mode 100644 index 0000000000..d199f5881a --- /dev/null +++ b/src/brpc/ubshm/ub_endpoint.h @@ -0,0 +1,234 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef BRPC_UB_ENDPOINT_H +#define BRPC_UB_ENDPOINT_H + +#if BRPC_WITH_UBRING + +#include +#include +#include +#include +#include +#include "butil/atomicops.h" +#include "butil/iobuf.h" +#include "butil/macros.h" +#include "butil/containers/mpsc_queue.h" +#include "brpc/socket.h" +#include "brpc/ubshm/ub_helper.h" +#include "brpc/ubshm/ub_ring.h" +#include "brpc/ubshm/shm/shm_def.h" + + +namespace brpc { +class Socket; +namespace ubring { + +DECLARE_int32(ub_poller_num); +DECLARE_bool(ub_edisp_unsched); +DECLARE_bool(ub_disable_bthread); + +class UBConnect : public AppConnect { +public: + void StartConnect(const Socket* socket, + void (*done)(int err, void* data), void* data) override; + void StopConnect(Socket*) override; + struct RunGuard { + RunGuard(UBConnect* rc) { this_rc = rc; } + ~RunGuard() { if (this_rc) this_rc->Run(); } + UBConnect* this_rc; + }; + +private: + void Run(); + void (*_done)(int, void*){NULL}; + void* _data{NULL}; +}; + +class BAIDU_CACHELINE_ALIGNMENT UBShmEndpoint : public SocketUser { +friend class UBConnect; +friend class Socket; +public: + explicit UBShmEndpoint(Socket* s); + ~UBShmEndpoint() override; + + // Global initialization + // Return 0 if success, -1 if failed and errno set + static int GlobalInitialize(); + + static void GlobalRelease(); + + // Reset the endpoint (for next use) + void Reset(); + + // Cut data from the given IOBuf list and use UBRING to send + // Return bytes cut if success, -1 if failed and errno set + ssize_t CutFromIOBufList(butil::IOBuf** data, size_t ndata); + + // Whether the 
endpoint can send more data + bool IsWritable() const; + + void PollerRegisterEpollOut(bool pollin) { + uint32_t events = EPOLLOUT | EPOLLET; + if (pollin) { + PollerRegisterEvent(CqSidOp::MOD, events | EPOLLIN); + return; + } + PollerRegisterEvent(CqSidOp::ADD, events); + } + + void PollerUnRegisterEpollOut(bool pollin) { + uint32_t events = EPOLLIN | EPOLLET; + if (pollin) { + PollerRegisterEvent(CqSidOp::MOD, events); + return; + } + PollerRegisterEvent(CqSidOp::REMOVE); + } + + // Callback when there is new epollin event on TCP fd + static void OnNewDataFromTcp(Socket* m); + + // Initialize polling mode + static int PollingModeInitialize(bthread_tag_t tag, + std::function callback, + std::function init_fn, + std::function release_fn); + + static void PollingModeRelease(bthread_tag_t tag); + +private: + enum State { + UNINIT = 0x0, + C_ALLOC_SHM = 0x1, + C_HELLO_SEND = 0x2, + C_HELLO_WAIT = 0x3, + C_MAP_REMOTE_SHM = 0x4, + C_ACK_SEND = 0x5, + S_HELLO_WAIT = 0x11, + S_ALLOC_SHM = 0x12, + S_HELLO_SEND = 0x13, + S_ACK_WAIT = 0x14, + ESTABLISHED = 0x100, + FALLBACK_TCP = 0x200, + FAILED = 0x300 + }; + + // Process handshake at the client + static void* ProcessHandshakeAtClient(void* arg); + + // Process handshake at the server + static void* ProcessHandshakeAtServer(void* arg); + + // Allocate resources + // Return 0 if success, -1 if failed and errno set + int AllocateClientResources(SHM* local_trx_shm, const char* shm_name); + + int AllocateServerResources(SHM* remote_trx_shm, SHM* local_trx_shm); + + // Release resources + void DeallocateResources(); + + // Read at most len bytes from fd in _socket to data + // wait for _read_butex if encounter EAGAIN + // return -1 if encounter other errno (including EOF) + int ReadFromFd(void* data, size_t len); + + + // Write at most len bytes from data to fd in _socket + // wait for _epollout_butex if encounter EAGAIN + // return -1 if encounter other errno + int WriteToFd(void* data, size_t len); + + // Poll CQ and get the 
work completion + static void PollIn(UBShmEndpoint* ep, uint32_t epEvent); + + static void PollOut(UBShmEndpoint* ep, uint32_t epEvent); + + // Try to read data on TCP fd in _socket + inline void TryReadOnTcp(); + + // Not owner + Socket* _socket; + + State _state; + + // ub resource + ubring::UBRing* _ub_ring{nullptr}; + + SocketId _cq_sid; + + // butex for inform read events on TCP fd during handshake + butil::atomic *_read_butex; + + DISALLOW_COPY_AND_ASSIGN(UBShmEndpoint); + + struct CqSidOp { + enum OpType { + ADD, + REMOVE, + MOD + }; + SocketId sid; + uint32_t event; + OpType type; + }; + + struct CqSidOpHash { + std::size_t operator()(const CqSidOp& op) const { + return op.sid; + } + }; + + struct CqSidOpEqual { + bool operator()(const CqSidOp& lhs, const CqSidOp& rhs) const { + return lhs.sid == rhs.sid; + } + }; + + // Poller instance + struct BAIDU_CACHELINE_ALIGNMENT Poller { + bthread_t tid{INVALID_BTHREAD}; + butil::MPSCQueue> op_queue; + // Callback used for io_uring/spdk etc + std::function callback; + // Init and Destroy function + std::function init_fn; + std::function release_fn; + }; + // Poller group + struct BAIDU_CACHELINE_ALIGNMENT PollerGroup { + PollerGroup() : pollers(FLAGS_ub_poller_num), running(false) {} + std::vector pollers; + std::atomic running; + }; + static std::vector _poller_groups; + + void PollerRegisterEvent(CqSidOp::OpType op, uint32_t events = EPOLLET); +}; + +} // namespace ubring +} // namespace brpc + +#else // if BRPC_WITH_UBRING + +class UBShmEndpoint { }; + +#endif + +#endif //BRPC_UB_ENDPOINT_H diff --git a/src/brpc/ubshm/ub_helper.cpp b/src/brpc/ubshm/ub_helper.cpp new file mode 100644 index 0000000000..6c4c7a5fde --- /dev/null +++ b/src/brpc/ubshm/ub_helper.cpp @@ -0,0 +1,137 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#if BRPC_WITH_UBRING + +#include // dlopen +#include +#include +#include +#include +#include "butil/logging.h" +#include "brpc/socket.h" +#include "brpc/ubshm/ub_endpoint.h" +#include "brpc/ubshm/ub_helper.h" +#include "brpc/ubshm/ub_ring_manager.h" + +namespace brpc { +namespace ubring { + +void* g_handle_ub = NULL; +bool g_skip_ub_init = false; + +butil::atomic g_ub_available(false); + +void GlobalRelease() { + g_ub_available.store(false, butil::memory_order_release); + UBShmEndpoint::GlobalRelease(); + UBRingManager::UbrMgrFini(); + ShmMgrFini(); +} + +static inline void ExitWithError() { + GlobalRelease(); + exit(1); +} + +static void GlobalUBInitializeOrDieImpl() { + if (BAIDU_UNLIKELY(g_skip_ub_init)) { + // Just for UT + return; + } + + if (UBRingManager::UbrMgrInit()) { + PLOG(ERROR) << "Fail to UbrMgrInit"; + ExitWithError(); + } + + if (TimerInit()) { + PLOG(ERROR) << "Fail to TimerInit"; + ExitWithError(); + } + + if (ShmMgrInit()) { + PLOG(ERROR) << "Fail to ShmMgrInit"; + ExitWithError(); + } + + if (UBShmEndpoint::GlobalInitialize() < 0) { + LOG(ERROR) << "ubring_recv_block_type incorrect " + << "(valid value: default/large/huge)"; + ExitWithError(); + } + + g_ub_available.store(true, butil::memory_order_relaxed); +} + +static pthread_once_t initialize_UB_once = PTHREAD_ONCE_INIT; + +void GlobalUBInitializeOrDie() { + if (pthread_once(&initialize_UB_once, + 
GlobalUBInitializeOrDieImpl) != 0) { + LOG(FATAL) << "Fail to pthread_once GlobalUBInitializeOrDie"; + exit(1); + } +} + +bool IsUBAvailable() { + return g_ub_available.load(butil::memory_order_acquire); +} + +void GlobalDisableUb() { + if (g_ub_available.exchange(false, butil::memory_order_acquire)) { + LOG(FATAL) << "ub is disabled due to some unrecoverable problem"; + } +} + +bool SupportedByUB(std::string protocol) { + if (protocol.compare("baidu_std") == 0) { + return true; + } + return false; +} + +bool InitPollingModeWithTag(bthread_tag_t tag, + std::function callback, + std::function init_fn, + std::function release_fn) { + if (UBShmEndpoint::PollingModeInitialize(tag, callback, init_fn, + release_fn) == 0) { + return true; + } + return false; +} + +} // namespace ubring +} // namespace brpc + +#else + +#include +#include "butil/logging.h" + +namespace brpc { +namespace ubring { +void GlobalUBInitializeOrDie() { + LOG(ERROR) << "brpc is not compiled with ubring. To enable it, please refer to " + << "https://github.com/apache/brpc/blob/master/docs/en/ubring.md"; + exit(1); +} +} +} + +#endif // if BRPC_WITH_UBRING \ No newline at end of file diff --git a/src/brpc/ubshm/ub_helper.h b/src/brpc/ubshm/ub_helper.h new file mode 100644 index 0000000000..6ad9ebe3eb --- /dev/null +++ b/src/brpc/ubshm/ub_helper.h @@ -0,0 +1,62 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef BRPC_UB_HELPER_H +#define BRPC_UB_HELPER_H + +#if BRPC_WITH_UBRING + +#include +#include +#include "bthread/types.h" + +namespace brpc { +namespace ubring { + +void GlobalRelease(); + +void GlobalUBInitializeOrDie(); + +bool InitPollingModeWithTag(bthread_tag_t tag, + std::function callback = nullptr, + std::function init_fn = nullptr, + std::function release_fn = nullptr); + +bool IsUBAvailable(); + +void GlobalDisableUb(); + +bool SupportedByUB(std::string protocol); + +} // namespace ubring +} // namespace brpc + +#else + +namespace brpc { +namespace ubring { + +void GlobalRelease(); + +void GlobalUBInitializeOrDie(); + +} // namespace ubring +} // namespace brpc + +#endif // if BRPC_WITH_UBRING + +#endif // BRPC_UB_HELPER_H \ No newline at end of file diff --git a/src/brpc/ubshm/ub_ring.cpp b/src/brpc/ubshm/ub_ring.cpp new file mode 100644 index 0000000000..0ea64f07c1 --- /dev/null +++ b/src/brpc/ubshm/ub_ring.cpp @@ -0,0 +1,1083 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include "bthread/bthread.h" +#include "butil/logging.h" +#include "brpc/ubshm/ub_ring.h" +#include "brpc/ubshm/ub_ring_manager.h" +#include "brpc/ubshm/shm/shm_ipc.h" + +namespace brpc { +namespace ubring { +uint32_t g_sleepTime[UBR_TASK_STEP_NUM] = {0}; +#define TIME_COVERSION 1000 +DEFINE_int32(ub_disconnect_timeout, 5, "Ubshm disconnection timeout."); +DEFINE_int32(ub_connect_timeout, 1, "Ubshm connection timeout."); +DEFINE_int32(ub_hb_timer_interval, 5, "Heartbeat timer interval."); +DEFINE_int32(ub_hb_retry_cnt, 10, "Heartbeat retry times."); +DEFINE_int32(ub_event_queue_timer_interval, 100, "Interval of the disconnection timer."); + +UBRing::UBRing() +{} +UBRing::~UBRing() +{} + +RETURN_CODE UBRing::UbrTrxMapShm(SHM *localShm, SHM *remoteShm) +{ + RETURN_CODE rc = UbrTrxMapLocalShm(localShm); + if (UNLIKELY(rc != UBRING_OK)) { + LOG(ERROR) << "Trx map local shared memory failed."; + return rc; + } + rc = UbrTrxMapRemoteShm(remoteShm); + if (UNLIKELY(rc != UBRING_OK)) { + LOG(ERROR) << "Trx map remote shared memory failed."; + return rc; + } + return UBRING_OK; +} + +RETURN_CODE UBRing::UbrTrxClose() { + RETURN_CODE closeCheckRc = UbrTrxCloseCheck(_trx); + if (UNLIKELY(closeCheckRc != UBRING_OK)) { + if (closeCheckRc == UBRING_REENTRY) { + LOG(INFO) << "Trx close skipped, already closing, local name=" << _trx->localShm.name; + return UBRING_OK; + } + return UBRING_ERR; + } + if (_trx->ubrRx.remoteTxEventQ.addr != nullptr) { + ((UbrEventQMsg *)_trx->ubrRx.remoteTxEventQ.addr)->flag = 
UBR_STATE_CLOSING; + } + + uint32_t disconnectTimeout = FLAGS_ub_disconnect_timeout; + uint64_t startTime = GetCurNanoSeconds(); + + if (_trx->ubrTx.localTxEventQ.addr != nullptr && ((UbrEventQMsg *)_trx->ubrTx.localTxEventQ.addr)->flag == UBR_STATE_CONNECTED) { + ((UbrEventQMsg *)_trx->ubrTx.localTxEventQ.addr)->flag = UBR_STATE_CLOSED; + _trx->ubrTx.trxState = UBR_STATE_CLOSED; + } + + if (_trx->ubrTx.remoteRxEventQ.addr != nullptr) { + ((UbrEventQMsg *)_trx->ubrTx.remoteRxEventQ.addr)->flag = UBR_STATE_CLOSED; + } + while (_trx->ubrRx.localRxEventQ.addr != nullptr && ((UbrEventQMsg *)_trx->ubrRx.localRxEventQ.addr)->flag != UBR_STATE_CLOSED) { + UbrSetSleepTask(UBR_TASK_CLOSE); + if (HasTimedOut(startTime, disconnectTimeout) != UBRING_OK) { + LOG(WARNING) << "Local shm " << _trx->localShm.name + << " wait for the peer to close timed out, force cleanup."; + _trx->ubrRx.trxState = UBR_STATE_CLOSED; + // Force synchronous cleanup instead of relying on async timer + DeleteTimerSafe((uint32_t)_trx->timerFd); + DeleteTimerSafe((uint32_t)_trx->hbTimerFd); + if (_trx->ubrTx.remoteRxEventQ.addr != nullptr) { + ((UbrEventQMsg *)_trx->ubrTx.remoteRxEventQ.addr)->flag = UBR_STATE_CLOSED; + } + if (UNLIKELY(ShmRemoteFree(&_trx->remoteShm) != UBRING_OK)) { + LOG(WARNING) << "Force close, remote shm " << _trx->remoteShm.name << " free failed."; + } + if (UNLIKELY(UbrTrxFreeShm(_trx) != UBRING_OK)) { + LOG(WARNING) << "Force close, local shm " << _trx->localShm.name << " free failed."; + } + if (UNLIKELY(UBRingManager::ReleaseUbrTrxFromMgr(_trx) != UBRING_OK)) { + LOG(WARNING) << "Force close, release trx " << _trx->localShm.name << " failed."; + } + return UBRING_ERR_TIMEOUT; + } + bthread_usleep(1000); // 1ms, yield to other bthreads + } + _trx->ubrRx.trxState = UBR_STATE_CLOSED; + RETURN_CODE rc; + if (UNLIKELY((rc = ClearTrxResource(_trx, startTime, UBR_SEND_CLOSE)) != UBRING_OK)) { + if (rc == UBRING_REENTRY) { + LOG(INFO) << "Trx close, peer is closing, trx local name=" 
<< _trx->localShm.name; + return UBRING_OK; + } + LOG(ERROR) << "Trx close, clear trx resource failed, trx local name=" << _trx->localShm.name; + return UBRING_ERR; + } + // Unlink local shm name immediately so process exit does not leave visible leftovers. + RETURN_CODE unlinkRc = ShmFree(&_trx->localShm); + if (unlinkRc != UBRING_OK && unlinkRc != SHM_ERR_NOT_FOUND && unlinkRc != SHM_ERR_RESOURCE_ATTACHED) { + LOG(WARNING) << "Trx close, unlink local shm failed, trx local name=" << _trx->localShm.name + << ", rc=" << unlinkRc; + } + return UBRING_OK; +} + +RETURN_CODE UBRing::UbrAddCloseTimer() { + if (UNLIKELY(_trx == NULL)) { + LOG(ERROR) << "Trx add close timer failed, trx is null."; + return UBRING_ERR; + } + + uint32_t eventQTimerInterval = FLAGS_ub_event_queue_timer_interval * TIME_COVERSION; + itimerspec timeSpec = { + .it_interval = {.tv_sec = 0, .tv_nsec = eventQTimerInterval}, + .it_value = {.tv_sec = 0, .tv_nsec = 1} + }; + int timerFd = TimerStart(&timeSpec, UbrTrxCloseCallback, (void*)_trx); + if (UNLIKELY(timerFd == -1)) { + LOG(ERROR) << "Start ubr close timer failed, trx local name=" << _trx->localShm.name; + return UBRING_ERR; + } + _trx->timerFd = timerFd; + return UBRING_OK; +} + +RETURN_CODE UBRing::UbrAddTimer() { + if (UNLIKELY(UbrAddCloseTimer() != UBRING_OK)) { + LOG(ERROR) << "Ubr " << _trx->localShm.name << " add closed timer failed."; + return UBRING_ERR; + } + + if (UNLIKELY(UbrAddHBTimer() != UBRING_OK)) { + DeleteTimerSafe((uint32_t)_trx->timerFd); + LOG(ERROR) << "Ubr " << _trx->localShm.name << " add heartbeat timer failed."; + return UBRING_ERR; + } + return UBRING_OK; +} + +void* UBRing::UbrTrxCloseCallback(void* args) { + auto* trx = (UbrTrx*) args; + if (UNLIKELY(UBRing::UbrTrxCallbackCheck(trx) != UBRING_OK)) { + return nullptr; + } + + auto* localRxEventQ = (UbrEventQMsg *)trx->ubrRx.localRxEventQ.addr; + auto* localTxEventQ = (UbrEventQMsg *)trx->ubrTx.localTxEventQ.addr; + if (localRxEventQ->flag != UBR_STATE_CLOSED || 
localTxEventQ->flag == UBR_STATE_CLOSED) { + return nullptr; + } + trx->ubrRx.trxState = UBR_STATE_CLOSED; + int fd = (int)trx->localShm.fd; + do { + if (ATOMIC_LOAD(trx->closeCnt) == 0) { + break; + } + ATOMIC_SUB(trx->closeCnt, 1); + + uint64_t startTime = GetCurNanoSeconds(); + + if (localTxEventQ->flag == UBR_STATE_CONNECTED || ATOMIC_LOAD(trx->closeCnt) == 1) { + localTxEventQ->flag = UBR_STATE_CLOSED; + trx->ubrTx.trxState = UBR_STATE_CLOSED; + } + UbrEventQMsg* remoteRxEventQ = (UbrEventQMsg *)trx->ubrTx.remoteRxEventQ.addr; + if (remoteRxEventQ == nullptr) { + LOG(ERROR) << "Trx close callback failed, " << trx->localShm.name << " remoteRxEventQ is NULL."; + break; + } + remoteRxEventQ->flag = UBR_STATE_CLOSED; + RETURN_CODE clearRc = ClearTrxResource(trx, startTime, UBR_CALL_BACK_CLOSE, 1); + if (UNLIKELY(clearRc != UBRING_OK && clearRc != UBRING_REENTRY)) { + LOG(ERROR) << "Trx close callback failed, " << trx->localShm.name << " clear trx resource failed."; + break; + } + } while (0); + return nullptr; +} + +RETURN_CODE UBRing::UbrAddHBTimer() { + if (UNLIKELY(_trx == NULL)) { + LOG(ERROR) << "Trx add heartbeat timer failed, trx is null."; + return UBRING_ERR; + } + + itimerspec timeSpec = { + .it_interval = {.tv_sec = FLAGS_ub_hb_timer_interval, .tv_nsec = 0}, + .it_value = {.tv_sec = 0, .tv_nsec = 1} + }; + int timerFd = TimerStart(&timeSpec, UbrTrxHBCallback, (void*)_trx); + if (UNLIKELY(timerFd == -1)) { + LOG(ERROR) << "Start ubr heartbeat timer failed."; + return UBRING_ERR; + } + _trx->hbTimerFd = timerFd; + return UBRING_OK; +} + +RETURN_CODE UBRing::UbrPassiveClearTrx(UbrTrx *trx, int fd, PASSIVE_DISC_TYPE type) { + RETURN_CODE passiveCloseCheckRc = UbrTrxCloseCheck(trx); + if (UNLIKELY(passiveCloseCheckRc != UBRING_OK)) { + if (passiveCloseCheckRc == UBRING_REENTRY) { + LOG(INFO) << "Passive close skipped, active close in progress, name=" << trx->localShm.name; + uint64_t startTime = GetCurNanoSeconds(); + return ClearTrxResource(trx, startTime, 
UBR_CALL_BACK_CLOSE); + } + return UBRING_ERR; + } + trx->ubrTx.trxState = UBR_STATE_CLOSED; + trx->ubrRx.trxState = UBR_STATE_CLOSED; + DeleteTimerSafe((uint32_t)trx->timerFd); + const char *typeName = NULL; + if (type == UBR_HEARTBEAT) { + DeleteTimer((uint32_t)trx->hbTimerFd); + typeName = "Trx heartbeat"; + } else if (type == UBR_UB_EVENT) { + DeleteTimerSafe((uint32_t)trx->hbTimerFd); + typeName = "Ub event callback"; + } + bthread_usleep(FLAGS_ub_flying_io_timeout * 1000000LL); // yield-friendly sleep + + int rc = ShmLocalFree(&trx->remoteShm); + if (rc != UBRING_OK) { + LOG(ERROR) << typeName << ", delete remote shm failed. ret=" << rc; + } + rc = ShmLocalFree(&trx->localShm); + if (rc != UBRING_OK) { + LOG(ERROR) << typeName << ", delete local shm failed. ret=" << rc; + } + + UBRingManager::ReleaseUbrTrxFromMgr(trx); + return UBRING_OK; +} + +void* UBRing::UbrTrxHBCallback(void* args) { + auto* trx = (UbrTrx*) args; + if (UNLIKELY(UbrTrxCallbackCheck(trx) != UBRING_OK)) { + return NULL; + } + + auto* localDataStatus = (UbrDataStatusQMsg *)trx->ubrTx.localDataStatusQ.addr; + auto* remoteDataStatus = (UbrDataStatusQMsg *)trx->ubrRx.remoteDataStatusQ.addr; + if (UNLIKELY(localDataStatus == NULL || remoteDataStatus == NULL)) { + LOG(ERROR) << "Heartbeat error, datastatus is NULL."; + return NULL; + } + + if (trx->ubrTx.trxState != UBR_STATE_CONNECTED || trx->ubrRx.trxState != UBR_STATE_CONNECTED) { + LOG_EVERY_SECOND(INFO) << "Heartbeat cannot be started, wait connected state."; + return NULL; + } + + remoteDataStatus->heartBeat = 1; + if (localDataStatus->heartBeat == 1) { + localDataStatus->heartBeat = 0; + trx->ubrTx.hbRetryCnt = 0; + return NULL; + } + + ++trx->ubrTx.hbRetryCnt; + if (trx->ubrTx.hbRetryCnt <= FLAGS_ub_hb_retry_cnt) { + return NULL; + } + + int fd = (int)trx->localShm.fd; + LOG(INFO) << "Hlc heartbeat, start to clear trx resource. 
hbTimerFd=" << fd << ", shmName=" << trx->localShm.name; + UbrPassiveClearTrx(trx, fd, UBR_HEARTBEAT); + LOG(INFO) << "Hlc heartbeat clear trx resource finish."; + return NULL; +} + +RETURN_CODE UBRing::UbrAddAsynClearTimer(UbrTrx *trx) { + if (UNLIKELY(trx == NULL)) { + LOG(ERROR) << "Trx add close timer failed, trx is null."; + return UBRING_ERR; + } + + if (trx->clearTimerFd > 0) { + return UBRING_OK; + } + + itimerspec timeSpec = { + .it_interval = {.tv_sec = 0, .tv_nsec = 0}, + .it_value = {.tv_sec = FLAGS_ub_flying_io_timeout, .tv_nsec = 0} + }; + + int timerFd = TimerStart(&timeSpec, UbrAsynClearCallback, (void*)trx); + if (UNLIKELY(timerFd == -1)) { + LOG(ERROR) << "Start ubr close timer failed, trx name=%s.", trx->localShm.name; + return UBRING_ERR; + } + trx->clearTimerFd = timerFd; + return UBRING_OK; +} + +void *UBRing::UbrAsynClearCallback(void *args) +{ + auto* trx = (UbrTrx*) args; + if (UNLIKELY(trx == NULL)) { + LOG(ERROR) << "Trx close, trx is null."; + return NULL; + } + + if (UNLIKELY(ShmRemoteFree(&trx->remoteShm) != UBRING_OK)) { + LOG(ERROR) << "Trx close, remote shm " << trx->remoteShm.name << " free failed."; + } + + if (UNLIKELY(UbrTrxFreeShm(trx) != UBRING_OK)) { + LOG(ERROR) << "Trx close, wait for local shm " << trx->localShm.name << " free fail."; + } + + if (UNLIKELY(UBRingManager::ReleaseUbrTrxFromMgr(trx) != UBRING_OK)) { + LOG(ERROR) << "Trx close, release shm " << trx->localShm.name << " trx failed."; + } + return NULL; +} + +int UBRing::UbrTrxSend(const void *buf, uint32_t bufLen) +{ + if (UNLIKELY(CheckTrxSendPreCheck(_trx) != UBRING_OK)) { + return UBRING_ERR; + } + // 1.2 计算空间 + auto *dataStatusMsg = (UbrDataStatusQMsg *)_trx->ubrTx.localDataStatusQ.addr; + auto *dataMsg = (UbrMsgFormat *)_trx->ubrTx.remoteDataQ.addr; + uint32_t cap = _trx->ubrTx.capacity; + uint32_t tail = dataStatusMsg->tail; + uint32_t remainChunkNum = + (_trx->ubrTx.writePos > tail) ? 
(tail + cap - _trx->ubrTx.writePos) : (tail - _trx->ubrTx.writePos); + uint32_t needMsgChunkNum = CalcUbrMsgChunkCnt(bufLen); + if (remainChunkNum < needMsgChunkNum) { + return UBRING_RETRY; + } + UbrMsgFormat *msg = &(_trx->ubrTx.localMsgSpace); + uint32_t totalSendLen = 0; + uint32_t remainBufLen = bufLen; + uint8_t isLastPkt = 0; + _trx->ubrTx.outIoId++; + ((UbrEventQMsg *)_trx->ubrTx.remoteRxEventQ.addr)->ioId = _trx->ubrTx.outIoId; + while (remainBufLen > 0) { + isLastPkt = (uint8_t)(remainBufLen <= UBR_MSG_PAYLOAD_LEN); + msg->header[UBR_MSG_FLAG_INDEX] = isLastPkt ? UBR_MSG_CHUNK_EOF : UBR_MSG_CHUNK_EXIST; + msg->header[UBR_MSG_LEN_INDEX] = isLastPkt ? (uint8_t)remainBufLen : UBR_MSG_PAYLOAD_LEN; + msg->header[UBR_MSG_CUR_INDEX] = 0; + memcpy(msg->payload.inner, (const uint8_t *)buf + totalSendLen, msg->header[UBR_MSG_LEN_INDEX]); + Copy64Byte((int8_t *)&dataMsg[_trx->ubrTx.writePos], (int8_t *)msg); + _trx->ubrTx.writePos = (_trx->ubrTx.writePos + 1) % cap; + totalSendLen += msg->header[UBR_MSG_LEN_INDEX]; + remainBufLen -= msg->header[UBR_MSG_LEN_INDEX]; + } + return (int)totalSendLen; +} + +int UBRing::UbrTrxRecv(void *buf, uint32_t bufLen) +{ + RETURN_CODE rc = UBRING_OK; + if (UNLIKELY((rc = CheckTrxRecvParam(_trx, buf, bufLen)) != UBRING_OK)) { + return (rc == UBR_NOT_CONNECTED) ? 0 : rc; + } + UbrMsgFormat *dataMsg = (UbrMsgFormat *)_trx->ubrRx.localDataQ.addr; + uint32_t readPosEnd = _trx->ubrRx.readPos; + uint8_t flag = dataMsg[readPosEnd].header[UBR_MSG_FLAG_INDEX]; + if (flag == UBR_MSG_CHUNK_NONE) { + return UBRING_RETRY; + } + return UbrTrxRecvBlockMode(static_cast(buf), bufLen); +} + +int UBRing::UbrTrxRecvBlockMode(uint8_t *dest, uint32_t bufLen) +{ + RETURN_CODE rc = UBRING_OK; + if (UNLIKELY((rc = CheckTrxRecvParam(_trx, dest, bufLen)) != UBRING_OK)) { + return (rc == UBR_NOT_CONNECTED) ? 
0 : rc; + } + + int32_t totalCopied = 0; + int32_t remainingLen = (int32_t)bufLen; + bool notEofEncountered = true; + + UbrRx *ubrRx = &_trx->ubrRx; + UbrMsgFormat *dataMsg = (UbrMsgFormat *)ubrRx->localDataQ.addr; + bool needUpdateEpollEofPos = ubrRx->readPos == ubrRx->epEofPos; + + while (notEofEncountered && remainingLen > 0) { + if (UNLIKELY(CheckTrxRecvPreCheck(_trx) != UBRING_OK)) { + return UBRING_ERR; + } + UbrMsgFormat *currentChunk = &dataMsg[ubrRx->readPos]; + uint8_t flag = currentChunk->header[UBR_MSG_FLAG_INDEX]; + if (flag == UBR_MSG_CHUNK_NONE) { + if (totalCopied > 0) { + break; + } + errno = EAGAIN; + return -1; + } + if (flag == UBR_MSG_CHUNK_EOF) { + notEofEncountered = false; + } + uint8_t chunkMsgLen = currentChunk->header[UBR_MSG_LEN_INDEX]; + uint8_t curIndex = currentChunk->header[UBR_MSG_CUR_INDEX]; + uint8_t availableData = chunkMsgLen - curIndex; + + int32_t copyLen = (remainingLen < availableData) ? remainingLen : availableData; + memcpy(dest + totalCopied, dataMsg[ubrRx->readPos].payload.inner + curIndex, (size_t)copyLen); + totalCopied += copyLen; + remainingLen -= copyLen; + currentChunk->header[UBR_MSG_CUR_INDEX] += (uint8_t)copyLen; + if (LIKELY(currentChunk->header[UBR_MSG_CUR_INDEX] == chunkMsgLen)) { + currentChunk->header[UBR_MSG_FLAG_INDEX] = UBR_MSG_CHUNK_NONE; + UpdateDataQTail(_trx); + ubrRx->readPos = (ubrRx->readPos + 1) % ubrRx->capacity; + } + } + if (needUpdateEpollEofPos) { + ubrRx->epEofPos = ubrRx->readPos; + } + return (int)totalCopied; +} + +ssize_t UBRing::UbrTrxWritev(const struct iovec *iov, int iovcnt) +{ + if (UNLIKELY(CheckTrxSendPreCheck(_trx) != UBRING_OK)) { + return UBRING_ERR; + } + + size_t bufLen = 0; + for (int i = 0; i < iovcnt; i++) { + bufLen += iov[i].iov_len; + } + RETURN_CODE rc = WritevHasEnoughSpace(bufLen); + if (rc != UBRING_OK) { + return rc; + } + + UbrMsgFormat *dataMsg = (UbrMsgFormat *)_trx->ubrTx.remoteDataQ.addr; + UbrMsgFormat *msg = &(_trx->ubrTx.localMsgSpace); + int curIov = 0; + 
size_t curIovPos = 0; + ssize_t totalSendLen = 0; + size_t pktRemainN = 0; + size_t iovRemain = 0; + size_t fulled = 0; + uint8_t isLastPkt = 0; + uint8_t curPktLen = 0; + _trx->ubrTx.outIoId++; + ((UbrEventQMsg *)_trx->ubrTx.remoteRxEventQ.addr)->ioId = _trx->ubrTx.outIoId; + while (bufLen > 0) { + isLastPkt = (uint8_t)(bufLen <= UBR_MSG_PAYLOAD_LEN); + curPktLen = isLastPkt ? (uint8_t)bufLen : UBR_MSG_PAYLOAD_LEN; + msg->header[UBR_MSG_FLAG_INDEX] = isLastPkt ? UBR_MSG_CHUNK_EOF : UBR_MSG_CHUNK_EXIST; + msg->header[UBR_MSG_LEN_INDEX] = curPktLen; + msg->header[UBR_MSG_CUR_INDEX] = 0; + pktRemainN = curPktLen; + while (curIov < iovcnt && pktRemainN > 0) { + iovRemain = (iov[curIov].iov_len - curIovPos); + fulled = iovRemain > pktRemainN ? pktRemainN : iovRemain; + memcpy((msg->payload.inner + (curPktLen - (uint8_t)pktRemainN)), + (uint8_t *)(iov[curIov].iov_base) + curIovPos, + fulled); + pktRemainN -= fulled; + curIovPos += fulled; + if (curIovPos == iov[curIov].iov_len) { + curIov++; + curIovPos = 0; + } + } + + Copy64Byte((int8_t *)&dataMsg[_trx->ubrTx.writePos], (int8_t *)msg); + _trx->ubrTx.writePos = (_trx->ubrTx.writePos + 1) % _trx->ubrTx.capacity; + totalSendLen += (ssize_t)curPktLen; + bufLen -= (int)curPktLen; + } + return totalSendLen; +} + +ssize_t UBRing::UbrTrxReadv(const struct iovec *iov, int iovcnt) +{ + RETURN_CODE rc = UBRING_OK; + if (UNLIKELY((rc = CheckTrxRecvParam(_trx, iov, (uint32_t)iovcnt)) != UBRING_OK)) { + return (rc == UBR_NOT_CONNECTED) ? 
0 : rc; + } + UbrMsgFormat *dataMsg = (UbrMsgFormat *)_trx->ubrRx.localDataQ.addr; + uint32_t readPosEnd = _trx->ubrRx.readPos; + uint8_t flag = dataMsg[readPosEnd].header[UBR_MSG_FLAG_INDEX]; + if (flag == UBR_MSG_CHUNK_NONE) { + errno = EAGAIN; + return -1; + } + ssize_t nr = UbrTrxReadvBlockMode(iov, iovcnt); + if (UNLIKELY(nr == -1)) { + LOG(ERROR) << "Non-blocking readv msg in failed, connection has been closed."; + errno = EPIPE; + return -1; + } + return nr; +} + +ssize_t UBRing::UbrTrxReadvBlockMode(const struct iovec *iov, int iovcnt) +{ + RETURN_CODE rc = UBRING_OK; + if (UNLIKELY((rc = CheckTrxRecvParam(_trx, iov, (uint32_t)iovcnt)) != UBRING_OK)) { + return (rc == UBR_NOT_CONNECTED) ? 0 : rc; + } + + size_t remainBufLen = 0; + for (int i = 0; i < iovcnt; i++) { + remainBufLen += iov[i].iov_len; + } + + bool needUpdateEpollEofPos = _trx->ubrRx.readPos == _trx->ubrRx.epEofPos; + ssize_t totalRecvLen = StartReadv(_trx, iov, iovcnt, remainBufLen); + + if (needUpdateEpollEofPos) { + _trx->ubrRx.epEofPos = _trx->ubrRx.readPos; + } + return totalRecvLen; +} + +RETURN_CODE UBRing::IsUbrTrxReadable(uint32_t epEvent) +{ + if (UNLIKELY(_trx == NULL)) { + LOG(ERROR) << "The trx to be checked is NULL."; + return UBRING_ERR; + } + if (UNLIKELY(_trx->localShm.addr == NULL)) { + LOG(ERROR) << "The trx localShm to be checked is NULL."; + return UBRING_ERR; + } + if (UNLIKELY(_trx->ubrTx.trxState != UBR_STATE_CONNECTED)) { + // TODO mwj 这几块的日志是否需要删除 + // LOG(ERROR) << "The trx is not connected state."; + return UBRING_ERR; + } + + uint64_t ioId = ((UbrEventQMsg *)_trx->ubrRx.localRxEventQ.addr)->ioId; + if ((epEvent & EPOLLET) && ioId == _trx->ubrRx.inIoId) { + return MPA_MUXER_NOT_READY; + } + + uint32_t readPosEnd = _trx->ubrRx.readPos; + if (epEvent & EPOLLET) { + readPosEnd = _trx->ubrRx.epEofPos; + } + + UbrMsgFormat *dataMsg = (UbrMsgFormat *)_trx->ubrRx.localDataQ.addr; + uint8_t flag = dataMsg[readPosEnd].header[UBR_MSG_FLAG_INDEX]; + if (flag == 
UBR_MSG_CHUNK_NONE) { + return MPA_MUXER_NOT_READY; + } + if (epEvent & EPOLLET) { + _trx->ubrRx.inIoId = ioId; + } + return UBRING_OK; +} + +RETURN_CODE UBRing::IsUbrTrxWriteable(uint32_t epEvent) +{ + if (UNLIKELY(_trx == NULL)) { + LOG(ERROR) << "The trx to be checked is NULL."; + return UBRING_ERR; + } + if (UNLIKELY(_trx->localShm.addr == NULL)) { + LOG(ERROR) << "The trx localShm to be checked is NULL."; + return UBRING_ERR; + } + if (UNLIKELY((UbrEventQMsg *)_trx->ubrTx.localTxEventQ.addr == NULL)) { + LOG(ERROR) << "The trx localTxEventQ addr is NULL."; + return UBRING_ERR; + } + if (UNLIKELY((UbrEventQMsg *)_trx->ubrTx.localDataStatusQ.addr == NULL)) { + LOG(ERROR) << "The trx localDataStatusQ addr is NULL."; + return UBRING_ERR; + } + + if (UNLIKELY(_trx->ubrTx.trxState != UBR_STATE_CONNECTED)) { + LOG(ERROR) << "The trx is not connected state."; + return UBRING_ERR; + } + + UbrDataStatusQMsg *dataStatusMsg = (UbrDataStatusQMsg *)_trx->ubrTx.localDataStatusQ.addr; + uint32_t cap = _trx->ubrTx.capacity; + uint32_t tail = dataStatusMsg->tail; + uint32_t remainChunkNum = + (_trx->ubrTx.writePos > tail) ? 
(tail + cap - _trx->ubrTx.writePos) : (tail - _trx->ubrTx.writePos); + if (remainChunkNum == 0) { + _trx->ubrTx.epLastCap = remainChunkNum; + return MPA_MUXER_NOT_READY; + } + + if ((epEvent & EPOLLET) && (_trx->ubrTx.epLastCap >= remainChunkNum)) { + _trx->ubrTx.epLastCap = remainChunkNum; + return MPA_MUXER_NOT_READY; + } + _trx->ubrTx.epLastCap = remainChunkNum; + return UBRING_OK; +} + +RETURN_CODE UBRing::UbrSetTimeout(UbrTaskStep taskType, int timeout) +{ + if (taskType >= UBR_TASK_STEP_NUM || timeout < 0) { + LOG(ERROR) << "Set timeout failed, invalid task type."; + return UBRING_ERR; + } + + g_sleepTime[taskType] = (uint32_t)timeout; + LOG(INFO) << "Set timeout success, taskType=" << taskType << ", timeout=" << timeout; + return UBRING_OK; +} + +RETURN_CODE UBRing::UbrTrxFreeShm(UbrTrx *trx) +{ + if (trx == NULL) { + LOG(ERROR) << "Trx is NULL."; + return UBRING_ERR; + } + + RETURN_CODE rc = UBRING_OK; + rc = ShmMunmap(&trx->localShm); + if (UNLIKELY(rc != UBRING_OK)) { + LOG(ERROR) << "Trx close, local unmap " << trx->localShm.name << " shm fail."; + return UBRING_ERR; + } + + rc = ShmFree(&trx->localShm); + if (UNLIKELY(rc != UBRING_OK)) { + if (rc != SHM_ERR_RESOURCE_ATTACHED && rc != SHM_ERR_NOT_FOUND) { + LOG(ERROR) << "Wait for " << trx->localShm.name << " local shm free fail."; + return UBRING_ERR; + } + LOG(INFO) << "Local shm " << trx->localShm.name << " already freed, continue to free remote shm."; + } + + RETURN_CODE remoteRc = UBRING_OK; + if (trx->remoteShm.addr != NULL) { + remoteRc = IpcShmRemoteFree(&trx->remoteShm); + } + if (remoteRc != UBRING_OK) { + LOG(WARNING) << "Free remote shm " << trx->remoteShm.name << " failed, rc=" << remoteRc; + } + + return UBRING_OK; +} + +void UBRing::PreWriteAddr(uint8_t *addr, size_t len) +{ + if (addr == NULL) { + return; + } + + size_t i = 0; + while (i < len) { + if (i + sizeof(uint64_t) <= len) { + *(uint64_t *)(addr + i) = (uint64_t)0; + i += sizeof(uint64_t); + } else if (i + sizeof(uint32_t) < len) 
{ + *(uint32_t *)(addr + i) = (uint32_t)0; + i += sizeof(uint32_t); + } else if (i + sizeof(uint16_t) < len) { + *(uint16_t *)(addr + i) = (uint16_t)0; + i += sizeof(uint16_t); + } else { + *(addr + i) = (uint8_t)0; + i += sizeof(uint8_t); + } + } +} + +void UBRing::PrewriteUbrTx(UbrTx *tx) +{ + if (tx == NULL) { + return; + } + PreWriteAddr(tx->remoteDataQ.addr, tx->capacity * sizeof(UbrMsgFormat)); +} + +void UBRing::PrewriteUbrRx(UbrRx *rx) +{ + if (rx == NULL) { + return; + } + PreWriteAddr(rx->localDataQ.addr, rx->capacity * sizeof(UbrMsgFormat)); +} + +RETURN_CODE UBRing::UbrTrxMapLocalShm(SHM *localShm) +{ + if (UNLIKELY(_trx == NULL)) { + LOG(ERROR) << "Trx map Shared memory failed, trx is null."; + return UBRING_ERR; + } + if (UNLIKELY(localShm == NULL || localShm->addr == NULL)) { + LOG(ERROR) << "Trx map Shared memory failed, localShm is null or addr is NULL."; + return UBRING_ERR; + } + _trx->localShm = *localShm; + _trx->ubrTx.localTxEventQ.addr = localShm->addr + TX_EVENTQ_ADDR_OFFSET; + _trx->ubrTx.localTxEventQ.len = UBR_EVENTQ_LEN; + _trx->ubrRx.localRxEventQ.addr = localShm->addr + RX_EVENTQ_ADDR_OFFSET; + _trx->ubrRx.localRxEventQ.len = UBR_EVENTQ_LEN; + _trx->ubrTx.localDataStatusQ.addr = localShm->addr + DATASTATUSQ_ADDR_OFFSET; + _trx->ubrTx.localDataStatusQ.len = UBR_DATASTATUSQ_LEN; + size_t addrAlignedOffset = Aligned64Offset(localShm->addr + DATAQ_ADDR_OFFSET); + _trx->ubrRx.localDataQ.addr = localShm->addr + DATAQ_ADDR_OFFSET + addrAlignedOffset; + _trx->ubrRx.localDataQ.len = localShm->len - DATAQ_ADDR_OFFSET - addrAlignedOffset; + return UBRING_OK; +} + +RETURN_CODE UBRing::UbrTrxMapRemoteShm(SHM *remoteShm) +{ + if (UNLIKELY(_trx == NULL)) { + LOG(ERROR) << "Trx map Shared memory failed, trx is null."; + return UBRING_ERR; + } + if (UNLIKELY(remoteShm == NULL || remoteShm->addr == NULL)) { + LOG(ERROR) << "Trx map Shared memory failed, remoteShm is null or addr is NULL."; + return UBRING_ERR; + } + _trx->remoteShm = *remoteShm; + 
_trx->ubrRx.remoteTxEventQ.addr = remoteShm->addr + TX_EVENTQ_ADDR_OFFSET; + _trx->ubrRx.remoteTxEventQ.len = UBR_EVENTQ_LEN; + _trx->ubrTx.remoteRxEventQ.addr = remoteShm->addr + RX_EVENTQ_ADDR_OFFSET; + _trx->ubrTx.remoteRxEventQ.len = UBR_EVENTQ_LEN; + _trx->ubrRx.remoteDataStatusQ.addr = remoteShm->addr + DATASTATUSQ_ADDR_OFFSET; + _trx->ubrRx.remoteDataStatusQ.len = UBR_DATASTATUSQ_LEN; + size_t addrAlignedOffset = Aligned64Offset(remoteShm->addr + DATAQ_ADDR_OFFSET); + _trx->ubrTx.remoteDataQ.addr = remoteShm->addr + DATAQ_ADDR_OFFSET + addrAlignedOffset; + _trx->ubrTx.remoteDataQ.len = remoteShm->len - DATAQ_ADDR_OFFSET - addrAlignedOffset; + return UBRING_OK; +} + +RETURN_CODE UBRing::UbrServerTrxInit(SHM *localShm, SHM *remoteShm) +{ + RETURN_CODE rc = UbrTrxMapShm(localShm, remoteShm); + if (UNLIKELY(rc != UBRING_OK)) { + LOG(ERROR) <<"Trx map shared memory failed."; + return rc; + } + + uint32_t localDataMsgCap = (uint32_t)(_trx->ubrRx.localDataQ.len / UBR_MSG_LEN); + uint32_t remoteDataMsgCap = (uint32_t)(_trx->ubrTx.remoteDataQ.len / UBR_MSG_LEN); + _trx->ubrRx.capacity = localDataMsgCap; + _trx->ubrTx.capacity = remoteDataMsgCap; + rc = UBRingManager::GetUbrDealMsgMaxCnt(_trx->ubrRx.capacity, &_trx->ubrRx.dealMsgMaxCnt); + if (UNLIKELY(rc != UBRING_OK)) { + LOG(ERROR) << "Get ubring deal msg max cnt."; + return rc; + } + PrewriteUbrRx(&_trx->ubrRx); + PrewriteUbrTx(&_trx->ubrTx); + + ((UbrDataStatusQMsg *)(_trx->ubrTx.localDataStatusQ.addr))->tail = remoteDataMsgCap - 1; + ((UbrDataStatusQMsg *)(_trx->ubrRx.remoteDataStatusQ.addr))->tail = localDataMsgCap - 1; + + if (UNLIKELY(UbrAddTimer() != UBRING_OK)) { + LOG(ERROR) << "Ubr add timer failed, localName=" << localShm->name; + return UBRING_ERR; + } + + ((UbrDataStatusQMsg *)(_trx->ubrTx.localDataStatusQ.addr))->timeout = FLAGS_ub_connect_timeout; + ((UbrDataStatusQMsg *)(_trx->ubrRx.remoteDataStatusQ.addr))->timeout = FLAGS_ub_connect_timeout; + + ((UbrEventQMsg 
*)_trx->ubrTx.remoteRxEventQ.addr)->flag = UBR_STATE_CONNECTED;
+    ((UbrEventQMsg *)_trx->ubrRx.localRxEventQ.addr)->flag = UBR_STATE_CONNECTED;
+    _trx->ubrTx.trxState = UBR_STATE_CONNECTED;
+    _trx->ubrRx.trxState = UBR_STATE_CONNECTED;
+    return UBRING_OK;
+}
+
+// Obtains the remote and local shared memory, acquires a trx slot from
+// UBRingManager and runs UbrServerTrxInit on it.
+// Returns 0 on success and -1 on failure. On every failure path the remote shm
+// obtained by ShmRemoteMalloc is released again before returning.
+int UBRing::UbrAllocateServerShm(SHM* remote_trx_shm, SHM* local_trx_shm) {
+    UbrSetSleepTask(UBR_TASK_ACCEPT_MAP_FRONT);
+    if (UNLIKELY((ShmRemoteMalloc(remote_trx_shm)) != UBRING_OK)) {
+        LOG(ERROR) << "Trx apply remote shared memory failed.";
+        return -1;
+    }
+
+    if (UNLIKELY((ShmLocalCalloc(local_trx_shm)) != UBRING_OK)) {
+        LOG(ERROR) << "Trx apply local shared memory failed.";
+        // The remote shm was already obtained above; release it here as the
+        // later failure paths do, otherwise it is leaked.
+        ShmRemoteFree(remote_trx_shm);
+        return -1;
+    }
+
+    UbrTrx **ubrTrxPtr = &_trx;
+    if (UNLIKELY((UBRingManager::AcquireUbrTrxFromMgr(ubrTrxPtr)) != UBRING_OK)) {
+        LOG(ERROR) << "Acquire ubrtrx failed.";
+        ShmRemoteFree(remote_trx_shm);
+        ShmLocalFree(local_trx_shm);
+        return -1;
+    }
+    _trx->type = TCP_TRX;
+    if (UNLIKELY((UbrServerTrxInit(local_trx_shm, remote_trx_shm)) != UBRING_OK)) {
+        LOG(ERROR) << "Server trx init failed.";
+        ShmRemoteFree(remote_trx_shm);
+        UbrTrxFreeShm(_trx);
+        UBRingManager::ReleaseUbrTrxFromMgr(_trx);
+        return -1;
+    }
+    return 0;
+}
+
+// Acquires a trx slot from UBRingManager, then applies and maps the local
+// shared memory via ApplyAndMapLocalShm using shm_name.
+// Returns 0 on success and -1 on failure.
+int UBRing::UbrAllocateLocalShm(SHM *local_trx_shm, const char *shm_name)
+{
+    if (UNLIKELY((UBRingManager::AcquireUbrTrxFromMgr(&(_trx))) != UBRING_OK)) {
+        LOG(ERROR) << "Acquire ubrtrx failed, localName=" << shm_name;
+        return -1;
+    }
+
+    _trx->type = TCP_TRX;
+    if (UNLIKELY((ApplyAndMapLocalShm(local_trx_shm, shm_name)) != UBRING_OK)) {
+        LOG(ERROR) << "Trx apply or map local shared memory failed, localName=" << shm_name;
+        return -1;
+    }
+    return 0;
+}
+
+int UBRing::UbrMapRemoteShm(SHM *local_trx_shm, const char *local_name)
+{
+    RETURN_CODE rc = UbrMapRemoteShmAddTimer(local_trx_shm, local_name);
+    if (UNLIKELY(rc != UBRING_OK)) {
+        LOG(ERROR) << "Connect Trx failed, local shm name=" << local_trx_shm->name;
+        return -1;
+    }
+    PrewriteUbrRx(&_trx->ubrRx);
+    PrewriteUbrTx(&_trx->ubrTx);
+
((UbrEventQMsg *)_trx->ubrRx.remoteTxEventQ.addr)->flag = UBR_STATE_CONNECTED;
+    ((UbrEventQMsg *)_trx->ubrRx.localRxEventQ.addr)->flag = UBR_STATE_CONNECTED;
+    _trx->ubrTx.trxState = UBR_STATE_CONNECTED;
+    _trx->ubrRx.trxState = UBR_STATE_CONNECTED;
+    return 0;
+}
+
+// Builds the server shm name "<SHM_NAME_PREFIX>_<localName>_<SERVER_SHM_NAME_SUFFIX>",
+// applies and maps that remote shm, adds the trx timer, and finally checks that the
+// elapsed time since entry has not exceeded the timeout stored in the local data
+// status queue.
+// Returns UBRING_OK on success; otherwise the failing step's code, UBRING_ERR or
+// UBRING_ERR_TIMEOUT. Once the remote shm has been applied, it is freed again on
+// every later failure path.
+RETURN_CODE UBRing::UbrMapRemoteShmAddTimer(SHM *localTrxShm, const char *localName)
+{
+    uint64_t startTime = GetCurNanoSeconds();
+
+    size_t remoteServerLen = UBR_MSG_LEN * (((UbrDataStatusQMsg *)(_trx->ubrTx.localDataStatusQ.addr))->tail + 1) +
+        UBR_MSG_LEN * ((DATAQ_ADDR_OFFSET / UBR_MSG_LEN) + 1);
+    SHM remoteTrxShm = {NULL, remoteServerLen, 0, {0}, localTrxShm->fd};
+    int result = snprintf(remoteTrxShm.name,
+                          SHM_MAX_NAME_BUFF_LEN,
+                          "%s_%s_%s",
+                          SHM_NAME_PREFIX,
+                          localName,
+                          SERVER_SHM_NAME_SUFFIX);
+    if (UNLIKELY(result < 0)) {
+        // LOG is a stream: printf-style placeholders are not expanded, so the
+        // values must be streamed explicitly.
+        LOG(ERROR) << "Copy server shared memory name failed, localName=" << localName << ", ret=" << result;
+        return UBRING_ERR;
+    }
+    UbrSetSleepTask(UBR_TASK_CONNECT_MAP_FRONT);
+    RETURN_CODE rc = ApplyAndMapRemoteShm(&remoteTrxShm);
+    if (UNLIKELY(rc != UBRING_OK)) {
+        LOG(ERROR) << "Connect Trx map shared memory failed, remote shm=" << remoteTrxShm.name;
+        return rc;
+    }
+
+    if (UNLIKELY(UbrAddTimer() != UBRING_OK)) {
+        LOG(ERROR) << "Ubr add timer failed, localName=" << localName;
+        ShmRemoteFree(&remoteTrxShm);
+        return UBRING_ERR;
+    }
+
+    UbrSetSleepTask(UBR_TASK_CONNECT_MAP_AFTER);
+
+    uint32_t timeout = ((UbrDataStatusQMsg *)(_trx->ubrTx.localDataStatusQ.addr))->timeout;
+    if (HasTimedOut(startTime, timeout) != UBRING_OK) {
+        LOG(ERROR) << "Local shm " << localTrxShm->name << " wait for connect remote map timeout.";
+        // The timers were added by UbrAddTimer above; remove them before giving up.
+        DeleteTimerSafe((uint32_t)_trx->hbTimerFd);
+        DeleteTimerSafe((uint32_t)_trx->timerFd);
+        ShmRemoteFree(&remoteTrxShm);
+        return UBRING_ERR_TIMEOUT;
+    }
+
+    return UBRING_OK;
+}
+
+RETURN_CODE UBRing::ApplyAndMapLocalShm(SHM *localTrxShm, const char *localName)
+{
+    if (UNLIKELY(_trx == NULL || localTrxShm == NULL)) {
+        LOG(ERROR) << "Trx map Shared 
memory failed, trx is null, localName=" << localName; + return UBRING_ERR; + } + int result = snprintf(localTrxShm->name, + SHM_MAX_NAME_BUFF_LEN, + "%s_%s_%s", + SHM_NAME_PREFIX, + localName, + CLIENT_SHM_NAME_SUFFIX); + if (UNLIKELY(result < 0)) { + LOG(ERROR) << "Copy client localTrx shared memory name failed, localName=" << localName << ", ret=" << result; + return UBRING_ERR; + } + + RETURN_CODE rc = ShmLocalCalloc(localTrxShm); + if (UNLIKELY(rc != UBRING_OK)) { + LOG(ERROR) << "Trx apply local shared memory failed, local shm name=" << localTrxShm->name << ", rc=" << rc; + if (rc == SHM_ERR_EXIST || rc == SHM_ERR_NOT_FOUND) { + rc = UBR_ERR_ADDR_IN_USE; + } + UBRingManager::ReleaseUbrTrxFromMgr(_trx); + return rc; + } + rc = UbrTrxMapLocalShm(localTrxShm); + if (UNLIKELY(rc != UBRING_OK)) { + LOG(ERROR) << "Trx map local shared memory failed, local shm name=" << localTrxShm->name; + ShmLocalFree(localTrxShm); + UBRingManager::ReleaseUbrTrxFromMgr(_trx); + return rc; + } + ((UbrDataStatusQMsg *)_trx->ubrTx.localDataStatusQ.addr)->timeout = FLAGS_ub_connect_timeout; + _trx->ubrRx.capacity = (uint32_t)(_trx->ubrRx.localDataQ.len / UBR_MSG_LEN); + rc = UBRingManager::GetUbrDealMsgMaxCnt(_trx->ubrRx.capacity, &_trx->ubrRx.dealMsgMaxCnt); + if (rc != UBRING_OK) { + LOG(ERROR) << "Get ubring deal msg max cnt, local shm name=" << localTrxShm->name; + ShmLocalFree(localTrxShm); + UBRingManager::ReleaseUbrTrxFromMgr(_trx); + return rc; + } + return UBRING_OK; +} + +RETURN_CODE UBRing::ApplyAndMapRemoteShm(SHM *remoteTrxShm) +{ + RETURN_CODE rc = ShmRemoteMalloc(remoteTrxShm); + if (UNLIKELY(rc != UBRING_OK)) { + LOG(ERROR) << "Trx apply remote shared memory failed."; + return rc; + } + rc = UbrTrxMapRemoteShm(remoteTrxShm); + if (UNLIKELY(rc != UBRING_OK)) { + LOG(ERROR) << "Trx map shared memory failed."; + ShmRemoteFree(remoteTrxShm); + return rc; + } + _trx->ubrTx.capacity = (uint32_t)(_trx->ubrTx.remoteDataQ.len / UBR_MSG_LEN); + return UBRING_OK; +} + +RETURN_CODE 
UBRing::WritevHasEnoughSpace(size_t bufLen) +{ + UbrDataStatusQMsg *dataStatusMsg = (UbrDataStatusQMsg *)_trx->ubrTx.localDataStatusQ.addr; + uint32_t cap = _trx->ubrTx.capacity; + uint32_t tail = dataStatusMsg->tail; + uint32_t remainChunkNum = + (_trx->ubrTx.writePos > tail) ? (tail + cap - _trx->ubrTx.writePos) : (tail - _trx->ubrTx.writePos); + uint32_t needMsgChunkNum = CalcUbrMsgChunkCnt((uint32_t)bufLen); + if (remainChunkNum < needMsgChunkNum) { + return UBRING_RETRY; + } + return UBRING_OK; +} + +RETURN_CODE UBRing::UbrClearResourceCheck(UbrTrx *trx, uint64_t startTime, UbrCloseType closeType) +{ + if (UNLIKELY(trx == NULL)) { + LOG(ERROR) << "Trx close failed, trx is null."; + return UBRING_ERR; + } + + UbrEventQMsg* localTxEventQ = (UbrEventQMsg *)trx->ubrTx.localTxEventQ.addr; + if (localTxEventQ->flag == UBR_STATE_CONNECTED) { + localTxEventQ->flag = UBR_STATE_CLOSING; + } + + if (closeType == UBR_SEND_CLOSE) { + DeleteTimerSafe((uint32_t)trx->timerFd); + } else { + DeleteTimer((uint32_t)trx->timerFd); + } + DeleteTimerSafe((uint32_t)trx->hbTimerFd); + + if (localTxEventQ->flag == UBR_STATE_CLOSING) { + localTxEventQ->flag = UBR_STATE_CLOSED; + trx->ubrTx.trxState = UBR_STATE_CLOSED; + } + + return UBRING_OK; +} + +RETURN_CODE UBRing::ClearTrxResource(UbrTrx *trx, uint64_t startTime, UbrCloseType closeType, int op) +{ + RETURN_CODE rc = UbrClearResourceCheck(trx, startTime, closeType); + if (rc != UBRING_OK) { + return rc; + } + + rc = UbrAddAsynClearTimer(trx); + if (rc != UBRING_OK) { + LOG(ERROR) << "Trx close, add " << trx->localShm.name << " close clear timer failed."; + return UBRING_ERR; + } + + return UBRING_OK; +} + +RETURN_CODE UBRing::UbrTrxCloseCheck(UbrTrx *trx) +{ + if (UNLIKELY(trx == NULL)) { + LOG(ERROR) << "Trx close failed, client trx is null."; + return UBRING_ERR; + } + int expected = MAX_CLOSE_COUNT; + if (!ATOMIC_COMPARE_EXCHANGE_STRONG(trx->closeCnt, expected, MAX_CLOSE_COUNT - 1)) { + LOG(INFO) << "Trx close skipped, already 
closing, trx local name=" << trx->localShm.name; + return UBRING_REENTRY; + } + + if (UNLIKELY(trx->ubrTx.localTxEventQ.addr == nullptr)) { + LOG(ERROR) << "Trx close failed, localTxEventQ addr is NULL, trx local name=" << trx->localShm.name; + return UBRING_ERR; + } + return UBRING_OK; +} + +ssize_t UBRing::StartReadv(UbrTrx *trx, const struct iovec *iov, int iovcnt, size_t remainBufLen) +{ + ssize_t totalRecvLen = 0; + int iovIndex = 0; + size_t iovPos = 0; + UbrMsgFormat *dataMsg = (UbrMsgFormat *)trx->ubrRx.localDataQ.addr; + bool notEofEncountered = true; + while (notEofEncountered && remainBufLen > 0) { + if (UNLIKELY(CheckTrxRecvPreCheck(trx) != UBRING_OK)) { + return UBRING_ERR; + } + UbrMsgFormat *currentChunk = &dataMsg[trx->ubrRx.readPos]; + uint8_t flag = currentChunk->header[UBR_MSG_FLAG_INDEX]; + if (flag == UBR_MSG_CHUNK_NONE) { + if (totalRecvLen > 0) { + break; + } + errno = EAGAIN; + return -1; + } + if (flag == UBR_MSG_CHUNK_EOF) { + notEofEncountered = false; + } + uint8_t chunkMsgLen = currentChunk->header[UBR_MSG_LEN_INDEX]; + uint8_t curIndex = currentChunk->header[UBR_MSG_CUR_INDEX]; + uint8_t recvLen = + remainBufLen > (size_t)(chunkMsgLen - curIndex) ? (chunkMsgLen - curIndex) : (uint8_t)remainBufLen; + while (iovIndex < iovcnt && recvLen > 0) { + size_t copyLen = + recvLen > (iov[iovIndex].iov_len - iovPos) ? 
iov[iovIndex].iov_len - iovPos : (size_t)recvLen; + memcpy((uint8_t *)iov[iovIndex].iov_base + iovPos, currentChunk->payload.inner + curIndex, copyLen); + recvLen -= (uint8_t)copyLen; + iovPos += copyLen; + curIndex += (uint8_t)copyLen; + if (iovPos == iov[iovIndex].iov_len) { + iovIndex++; + iovPos = 0; + } + remainBufLen -= copyLen; + totalRecvLen += (ssize_t)copyLen; + } + currentChunk->header[UBR_MSG_CUR_INDEX] = curIndex; + if (currentChunk->header[UBR_MSG_CUR_INDEX] == chunkMsgLen) { + currentChunk->header[UBR_MSG_FLAG_INDEX] = UBR_MSG_CHUNK_NONE; + UpdateDataQTail(trx); + trx->ubrRx.readPos = (trx->ubrRx.readPos + 1) % trx->ubrRx.capacity; + } + } + return totalRecvLen; +} +} // namespace ubring +} // namespace brpc diff --git a/src/brpc/ubshm/ub_ring.h b/src/brpc/ubshm/ub_ring.h new file mode 100644 index 0000000000..09a97d1dcb --- /dev/null +++ b/src/brpc/ubshm/ub_ring.h @@ -0,0 +1,206 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef BRPC_UB_RING_H +#define BRPC_UB_RING_H + +#include +#include +#include "butil/macros.h" +#include "butil/reader_writer.h" +#include "brpc/ubshm/ubr_trx.h" +#include "brpc/ubshm/shm/shm_mgr.h" +#include "brpc/ubshm/timer/timer_mgr.h" + +namespace brpc { +namespace ubring { +DECLARE_int32(ub_flying_io_timeout); +extern uint32_t g_sleepTime[UBR_TASK_STEP_NUM]; + +class UBRing : public butil::IReader { +public: + UBRing(); + ~UBRing(); + DISALLOW_COPY_AND_ASSIGN(UBRing); + + ssize_t ReadV(const iovec* iov, int iovcnt) override { + return UbrTrxReadv(iov, iovcnt); + } + + RETURN_CODE UbrTrxMapShm(SHM *localShm, SHM *remoteShm); + + RETURN_CODE UbrTrxClose(); + + RETURN_CODE UbrAddCloseTimer(); + + RETURN_CODE UbrAddTimer(); + + static void *UbrTrxCloseCallback(void *args); + + RETURN_CODE UbrAddHBTimer(); + + static void *UbrTrxHBCallback(void *args); + + static RETURN_CODE UbrPassiveClearTrx(UbrTrx *trx, int fd, PASSIVE_DISC_TYPE type); + + static RETURN_CODE UbrAddAsynClearTimer(UbrTrx *trx); + + static void *UbrAsynClearCallback(void *args); + + int UbrTrxSend(const void *buf, uint32_t bufLen); + + int UbrTrxRecv(void *buf, uint32_t bufLen); + + int UbrTrxRecvBlockMode(uint8_t *dest, uint32_t bufLen); + + ssize_t UbrTrxWritev(const struct iovec *iov, int iovcnt); + ssize_t UbrTrxReadv(const struct iovec *iov, int iovcnt); + ssize_t UbrTrxReadvBlockMode(const struct iovec *iov, int iovcnt); + + RETURN_CODE IsUbrTrxReadable(uint32_t epEvent); + + RETURN_CODE IsUbrTrxWriteable(uint32_t epEvent); + + RETURN_CODE UbrSetTimeout(UbrTaskStep taskType, int timeout); + + static RETURN_CODE UbrTrxFreeShm(UbrTrx *trx); + + void PrewriteUbrTx(UbrTx *tx); + void PrewriteUbrRx(UbrRx *rx); + + static inline void UbrSetSleepTask(UbrTaskStep taskType) + { + if (taskType >= UBR_TASK_STEP_NUM || taskType < 0) { + return; + } + uint32_t type = (uint32_t)taskType; + sleep(g_sleepTime[type]); + return; + } + + static inline RETURN_CODE CheckTrxConnectParam(const char 
*listenerName, const char *localName)
+    {
+        if (UNLIKELY(listenerName == NULL)) {
+            LOG(ERROR) << "The request listener name is null.";
+            return UBRING_ERR;
+        }
+        if (UNLIKELY(localName == NULL)) {
+            LOG(ERROR) << "The request trx shared memory name is null.";
+            return UBRING_ERR;
+        }
+        return UBRING_OK;
+    }
+
+    int UbrAllocateServerShm(SHM* remote_trx_shm, SHM* local_trx_shm);
+
+    int UbrMapRemoteShm(SHM *local_trx_shm, const char *local_name);
+
+    int UbrAllocateLocalShm(SHM *local_trx_shm, const char *shm_name);
+
+    RETURN_CODE UbrMapRemoteShmAddTimer(SHM *localTrxShm, const char *localName);
+
+    // Send-side guard: UBRING_OK only while the tx state is UBR_STATE_CONNECTED.
+    // trx is dereferenced without a NULL check.
+    static inline RETURN_CODE CheckTrxSendPreCheck(UbrTrx *trx)
+    {
+        if (UNLIKELY(trx->ubrTx.trxState != UBR_STATE_CONNECTED)) {
+            LOG(ERROR) << "Trx send failed, trx is not connected state.";
+            return UBRING_ERR;
+        }
+
+        return UBRING_OK;
+    }
+    // Validates the arguments of a receive call.
+    // Returns UBRING_ERR when trx, its localRxEventQ address or buf is NULL, or when
+    // bufLen is 0; UBR_NOT_CONNECTED when the rx state is not UBR_STATE_CONNECTED;
+    // UBRING_OK otherwise. The connection-state check runs before the buf/bufLen
+    // checks, so a disconnected trx reports UBR_NOT_CONNECTED even with a bad buffer.
+    static RETURN_CODE CheckTrxRecvParam(UbrTrx *trx, const void *buf, uint32_t bufLen)
+    {
+        if (UNLIKELY(trx == NULL)) {
+            LOG(ERROR) << "Trx recv failed, trx is null.";
+            return UBRING_ERR;
+        }
+
+        if (UNLIKELY((UbrEventQMsg *)trx->ubrRx.localRxEventQ.addr == NULL)) {
+            LOG(ERROR) << "Trx recv failed, localRxEventQ addr is NULL.";
+            return UBRING_ERR;
+        }
+
+        if (UNLIKELY(trx->ubrRx.trxState != UBR_STATE_CONNECTED)) {
+            LOG(ERROR) << "Trx recv failed, trx is not connected, state=" << trx->ubrRx.trxState;
+            return UBR_NOT_CONNECTED;
+        }
+        if (UNLIKELY(buf == NULL)) {
+            LOG(ERROR) << "Trx recv failed, buf is null.";
+            return UBRING_ERR;
+        }
+        if (UNLIKELY(bufLen == 0)) {
+            LOG(ERROR) << "Trx recv failed, bufLen is 0.";
+            return UBRING_ERR;
+        }
+        return UBRING_OK;
+    }
+
+    // Receive-side guard: UBRING_OK only while the rx state is UBR_STATE_CONNECTED.
+    // trx is dereferenced without a NULL check.
+    static inline RETURN_CODE CheckTrxRecvPreCheck(UbrTrx *trx)
+    {
+        if (UNLIKELY(trx->ubrRx.trxState != UBR_STATE_CONNECTED)) {
+            LOG(ERROR) << "Trx recv failed, trx is not connected state.";
+            return UBRING_ERR;
+        }
+        return UBRING_OK;
+    }
+
+    static inline void UpdateDataQTail(UbrTrx *trx)
+    {
+        ((UbrDataStatusQMsg *)trx->ubrRx.remoteDataStatusQ.addr)->tail =
trx->ubrRx.readPos; + } + + static RETURN_CODE UbrTrxCallbackCheck(UbrTrx *trx) + { + if (trx == NULL) { + LOG(ERROR) << "Trx close callback failed, trx is null."; + return UBRING_ERR; + } + if (UNLIKELY(trx->localShm.addr == NULL)) { + LOG(ERROR) << "Trx close failed, localShm addr is NULL."; + return UBRING_ERR; + } + if (UNLIKELY(trx->ubrRx.localRxEventQ.addr == NULL)) { + LOG(ERROR) << "Trx close failed, localRxEventQ addr is NULL."; + return UBRING_ERR; + } + if (UNLIKELY(trx->ubrTx.localTxEventQ.addr == NULL)) { + LOG(ERROR) << "Trx close failed, localTxEventQ addr is NULL."; + return UBRING_ERR; + } + return UBRING_OK; + } + +private: + RETURN_CODE UbrTrxMapLocalShm(SHM *localShm); + RETURN_CODE UbrTrxMapRemoteShm(SHM *remoteShm); + RETURN_CODE ApplyAndMapLocalShm(SHM *localTrxShm, const char *localName); + RETURN_CODE ApplyAndMapRemoteShm(SHM *remoteTrxShm); + static RETURN_CODE UbrTrxCloseCheck(UbrTrx *trx); + void ReleaseFileLock(int lockFd); + ssize_t StartReadv(UbrTrx *trx, const struct iovec *iov, int iovcnt, size_t remainBufLen); + void PreWriteAddr(uint8_t *addr, size_t len); + RETURN_CODE WritevHasEnoughSpace(size_t bufLen); + RETURN_CODE UbrServerTrxInit(SHM *localShm, SHM *remoteShm); + static RETURN_CODE UbrClearResourceCheck(UbrTrx *trx, uint64_t startTime, UbrCloseType closeType); + static RETURN_CODE ClearTrxResource(UbrTrx *trx, uint64_t startTime, UbrCloseType closeType, int op=0); + + UbrTrx* _trx{nullptr}; +}; +} +} + +#endif //BRPC_UB_RING_H \ No newline at end of file diff --git a/src/brpc/ubshm/ub_ring_manager.cpp b/src/brpc/ubshm/ub_ring_manager.cpp new file mode 100644 index 0000000000..13df631f9e --- /dev/null +++ b/src/brpc/ubshm/ub_ring_manager.cpp @@ -0,0 +1,264 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include
+#include "brpc/ubshm/ub_ring.h"
+#include "brpc/ubshm/ub_ring_manager.h"
+#include "butil/logging.h"
+
+namespace brpc {
+namespace ubring {
+DEFINE_int32(ubr_max_managed_num, 1024, "maximum number of managed ubring");
+DEFINE_int32(tail_update_after_read, 8, "Position of the tail update after the read");
+
+UbrMgr UBRingManager::g_ubrMgr;
+UbrLinkInfoMgr UBRingManager::g_linkInfoMgr;
+pthread_mutex_t UBRingManager::g_ubrTrxMgrMtx = PTHREAD_MUTEX_INITIALIZER;
+pthread_mutex_t UBRingManager::g_ubrListenerMgrMtx = PTHREAD_MUTEX_INITIALIZER;
+pthread_mutex_t UBRingManager::g_linkInfoMgrMtx = PTHREAD_MUTEX_INITIALIZER;
+
+uint64_t g_ubrTrxNum = 0;
+uint64_t g_ubEventCnt = 0;
+uint64_t g_ubrListenerNum = 0;
+
+// Writes capacity / FLAGS_tail_update_after_read to *dealMsgMaxCnt.
+// Returns UBRING_ERR when dealMsgMaxCnt is NULL or the flag is not positive,
+// UBRING_OK otherwise.
+RETURN_CODE UBRingManager::GetUbrDealMsgMaxCnt(const uint32_t capacity, uint32_t *dealMsgMaxCnt) {
+    if (UNLIKELY(dealMsgMaxCnt == NULL)) {
+        LOG(ERROR) << "Get update factor failed, dealMsgMaxCnt is null.";
+        return UBRING_ERR;
+    }
+    // The flag is a signed int32. A negative value would be converted to a huge
+    // unsigned divisor in the division below, so reject every non-positive
+    // value rather than only 0.
+    if (UNLIKELY(FLAGS_tail_update_after_read <= 0)) {
+        LOG(ERROR) << "Get update factor failed, factor must be positive, factor="
+                   << FLAGS_tail_update_after_read;
+        return UBRING_ERR;
+    }
+    *dealMsgMaxCnt = capacity / (uint32_t)FLAGS_tail_update_after_read;
+    return UBRING_OK;
+}
+
+// Resets the manager bookkeeping: no trx in use, capacity taken from
+// FLAGS_ubr_max_managed_num, both arrays unset. Always returns UBRING_OK.
+RETURN_CODE UBRingManager::UbrMgrDefault()
+{
+    g_ubrMgr.trxNum = 0;
+    g_ubrMgr.trxCap = FLAGS_ubr_max_managed_num;
+    g_ubrMgr.trxMgrUnitStatus = NULL;
+    g_ubrMgr.trxMgr = NULL;
+    return UBRING_OK;
+}
+
+RETURN_CODE
UBRingManager::UbrMgrInit() { + RETURN_CODE rc = UbrMgrDefault(); + if (UNLIKELY(rc != UBRING_OK)) { + LOG(ERROR) << "Ubr manager set default values failed."; + return rc; + } + + size_t trxMgrSize = g_ubrMgr.trxCap * sizeof(UbrTrx); + g_ubrMgr.trxMgr = (UbrTrx *)malloc(trxMgrSize); + size_t trxMgrStatusSize = g_ubrMgr.trxCap * sizeof(UbrMgrUnitStatus); + g_ubrMgr.trxMgrUnitStatus = (UbrMgrUnitStatus *)malloc(trxMgrStatusSize); + if (UNLIKELY(g_ubrMgr.trxMgr == NULL || + g_ubrMgr.trxMgrUnitStatus == NULL)) { + LOG(ERROR) << "Ubr manager memory allocation failed."; + UbrMgrFini(); + return UBRING_ERR; + } + + memset(g_ubrMgr.trxMgr, 0, trxMgrSize); + memset(g_ubrMgr.trxMgrUnitStatus, UBR_MGR_UNIT_FREE, trxMgrStatusSize); + LinkInfoInit(); + return UBRING_OK; +} + +void UBRingManager::UbrMgrFini() { + { + LOCK_GUARD(g_ubrTrxMgrMtx); + FREE_PTR(g_ubrMgr.trxMgr); + FREE_PTR(g_ubrMgr.trxMgrUnitStatus); + } + { + LOCK_GUARD(g_ubrListenerMgrMtx); + } + g_ubrMgr.trxNum = 0; + g_ubrMgr.trxCap = 0; + LinkInfoFini(); +} + +RETURN_CODE UBRingManager::AcquireUbrTrxFromMgr(UbrTrx **trx) { + if (UNLIKELY(trx == NULL)) { + LOG(ERROR) << "Acquire trx failed, trx is null."; + return UBRING_ERR; + } + + if (UNLIKELY(g_ubrMgr.trxMgr == NULL)) { + LOG(ERROR) << "Acquire trx failed, trxMgr is null."; + return UBRING_ERR; + } + + LOCK_GUARD(g_ubrTrxMgrMtx); + if (g_ubrMgr.trxNum >= g_ubrMgr.trxCap) { + LOG(ERROR) << "Acquire trx failed, trx number is full."; + return UBRING_ERR; + } + + for (uint32_t i = 0; i < g_ubrMgr.trxCap; ++i) { + if (g_ubrMgr.trxMgrUnitStatus[i] == UBR_MGR_UNIT_FREE) { + memset(&g_ubrMgr.trxMgr[i], 0, sizeof(UbrTrx)); + g_ubrMgr.trxMgrUnitStatus[i] = UBR_MGR_UNIT_USED; + *trx = &g_ubrMgr.trxMgr[i]; + (*trx)->trxMgrIndex = i; + (*trx)->ubrId = g_ubrTrxNum; + (*trx)->closeState = UBR_CLOSE_FIRST; + (*trx)->closeCnt = MAX_CLOSE_COUNT; + ++g_ubrMgr.trxNum; + ++g_ubrTrxNum; + return UBRING_OK; + } + } + LOG(ERROR) << "Acquire trx failed, no available space."; + return 
UBRING_ERR;
+}
+
+// Marks the manager slot of trx as free and decrements the in-use count.
+// The trx's local queue addresses are cleared first, before any other check.
+// Returns UBRING_OK when the slot was released or was already free; UBRING_ERR
+// when trx or the manager array is NULL, or the in-use count is already 0.
+RETURN_CODE UBRingManager::ReleaseUbrTrxFromMgr(UbrTrx *trx) {
+    if (UNLIKELY(trx == NULL)) {
+        LOG(ERROR) << "Release trx failed, trx is null.";
+        return UBRING_ERR;
+    }
+
+    trx->localShm.addr = NULL;
+    trx->ubrTx.localTxEventQ.addr = NULL;
+    trx->ubrTx.localDataStatusQ.addr = NULL;
+    trx->ubrRx.localRxEventQ.addr = NULL;
+    trx->ubrRx.remoteDataStatusQ.addr = NULL;
+    if (UNLIKELY(g_ubrMgr.trxMgr == NULL)) {
+        LOG(ERROR) << "Release trx failed, trxMgr is null.";
+        return UBRING_ERR;
+    }
+
+    LOCK_GUARD(g_ubrTrxMgrMtx);
+    uint32_t idx = trx->trxMgrIndex;
+    if (g_ubrMgr.trxMgrUnitStatus[idx] == UBR_MGR_UNIT_FREE) {
+        LOG(INFO) << "Release trx already freed, name=" << trx->localShm.name;
+        return UBRING_OK;
+    }
+
+    if (g_ubrMgr.trxNum == 0) {
+        LOG(ERROR) << "Release trx failed, trx number is 0.";
+        return UBRING_ERR;
+    }
+
+    g_ubrMgr.trxMgrUnitStatus[idx] = UBR_MGR_UNIT_FREE;
+    --g_ubrMgr.trxNum;
+    return UBRING_OK;
+}
+
+// Allocates the link-info table and its per-slot status array, one entry per
+// FLAGS_ubr_max_managed_num, and marks every slot free. If either allocation
+// fails, whatever was allocated is released through LinkInfoFini.
+void UBRingManager::LinkInfoInit(void) {
+
+    size_t linkInfoMgrSize = FLAGS_ubr_max_managed_num * sizeof(UbrLinkInfo);
+    // The status array holds UbrMgrUnitStatus entries, so size it by that type
+    // instead of by sizeof(UbrLinkInfo).
+    size_t linkStatusSize = FLAGS_ubr_max_managed_num * sizeof(UbrMgrUnitStatus);
+    g_linkInfoMgr.allLinkInfo = (UbrLinkInfo*) malloc(linkInfoMgrSize);
+    if (g_linkInfoMgr.allLinkInfo == NULL) {
+        LOG(ERROR) << "allLinkInfo is NULL";
+        LinkInfoFini();
+        return;
+    }
+
+    g_linkInfoMgr.linkMgrUnitStatus = (UbrMgrUnitStatus*) malloc(linkStatusSize);
+    if (g_linkInfoMgr.linkMgrUnitStatus == NULL) {
+        LOG(ERROR) << "linkMgrUnitStatus is NULL";
+        LinkInfoFini();
+        return;
+    }
+
+    memset(g_linkInfoMgr.allLinkInfo, 0, linkInfoMgrSize);
+    // Same initialisation as the trx status array in UbrMgrInit.
+    memset(g_linkInfoMgr.linkMgrUnitStatus, UBR_MGR_UNIT_FREE, linkStatusSize);
+}
+
+// Releases the link-info table and status array and resets the link count.
+// Only returns early when BOTH pointers are NULL: after a partially failed
+// LinkInfoInit one of them may still be allocated and must be freed.
+// FREE_PTR is already applied to possibly-NULL pointers in UbrMgrFini.
+void UBRingManager::LinkInfoFini(void) {
+    if (g_linkInfoMgr.linkMgrUnitStatus == NULL && g_linkInfoMgr.allLinkInfo == NULL) {
+        LOG(ERROR) << "LinkInfo is NULL";
+        return;
+    }
+    {
+        LOCK_GUARD(g_linkInfoMgrMtx);
+        FREE_PTR(g_linkInfoMgr.allLinkInfo);
+        FREE_PTR(g_linkInfoMgr.linkMgrUnitStatus);
+    }
+
+    g_linkInfoMgr.linkNum = 0;
+}
+
+void UBRingManager::AcquireLinkInfoToMgr(const char *listenerName, UbrTrx *trx) {
+    if (listenerName == NULL || trx
== NULL) { + LOG(ERROR) << "LinkInfo acquire fail."; + return; + } + + if (g_linkInfoMgr.linkMgrUnitStatus == NULL || g_linkInfoMgr.allLinkInfo == NULL) { + LOG(ERROR) << "LinkInfo is NULL."; + return; + } + uint32_t ubrIndex = trx->trxMgrIndex; + char* connectName = trx->localShm.name; + if (g_linkInfoMgr.linkMgrUnitStatus[ubrIndex] == UBR_MGR_UNIT_FREE) { + strncpy(g_linkInfoMgr.allLinkInfo[ubrIndex].connectName, + connectName, SHM_MAX_NAME_BUFF_LEN); + strncpy(g_linkInfoMgr.allLinkInfo[ubrIndex].listenerName, + listenerName, SHM_MAX_NAME_BUFF_LEN); + g_linkInfoMgr.linkMgrUnitStatus[ubrIndex] = UBR_MGR_UNIT_USED; + g_linkInfoMgr.linkNum++; + } +} + +void UBRingManager::ReleaseLinkInfoFromMgr(UbrTrx *trx) { + if (trx == NULL || g_linkInfoMgr.linkMgrUnitStatus == NULL) { + LOG(ERROR) << "LinkInfo release fail."; + return; + } + + if (g_linkInfoMgr.linkMgrUnitStatus[trx->trxMgrIndex] == UBR_MGR_UNIT_FREE) { + LOG(ERROR) << "Release linkInfo failed, trx is not in manager."; + return; + } + g_linkInfoMgr.linkMgrUnitStatus[trx->trxMgrIndex] = UBR_MGR_UNIT_FREE; + g_linkInfoMgr.linkNum--; +} + +int32_t UBRingManager::UbEventCallback(const char *shmName) +{ + if (UNLIKELY(shmName == NULL)) { + LOG(ERROR) << "Ub event callback failed, shm name is null."; + return UBRING_ERR; + } + if (UNLIKELY(g_ubrMgr.trxMgr == NULL)) { + LOG(ERROR) << "Ub event callback failed, trx mgr is null."; + return UBRING_ERR; + } + LOG(INFO) << "Ub event callback is processing. 
shm_name=" << shmName; + + for (uint32_t i = 0; i < g_ubrMgr.trxCap; ++i) { + if (g_ubrMgr.trxMgrUnitStatus[i] == UBR_MGR_UNIT_FREE) { + continue; + } + + if (strcmp(g_ubrMgr.trxMgr[i].localShm.name, shmName) == 0 || // 故障链路为该trx的本端shm + strcmp(g_ubrMgr.trxMgr[i].remoteShm.name, shmName) == 0) { // 故障链路为该trx的对端shm + ++g_ubEventCnt; + int fd = (int)g_ubrMgr.trxMgr[i].localShm.fd; + LOG(WARNING) << "Ub event callback, the fd of the faulty link is " << fd; + return UBRing::UbrPassiveClearTrx(&g_ubrMgr.trxMgr[i], fd, UBR_UB_EVENT); + } + } + return UBRING_ERR; +} +} +} diff --git a/src/brpc/ubshm/ub_ring_manager.h b/src/brpc/ubshm/ub_ring_manager.h new file mode 100644 index 0000000000..c901791565 --- /dev/null +++ b/src/brpc/ubshm/ub_ring_manager.h @@ -0,0 +1,87 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+#ifndef BRPC_UB_RING_MANAGER_H
+#define BRPC_UB_RING_MANAGER_H
+
+#include "brpc/ubshm/ubr_trx.h"
+#include "brpc/ubshm/shm/shm_def.h"
+#include "brpc/ubshm/common/common.h"
+
+namespace brpc {
+namespace ubring {
+// Occupancy state of one manager slot. FREE is 0, so a zero-filled status
+// table starts with every slot free.
+typedef enum {
+    UBR_MGR_UNIT_FREE = 0,
+    UBR_MGR_UNIT_USED = 1
+} UbrMgrUnitStatus;
+
+// Table of transceivers plus a parallel per-slot status table.
+typedef struct TagUbrMgr {
+    uint32_t trxNum;                     // number of slots currently in use
+    uint32_t trxCap;                     // number of slots in trxMgr / trxMgrUnitStatus
+    UbrTrx *trxMgr;                      // slot array
+    UbrMgrUnitStatus *trxMgrUnitStatus;  // status of trxMgr[i]
+} UbrMgr;
+
+// Names of the two ends of one link.
+typedef struct TagUbrLinkInfo {
+    char connectName[SHM_MAX_NAME_BUFF_LEN];
+    char listenerName[SHM_MAX_NAME_BUFF_LEN];
+} UbrLinkInfo;
+
+// Table of link infos plus a parallel per-slot status table; slots are
+// addressed with the owning trx's trxMgrIndex.
+typedef struct TagUbrLinkInfoMgr {
+    uint32_t linkNum;                     // number of link slots currently in use
+    UbrLinkInfo* allLinkInfo;             // slot array
+    UbrMgrUnitStatus *linkMgrUnitStatus;  // status of allLinkInfo[i]
+} UbrLinkInfoMgr;
+
+// Static-only facade over the process-wide trx table and link-info table.
+// The constructor is private and every operation is a static member.
+class UBRingManager {
+public:
+    // Tears the trx manager down via UbrMgrFini().
+    ~UBRingManager(){
+        UbrMgrFini();
+    }
+
+    static RETURN_CODE GetUbrDealMsgMaxCnt(const uint32_t capacity, uint32_t *dealMsgMaxCnt);
+
+    static RETURN_CODE UbrMgrDefault();
+
+    static RETURN_CODE UbrMgrInit();
+
+    static void UbrMgrFini();
+
+    static RETURN_CODE AcquireUbrTrxFromMgr(UbrTrx **trx);
+
+    // Marks the slot at trx->trxMgrIndex free; returns UBRING_OK when the
+    // slot was released or was already free.
+    static RETURN_CODE ReleaseUbrTrxFromMgr(UbrTrx *trx);
+
+    // Allocate / free the link-info tables.
+    static void LinkInfoInit(void);
+    static void LinkInfoFini(void);
+    // Store / drop the (connect name, listener name) pair of |trx|.
+    static void AcquireLinkInfoToMgr(const char* listenerName, UbrTrx *trx);
+    static void ReleaseLinkInfoFromMgr(UbrTrx* trx);
+    // Signature matches shmem_faults_func; clears the in-use trx whose local
+    // or remote shm is named |shmName|.
+    static int32_t UbEventCallback(const char *shmName);
+
+private:
+    UBRingManager() {
+    }
+
+    static UbrMgr g_ubrMgr;
+    static UbrLinkInfoMgr g_linkInfoMgr;
+    static pthread_mutex_t g_ubrTrxMgrMtx;
+    static pthread_mutex_t g_ubrListenerMgrMtx;
+    static pthread_mutex_t g_linkInfoMgrMtx;
+};
+}
+}
+
+#endif //BRPC_UB_RING_MANAGER_H
\ No newline at end of file
diff --git a/src/brpc/ubshm/ubr_msg.h b/src/brpc/ubshm/ubr_msg.h
new file mode 100644
index 0000000000..8a19b6f6bc
--- /dev/null
+++ b/src/brpc/ubshm/ubr_msg.h
@@ -0,0 +1,53 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. 
See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef BRPC_UBR_MSG_H
+#define BRPC_UBR_MSG_H
+
+// uint8_t / uint32_t are used below; include them here so the header does
+// not depend on what the includer happened to pull in first.
+#include <stdint.h>
+
+// One message chunk is UBR_MSG_LEN bytes: payload first, header last.
+#define UBR_MSG_HEADER_LEN 4
+#define UBR_MSG_PAYLOAD_LEN 60
+#define UBR_MSG_LEN (UBR_MSG_HEADER_LEN + UBR_MSG_PAYLOAD_LEN)
+
+// Byte positions inside UbrMsgFormat::header.
+#define UBR_MSG_FLAG_INDEX 0
+#define UBR_MSG_LEN_INDEX 1
+#define UBR_MSG_CUR_INDEX 2
+
+namespace brpc {
+namespace ubring {
+typedef enum {
+    UBR_MSG_CHUNK_NONE = 0,
+    UBR_MSG_CHUNK_EXIST = 1,
+    UBR_MSG_CHUNK_EOF = 2
+} UbrMsgHdrFlag;
+
+typedef struct TagUbrMsgPayload {
+    uint8_t inner[UBR_MSG_PAYLOAD_LEN];
+} UbrMsgPayload;
+
+typedef struct __attribute__((aligned(64))) TagUbrMsgFormat {
+    UbrMsgPayload payload;
+
+    uint8_t header[UBR_MSG_HEADER_LEN];
+} UbrMsgFormat;
+
+/**
+ * Number of UBR_MSG_PAYLOAD_LEN-sized chunks needed to carry |bufLen| bytes
+ * (ceiling division; 0 bytes need 0 chunks).
+ *
+ * Written as quotient plus remainder test so that it cannot wrap around for
+ * bufLen close to UINT32_MAX, unlike (bufLen + LEN - 1) / LEN.
+ */
+static inline uint32_t CalcUbrMsgChunkCnt(uint32_t bufLen)
+{
+    uint32_t msgChunkNum = bufLen / UBR_MSG_PAYLOAD_LEN;
+    if (bufLen % UBR_MSG_PAYLOAD_LEN != 0) {
+        ++msgChunkNum;
+    }
+    return msgChunkNum;
+}
+}
+}
+#endif //BRPC_UBR_MSG_H
\ No newline at end of file
diff --git a/src/brpc/ubshm/ubr_trx.h b/src/brpc/ubshm/ubr_trx.h
new file mode 100644
index 0000000000..af9c52ade7
--- /dev/null
+++ b/src/brpc/ubshm/ubr_trx.h
@@ -0,0 +1,162 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. 
The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef BRPC_UBR_TRX_H
+#define BRPC_UBR_TRX_H
+#include
+#include
+#include
+#include "brpc/ubshm/shm/shm_def.h"
+#include "brpc/ubshm/common/common.h"
+#include "brpc/ubshm/common/thread_lock.h"
+#include "brpc/ubshm/ubr_msg.h"
+
+/* +----------------------------------------------------------------------------+
+   │                                 UbrTrx shm                                 │
+   +-------------+-------------+-------------+---------------+------------------+
+   │  TxEventQ   │  RxEventQ   │ DataStatusQ │ zero(44Bytes) | DataQ            │
+   +-------------+-------------+-------------+---------------+------------------+ */
+// NOTE(review): the sketch above shows a zero-filled gap between DataStatusQ
+// and DataQ, while DATAQ_ADDR_OFFSET below places DataQ directly after
+// DataStatusQ - confirm which one is intended.
+
+// Sizes of the fixed-size queues at the head of the shm.
+#define UBR_EVENTQ_LEN sizeof(UbrEventQMsg)
+#define UBR_DATASTATUSQ_LEN sizeof(UbrDataStatusQMsg)
+
+// Byte offsets of each queue from the start of the shm.
+#define TX_EVENTQ_ADDR_OFFSET 0
+#define RX_EVENTQ_ADDR_OFFSET UBR_EVENTQ_LEN
+#define DATASTATUSQ_ADDR_OFFSET ((UBR_EVENTQ_LEN) << 1)
+#define DATAQ_ADDR_OFFSET (DATASTATUSQ_ADDR_OFFSET + UBR_DATASTATUSQ_LEN)
+#define MB_TO_BYTE (1024 * 1024)
+#define MAX_CLOSE_COUNT 2
+
+#define SHM_NAME_PREFIX "UBRING"
+#define SERVER_SHM_NAME_SUFFIX "S"
+#define CLIENT_SHM_NAME_SUFFIX "C"
+
+namespace brpc {
+namespace ubring {
+// Hooks taking an int argument; judging by the names they are meant to run
+// around closing the TCP fd - TODO confirm at the definition.
+extern RETURN_CODE(*g_BeforeTcpClose)(int);
+extern RETURN_CODE(*g_AfterTcpClose)(int);
+
+// Connection state carried in event-queue messages and in UbrTx/UbrRx.
+typedef enum {
+    UBR_STATE_NONE,
+    UBR_STATE_CONNECTED,
+    UBR_STATE_CLOSING,
+    UBR_STATE_CLOSED
+} EventQState;
+
+typedef enum {
+    UBR_SEND_CLOSE,
+    UBR_CALL_BACK_CLOSE
+} UbrCloseType;
+
+typedef enum {
+    UBR_CLOSE_FIRST,
+    UBR_CLOSE_SECOND,
+    UBR_CLOSE_END
+} UbrCloseCount;
+
+typedef enum {
+    UDP_TRX,
+    TCP_TRX,
+    UBR_TRX
+} UbrTrxType;
+
+// UBR_TASK_STEP_NUM is the number of steps, not a step itself.
+typedef enum {
+    UBR_TASK_CONNECT_MAP_FRONT,
+    UBR_TASK_CONNECT_MAP_AFTER,
+    UBR_TASK_ACCEPT_MAP_FRONT,
+    UBR_TASK_ACCEPT_MAP_AFTER,
+    UBR_TASK_CLOSE,
+    UBR_TASK_STEP_NUM
+} UbrTaskStep;
+
+// Record stored in the DataStatusQ area of the shm.
+typedef struct TagUbrDataStatusQMsg {
+    uint32_t tail;
+    uint32_t timeout;
+    uint8_t heartBeat;
+} UbrDataStatusQMsg;
+
+// Record stored in the Tx/Rx EventQ areas of the shm.
+typedef struct TagUbrEventQMsg {
+    uint64_t ioId;
+    EventQState flag;
+} UbrEventQMsg;
+
+// Address + length of one mapped area.
+typedef struct TagUbrAddrInfo {
+    uint8_t *addr;
+    size_t len;
+} UbrAddrInfo;
+
+// Transmit half of a transceiver.
+// NOTE(review): trxState is volatile, which gives no inter-thread ordering
+// guarantees - confirm how it is synchronized.
+typedef struct TagUbrTx {
+    UbrAddrInfo remoteDataQ;
+    UbrAddrInfo remoteRxEventQ;
+    UbrAddrInfo localDataStatusQ;
+    UbrAddrInfo localTxEventQ;
+    uint64_t outIoId;
+    uint32_t writePos;
+    uint32_t capacity;
+    UbrMsgFormat localMsgSpace;
+    uint32_t hbRetryCnt;
+    uint32_t epLastCap;
+    volatile EventQState trxState;
+} UbrTx;
+
+// Receive half of a transceiver (same note on trxState as for UbrTx).
+typedef struct TagUbrRx {
+    UbrAddrInfo localDataQ;
+    UbrAddrInfo localRxEventQ;
+    UbrAddrInfo remoteDataStatusQ;
+    UbrAddrInfo remoteTxEventQ;
+    uint64_t inIoId;
+    uint32_t readPos;
+    uint32_t capacity;
+    uint32_t dealMsgNum;
+    uint32_t dealMsgMaxCnt;
+    uint32_t epEofPos;
+    volatile EventQState trxState;
+} UbrRx;
+
+// One transceiver: both halves, its local/remote shm and its timer fds.
+typedef struct TagUbrTrx {
+    UbrTx ubrTx;
+    UbrRx ubrRx;
+    uint64_t ubrId;
+    uint32_t trxMgrIndex;   // index of this trx in the manager's slot tables
+    UbrTrxType type;
+    SHM localShm;
+    SHM remoteShm;
+    int timerFd;
+    int hbTimerFd;
+    int clearTimerFd;
+    AtomicInt closeCnt;
+    AtomicInt closeState;
+} UbrTrx;
+
+typedef struct TagFileLock {
+    int lockFd;
+    char* lockPath;
+} FileLock;
+
+typedef struct TagUbrLinkLock {
+    int fileLockNum;
+    FileLock* fileLock;
+} UbrLinkLock;
+
+// Reason passed to the passive-clear path (UBR_UB_EVENT is what the UB
+// fault callback passes).
+typedef enum {
+    UBR_UB_EVENT,
+    UBR_HEARTBEAT,
+}PASSIVE_DISC_TYPE;
+
+}
+}
+#endif //BRPC_UBR_TRX_H
\ No newline at end of file
diff --git a/src/brpc/ubshm/ubs_mem/declare_shm_ubs.h b/src/brpc/ubshm/ubs_mem/declare_shm_ubs.h
new file mode 100644
index 0000000000..b09b2bf943
--- /dev/null
+++ 
b/src/brpc/ubshm/ubs_mem/declare_shm_ubs.h
@@ -0,0 +1,57 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// List of ubsmem entry points, one UBRING_MK_UBSM(return type, name,
+// (parameter list)) invocation per function. The includer must define
+// UBRING_MK_UBSM before including this file; there is deliberately no
+// include guard, and both macros are #undef'ed at the end so the file can be
+// included again with a different definition.
+#ifndef UBRING_MK_UBSM
+#error Do not include this file unless you know what you are doing.
+#endif
+
+// UBRING_MK_UBSM_OPTIONAL falls back to UBRING_MK_UBSM when the includer
+// does not provide it.
+#ifndef UBRING_MK_UBSM_OPTIONAL
+#define UBRING_MK_UBSM_OPTIONAL UBRING_MK_UBSM
+#endif
+
+UBRING_MK_UBSM(int, ubsmem_init_attributes, (ubsmem_options_t *ubsm_shmem_opts));
+
+UBRING_MK_UBSM(int, ubsmem_initialize, (const ubsmem_options_t *ubsm_shmem_opts));
+
+UBRING_MK_UBSM(int, ubsmem_finalize, (void));
+
+UBRING_MK_UBSM(int, ubsmem_set_logger_level, (int level));
+
+UBRING_MK_UBSM(int, ubsmem_set_extern_logger, (void (*func)(int level, const char *msg)));
+
+UBRING_MK_UBSM(int, ubsmem_lookup_regions, (ubsmem_regions_t* regions));
+
+UBRING_MK_UBSM(int, ubsmem_create_region, (const char *region_name, size_t size, const ubsmem_region_attributes_t *reg_attr));
+
+UBRING_MK_UBSM(int, ubsmem_destroy_region, (const char *region_name));
+
+UBRING_MK_UBSM(int, ubsmem_shmem_allocate,(const char *region_name, const char *name, size_t size, mode_t mode,
+    uint64_t flags));
+
+UBRING_MK_UBSM(int, ubsmem_shmem_deallocate, (const char *name));
+
+UBRING_MK_UBSM(int, ubsmem_shmem_map, (void *addr, size_t length, int prot, int flags, const char *name, off_t offset,
+    void **local_ptr));
+
+UBRING_MK_UBSM(int, ubsmem_shmem_unmap, (void *local_ptr, size_t length));
+
+UBRING_MK_UBSM(int, ubsmem_shmem_faults_register, (shmem_faults_func registerFunc));
+
+UBRING_MK_UBSM(int, ubsmem_local_nid_query, (uint32_t *nid));
+
+#undef UBRING_MK_UBSM_OPTIONAL
+#undef UBRING_MK_UBSM
\ No newline at end of file
diff --git a/src/brpc/ubshm/ubs_mem/ubs_mem.h b/src/brpc/ubshm/ubs_mem/ubs_mem.h
new file mode 100644
index 0000000000..66069c6e9c
--- /dev/null
+++ b/src/brpc/ubshm/ubs_mem/ubs_mem.h
@@ -0,0 +1,210 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef BRPC_UBS_MEM_H
+#define BRPC_UBS_MEM_H
+#include "ubs_mem_def.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @brief Initialize the UBSMSHMEM attributes
+ *
+ * @param ubsm_shmem_opts - [out] shmem attributes
+ * @return - 0 on success and other on failure
+ */
+SHMEM_API int ubsmem_init_attributes(ubsmem_options_t *ubsm_shmem_opts);
+
+/**
+ * Initialize the UBSMSHMEM library.
+ * Required to be the first called when a process uses the UBSMSHMEM library. 
+ * @param ubsm_shmem_opts - options structure containing initialization choices
+ * @return - 0 on success and other on failure
+ */
+SHMEM_API int ubsmem_initialize(const ubsmem_options_t *ubsm_shmem_opts);
+
+/**
+ * Finalize the UBSMSHMEM library.
+ * Once finalized, the process can continue to work, but it is disconnected from the UBSMSHMEM library functions.
+ * @return - 0 on success and other on failure
+ */
+SHMEM_API int ubsmem_finalize(void);
+
+/**
+ * @brief Set log level
+ * @param level - level to be set, debug(0), info(1), warning(2), error(3), closed(4)
+ * @return - 0 on success and other on failure
+ */
+SHMEM_API int ubsmem_set_logger_level(int level);
+
+/**
+ * @brief Set external log function, user can set customized logger function,
+ * in the customized logger function, user can use unified logger utility,
+ * then the log message can be written into the same log file as caller's,
+ * if it is not set, log message will be printed to stdout.
+ * @param func - [in] external logger function
+ * @return 0 on success and other on failure
+ */
+SHMEM_API int ubsmem_set_extern_logger(void (*func)(int level, const char *msg));
+
+/**
+ * Look up regions in UBSMSHMEM associated with the local node.
+ * @param regions - [out] The descriptor to the regions.
+ * @return - 0 on success and other on failure
+ */
+SHMEM_API int ubsmem_lookup_regions(ubsmem_regions_t* regions);
+
+/**
+ * Create a large region of UBSMSHMEM.
+ * Regions are primarily used as large containers within which additional memory may be allocated and managed by
+ * the program.
+ * @param region_name - name of the region
+ * @param size - size (in bytes) requested for the region, 930 no use, default 0.
+ * Note that implementations may round up the size to implementation-dependent sizes,
+ * and may impose system-wide (or user-dependent) limits on individual and total size allocated to a given user.
+ * @param reg_attr - details of UBSMSHMEM region attributes
+ * @note This function has no region_desc output parameter; ubsmem_lookup_region() is the call that
+ * fills a ubsmem_region_desc_t.
+ * @return - 0 on success and other on failure
+ */
+SHMEM_API int ubsmem_create_region(const char *region_name, size_t size, const ubsmem_region_attributes_t *reg_attr);
+
+/**
+ * Look up a region in UBSMSHMEM by name in the name service.
+ * @param region_name - name of the region.
+ * @param region_desc - [out] The descriptor to the region.
+ * @return - 0 on success and other on failure
+ */
+SHMEM_API int ubsmem_lookup_region(const char *region_name, ubsmem_region_desc_t *region_desc);
+
+/**
+ * Destroy a region, and all contents within the region. Note that this
+ * method call will trigger a delayed free operation to permit other
+ * instances currently using the region to finish.
+ * @param region_name - name of the region.
+ * @return - 0 on success and other on failure
+ */
+SHMEM_API int ubsmem_destroy_region(const char *region_name);
+
+/**
+ * Allocate some named space within a region. Allocates an area of UBSMSHMEM within a region
+ * @param region_name - name of the region.
+ * @param name - name of the share memory object
+ * @param size - size of the space to allocate in bytes.
+ * @param mode - mode associated with this space.
+ * @param flags - Special marking for this object, UBSM_FLAG_WITH_LOCK etc. (see the UBSM_FLAG_* macros).
+ * @return - 0 on success and other on failure
+ */
+SHMEM_API int ubsmem_shmem_allocate(const char *region_name, const char *name, size_t size, mode_t mode,
+    uint64_t flags);
+
+/**
+ * Deallocate allocated space in memory
+ * @param name - name of the share memory object
+ * @return - 0 on success and other on failure
+ */
+SHMEM_API int ubsmem_shmem_deallocate(const char *name);
+
+/**
+ * Map item in UBSMSHMEM to the local virtual address space, and return its pointer.
+ * @param addr - The starting address for the new mapping is specified in addr, If addr is NULL, then
+ * the kernel chooses the (page-aligned) address at which to create the mapping
+ * @param length - The length argument specifies the length of the mapping (which must be greater than 0)
+ * @param prot - same as mmap, describes the desired memory protection of the mapping (and must not conflict with
+ * the open mode of the file).
+ * @param flags - same as mmap
+ * @param name - name of the share memory object which to be mapped, same as mmap's fd
+ * @param offset - same as mmap, offset must be a multiple of the page size
+ * @param local_ptr - [out] within the process virtual address space that can be used to directly access the
+ * data item in UBSMSHMEM
+ * @return - 0 on success and other on failure
+ */
+SHMEM_API int ubsmem_shmem_map(void *addr, size_t length, int prot, int flags, const char *name, off_t offset,
+    void **local_ptr);
+
+/**
+ * Unmap a data item in UBSMSHMEM from the local virtual address space.
+ * @param local_ptr - pointer within the process virtual address space to be unmapped
+ * @param length - the size to be unmapped
+ * @return - 0 on success and other on failure
+ */
+SHMEM_API int ubsmem_shmem_unmap(void *local_ptr, size_t length);
+
+/**
+ * Change permissions associated with a data item descriptor.
+ * @param name - descriptor associated with some data item
+ * @param start - start address of the range to change (presumably inside the mapping of `name` - TODO confirm)
+ * @param length - length of the range, in bytes
+ * @param prot - new permissions for the data item
+ * @return - 0 on success and other on failure, other return values are described in ubsmshmem_ret_t.
+ */
+SHMEM_API int ubsmem_shmem_set_ownership(const char *name, void *start, size_t length, int prot);
+
+/**
+ * shmem lock - Set the lock, status, and data consistency of the shmem item
+ * @param name - descriptor associated with share memory object
+ * @return - 0 on success and other on failure
+ */
+SHMEM_API int ubsmem_shmem_write_lock(const char *name);
+SHMEM_API int ubsmem_shmem_read_lock(const char *name);
+SHMEM_API int ubsmem_shmem_unlock(const char *name);
+
+SHMEM_API int ubsmem_shmem_list_lookup(const char *prefix, ubsmem_shmem_desc_t *shm_list, uint32_t *shm_cnt);
+SHMEM_API int ubsmem_shmem_lookup(const char *name, ubsmem_shmem_info_t *shm_info);
+SHMEM_API int ubsmem_shmem_attach(const char *name);
+SHMEM_API int ubsmem_shmem_detach(const char *name);
+
+/**
+ * Alloc an area from the resource pool and use it only within the scope of the current process.
+ * @param region_name - name of the region.
+ * @param size - size of the space to allocate in bytes.
+ * Note that implementations may round up the size to implementation-dependent sizes.
+ * @param mem_distance - Describe the performance distance between memory resources and local nodes.
+ * The possible values are described in ubsmem_distance_t.
+ * @param is_numa - is numa or fd malloc, true: numa, false: fd
+ * @param local_ptr - [out] pointer within the process virtual address space that can be used to directly access.
+ * @return - 0 on success and other on failure
+ */
+SHMEM_API int ubsmem_lease_malloc(const char *region_name, size_t size, ubsmem_distance_t mem_distance, bool is_numa,
+    void **local_ptr);
+
+/**
+ * Release the pointer.
+ * @param local_ptr - The pointer returned by the malloc function.
+ * @return - 0 on success and other on failure
+ */
+SHMEM_API int ubsmem_lease_free(void *local_ptr);
+
+SHMEM_API int ubsmem_lookup_cluster_statistic(ubsmem_cluster_info_t *info);
+
+/**
+ * Subscribes to shared memory UB Event.
+ * @param registerFunc - Shared Memory UB Event Response Handling Function.
+ * @return - 0 on success and other on failure
+ */
+SHMEM_API int ubsmem_shmem_faults_register(shmem_faults_func registerFunc);
+
+/**
+ * Query the supernode ID of this node within the supernode domain.
+ * @param nid - The supernode ID of this node within the supernode domain.
+ * @return - 0 on success and other on failure
+ */
+SHMEM_API int ubsmem_local_nid_query(uint32_t *nid);
+
+#ifdef __cplusplus
+} // end of extern "C"
+#endif
+#endif //BRPC_UBS_MEM_H
\ No newline at end of file
diff --git a/src/brpc/ubshm/ubs_mem/ubs_mem_def.h b/src/brpc/ubshm/ubs_mem/ubs_mem_def.h
new file mode 100644
index 0000000000..29646611f3
--- /dev/null
+++ b/src/brpc/ubshm/ubs_mem/ubs_mem_def.h
@@ -0,0 +1,163 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef BRPC_UBS_MEM_DEF_H
+#define BRPC_UBS_MEM_DEF_H
+#include
+#include
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef SHMEM_API
+#define SHMEM_API __attribute__((visibility("default")))
+#endif
+
+// Changed to 48 for now, to align with the old version.
+// NOTE(review): none of the three constants right below is 48; the remark
+// presumably belongs to MAX_SHM_NAME_LENGTH / MAX_REGION_NAME_DESC_LENGTH -
+// confirm and move it.
+#define MAX_HOST_NUM 16
+#define MAX_NUMA_NUM 32
+#define MAX_NUMA_RESV_LEN 16
+
+#define MAX_HOST_NAME_DESC_LENGTH 64
+#define MAX_SHM_NAME_LENGTH 48
+#define MAX_REGION_NAME_DESC_LENGTH 48
+#define MAX_REGION_NODE_NUM 16
+#define MAX_REGIONS_NUM 6
+#define MAX_OBMM_SHMDEV_PATH_LEN 64
+
+#define MAX_MEMID_NUM 2048
+#define MAX_SHM_CNT 300
+
+// Bit flags for the `flags` argument of ubsmem_shmem_allocate().
+#define UBSM_FLAG_CACHE 0x0UL
+#define UBSM_FLAG_WITH_LOCK 0x1UL
+#define UBSM_FLAG_NONCACHE 0x2UL // open O_SYNC
+#define UBSM_FLAG_WR_DELAY_COMP 0x4UL // obmm import with wr_delay_comp
+#define UBSM_FLAG_ONLY_IMPORT_NONCACHE 0x8UL // only import open O_SYNC
+#define UBSM_FLAG_MEM_ANONYMOUS 0x10UL // auto cleanup when all references in domain drop to zero
+
+// Return codes of the ubsmem_* API; UBSM_OK (0) is success.
+typedef enum {
+    UBSM_OK = 0,
+    // common error
+    UBSM_ERR_PARAM_INVALID = 6010,
+    UBSM_ERR_NOPERM = 6011, // no permission
+    UBSM_ERR_MEMORY = 6012, // memcpy or other mem func failed
+    UBSM_ERR_UNIMPL = 6013, // not implement
+    UBSM_CHECK_RESOURCE_ERROR = 6014, // resource check failed.
+    UBSM_ERR_MEMLIB = 6015, // mem lib failed
+    UBSM_ERR_NO_NEEDED = 6016, // default region no need to create
+
+    // resource error
+    UBSM_ERR_NOT_FOUND = 6020,
+    UBSM_ERR_ALREADY_EXIST = 6021,
+    UBSM_ERR_MALLOC_FAIL = 6022,
+    UBSM_ERR_RECORD = 6023,
+    UBSM_ERR_IN_USING = 6024, // shm is in use (usrNum > 0)
+
+    // net error
+    UBSM_ERR_NET = 6040,
+
+    // under api
+    UBSM_ERR_UBSE = 6050,
+    UBSM_ERR_OBMM = 6051,
+
+    // cc lock error
+    UBSM_ERR_LOCK_NOT_SUPPORTED = 6060,
+    UBSM_ERR_LOCK_ALREADY_LOCKED = 6061,
+    UBSM_ERR_DLOCK = 6062,
+
+    UBSM_ERR_BUFF = 6099,
+} ubsmshmem_ret_t;
+/**
+ * Memory distance, describes the physical memory resource distance relative to the current PE.
+ */
+typedef enum {
+    /** direct connect node is provided, same as PerfLevel::L0 */
+    DISTANCE_DIRECT_NODE = 0,
+    /** one hop connect node is provided, same as PerfLevel::L1, not support 930 */
+    DISTANCE_HOP_NODE = 1,
+} ubsmem_distance_t;
+
+// Initialization options; currently has no members.
+// NOTE(review): an empty struct has size 0 as a GNU C extension but size 1
+// in C++ - confirm this type never crosses a C/C++ boundary by value.
+typedef struct {
+    // todo
+} ubsmem_options_t;
+
+typedef struct {
+    char host_name[MAX_HOST_NAME_DESC_LENGTH]; // include '\0'
+    bool affinity;
+} ubsmem_region_node_desc_t;
+
+typedef struct {
+    int host_num;
+    ubsmem_region_node_desc_t hosts[MAX_REGION_NODE_NUM];
+} ubsmem_region_attributes_t;
+
+typedef struct {
+    int num;
+    ubsmem_region_attributes_t region[MAX_REGIONS_NUM];
+} ubsmem_regions_t;
+
+typedef struct {
+    char region_name[MAX_REGION_NAME_DESC_LENGTH];
+    size_t size;
+    ubsmem_region_attributes_t region_attr;
+} ubsmem_region_desc_t;
+
+typedef struct {
+    uint32_t slot_id; // unique node identifier; uses the slot id, consistent with lcne
+    uint32_t socket_id; // socket id
+    uint32_t numa_id; // NUMA id within the node
+    uint32_t mem_lend_ratio; // upper limit of the pooled-memory lending ratio
+    uint64_t mem_total; // total memory, in bytes
+    uint64_t mem_free; // free memory, in bytes
+    uint64_t mem_borrow; // borrowed memory, in bytes
+    uint64_t mem_lend; // lent memory, in bytes
+    uint8_t resv[MAX_NUMA_RESV_LEN];
+} ubsmem_numa_mem_t;
+
+typedef struct {
+    char host_name[MAX_HOST_NAME_DESC_LENGTH];
+    int numa_num;
+    ubsmem_numa_mem_t numa[MAX_NUMA_NUM];
+} ubsmem_host_info_t;
+
+typedef struct {
+    int host_num; // number of available nodes in the cluster
+    ubsmem_host_info_t host[MAX_HOST_NUM];
+} ubsmem_cluster_info_t;
+
+typedef struct {
+    char name[MAX_SHM_NAME_LENGTH + 1];
+    size_t size;
+} ubsmem_shmem_desc_t;
+
+typedef struct {
+    char name[MAX_SHM_NAME_LENGTH + 1];
+    size_t size;
+    uint32_t mem_num;
+    uint64_t mem_unit_size;
+    uint64_t mem_id_list[MAX_MEMID_NUM];
+} ubsmem_shmem_info_t;
+
+// Callback type registered through ubsmem_shmem_faults_register(); receives
+// the name of the shared-memory object the event refers to.
+typedef int32_t (*shmem_faults_func)(const char *shm_name);
+
+#ifdef __cplusplus
+}
+#endif
+#endif //BRPC_UBS_MEM_DEF_H
\ No newline at end of file
diff --git a/src/brpc/ubshm/ubs_mem/ubshmem_stub.cpp b/src/brpc/ubshm/ubs_mem/ubshmem_stub.cpp
new file mode 100644
index 
0000000000..f0eaf29f8e
--- /dev/null
+++ b/src/brpc/ubshm/ubs_mem/ubshmem_stub.cpp
@@ -0,0 +1,108 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "ubs_mem.h"
+
+// Stub implementations of the ubsmem API: every call reports UBSM_OK
+// without doing any work, except that the two functions with an output
+// parameter fill it with a fixed value (and reject a NULL output pointer).
+
+int ubsmem_init_attributes(ubsmem_options_t *ubsm_shmem_opts)
+{
+    return UBSM_OK;
+}
+
+int ubsmem_initialize(const ubsmem_options_t *ubsm_shmem_opts)
+{
+    return UBSM_OK;
+}
+
+int ubsmem_finalize(void)
+{
+    return UBSM_OK;
+}
+
+int ubsmem_set_logger_level(int level)
+{
+    return UBSM_OK;
+}
+
+int ubsmem_set_extern_logger(void (*func)(int level, const char *msg))
+{
+    return UBSM_OK;
+}
+
+// Reports exactly one region that contains one host named "h1" with
+// affinity set. Returns UBSM_ERR_PARAM_INVALID when |regions| is NULL.
+int ubsmem_lookup_regions(ubsmem_regions_t* regions)
+{
+    if (regions == NULL) {
+        return UBSM_ERR_PARAM_INVALID;
+    }
+    regions->num = 1;
+    regions->region[0].host_num = 1;
+    regions->region[0].hosts[0].affinity = true;
+    regions->region[0].hosts[0].host_name[0] = 'h';
+    regions->region[0].hosts[0].host_name[1] = '1';
+    regions->region[0].hosts[0].host_name[2] = '\0'; // index 2 holds the terminating '\0'
+    return UBSM_OK;
+}
+
+int ubsmem_create_region(const char *region_name, size_t size, const ubsmem_region_attributes_t *reg_attr)
+{
+    return UBSM_OK;
+}
+
+
+int ubsmem_destroy_region(const char *region_name)
+{
+    return UBSM_OK;
+}
+
+int ubsmem_shmem_allocate(const char *region_name, const char *name, size_t size, mode_t mode, uint64_t flags)
+{
+    return UBSM_OK;
+}
+
+int ubsmem_shmem_deallocate(const char *name)
+{
+    return UBSM_OK;
+}
+
+// NOTE(review): reports success without writing *local_ptr, so a caller
+// that trusts the return code reads whatever its variable held before the
+// call - confirm that no caller dereferences the result in stub builds.
+int ubsmem_shmem_map(void *addr, size_t length, int prot, int flags, const char *name, off_t offset,
+    void **local_ptr)
+{
+    return UBSM_OK;
+}
+
+int ubsmem_shmem_unmap(void *local_ptr, size_t length)
+{
+    return UBSM_OK;
+}
+
+int ubsmem_shmem_faults_register(shmem_faults_func registerFunc)
+{
+    return UBSM_OK;
+}
+
+// Always reports node id 1. Returns UBSM_ERR_PARAM_INVALID when |nid| is
+// NULL.
+int ubsmem_local_nid_query(uint32_t *nid)
+{
+    if (nid == NULL) {
+        return UBSM_ERR_PARAM_INVALID;
+    }
+    *nid = 1; // stub
+    return UBSM_OK;
+}
\ No newline at end of file
diff --git a/src/brpc/ubshm_transport.cpp b/src/brpc/ubshm_transport.cpp
new file mode 100644
index 0000000000..fec1a4b646
--- /dev/null
+++ b/src/brpc/ubshm_transport.cpp
@@ -0,0 +1,241 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#if BRPC_WITH_UBRING
+
+#include "brpc/ubshm_transport.h"
+#include "brpc/tcp_transport.h"
+#include "brpc/ubshm/ub_endpoint.h"
+#include "brpc/ubshm/ub_helper.h"
+
+namespace brpc {
+DECLARE_bool(usercode_in_coroutine);
+DECLARE_bool(usercode_in_pthread);
+
+extern SocketVarsCollector *g_vars;
+
+// Bind this transport to |socket|. In SOCKET_MODE_UBRING a UBShmEndpoint is
+// created and the UB state starts as UB_UNKNOWN; otherwise UB is switched
+// off and the socket is forced to TCP mode. A TCP transport is always
+// created as well, since the non-UB paths below delegate to it.
+// If the endpoint cannot be allocated the socket is marked failed, but
+// initialization of the remaining members still continues.
+void UBShmTransport::Init(Socket *socket, const SocketOptions &options) {
+    CHECK(_ub_ep == NULL);
+    if (options.socket_mode == SOCKET_MODE_UBRING) {
+        _ub_ep = new(std::nothrow)ubring::UBShmEndpoint(socket);
+        if (!_ub_ep) {
+            const int saved_errno = errno;
+            PLOG(ERROR) << "Fail to create UBShmEndpoint";
+            socket->SetFailed(
+                saved_errno, "Fail to create UBShmEndpoint: %s", berror(saved_errno));
+        }
+        _ub_state = UB_UNKNOWN;
+    } else {
+        _ub_state = UB_OFF;
+        socket->_socket_mode = SOCKET_MODE_TCP;
+    }
+    _socket = socket;
+    _default_connect = options.app_connect;
+    _on_edge_trigger = options.on_edge_triggered_events;
+    if (options.need_on_edge_trigger && _on_edge_trigger == NULL) {
+        _on_edge_trigger = ubring::UBShmEndpoint::OnNewDataFromTcp;
+    }
+    _tcp_transport = std::make_shared();
+    _tcp_transport->Init(socket, options);
+}
+
+// Destroy the endpoint (if any) and forget the UB state.
+void UBShmTransport::Release() {
+    if (_ub_ep) {
+        delete _ub_ep;
+        _ub_ep = NULL;
+        _ub_state = UB_UNKNOWN;
+    }
+}
+
+// Reset the endpoint (if any) for reuse; always returns 0.
+int UBShmTransport::Reset(int32_t expected_nref) {
+    if (_ub_ep) {
+        _ub_ep->Reset();
+        _ub_state = UB_UNKNOWN;
+    }
+    return 0;
+}
+
+// Return the app-level connect object configured in Init(), or a freshly
+// created default one when none was configured.
+std::shared_ptr UBShmTransport::Connect() {
+    if (_default_connect == nullptr) {
+        return std::make_shared();
+    }
+    return _default_connect;
+}
+
+// Write |buf| through the UB endpoint unless UB is off (or there is no
+// endpoint), in which case the TCP transport is used.
+int UBShmTransport::CutFromIOBuf(butil::IOBuf *buf) {
+    if (_ub_ep && _ub_state != UB_OFF) {
+        butil::IOBuf *data_arr[1] = {buf};
+        return _ub_ep->CutFromIOBufList(data_arr, 1);
+    } else {
+        return _tcp_transport->CutFromIOBuf(buf);
+    }
+}
+
+// Batched variant of CutFromIOBuf() with the same UB/TCP selection.
+ssize_t UBShmTransport::CutFromIOBufList(butil::IOBuf **buf, size_t ndata) {
+    if (_ub_ep && _ub_state != UB_OFF) {
+        return _ub_ep->CutFromIOBufList(buf, ndata);
+    }
+    return _tcp_transport->CutFromIOBufList(buf, ndata);
+}
+
+// Wait until the transport becomes writable again or |duetime| passes.
+// With UB on, the endpoint is registered with the poller for the duration of
+// the butex wait and unregistered on every way out, including the early
+// return taken when the socket has failed. Otherwise the wait is delegated
+// to the TCP transport.
+// Returns 1 when the socket is found failed after an unsuccessful wait,
+// 0 otherwise on the UB path.
+int UBShmTransport::WaitEpollOut(butil::atomic *_epollout_butex,
+                                 bool pollin, const timespec duetime) {
+    if (_ub_state == UB_ON) {
+        const int expected_val = _epollout_butex->load(butil::memory_order_acquire);
+        CHECK(_ub_ep != NULL);
+        if (!_ub_ep->IsWritable()) {
+            g_vars->nwaitepollout << 1;
+            _ub_ep->PollerRegisterEpollOut(pollin);
+            const int rc = bthread::butex_wait(_epollout_butex, expected_val, &duetime);
+            if (rc < 0) {
+                // errno is inspected before any other call can overwrite it.
+                if (errno != EAGAIN && errno != ETIMEDOUT) {
+                    const int saved_errno = errno;
+                    PLOG(WARNING) << "Fail to wait ub window of " << _socket;
+                    _socket->SetFailed(saved_errno,
+                                       "Fail to wait ub window of %s: %s",
+                                       _socket->description().c_str(),
+                                       berror(saved_errno));
+                }
+                if (_socket->Failed()) {
+                    // NOTE:
+                    // Different from TCP, we cannot find the UB channel
+                    // failed by writing to it. Thus we must check if it
+                    // is already failed here.
+                    // Balance the registration above before bailing out.
+                    _ub_ep->PollerUnRegisterEpollOut(pollin);
+                    return 1;
+                }
+            }
+            _ub_ep->PollerUnRegisterEpollOut(pollin);
+        }
+    } else {
+        return _tcp_transport->WaitEpollOut(_epollout_butex, pollin, duetime);
+    }
+    return 0;
+}
+
+// Run OnEdge for this socket: inline when user code runs in coroutines,
+// otherwise in a new bthread (background or urgent depending on
+// ubring::FLAGS_ub_edisp_unsched). If the bthread cannot be started, OnEdge
+// is run inline as a fallback.
+void UBShmTransport::ProcessEvent(bthread_attr_t attr) {
+    bthread_t tid;
+    if (FLAGS_usercode_in_coroutine) {
+        OnEdge(_socket);
+    } else if (ubring::FLAGS_ub_edisp_unsched == false) {
+        auto rc = bthread_start_background(&tid, &attr, OnEdge, _socket);
+        if (rc != 0) {
+            LOG(FATAL) << "Fail to start ProcessEvent";
+            OnEdge(_socket);
+        }
+    } else if (bthread_start_urgent(&tid, &attr, OnEdge, _socket) != 0) {
+        LOG(FATAL) << "Fail to start ProcessEvent";
+        OnEdge(_socket);
+    }
+}
+
+void UBShmTransport::QueueMessage(InputMessageClosure& input_msg,
+                                  int* num_bthread_created, bool last_msg) {
+    if (last_msg) {
+        return;
+    }
+    InputMessageBase* to_run_msg = input_msg.release();
+    if (!to_run_msg) {
+        return;
+    }
+
+    if (ubring::FLAGS_ub_disable_bthread) {
+        ProcessInputMessage(to_run_msg);
+        return;
+    }
+    // Create bthread for last_msg. The bthread is not scheduled
+    // until bthread_flush() is called (in the worse case).
+
+    // TODO(gejun): Join threads.
+    bthread_t th;
+    bthread_attr_t tmp = (FLAGS_usercode_in_pthread ? 
+ BTHREAD_ATTR_PTHREAD : + BTHREAD_ATTR_NORMAL) | BTHREAD_NOSIGNAL; + tmp.keytable_pool = _socket->keytable_pool(); + tmp.tag = bthread_self_tag(); + bthread_attr_set_name(&tmp, "ProcessInputMessage"); + + if (!FLAGS_usercode_in_coroutine && bthread_start_background( + &th, &tmp, ProcessInputMessage, to_run_msg) == 0) { + ++*num_bthread_created; + } else { + ProcessInputMessage(to_run_msg); + } +} + +void UBShmTransport::Debug(std::ostream &os) {} + +int UBShmTransport::ContextInitOrDie(bool serverOrNot, const void* _options) { + if (serverOrNot) { + if (!OptionsAvailableOverUB(static_cast(_options))) { + return -1; + } + ubring::GlobalUBInitializeOrDie(); + if (!ubring::InitPollingModeWithTag(static_cast(_options)->bthread_tag)) { + return -1; + } + } else { + if (!OptionsAvailableForUB(static_cast(_options))) { + return -1; + } + ubring::GlobalUBInitializeOrDie(); + if (!ubring::InitPollingModeWithTag(bthread_self_tag())) { + return -1; + } + return 0; + } + + return 0; +} + +bool UBShmTransport::OptionsAvailableForUB(const ChannelOptions* opt) { + if (opt->has_ssl_options()) { + LOG(WARNING) << "Cannot use SSL and UB at the same time"; + return false; + } + if (!ubring::SupportedByUB(opt->protocol.name())) { + LOG(WARNING) << "Cannot use " << opt->protocol.name() + << " over UB"; + return false; + } + return true; +} + +bool UBShmTransport::OptionsAvailableOverUB(const ServerOptions* opt) { + if (opt->rtmp_service) { + LOG(WARNING) << "RTMP is not supported by UB"; + return false; + } + if (opt->has_ssl_options()) { + LOG(WARNING) << "SSL is not supported by UB"; + return false; + } + if (opt->nshead_service) { + LOG(WARNING) << "NSHEAD is not supported by UB"; + return false; + } + if (opt->mongo_service_adaptor) { + LOG(WARNING) << "MONGO is not supported by UB"; + return false; + } + return true; +} +} // namespace brpc +#endif \ No newline at end of file diff --git a/src/brpc/ubshm_transport.h b/src/brpc/ubshm_transport.h new file mode 100644 index 
0000000000..13943d763e --- /dev/null +++ b/src/brpc/ubshm_transport.h @@ -0,0 +1,64 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef BRPC_UB_TRANSPORT_H +#define BRPC_UB_TRANSPORT_H +#if BRPC_WITH_UBRING +#include "brpc/socket.h" +#include "brpc/channel.h" +#include "brpc/transport.h" + +namespace brpc { +class UBShmTransport : public Transport { + friend class TransportFactory; + friend class ubring::UBShmEndpoint; +friend class ubring::UBConnect; +public: + void Init(Socket* socket, const SocketOptions& options) override; + void Release() override; + int Reset(int32_t expected_nref) override; + std::shared_ptr Connect() override; + int CutFromIOBuf(butil::IOBuf* buf) override; + ssize_t CutFromIOBufList(butil::IOBuf** buf, size_t ndata) override; + int WaitEpollOut(butil::atomic* _epollout_butex, bool pollin, const timespec duetime) override; + void ProcessEvent(bthread_attr_t attr) override; + void QueueMessage(InputMessageClosure& inputMsg, int* num_bthread_created, bool last_msg) override; + void Debug(std::ostream &os) override; + ubring::UBShmEndpoint* GetUBShmEp() { + CHECK(_ub_ep != NULL); + return _ub_ep; + } + static int ContextInitOrDie(bool serverOrNot, const void* _options); +private: + 
static bool OptionsAvailableForUB(const ChannelOptions* opt); + static bool OptionsAvailableOverUB(const ServerOptions* opt); +private: + // The on/off state of UB + enum UBState { + UB_ON, + UB_OFF, + UB_UNKNOWN + }; + // The UBShmEndpoint + ubring::UBShmEndpoint* _ub_ep = NULL; + // Should use UB or not + UBState _ub_state; + std::shared_ptr _tcp_transport; +}; +} // namespace brpc +#endif // BRPC_WITH_UBRING +#endif //BRPC_UB_TRANSPORT_H \ No newline at end of file diff --git a/src/bthread/bthread.cpp b/src/bthread/bthread.cpp index 27ded27acd..9b0f45991d 100644 --- a/src/bthread/bthread.cpp +++ b/src/bthread/bthread.cpp @@ -396,13 +396,6 @@ int bthread_equal(bthread_t t1, bthread_t t2) { return t1 == t2; } -#ifdef BUTIL_USE_ASAN -// Fixme!!! -// The noreturn `bthread_exit' may cause a warning of ASan, but does not abort the program. -// -// ==94463==WARNING: ASan is ignoring requested __asan_handle_no_return: stack type: default top: 0x00016dd7f000; bottom 0x00010b1a4000; size: 0x000062bdb000 (1656598528) -// False positive error reports may follow -#endif // BUTIL_USE_ASAN void bthread_exit(void* retval) { bthread::TaskGroup* g = bthread::tls_task_group; if (g != NULL && !g->is_current_main_task()) { diff --git a/src/bthread/rwlock.cpp b/src/bthread/rwlock.cpp index e635668373..e28f5ccb8b 100644 --- a/src/bthread/rwlock.cpp +++ b/src/bthread/rwlock.cpp @@ -15,350 +15,508 @@ // specific language governing permissions and limitations // under the License. +#include #include "bvar/collector.h" +#include "butil/memory/scope_guard.h" #include "bthread/rwlock.h" +#include "bthread/mutex.h" #include "bthread/butex.h" namespace bthread { -// A `bthread_rwlock_t' is a reader/writer mutual exclusion lock, -// which is a bthread implementation of golang RWMutex. -// The lock can be held by an arbitrary number of readers or a single writer. 
-// For details, see https://github.com/golang/go/blob/master/src/sync/rwmutex.go - -// Define in bthread/mutex.cpp +// Defined in bthread/mutex.cpp; reused here so that bthread_rwlock_t +// participates in the global ContentionProfiler just like bthread_mutex_t +// and bthread_sem_t. class ContentionProfiler; extern ContentionProfiler* g_cp; extern bvar::CollectorSpeedLimit g_cp_sl; -extern bool is_contention_site_valid(const bthread_contention_site_t& cs); -extern void make_contention_site_invalid(bthread_contention_site_t* cs); extern void submit_contention(const bthread_contention_site_t& csite, int64_t now_ns); -// It is enough for readers. If the reader exceeds this value, -// need to use `int64_t' instead of `int'. -const int RWLockMaxReaders = 1 << 30; - -// For reading. -static int rwlock_rdlock_impl(bthread_rwlock_t* __restrict rwlock, - const struct timespec* __restrict abstime) { - int reader_count = ((butil::atomic*)&rwlock->reader_count) - ->fetch_add(1, butil::memory_order_acquire) + 1; - // Fast path. - if (reader_count >= 0) { - CHECK_LT(reader_count, RWLockMaxReaders); - return 0; - } - - // Slow path. +// Lazily arm sampling on first contention. Caller must declare +// `size_t sampling_range' and `int64_t start_ns' in scope: +// start_ns == 0 -> not yet decided +// start_ns == -1 -> decided NOT to sample (profiler off / not selected) +// start_ns > 0 -> sampling armed; value is the wall-clock start time +#define BTHREAD_RWLOCK_MAYBE_START_SAMPLING \ + do { \ + if (start_ns == 0) { \ + if (BAIDU_UNLIKELY(g_cp != NULL)) { \ + sampling_range = bvar::is_collectable(&g_cp_sl); \ + start_ns = bvar::is_sampling_range_valid(sampling_range) ? \ + butil::cpuwide_time_ns() : -1; \ + } else { \ + start_ns = -1; \ + } \ + } \ + } while (0) - // Don't sample when contention profiler is off. - if (NULL == bthread::g_cp) { - return bthread_sem_timedwait(&rwlock->reader_sema, abstime); - } - // Ask Collector if this (contended) locking should be sampled. 
- const size_t sampling_range = bvar::is_collectable(&bthread::g_cp_sl); - if (!bvar::is_sampling_range_valid(sampling_range)) { // Don't sample. - return bthread_sem_timedwait(&rwlock->reader_sema, abstime); +// Submit one contention sample if sampling was armed for this attempt. +// `start_ns > 0' is the convention used everywhere in this file to indicate +// that BTHREAD_RWLOCK_MAYBE_START_SAMPLING actually decided to sample. +// No-op otherwise. Force-inlined so the uncontended fast path stays cheap. +static BUTIL_FORCE_INLINE void submit_contention_if_sampled( + int64_t start_ns, size_t sampling_range) { + if (BAIDU_UNLIKELY(start_ns > 0)) { + const int64_t end_ns = butil::cpuwide_time_ns(); + const bthread_contention_site_t csite{end_ns - start_ns, sampling_range}; + submit_contention(csite, end_ns); } - - // Sample. - const int64_t start_ns = butil::cpuwide_time_ns(); - int rc = bthread_sem_timedwait(&rwlock->reader_sema, abstime); - const int64_t end_ns = butil::cpuwide_time_ns(); - const bthread_contention_site_t csite{end_ns - start_ns, sampling_range}; - // Submit `csite' for each reader immediately after - // owning rdlock to avoid the contention of `csite'. - bthread::submit_contention(csite, end_ns); - - return rc; -} - -static inline int rwlock_rdlock(bthread_rwlock_t* rwlock) { - return rwlock_rdlock_impl(rwlock, NULL); } -static inline int rwlock_timedrdlock(bthread_rwlock_t* __restrict rwlock, - const struct timespec* __restrict abstime) { - return rwlock_rdlock_impl(rwlock, abstime); -} +// bthread RWLock +// writer-priority implementation overview +// Three synchronization fields are used: +// +// * `lock_word' (32-bit butex): +// bit 31 : 1 if the write lock is held, 0 otherwise. +// bit 0~30: number of readers currently holding the read lock. +// Mutually exclusive: when bit 31 is set, the lower 31 bits are 0. +// +// * `writer_wait_count' (32-bit butex): +// Number of writers that have entered wrlock() but not yet finished +// (i.e. 
currently waiting for the mutex / waiting for lock_word==0 / +// holding the write lock). Each writer accounts for itself: it is +// incremented at the very beginning of wrlock() and decremented at +// the very end of unwrlock()/cleanup(). +// Readers consult this field to implement writer-priority: if any +// writer is "in flight", new readers yield by waiting on it. +// +// * `writer_queue_mutex' (bthread_mutex_t): +// Serializes writers so that at most one writer races for `lock_word' +// at any time. Other writers queue up on this mutex. +// +// Wakeup channels: +// * Readers waiting on writers -> wait on writer_wait_count, woken by unwrlock/cleanup +// * Writers waiting on readers -> wait on lock_word, woken by unrdlock +// * Writers waiting on writers -> wait on writer_queue_mutex + +static int rwlock_rdlock(bthread_rwlock_t* rwlock, bool try_lock, + const struct timespec* abstime) { + auto lock_word = (butil::atomic*)rwlock->lock_word; + auto writer_wait_count = (butil::atomic*)rwlock->writer_wait_count; + + // Sampling state for the contention profiler (lazily armed on first + // contention so that the uncontended fast path stays cheap): + // start_ns == 0 -> not yet decided + // start_ns == -1 -> decided NOT to sample + // start_ns > 0 -> sampling armed; submit on exit + // Each reader samples independently and submits once on its own way out; + // we deliberately do NOT use rwlock->writer_csite here because that field + // is exclusively owned by the writer. + size_t sampling_range = bvar::INVALID_SAMPLING_RANGE; + int64_t start_ns = 0; + int rc = 0; -// Returns 0 if the lock was acquired, otherwise errno. -static inline int rwlock_tryrdlock(bthread_rwlock_t* rwlock) { while (true) { - int reader_count = ((butil::atomic*)&rwlock->reader_count) - ->load(butil::memory_order_relaxed); - if (reader_count < 0) { - // Failed to acquire the read lock because there is a writer. 
- return EBUSY; - } - if (((butil::atomic*)&rwlock->reader_count) - ->compare_exchange_weak(reader_count, reader_count + 1, - butil::memory_order_acquire, - butil::memory_order_relaxed)) { - return 0; + // Writer-priority: if any writer is in flight, yield to it. + // `relaxed' is sufficient here because: + // - There is no data published via writer_wait_count; + // data visibility is established via the acquire-CAS on + // `lock_word' below paired with the release-CAS in unwrlock(). + // - butex_wait() will re-check the expected value before sleeping, + // so we cannot lose a wakeup even if `w' is slightly stale. + unsigned w = writer_wait_count->load(butil::memory_order_relaxed); + if (w > 0) { + if (try_lock) { + // Don't sample tryrdlock failures: they are by design a + // non-blocking probe, not a contention event. + return EBUSY; + } + // We are about to block on writer_wait_count; arm sampling + // before parking so the wait time is included in the report. + BTHREAD_RWLOCK_MAYBE_START_SAMPLING; + if (butex_wait(writer_wait_count, w, abstime) < 0 && + errno != EWOULDBLOCK && errno != EINTR) { + rc = errno; + break; + } + continue; } - } -} -static inline int rwlock_unrdlock(bthread_rwlock_t* rwlock) { - int reader_count = ((butil::atomic*)&rwlock->reader_count) - ->fetch_add(-1, butil::memory_order_relaxed) - 1; - // Fast path. - if (reader_count >= 0) { - return 0; + // No writer in flight: try to add ourselves to the reader count. + // 2^31 - 1 readers should be enough for any realistic workload. + unsigned l = lock_word->load(butil::memory_order_relaxed); + if ((l >> 31) == 0) { + // Refuse to increment when the reader count has saturated + // the low 31 bits. Otherwise `l + 1' would flip bit 31 and + // we would corrupt lock_word into "writer held" state. + // POSIX-style: report EAGAIN ("max read locks exceeded"). 
+ if (BAIDU_UNLIKELY(l == 0x7FFFFFFFu)) { + LOG(ERROR) << "Too many readers on bthread_rwlock_t=" << rwlock; + rc = EAGAIN; + break; + } + // Acquire on success synchronizes-with the release-CAS in + // unwrlock(), so any data written by the previous writer is + // visible to us before we start reading. + if (lock_word->compare_exchange_weak(l, l + 1, + butil::memory_order_acquire, + butil::memory_order_relaxed)) { + rc = 0; + break; + } + // CAS failed (likely another reader bumped r): retry. + } else if (try_lock) { + // Write lock is currently held. + return EBUSY; + } else { + // Write lock currently held but not yet self-accounted as a + // pending writer (very narrow window inside wrlock). Arm + // sampling now so the spin/wait until writer_wait_count >= 1 + // is also accounted for. + BTHREAD_RWLOCK_MAYBE_START_SAMPLING; + } + // Otherwise (write lock held but not try_lock): spin once more. + // The next iteration will observe writer_wait_count >= 1 (writers + // self-account in writer_wait_count for the entire wrlock lifetime), + // and we will block on it instead of busy spinning. } - // Slow path. - if (BAIDU_UNLIKELY(reader_count + 1 == 0 || reader_count + 1 == -RWLockMaxReaders)) { - CHECK(false) << "rwlock_unrdlock of unlocked rwlock"; - return EINVAL; - } + // Submit one contention sample for this reader (success or failure). + submit_contention_if_sampled(start_ns, sampling_range); + return rc; +} - // A writer is pending. - int reader_wait = ((butil::atomic*)&rwlock->reader_wait) - ->fetch_add(-1, butil::memory_order_relaxed) - 1; - if (reader_wait != 0) { +static int rwlock_unrdlock(bthread_rwlock_t* rwlock) { + auto lock_word = (butil::atomic*)rwlock->lock_word; + while (true) { + unsigned l = lock_word->load(butil::memory_order_relaxed); + // Misuse detection: the caller must currently hold a read lock. + // l == 0 -> no lock is held (double unlock?) 
+ // (l >> 31) != 0 -> write lock is held, not read lock + if (l == 0 || (l >> 31) != 0) { + LOG(ERROR) << "Invalid unrdlock on bthread_rwlock_t=" << rwlock + << ", lock_word=" << l; + return EINVAL; + } + // Release on success publishes any reads/writes done while holding + // the read lock to the next acquirer (typically a writer's + // acquire-CAS in wrlock()). + if(!(lock_word->compare_exchange_weak(l, l - 1, + butil::memory_order_release, + butil::memory_order_relaxed))) { + continue; + } + // We were the last reader (lock_word transitioned 1 -> 0). Wake the + // single writer (if any) that may be sleeping on `lock_word' inside + // wrlock(). At most one writer can be there because writers are + // serialized by writer_queue_mutex. + // No-op if nobody is waiting; butex_wake() short-circuits cheaply. + if (l == 1) { + butex_wake(lock_word); + } return 0; } +} - // The last reader unblocks the writer. - - if (NULL == bthread::g_cp) { - bthread_sem_post(&rwlock->writer_sema); - return 0; +// Roll back the side effects of a failed wrlock attempt: +// - Release writer_queue_mutex if we managed to acquire it. +// - Decrement our share of writer_wait_count. +// - If we were the last in-flight writer, wake all readers that have +// been parked by writer-priority (w == 1 means writer_wait_count is now 0). +// Called on EBUSY (try_lock failed), ETIMEDOUT, EINTR-leading-to-fail. +static BUTIL_FORCE_INLINE void rwlock_wrlock_cleanup(bthread_rwlock_t* rwlock, bool write_queue_locked) { + if (write_queue_locked) { + bthread_mutex_unlock(&rwlock->writer_queue_mutex); } - // Ask Collector if this (contended) locking should be sampled. - const size_t sampling_range = bvar::is_collectable(&bthread::g_cp_sl); - if (!sampling_range) { // Don't sample - bthread_sem_post(&rwlock->writer_sema); - return 0; + auto writer_wait_count = (butil::atomic*)rwlock->writer_wait_count; + // Withdraw our writer-priority "vote" so readers can make progress. 
+ auto w = writer_wait_count->fetch_sub(1, butil::memory_order_relaxed); + // w is the value BEFORE the subtraction, so w == 1 means we were the + // last writer in flight; wake every reader parked on writer_wait_count. + if (w == 1) { + butex_wake_all(writer_wait_count); } - - // Sampling. - const int64_t start_ns = butil::cpuwide_time_ns(); - bthread_sem_post(&rwlock->writer_sema); - const int64_t end_ns = butil::cpuwide_time_ns(); - const bthread_contention_site_t csite{end_ns - start_ns, sampling_range}; - // Submit `csite' for each reader immediately after - // releasing rdlock to avoid the contention of `csite'. - bthread::submit_contention(csite, end_ns); - return 0; } -#define DO_CSITE_IF_NEED \ - do { \ - /* Don't sample when contention profiler is off. */ \ - if (NULL != bthread::g_cp) { \ - /* Ask Collector if this (contended) locking should be sampled. */ \ - sampling_range = bvar::is_collectable(&bthread::g_cp_sl); \ - start_ns = bvar::is_sampling_range_valid(sampling_range) ? \ - butil::cpuwide_time_ns() : -1; \ - } else { \ - start_ns = -1; \ - } \ - } while (0) - -#define SUBMIT_CSITE_IF_NEED \ - do { \ - if (ETIMEDOUT == rc && start_ns > 0) { \ - /* Failed to lock due to ETIMEDOUT, submit the elapse directly. */ \ - const int64_t end_ns = butil::cpuwide_time_ns(); \ - const bthread_contention_site_t csite{end_ns - start_ns, sampling_range}; \ - bthread::submit_contention(csite, end_ns); \ - } \ - } while (0) - -// For writing. -static inline int rwlock_wrlock_impl(bthread_rwlock_t* __restrict rwlock, - const struct timespec* __restrict abstime) { - // First, resolve competition with other writers. 
- int rc = bthread_mutex_trylock(&rwlock->write_queue_mutex); +static int rwlock_wrlock(bthread_rwlock_t* rwlock, bool try_lock, + const struct timespec* abstime) { + auto writer_wait_count = (butil::atomic*)rwlock->writer_wait_count; + // Step 1: announce ourselves before doing anything else, so that + // concurrent readers immediately observe writer-priority and back off. + // This MUST happen before we try to acquire writer_queue_mutex, + // otherwise a flood of readers could starve us indefinitely. + // 2^31 in-flight writers should be enough for any realistic workload. + writer_wait_count->fetch_add(1, butil::memory_order_relaxed); + + // Sampling state for the contention profiler. Both wrlock() and + // unwrlock() sample independently: wrlock() submits its own wait time + // on the way out (success or failure); unwrlock() samples its own + // CAS-spin / mutex_unlock / butex_wake_all latency separately. We do + // NOT use rwlock->writer_csite here -- the two operations are not + // forced to share a single sample. size_t sampling_range = bvar::INVALID_SAMPLING_RANGE; - // -1: don't sample. - // 0: default value. - // > 0: Start time of sampling. int64_t start_ns = 0; - if (0 != rc) { - DO_CSITE_IF_NEED; - rc = bthread_mutex_timedlock(&rwlock->write_queue_mutex, abstime); + // Step 2: serialize with other writers. At most one writer holds + // `writer_queue_mutex' at a time and races for `lock_word'. + int rc = bthread_mutex_trylock(&rwlock->writer_queue_mutex); + if (0 != rc) { + if (try_lock) { + // Fail to acquire the wrlock. Don't sample trywrlock failures: + // they are by design a non-blocking probe, not a contention event. + rwlock_wrlock_cleanup(rwlock, false); + return rc; + } + // We are about to block on writer_queue_mutex; arm sampling. + // Note: the inner mutex itself has csite disabled (see init), so + // its blocking time is only counted once -- here, by the rwlock. 
+ BTHREAD_RWLOCK_MAYBE_START_SAMPLING; + rc = bthread_mutex_timedlock(&rwlock->writer_queue_mutex, abstime); if (0 != rc) { - SUBMIT_CSITE_IF_NEED; + // Fail to acquire the wrlock. Submit the elapsed wait time + // directly (no unwrlock() will run for this writer). + submit_contention_if_sampled(start_ns, sampling_range); + rwlock_wrlock_cleanup(rwlock, false); return rc; } } - // Announce to readers there is a pending writer. - int reader_count = ((butil::atomic*)&rwlock->reader_count) - ->fetch_add(-RWLockMaxReaders, butil::memory_order_release); - // Wait for active readers. - if (reader_count != 0 && - ((butil::atomic*)&rwlock->reader_wait) - ->fetch_add(reader_count) + reader_count != 0) { - rc = bthread_sem_trywait(&rwlock->writer_sema); - if (0 != rc) { - if (0 == start_ns) { - DO_CSITE_IF_NEED; + // Step 3: with `writer_queue_mutex' held, wait for all readers to drain + // and then claim the write bit of `lock_word'. + auto lock_word = (butil::atomic*)rwlock->lock_word; + while (true) { + unsigned l = lock_word->load(butil::memory_order_relaxed); + if (l != 0) { + // Readers still hold the lock. Park on `lock_word' until the last + // reader releases (unrdlock will butex_wake on transition 1->0). + if (try_lock) { + errno = EBUSY; + break; } - - rc = bthread_sem_timedwait(&rwlock->writer_sema, abstime); - if (0 != rc) { - SUBMIT_CSITE_IF_NEED; - bthread_mutex_unlock(&rwlock->write_queue_mutex); - return rc; + // Arm sampling before parking so the wait-for-readers time is + // counted (in case the queue_mutex acquisition above was uncontended). + BTHREAD_RWLOCK_MAYBE_START_SAMPLING; + // Use the freshly read `r' as expected; if lock_word changes + // before we sleep, butex_wait returns EWOULDBLOCK and we retry. 
+ if (butex_wait(lock_word, l, abstime) < 0 && + errno != EWOULDBLOCK && errno != EINTR) { + break; } + continue; } + // Acquire on success synchronizes-with release-CAS in + // unrdlock()/unwrlock(): we will see all data published by the + // previous reader/writer before we start writing. + if (lock_word->compare_exchange_weak(l, (unsigned)(1 << 31), + butil::memory_order_acquire, + butil::memory_order_relaxed)) { + // Submit the writer's wait sample immediately on success. + // unwrlock() will sample its own latency separately. + submit_contention_if_sampled(start_ns, sampling_range); + return 0; + } + // CAS may spuriously fail (weak); retry without sleeping. } - if (start_ns > 0) { - rwlock->writer_csite.duration_ns = butil::cpuwide_time_ns() - start_ns; - rwlock->writer_csite.sampling_range = sampling_range; - } - rwlock->wlock_flag = true; - return 0; -} -#undef DO_CSITE_IF_NEED -#undef SUBMIT_CSITE_IF_NEED -static inline int rwlock_wrlock(bthread_rwlock_t* rwlock) { - return rwlock_wrlock_impl(rwlock, NULL); + // Failure path: snapshot errno before cleanup, because + // bthread_mutex_unlock / butex_wake_all inside cleanup may invoke + // syscalls or yield and clobber errno on this thread. + int saved_errno = errno; + // Submit the elapsed wait directly; we never reached unwrlock(). 
+ submit_contention_if_sampled(start_ns, sampling_range); + rwlock_wrlock_cleanup(rwlock, true); + return saved_errno; } -static inline int rwlock_timedwrlock(bthread_rwlock_t* __restrict rwlock, - const struct timespec* __restrict abstime) { - return rwlock_wrlock_impl(rwlock, abstime); -} - -static inline int rwlock_trywrlock(bthread_rwlock_t* rwlock) { - int rc = bthread_mutex_trylock(&rwlock->write_queue_mutex); - if (0 != rc) { - return rc; - } - - int expected = 0; - if (!((butil::atomic*)&rwlock->reader_count) - ->compare_exchange_strong(expected, -RWLockMaxReaders, - butil::memory_order_acquire, - butil::memory_order_relaxed)) { - // Failed to acquire the write lock because there are active readers. - bthread_mutex_unlock(&rwlock->write_queue_mutex); - return EBUSY; - } - rwlock->wlock_flag = true; - - return 0; -} +static int rwlock_unwrlock(bthread_rwlock_t* rwlock) { + auto lock_word = (butil::atomic*)rwlock->lock_word; + auto writer_wait_count = (butil::atomic*)rwlock->writer_wait_count; -static inline void rwlock_unwrlock_slow(bthread_rwlock_t* rwlock, int reader_count) { - bthread_sem_post_n(&rwlock->reader_sema, reader_count); - // Allow other writers to proceed. - bthread_mutex_unlock(&rwlock->write_queue_mutex); -} - -static inline int rwlock_unwrlock(bthread_rwlock_t* rwlock) { - rwlock->wlock_flag = false; + // Sampling state for the contention profiler. unwrlock() samples + // independently of wrlock(): although the release-CAS itself cannot + // fail due to writer-writer contention (writers are serialized by + // writer_queue_mutex), the body still does mutex_unlock(), + // butex_wake_all() and may spuriously spin on the weak CAS, all of + // which contribute to the critical-section tail latency. + size_t sampling_range = bvar::INVALID_SAMPLING_RANGE; + int64_t start_ns = 0; + BTHREAD_RWLOCK_MAYBE_START_SAMPLING; - // Announce to readers there is no active writer. 
- int reader_count = ((butil::atomic*)&rwlock->reader_count)->fetch_add( - RWLockMaxReaders, butil::memory_order_release) + RWLockMaxReaders; - if (BAIDU_UNLIKELY(reader_count >= RWLockMaxReaders)) { - CHECK(false) << "rwlock_unwlock of unlocked rwlock"; - return EINVAL; - } + while (true) { + unsigned l = lock_word->load(butil::memory_order_relaxed); + // Misuse detection: we must currently hold the write lock. + if (BAIDU_UNLIKELY(l != (unsigned)(1 << 31))) { + LOG(ERROR) << "Invalid unwrlock!"; + return EINVAL; + } + // Release-CAS publishes all writes performed under the write lock + // to the next acquirer (a reader's acquire-CAS or another writer's + // acquire-CAS). The CAS itself cannot fail due to contention since + // writers are serialized by writer_queue_mutex; weak failure here is + // only a spurious CAS failure -- just retry. + if (!lock_word->compare_exchange_weak(l, 0, + butil::memory_order_release, + butil::memory_order_relaxed)) { + continue; + } - bool is_valid = bthread::is_contention_site_valid(rwlock->writer_csite); - if (BAIDU_UNLIKELY(is_valid)) { - bthread_contention_site_t saved_csite = rwlock->writer_csite; - bthread::make_contention_site_invalid(&rwlock->writer_csite); + // ---- Order of the next two operations is INTENTIONAL ---- + // + // We deliberately: + // (1) unlock writer_queue_mutex FIRST, then + // (2) fetch_sub(writer_wait_count) and conditionally wake readers. + // + // Rationale (writer-priority semantics): + // * Any writer queued on writer_queue_mutex has already + // fetch_add'ed its share into writer_wait_count back in wrlock() + // (before it even tried to lock the mutex). So when it wakes + // up here and we later fetch_sub, the counter still reflects + // "there is at least one more writer in flight": w_old >= 2, + // which means w != 1, which means we will NOT wake readers. + // Readers must keep yielding to the next writer -- exactly the + // writer-priority invariant. 
+ // * Only when we are truly the last writer in flight (w_old == 1 + // after our fetch_sub, i.e. writer_wait_count is now 0) do we + // wake_all readers parked on writer_wait_count. + // + // Subtle but harmless effect: + // Between (1) and (2) there is a small window in which our + // own "ghost share" is still counted in writer_wait_count even though + // we have effectively left. New readers entering rdlock() during + // this window will see writer_wait_count >= 1 and park on it; they + // will be woken either by step (2) below (if no successor writer + // appeared) or by the successor writer's eventual unwrlock. + // No wakeup is ever lost: butex_wait re-checks the expected + // value before truly sleeping, and any successor writer will + // itself execute this same wake logic on its way out. + // + // Reversing the order (fetch_sub before unlock mutex) would break + // strict writer-priority because woken readers could grab the + // read lock before a successor writer queued on the mutex even + // gets a chance to CAS lock_word. + bthread_mutex_unlock(&rwlock->writer_queue_mutex); + unsigned w = writer_wait_count->fetch_sub(1, butil::memory_order_relaxed); + if (w == 1) { + butex_wake_all(writer_wait_count); + } - const int64_t unlock_start_ns = butil::cpuwide_time_ns(); - rwlock_unwrlock_slow(rwlock, reader_count); - const int64_t unlock_end_ns = butil::cpuwide_time_ns(); - saved_csite.duration_ns += unlock_end_ns - unlock_start_ns; - bthread::submit_contention(saved_csite, unlock_end_ns); - } else { - rwlock_unwrlock_slow(rwlock, reader_count); + // Submit our own unwrlock-side sample (CAS spin + mutex_unlock + + // butex_wake_all). This is independent of the wrlock-side sample. + submit_contention_if_sampled(start_ns, sampling_range); + return 0; } - - return 0; } -static inline int rwlock_unlock(bthread_rwlock_t* rwlock) { - if (rwlock->wlock_flag) { +// Generic unlock entry that dispatches to unwrlock/unrdlock by inspecting +// `lock_word'. 
This is safe ONLY because the caller must already hold one of +// the two locks: while holding a read lock the high bit of `lock_word' cannot +// flip on, and while holding the write lock the low bits cannot be set. +// Therefore a relaxed load is sufficient to make the dispatch decision. +static int rwlock_unlock(bthread_rwlock_t* rwlock) { + auto lock_word = (butil::atomic*)rwlock->lock_word; + unsigned r = lock_word->load(butil::memory_order_relaxed); + if ((r >> 31) != 0) { return rwlock_unwrlock(rwlock); } else { return rwlock_unrdlock(rwlock); } } -} // namespace bthread - -__BEGIN_DECLS - -int bthread_rwlock_init(bthread_rwlock_t* __restrict rwlock, - const bthread_rwlockattr_t* __restrict) { - int rc = bthread_sem_init(&rwlock->reader_sema, 0); - if (BAIDU_UNLIKELY(0 != rc)) { - return rc; +// Deleter that turns butex_create_checked()'s raw pointer into something +// std::unique_ptr can clean up automatically. Using RAII here lets the +// init-error paths just `return rc' without manually unwinding partial +// allocations; ownership is `release()'d only on the all-success path. 
+struct ButexDeleter { + void operator()(void* butex) const { + if (butex != NULL) { + butex_destroy(butex); + } } - bthread_sem_disable_csite(&rwlock->reader_sema); - rc = bthread_sem_init(&rwlock->writer_sema, 0); - if (BAIDU_UNLIKELY(0 != rc)) { - bthread_sem_destroy(&rwlock->reader_sema); - return rc; +}; + +static int rwlock_init(bthread_rwlock_t* rwlock) { + std::unique_ptr writer_wait_count( + butex_create_checked()); + if (writer_wait_count == NULL) { + LOG(ERROR) << "Fail to create writer_wait_count butex: out of memory"; + return ENOMEM; } - bthread_sem_disable_csite(&rwlock->writer_sema); - - rwlock->reader_count = 0; - rwlock->reader_wait = 0; - rwlock->wlock_flag = false; + std::unique_ptr lock_word(butex_create_checked()); + if (lock_word == NULL) { + LOG(ERROR) << "Fail to create lock_word butex: out of memory"; + return ENOMEM; + } + *writer_wait_count = 0; + *lock_word = 0; bthread_mutexattr_t attr; bthread_mutexattr_init(&attr); + BRPC_SCOPE_EXIT { bthread_mutexattr_destroy(&attr); }; + // Disable csite on the inner queue mutex so the writer's wait time is + // accounted exactly once -- by the rwlock layer, not double-counted via + // the inner mutex. bthread_mutexattr_disable_csite(&attr); - rc = bthread_mutex_init(&rwlock->write_queue_mutex, &attr); - if (BAIDU_UNLIKELY(0 != rc)) { - bthread_sem_destroy(&rwlock->reader_sema); - bthread_sem_destroy(&rwlock->writer_sema); + const int rc = bthread_mutex_init(&rwlock->writer_queue_mutex, &attr); + if (rc != 0) { + LOG(ERROR) << "Fail to init writer_queue_mutex, rc=" << rc; return rc; } - bthread_mutexattr_destroy(&attr); - - bthread::make_contention_site_invalid(&rwlock->writer_csite); + // All resources successfully created; transfer butex ownership to + // rwlock. From here on, bthread_rwlock_destroy() is responsible for + // releasing them. 
+ rwlock->writer_wait_count = writer_wait_count.release(); + rwlock->lock_word = lock_word.release(); return 0; } +static int rwlock_destroy(bthread_rwlock_t* rwlock) { + // Destroy the inner mutex first; bthread_mutex_init() allocates an + // internal butex which would otherwise leak. Pointers are nulled to + // surface accidental double-destroy / use-after-destroy bugs early. + int rc = bthread_mutex_destroy(&rwlock->writer_queue_mutex); + if (rc != 0) { + LOG(ERROR) << "Fail to destroy writer_queue_mutex, rc=" << rc; + } + if (rwlock->writer_wait_count != NULL) { + butex_destroy(rwlock->writer_wait_count); + rwlock->writer_wait_count = NULL; + } + if (rwlock->lock_word != NULL) { + butex_destroy(rwlock->lock_word); + rwlock->lock_word = NULL; + } + return rc; +} + +} // namespace bthread + +__BEGIN_DECLS + +int bthread_rwlock_init(bthread_rwlock_t* __restrict rwlock, + const bthread_rwlockattr_t* __restrict) { + return bthread::rwlock_init(rwlock); +} + int bthread_rwlock_destroy(bthread_rwlock_t* rwlock) { - bthread_sem_destroy(&rwlock->reader_sema); - bthread_sem_destroy(&rwlock->writer_sema); - bthread_mutex_destroy(&rwlock->write_queue_mutex); - return 0; + return bthread::rwlock_destroy(rwlock); } int bthread_rwlock_rdlock(bthread_rwlock_t* rwlock) { - return bthread::rwlock_rdlock(rwlock); + return bthread::rwlock_rdlock(rwlock, false, NULL); } int bthread_rwlock_tryrdlock(bthread_rwlock_t* rwlock) { - return bthread::rwlock_tryrdlock(rwlock); + return bthread::rwlock_rdlock(rwlock, true, NULL); } int bthread_rwlock_timedrdlock(bthread_rwlock_t* __restrict rwlock, const struct timespec* __restrict abstime) { - return bthread::rwlock_timedrdlock(rwlock, abstime); + return bthread::rwlock_rdlock(rwlock, false, abstime); } int bthread_rwlock_wrlock(bthread_rwlock_t* rwlock) { - return bthread::rwlock_wrlock(rwlock); + return bthread::rwlock_wrlock(rwlock, false, NULL); } int bthread_rwlock_trywrlock(bthread_rwlock_t* rwlock) { - return 
bthread::rwlock_trywrlock(rwlock); + return bthread::rwlock_wrlock(rwlock, true, NULL); } int bthread_rwlock_timedwrlock(bthread_rwlock_t* __restrict rwlock, const struct timespec* __restrict abstime) { - return bthread::rwlock_timedwrlock(rwlock, abstime); + return bthread::rwlock_wrlock(rwlock, false, abstime); } int bthread_rwlock_unlock(bthread_rwlock_t* rwlock) { diff --git a/src/bthread/task_group.cpp b/src/bthread/task_group.cpp index 579bb23120..4706b7f77e 100644 --- a/src/bthread/task_group.cpp +++ b/src/bthread/task_group.cpp @@ -247,6 +247,12 @@ TaskGroup::~TaskGroup() { } #ifdef BUTIL_USE_ASAN +// Returns the **highest** address of the calling pthread's stack and its +// total size, matching brpc's `StackStorage::bottom` convention (see comment +// in bthread/stack.h: "Assume stack grows upwards"). Note that on Linux +// `pthread_attr_getstack(3)` returns the lowest address of the region, so +// we have to translate it; on macOS `pthread_get_stackaddr_np(3)` already +// returns the stack base (highest address), so we use it as-is. int PthreadAttrGetStack(void*& stack_addr, size_t& stack_size) { #if defined(OS_MACOSX) stack_addr = pthread_get_stackaddr_np(pthread_self()); @@ -259,9 +265,13 @@ int PthreadAttrGetStack(void*& stack_addr, size_t& stack_size) { LOG(ERROR) << "Fail to get pthread attributes: " << berror(rc); return rc; } - rc = pthread_attr_getstack(&attr, &stack_addr, &stack_size); + void* stack_lowest = NULL; + rc = pthread_attr_getstack(&attr, &stack_lowest, &stack_size); if (0 != rc) { LOG(ERROR) << "Fail to get pthread stack: " << berror(rc); + } else { + // Translate lowest -> highest to match StackStorage::bottom. + stack_addr = (char*)stack_lowest + stack_size; } pthread_attr_destroy(&attr); return rc; @@ -635,6 +645,10 @@ int TaskGroup::join(bthread_t tid, void** return_value) { return errno; } } + // Ensure all memory writes made by the joined bthread are visible to + // the joining thread after join returns. 
This matches the semantic + // guarantee provided by pthread_join() across supported architectures. + butil::atomic_thread_fence(butil::memory_order_acquire); if (return_value) { *return_value = NULL; } diff --git a/src/bthread/types.h b/src/bthread/types.h index 86148c938b..d46de1e835 100644 --- a/src/bthread/types.h +++ b/src/bthread/types.h @@ -225,16 +225,26 @@ typedef struct bthread_sem_t { typedef struct bthread_rwlock_t { #if defined(__cplusplus) bthread_rwlock_t() - : reader_count(0), reader_wait(0), wlock_flag(false), writer_csite{} {} + : writer_wait_count(0), lock_word(NULL) {} DISALLOW_COPY_AND_ASSIGN(bthread_rwlock_t); #endif - bthread_sem_t reader_sema; // Semaphore for readers to wait for completing writers. - bthread_sem_t writer_sema; // Semaphore for writers to wait for completing readers. - int reader_count; // Number of pending readers. - int reader_wait; // Number of departing readers. - bool wlock_flag; // Flag used to indicate that a write lock has been held. - bthread_mutex_t write_queue_mutex; // Held if there are pending writers. - bthread_contention_site_t writer_csite; + // Number of writers currently in flight (used as a butex): + // writers waiting on writer_queue_mutex, writers waiting for + // lock_word == 0, and the writer currently holding the write lock + // are all counted here. Each writer accounts for itself: incremented + // at the very beginning of wrlock() and decremented at the very end + // of unwrlock()/cleanup(). Readers consult this field to honor + // writer-priority: any non-zero value parks new readers. + unsigned* writer_wait_count; + // Serializes writers so that at most one writer at a time races for + // lock_word. Other writers queue up on this mutex. + bthread_mutex_t writer_queue_mutex; + // Bit-packed atomic lock word (used as a butex): + // bit 31 : 1 if the write lock is held, 0 otherwise. + // bit 0~30: number of readers currently holding the read lock. + // 0 : unlocked. 
+ // The high bit and the low 31 bits are mutually exclusive. + unsigned* lock_word; } bthread_rwlock_t; typedef struct { diff --git a/src/bvar/detail/combiner.h b/src/bvar/detail/combiner.h index cae1b8ea8f..3007f50da8 100644 --- a/src/bvar/detail/combiner.h +++ b/src/bvar/detail/combiner.h @@ -233,7 +233,38 @@ friend class GlobalValue; ~AgentCombiner() { if (_id >= 0) { - clear_all_agents(); + // NOTE: We intentionally do NOT walk `_agents` here (e.g. via the + // previously existed `clear_all_agents()`). + // + // `Agent` instances live inside per-thread `ThreadBlock`s owned by + // `AgentGroup` and are destroyed when their owning thread exits + // (via `_destroy_tls_blocks`). At that point `~Agent` calls + // `combiner.lock()`; if the combiner has already started its + // destruction the `weak_ptr` is expired and the agent will skip + // `commit_and_erase`, leaving its `LinkNode` linked to this + // combiner's `_agents`. If we tried to traverse `_agents` here we + // could touch agent nodes whose `ThreadBlock` was just freed by + // a concurrent thread-exit, causing heap-use-after-free + // (see issue #2937 follow-up). + // + // It is safe to leave the list "dirty" because: + // * `butil::LinkedList` / `butil::LinkNode` have trivial + // destructors and never traverse on destruction, so tearing + // down `_agents` here does not dereference any agent node. + // * After this combiner is gone, every still-alive `Agent` will + // observe `combiner.expired() == true` in `~Agent` and skip + // `commit_and_erase`, so the dangling `prev_/next_` pointers + // in those agents are never read. + // * If the freed `_id` is later reused by a new combiner and the + // same TLS slot is taken, `get_or_create_tls_agent` will call + // `Agent::reset` and `Append` the agent into the new + // combiner's `_agents`. `LinkNode::InsertBefore` only writes + // `prev_/next_` (never reads their stale values), so the + // dangling pointers are safely overwritten. 
+ // * `Agent::element` is destroyed together with the `ThreadBlock`, + // so any non-POD resource it holds is still released; if the + // agent slot is reused, `Agent::reset` will overwrite the + // element value before it is observed again. AgentGroup::destroy_agent(_id); _id = -1; } @@ -319,18 +350,31 @@ friend class GlobalValue; return agent; } - void clear_all_agents() { - butil::AutoLock guard(_lock); - // Resting agents is must because the agent object may be reused. - // Set element to be default-constructed so that if it's non-pod, - // internal allocations should be released. - for (butil::LinkNode* node = _agents.head(); node != _agents.end();) { - node->value()->reset(ElementTp(), NULL); - butil::LinkNode* const saved_next = node->next(); - node->RemoveFromList(); - node = saved_next; - } - } + // NOTE: `clear_all_agents()` is intentionally kept but no longer called + // from `~AgentCombiner` (see the long comment in `~AgentCombiner`). + // + // Calling it from the destructor is unsafe: by the time the destructor + // runs, agent weak_ptrs have already expired and `~Agent` will skip + // `commit_and_erase`; a concurrent thread-exit can therefore free the + // `ThreadBlock` (and the agents inside it) while we are still walking + // `_agents` here, which is a heap-use-after-free. + // + // The body is left around (commented out) for reference / future use -- + // do NOT re-enable it from `~AgentCombiner`. + // + // void clear_all_agents() { + // butil::AutoLock guard(_lock); + // // Resetting agents is a must because the agent object may be + // // reused. Set element to be default-constructed so that if it's + // // non-pod, internal allocations should be released. 
+ // for (butil::LinkNode* node = _agents.head(); + // node != _agents.end();) { + // node->value()->reset(ElementTp(), NULL); + // butil::LinkNode* const saved_next = node->next(); + // node->RemoveFromList(); + // node = saved_next; + // } + // } const BinaryOp& op() const { return _op; } diff --git a/test/brpc_channel_unittest.cpp b/test/brpc_channel_unittest.cpp index 2004767470..db6e2ac777 100644 --- a/test/brpc_channel_unittest.cpp +++ b/test/brpc_channel_unittest.cpp @@ -298,14 +298,10 @@ class ChannelTest : public ::testing::Test{ cntl->_current_call.sending_sock.reset(ptr.release()); cntl->_server = &ts->_dummy; - google::protobuf::Closure* done = - brpc::NewCallback< - int64_t, brpc::Controller*, - brpc::RpcPBMessages*, - const brpc::Server*, + google::protobuf::Closure* done = brpc::NewCallback< + int64_t, brpc::Controller*, brpc::RpcPBMessages*, const brpc::Server*, brpc::MethodStatus*, int64_t, std::shared_ptr>( - &brpc::policy::SendRpcResponse, - meta.correlation_id(), cntl, + &brpc::policy::SendRpcResponse, meta.correlation_id(), cntl, messages, &ts->_dummy, NULL, -1, nullptr); ts->_svc.CallMethod(method, cntl, req, res, done); } @@ -1491,6 +1487,57 @@ class ChannelTest : public ::testing::Test{ EXPECT_EQ(cntl.response_attachment().to_string(), "123"); StopAndJoin(); } + + void TestBackupRequestSelectiveResponseRace() { + ASSERT_EQ(0, StartAccept(_ep)); + + const size_t NCHANS = 8; + brpc::SelectiveChannel channel; + ASSERT_EQ(0, channel.Init("rr", NULL)); + for (size_t i = 0; i < NCHANS; ++i) { + brpc::Channel* subchan = new brpc::Channel; + SetUpChannel(subchan, false, false); + ASSERT_EQ(0, channel.AddChannel(subchan, NULL)) << "i=" << i; + } + + const int kRounds = 150; + const int kCodeListSize = 20000; + std::atomic call_cnt(0); + _svc.SetMockFunc([&call_cnt](google::protobuf::RpcController*, + const ::test::EchoRequest*, + ::test::EchoResponse* res, + google::protobuf::Closure*) { + const int seen = call_cnt.fetch_add(1, 
std::memory_order_relaxed); + const bool slow = ((seen & 1) == 0); + if (slow) { + bthread_usleep(1500); + } + res->clear_code_list(); + const int base = slow ? 1000000 : 2000000; + for (int i = 0; i < kCodeListSize; ++i) { + res->add_code_list(base + i); + } + res->set_message(slow ? "slow" : "fast"); + }); + + for (int round = 0; round < kRounds; ++round) { + brpc::Controller cntl; + test::EchoRequest req; + test::EchoResponse res; + req.set_message(__FUNCTION__); + cntl.set_backup_request_ms(1); + cntl.set_timeout_ms(3000); + CallMethod(&channel, &cntl, &req, &res, true); + ASSERT_FALSE(cntl.Failed()) << "round=" << round + << " err=" << cntl.ErrorText(); + ASSERT_EQ(kCodeListSize, res.code_list_size()) << "round=" << round; + ASSERT_TRUE(res.message() == "slow" || res.message() == "fast") + << "round=" << round; + } + + EXPECT_EQ(kRounds * 2, call_cnt.load(std::memory_order_relaxed)); + StopAndJoin(); + } void TestCloseFD(bool single_server, bool async, bool short_connection) { std::cout << " *** single=" << single_server @@ -2787,6 +2834,10 @@ TEST_F(ChannelTest, backuprequest_selective) { } } +TEST_F(ChannelTest, backuprequest_selective_response_race) { + TestBackupRequestSelectiveResponseRace(); +} + TEST_F(ChannelTest, close_fd) { for (int i = 0; i <= 1; ++i) { // Flag SingleServer for (int j = 0; j <= 1; ++j) { // Flag Asynchronous diff --git a/test/brpc_streaming_rpc_unittest.cpp b/test/brpc_streaming_rpc_unittest.cpp index ecb88c6150..0f8a3e56d5 100644 --- a/test/brpc_streaming_rpc_unittest.cpp +++ b/test/brpc_streaming_rpc_unittest.cpp @@ -91,6 +91,7 @@ struct BatchStreamFeedbackRaceState { std::atomic client_got_second_msg{false}; std::atomic server_write_done{false}; std::atomic rpc_done{false}; + std::atomic client_closed_count{0}; bthread_t server_send_tid{0}; std::atomic server_send_started{false}; @@ -123,7 +124,9 @@ class BatchStreamClientHandler : public brpc::StreamInputHandler { void on_idle_timeout(brpc::StreamId /*id*/) override {} - void 
on_closed(brpc::StreamId /*id*/) override {} + void on_closed(brpc::StreamId /*id*/) override { + _state->client_closed_count.fetch_add(1, std::memory_order_release); + } void on_failed(brpc::StreamId /*id*/, int /*error_code*/, const std::string& /*error_text*/) override {} @@ -224,12 +227,17 @@ static void SetAtomicTrue(std::atomic* f) { f->store(true, std::memory_order_release); } -static bool WaitForTrue(const std::atomic& f, int timeout_ms) { +template +static bool WaitForTrue(Pred pred, int timeout_ms) { const int64_t deadline_us = butil::gettimeofday_us() + (int64_t)timeout_ms * 1000L; - while (!f.load(std::memory_order_acquire) && butil::gettimeofday_us() < deadline_us) { + while (!pred() && butil::gettimeofday_us() < deadline_us) { usleep(1000); } - return f.load(std::memory_order_acquire); + return pred(); +} + +static bool WaitForTrue(const std::atomic& f, int timeout_ms) { + return WaitForTrue([&f]() { return f.load(std::memory_order_acquire); }, timeout_ms); } TEST_F(StreamingRpcTest, sanity) { @@ -307,6 +315,22 @@ TEST_F(StreamingRpcTest, batch_create_stream_feedback_race) { } server.Stop(0); server.Join(); + + // Release the SocketUniquePtr held above so the fake socket can be + // recycled. Otherwise BeforeRecycle / on_closed for the extra stream + // is deferred until `client_extra_ptr` destructs at scope exit, which + // happens *after* `client_handler` and `state` are destroyed -> UAF + // inside Stream::Consume on Linux. + client_extra_ptr.reset(); + + // on_closed() runs asynchronously on each client stream's consumer + // bthread. Wait for both before letting handler/state go out of + // scope, otherwise Stream::Consume will dereference freed memory. 
+ int expected_closed = request_streams.size(); + WaitForTrue([&state, expected_closed]() { + return state.client_closed_count.load(std::memory_order_acquire) + >= expected_closed; + }, 2000); }; test::EchoService_Stub stub(&channel); diff --git a/test/bthread_rwlock_unittest.cpp b/test/bthread_rwlock_unittest.cpp index 2da226cb2f..9a88051c1a 100644 --- a/test/bthread_rwlock_unittest.cpp +++ b/test/bthread_rwlock_unittest.cpp @@ -17,6 +17,7 @@ #include #include "gperftools_helper.h" +#include "butil/atomicops.h" #include namespace { @@ -286,6 +287,253 @@ TEST(RWLockTest, mix_thread_types) { ASSERT_EQ(0, bthread_rwlock_destroy(&rw)); } +// Tests below verify the writer-priority semantics and the cleanup path +// guarded by the design notes in bthread/rwlock.cpp. +struct WriterPriorityArgs { + bthread_rwlock_t* rw; + butil::atomic* order; + int my_order; // sequence number captured inside the critical section + int hold_us; +}; + +void* wp_writer_fn(void* arg) { + auto* a = (WriterPriorityArgs*)arg; + EXPECT_EQ(0, bthread_rwlock_wrlock(a->rw)); + a->my_order = a->order->fetch_add(1, butil::memory_order_relaxed); + bthread_usleep(a->hold_us); + EXPECT_EQ(0, bthread_rwlock_unlock(a->rw)); + return NULL; +} + +void* wp_reader_fn(void* arg) { + auto* a = (WriterPriorityArgs*)arg; + EXPECT_EQ(0, bthread_rwlock_rdlock(a->rw)); + a->my_order = a->order->fetch_add(1, butil::memory_order_relaxed); + bthread_usleep(a->hold_us); + EXPECT_EQ(0, bthread_rwlock_unlock(a->rw)); + return NULL; +} + +// Verifies the writer-priority invariant guarded by the order +// "unlock writer_queue_mutex BEFORE fetch_sub(writer_wait_count)" in +// rwlock_unwrlock(): once a writer is queued, any new reader arriving +// later MUST yield to that writer. +TEST(RWLockTest, writer_priority) { + bthread_setconcurrency(8); + bthread_rwlock_t rw; + ASSERT_EQ(0, bthread_rwlock_init(&rw, NULL)); + + // (1) Main thread holds the read lock first. 
+ ASSERT_EQ(0, bthread_rwlock_rdlock(&rw)); + + butil::atomic order(0); + WriterPriorityArgs warg {&rw, &order, -1, 5000}; + WriterPriorityArgs r2arg {&rw, &order, -1, 0}; + + // (2) Start a writer; it should park inside wrlock() because the read + // lock is held. Sleep long enough for it to fetch_add into + // writer_wait_count and reach the butex_wait on `lock_word'. + bthread_t wth; + ASSERT_EQ(0, bthread_start_urgent(&wth, NULL, wp_writer_fn, &warg)); + bthread_usleep(50 * 1000); + + // (3) Now spawn a fresh reader. By writer-priority it MUST observe + // writer_wait_count > 0 and park on it (NOT join the active read + // lock). + bthread_t r2th; + ASSERT_EQ(0, bthread_start_urgent(&r2th, NULL, wp_reader_fn, &r2arg)); + bthread_usleep(50 * 1000); + + // (4) Release the original read lock. The writer should win the race + // and complete BEFORE the queued reader. + ASSERT_EQ(0, bthread_rwlock_unlock(&rw)); + + bthread_join(wth, NULL); + bthread_join(r2th, NULL); + + EXPECT_GE(warg.my_order, 0); + EXPECT_GE(r2arg.my_order, 0); + EXPECT_LT(warg.my_order, r2arg.my_order) + << "Writer-priority violated: writer entered with order=" + << warg.my_order << " but late reader entered with order=" + << r2arg.my_order; + + ASSERT_EQ(0, bthread_rwlock_destroy(&rw)); +} + +void* wp_timed_wrlock_short(void* arg) { + auto* rw = (bthread_rwlock_t*)arg; + timespec ts = butil::milliseconds_from_now(50); + EXPECT_EQ(ETIMEDOUT, bthread_rwlock_timedwrlock(rw, &ts)); + return NULL; +} + +// Verifies the cleanup path of rwlock_wrlock_cleanup(): after multiple +// writers fail with ETIMEDOUT, writer_wait_count must be back to 0 so +// that subsequent readers are not blocked by leftover "ghost shares". +TEST(RWLockTest, wrlock_failure_does_not_leak_writer_count) { + bthread_setconcurrency(8); + bthread_rwlock_t rw; + ASSERT_EQ(0, bthread_rwlock_init(&rw, NULL)); + + // Hold the read lock so every wrlock attempt must block on `lock_word'. 
+ ASSERT_EQ(0, bthread_rwlock_rdlock(&rw)); + + const int N = 8; + bthread_t wth[N]; + for (int i = 0; i < N; ++i) { + ASSERT_EQ(0, bthread_start_urgent(&wth[i], NULL, wp_timed_wrlock_short, &rw)); + } + // Wait for all timed wrlock attempts to time out and run cleanup. + for (int i = 0; i < N; ++i) { + bthread_join(wth[i], NULL); + } + + // Release the read lock; from this point on no writer is in flight, + // so a new reader MUST acquire the lock immediately. + ASSERT_EQ(0, bthread_rwlock_unlock(&rw)); + + timespec ts = butil::milliseconds_from_now(500); + butil::Timer t; + t.start(); + ASSERT_EQ(0, bthread_rwlock_timedrdlock(&rw, &ts)); + t.stop(); + EXPECT_LT(t.m_elapsed(), 100) + << "Reader was blocked for " << t.m_elapsed() << "ms; " + << "writer_wait_count was likely leaked by the cleanup path."; + + ASSERT_EQ(0, bthread_rwlock_unlock(&rw)); + ASSERT_EQ(0, bthread_rwlock_destroy(&rw)); +} + +struct DataConsistencyArgs { + bthread_rwlock_t* rw; + int64_t* shared; // protected by rw + int64_t local_inc; // writer: number of increments this thread did + int64_t observed_max; // reader: max value observed + bool is_writer; +}; + +void* dc_worker(void* arg) { + auto* a = (DataConsistencyArgs*)arg; + while (!g_stopped) { + if (a->is_writer) { + EXPECT_EQ(0, bthread_rwlock_wrlock(a->rw)); + ++(*a->shared); + ++a->local_inc; + EXPECT_EQ(0, bthread_rwlock_unlock(a->rw)); + } else { + EXPECT_EQ(0, bthread_rwlock_rdlock(a->rw)); + int64_t v = *a->shared; + if (v > a->observed_max) { + a->observed_max = v; + } + EXPECT_EQ(0, bthread_rwlock_unlock(a->rw)); + } + } + return NULL; +} + +// Verifies the release/acquire memory ordering pair on `lock_word'. +// If the CAS in unwrlock()/unrdlock() weren't release-ordered, or the +// CAS in rdlock()/wrlock() weren't acquire-ordered, writes done inside +// the critical section could appear lost or inconsistent to other +// threads, causing the final counter to disagree with total writer ops. 
+TEST(RWLockTest, data_consistency) { + bthread_rwlock_t rw; + ASSERT_EQ(0, bthread_rwlock_init(&rw, NULL)); + + g_stopped = false; + const int W = 4; + const int R = 8; + bthread_setconcurrency(W + R + 4); + + int64_t shared = 0; + std::vector args(W + R); + std::vector threads(W + R); + for (int i = 0; i < W + R; ++i) { + args[i].rw = &rw; + args[i].shared = &shared; + args[i].local_inc = 0; + args[i].observed_max = -1; + args[i].is_writer = (i < W); + ASSERT_EQ(0, bthread_start_urgent(&threads[i], NULL, dc_worker, &args[i])); + } + + bthread_usleep(500 * 1000); + g_stopped = true; + + int64_t total_inc = 0; + for (int i = 0; i < W + R; ++i) { + bthread_join(threads[i], NULL); + if (args[i].is_writer) { + total_inc += args[i].local_inc; + } + } + + // No lost updates: every writer's increment is reflected in `shared'. + EXPECT_EQ(total_inc, shared) + << "Lost updates: total writer ops=" << total_inc + << " but shared counter=" << shared; + // No reader saw a value greater than the final counter. + for (int i = W; i < W + R; ++i) { + EXPECT_LE(args[i].observed_max, shared) + << "Reader " << i << " observed_max=" << args[i].observed_max + << " > final shared=" << shared; + } + + ASSERT_EQ(0, bthread_rwlock_destroy(&rw)); +} + +void* ws_reader_loop(void* arg) { + auto* rw = (bthread_rwlock_t*)arg; + while (!g_stopped) { + EXPECT_EQ(0, bthread_rwlock_rdlock(rw)); + // Hold the read lock briefly to keep the lock continuously busy. + bthread_usleep(100); + EXPECT_EQ(0, bthread_rwlock_unlock(rw)); + } + return NULL; +} + +// Verifies that under a continuous read load, a writer can still acquire +// the lock in bounded time. This is the end-to-end guarantee of the +// writer-priority strategy: any reader arriving AFTER the writer entered +// wrlock() must yield, ensuring the writer never starves. 
+TEST(RWLockTest, no_writer_starvation) { + bthread_rwlock_t rw; + ASSERT_EQ(0, bthread_rwlock_init(&rw, NULL)); + + g_stopped = false; + const int R = 16; + bthread_setconcurrency(R + 4); + bthread_t rth[R]; + for (int i = 0; i < R; ++i) { + ASSERT_EQ(0, bthread_start_urgent(&rth[i], NULL, ws_reader_loop, &rw)); + } + + // Let the readers ramp up and saturate the lock. + bthread_usleep(50 * 1000); + + // A single writer must succeed within a generous budget. + butil::Timer t; + t.start(); + ASSERT_EQ(0, bthread_rwlock_wrlock(&rw)); + t.stop(); + + EXPECT_LT(t.m_elapsed(), 1000) + << "Writer starved for " << t.m_elapsed() << "ms under " + << R << " concurrent readers; writer-priority is broken."; + + ASSERT_EQ(0, bthread_rwlock_unlock(&rw)); + + g_stopped = true; + for (int i = 0; i < R; ++i) { + bthread_join(rth[i], NULL); + } + ASSERT_EQ(0, bthread_rwlock_destroy(&rw)); +} + struct BAIDU_CACHELINE_ALIGNMENT PerfArgs { bthread_rwlock_t* rw; int64_t counter; @@ -386,13 +634,14 @@ void PerfTest(uint32_t writer_ratio, ThreadId* /*dummy*/, int thread_num, << " writer_ratio=" << writer_ratio << " reader_num=" << reader_num << " read_count=" << read_count - << " read_average_time=" << (read_count == 0 ? 0 : read_wait_time / (double)read_count) + << " read_average_time=" << (read_count == 0 ? 0 : read_wait_time / (double)read_count) << "ns" << " writer_num=" << writer_num << " write_count=" << write_count - << " write_average_time=" << (write_count == 0 ? 0 : write_wait_time / (double)write_count); + << " write_average_time=" << (write_count == 0 ? 0 : write_wait_time / (double)write_count) << "ns"; } TEST(RWLockTest, performance) { + bthread_setconcurrency(16); const int thread_num = 12; PerfTest(0, (pthread_t*)NULL, thread_num, pthread_create, pthread_join); PerfTest(0, (bthread_t*)NULL, thread_num, bthread_start_background, bthread_join);