From 293118648602b548839f59877e558182710267aa Mon Sep 17 00:00:00 2001 From: usamoi Date: Mon, 27 Apr 2026 16:07:24 +0800 Subject: [PATCH] feat: adopt PostgreSQL's naming convention of temporary files and directories Signed-off-by: usamoi --- Cargo.lock | 4 +- Cargo.toml | 5 +- crates/bm25/Cargo.toml | 3 +- crates/bm25/src/maintain.rs | 26 ++++--- crates/xtask/Cargo.toml | 2 +- src/index/bm25/am/am_build.rs | 4 +- src/index/bm25/am/am_vacuumcleanup.rs | 5 +- src/index/mod.rs | 1 + src/index/temp.rs | 101 ++++++++++++++++++++++++++ src/lib.rs | 14 ++++ 10 files changed, 146 insertions(+), 19 deletions(-) create mode 100644 src/index/temp.rs diff --git a/Cargo.lock b/Cargo.lock index b90f482..0e32941 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -162,7 +162,6 @@ dependencies = [ "score", "serde", "simd", - "tempfile", "validator", "zerocopy", ] @@ -1546,13 +1545,14 @@ version = "0.0.0" dependencies = [ "always_equal", "bm25", + "getrandom", "index", "mimalloc", "pgrx", "pgrx-catalog", + "rand", "serde", "simd", - "tempfile", "toml 1.1.2+spec-1.1.0", "validator", ] diff --git a/Cargo.toml b/Cargo.toml index 86d000d..b29cc3f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,10 +27,11 @@ bm25 = { path = "./crates/bm25" } index = { path = "./crates/index" } simd = { path = "./crates/simd" } +getrandom.workspace = true pgrx = "=0.17.0" pgrx-catalog = "0.3.2" +rand.workspace = true serde.workspace = true -tempfile.workspace = true toml = "1.1.2" validator.workspace = true @@ -49,9 +50,9 @@ version = "0.0.0" edition = "2024" [workspace.dependencies] +getrandom = "0.4.2" rand = "0.10.1" serde = { version = "1.0.228", features = ["derive"] } -tempfile = "3.27.0" validator = { version = "0.20.0", features = ["derive"] } zerocopy = { version = "0.8.48", features = ["derive", "simd"] } diff --git a/crates/bm25/Cargo.toml b/crates/bm25/Cargo.toml index c553814..a9862c7 100644 --- a/crates/bm25/Cargo.toml +++ b/crates/bm25/Cargo.toml @@ -11,10 +11,9 @@ score = { path = "../score" } simd = { path = "../simd" } blake3 = "1.8.4" -getrandom = "0.4.2" +getrandom.workspace = true memmap2 = "0.9.10" serde.workspace = true -tempfile.workspace = true validator.workspace = true zerocopy.workspace = true diff --git a/crates/bm25/src/maintain.rs b/crates/bm25/src/maintain.rs index b2adde3..fc3aa8f 100644 --- a/crates/bm25/src/maintain.rs +++ b/crates/bm25/src/maintain.rs @@ -19,12 +19,17 @@ use crate::tuples::*; use crate::vector::Document; use crate::{Opaque, WIDTH, compression}; use index::relation::{Page, PageGuard, RelationRead, RelationWrite}; -use std::fs::File; +use std::fs::{File, OpenOptions}; use std::io::BufWriter; +use std::path::Path; use zerocopy::{FromBytes, IntoBytes}; -pub fn maintain(index: &R, _check: impl Fn()) -where +pub fn maintain( + index: &R, + _check: impl Fn(), + dir: &Path, + file: &Path, +) where R::Page: Page, { let meta_guard = index.read(0); @@ -38,10 +43,11 @@ where let _lock_guard = index.write(ptr_lock); - let tempdir = handle_io_error(tempfile::tempdir()); - - let mut relabel = BufWriter::with_capacity(16 * 1024, handle_io_error(tempfile::tempfile())); - let mut records_writer = crate::io::records_writer(tempdir.path(), 0); + let mut relabel = BufWriter::with_capacity( + 16 * 1024, + handle_io_error(OpenOptions::new().read(true).write(true).open(file)), + ); + let mut records_writer = crate::io::records_writer(dir, 0); let jump_guard = index.read(ptr_jump); let jump_bytes = jump_guard.get(1).expect("data corruption"); @@ -93,7 +99,7 @@ where } else { &mut [] }; - let mut mappings_writer = crate::io::mappings_writer(tempdir.path(), 0); + let mut mappings_writer = crate::io::mappings_writer(dir, 0); { let mut tape_tokens = TapeReader::new(jump_tuple.ptr_tokens(), |bytes| { @@ -254,9 +260,9 @@ where mappings_writer.flush(); drop(records_writer); drop(mappings_writer); - crate::io::locally_merge(tempdir.path(), 0); + crate::io::locally_merge(dir, 0); - let segment = crate::io::readers(tempdir.path(), 1); + let segment = crate::io::readers(dir, 1); let flushed = crate::flush::flush(k1, b, index, segment); let mut jump_guard = index.write(ptr_jump); diff --git a/crates/xtask/Cargo.toml b/crates/xtask/Cargo.toml index 23ae6d1..1645c81 100644 --- a/crates/xtask/Cargo.toml +++ b/crates/xtask/Cargo.toml @@ -11,7 +11,7 @@ serde.workspace = true serde_json = "1.0.149" shlex = "1.3.0" target-triple = "1.0.0" -tempfile.workspace = true +tempfile = "3.27.0" [lints] workspace = true diff --git a/src/index/bm25/am/am_build.rs b/src/index/bm25/am/am_build.rs index b9c93bb..670feb2 100644 --- a/src/index/bm25/am/am_build.rs +++ b/src/index/bm25/am/am_build.rs @@ -18,6 +18,7 @@ use crate::index::bm25::am::Reloption; use crate::index::bm25::types::*; use crate::index::fetcher::ctid_to_key; use crate::index::storage::PostgresRelation; +use crate::index::temp::tempdir; use crate::index::traverse::{HeapTraverser, Traverser}; use std::ffi::{CStr, OsStr}; use std::marker::PhantomData; @@ -140,7 +141,7 @@ pub unsafe extern "C-unwind" fn ambuild( reporter.tuples_total(unsafe { (*(*index_relation).rd_rel).reltuples as u64 }); reporter.phase(BuildPhase::from_code(BuildPhaseCode::Scanning)); let seed = bm25::seed::random(); - let tempdir = tempfile::tempdir().expect("failed to create temporary directory"); + let tempdir = tempdir(); let total = if let Some(leader) = unsafe { Bm25Leader::enter( c"bm25_parallel_build_main", @@ -534,6 +535,7 @@ pub unsafe extern "C-unwind" fn bm25_parallel_build_main( _seg: *mut pgrx::pg_sys::dsm_segment, toc: *mut pgrx::pg_sys::shm_toc, ) { + let _ = rand::rng().reseed(); let bm25shared = unsafe { pgrx::pg_sys::shm_toc_lookup(toc, 0xA000000000000001, false).cast::() }; diff --git a/src/index/bm25/am/am_vacuumcleanup.rs b/src/index/bm25/am/am_vacuumcleanup.rs index 565d6bd..78d9dbd 100644 --- a/src/index/bm25/am/am_vacuumcleanup.rs +++ b/src/index/bm25/am/am_vacuumcleanup.rs @@ -13,6 +13,7 @@ // Copyright (c) 2025-2026 TensorChord Inc. use crate::index::storage::PostgresRelation; +use crate::index::temp::{tempdir, tempfile}; #[pgrx::pg_guard] pub unsafe extern "C-unwind" fn amvacuumcleanup( @@ -33,6 +34,8 @@ pub unsafe extern "C-unwind" fn amvacuumcleanup( #[cfg(feature = "pg18")] pgrx::pg_sys::vacuum_delay_point(false); }; - bm25::maintain(&index, check); + let tempdir = tempdir(); + let tempfile = tempfile(); + bm25::maintain(&index, check, tempdir.path(), tempfile.path()); stats } diff --git a/src/index/mod.rs b/src/index/mod.rs index ed35c0a..21a8f6a 100644 --- a/src/index/mod.rs +++ b/src/index/mod.rs @@ -19,6 +19,7 @@ mod hook; mod operators; mod scanners; mod storage; +mod temp; mod traverse; pub fn init() { diff --git a/src/index/temp.rs b/src/index/temp.rs new file mode 100644 index 0000000..37e77b9 --- /dev/null +++ b/src/index/temp.rs @@ -0,0 +1,101 @@ +// This software is licensed under a dual license model: +// +// GNU Affero General Public License v3 (AGPLv3): You may use, modify, and +// distribute this software under the terms of the AGPLv3. +// +// Elastic License v2 (ELv2): You may also use, modify, and distribute this +// software under the Elastic License v2, which has specific restrictions. +// +// We welcome any commercial collaboration or support. For inquiries +// regarding the licenses, please contact us at: +// vectorchord-inquiry@tensorchord.ai +// +// Copyright (c) 2025-2026 TensorChord Inc. + +use std::path::{Path, PathBuf}; + +pub struct TempFile { + path: PathBuf, +} + +impl TempFile { + pub fn path(&self) -> &Path { + self.path.as_path() + } +} + +impl Drop for TempFile { + fn drop(&mut self) { + let _ = std::fs::remove_file(&self.path); + } +} + +pub fn tempfile() -> TempFile { + let path = temppath(); + std::fs::File::create_new(&path).expect("failed to create the temporary file"); + TempFile { path } +} + +pub struct TempDir { + path: PathBuf, +} + +impl TempDir { + pub fn path(&self) -> &Path { + self.path.as_path() + } +} + +impl Drop for TempDir { + fn drop(&mut self) { + let _ = std::fs::remove_dir_all(&self.path); + } +} + +pub fn tempdir() -> TempDir { + let path = temppath(); + std::fs::create_dir(&path).expect("failed to create the temporary directory"); + TempDir { path } +} + +fn temppath() -> PathBuf { + let tablespace_path = unsafe { + use rand::seq::IndexedRandom; + use std::mem::MaybeUninit; + pgrx::pg_sys::PrepareTempTablespaces(); + let mut tablespaces = [pgrx::pg_sys::Oid::INVALID; 8]; + let length = + pgrx::pg_sys::GetTempTablespaces(tablespaces.as_mut_ptr(), tablespaces.len() as _); + let tablespace = tablespaces[..length as usize] + .choose(&mut rand::rng()) + .copied() + .unwrap_or(pgrx::pg_sys::Oid::INVALID); + let tablespace = if tablespace != pgrx::pg_sys::Oid::INVALID { + tablespace + } else { + pgrx::pg_sys::MyDatabaseTableSpace + }; + let mut buf = [MaybeUninit::::uninit(); pgrx::pg_sys::MAXPGPATH as usize]; + pgrx::pg_sys::TempTablespacePath(buf.as_mut_ptr().cast::(), tablespace); + let s = std::ffi::CStr::from_ptr(buf.as_ptr().cast::()); + // It is reasonable to make this assumption because PostgreSQL + // uses symbolic links internally to access tablespaces. + let s = s.to_str().expect("found non-utf8 characters in the path"); + assert!(s.is_ascii(), "found non-ascii characters in the path"); + debug_assert!(s.starts_with("base/") || s.starts_with("pg_tblspc/")); + debug_assert!(s.ends_with("/pgsql_tmp")); + AsRef::::as_ref(s).to_path_buf() + }; + if let Err(e) = std::fs::create_dir(&tablespace_path) { + if e.kind() != std::io::ErrorKind::AlreadyExists { + panic!("failed to create the temporary directory in the tablespace"); + } + } + let path = tablespace_path.join(crate::tempname()); + { + // a leftover from a backend crash + let _ = std::fs::remove_file(&path); + let _ = std::fs::remove_dir_all(&path); + } + path +} diff --git a/src/lib.rs b/src/lib.rs index 2eb7587..470db3b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -70,6 +70,20 @@ fn is_main() -> bool { IS_MAIN.get() } +#[must_use] +fn tempname() -> String { + let pid = std::process::id(); + let number = { + static mut COUNTER: u32 = 0; + unsafe { + let number = COUNTER; + COUNTER = COUNTER.wrapping_add(1); + number + } + }; + format!("pgsql_tmp{pid}.{number}.vchord_bm25") +} + #[cfg(not(panic = "unwind"))] compile_error!("This crate must be compiled with `-Cpanic=unwind`.");