Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion src/basic_memory/cli/commands/cloud/rclone_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from rich.console import Console

from basic_memory.cli.commands.cloud.rclone_installer import is_rclone_installed
from basic_memory.config import resolve_data_dir
from basic_memory.utils import normalize_project_path

console = Console()
Expand Down Expand Up @@ -138,13 +139,16 @@ def get_bmignore_filter_path() -> Path:
def get_project_bisync_state(project_name: str) -> Path:
    """Get path to project's bisync state directory.

    The state directory is located under the Basic Memory data directory as
    returned by ``resolve_data_dir()``, so ``BASIC_MEMORY_CONFIG_DIR`` is
    honored and isolated instances each keep their own bisync state
    alongside their config.

    Args:
        project_name: Name of the project

    Returns:
        Path to bisync state directory for this project
    """
    # Go through the shared resolver instead of hardcoding
    # ``Path.home() / ".basic-memory"``, which would ignore the override.
    return resolve_data_dir() / "bisync-state" / project_name


def bisync_initialized(project_name: str) -> bool:
Expand Down
63 changes: 47 additions & 16 deletions src/basic_memory/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,44 @@ def _default_semantic_search_enabled() -> bool:
)


def resolve_data_dir() -> Path:
    """Resolve the Basic Memory data directory.

    Single source of truth for the per-user state directory. Honors
    ``BASIC_MEMORY_CONFIG_DIR`` so each process/worktree can isolate config
    and database state; otherwise falls back to ``<user home>/.basic-memory``.

    Cross-platform: ``Path.home()`` reads ``$HOME`` on POSIX and
    ``%USERPROFILE%`` on Windows, so there's no need to check ``$HOME``
    explicitly here.

    Returns:
        The directory named by ``BASIC_MEMORY_CONFIG_DIR`` (with a leading
        ``~`` expanded) when that variable is set to a non-empty value,
        otherwise ``Path.home() / DATA_DIR_NAME``.
    """
    if config_dir := os.getenv("BASIC_MEMORY_CONFIG_DIR"):
        # The value may arrive without shell expansion (for example from a
        # JSON "env" block), so expand a leading "~" here. Paths that do not
        # start with "~" are returned unchanged by expanduser().
        return Path(config_dir).expanduser()
    return Path.home() / DATA_DIR_NAME


def default_fastembed_cache_dir() -> str:
    """Return the directory FastEmbed model artifacts are cached in by default.

    Lookup order:
      1. ``FASTEMBED_CACHE_PATH`` — FastEmbed's own environment convention,
         returned verbatim when set to a non-empty value so existing
         environment-based setups keep working.
      2. ``<basic-memory data dir>/fastembed_cache`` — placed next to the
         config and default SQLite database, and therefore following
         ``BASIC_MEMORY_CONFIG_DIR`` as well.

    The system temp directory is deliberately avoided: FastEmbed's built-in
    default is ``<system tmp>/fastembed_cache``, and many sandboxed MCP
    runtimes treat that location as ephemeral (e.g. Codex CLI wipes /tmp
    between invocations). Once the model files vanish, every later ONNX load
    raises ``NO_SUCHFILE``. A cache under the per-user data directory behaves
    the same on macOS, Linux, and Windows.
    """
    from_env = os.getenv("FASTEMBED_CACHE_PATH")
    if from_env:
        return from_env

    fallback = resolve_data_dir() / "fastembed_cache"
    return str(fallback)


@dataclass
class ProjectConfig:
"""Configuration for a specific basic-memory project."""
Expand Down Expand Up @@ -222,7 +260,13 @@ def __init__(self, **data: Any) -> None: ...
)
# None means "no explicit override"; the effective directory is then chosen
# at runtime as the description below explains.
semantic_embedding_cache_dir: str | None = Field(
    default=None,
    description=(
        "Optional override for the FastEmbed model cache directory. "
        "When unset, Basic Memory resolves this at runtime to "
        "<basic-memory data dir>/fastembed_cache (or FASTEMBED_CACHE_PATH "
        "when that env var is set) so the model persists across runs "
        "without hardcoding a path into config.json."
    ),
)
semantic_embedding_threads: int | None = Field(
default=None,
Expand Down Expand Up @@ -709,11 +753,7 @@ def ensure_project_paths_exists(self) -> "BasicMemoryConfig": # pragma: no cove
@property
def data_dir_path(self) -> Path:
    """Get app state directory for config and default SQLite database.

    Delegates to ``resolve_data_dir()`` so this property does not carry its
    own copy of the ``BASIC_MEMORY_CONFIG_DIR`` / home-directory lookup.

    Returns:
        The directory returned by ``resolve_data_dir()``.
    """
    return resolve_data_dir()


# Module-level cache for configuration
Expand All @@ -731,16 +771,7 @@ class ConfigManager:

def __init__(self) -> None:
"""Initialize the configuration manager."""
home = os.getenv("HOME", Path.home())
if isinstance(home, str):
home = Path(home)

# Allow override via environment variable
if config_dir := os.getenv("BASIC_MEMORY_CONFIG_DIR"):
self.config_dir = Path(config_dir)
else:
self.config_dir = home / DATA_DIR_NAME

self.config_dir = resolve_data_dir()
self.config_file = self.config_dir / CONFIG_FILE_NAME

# Ensure config directory exists
Expand Down
11 changes: 8 additions & 3 deletions src/basic_memory/ignore_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from pathlib import Path
from typing import Set

from basic_memory.config import resolve_data_dir


# Common directories and patterns to ignore by default
# These are used as fallback if .bmignore doesn't exist
Expand Down Expand Up @@ -61,9 +63,11 @@ def get_bmignore_path() -> Path:
"""Get path to .bmignore file.

Returns:
Path to ~/.basic-memory/.bmignore
Path to <basic-memory data dir>/.bmignore, honoring
``BASIC_MEMORY_CONFIG_DIR`` so isolated instances each keep their
own ignore file.
"""
return Path.home() / ".basic-memory" / ".bmignore"
return resolve_data_dir() / ".bmignore"


def create_default_bmignore() -> None:
Expand Down Expand Up @@ -176,7 +180,8 @@ def load_gitignore_patterns(base_path: Path, use_gitignore: bool = True) -> Set[
"""Load gitignore patterns from .gitignore file and .bmignore.

Combines patterns from:
1. ~/.basic-memory/.bmignore (user's global ignore patterns)
1. <basic-memory data dir>/.bmignore (user's global ignore patterns, honors
BASIC_MEMORY_CONFIG_DIR)
2. {base_path}/.gitignore (project-specific patterns, if use_gitignore=True)

Args:
Expand Down
35 changes: 29 additions & 6 deletions src/basic_memory/repository/embedding_provider_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import os
from threading import Lock

from basic_memory.config import BasicMemoryConfig
from basic_memory.config import BasicMemoryConfig, default_fastembed_cache_dir
from basic_memory.repository.embedding_provider import EmbeddingProvider

type ProviderCacheKey = tuple[
Expand All @@ -12,7 +12,7 @@
int | None,
int,
int,
str | None,
str,
int | None,
int | None,
]
Expand All @@ -22,6 +22,20 @@
_FASTEMBED_MAX_THREADS = 8


def _resolve_cache_dir(app_config: BasicMemoryConfig) -> str:
    """Return the FastEmbed cache directory that applies to ``app_config``.

    The configured value wins whenever it is anything other than ``None``.
    The comparison is deliberately against ``None`` rather than a truthiness
    test: an empty string coming from config or
    ``BASIC_MEMORY_SEMANTIC_EMBEDDING_CACHE_DIR`` is an invalid path, not a
    request for the default, and letting FastEmbed report it is clearer than
    silently substituting another directory.

    Args:
        app_config: Config whose ``semantic_embedding_cache_dir`` is consulted.

    Returns:
        The configured directory, or ``default_fastembed_cache_dir()`` when
        the field is ``None``.
    """
    override = app_config.semantic_embedding_cache_dir
    return default_fastembed_cache_dir() if override is None else override


def _available_cpu_count() -> int | None:
"""Return the CPU budget available to this process when the runtime exposes it."""
process_cpu_count = getattr(os, "process_cpu_count", None)
Expand Down Expand Up @@ -61,15 +75,20 @@ def _resolve_fastembed_runtime_knobs(


def _provider_cache_key(app_config: BasicMemoryConfig) -> ProviderCacheKey:
    """Build a stable cache key from provider-relevant semantic embedding config.

    Uses the *resolved* cache dir — not the raw config field — so different
    FASTEMBED_CACHE_PATH values produce distinct cache keys even when the
    config field itself is unset.

    Args:
        app_config: Config supplying the semantic embedding settings.

    Returns:
        Tuple of normalized provider name, model, dimensions, batch size,
        request concurrency, resolved cache dir, and the resolved FastEmbed
        thread / parallel knobs.
    """
    resolved_threads, resolved_parallel = _resolve_fastembed_runtime_knobs(app_config)
    return (
        app_config.semantic_embedding_provider.strip().lower(),
        app_config.semantic_embedding_model,
        app_config.semantic_embedding_dimensions,
        app_config.semantic_embedding_batch_size,
        app_config.semantic_embedding_request_concurrency,
        # Exactly one cache-dir slot: the resolved value, never the nullable
        # raw field.
        _resolve_cache_dir(app_config),
        resolved_threads,
        resolved_parallel,
    )
Expand Down Expand Up @@ -103,8 +122,12 @@ def create_embedding_provider(app_config: BasicMemoryConfig) -> EmbeddingProvide
from basic_memory.repository.fastembed_provider import FastEmbedEmbeddingProvider

resolved_threads, resolved_parallel = _resolve_fastembed_runtime_knobs(app_config)
if app_config.semantic_embedding_cache_dir is not None:
extra_kwargs["cache_dir"] = app_config.semantic_embedding_cache_dir
# Trigger: cache_dir is resolved rather than passed through directly.
# Why: FastEmbed's own default caches to <system tmp>/fastembed_cache,
# which disappears in sandboxed MCP runtimes (e.g. Codex CLI). See #741.
# Outcome: always pass an explicit, user-writable cache dir so the ONNX
# model persists across runs.
extra_kwargs["cache_dir"] = _resolve_cache_dir(app_config)
if resolved_threads is not None:
extra_kwargs["threads"] = resolved_threads
if resolved_parallel is not None:
Expand Down
8 changes: 3 additions & 5 deletions src/basic_memory/services/project_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -1137,12 +1137,10 @@ def get_system_status(self) -> SystemStatus:

# Get watch service status if available
watch_status = None
watch_status_path = Path.home() / ".basic-memory" / WATCH_STATUS_JSON
watch_status_path = self.config_manager.config.data_dir_path / WATCH_STATUS_JSON
if watch_status_path.exists():
try: # pragma: no cover
watch_status = json.loads( # pragma: no cover
watch_status_path.read_text(encoding="utf-8")
)
try:
watch_status = json.loads(watch_status_path.read_text(encoding="utf-8"))
except Exception: # pragma: no cover
pass

Expand Down
2 changes: 1 addition & 1 deletion src/basic_memory/sync/watch_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def __init__(
self.app_config = app_config
self.project_repository = project_repository
self.state = WatchServiceState()
self.status_path = Path.home() / ".basic-memory" / WATCH_STATUS_JSON
self.status_path = app_config.data_dir_path / WATCH_STATUS_JSON
self.status_path.parent.mkdir(parents=True, exist_ok=True)
self._ignore_patterns_cache: dict[Path, Set[str]] = {}
self._sync_service_factory = sync_service_factory
Expand Down
9 changes: 7 additions & 2 deletions src/basic_memory/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,8 @@ def setup_logging(

Args:
log_level: DEBUG, INFO, WARNING, ERROR
log_to_file: Write to ~/.basic-memory/basic-memory.log with rotation
log_to_file: Write to <basic-memory data dir>/basic-memory.log with rotation
(honors BASIC_MEMORY_CONFIG_DIR)
log_to_stdout: Write to stderr (for Docker/cloud deployments)
structured_context: Bind tenant_id, fly_region, etc. for cloud observability
"""
Expand All @@ -281,7 +282,11 @@ def setup_logging(
# Why: multiple basic-memory processes can share the same log directory at once.
# Outcome: use per-process log files on Windows so log rotation stays local.
log_filename = f"basic-memory-{os.getpid()}.log" if os.name == "nt" else "basic-memory.log"
log_path = Path.home() / ".basic-memory" / log_filename
# Deferred import: basic_memory.config imports from this module at load time,
# so resolving the data dir via a top-level import would cycle.
from basic_memory.config import resolve_data_dir

log_path = resolve_data_dir() / log_filename
log_path.parent.mkdir(parents=True, exist_ok=True)
if os.name == "nt":
_cleanup_windows_log_files(log_path.parent, log_path.name)
Expand Down
6 changes: 6 additions & 0 deletions test-int/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,13 @@ async def test_project(config_home, engine_factory) -> Project:

@pytest.fixture
def config_home(tmp_path, monkeypatch) -> Path:
    """Point the home-directory environment at ``tmp_path`` for one test.

    ``Path.home()`` reads HOME on POSIX and USERPROFILE on Windows, and
    ConfigManager.data_dir_path now goes through ``Path.home()`` via
    resolve_data_dir(), so both variables are patched where relevant.
    Must mirror tests/conftest.py:config_home.
    """
    fake_home = str(tmp_path)

    home_vars = ["HOME"]
    if os.name == "nt":
        home_vars.append("USERPROFILE")
    for var_name in home_vars:
        monkeypatch.setenv(var_name, fake_home)

    # BASIC_MEMORY_HOME lives inside the test directory as well.
    monkeypatch.setenv("BASIC_MEMORY_HOME", str(tmp_path / "basic-memory"))
    return tmp_path
Expand Down
17 changes: 17 additions & 0 deletions tests/cli/test_ignore_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,29 @@

from basic_memory.ignore_utils import (
DEFAULT_IGNORE_PATTERNS,
get_bmignore_path,
load_gitignore_patterns,
should_ignore_path,
filter_files,
)


def test_get_bmignore_path_honors_basic_memory_config_dir(tmp_path, monkeypatch):
    """Regression guard for #742: .bmignore must follow BASIC_MEMORY_CONFIG_DIR."""
    isolated_state = tmp_path / "instance-y" / "state"
    monkeypatch.setenv("BASIC_MEMORY_CONFIG_DIR", str(isolated_state))

    resolved = get_bmignore_path()

    assert resolved == isolated_state / ".bmignore"


def test_get_bmignore_path_defaults_under_home(tmp_path, monkeypatch):
    """With BASIC_MEMORY_CONFIG_DIR unset, .bmignore sits under ~/.basic-memory."""

    def fake_home(cls):
        # Stand-in for Path.home() that always reports the temp directory.
        return tmp_path

    monkeypatch.delenv("BASIC_MEMORY_CONFIG_DIR", raising=False)
    monkeypatch.setattr(Path, "home", classmethod(fake_home))

    expected = tmp_path / ".basic-memory" / ".bmignore"
    assert get_bmignore_path() == expected


def test_load_default_patterns_only():
"""Test loading default patterns when no .gitignore exists."""
with tempfile.TemporaryDirectory() as temp_dir:
Expand Down
61 changes: 61 additions & 0 deletions tests/repository/test_openai_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,7 @@ def test_embedding_provider_factory_uses_provider_defaults_when_dimensions_not_s

def test_embedding_provider_factory_forwards_fastembed_runtime_knobs():
"""Factory should forward FastEmbed runtime tuning config fields."""
reset_embedding_provider_cache()
config = BasicMemoryConfig(
env="test",
projects={"test-project": "/tmp/basic-memory-test"},
Expand All @@ -265,6 +266,66 @@ def test_embedding_provider_factory_forwards_fastembed_runtime_knobs():
assert provider.parallel == 2


def test_embedding_provider_factory_uses_default_cache_dir_when_unset(config_home, monkeypatch):
    """A ``None`` cache_dir must resolve to the data-dir-relative default.

    Legacy configs carrying an explicit ``semantic_embedding_cache_dir: null``
    still have to receive a user-writable cache path instead of letting
    FastEmbed fall back to ``<tmp>/fastembed_cache``. See #741.
    """
    # Clear both overrides so only the home-based default can apply.
    for env_var in ("BASIC_MEMORY_CONFIG_DIR", "FASTEMBED_CACHE_PATH"):
        monkeypatch.delenv(env_var, raising=False)
    reset_embedding_provider_cache()

    provider = create_embedding_provider(
        BasicMemoryConfig(
            env="test",
            projects={"test-project": str(config_home / "project")},
            default_project="test-project",
            semantic_search_enabled=True,
            semantic_embedding_provider="fastembed",
            semantic_embedding_cache_dir=None,
        )
    )

    assert isinstance(provider, FastEmbedEmbeddingProvider)
    assert provider.cache_dir == str(config_home / ".basic-memory" / "fastembed_cache")


def test_embedding_provider_factory_cache_key_reflects_resolved_cache_dir(
    config_home, tmp_path, monkeypatch
):
    """A different FASTEMBED_CACHE_PATH must produce a different cached provider.

    The provider cache is keyed on the *resolved* cache dir, not on the raw
    (nullable) config field, so a path change driven purely by the
    environment invalidates the cache rather than handing back a stale
    provider that still points at the old path.
    """
    for env_var in ("BASIC_MEMORY_CONFIG_DIR", "FASTEMBED_CACHE_PATH"):
        monkeypatch.delenv(env_var, raising=False)
    reset_embedding_provider_cache()

    def build_config():
        # Identical settings on every call; only the environment differs.
        return BasicMemoryConfig(
            env="test",
            projects={"test-project": str(config_home / "project")},
            default_project="test-project",
            semantic_search_enabled=True,
            semantic_embedding_provider="fastembed",
            semantic_embedding_cache_dir=None,
        )

    default_cache = config_home / ".basic-memory" / "fastembed_cache"
    alt_cache = tmp_path / "alt-cache"

    first = create_embedding_provider(build_config())
    assert isinstance(first, FastEmbedEmbeddingProvider)

    monkeypatch.setenv("FASTEMBED_CACHE_PATH", str(alt_cache))
    second = create_embedding_provider(build_config())
    assert isinstance(second, FastEmbedEmbeddingProvider)

    assert second is not first
    assert first.cache_dir == str(default_cache)
    assert second.cache_dir == str(alt_cache)


def test_fastembed_provider_reports_runtime_log_attrs():
"""FastEmbed should expose the resolved runtime knobs for batch startup logs."""
provider = FastEmbedEmbeddingProvider(batch_size=128, threads=4, parallel=2)
Expand Down
Loading
Loading