Update throughput arguments and core assignment

avolkov-intel · avolkov-intel · commit b42d09de34b7 · 2026-05-20T06:03:06.000-07:00
diff --git a/sklbench/benchmarks/throughput_worker.py b/sklbench/benchmarks/throughput_worker.py
@@ -1,5 +1,5 @@
 # ===============================================================================
-# Copyright 2024 Intel Corporation
+# Copyright 2026 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -18,14 +18,13 @@
 import inspect
 import json
 import socket
-import sys
 import time
 from typing import Dict, List, Tuple
 
 from ..datasets import load_data
 from ..datasets.transformer import split_and_transform_data
+from ..utils.barrier import recv_until
 from ..utils.bench_case import get_bench_case_value
-from ..utils.common import convert_to_numpy
 from ..utils.config import bench_case_filter
 from ..utils.custom_types import BenchCase
 from ..utils.logger import logger
@@ -39,16 +38,6 @@
 )
 
 
-def barrier_wait(sock: socket.socket, msg_send: bytes, msg_expect_prefix: bytes):
-    """Send a message and block until response from parent."""
-    sock.sendall(msg_send)
-    data = b""
-    while not data.startswith(msg_expect_prefix):
-        chunk = sock.recv(1024)
-        if not chunk:
-            raise ConnectionError("Barrier socket closed unexpectedly")
-        data += chunk
-
 
 def run_measurement_loop(
     func, args: tuple, measurement_duration: float
@@ -184,12 +173,7 @@ def main():
             continue
 
         # Wait for "go" signal from parent before each stage
-        data = b""
-        while b"go" not in data:
-            chunk = sock.recv(1024)
-            if not chunk:
-                raise ConnectionError("Barrier socket closed unexpectedly")
-            data += chunk
+        recv_until(sock, b"go")
 
         method_name = available_methods[0]
         method_instance, data_args = get_method_and_args(
diff --git a/sklbench/runner/arguments.py b/sklbench/runner/arguments.py
@@ -137,44 +137,14 @@ def add_runner_arguments(parser: argparse.ArgumentParser) -> argparse.ArgumentPa
         action="store_true",
         help="Interrupt runner and exit if last benchmark failed with error.",
     )
-    # throughput mode arguments
+    # throughput mode
     parser.add_argument(
         "--throughput-mode",
         default=False,
         action="store_true",
         help="Run in throughput mode: multiple synchronized parallel instances "
-        "with CPU pinning via numactl.",
-    )
-    parser.add_argument(
-        "--num-instances",
-        type=int,
-        default=None,
-        help="Number of parallel instances in throughput mode.",
-    )
-    parser.add_argument(
-        "--cores-per-instance",
-        type=int,
-        default=None,
-        help="CPU cores per instance in throughput mode.",
-    )
-    parser.add_argument(
-        "--measurement-duration",
-        type=float,
-        default=60.0,
-        help="Duration (seconds) for each measurement stage in throughput mode.",
-    )
-    parser.add_argument(
-        "--emergency-timeout",
-        type=float,
-        default=3600.0,
-        help="Emergency subprocess timeout (seconds). Safety net only.",
-    )
-    parser.add_argument(
-        "--throughput-full-logs",
-        default=False,
-        action="store_true",
-        help="Store per-iteration start_ts and duration_ms arrays in throughput results. "
-        "Disabled by default to reduce output size.",
+        "with CPU pinning via numactl. Configure via bench:num_instances, "
+        "bench:cores_per_instance, bench:measurement_duration in config.",
     )
     # option to get parser description in Markdown table format for READMEs
     parser.add_argument(
diff --git a/sklbench/runner/throughput.py b/sklbench/runner/throughput.py
@@ -1,5 +1,5 @@
 # ===============================================================================
-# Copyright 2024 Intel Corporation
+# Copyright 2026 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,14 +16,14 @@
 
 import argparse
 import json
-import socket
 import subprocess
 import time
 from typing import Dict, List, Tuple, Union
 
 import numpy as np
 from tqdm import tqdm
 
+from ..utils.barrier import accept_and_wait, create_server, send_all, wait_all
 from ..utils.bench_case import get_bench_case_name, get_bench_case_value
 from ..utils.common import custom_format, hash_from_json_repr
 from ..utils.core_assignment import compute_core_assignments
@@ -37,60 +37,14 @@ def validate_throughput_args(
 ):
     if num_instances is None or num_instances < 1:
         raise ValueError(
-            "--num-instances is required and must be >= 1 in throughput mode"
+            "bench:num_instances is required and must be >= 1 in throughput mode"
         )
     if cores_per_instance is None or cores_per_instance < 1:
         raise ValueError(
-            "--cores-per-instance is required and must be >= 1 in throughput mode"
+            "bench:cores_per_instance is required and must be >= 1 in throughput mode"
         )
     if measurement_duration <= 0:
-        raise ValueError("--measurement-duration must be > 0")
-
-
-def create_barrier_server() -> Tuple[socket.socket, int]:
-    """Create a TCP server socket on localhost with OS-assigned port."""
-    server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-    server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
-    server.bind(("localhost", 0))
-    server.listen(128)
-    port = server.getsockname()[1]
-    return server, port
-
-
-def wait_for_workers_ready(
-    server: socket.socket, num_instances: int, timeout: float
-) -> List[socket.socket]:
-    """Accept connections from all workers and wait for 'ready' message."""
-    server.settimeout(timeout)
-    connections = []
-    for _ in range(num_instances):
-        conn, _ = server.accept()
-        data = b""
-        while b"ready" not in data:
-            chunk = conn.recv(1024)
-            if not chunk:
-                raise ConnectionError("Worker disconnected before sending 'ready'")
-            data += chunk
-        connections.append(conn)
-    return connections
-
-
-def send_go_to_all(connections: List[socket.socket]):
-    """Send 'go' signal to all workers."""
-    for conn in connections:
-        conn.sendall(b"go")
-
-
-def wait_for_workers_done(connections: List[socket.socket], timeout: float):
-    """Wait for 'done' message from all workers."""
-    for conn in connections:
-        conn.settimeout(timeout)
-        data = b""
-        while b"done" not in data:
-            chunk = conn.recv(1024)
-            if not chunk:
-                raise ConnectionError("Worker disconnected before sending 'done'")
-            data += chunk
+        raise ValueError("bench:measurement_duration must be > 0")
 
 
 def validate_sync_quality(instance_outputs: List[Dict], stage: str):
@@ -137,7 +91,6 @@ def aggregate_stage_results(
     stage: str,
     measurement_duration: float,
     core_assignments: List[str],
-    full_logs: bool = False,
 ) -> Dict:
     """Aggregate per-instance results into a single stage result entry."""
     instances = []
@@ -163,10 +116,6 @@ def aggregate_stage_results(
         }
         instance_entry.update(compute_instance_stats(durations, start_timestamps))
 
-        if full_logs:
-            instance_entry["start_ts"] = start_timestamps
-            instance_entry["duration_ms"] = durations
-
         instances.append(instance_entry)
         all_iterations.append(iters)
 
@@ -196,19 +145,25 @@ def run_single_throughput_case(
     measurement_duration: float,
     emergency_timeout: float,
     log_level: str,
-    full_logs: bool = False,
 ) -> Tuple[int, List[Dict]]:
     """Run a single benchmark case in throughput mode."""
+    # Preload dataset in parent process to avoid cache race condition
+    # when multiple workers try to download/generate and save simultaneously
+    from ..datasets import load_data
+
+    logger.info("Preloading dataset in parent process to populate cache")
+    load_data(bench_case)
+
     numa_conf = get_numa_cpus_conf()
     core_assignments = compute_core_assignments(
-        num_instances, cores_per_instance, numa_conf if numa_conf else None
+        num_instances, cores_per_instance, numa_conf or None
     )
 
     logger.info(
         f"Core assignments for {num_instances} instances: {core_assignments}"
     )
 
-    server, port = create_barrier_server()
+    server, port = create_server()
     logger.debug(f"Barrier server listening on localhost:{port}")
 
     bench_case_str = json.dumps(bench_case).replace(" ", "")
@@ -236,33 +191,18 @@ def run_single_throughput_case(
         processes.append(proc)
 
     try:
-        # Wait for all workers to be ready (prep phase - unlimited, but bounded by emergency timeout)
-        connections = wait_for_workers_ready(server, num_instances, emergency_timeout)
+        connections = accept_and_wait(server, num_instances, b"ready", emergency_timeout)
         logger.info("All workers ready, starting measurement stages")
 
-        # Determine which stages exist
-        estimator_methods_training = get_bench_case_value(
-            bench_case, "algorithm:estimator_methods:training", None
-        )
-        estimator_methods_inference = get_bench_case_value(
-            bench_case, "algorithm:estimator_methods:inference", None
-        )
-        stages = []
-        if estimator_methods_training is not None:
-            stages = ["training", "inference"]
-        else:
-            # default stages
-            stages = ["training", "inference"]
-
-        stage_timeout = measurement_duration + 60  # extra time for one stage
+        stages = ["training", "inference"]
+        stage_timeout = measurement_duration + 60
 
         for stage in stages:
             logger.info(f"Sending 'go' for {stage} stage")
-            send_go_to_all(connections)
-            wait_for_workers_done(connections, stage_timeout)
+            send_all(connections, b"go")
+            wait_all(connections, b"done", stage_timeout)
             logger.info(f"All workers done with {stage} stage")
 
-        # Close barrier connections
         for conn in connections:
             conn.close()
 
@@ -326,7 +266,7 @@ def run_single_throughput_case(
 
     for stage in stages:
         stage_result = aggregate_stage_results(
-            instance_outputs, stage, measurement_duration, core_assignments, full_logs
+            instance_outputs, stage, measurement_duration, core_assignments
         )
         if not stage_result:
             continue
@@ -368,13 +308,6 @@ def run_throughput_benchmarks(
     env_info = get_environment_info()
     environment_name = args.environment_name or hash_from_json_repr(env_info)
 
-    # Resolve global defaults from CLI
-    default_num_instances = args.num_instances
-    default_cores_per_instance = args.cores_per_instance
-    default_measurement_duration = args.measurement_duration
-    default_emergency_timeout = args.emergency_timeout
-    full_logs = args.throughput_full_logs
-
     results = []
     return_code = 0
 
@@ -386,18 +319,14 @@ def run_throughput_benchmarks(
             )
         )
 
-        # Per-case config overrides CLI defaults
-        num_instances = get_bench_case_value(
-            bench_case, "bench:num_instances", default_num_instances
-        )
-        cores_per_instance = get_bench_case_value(
-            bench_case, "bench:cores_per_instance", default_cores_per_instance
-        )
+        # All throughput parameters come from bench_case config
+        num_instances = get_bench_case_value(bench_case, "bench:num_instances")
+        cores_per_instance = get_bench_case_value(bench_case, "bench:cores_per_instance")
         measurement_duration = get_bench_case_value(
-            bench_case, "bench:measurement_duration", default_measurement_duration
+            bench_case, "bench:measurement_duration", 60.0
         )
         emergency_timeout = get_bench_case_value(
-            bench_case, "bench:emergency_timeout", default_emergency_timeout
+            bench_case, "bench:emergency_timeout", 3600.0
         )
 
         try:
@@ -420,7 +349,6 @@ def run_throughput_benchmarks(
                 measurement_duration,
                 emergency_timeout,
                 args.bench_log_level,
-                full_logs,
             )
             if case_return_code != 0:
                 return_code = case_return_code
diff --git a/sklbench/utils/barrier.py b/sklbench/utils/barrier.py
@@ -0,0 +1,94 @@
+# ===============================================================================
+# Copyright 2026 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===============================================================================
+
+"""TCP socket barrier for synchronizing throughput mode worker processes."""
+
+import socket
+from typing import List, Tuple
+
+
+def create_server() -> Tuple[socket.socket, int]:
+    """Create a TCP server socket on localhost with OS-assigned port.
+
+    Returns the listening socket and the port number it was bound to.
+    """
+    server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+    # Port 0 lets the OS pick a free port; it is read back below.
+    server.bind(("localhost", 0))
+    server.listen(128)
+    port = server.getsockname()[1]
+    return server, port
+
+
+def recv_until(sock: socket.socket, expected: bytes):
+    """Block until expected message is received on socket.
+
+    All bytes read so far are accumulated and searched, so the marker is
+    found even when split across chunks; anything received alongside it is
+    discarded. Raises ConnectionError if the peer closes the socket first.
+    """
+    data = b""
+    while expected not in data:
+        chunk = sock.recv(1024)
+        if not chunk:
+            raise ConnectionError(
+                f"Socket closed before receiving {expected!r}"
+            )
+        data += chunk
+
+
+def send_all(connections: List[socket.socket], message: bytes):
+    """Send message to all connections."""
+    for conn in connections:
+        conn.sendall(message)
+
+
+def accept_and_wait(
+    server: socket.socket, num_connections: int, expected: bytes, timeout: float
+) -> List[socket.socket]:
+    """Accept num_connections and wait for expected message from each.
+
+    The timeout (seconds) bounds each accept() and each recv() on the
+    accepted connection. If anything fails, every connection accepted so
+    far is closed before the exception propagates.
+    """
+    server.settimeout(timeout)
+    connections = []
+    try:
+        for _ in range(num_connections):
+            conn, _ = server.accept()
+            connections.append(conn)
+            # accept() does not pass the listener's timeout on to the new
+            # socket, so set it here; otherwise recv() could block forever.
+            conn.settimeout(timeout)
+            recv_until(conn, expected)
+    except BaseException:
+        for conn in connections:
+            conn.close()
+        raise
+    return connections
+
+
+def wait_all(connections: List[socket.socket], expected: bytes, timeout: float):
+    """Wait for expected message from all existing connections.
+
+    The timeout (seconds) is applied to each connection's recv() calls in
+    turn, not as one overall deadline.
+    """
+    for conn in connections:
+        conn.settimeout(timeout)
+        recv_until(conn, expected)
diff --git a/sklbench/utils/core_assignment.py b/sklbench/utils/core_assignment.py