Skip to content

Commit a068f4a

Browse files
client: rank devices and endpoints by min latency instead of avg (#3484)
## Summary of Changes - Device and tunnel endpoint selection during `doublezero connect` now ranks candidates by `min_latency_ns` rather than `avg_latency_ns`, preferring the path with the best achievable round-trip time - `doublezero latency` output sorted by min latency for consistency - Three new tests added that use deliberately divergent min/avg values to pin the ranking metric ## Diff Breakdown | Category | Files | Lines (+/-) | Net | |------------|-------|-------------|------| | Core logic | 1 | +12 / -12 | 0 | | Tests | 1 | +127 / -0 | +127 | Single-file change; all net additions are tests. <details> <summary>Key files (click to expand)</summary> - `client/doublezero/src/dzd_latency.rs` — four `avg_latency_ns` → `min_latency_ns` substitutions in `select_tunnel_endpoint`, `retrieve_latencies`, and `best_latency`; three new regression tests </details> ## Testing Verification - 89 unit tests pass (`cargo test -p doublezero`) - Three new tests specifically verify that a device/endpoint with lower `min` but higher `avg` wins over one with higher `min` but lower `avg`, covering all three ranking sites
1 parent 9cef755 commit a068f4a

2 files changed

Lines changed: 195 additions & 12 deletions

File tree

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ All notable changes to this project will be documented in this file.
88

99
### Changes
1010

11+
- Client
12+
- Rank devices and tunnel endpoints by minimum observed latency (`min_latency_ns`) instead of average when selecting a connection target, preferring paths with the best achievable round-trip time
1113
- Tools
1214
- Add `IsRetryableFunc` field to `RetryOptions` for configurable retry criteria in the Solana JSON-RPC client; add `"rate limited"` string match and RPC code `-32429` to the default implementation
1315
- Telemetry

client/doublezero/src/dzd_latency.rs

Lines changed: 193 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -35,14 +35,14 @@ pub fn select_tunnel_endpoint(
3535
device_public_ip: Ipv4Addr,
3636
exclude_ips: &[Ipv4Addr],
3737
) -> Ipv4Addr {
38-
// Filter latencies to records matching this device_pk, sorted by avg latency (ascending)
38+
// Filter latencies to records matching this device_pk, sorted by min latency (ascending)
3939
let mut device_latencies: Vec<&LatencyRecord> = latencies
4040
.iter()
4141
.filter(|l| l.device_pk == device_pk)
4242
.collect();
4343
device_latencies.sort_by(|a, b| {
44-
a.avg_latency_ns
45-
.partial_cmp(&b.avg_latency_ns)
44+
a.min_latency_ns
45+
.partial_cmp(&b.min_latency_ns)
4646
.unwrap_or(std::cmp::Ordering::Equal)
4747
});
4848

@@ -129,8 +129,8 @@ pub async fn retrieve_latencies<T: ServiceController>(
129129
if reachable_cmp != std::cmp::Ordering::Equal {
130130
return reachable_cmp;
131131
}
132-
a.avg_latency_ns
133-
.partial_cmp(&b.avg_latency_ns)
132+
a.min_latency_ns
133+
.partial_cmp(&b.min_latency_ns)
134134
.unwrap_or(std::cmp::Ordering::Equal)
135135
});
136136

@@ -151,7 +151,7 @@ pub async fn retrieve_latencies<T: ServiceController>(
151151
// typical internet connections while giving the selector enough freedom to
152152
// choose alternate devices when needed.
153153
//
154-
// The value is expressed in nanoseconds to match avg_latency_ns.
154+
// The value is expressed in nanoseconds to match min_latency_ns.
155155
const LATENCY_TOLERANCE_NS: i64 = 5_000_000; // 5 ms
156156

157157
/// Find the best device based on latency.
@@ -203,7 +203,7 @@ pub async fn best_latency<T: ServiceController>(
203203
.find(|latency| latency.device_pk == current_device.to_string())
204204
{
205205
best = Some(current);
206-
best_latency = current.avg_latency_ns;
206+
best_latency = current.min_latency_ns;
207207
}
208208
}
209209

@@ -220,7 +220,7 @@ pub async fn best_latency<T: ServiceController>(
220220
.ok_or_else(|| eyre::eyre!("Device with pubkey {} not found", &latency.device_pk))?;
221221

222222
if (!ignore_unprovisionable || device.is_device_eligible_for_provisioning())
223-
&& (latency.avg_latency_ns - best_latency).abs() > LATENCY_TOLERANCE_NS
223+
&& (latency.min_latency_ns - best_latency).abs() > LATENCY_TOLERANCE_NS
224224
{
225225
best = Some(latency);
226226
break;
@@ -313,14 +313,14 @@ mod tests {
313313
)
314314
}
315315

316-
fn make_latency(pk: &str, avg_latency_ns: i64, reachable: bool) -> LatencyRecord {
316+
fn make_latency(pk: &str, latency_ns: i64, reachable: bool) -> LatencyRecord {
317317
LatencyRecord {
318318
device_pk: pk.to_string(),
319319
device_code: "device".to_string(),
320320
device_ip: "0.0.0.0".to_string(),
321-
min_latency_ns: avg_latency_ns,
322-
max_latency_ns: avg_latency_ns,
323-
avg_latency_ns,
321+
min_latency_ns: latency_ns,
322+
max_latency_ns: latency_ns,
323+
avg_latency_ns: latency_ns,
324324
reachable,
325325
}
326326
}
@@ -918,6 +918,187 @@ mod tests {
918918
assert_eq!(result, Ipv4Addr::UNSPECIFIED);
919919
}
920920

921+
// --- min-latency ranking tests ---
922+
// These tests use distinct min/avg values to verify that ranking is by min_latency_ns.
923+
924+
#[test]
925+
fn test_select_tunnel_endpoint_ranks_by_min_not_avg() {
926+
let pk = Pubkey::new_unique();
927+
let pk_str = pk.to_string();
928+
let latencies = vec![
929+
LatencyRecord {
930+
device_pk: pk_str.clone(),
931+
device_code: "device1".to_string(),
932+
device_ip: "10.0.0.1".to_string(),
933+
min_latency_ns: 20_000_000, // higher min
934+
max_latency_ns: 20_000_000,
935+
avg_latency_ns: 5_000_000, // lower avg (would win if ranked by avg)
936+
reachable: true,
937+
},
938+
LatencyRecord {
939+
device_pk: pk_str.clone(),
940+
device_code: "device1".to_string(),
941+
device_ip: "10.0.0.2".to_string(),
942+
min_latency_ns: 5_000_000, // lower min (should win)
943+
max_latency_ns: 5_000_000,
944+
avg_latency_ns: 20_000_000, // higher avg
945+
reachable: true,
946+
},
947+
];
948+
let result = select_tunnel_endpoint(&latencies, &pk_str, Ipv4Addr::new(10, 0, 0, 1), &[]);
949+
assert_eq!(result, Ipv4Addr::new(10, 0, 0, 2)); // lower min wins
950+
}
951+
952+
#[tokio::test]
953+
async fn test_retrieve_latencies_sorts_by_min_not_avg() {
954+
let (pk1, dev1) = make_device(DeviceStatus::Activated, 0);
955+
let (pk2, dev2) = make_device(DeviceStatus::Activated, 0);
956+
957+
let mut devices = HashMap::new();
958+
devices.insert(pk1, dev1);
959+
devices.insert(pk2, dev2);
960+
961+
// pk1: low min, high avg. pk2: high min, low avg.
962+
// Sorted by min: pk1 first. Sorted by avg: pk2 first.
963+
let latencies = vec![
964+
LatencyRecord {
965+
device_pk: pk1.to_string(),
966+
device_code: "device".to_string(),
967+
device_ip: "0.0.0.0".to_string(),
968+
min_latency_ns: 5_000_000, // lower min → should be first
969+
max_latency_ns: 30_000_000,
970+
avg_latency_ns: 25_000_000, // higher avg
971+
reachable: true,
972+
},
973+
LatencyRecord {
974+
device_pk: pk2.to_string(),
975+
device_code: "device".to_string(),
976+
device_ip: "0.0.0.0".to_string(),
977+
min_latency_ns: 15_000_000, // higher min
978+
max_latency_ns: 20_000_000,
979+
avg_latency_ns: 8_000_000, // lower avg → would be first if sorted by avg
980+
reachable: true,
981+
},
982+
];
983+
984+
let mut controller = MockServiceController::new();
985+
controller.expect_latency().returning(move || {
986+
Ok(LatencyResponse {
987+
ready: true,
988+
results: latencies.clone(),
989+
})
990+
});
991+
992+
let result = retrieve_latencies(&controller, &devices, false, None)
993+
.await
994+
.unwrap();
995+
996+
assert_eq!(result.len(), 2);
997+
assert_eq!(result[0].device_pk, pk1.to_string()); // lower min first
998+
assert_eq!(result[1].device_pk, pk2.to_string());
999+
}
1000+
1001+
#[tokio::test]
1002+
async fn test_best_latency_selects_by_min_not_avg() {
1003+
let (pk1, dev1) = make_device(DeviceStatus::Activated, 0);
1004+
let (pk2, dev2) = make_device(DeviceStatus::Activated, 0);
1005+
1006+
let mut devices = HashMap::new();
1007+
devices.insert(pk1, dev1);
1008+
devices.insert(pk2, dev2);
1009+
1010+
// pk1: low min, high avg. pk2: high min, low avg.
1011+
// Should select pk1 (lower min).
1012+
let latencies = vec![
1013+
LatencyRecord {
1014+
device_pk: pk1.to_string(),
1015+
device_code: "device".to_string(),
1016+
device_ip: "0.0.0.0".to_string(),
1017+
min_latency_ns: 5_000_000, // lower min → should win
1018+
max_latency_ns: 30_000_000,
1019+
avg_latency_ns: 25_000_000, // higher avg
1020+
reachable: true,
1021+
},
1022+
LatencyRecord {
1023+
device_pk: pk2.to_string(),
1024+
device_code: "device".to_string(),
1025+
device_ip: "0.0.0.0".to_string(),
1026+
min_latency_ns: 15_000_000, // higher min → should lose
1027+
max_latency_ns: 20_000_000,
1028+
avg_latency_ns: 8_000_000, // lower avg → would win if sorted by avg
1029+
reachable: true,
1030+
},
1031+
];
1032+
1033+
let mut controller = MockServiceController::new();
1034+
controller.expect_latency().returning(move || {
1035+
Ok(LatencyResponse {
1036+
ready: true,
1037+
results: latencies.clone(),
1038+
})
1039+
});
1040+
1041+
let result = best_latency(&controller, &devices, true, None, None, &[])
1042+
.await
1043+
.unwrap();
1044+
1045+
assert_eq!(result.device_pk, pk1.to_string()); // lower min wins
1046+
}
1047+
1048+
#[tokio::test]
1049+
async fn test_best_latency_current_device_seeded_by_min_not_avg() {
1050+
// Verify that when a current_device is set, the tolerance window is seeded from
1051+
// min_latency_ns (line 205), not avg_latency_ns.
1052+
//
1053+
// pk2 (current): min=13ms, avg=6ms → min-seeded best_latency=13ms
1054+
// pk1 (candidate): min=5ms, avg=25ms
1055+
//
1056+
// With min seeding: |5 - 13| = 8ms > 5ms tolerance → switches to pk1 (correct)
1057+
// With avg seeding: |5 - 6| = 1ms < 5ms tolerance → wrongly stays with pk2
1058+
let (pk1, dev1) = make_device(DeviceStatus::Activated, 0);
1059+
let (pk2, dev2) = make_device(DeviceStatus::Activated, 0);
1060+
1061+
let mut devices = HashMap::new();
1062+
devices.insert(pk1, dev1);
1063+
devices.insert(pk2, dev2);
1064+
1065+
let latencies = vec![
1066+
LatencyRecord {
1067+
device_pk: pk1.to_string(),
1068+
device_code: "device".to_string(),
1069+
device_ip: "0.0.0.0".to_string(),
1070+
min_latency_ns: 5_000_000, // lower min → should win
1071+
max_latency_ns: 30_000_000,
1072+
avg_latency_ns: 25_000_000, // higher avg
1073+
reachable: true,
1074+
},
1075+
LatencyRecord {
1076+
device_pk: pk2.to_string(),
1077+
device_code: "device".to_string(),
1078+
device_ip: "0.0.0.0".to_string(),
1079+
min_latency_ns: 13_000_000, // higher min → current device
1080+
max_latency_ns: 20_000_000,
1081+
avg_latency_ns: 6_000_000, // lower avg (would anchor tolerance if avg-seeded)
1082+
reachable: true,
1083+
},
1084+
];
1085+
1086+
let mut controller = MockServiceController::new();
1087+
controller.expect_latency().returning(move || {
1088+
Ok(LatencyResponse {
1089+
ready: true,
1090+
results: latencies.clone(),
1091+
})
1092+
});
1093+
1094+
// pk2 is the current device but pk1 has significantly lower min (8ms gap > 5ms tolerance)
1095+
let result = best_latency(&controller, &devices, true, None, Some(&pk2), &[])
1096+
.await
1097+
.unwrap();
1098+
1099+
assert_eq!(result.device_pk, pk1.to_string()); // switches to lower-min device
1100+
}
1101+
9211102
#[test]
9221103
fn test_select_tunnel_endpoint_no_matching_device() {
9231104
let pk = Pubkey::new_unique();

0 commit comments

Comments
 (0)