Skip to content

Commit f27ae9a

Browse files
[BED-5972] Log timeout-wrapped execution time stats, add adaptive timeout logic (#222)
* feat: Log timeout-wrapped execution time stats, add adaptive timeout logic * feat: Polish out AdaptiveTimeout, replace instances of ExecuteWithTimeout * feat: Finish replacing Timeout.ExecuteWithTimeout calls; polish up AdaptiveTimeout * chore: Add AdaptiveTimeout unit tests * chore: Removing now-unused timeout params * chore: Remove obsolete timeout tests - should now be covered by AdaptiveTimeouts * fix: More aggressive time spiking for TimeSpikeSafetyValve test * chore: Address some coderabbit comments * fix: TimeSpikeSafetyValveTest - I played myself * fix: This time spike safety valve is too flexible * feat: Wrapping ldap queries in AdaptiveTimeout for analytics, experiment with adaptive timeouts * chore: Simplify AdaptiveTimeout constructor with default params; log current timeout on time spike safety valve event * chore: Polishing up * fix: DCLdapProcessorTest - specify positive-value timeout args for LdapProcessor constructor * fix: It.Is can't be used outside of Moq setups - these were always returning the default int value 0 in DCLdapProcessor tests * feat: Log time spike events when max timeout is in effect * chore: Just adding a little more transparency to Timeout fail results, especially with AdaptiveTimeouts in place now * fix: Timeout tests check response message StartsWith 'Timeout' instead of Equals * chore: Let's add an UseAdaptiveTimeout() to simplify things a bit, clear up some assumptions about our lower bound in TimeSpikeSafetyValve comments * feat: Log full fail message for CheckPort timeouts
1 parent 8ad304b commit f27ae9a

26 files changed

Lines changed: 746 additions & 283 deletions

src/CommonLib/AdaptiveTimeout.cs

Lines changed: 226 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,226 @@
1+
using System;
2+
using System.Threading;
3+
using System.Threading.Tasks;
4+
using Microsoft.Extensions.Logging;
5+
using SharpHoundRPC.NetAPINative;
6+
7+
namespace SharpHoundCommonLib;
8+
9+
public sealed class AdaptiveTimeout : IDisposable {
10+
private readonly ExecutionTimeSampler _sampler;
11+
private readonly ILogger _log;
12+
private readonly TimeSpan _maxTimeout;
13+
private readonly bool _useAdaptiveTimeout;
14+
private readonly int _minSamplesForAdaptiveTimeout;
15+
private int _clearSamplesDecay;
16+
private const int TimeSpikePenalty = 2;
17+
private const int TimeSpikeForgiveness = 1;
18+
private const int ClearSamplesThreshold = 5;
19+
private const int StdDevMultiplier = 5;
20+
21+
public AdaptiveTimeout(TimeSpan maxTimeout, ILogger log, int sampleCount = 100, int logFrequency = 1000, int minSamplesForAdaptiveTimeout = 30, bool useAdaptiveTimeout = true) {
22+
if (maxTimeout <= TimeSpan.Zero)
23+
throw new ArgumentException("maxTimeout must be positive", nameof(maxTimeout));
24+
if (sampleCount <= 0)
25+
throw new ArgumentException("sampleCount must be positive", nameof(sampleCount));
26+
if (logFrequency <= 0)
27+
throw new ArgumentException("logFrequency must be positive", nameof(logFrequency));
28+
if (minSamplesForAdaptiveTimeout <= 0)
29+
throw new ArgumentException("minSamplesForAdaptiveTimeout must be positive", nameof(minSamplesForAdaptiveTimeout));
30+
if (log == null)
31+
throw new ArgumentNullException(nameof(log));
32+
33+
_sampler = new ExecutionTimeSampler(log, sampleCount, logFrequency);
34+
_log = log;
35+
_maxTimeout = maxTimeout;
36+
_useAdaptiveTimeout = useAdaptiveTimeout;
37+
_minSamplesForAdaptiveTimeout = minSamplesForAdaptiveTimeout;
38+
}
39+
40+
public void ClearSamples() {
41+
_clearSamplesDecay = 0;
42+
_sampler.ClearSamples();
43+
}
44+
45+
/// <summary>
46+
/// Ignores the result of a function if it runs longer than a budgeted time, unblocking the caller.
47+
/// Logs aggregate execution time data.
48+
/// Manages its own timeout.
49+
/// A cancellation token is passed to the executing function so it may exit cleanly if timeout is reached.
50+
/// Please don't wrap a cached function in this timeout if adaptive timeouts enabled, normal distributions are better.
51+
/// DO NOT use a single AdaptiveTimeout for multiple functions.
52+
/// </summary>
53+
/// <typeparam name="T"></typeparam>
54+
/// <param name="func"></param>
55+
/// <param name="parentToken"></param>
56+
/// <returns>Returns a Fail result if a task runs longer than its budgeted time.</returns>
57+
public async Task<Result<T>> ExecuteWithTimeout<T>(Func<CancellationToken, T> func, CancellationToken parentToken = default) {
58+
var result = await Timeout.ExecuteWithTimeout(GetAdaptiveTimeout(), (timeoutToken) => _sampler.SampleExecutionTime(() => func(timeoutToken)), parentToken);
59+
TimeSpikeSafetyValve(result.IsSuccess);
60+
return result;
61+
}
62+
63+
/// <summary>
64+
/// Ignores the result of a function if it runs longer than a budgeted time, unblocking the caller.
65+
/// Logs aggregate execution time data.
66+
/// Manages its own timeout.
67+
/// A cancellation token is passed to the executing function so it may exit cleanly if timeout is reached.
68+
/// Please don't wrap a cached function in this timeout if adaptive timeouts enabled, normal distributions are better.
69+
/// DO NOT use a single AdaptiveTimeout for multiple functions.
70+
/// </summary>
71+
/// <param name="func"></param>
72+
/// <param name="parentToken"></param>
73+
/// <returns>Returns a Fail result if a task runs longer than its budgeted time.</returns>
74+
public async Task<Result> ExecuteWithTimeout(Action<CancellationToken> func, CancellationToken parentToken = default) {
75+
var result = await Timeout.ExecuteWithTimeout(GetAdaptiveTimeout(), (timeoutToken) => _sampler.SampleExecutionTime(() => func(timeoutToken)), parentToken);
76+
TimeSpikeSafetyValve(result.IsSuccess);
77+
return result;
78+
}
79+
80+
/// <summary>
81+
/// Ignores the result of a function if it runs longer than a budgeted time, unblocking the caller.
82+
/// Logs aggregate execution time data.
83+
/// Manages its own timeout.
84+
/// A cancellation token is passed to the executing function so it may exit cleanly if timeout is reached.
85+
/// Please don't wrap a cached function in this timeout if adaptive timeouts enabled, normal distributions are better.
86+
/// DO NOT use a single AdaptiveTimeout for multiple functions.
87+
/// </summary>
88+
/// <typeparam name="T"></typeparam>
89+
/// <param name="func"></param>
90+
/// <param name="parentToken"></param>
91+
/// <returns>Returns a Fail result if a task runs longer than its budgeted time.</returns>
92+
public async Task<Result<T>> ExecuteWithTimeout<T>(Func<CancellationToken, Task<T>> func, CancellationToken parentToken = default) {
93+
var result = await Timeout.ExecuteWithTimeout(GetAdaptiveTimeout(), (timeoutToken) => _sampler.SampleExecutionTime(() => func(timeoutToken)), parentToken);
94+
TimeSpikeSafetyValve(result.IsSuccess);
95+
return result;
96+
}
97+
98+
/// <summary>
99+
/// Ignores the result of a function if it runs longer than a budgeted time, unblocking the caller.
100+
/// Logs aggregate execution time data.
101+
/// Manages its own timeout.
102+
/// A cancellation token is passed to the executing function so it may exit cleanly if timeout is reached.
103+
/// Please don't wrap a cached function in this timeout if adaptive timeouts enabled, normal distributions are better.
104+
/// DO NOT use a single AdaptiveTimeout for multiple functions.
105+
/// </summary>
106+
/// <param name="func"></param>
107+
/// <param name="parentToken"></param>
108+
/// <returns>Returns a Fail result if a task runs longer than its budgeted time.</returns>
109+
public async Task<Result> ExecuteWithTimeout(Func<CancellationToken, Task> func, CancellationToken parentToken = default) {
110+
var result = await Timeout.ExecuteWithTimeout(GetAdaptiveTimeout(), (timeoutToken) => _sampler.SampleExecutionTime(() => func(timeoutToken)), parentToken);
111+
TimeSpikeSafetyValve(result.IsSuccess);
112+
return result;
113+
}
114+
115+
/// <summary>
116+
/// Ignores the result of a function if it runs longer than a budgeted time, unblocking the caller.
117+
/// Logs aggregate execution time data.
118+
/// Manages its own timeout.
119+
/// A cancellation token is passed to the executing function so it may exit cleanly if timeout is reached.
120+
/// Please don't wrap a cached function in this timeout if adaptive timeouts enabled, normal distributions are better.
121+
/// DO NOT use a single AdaptiveTimeout for multiple functions.
122+
/// </summary>
123+
/// <typeparam name="T"></typeparam>
124+
/// <param name="func"></param>
125+
/// <param name="parentToken"></param>
126+
/// <returns>Returns a Fail result if a task runs longer than its budgeted time.</returns>
127+
public async Task<NetAPIResult<T>> ExecuteNetAPIWithTimeout<T>(Func<CancellationToken, NetAPIResult<T>> func, CancellationToken parentToken = default) {
128+
var result = await Timeout.ExecuteNetAPIWithTimeout(GetAdaptiveTimeout(), (timeoutToken) => _sampler.SampleExecutionTime(() => func(timeoutToken)), parentToken);
129+
TimeSpikeSafetyValve(result.IsSuccess);
130+
return result;
131+
}
132+
133+
/// <summary>
134+
/// Ignores the result of a function if it runs longer than a budgeted time, unblocking the caller.
135+
/// Logs aggregate execution time data.
136+
/// Manages its own timeout.
137+
/// A cancellation token is passed to the executing function so it may exit cleanly if timeout is reached.
138+
/// Please don't wrap a cached function in this timeout if adaptive timeouts enabled, normal distributions are better.
139+
/// DO NOT use a single AdaptiveTimeout for multiple functions.
140+
/// </summary>
141+
/// <typeparam name="T"></typeparam>
142+
/// <param name="func"></param>
143+
/// <param name="parentToken"></param>
144+
/// <returns>Returns a Fail result if a task runs longer than its budgeted time.</returns>
145+
public async Task<SharpHoundRPC.Result<T>> ExecuteRPCWithTimeout<T>(Func<CancellationToken, SharpHoundRPC.Result<T>> func, CancellationToken parentToken = default) {
146+
var result = await Timeout.ExecuteRPCWithTimeout(GetAdaptiveTimeout(), (timeoutToken) => _sampler.SampleExecutionTime(() => func(timeoutToken)), parentToken);
147+
TimeSpikeSafetyValve(result.IsSuccess);
148+
return result;
149+
}
150+
151+
/// <summary>
152+
/// Ignores the result of a function if it runs longer than a budgeted time, unblocking the caller.
153+
/// Logs aggregate execution time data.
154+
/// Manages its own timeout.
155+
/// A cancellation token is passed to the executing function so it may exit cleanly if timeout is reached.
156+
/// Please don't wrap a cached function in this timeout if adaptive timeouts enabled, normal distributions are better.
157+
/// DO NOT use a single AdaptiveTimeout for multiple functions.
158+
/// </summary>
159+
/// <typeparam name="T"></typeparam>
160+
/// <param name="func"></param>
161+
/// <param name="parentToken"></param>
162+
/// <returns>Returns a Fail result if a task runs longer than its budgeted time.</returns>
163+
public async Task<SharpHoundRPC.Result<T>> ExecuteRPCWithTimeout<T>(Func<CancellationToken, Task<SharpHoundRPC.Result<T>>> func, CancellationToken parentToken = default) {
164+
var result = await Timeout.ExecuteRPCWithTimeout(GetAdaptiveTimeout(), (timeoutToken) => _sampler.SampleExecutionTime(() => func(timeoutToken)), parentToken);
165+
TimeSpikeSafetyValve(result.IsSuccess);
166+
return result;
167+
}
168+
169+
public void Dispose() {
170+
_sampler.Dispose();
171+
}
172+
173+
// Within 5 standard deviations will have a conservative lower bound of catching 96% of executions (1 - 1/5^2),
174+
// regardless of sample shape
175+
// so long as those samples are independent and identically distributed
176+
// (and if they're not, our TimeSpikeSafetyValve should provide us with some adaptability)
177+
// But the effective collection rate is probably closer to 98+%
178+
// (in part because we don't need to filter out "too fast" outliers)
179+
// But we'll cap at configured maximum timeout
180+
// https://modelassist.epixanalytics.com/space/EA/26574957/Tchebysheffs+Rule
181+
// https://en.wikipedia.org/wiki/Independent_and_identically_distributed_random_variables
182+
public TimeSpan GetAdaptiveTimeout() {
183+
if (!UseAdaptiveTimeout())
184+
return _maxTimeout;
185+
186+
try {
187+
var stdDev = _sampler.StandardDeviation();
188+
var adaptiveTimeoutMs = _sampler.Average() + (stdDev * StdDevMultiplier);
189+
var cappedTimeoutMS = Math.Min(adaptiveTimeoutMs, _maxTimeout.TotalMilliseconds);
190+
return TimeSpan.FromMilliseconds(cappedTimeoutMS);
191+
}
192+
catch (Exception ex) {
193+
_log.LogError(ex, "Error calculating adaptive timeout, defaulting to max timeout.");
194+
return _maxTimeout;
195+
}
196+
}
197+
198+
// AdaptiveTimeout will not respond well to rapid spikes in execution time
199+
// imagine the wrapped function very regularly executes in 10ms
200+
// then suddenly starts taking a regular 100ms
201+
// this is fine (if it fits in our max timeout budget), and we shouldn't block
202+
// so we should create a safety valve in case this happens to reset our data samples
203+
private void TimeSpikeSafetyValve(bool isSuccess) {
204+
if (isSuccess) {
205+
_clearSamplesDecay -= TimeSpikeForgiveness;
206+
_clearSamplesDecay = Math.Max(0, _clearSamplesDecay);
207+
}
208+
else
209+
_clearSamplesDecay += TimeSpikePenalty;
210+
211+
212+
if (_clearSamplesDecay >= ClearSamplesThreshold) {
213+
if (UseAdaptiveTimeout()) {
214+
ClearSamples();
215+
_log.LogTrace("Time spike safety valve event at timeout {CurrentTimeout}.", GetAdaptiveTimeout());
216+
}
217+
else {
218+
_log.LogWarning("This call is frequently running over the maximum allowed timeout of {MaxTimeout}.", _maxTimeout);
219+
}
220+
}
221+
}
222+
223+
private bool UseAdaptiveTimeout() {
224+
return _useAdaptiveTimeout && _sampler.Count >= _minSamplesForAdaptiveTimeout;
225+
}
226+
}
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
using System;
2+
using System.Collections.Concurrent;
3+
using System.Diagnostics;
4+
using System.Linq;
5+
using System.Threading.Tasks;
6+
using Microsoft.Extensions.Logging;
7+
8+
namespace SharpHoundCommonLib;
9+
10+
/// <summary>
11+
/// Holds a rolling sample of execution times on a function, providing and logging data aggregates.
12+
/// </summary>
13+
public class ExecutionTimeSampler : IDisposable {
14+
private readonly ILogger _log;
15+
private readonly int _sampleCount;
16+
private readonly int _logFrequency;
17+
private int _samplesSinceLastLog;
18+
private ConcurrentQueue<double> _samples;
19+
20+
public int Count => _samples.Count;
21+
22+
public ExecutionTimeSampler(ILogger log, int sampleCount, int logFrequency) {
23+
_log = log;
24+
_sampleCount = sampleCount;
25+
_logFrequency = logFrequency;
26+
_samplesSinceLastLog = 0;
27+
_samples = new ConcurrentQueue<double>();
28+
}
29+
30+
public void ClearSamples() {
31+
Log(flush: true);
32+
_samples = new ConcurrentQueue<double>();
33+
}
34+
35+
public double StandardDeviation() {
36+
double average = _samples.Average();
37+
double sumOfSquaresOfDifferences = _samples.Select(val => (val - average) * (val - average)).Sum();
38+
double stddiv = Math.Sqrt(sumOfSquaresOfDifferences / _samples.Count);
39+
40+
return stddiv;
41+
}
42+
43+
public double Average() => _samples.Average();
44+
45+
public async Task<T> SampleExecutionTime<T>(Func<Task<T>> func) {
46+
var stopwatch = Stopwatch.StartNew();
47+
var result = await func.Invoke();
48+
stopwatch.Stop();
49+
AddTimeSample(stopwatch.Elapsed);
50+
51+
return result;
52+
}
53+
54+
public async Task SampleExecutionTime(Func<Task> func) {
55+
var stopwatch = Stopwatch.StartNew();
56+
await func.Invoke();
57+
stopwatch.Stop();
58+
AddTimeSample(stopwatch.Elapsed);
59+
}
60+
61+
public T SampleExecutionTime<T>(Func<T> func) {
62+
var stopwatch = Stopwatch.StartNew();
63+
var result = func.Invoke();
64+
stopwatch.Stop();
65+
AddTimeSample(stopwatch.Elapsed);
66+
67+
return result;
68+
}
69+
70+
public void SampleExecutionTime(Action func) {
71+
var stopwatch = Stopwatch.StartNew();
72+
func.Invoke();
73+
stopwatch.Stop();
74+
AddTimeSample(stopwatch.Elapsed);
75+
}
76+
77+
public void Dispose() {
78+
Log(flush: true);
79+
}
80+
81+
private void AddTimeSample(TimeSpan timeSpan) {
82+
while (_samples.Count >= _sampleCount) {
83+
_samples.TryDequeue(out _);
84+
}
85+
86+
_samples.Enqueue(timeSpan.TotalMilliseconds);
87+
_samplesSinceLastLog++;
88+
89+
Log();
90+
}
91+
92+
private void Log(bool flush = false) {
93+
if ((flush || _samplesSinceLastLog >= _logFrequency) && _samples.Count > 0) {
94+
try {
95+
_log.LogInformation("Execution time Average: {Average}ms, StdDiv: {StandardDeviation}ms", _samples.Average(), StandardDeviation());
96+
}
97+
catch (Exception ex) {
98+
_log.LogWarning("Failed to calculate execution time statistics: {Error}", ex.Message);
99+
}
100+
101+
_samplesSinceLastLog = 0;
102+
}
103+
}
104+
}

src/CommonLib/IRegistryKey.cs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ public interface IRegistryKey {
1010

1111
public class SHRegistryKey : IRegistryKey, IDisposable {
1212
private readonly RegistryKey _currentKey;
13+
private static readonly AdaptiveTimeout _adaptiveTimeout = new AdaptiveTimeout(maxTimeout: TimeSpan.FromSeconds(10), Logging.LogProvider.CreateLogger(nameof(SHRegistryKey)));
1314

1415
private SHRegistryKey(RegistryKey registryKey) {
1516
_currentKey = registryKey;
@@ -35,7 +36,7 @@ public object GetValue(string subkey, string name) {
3536
/// <exception cref="System.Security.SecurityException"></exception>
3637
/// <exception cref="UnauthorizedAccessException"></exception>
3738
public static async Task<SHRegistryKey> Connect(RegistryHive hive, string machineName) {
38-
var remoteKey = await Timeout.ExecuteWithTimeout(TimeSpan.FromSeconds(10), (_) => RegistryKey.OpenRemoteBaseKey(hive, machineName));
39+
var remoteKey = await _adaptiveTimeout.ExecuteWithTimeout((_) => RegistryKey.OpenRemoteBaseKey(hive, machineName));
3940
if (remoteKey.IsSuccess)
4041
return new SHRegistryKey(remoteKey.Value);
4142
else

0 commit comments

Comments
 (0)