Tweaks to TPU balancer

seth-planet · web-flow · commit fc935117976a · 2024-04-11T23:49:52.000-07:00
diff --git a/src/modules/ObjectDetectionCoral/tpu_runner.py b/src/modules/ObjectDetectionCoral/tpu_runner.py
@@ -243,7 +243,7 @@ def __init__(self, tpu_list: list, fname_list: list):
             self._init_interpreters()
 
     def _init_interpreters(self):
-
+        # Set a Time To Live for balancing so we don't swap for inf in corner cases
         self.balance_ttl  = len(self.tpu_list) * 2
         start_boot_time = time.perf_counter_ns()
 
@@ -269,73 +269,87 @@ def enqueue(self, in_tensor, out_q: queue.Queue):
         self.queues[0].put(({self.first_name: in_tensor}, out_q))
 
 
-    def balance_queues(self):
-        # Don't bother if someone else is working on balancing
-        if len(self.queues) <= 1 or len(self.tpu_list) < 2 or self.balance_ttl <= 0 or \
-           not self.balance_lock.acquire(blocking=False):
-            return
+    def _eval_timings(interpreter_counts):
+        # How much time are we allocating for each segment
+        time_alloc = []
 
-        def eval_timings(interpreter_counts):
-            # How much time are we allocating for each segment
-            time_alloc = []
+        for seg_i in range(len(self.interpreters)):
+            # Find average runtime for this segment
+            avg_times = []
+            for interpreters in self.interpreters:
+                avg_times += [i.timings[seg_i] / i.exec_count[seg_i] for i in interpreters if i.exec_count[seg_i] != 0]
 
-            for seg_i in range(len(self.interpreters)):
-                # Find average runtime for this segment
-                avg_times = []
-                for interpreters in self.interpreters:
-                    avg_times += [i.timings[seg_i] / i.exec_count[seg_i] for i in interpreters if i.exec_count[seg_i] != 0]
+            if avg_times:
+                avg_time = sum(avg_times) / len(avg_times)
+            else:
+                return 0, 0, 0.0, None
 
-                if avg_times:
-                    avg_time = sum(avg_times) / len(avg_times)
-                else:
-                    return 0, 0, 0.0, None
+            # Adjust for number of TPUs allocated to it
+            if interpreter_counts[seg_i] > 0:
+                time_alloc.append(avg_time / interpreter_counts[seg_i])
+            else:
+                # No interpreters result inf time
+                time_alloc.append(float('inf'))
+
+        min_gt1_t = float('inf')
+        min_gt1_i = -1
+        max_t = 0
+        max_i = -1
+
+        # Find segments that maybe should swap
+        for i, t in enumerate(time_alloc):
+            # Max time needs to be shortened so add an interpreter.
+            if t > max_t:
+                max_t = t
+                max_i = i
+
+            # Min time needs to be lengthened so rem an interpreter,
+            # but only if it has more than one interpreter
+            if t < min_gt1_t and len(self.interpreters[i]) > 1:
+                min_gt1_t = t
+                min_gt1_i = i
+
+        # See if we can do better than the current max timing with swapping
+        swap_i = None
+        for interp_i, interpreters in enumerate(self.interpreters):
+            # Doesn't make sense to pull a TPU from a queue just to re-add it.
+            if interp_i == max_i:
+                continue
 
-                # Adjust for number of TPUs allocated to it
-                if interpreter_counts[seg_i] > 0:
-                    time_alloc.append(avg_time / interpreter_counts[seg_i])
-                else:
-                    # No interpreters result inf time
-                    time_alloc.append(float('inf'))
-
-            min_gt1_t = float('inf')
-            min_gt1_i = -1
-            max_t = 0
-            max_i = -1
-
-            # Find segments that maybe should swap
-            for i, t in enumerate(time_alloc):
-                # Max time needs to be shortened so add an interpreter.
-                if t > max_t:
-                    max_t = t
-                    max_i = i
-
-                # Min time needs to be lengthened so rem an interpreter,
-                # but only if it has more than one interpreter
-                if t < min_gt1_t and len(self.interpreters[i]) > 1:
-                    min_gt1_t = t
-                    min_gt1_i = i
-
-            # See if we can do better than the current max timing
-            untried_candidates = []
-            for interp_i, interpreters in enumerate(self.interpreters):
-                # Doesn't make sense to pull a TPU from a queue just to re-add it.
-                if interp_i == max_i:
-                    continue
-                # If it hasn't yet been tried for this segment
-                # (or if it has already found to be faster on this segment)
-                if any([True for i in interpreters if i.exec_count[max_i] < 10 or max_t-0.1 > i.timings[max_i] / i.exec_count[max_i]]):
-                    untried_candidates.append(interp_i)
+            # Test all TPUs in this segment
+            for i in interpreters:
+                # Only calc valid time after a few runs
+                new_max_t = 0
+                if i.exec_count[max_i] > 10:
+                    new_max_t = i.timings[max_i] / i.exec_count[max_i] 
+                new_swap_t = 0
+                if i.exec_count[interp_i] > 10:
+                    new_swap_t = i.timings[interp_i] / i.exec_count[interp_i] 
+                    
+                # If it hasn't yet been tried for this segment or
+                # If it has already found to be faster on this segment
+                # and we aren't making the other segment the new worst.
+                if i.exec_count[max_i] < 10 or (max_t > new_max_t and max_t > new_swap_t):
+                    swap_i = interp_i
+                    break
 
-            return min_gt1_i, max_i, max(time_alloc), untried_candidates[0] if len(untried_candidates) > 0 else None
+        return min_gt1_i, max_i, max(time_alloc), swap_i    
+
+    
+    def balance_queues(self):
+        # Don't bother if someone else is working on balancing
+        if len(self.queues) <= 1 or len(self.tpu_list) < 2 or self.balance_ttl <= 0 or \
+           not self.balance_lock.acquire(blocking=False):
+            return
 
         interpreter_counts = [len(i) for i in self.interpreters]
-        min_i, max_i, current_max, min_untried_i = eval_timings(interpreter_counts)
+        min_i, max_i, current_max, swap_i = self._eval_timings(interpreter_counts)
         interpreter_counts[min_i] -= 1
         interpreter_counts[max_i] += 1
-        _, _, new_max, _ = eval_timings(interpreter_counts)
+        _, _, new_max, _ = self._eval_timings(interpreter_counts)
 
         if new_max+1.0 < current_max:
-            # Allocate more TPUs to slow segments
+            # 1st Priority: Allocate more TPUs to slow segments
             logging.info(f"Re-balancing from queue {min_i} to {max_i} (max from {current_max:.2f} to {new_max:.2f})")
 
             realloc_interp = self._rem_interpreter_from(min_i)
@@ -344,24 +358,22 @@ def eval_timings(interpreter_counts):
             realloc_interp.start(max_i, self.fbytes_list[max_i])
             self.interpreters[max_i].append(realloc_interp)
 
-        elif min_untried_i is not None:
-            # Swap slow segments with faster ones to see if we can run them faster.
-            # It might be a good way to optimize for heterogenous hardware.
-            logging.info(f"Re-balancing between queues {min_untried_i} and {max_i}")
+        elif swap_i is not None:
+            # 2nd Priority: Swap slow segments with faster ones to see if we can
+            # run them faster. Hopefully still a good way to optimize for
+            # heterogenous hardware.
+            logging.info(f"Re-balancing between queues {swap_i} and {max_i}")
 
             # Stop them
-            new_max_i         = self._rem_interpreter_from(min_untried_i)
-            new_min_untried_i = self._rem_interpreter_from(max_i)
+            new_max  = self._rem_interpreter_from(swap_i)
+            new_swap = self._rem_interpreter_from(max_i)
 
             # Swap them
-            new_max_i.start(max_i, self.fbytes_list[max_i])
-            self.interpreters[max_i].append(new_max_i)
-
-            new_min_untried_i.start(min_untried_i, self.fbytes_list[min_untried_i])
-            self.interpreters[min_untried_i].append(new_min_untried_i)
+            new_max.start(max_i, self.fbytes_list[max_i])
+            self.interpreters[max_i].append(new_max)
 
-            # FIXME: After we have TPUs evaluated and otherwise balanced, we could
-            # further optimize by ensuring the slowest segment doesn't contain any slow TPUs.
+            new_swap.start(swap_i, self.fbytes_list[swap_i])
+            self.interpreters[swap_i].append(new_swap)
         else:
             # Return if we don't want to swap
             self.balance_lock.release()