Add mass and findBestNOccurrences functions (#52)

jrecuerda · web-flow · commit bb3cb0684b2f · 2019-06-06T09:53:09.000+02:00
diff --git a/khiva/array.py b/khiva/array.py
@@ -300,6 +300,11 @@ def to_list(self):
     def to_numpy(self):
         """ Converts the KHIVA array to a numpy array.
 
+        The returned numpy array shape matches the Array dimensions as follows:
+          - For an Array with dims equal to [4, 2, 1, 1] the numpy shape will be (2, 4).
+          - For an Array with dims equal to [4, 3, 2, 1] the numpy shape will be (2, 3, 4).
+          - For an Array with dims equal to [4, 1, 2, 3] the numpy shape will be (3, 2, 1, 4).
+
         :return: KHIVA array converted to numpy.array.
         """
         return self._get_data()
diff --git a/khiva/matrix.py b/khiva/matrix.py
@@ -74,6 +74,63 @@ def find_best_n_motifs(profile, index, m, n, self_join=False):
         array_reference=d)
 
 
+def find_best_n_occurrences(query_time_series, time_series, number_of_occurrences):
+    """ Calculates the N best matches of several queries in several time series.
+
+    The result has the following structure:
+        - 1st dimension corresponds to the nth best match.
+        - 2nd dimension corresponds to the number of queries.
+        - 3rd dimension corresponds to the number of time series.
+
+    For example, the distance in the position (1, 2, 3) corresponds to the second best distance of the third query in the
+    fourth time series. The index in the position (1, 2, 3) is the index of the subsequence which leads to the
+    second best distance of the third query in the fourth time series.
+
+    :param query_time_series: Array whose first dimension is the length of the query time series and the second
+    dimension is the number of queries.
+    :param time_series: Array whose first dimension is the length of the time series and the second dimension is the
+    number of time series.
+    :param number_of_occurrences: Number of matches to return.
+    :return: Tuple of two KHIVA arrays: the distances and the indexes.
+    """
+
+    distances = ctypes.c_void_p(0)
+    indexes = ctypes.c_void_p(0)
+    KhivaLibrary().c_khiva_library.find_best_n_occurrences(ctypes.pointer(query_time_series.arr_reference),
+                                                           ctypes.pointer(time_series.arr_reference),
+                                                           ctypes.pointer(ctypes.c_long(number_of_occurrences)),
+                                                           ctypes.pointer(distances),
+                                                           ctypes.pointer(indexes))
+
+    return Array(array_reference=distances), Array(array_reference=indexes)
+
+
+def mass(query_time_series, time_series):
+    """ Mueen's Algorithm for Similarity Search.
+
+    The result has the following structure:
+        - 1st dimension corresponds to the index of the subsequence in the time series.
+        - 2nd dimension corresponds to the number of queries.
+        - 3rd dimension corresponds to the number of time series.
+
+    For example, the distance in position (1, 2, 3) corresponds to the distance of the third query to the fourth time
+    series for the second subsequence in the time series.
+
+    :param query_time_series: Array whose first dimension is the length of the query time series and the second
+    dimension is the number of queries.
+    :param time_series: Array whose first dimension is the length of the time series and the second dimension is the
+    number of time series.
+    :return: KHIVA array with the distances.
+    """
+
+    distances = ctypes.c_void_p(0)
+    KhivaLibrary().c_khiva_library.mass(ctypes.pointer(query_time_series.arr_reference),
+                                                           ctypes.pointer(time_series.arr_reference),
+                                                           ctypes.pointer(distances))
+
+    return Array(array_reference=distances)
+
+
 def stomp(first_time_series, second_time_series, subsequence_length):
     """ Stomp algorithm to calculate the matrix profile between `ta` and `tb` using a subsequence length of `m`.
 
diff --git a/tests/unit_tests/matrix_unit_tests.py b/tests/unit_tests/matrix_unit_tests.py
@@ -48,6 +48,19 @@ def test_stomp(self):
             self.assertAlmostEqual(a[i], 0, delta=1e-2)
             self.assertAlmostEqual(b[i], expected_index[i])
 
+    def test_stomp_different_queries(self):
+        stomp_result = stomp(Array([[10, 11, 10, 8, 14], [10, 14, 10, 10, 3]]),
+                             Array([[10, 11, 10, 11, 10, 11, 10, 7], [10, 13, 10, 10, 10, 14, 8, 7]]), 3)
+
+        a = stomp_result[0].to_numpy()
+        b = stomp_result[1].to_numpy()
+
+        self.assertAlmostEqual(a[1, 0, 2], 1.73205077, delta=1e-3)
+        self.assertAlmostEqual(a[0, 0, 0], 0.00954336, delta=1e-3)
+
+        self.assertEqual(b[0, 1, 5], 2)
+        self.assertEqual(b[1, 1, 1], 1)
+
     def test_find_best_n_motifs(self):
         stomp_result = stomp(Array([10, 10, 10, 10, 10, 10, 9, 10, 10, 10, 10, 10, 11, 10, 9], dtype.f32),
                              Array([10, 11, 10, 9], dtype.f32),
@@ -149,6 +162,60 @@ def test_find_best_n_discords_consecutive(self):
         else:
             self.assertNotEqual(a[1], 11)
 
+    def test_mass(self):
+        mass_result = mass(Array(np.array([4, 3, 8]), dtype.f32),
+                           Array(np.array([10, 10, 10, 11, 12, 11, 10, 10, 11, 12, 11, 14, 10, 10]), dtype.f32))
+
+        distances = mass_result.to_numpy()
+        distances_expected = np.array([1.732051, 0.328954, 1.210135, 3.150851, 3.245858, 2.822044,
+                                       0.328954, 1.210135, 3.150851, 0.248097, 3.30187, 2.82205])
+
+        np.testing.assert_array_almost_equal(distances, distances_expected, decimal=self.DECIMAL)
+
+    def test_mass_multiple(self):
+        mass_result = mass(Array(np.array([[10, 10, 11, 11], [10, 11, 10, 10]]), dtype.f32),
+                           Array(np.array([[10, 10, 10, 11, 12, 11, 10], [10, 11, 12, 11, 14, 10, 10]]), dtype.f32))
+
+        distances = mass_result.to_numpy()
+
+        distances_expected = np.array([[[1.83880341, 0.87391543, 1.5307337,  3.69551826],
+                                        [3.26598597, 3.48967957, 2.82842779, 1.21162188]],
+                                       [[1.5307337, 2.17577887, 2.57832384, 3.75498915],
+                                        [2.82842779, 2.82842731, 3.21592307, 0.50202721]]])
+
+        np.testing.assert_array_almost_equal(distances, distances_expected, decimal=self.DECIMAL)
+
+    def test_find_best_n_occurrences(self):
+        find_result = find_best_n_occurrences(
+                            Array(np.array([10, 11, 12]), dtype.f32),
+                            Array(np.array([[10, 10, 11, 11, 12, 11, 10, 10, 11, 12, 11, 10, 10, 11],
+                                            [10, 10, 11, 11, 12, 11, 10, 10, 11, 12, 11, 10, 10, 11]]), dtype.f32),
+                            1)
+
+        distances = find_result[0].to_numpy()
+        indexes = find_result[1].to_numpy()
+
+        self.assertAlmostEqual(distances[0], 0.00069053, delta=self.DELTA)
+        self.assertEqual(indexes[0], 7)
+
+    def test_find_best_n_occurrences_multiple_queries(self):
+        find_result = find_best_n_occurrences(
+            Array(np.array([[11, 11, 10, 11], [10, 11, 11, 12]]), dtype.f32),
+            Array(np.array([[10, 10, 11, 11, 10, 11, 10, 10, 11, 11, 10, 11, 10, 10],
+                            [11, 10, 10, 11, 10, 11, 11, 10, 11, 11, 14, 10, 11, 10]]), dtype.f32),
+            4)
+
+        distances = find_result[0].to_numpy()
+        indexes = find_result[1].to_numpy()
+
+        np.testing.assert_array_equal(find_result[0].get_dims(), np.array([4, 2, 2, 1]))
+
+        # Subsequence index of the fourth best distance for the second query over the first time series
+        self.assertEqual(indexes[0, 1, 3], 2)
+
+        # Third best distance for the first query over the second time series
+        self.assertAlmostEqual(distances[1, 0, 2], 1.83880329, delta=self.DELTA)
+
 
 if __name__ == '__main__':
     suite = unittest.TestLoader().loadTestsFromTestCase(MatrixTest)