Skip to content

Commit bb3cb06

Browse files
authored
Add mass and findBestNOccurrences functions (#52)
1 parent ffd0204 commit bb3cb06

3 files changed

Lines changed: 129 additions & 0 deletions

File tree

khiva/array.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -300,6 +300,11 @@ def to_list(self):
300300
def to_numpy(self):
    """ Converts the KHIVA array to a numpy array.

    The shape of the returned numpy array relates to the Array dimensions as shown by these examples:

    - dims [4, 2, 1, 1] give a numpy shape of (2, 4).
    - dims [4, 3, 2, 1] give a numpy shape of (2, 3, 4).
    - dims [4, 1, 2, 3] give a numpy shape of (3, 2, 1, 4).

    :return: KHIVA array converted to numpy.array.
    """
    # The conversion itself is delegated to the internal data accessor.
    numpy_data = self._get_data()
    return numpy_data

khiva/matrix.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,63 @@ def find_best_n_motifs(profile, index, m, n, self_join=False):
7474
array_reference=d)
7575

7676

77+
def find_best_n_occurrences(query_time_series, time_series, number_of_occurrences):
    """ Finds the N best matches of several queries in several time series.

    Both results are laid out as follows:

    - 1st dimension: the nth best match.
    - 2nd dimension: the query.
    - 3rd dimension: the time series.

    For example, the distance at position (1, 2, 3) is the second best distance of the third query in the fourth
    time series, and the index at position (1, 2, 3) is the index of the subsequence that produces that distance.

    :param query_time_series: Array whose first dimension is the length of the query time series and the second
                              dimension is the number of queries.
    :param time_series: Array whose first dimension is the length of the time series and the second dimension is
                        the number of time series.
    :param number_of_occurrences: Number of matches to return.
    :return: Tuple of two KHIVA arrays: the distances and the indexes.
    """
    # Result handles are handed to the library by pointer and wrapped into Arrays afterwards.
    best_distances = ctypes.c_void_p(0)
    best_indexes = ctypes.c_void_p(0)

    native_call = KhivaLibrary().c_khiva_library.find_best_n_occurrences
    native_call(
        ctypes.pointer(query_time_series.arr_reference),
        ctypes.pointer(time_series.arr_reference),
        ctypes.pointer(ctypes.c_long(number_of_occurrences)),
        ctypes.pointer(best_distances),
        ctypes.pointer(best_indexes),
    )

    return Array(array_reference=best_distances), Array(array_reference=best_indexes)
106+
107+
108+
def mass(query_time_series, time_series):
    """ Mueen's Algorithm for Similarity Search.

    The result is laid out as follows:

    - 1st dimension: the index of the subsequence in the time series.
    - 2nd dimension: the query.
    - 3rd dimension: the time series.

    For example, the distance at position (1, 2, 3) corresponds to the distance of the third query to the fourth
    time series for the second subsequence in the time series.

    :param query_time_series: Array whose first dimension is the length of the query time series and the second
                              dimension is the number of queries.
    :param time_series: Array whose first dimension is the length of the time series and the second dimension is
                        the number of time series.
    :return: KHIVA array with the distances.
    """
    # Result handle is handed to the library by pointer and wrapped into an Array afterwards.
    result_reference = ctypes.c_void_p(0)

    native_call = KhivaLibrary().c_khiva_library.mass
    native_call(
        ctypes.pointer(query_time_series.arr_reference),
        ctypes.pointer(time_series.arr_reference),
        ctypes.pointer(result_reference),
    )

    return Array(array_reference=result_reference)
132+
133+
77134
def stomp(first_time_series, second_time_series, subsequence_length):
78135
""" Stomp algorithm to calculate the matrix profile between `ta` and `tb` using a subsequence length of `m`.
79136

tests/unit_tests/matrix_unit_tests.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,19 @@ def test_stomp(self):
4848
self.assertAlmostEqual(a[i], 0, delta=1e-2)
4949
self.assertAlmostEqual(b[i], expected_index[i])
5050

51+
def test_stomp_different_queries(self):
    """ Checks selected profile distances and indexes when stomp receives two-row inputs of different lengths. """
    first_input = Array([[10, 11, 10, 8, 14], [10, 14, 10, 10, 3]])
    second_input = Array([[10, 11, 10, 11, 10, 11, 10, 7], [10, 13, 10, 10, 10, 14, 8, 7]])

    result = stomp(first_input, second_input, 3)
    profile = result[0].to_numpy()
    index = result[1].to_numpy()

    # Spot checks on the distance profile (position -> expected distance).
    expected_distances = (
        ((1, 0, 2), 1.73205077),
        ((0, 0, 0), 0.00954336),
    )
    for position, expected in expected_distances:
        self.assertAlmostEqual(profile[position], expected, delta=1e-3)

    # Spot checks on the index profile (position -> expected index).
    expected_indexes = (
        ((0, 1, 5), 2),
        ((1, 1, 1), 1),
    )
    for position, expected in expected_indexes:
        self.assertEqual(index[position], expected)
63+
5164
def test_find_best_n_motifs(self):
5265
stomp_result = stomp(Array([10, 10, 10, 10, 10, 10, 9, 10, 10, 10, 10, 10, 11, 10, 9], dtype.f32),
5366
Array([10, 11, 10, 9], dtype.f32),
@@ -149,6 +162,60 @@ def test_find_best_n_discords_consecutive(self):
149162
else:
150163
self.assertNotEqual(a[1], 11)
151164

165+
def test_mass(self):
    """ Checks the distances returned by mass for a single query over a single time series. """
    query = Array(np.array([4, 3, 8]), dtype.f32)
    series = Array(np.array([10, 10, 10, 11, 12, 11, 10, 10, 11, 12, 11, 14, 10, 10]), dtype.f32)

    actual = mass(query, series).to_numpy()

    # One expected distance per subsequence position.
    expected = np.array([1.732051, 0.328954, 1.210135, 3.150851, 3.245858, 2.822044,
                         0.328954, 1.210135, 3.150851, 0.248097, 3.30187, 2.82205])

    np.testing.assert_array_almost_equal(actual, expected, decimal=self.DECIMAL)
174+
175+
def test_mass_multiple(self):
    """ Checks the distances returned by mass for two queries over two time series. """
    queries = Array(np.array([[10, 10, 11, 11], [10, 11, 10, 10]]), dtype.f32)
    series = Array(np.array([[10, 10, 10, 11, 12, 11, 10], [10, 11, 12, 11, 14, 10, 10]]), dtype.f32)

    actual = mass(queries, series).to_numpy()

    expected = np.array([
        [[1.83880341, 0.87391543, 1.5307337, 3.69551826],
         [3.26598597, 3.48967957, 2.82842779, 1.21162188]],
        [[1.5307337, 2.17577887, 2.57832384, 3.75498915],
         [2.82842779, 2.82842731, 3.21592307, 0.50202721]],
    ])

    np.testing.assert_array_almost_equal(actual, expected, decimal=self.DECIMAL)
187+
188+
def test_find_best_n_occurrences(self):
    """ Checks the single best occurrence of one query over two identical time series. """
    query = Array(np.array([10, 11, 12]), dtype.f32)
    series = Array(np.array([[10, 10, 11, 11, 12, 11, 10, 10, 11, 12, 11, 10, 10, 11],
                             [10, 10, 11, 11, 12, 11, 10, 10, 11, 12, 11, 10, 10, 11]]), dtype.f32)

    result = find_best_n_occurrences(query, series, 1)
    best_distances = result[0].to_numpy()
    best_indexes = result[1].to_numpy()

    self.assertAlmostEqual(best_distances[0], 0.00069053, delta=self.DELTA)
    self.assertEqual(best_indexes[0], 7)
200+
201+
def test_find_best_n_occurrences_multiple_queries(self):
    """ Checks dims, one index and one distance when searching two queries over two time series. """
    queries = Array(np.array([[11, 11, 10, 11], [10, 11, 11, 12]]), dtype.f32)
    series = Array(np.array([[10, 10, 11, 11, 10, 11, 10, 10, 11, 11, 10, 11, 10, 10],
                             [11, 10, 10, 11, 10, 11, 11, 10, 11, 11, 14, 10, 11, 10]]), dtype.f32)

    result = find_best_n_occurrences(queries, series, 4)
    best_distances = result[0].to_numpy()
    best_indexes = result[1].to_numpy()

    # 4 occurrences x 2 queries x 2 time series.
    np.testing.assert_array_equal(result[0].get_dims(), np.array([4, 2, 2, 1]))

    # NOTE(review): per the to_numpy docs the numpy axes are the Array dims reversed, so the last axis here is
    # the (zero-based) nth best match; index 3 / index 2 below would then be the fourth / third best, not the
    # third / second as the comments say -- confirm the intended wording.

    # Subsequence index of the third best distance for the second query over the first time series
    self.assertEqual(best_indexes[0, 1, 3], 2)

    # Second best distance for the first query over the second time series
    self.assertAlmostEqual(best_distances[1, 0, 2], 1.83880329, delta=self.DELTA)
218+
152219

153220
if __name__ == '__main__':
154221
suite = unittest.TestLoader().loadTestsFromTestCase(MatrixTest)

0 commit comments

Comments
 (0)