@@ -188,46 +188,6 @@ static PyObject *LTTB_for_buckets(PyObject *buckets_list) {
188188}
189189
190190
191- static double calculate_sse_for_bucket (PyArrayObject * bucket ) {
192- npy_intp num_points = PyArray_DIM (bucket , 0 );
193- double * data = (double * )PyArray_DATA (bucket );
194-
195- double sum_x = 0.0 , sum_y = 0.0 ;
196- double a_numerator = 0.0 ;
197- double a_denominator = 0.0 ;
198-
199- for (npy_intp i = 0 ; i < num_points ; i ++ ) {
200- sum_x += data [i * 2 ];
201- sum_y += data [i * 2 + 1 ];
202- }
203- double avg_x = sum_x / (double )num_points ;
204- double avg_y = sum_y / (double )num_points ;
205-
206- for (npy_intp i = 0 ; i < num_points ; i ++ ) {
207- double deviation_x = data [i * 2 ] - avg_x ;
208- double deviation_y = data [i * 2 + 1 ] - avg_y ;
209- a_numerator += deviation_x * deviation_y ;
210- a_denominator += deviation_x * deviation_x ;
211- }
212- double a , b ;
213- if (a_denominator == 0.0 ) {
214- a = (a_numerator > 0 ) ? INFINITY : - INFINITY ;
215- b = avg_y ;
216- } else {
217- a = a_numerator / a_denominator ;
218- b = avg_y - a * avg_x ;
219- }
220- double sse = 0.0 ;
221- for (npy_intp i = 0 ; i < num_points ; i ++ ) {
222- double x = data [i * 2 ];
223- double y = data [i * 2 + 1 ];
224- double standardError = y - (a * x + b );
225- sse += standardError * standardError ;
226- }
227- return sse ;
228- }
229-
230-
231191static npy_intp find_highest_sse_bucket_index (PyObject * buckets_list ,
232192 PyArrayObject * sse_array ) {
233193 npy_intp sse_len = PyArray_DIM (sse_array , 0 );
@@ -282,19 +242,91 @@ static npy_intp find_lowest_sse_adjacent_buckets_index(PyArrayObject *sse_array,
282242}
283243
284244
245+ static double calculate_sse_bucket (double * prev_point , double * bucket_data ,
246+ npy_intp bucket_len , double * next_point ) {
247+
248+ // Total points = 1 (prev) + bucket_len + 1 (next)
249+ npy_intp total_points = bucket_len + 2 ;
250+ double sum_x = 0.0 , sum_y = 0.0 ;
251+ npy_intp i ;
252+
253+ sum_x += prev_point [0 ];
254+ sum_y += prev_point [1 ];
255+
256+ for (i = 0 ; i < bucket_len ; i ++ ) {
257+ sum_x += bucket_data [i * 2 ];
258+ sum_y += bucket_data [i * 2 + 1 ];
259+ }
260+
261+ sum_x += next_point [0 ];
262+ sum_y += next_point [1 ];
263+ double avg_x = sum_x / total_points ;
264+ double avg_y = sum_y / total_points ;
265+
266+ // Calculate linear regression
267+ double numerator = 0.0 ;
268+ double denominator = 0.0 ;
269+ double deviation_x , deviation_y ;
270+
271+ deviation_x = prev_point [0 ] - avg_x ;
272+ deviation_y = prev_point [1 ] - avg_y ;
273+ numerator += deviation_x * deviation_y ;
274+ denominator += deviation_x * deviation_x ;
275+
276+ for (i = 0 ; i < bucket_len ; i ++ ) {
277+ deviation_x = bucket_data [i * 2 ] - avg_x ;
278+ deviation_y = bucket_data [i * 2 + 1 ] - avg_y ;
279+ numerator += deviation_x * deviation_y ;
280+ denominator += deviation_x * deviation_x ;
281+ }
282+
283+ deviation_x = next_point [0 ] - avg_x ;
284+ deviation_y = next_point [1 ] - avg_y ;
285+ numerator += deviation_x * deviation_y ;
286+ denominator += deviation_x * deviation_x ;
287+
288+ double a , b ;
289+ if (denominator == 0.0 ) {
290+ a = (numerator > 0 ) ? INFINITY : - INFINITY ;
291+ b = avg_y ;
292+ } else {
293+ a = numerator / denominator ;
294+ b = avg_y - a * avg_x ;
295+ }
296+
297+ // Calculate error
298+ double sse = 0.0 ;
299+ double standardError ;
300+
301+ standardError = prev_point [1 ] - (a * prev_point [0 ] + b );
302+ sse += standardError * standardError ;
303+
304+ for (i = 0 ; i < bucket_len ; i ++ ) {
305+ double bx = bucket_data [i * 2 ];
306+ double by = bucket_data [i * 2 + 1 ];
307+ standardError = by - (a * bx + b );
308+ sse += standardError * standardError ;
309+ }
310+
311+ standardError = next_point [1 ] - (a * next_point [0 ] + b );
312+ sse += standardError * standardError ;
313+
314+ return sse ;
315+ }
316+
285317static PyObject * calculate_sse_for_buckets (PyObject * buckets_list ) {
286318 Py_ssize_t num_buckets = PyList_Size (buckets_list );
287319 if (num_buckets < 3 ) {
288320 PyErr_SetString (PyExc_ValueError ,
289321 "Not enough buckets to calculate SSE" );
290322 return NULL ;
291323 }
324+
292325 npy_intp sse_len = num_buckets ;
293326 PyObject * sse_array =
294327 PyArray_Zeros (1 , & sse_len , PyArray_DescrFromType (NPY_DOUBLE ), 0 );
295328
296329 double * sse_data = (double * )PyArray_DATA ((PyArrayObject * )sse_array );
297-
298330 for (Py_ssize_t i = 1 ; i < num_buckets - 1 ; i ++ ) {
299331 PyArrayObject * prev_bucket =
300332 (PyArrayObject * )PyList_GetItem (buckets_list , i - 1 );
@@ -303,28 +335,16 @@ static PyObject *calculate_sse_for_buckets(PyObject *buckets_list) {
303335 PyArrayObject * next_bucket =
304336 (PyArrayObject * )PyList_GetItem (buckets_list , i + 1 );
305337
306- npy_intp curr_rows = PyArray_DIM (curr_bucket , 0 );
307- npy_intp cols = PyArray_DIM (curr_bucket , 1 );
308-
309- npy_intp combined_dims [2 ] = {curr_rows + 2 , cols };
310- PyArrayObject * bucket_with_adj =
311- (PyArrayObject * )PyArray_SimpleNew (2 , combined_dims , NPY_DOUBLE );
312-
313- double * bucket_adj_data = (double * )PyArray_DATA (bucket_with_adj );
314338 double * prev_last_row = (double * )PyArray_GETPTR2 (
315339 prev_bucket , PyArray_DIM (prev_bucket , 0 ) - 1 , 0 );
316- memcpy (bucket_adj_data , prev_last_row , cols * __DOUBLE_SIZE__ );
317340
318341 double * curr_data = (double * )PyArray_DATA (curr_bucket );
319- memcpy (bucket_adj_data + cols , curr_data ,
320- curr_rows * cols * __DOUBLE_SIZE__ );
342+ npy_intp curr_len = PyArray_DIM (curr_bucket , 0 );
321343
322344 double * next_first_row = (double * )PyArray_GETPTR2 (next_bucket , 0 , 0 );
323- memcpy (bucket_adj_data + (curr_rows + 1 ) * cols , next_first_row ,
324- cols * __DOUBLE_SIZE__ );
325345
326- sse_data [i ] = calculate_sse_for_bucket ( bucket_with_adj );
327- Py_DECREF ( bucket_with_adj );
346+ sse_data [i ] = calculate_sse_bucket ( prev_last_row , curr_data , curr_len ,
347+ next_first_row );
328348 }
329349
330350 return sse_array ;
0 commit comments