@@ -85,7 +85,7 @@ def handle_correlation_matrix(
8585 # Filter columns whose absolute correlation coefficient exceeds the threshold
8686 try :
8787 correlated_cols = correlations [col ][
88- ( correlations [col ] > threshold ) & ( correlations [ col ] != 1 )
88+ correlations [col ]. abs () > threshold
8989 ].index .tolist ()
9090 except KeyError :
9191 print (
@@ -95,201 +95,17 @@ def handle_correlation_matrix(
9595 print ("Continuing with an empty list of correlated columns" )
9696 correlated_cols = []
9797
98+ # Exclude the column itself from the list of correlated columns
99+ if col in correlated_cols :
100+ correlated_cols .remove (col )
101+
98102 # Add the correlated columns to the list
99- drop_list .extend (
100- [(col , corr_col ) for corr_col in correlated_cols ]
101- )
103+ for corr_col in correlated_cols :
104+ # Add only the second column of the pair to avoid dropping both
105+ if col != corr_col :
106+ drop_list .append (corr_col )
102107
103108 # Remove duplicates from the list
104109 drop_list = list (set (drop_list ))
105110
106111 return drop_list
107-
108-
109- # def handle_correlation_matrix(local_param_dict, drop_list, df, chunk_size=50):
110- # """
111- # Calculate correlated columns in chunks.
112-
113- # Calculates the correlation coefficient between each column in the input DataFrame
114- # using chunks to avoid memory issues. The correlation threshold is defined by
115- # the 'corr' key in the local_param_dict dictionary.
116-
117- # Args:
118- # local_param_dict (dict): Dictionary containing local parameters, including the correlation threshold.
119- # drop_list (list): List to which correlated columns will be appended.
120- # df (pandas.DataFrame): Input DataFrame.
121- # chunk_size (int, optional): Size of each chunk for correlation calculation. Default is 50.
122-
123- # Returns:
124- # list: List of correlated columns.
125- # """
126-
127- # if chunk_size >= len(df):
128- # chunk_size = len(df) - 1
129- # # Define the correlation threshold
130- # threshold = local_param_dict.get("corr", 0.25)
131-
132- # # Remove non-numeric columns
133- # numeric_columns = df.select_dtypes(include=["number"]).columns
134- # df_numeric = df[numeric_columns]
135-
136- # # Split columns into chunks
137- # column_chunks = [
138- # df_numeric.columns[i : i + chunk_size]
139- # for i in range(0, len(df_numeric.columns), chunk_size)
140- # ]
141-
142- # # Iterate through each column chunk
143- # for chunk in tqdm(column_chunks, desc="Calculating Correlations"):
144- # # Calculate the correlation coefficients for the current chunk
145- # try:
146- # correlations = df_numeric[chunk].corr()
147- # except:
148- # print(
149- # "Encountered exception while calculating correlations for chunk", chunk
150- # )
151- # print(traceback.format_exc())
152- # continue
153-
154- # # Iterate through each column in the chunk
155- # for col in chunk:
156- # # Filter columns with correlation coefficient greater than the threshold
157- # try:
158- # correlated_cols = correlations[col][
159- # correlations[col].abs() > threshold
160- # ].index.tolist()
161- # except KeyError:
162- # print(
163- # "Encountered KeyError while calculating correlations for column",
164- # col,
165- # )
166- # print("Continuing with an empty list of correlated columns")
167- # correlated_cols = []
168- # except AttributeError:
169- # print(
170- # "Encountered AttributeError while calculating correlations for column",
171- # col,
172- # )
173- # print("Continuing with an empty list of correlated columns")
174- # correlated_cols = []
175-
176- # # Exclude the current column from the correlated columns list if it's in the list
177- # if col in correlated_cols:
178- # correlated_cols.remove(col)
179-
180- # # Add the correlated columns to the list
181- # drop_list.extend([(col, corr_col) for corr_col in correlated_cols])
182-
183- # # Remove duplicates from the list
184- # drop_list = list(set(drop_list))
185-
186- # return drop_list
187-
188-
189- # Example usage:
190- # input_csv_path = '../concatenated_data_concatenated_output_imputed_f_b_m_collapsed_mean.csv'
191- # df = pd.read_csv(input_csv_path)
192- # local_param_dict = {'corr': 0.25} # Example threshold value
193-
194- # correlated_columns = handle_correlation_matrix(df, local_param_dict)
195- # print("Columns with correlation greater than", local_param_dict['corr'])
196- # print(correlated_columns)
197-
198-
199- # def handle_correlation_matrix(local_param_dict, drop_list, df):
200- # print("Handling correlation matrix")
201- # temp_col_list = list(df.select_dtypes(include=[float, int]).columns)
202-
203- # # Calculate absolute correlation matrix
204- # corr_matrix = df.select_dtypes(include=[float, int]).corr().abs()
205-
206- # # Create a True/False mask and apply it
207- # mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
208- # tri_df = corr_matrix.mask(mask)
209-
210- # # List column names of highly correlated features (r > local_param_dict['corr'])
211- # corr_to_drop = [
212- # c for c in tri_df.columns if any(tri_df[c] > local_param_dict.get("corr"))
213- # ]
214-
215- # print(
216- # f"Identified {len(corr_to_drop)} correlated features to drop at >{local_param_dict.get('corr')}"
217- # )
218- # drop_list.extend(corr_to_drop)
219-
220- # return drop_list
221- # import pandas as pd
222- # import numpy as np
223-
224-
225- # def correlation_coefficient(x, y):
226- # """
227- # Calculate the correlation coefficient between two lists of values.
228-
229- # Parameters:
230- # x (list): First list of values.
231- # y (list): Second list of values.
232-
233- # Returns:
234- # float: Correlation coefficient between x and y.
235- # """
236- # n = len(x)
237- # sum_x = sum(x)
238- # sum_y = sum(y)
239- # sum_x_sq = sum(xi**2 for xi in x)
240- # sum_y_sq = sum(yi**2 for yi in y)
241- # sum_xy = sum(xi * yi for xi, yi in zip(x, y))
242-
243- # numerator = n * sum_xy - sum_x * sum_y
244- # denominator = ((n * sum_x_sq - sum_x**2) * (n * sum_y_sq - sum_y**2)) ** 0.5
245-
246- # if denominator == 0:
247- # return 0
248- # else:
249- # return numerator / denominator
250-
251-
252- # def handle_correlation_matrix(local_param_dict, drop_list, df):
253- # print("Handling correlation matrix")
254- # temp_col_list = list(df.select_dtypes(include=[float, int]).columns)
255-
256- # # Initialize an empty DataFrame to store correlation coefficients
257- # corr_matrix = pd.DataFrame(index=temp_col_list, columns=temp_col_list)
258-
259- # # Calculate correlation coefficients for each pair of columns
260- # for i, col1 in enumerate(temp_col_list):
261- # for j, col2 in enumerate(temp_col_list):
262- # if i != j:
263- # corr_matrix.loc[col1, col2] = correlation_coefficient(
264- # df[col1], df[col2]
265- # )
266-
267- # # Convert the DataFrame to absolute values
268- # corr_matrix = corr_matrix.abs()
269-
270- # # Create a True/False mask and apply it
271- # mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
272- # tri_df = corr_matrix.mask(mask)
273-
274- # # List column names of highly correlated features (r > local_param_dict['corr'])
275- # corr_to_drop = [
276- # c for c in tri_df.columns if any(tri_df[c] > local_param_dict.get("corr"))
277- # ]
278-
279- # print(
280- # f"Identified {len(corr_to_drop)} correlated features to drop at >{local_param_dict.get('corr')}"
281- # )
282- # drop_list.extend(corr_to_drop)
283-
284- # return drop_list
285-
286-
287- # Example usage:
288- # local_param_dict = {'corr': 0.25} # Example threshold value
289- # drop_list = []
290-
291- # # Assuming df is your DataFrame
292- # # Replace df with your actual DataFrame
293- # # Call the function to update the drop_list
294- # updated_drop_list = handle_correlation_matrix(local_param_dict, drop_list, df)
295- # print("Updated drop list:", updated_drop_list)
0 commit comments