|
36 | 36 | 'Count_Person_25To64Years_EnrolledInEducationOrTraining_Female_AsAFractionOfCount_Person_25To64Years_Female', |
37 | 37 | 'Count_Person_25To64Years_EnrolledInEducationOrTraining_Male_AsAFractionOfCount_Person_25To64Years_Male', |
38 | 38 | 'Count_Person_25To64Years_EnrolledInEducationOrTraining_AsAFractionOfCount_Person_25To64Years', |
| 39 | + 'Count_Person_18To64Years_EnrolledInEducationOrTraining_Female_AsAFractionOfCount_Person_18To64Years_Female', |
| 40 | + 'Count_Person_18To64Years_EnrolledInEducationOrTraining_Male_AsAFractionOfCount_Person_18To64Years_Male', |
| 41 | + 'Count_Person_18To64Years_EnrolledInEducationOrTraining_AsAFractionOfCount_Person_18To64Years', |
39 | 42 | ] |
40 | 43 |
|
41 | 44 |
|
@@ -116,37 +119,59 @@ def translate_wide_to_long(file_path): |
116 | 119 |
|
117 | 120 |
|
118 | 121 | def preprocess(df, cleaned_csv): |
119 | | - """ Preprocesses a DataFrame and saves the cleaned data to a CSV file. |
| 122 | + """Preprocesses and reshapes data, then saves it to a CSV file. |
| 123 | +
|
| 124 | + Rows are grouped by geo-location and time; within each group the F/M/T |
| 125 | + values of the 'Y25-64' and 'Y18-64' age rows are merged into a single |
| 126 | + output row, which is then written to the CSV file. |
| 127 | +
|
120 | 128 | Args: |
121 | | - df: The raw, unprocessed DataFrame. |
| 129 | + df: Long-format DataFrame with 'geo', 'time' and 'age' columns and optional 'F', 'M' and 'T' value columns. |
122 | 130 | cleaned_csv: The path to the CSV file where the cleaned data will be saved. |
123 | | -
|
124 | | - Returns: |
125 | | - None |
126 | 131 | """ |
| 132 | + |
127 | 133 | try: |
128 | 134 | logging.info(f'Processing file: {cleaned_csv}') |
129 | 135 | df = df.replace(np.nan, '', regex=True) |
| 136 | + |
130 | 137 | with open(cleaned_csv, 'w', newline='') as f_out: |
131 | 138 | writer = csv.DictWriter(f_out, |
132 | 139 | fieldnames=_OUTPUT_COLUMNS, |
133 | 140 | lineterminator='\n') |
134 | 141 | writer.writeheader() |
135 | | - for _, row in df.iterrows(): |
136 | | - writer.writerow({ |
137 | | - 'Date': |
138 | | - '%s' % (row['time'][:4]), |
139 | | - 'GeoId': |
140 | | - '%s' % row['geo'], |
141 | | - 'Count_Person_25To64Years_EnrolledInEducationOrTraining_Female_AsAFractionOfCount_Person_25To64Years_Female': |
142 | | - (row['F']), |
143 | | - 'Count_Person_25To64Years_EnrolledInEducationOrTraining_Male_AsAFractionOfCount_Person_25To64Years_Male': |
144 | | - (row['M']), |
145 | | - 'Count_Person_25To64Years_EnrolledInEducationOrTraining_AsAFractionOfCount_Person_25To64Years': |
146 | | - (row['T']), |
147 | | - }) |
148 | | - logging.info('File processing completed') |
149 | 142 |
|
| 143 | + # Group by place and time so that the Y25-64 and Y18-64 values for the |
| 144 | + # same place and year are written to a single output row. |
| 145 | + for (geo, time), group in df.groupby(['geo', 'time']): |
| 146 | + row_to_write = {'Date': time[:4], 'GeoId': geo} |
| 147 | + |
| 148 | + for _, row in group.iterrows(): |
| 149 | + # Handle the 25-64 Age Group |
| 150 | + if row['age'] == 'Y25-64': |
| 151 | + row_to_write[ |
| 152 | + 'Count_Person_25To64Years_EnrolledInEducationOrTraining_Female_AsAFractionOfCount_Person_25To64Years_Female'] = row.get( |
| 153 | + 'F', '') |
| 154 | + row_to_write[ |
| 155 | + 'Count_Person_25To64Years_EnrolledInEducationOrTraining_Male_AsAFractionOfCount_Person_25To64Years_Male'] = row.get( |
| 156 | + 'M', '') |
| 157 | + row_to_write[ |
| 158 | + 'Count_Person_25To64Years_EnrolledInEducationOrTraining_AsAFractionOfCount_Person_25To64Years'] = row.get( |
| 159 | + 'T', '') |
| 160 | + |
| 161 | + # Handle the 18-64 age group (female, male and total values). |
| 162 | + elif row['age'] == 'Y18-64': |
| 163 | + row_to_write[ |
| 164 | + 'Count_Person_18To64Years_EnrolledInEducationOrTraining_Female_AsAFractionOfCount_Person_18To64Years_Female'] = row.get( |
| 165 | + 'F', '') |
| 166 | + row_to_write[ |
| 167 | + 'Count_Person_18To64Years_EnrolledInEducationOrTraining_Male_AsAFractionOfCount_Person_18To64Years_Male'] = row.get( |
| 168 | + 'M', '') |
| 169 | + row_to_write[ |
| 170 | + 'Count_Person_18To64Years_EnrolledInEducationOrTraining_AsAFractionOfCount_Person_18To64Years'] = row.get( |
| 171 | + 'T', '') |
| 172 | + |
| 173 | + writer.writerow(row_to_write) |
| 174 | + logging.info('File processing completed') |
150 | 175 | except Exception as e: |
151 | 176 | logging.fatal(f'Processing error {e}') |
152 | 177 |
|
@@ -199,8 +224,14 @@ def main(_): |
199 | 224 |
|
200 | 225 | if mode == "" or mode == "download": |
201 | 226 | download_data(_DATA_URL, input_file) |
| 227 | + |
202 | 228 | if mode == "" or mode == "process": |
203 | 229 | translate_df = translate_wide_to_long(input_file) |
| 230 | + |
| 231 | + # Keep both age groups but ensure we only use the Percentage unit |
| 232 | + translate_df = translate_df[(translate_df['unit'] == 'PC') & ( |
| 233 | + translate_df['age'].isin(['Y18-64', 'Y25-64']))] |
| 234 | + |
204 | 235 | preprocess(translate_df, _CLEANED_CSV) |
205 | 236 | get_template_mcf(_OUTPUT_COLUMNS, _TMCF) |
206 | 237 |
|
|
0 commit comments