Skip to content

Commit dc2aa66

Browse files
authored
3 New SV addition to eurostatdata education enrollment (#1904)
* year range & statvar added * year range & statvar added * year range & statvar added * Updated test code & test data * Updated test code & test data * Updated test code * Updated test code & test data * Updated test code & test data * updated scripts * updated scripts * removed the malformed leading spaces from the tmcf file
1 parent a1ce299 commit dc2aa66

5 files changed

Lines changed: 6207 additions & 5433 deletions

File tree

scripts/eurostat/regional_statistics_by_nuts/education_enrollment/Eurostats_NUTS2_Enrollment.tmcf

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,3 +27,33 @@ value: C:EurostatsNUTS2_Enrollment->Count_Person_25To64Years_EnrolledInEducation
2727
scalingFactor: 100
2828
unit:Percent
2929
measurementMethod: dcs:EurostatRegionalStatistics
30+
31+
Node: E:EurostatsNUTS2_Enrollment->E3
32+
typeOf: dcs:StatVarObservation
33+
variableMeasured: dcs:Count_Person_18To64Years_EnrolledInEducationOrTraining_Female_AsAFractionOfCount_Person_18To64Years_Female
34+
observationAbout: C:EurostatsNUTS2_Enrollment->GeoId
35+
observationDate: C:EurostatsNUTS2_Enrollment->Date
36+
value: C:EurostatsNUTS2_Enrollment->Count_Person_18To64Years_EnrolledInEducationOrTraining_Female_AsAFractionOfCount_Person_18To64Years_Female
37+
scalingFactor: 100
38+
unit:Percent
39+
measurementMethod: dcs:EurostatRegionalStatistics
40+
41+
Node: E:EurostatsNUTS2_Enrollment->E4
42+
typeOf: dcs:StatVarObservation
43+
variableMeasured: dcs:Count_Person_18To64Years_EnrolledInEducationOrTraining_Male_AsAFractionOfCount_Person_18To64Years_Male
44+
observationAbout: C:EurostatsNUTS2_Enrollment->GeoId
45+
observationDate: C:EurostatsNUTS2_Enrollment->Date
46+
value: C:EurostatsNUTS2_Enrollment->Count_Person_18To64Years_EnrolledInEducationOrTraining_Male_AsAFractionOfCount_Person_18To64Years_Male
47+
scalingFactor: 100
48+
unit:Percent
49+
measurementMethod: dcs:EurostatRegionalStatistics
50+
51+
Node: E:EurostatsNUTS2_Enrollment->E5
52+
typeOf: dcs:StatVarObservation
53+
variableMeasured: dcs:Count_Person_18To64Years_EnrolledInEducationOrTraining_AsAFractionOfCount_Person_18To64Years
54+
observationAbout: C:EurostatsNUTS2_Enrollment->GeoId
55+
observationDate: C:EurostatsNUTS2_Enrollment->Date
56+
value: C:EurostatsNUTS2_Enrollment->Count_Person_18To64Years_EnrolledInEducationOrTraining_AsAFractionOfCount_Person_18To64Years
57+
scalingFactor: 100
58+
unit:Percent
59+
measurementMethod: dcs:EurostatRegionalStatistics

scripts/eurostat/regional_statistics_by_nuts/education_enrollment/education_enrollment_preprocess_gen_tmcf.py

Lines changed: 50 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,9 @@
3636
'Count_Person_25To64Years_EnrolledInEducationOrTraining_Female_AsAFractionOfCount_Person_25To64Years_Female',
3737
'Count_Person_25To64Years_EnrolledInEducationOrTraining_Male_AsAFractionOfCount_Person_25To64Years_Male',
3838
'Count_Person_25To64Years_EnrolledInEducationOrTraining_AsAFractionOfCount_Person_25To64Years',
39+
'Count_Person_18To64Years_EnrolledInEducationOrTraining_Female_AsAFractionOfCount_Person_18To64Years_Female',
40+
'Count_Person_18To64Years_EnrolledInEducationOrTraining_Male_AsAFractionOfCount_Person_18To64Years_Male',
41+
'Count_Person_18To64Years_EnrolledInEducationOrTraining_AsAFractionOfCount_Person_18To64Years',
3942
]
4043

4144

@@ -116,37 +119,59 @@ def translate_wide_to_long(file_path):
116119

117120

118121
def preprocess(df, cleaned_csv):
119-
""" Preprocesses a DataFrame and saves the cleaned data to a CSV file.
122+
"""Preprocesses and reshapes data, then saves it to a CSV file.
123+
124+
The function pivots the DataFrame to have separate columns for different
125+
age groups, combines them into a single row per geo-location and year,
126+
and then writes the result to a CSV file.
127+
120128
Args:
121-
df: The raw, unprocessed DataFrame.
129+
df: Long-format DataFrame with 'geo', 'time' and 'age' columns plus the value columns 'F', 'M' and 'T'; each row holds the values for one age group.
122130
cleaned_csv: The path to the CSV file where the cleaned data will be saved.
123-
124-
Returns:
125-
None
126131
"""
132+
127133
try:
128134
logging.info(f'Processing file: {cleaned_csv}')
129135
df = df.replace(np.nan, '', regex=True)
136+
130137
with open(cleaned_csv, 'w', newline='') as f_out:
131138
writer = csv.DictWriter(f_out,
132139
fieldnames=_OUTPUT_COLUMNS,
133140
lineterminator='\n')
134141
writer.writeheader()
135-
for _, row in df.iterrows():
136-
writer.writerow({
137-
'Date':
138-
'%s' % (row['time'][:4]),
139-
'GeoId':
140-
'%s' % row['geo'],
141-
'Count_Person_25To64Years_EnrolledInEducationOrTraining_Female_AsAFractionOfCount_Person_25To64Years_Female':
142-
(row['F']),
143-
'Count_Person_25To64Years_EnrolledInEducationOrTraining_Male_AsAFractionOfCount_Person_25To64Years_Male':
144-
(row['M']),
145-
'Count_Person_25To64Years_EnrolledInEducationOrTraining_AsAFractionOfCount_Person_25To64Years':
146-
(row['T']),
147-
})
148-
logging.info('File processing completed')
149142

143+
# Grouping ensures that Y25-64 and Y18-64 data points for the same
144+
# place and year end up on the same row of the output CSV.
145+
for (geo, time), group in df.groupby(['geo', 'time']):
146+
row_to_write = {'Date': time[:4], 'GeoId': geo}
147+
148+
for _, row in group.iterrows():
149+
# Handle the 25-64 Age Group
150+
if row['age'] == 'Y25-64':
151+
row_to_write[
152+
'Count_Person_25To64Years_EnrolledInEducationOrTraining_Female_AsAFractionOfCount_Person_25To64Years_Female'] = row.get(
153+
'F', '')
154+
row_to_write[
155+
'Count_Person_25To64Years_EnrolledInEducationOrTraining_Male_AsAFractionOfCount_Person_25To64Years_Male'] = row.get(
156+
'M', '')
157+
row_to_write[
158+
'Count_Person_25To64Years_EnrolledInEducationOrTraining_AsAFractionOfCount_Person_25To64Years'] = row.get(
159+
'T', '')
160+
161+
# Handle the 18-64 Age Group (Now with Male and Female)
162+
elif row['age'] == 'Y18-64':
163+
row_to_write[
164+
'Count_Person_18To64Years_EnrolledInEducationOrTraining_Female_AsAFractionOfCount_Person_18To64Years_Female'] = row.get(
165+
'F', '')
166+
row_to_write[
167+
'Count_Person_18To64Years_EnrolledInEducationOrTraining_Male_AsAFractionOfCount_Person_18To64Years_Male'] = row.get(
168+
'M', '')
169+
row_to_write[
170+
'Count_Person_18To64Years_EnrolledInEducationOrTraining_AsAFractionOfCount_Person_18To64Years'] = row.get(
171+
'T', '')
172+
173+
writer.writerow(row_to_write)
174+
logging.info('File processing completed')
150175
except Exception as e:
151176
logging.fatal(f'Processing error {e}')
152177

@@ -199,8 +224,14 @@ def main(_):
199224

200225
if mode == "" or mode == "download":
201226
download_data(_DATA_URL, input_file)
227+
202228
if mode == "" or mode == "process":
203229
translate_df = translate_wide_to_long(input_file)
230+
231+
# Keep both age groups but ensure we only use the Percentage unit
232+
translate_df = translate_df[(translate_df['unit'] == 'PC') & (
233+
translate_df['age'].isin(['Y18-64', 'Y25-64']))]
234+
204235
preprocess(translate_df, _CLEANED_CSV)
205236
get_template_mcf(_OUTPUT_COLUMNS, _TMCF)
206237

scripts/eurostat/regional_statistics_by_nuts/education_enrollment/education_enrollment_preprocess_gen_tmcf_test.py

Lines changed: 8 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
14-
1514
import unittest
1615
import os
1716
from education_enrollment_preprocess_gen_tmcf import *
@@ -23,41 +22,32 @@
2322
sys.path.insert(0, MODULE_DIR)
2423

2524
TEST_DATASET_DIR = os.path.join(MODULE_DIR, "test_data", "sample_input")
26-
2725
EXPECTED_FILES_DIR = os.path.join(MODULE_DIR, "test_data", "sample_output")
2826

2927

3028
class TestProcess(unittest.TestCase):
31-
"""
32-
TestPreprocess is inherting unittest class
33-
properties which further requried for unit testing.
34-
The test will be conducted for EuroStat BMI Sample Datasets,
35-
It will be generating CSV, MCF and TMCF files based on the sample input.
36-
Comparing the data with the expected files.
37-
"""
3829

39-
def __init__(self, methodName: str = ...) -> None:
30+
def __init__(self, methodName: str = "runTest") -> None:
4031
super().__init__(methodName)
4132
input_path = os.path.join(TEST_DATASET_DIR, 'sample_data.tsv')
4233
self.CLEANED_CSV_FILE_PATH = os.path.join(EXPECTED_FILES_DIR,
4334
"test_output.csv")
44-
preprocess(translate_wide_to_long(input_path),
45-
self.CLEANED_CSV_FILE_PATH)
35+
36+
# Transform and filter data to match original script logic
37+
df = translate_wide_to_long(input_path)
38+
df = df[(df['unit'] == 'PC') & (df['age'].isin(['Y18-64', 'Y25-64']))]
39+
40+
preprocess(df, self.CLEANED_CSV_FILE_PATH)
4641

4742
with open(self.CLEANED_CSV_FILE_PATH, encoding="utf-8-sig") as csv_file:
4843
self.actual_csv_data = csv_file.read()
4944

5045
def test_create_csv(self):
51-
"""
52-
This method is required to test between output generated
53-
preprocess script and excepted output files like CSV
54-
"""
5546
expected_csv_file_path = os.path.join(EXPECTED_FILES_DIR,
5647
"Eurostats_NUTS2_Enrollment.csv")
5748

58-
expected_csv_data = ""
5949
with open(expected_csv_file_path,
60-
encoding="utf-8") as expected_csv_file:
50+
encoding="utf-8-sig") as expected_csv_file:
6151
expected_csv_data = expected_csv_file.read()
6252

6353
self.assertEqual(expected_csv_data.strip(),

0 commit comments

Comments
 (0)