datacommonsorg
diff --git a/scripts/eurostat/regional_statistics_by_nuts/education_enrollment/Eurostats_NUTS2_Enrollment.tmcf b/scripts/eurostat/regional_statistics_by_nuts/education_enrollment/Eurostats_NUTS2_Enrollment.tmcf
Lines changed: 30 additions & 0 deletions
diff --git a/scripts/eurostat/regional_statistics_by_nuts/education_enrollment/education_enrollment_preprocess_gen_tmcf.py b/scripts/eurostat/regional_statistics_by_nuts/education_enrollment/education_enrollment_preprocess_gen_tmcf.py
Lines changed: 50 additions & 19 deletions
diff --git a/scripts/eurostat/regional_statistics_by_nuts/education_enrollment/education_enrollment_preprocess_gen_tmcf_test.py b/scripts/eurostat/regional_statistics_by_nuts/education_enrollment/education_enrollment_preprocess_gen_tmcf_test.py
Lines changed: 8 additions & 18 deletions
@@ -27,3 +27,33 @@ value: C:EurostatsNUTS2_Enrollment->Count_Person_25To64Years_EnrolledInEducation
 scalingFactor: 100
 unit:Percent
 measurementMethod: dcs:EurostatRegionalStatistics
+
+Node: E:EurostatsNUTS2_Enrollment->E3
+typeOf: dcs:StatVarObservation
+variableMeasured: dcs:Count_Person_18To64Years_EnrolledInEducationOrTraining_Female_AsAFractionOfCount_Person_18To64Years_Female
+observationAbout: C:EurostatsNUTS2_Enrollment->GeoId
+observationDate: C:EurostatsNUTS2_Enrollment->Date
+value: C:EurostatsNUTS2_Enrollment->Count_Person_18To64Years_EnrolledInEducationOrTraining_Female_AsAFractionOfCount_Person_18To64Years_Female
+scalingFactor: 100
+unit:Percent
+measurementMethod: dcs:EurostatRegionalStatistics
+
+Node: E:EurostatsNUTS2_Enrollment->E4
+typeOf: dcs:StatVarObservation
+variableMeasured: dcs:Count_Person_18To64Years_EnrolledInEducationOrTraining_Male_AsAFractionOfCount_Person_18To64Years_Male
+observationAbout: C:EurostatsNUTS2_Enrollment->GeoId
+observationDate: C:EurostatsNUTS2_Enrollment->Date
+value: C:EurostatsNUTS2_Enrollment->Count_Person_18To64Years_EnrolledInEducationOrTraining_Male_AsAFractionOfCount_Person_18To64Years_Male
+scalingFactor: 100
+unit:Percent
+measurementMethod: dcs:EurostatRegionalStatistics
+
+Node: E:EurostatsNUTS2_Enrollment->E5
+typeOf: dcs:StatVarObservation
+variableMeasured: dcs:Count_Person_18To64Years_EnrolledInEducationOrTraining_AsAFractionOfCount_Person_18To64Years
+observationAbout: C:EurostatsNUTS2_Enrollment->GeoId
+observationDate: C:EurostatsNUTS2_Enrollment->Date
+value: C:EurostatsNUTS2_Enrollment->Count_Person_18To64Years_EnrolledInEducationOrTraining_AsAFractionOfCount_Person_18To64Years
+scalingFactor: 100
+unit:Percent
+measurementMethod: dcs:EurostatRegionalStatistics
@@ -36,6 +36,9 @@
     'Count_Person_25To64Years_EnrolledInEducationOrTraining_Female_AsAFractionOfCount_Person_25To64Years_Female',
     'Count_Person_25To64Years_EnrolledInEducationOrTraining_Male_AsAFractionOfCount_Person_25To64Years_Male',
     'Count_Person_25To64Years_EnrolledInEducationOrTraining_AsAFractionOfCount_Person_25To64Years',
+    'Count_Person_18To64Years_EnrolledInEducationOrTraining_Female_AsAFractionOfCount_Person_18To64Years_Female',
+    'Count_Person_18To64Years_EnrolledInEducationOrTraining_Male_AsAFractionOfCount_Person_18To64Years_Male',
+    'Count_Person_18To64Years_EnrolledInEducationOrTraining_AsAFractionOfCount_Person_18To64Years',
 ]
 
 
@@ -116,37 +119,59 @@ def translate_wide_to_long(file_path):
 
 
 def preprocess(df, cleaned_csv):
-    """ Preprocesses a DataFrame and saves the cleaned data to a CSV file.
+    """Preprocesses and reshapes data, then saves it to a CSV file.
+
+    Rows are grouped by geo and time, and the F, M and T values of the
+    Y25-64 and Y18-64 age groups are merged into one output row per
+    (geo, time) pair before the result is written to a CSV file.
+
     Args:
-        df: The raw, unprocessed DataFrame.
+        df: DataFrame with geo, time and age columns plus F, M and T values.
         cleaned_csv: The path to the CSV file where the cleaned data will be saved.
-
-    Returns:
-        None
     """
+
     try:
         logging.info(f'Processing file: {cleaned_csv}')
         df = df.replace(np.nan, '', regex=True)
+
         with open(cleaned_csv, 'w', newline='') as f_out:
             writer = csv.DictWriter(f_out,
                                     fieldnames=_OUTPUT_COLUMNS,
                                     lineterminator='\n')
             writer.writeheader()
-            for _, row in df.iterrows():
-                writer.writerow({
-                    'Date':
-                        '%s' % (row['time'][:4]),
-                    'GeoId':
-                        '%s' % row['geo'],
-                    'Count_Person_25To64Years_EnrolledInEducationOrTraining_Female_AsAFractionOfCount_Person_25To64Years_Female':
-                        (row['F']),
-                    'Count_Person_25To64Years_EnrolledInEducationOrTraining_Male_AsAFractionOfCount_Person_25To64Years_Male':
-                        (row['M']),
-                    'Count_Person_25To64Years_EnrolledInEducationOrTraining_AsAFractionOfCount_Person_25To64Years':
-                        (row['T']),
-                })
-        logging.info('File processing completed')
 
+            # Group by place and time so that the Y25-64 and Y18-64 values for
+            # the same place and year are written to a single CSV row.
+            for (geo, time), group in df.groupby(['geo', 'time']):
+                row_to_write = {'Date': time[:4], 'GeoId': geo}
+
+                for _, row in group.iterrows():
+                    # Handle the 25-64 Age Group
+                    if row['age'] == 'Y25-64':
+                        row_to_write[
+                            'Count_Person_25To64Years_EnrolledInEducationOrTraining_Female_AsAFractionOfCount_Person_25To64Years_Female'] = row.get(
+                                'F', '')
+                        row_to_write[
+                            'Count_Person_25To64Years_EnrolledInEducationOrTraining_Male_AsAFractionOfCount_Person_25To64Years_Male'] = row.get(
+                                'M', '')
+                        row_to_write[
+                            'Count_Person_25To64Years_EnrolledInEducationOrTraining_AsAFractionOfCount_Person_25To64Years'] = row.get(
+                                'T', '')
+
+                    # Handle the 18-64 Age Group (Female, Male and Total)
+                    elif row['age'] == 'Y18-64':
+                        row_to_write[
+                            'Count_Person_18To64Years_EnrolledInEducationOrTraining_Female_AsAFractionOfCount_Person_18To64Years_Female'] = row.get(
+                                'F', '')
+                        row_to_write[
+                            'Count_Person_18To64Years_EnrolledInEducationOrTraining_Male_AsAFractionOfCount_Person_18To64Years_Male'] = row.get(
+                                'M', '')
+                        row_to_write[
+                            'Count_Person_18To64Years_EnrolledInEducationOrTraining_AsAFractionOfCount_Person_18To64Years'] = row.get(
+                                'T', '')
+
+                writer.writerow(row_to_write)
+        logging.info('File processing completed')
     except Exception as e:
         logging.fatal(f'Processing error {e}')
 
@@ -199,8 +224,14 @@ def main(_):
 
     if mode == "" or mode == "download":
         download_data(_DATA_URL, input_file)
+
     if mode == "" or mode == "process":
         translate_df = translate_wide_to_long(input_file)
+
+        # Keep both age groups but ensure we only use the Percentage unit
+        translate_df = translate_df[(translate_df['unit'] == 'PC') & (
+            translate_df['age'].isin(['Y18-64', 'Y25-64']))]
+
         preprocess(translate_df, _CLEANED_CSV)
         get_template_mcf(_OUTPUT_COLUMNS, _TMCF)
 
 
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import unittest
 import os
 from education_enrollment_preprocess_gen_tmcf import *
@@ -23,41 +22,32 @@
 sys.path.insert(0, MODULE_DIR)
 
 TEST_DATASET_DIR = os.path.join(MODULE_DIR, "test_data", "sample_input")
-
 EXPECTED_FILES_DIR = os.path.join(MODULE_DIR, "test_data", "sample_output")
 
 
 class TestProcess(unittest.TestCase):
-    """
-    TestPreprocess is inherting unittest class
-    properties which further requried for unit testing.
-    The test will be conducted for EuroStat BMI Sample Datasets,
-    It will be generating CSV, MCF and TMCF files based on the sample input.
-    Comparing the data with the expected files.
-    """
 
-    def __init__(self, methodName: str = ...) -> None:
+    def __init__(self, methodName: str = "runTest") -> None:
         super().__init__(methodName)
         input_path = os.path.join(TEST_DATASET_DIR, 'sample_data.tsv')
         self.CLEANED_CSV_FILE_PATH = os.path.join(EXPECTED_FILES_DIR,
                                                   "test_output.csv")
-        preprocess(translate_wide_to_long(input_path),
-                   self.CLEANED_CSV_FILE_PATH)
+
+        # Transform, then apply the same unit/age filter as main()
+        df = translate_wide_to_long(input_path)
+        df = df[(df['unit'] == 'PC') & (df['age'].isin(['Y18-64', 'Y25-64']))]
+
+        preprocess(df, self.CLEANED_CSV_FILE_PATH)
 
         with open(self.CLEANED_CSV_FILE_PATH, encoding="utf-8-sig") as csv_file:
             self.actual_csv_data = csv_file.read()
 
     def test_create_csv(self):
-        """
-        This method is required to test between output generated
-        preprocess script and excepted output files like CSV
-        """
         expected_csv_file_path = os.path.join(EXPECTED_FILES_DIR,
                                               "Eurostats_NUTS2_Enrollment.csv")
 
-        expected_csv_data = ""
         with open(expected_csv_file_path,
-                  encoding="utf-8") as expected_csv_file:
+                  encoding="utf-8-sig") as expected_csv_file:
             expected_csv_data = expected_csv_file.read()
 
         self.assertEqual(expected_csv_data.strip(),