Skip to content
This repository was archived by the owner on Apr 3, 2026. It is now read-only.

Commit 82ce2d3

Browse files
committed
Add support for nested options
1 parent 42d7143 commit 82ce2d3

6 files changed

Lines changed: 251 additions & 26 deletions

File tree

mapswipe_workers/mapswipe_workers/generate_stats/overall_stats.py

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -65,19 +65,15 @@ def get_project_static_info(filename: str) -> pd.DataFrame:
6565
,regexp_replace(look_for, E'[\\n\\r]+', ' ', 'g' ) as look_for
6666
,project_type
6767
,image
68+
,created
6869
-- Custom options values
6970
,CASE
7071
WHEN project_type_specifics->'customOptions' IS NOT NULL
7172
THEN -- thus if we have answer labels use them
72-
ARRAY(
73-
SELECT json_array_elements(
74-
project_type_specifics->'customOptions'
75-
)->>'value'
76-
)
73+
(project_type_specifics->'customOptions')::TEXT
7774
ELSE -- otherwise use below label range as the mapswipe app default
78-
'{0,1,2,3}'
79-
END as custom_options_values
80-
-- custom_options_values -> parent - child relation
75+
'[{"value": 0}, {"value": 1}, {"value": 2}, {"value": 3}]'::TEXT
76+
END as custom_options
8177
-- add an array of the tile server names
8278
,CASE
8379
WHEN project_type_specifics->'tileServer'->'name' IS NOT NULL THEN

mapswipe_workers/mapswipe_workers/generate_stats/project_stats.py

Lines changed: 46 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import json
55
import os
66
import tempfile
7-
from typing import List
7+
import typing
88

99
import pandas as pd
1010
from pandas.api.types import is_numeric_dtype
@@ -268,6 +268,18 @@ def calc_share(df: pd.DataFrame) -> pd.DataFrame:
268268
return df.join(share_df)
269269

270270

271+
def calc_parent_option_count(
    df: pd.DataFrame,
    custom_options: typing.Dict[int, typing.Set[int]],
) -> pd.DataFrame:
    """Fold every sub option's vote count into its parent option's count.

    Parameters
    ----------
    df: pd.DataFrame
        Aggregated results with one ``<option>_count`` column per option.
    custom_options: typing.Dict[int, typing.Set[int]]
        Mapping of parent option value -> set of its sub option values.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` in which each parent ``<option>_count`` column also
        includes the counts of all of its sub options. Columns of options
        without sub options are left untouched.
    """
    updated = df.copy()  # never mutate the caller's frame
    for parent, children in custom_options.items():
        parent_column = f"{parent}_count"
        for child in children:
            updated[parent_column] = updated[parent_column] + updated[f"{child}_count"]
    return updated
282+
271283
def calc_count(df: pd.DataFrame) -> pd.DataFrame:
272284
df_new = df.filter(like="count")
273285
df_new_sum = df_new.sum(axis=1)
@@ -290,26 +302,42 @@ def calc_quadkey(row: pd.DataFrame):
290302
return quadkey
291303

292304

305+
def get_custom_options(custom_options: pd.Series) -> typing.Dict[int, typing.Set[int]]:
    """Parse the raw ``custom_options`` text into a parent -> sub options map.

    The series holds exactly one text item: a list of option dicts, each with
    a ``value`` key and an optional ``subOptions`` list of dicts that again
    carry a ``value`` key.

    ``ast.literal_eval`` is used (not ``json.loads``) so both the JSON text
    produced by the stats SQL and Python ``str()`` representations parse;
    NOTE(review): this assumes the payload never contains JSON ``true``/
    ``false``/``null`` literals — confirm against the SQL that emits it.

    Returns a dict mapping each option value to the set of its sub option
    values (an empty set when the option has no ``subOptions``).
    """
    options = ast.literal_eval(custom_options.item())
    parsed: typing.Dict[int, typing.Set[int]] = {}
    for option in options:
        sub_options = option.get("subOptions", [])
        parsed[option["value"]] = {sub["value"] for sub in sub_options}
    return parsed
313+
314+
293315
def add_missing_result_columns(
    df: typing.Union[pd.DataFrame, pd.Series],
    custom_options: typing.Dict[int, typing.Set[int]],
) -> pd.DataFrame:
    """
    Check if all possible answers columns are included in the grouped results
    data frame and add columns if missing.

    Parameters
    ----------
    df: typing.Union[pd.DataFrame, pd.Series]
        Grouped results with one column per answer value that actually
        occurred in the results.
    custom_options: typing.Dict[int, typing.Set[int]]
        Mapping of parent option value -> set of its sub option values.

    Returns
    -------
    pd.DataFrame
        Reindexed frame with one column per known answer value (parents and
        sub options alike), sorted ascending; missing columns are filled
        with 0.
    """
    # Flatten parent values and all their sub option values into one set.
    # (A plain set + update replaces the former redundant set([listcomp]).)
    all_answer_label_values: typing.Set[int] = set(custom_options)
    for sub_options in custom_options.values():
        all_answer_label_values.update(sub_options)
    return df.reindex(
        columns=sorted(all_answer_label_values),
        fill_value=0,
    )
307335

308336

309337
def get_agg_results_by_task_id(
310338
results_df: pd.DataFrame,
311339
tasks_df: pd.DataFrame,
312-
custom_options_values: pd.Series,
340+
custom_options_raw: pd.Series,
313341
) -> pd.DataFrame:
314342
"""
315343
For each task several users contribute results.
@@ -327,7 +355,7 @@ def get_agg_results_by_task_id(
327355
----------
328356
results_df: pd.DataFrame
329357
tasks_df: pd.DataFrame
330-
custom_options_values: pd.Series
358+
custom_options_raw: pd.Series
331359
"""
332360

333361
results_by_task_id_df = (
@@ -336,21 +364,25 @@ def get_agg_results_by_task_id(
336364
.unstack(fill_value=0)
337365
)
338366

367+
custom_options = get_custom_options(custom_options_raw)
368+
339369
# add columns for answer options that were not chosen for any task
340370
results_by_task_id_df = add_missing_result_columns(
341371
results_by_task_id_df,
342-
custom_options_values,
372+
custom_options,
343373
)
344374

345-
# TODO: Add logic for parent values using sub values
346-
# [<parent_value> = <parent_value> + <child_1_value> + .. <child_N_value>]
347-
348375
# needed for ogr2ogr todo: might be legacy?
349376
results_by_task_id_df = results_by_task_id_df.add_suffix("_count")
350377

351378
# calculate total count of votes per task
352379
results_by_task_id_df["total_count"] = calc_count(results_by_task_id_df)
353380

381+
results_by_task_id_df = calc_parent_option_count(
382+
results_by_task_id_df,
383+
custom_options,
384+
)
385+
354386
# calculate share based on counts
355387
results_by_task_id_df = calc_share(results_by_task_id_df)
356388

@@ -421,7 +453,7 @@ def get_per_project_statistics(project_id: str, project_info: pd.Series) -> dict
421453
agg_results_df = get_agg_results_by_task_id(
422454
results_df,
423455
tasks_df,
424-
project_info["custom_options_values"],
456+
project_info["custom_options"],
425457
)
426458
agg_results_df.to_csv(agg_results_filename, index_label="idx")
427459

mapswipe_workers/mapswipe_workers/generate_stats/user_stats.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,7 @@ def get_disagreeing_contributions_per_user_and_task(row):
3333

3434

3535
def get_agg_results_by_user_id(
36-
results_df: pd.DataFrame,
37-
agg_results_df: pd.DataFrame
36+
results_df: pd.DataFrame, agg_results_df: pd.DataFrame
3837
) -> pd.DataFrame:
3938
"""
4039
For each users we calcuate the number of total contributions (tasks)

mapswipe_workers/mapswipe_workers/utils/geojson_functions.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ def cast_datatypes_for_geojson(filename: str):
124124
geojson_data["features"][i]["properties"][property] = float(
125125
geojson_data["features"][i]["properties"][property]
126126
)
127-
except ValueError:
127+
except (ValueError, TypeError):
128128
pass
129129

130130
with open(filename, "w") as f:

mapswipe_workers/tests/integration/test_user_stats.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import os
22
import tempfile
33
import unittest
4+
import pandas as pd
45

56
import set_up
67
import tear_down
@@ -56,7 +57,11 @@ def test_get_agg_results_by_user_id(self):
5657
tasks_df = get_tasks(self.tasks_filename, self.project_id)
5758
self.assertEqual(len(tasks_df), 67436)
5859

59-
agg_results_df = get_agg_results_by_task_id(results_df, tasks_df)
60+
agg_results_df = get_agg_results_by_task_id(
61+
results_df,
62+
tasks_df,
63+
pd.Series(data="[{\"value\": 0}, {\"value\": 1}, {\"value\": 2}, {\"value\": 3}]"),
64+
)
6065
self.assertEqual(len(agg_results_df), 67436)
6166

6267
agg_results_by_user_id_df = get_agg_results_by_user_id(
Lines changed: 193 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,193 @@
1+
import unittest
2+
3+
import pandas as pd
4+
5+
from mapswipe_workers.generate_stats.project_stats import (
6+
add_missing_result_columns,
7+
calc_agreement,
8+
calc_count,
9+
calc_parent_option_count,
10+
calc_share,
11+
get_custom_options,
12+
)
13+
from tests.integration.base import BaseTestCase
14+
15+
16+
class TestProjectStats(BaseTestCase):
    """Unit tests for the answer-aggregation helpers in project_stats.

    Fix vs. original: the local loop variable ``excepted_values`` was a typo
    and is renamed ``expected_values``.
    """

    def test_calc_agreement(self):
        # Agreement for one task row with a known vote distribution.
        ds = pd.Series(
            data=[40, 15, 5, 17, 3],
            index=["total_count", "1_count", "2_count", "3_count", "4_count"],
        )
        agg2 = calc_agreement(ds)
        self.assertEqual(agg2, 0.32564102564102565)

    def test_calc_count(self):
        # Total votes of the first row: 15 + 5 + 20 (taskId is not a *_count column).
        df = pd.DataFrame(
            data=[[1, 15, 5, 20], [1, 234, 45, 6]],
            columns=["taskId", "1_count", "2_count", "3_count"],
        )
        result = calc_count(df)
        self.assertEqual(result[0], 40)

    def test_calc_share(self):
        # Shares of the first row: 15/40, 5/40, 20/40.
        df = pd.DataFrame(
            data=[[1, 40, 15, 5, 20], [1, 285, 234, 45, 6]],
            columns=["taskId", "total_count", "1_count", "2_count", "3_count"],
        )
        share = calc_share(df)
        self.assertEqual(
            share.filter(like="share").iloc[0].tolist(), [0.375, 0.125, 0.5]
        )

    def test_get_custom_options(self):
        # Options without, with one, and with two subOption groups.
        for raw_custom_options, expected_values in [
            (
                [
                    {"value": 0},
                    {"value": 1},
                    {"value": 2},
                    {"value": 3},
                ],
                {0: set(), 1: set(), 2: set(), 3: set()},
            ),
            (
                [
                    {
                        "value": 0,
                        "subOptions": [
                            {"value": 4}, {"value": 5}
                        ],
                    },
                    {"value": 1},
                    {"value": 2},
                    {"value": 3},
                ],
                {0: {4, 5}, 1: set(), 2: set(), 3: set()},
            ),
            (
                [
                    {
                        "value": 0,
                        "subOptions": [
                            {"value": 4}, {"value": 5}
                        ],
                    },
                    {"value": 1},
                    {"value": 2},
                    {
                        "value": 3,
                        "subOptions": [
                            {"value": 10},
                            {"value": 12}
                        ],
                    },
                ],
                {0: {4, 5}, 1: set(), 2: set(), 3: {10, 12}},
            ),
        ]:
            pd_series = pd.Series(data=[str(raw_custom_options)])
            parsed_custom_options = get_custom_options(pd_series)
            assert parsed_custom_options == expected_values

    def test_add_missing_result_columns(self):
        df = pd.DataFrame(
            data=[
                ["project-1-group-1-task-1", 1],
                ["project-1-group-1-task-1", 5],
                ["project-1-group-2-task-1", 1],
                ["project-1-group-2-task-1", 1],
                ["project-1-group-2-task-1", 1],
                ["project-2-group-3-task-1", 2],
                ["project-2-group-1-task-1", 3],
            ],
            columns=[
                "task_id",
                "result",
            ],
        )
        df = (
            df.groupby(["task_id", "result"])
            .size()
            .unstack(fill_value=0)
        )
        updated_df = add_missing_result_columns(
            df,
            {
                1: {4, 5},
                2: {6},
                3: set(),
            },
        )
        # Existing columns
        assert list(df.columns) == [1, 2, 3, 5]
        # New columns
        assert list(updated_df.columns) == [1, 2, 3, 4, 5, 6]
        # Existing data
        assert df.to_csv() == (
            'task_id,1,2,3,5\n'
            'project-1-group-1-task-1,1,0,0,1\n'
            'project-1-group-2-task-1,3,0,0,0\n'
            'project-2-group-1-task-1,0,0,1,0\n'
            'project-2-group-3-task-1,0,1,0,0\n'
        )
        # New data
        assert updated_df.to_csv() == (
            'task_id,1,2,3,4,5,6\n'
            'project-1-group-1-task-1,1,0,0,0,1,0\n'
            'project-1-group-2-task-1,3,0,0,0,0,0\n'
            'project-2-group-1-task-1,0,0,1,0,0,0\n'
            'project-2-group-3-task-1,0,1,0,0,0,0\n'
        )

    def test_calc_parent_option_count(self):
        df = pd.DataFrame(
            data=[
                [1, 40, 1, 0, 20, 0, 1, 0],
                [2, 41, 0, 5, 20, 0, 0, 0],
                [3, 42, 10, 10, 20, 0, 0, 1],
                [4, 281, 0, 1, 0, 1, 1, 4],
                [5, 282, 15, 10, 0, 1, 2, 4],
                [1, 283, 2, 20, 0, 1, 0, 0],
            ],
            columns=[
                "taskId",
                "total_count",
                "1_count",
                "2_count",
                "3_count",
                "4_count",  # Child of 1
                "5_count",  # Child of 1
                "6_count",  # Child of 2
            ],
        )
        updated_df = calc_parent_option_count(
            df,
            {
                1: {4, 5},
                2: {6},
                3: set(),
            },
        )
        # Columns without child shouldn't change
        for column in [
            "taskId",
            "total_count",
            "3_count",
            "4_count",
            "5_count",
            "6_count",
        ]:
            assert df[column].compare(updated_df[column]).size == 0
        # Columns with child should change
        for column, updated_index, updated_value in [
            ("1_count", [0, 3, 4, 5], [2, 2, 18, 3]),
            ("2_count", [2, 3, 4], [11, 5, 14]),
        ]:
            compared = df[column].compare(updated_df[column])
            assert list(compared['other'].index) == updated_index
            assert list(compared['other']) == updated_value
190+
191+
192+
if __name__ == "__main__":
    # Allow executing this test module directly (outside the test runner).
    unittest.main()

0 commit comments

Comments
 (0)