Skip to content
This repository was archived by the owner on Apr 3, 2026. It is now read-only.

Commit 82ce2d3

Browse files
committed
Add support for nested options
1 parent 42d7143 commit 82ce2d3

6 files changed

Lines changed: 251 additions & 26 deletions

File tree

mapswipe_workers/mapswipe_workers/generate_stats/overall_stats.py

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -65,19 +65,15 @@ def get_project_static_info(filename: str) -> pd.DataFrame:
6565
,regexp_replace(look_for, E'[\\n\\r]+', ' ', 'g' ) as look_for
6666
,project_type
6767
,image
68+
,created
6869
-- Custom options values
6970
,CASE
7071
WHEN project_type_specifics->'customOptions' IS NOT NULL
7172
THEN -- thus if we have answer labels use them
72-
ARRAY(
73-
SELECT json_array_elements(
74-
project_type_specifics->'customOptions'
75-
)->>'value'
76-
)
73+
(project_type_specifics->'customOptions')::TEXT
7774
ELSE -- otherwise use below label range as the mapswipe app default
78-
'{0,1,2,3}'
79-
END as custom_options_values
80-
-- custom_options_values -> parent - child relation
75+
'[{"value": 0}, {"value": 1}, {"value": 2}, {"value": 3}]'::TEXT
76+
END as custom_options
8177
-- add an array of the tile server names
8278
,CASE
8379
WHEN project_type_specifics->'tileServer'->'name' IS NOT NULL THEN

mapswipe_workers/mapswipe_workers/generate_stats/project_stats.py

Lines changed: 46 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import json
55
import os
66
import tempfile
7-
from typing import List
7+
import typing
88

99
import pandas as pd
1010
from pandas.api.types import is_numeric_dtype
@@ -268,6 +268,18 @@ def calc_share(df: pd.DataFrame) -> pd.DataFrame:
268268
return df.join(share_df)
269269

270270

271+
def calc_parent_option_count(
    df: pd.DataFrame,
    custom_options: typing.Dict[int, typing.Set[int]],
) -> pd.DataFrame:
    """Fold every sub option's vote count into its parent option's count.

    Parameters
    ----------
    df: pd.DataFrame
        Aggregated results with one ``<option>_count`` column per option.
    custom_options: typing.Dict[int, typing.Set[int]]
        Mapping of parent option value -> set of its sub option values.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` in which each parent ``<option>_count`` column also
        includes the counts of all of its sub options. Columns of options
        without sub options are left untouched.
    """
    updated = df.copy()  # never mutate the caller's frame
    for parent, children in custom_options.items():
        parent_column = f"{parent}_count"
        for child in children:
            updated[parent_column] = updated[parent_column] + updated[f"{child}_count"]
    return updated
282+
271283
def calc_count(df: pd.DataFrame) -> pd.DataFrame:
272284
df_new = df.filter(like="count")
273285
df_new_sum = df_new.sum(axis=1)
@@ -290,26 +302,42 @@ def calc_quadkey(row: pd.DataFrame):
290302
return quadkey
291303

292304

305+
def get_custom_options(custom_options: pd.Series) -> typing.Dict[int, typing.Set[int]]:
    """Parse the raw ``custom_options`` text into a parent -> sub options map.

    The series holds exactly one text item: a list of option dicts, each with
    a ``value`` key and an optional ``subOptions`` list of dicts that again
    carry a ``value`` key.

    ``ast.literal_eval`` is used (not ``json.loads``) so both the JSON text
    produced by the stats SQL and Python ``str()`` representations parse;
    NOTE(review): this assumes the payload never contains JSON ``true``/
    ``false``/``null`` literals — confirm against the SQL that emits it.

    Returns a dict mapping each option value to the set of its sub option
    values (an empty set when the option has no ``subOptions``).
    """
    options = ast.literal_eval(custom_options.item())
    parsed: typing.Dict[int, typing.Set[int]] = {}
    for option in options:
        sub_options = option.get("subOptions", [])
        parsed[option["value"]] = {sub["value"] for sub in sub_options}
    return parsed
313+
314+
293315
def add_missing_result_columns(
    df: typing.Union[pd.DataFrame, pd.Series],
    custom_options: typing.Dict[int, typing.Set[int]],
) -> pd.DataFrame:
    """
    Check if all possible answers columns are included in the grouped results
    data frame and add columns if missing.

    Parameters
    ----------
    df: typing.Union[pd.DataFrame, pd.Series]
        Grouped results with one column per answer value that actually
        occurred in the results.
    custom_options: typing.Dict[int, typing.Set[int]]
        Mapping of parent option value -> set of its sub option values.

    Returns
    -------
    pd.DataFrame
        Reindexed frame with one column per known answer value (parents and
        sub options alike), sorted ascending; missing columns are filled
        with 0.
    """
    # Flatten parent values and all their sub option values into one set.
    # (A plain set + update replaces the former redundant set([listcomp]).)
    all_answer_label_values: typing.Set[int] = set(custom_options)
    for sub_options in custom_options.values():
        all_answer_label_values.update(sub_options)
    return df.reindex(
        columns=sorted(all_answer_label_values),
        fill_value=0,
    )
307335

308336

309337
def get_agg_results_by_task_id(
310338
results_df: pd.DataFrame,
311339
tasks_df: pd.DataFrame,
312-
custom_options_values: pd.Series,
340+
custom_options_raw: pd.Series,
313341
) -> pd.DataFrame:
314342
"""
315343
For each task several users contribute results.
@@ -327,7 +355,7 @@ def get_agg_results_by_task_id(
327355
----------
328356
results_df: pd.DataFrame
329357
tasks_df: pd.DataFrame
330-
custom_options_values: pd.Series
358+
custom_options_raw: pd.Series
331359
"""
332360

333361
results_by_task_id_df = (
@@ -336,21 +364,25 @@ def get_agg_results_by_task_id(
336364
.unstack(fill_value=0)
337365
)
338366

367+
custom_options = get_custom_options(custom_options_raw)
368+
339369
# add columns for answer options that were not chosen for any task
340370
results_by_task_id_df = add_missing_result_columns(
341371
results_by_task_id_df,
342-
custom_options_values,
372+
custom_options,
343373
)
344374

345-
# TODO: Add logic for parent values using sub values
346-
# [<parent_value> = <parent_value> + <child_1_value> + .. <child_N_value>]
347-
348375
# needed for ogr2ogr todo: might be legacy?
349376
results_by_task_id_df = results_by_task_id_df.add_suffix("_count")
350377

351378
# calculate total count of votes per task
352379
results_by_task_id_df["total_count"] = calc_count(results_by_task_id_df)
353380

381+
results_by_task_id_df = calc_parent_option_count(
382+
results_by_task_id_df,
383+
custom_options,
384+
)
385+
354386
# calculate share based on counts
355387
results_by_task_id_df = calc_share(results_by_task_id_df)
356388

@@ -421,7 +453,7 @@ def get_per_project_statistics(project_id: str, project_info: pd.Series) -> dict
421453
agg_results_df = get_agg_results_by_task_id(
422454
results_df,
423455
tasks_df,
424-
project_info["custom_options_values"],
456+
project_info["custom_options"],
425457
)
426458
agg_results_df.to_csv(agg_results_filename, index_label="idx")
427459

mapswipe_workers/mapswipe_workers/generate_stats/user_stats.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,7 @@ def get_disagreeing_contributions_per_user_and_task(row):
3333

3434

3535
def get_agg_results_by_user_id(
36-
results_df: pd.DataFrame,
37-
agg_results_df: pd.DataFrame
36+
results_df: pd.DataFrame, agg_results_df: pd.DataFrame
3837
) -> pd.DataFrame:
3938
"""
4039
For each users we calcuate the number of total contributions (tasks)

mapswipe_workers/mapswipe_workers/utils/geojson_functions.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ def cast_datatypes_for_geojson(filename: str):
124124
geojson_data["features"][i]["properties"][property] = float(
125125
geojson_data["features"][i]["properties"][property]
126126
)
127-
except ValueError:
127+
except (ValueError, TypeError):
128128
pass
129129

130130
with open(filename, "w") as f:

mapswipe_workers/tests/integration/test_user_stats.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import os
22
import tempfile
33
import unittest
4+
import pandas as pd
45

56
import set_up
67
import tear_down
@@ -56,7 +57,11 @@ def test_get_agg_results_by_user_id(self):
5657
tasks_df = get_tasks(self.tasks_filename, self.project_id)
5758
self.assertEqual(len(tasks_df), 67436)
5859

59-
agg_results_df = get_agg_results_by_task_id(results_df, tasks_df)
60+
agg_results_df = get_agg_results_by_task_id(
61+
results_df,
62+
tasks_df,
63+
pd.Series(data="[{\"value\": 0}, {\"value\": 1}, {\"value\": 2}, {\"value\": 3}]"),
64+
)
6065
self.assertEqual(len(agg_results_df), 67436)
6166

6267
agg_results_by_user_id_df = get_agg_results_by_user_id(
Lines changed: 193 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,193 @@
1+
import unittest
2+
3+
import pandas as pd
4+
5+
from mapswipe_workers.generate_stats.project_stats import (
6+
add_missing_result_columns,
7+
calc_agreement,
8+
calc_count,
9+
calc_parent_option_count,
10+
calc_share,
11+
get_custom_options,
12+
)
13+
from tests.integration.base import BaseTestCase
14+
15+
16+
class TestProjectStats(BaseTestCase):
    """Unit tests for the answer-aggregation helpers in project_stats.

    Fix vs. original: the local loop variable ``excepted_values`` was a typo
    and is renamed ``expected_values``.
    """

    def test_calc_agreement(self):
        # Agreement for one task row with a known vote distribution.
        ds = pd.Series(
            data=[40, 15, 5, 17, 3],
            index=["total_count", "1_count", "2_count", "3_count", "4_count"],
        )
        agg2 = calc_agreement(ds)
        self.assertEqual(agg2, 0.32564102564102565)

    def test_calc_count(self):
        # Total votes of the first row: 15 + 5 + 20 (taskId is not a *_count column).
        df = pd.DataFrame(
            data=[[1, 15, 5, 20], [1, 234, 45, 6]],
            columns=["taskId", "1_count", "2_count", "3_count"],
        )
        result = calc_count(df)
        self.assertEqual(result[0], 40)

    def test_calc_share(self):
        # Shares of the first row: 15/40, 5/40, 20/40.
        df = pd.DataFrame(
            data=[[1, 40, 15, 5, 20], [1, 285, 234, 45, 6]],
            columns=["taskId", "total_count", "1_count", "2_count", "3_count"],
        )
        share = calc_share(df)
        self.assertEqual(
            share.filter(like="share").iloc[0].tolist(), [0.375, 0.125, 0.5]
        )

    def test_get_custom_options(self):
        # Options without, with one, and with two subOption groups.
        for raw_custom_options, expected_values in [
            (
                [
                    {"value": 0},
                    {"value": 1},
                    {"value": 2},
                    {"value": 3},
                ],
                {0: set(), 1: set(), 2: set(), 3: set()},
            ),
            (
                [
                    {
                        "value": 0,
                        "subOptions": [
                            {"value": 4}, {"value": 5}
                        ],
                    },
                    {"value": 1},
                    {"value": 2},
                    {"value": 3},
                ],
                {0: {4, 5}, 1: set(), 2: set(), 3: set()},
            ),
            (
                [
                    {
                        "value": 0,
                        "subOptions": [
                            {"value": 4}, {"value": 5}
                        ],
                    },
                    {"value": 1},
                    {"value": 2},
                    {
                        "value": 3,
                        "subOptions": [
                            {"value": 10},
                            {"value": 12}
                        ],
                    },
                ],
                {0: {4, 5}, 1: set(), 2: set(), 3: {10, 12}},
            ),
        ]:
            pd_series = pd.Series(data=[str(raw_custom_options)])
            parsed_custom_options = get_custom_options(pd_series)
            assert parsed_custom_options == expected_values

    def test_add_missing_result_columns(self):
        df = pd.DataFrame(
            data=[
                ["project-1-group-1-task-1", 1],
                ["project-1-group-1-task-1", 5],
                ["project-1-group-2-task-1", 1],
                ["project-1-group-2-task-1", 1],
                ["project-1-group-2-task-1", 1],
                ["project-2-group-3-task-1", 2],
                ["project-2-group-1-task-1", 3],
            ],
            columns=[
                "task_id",
                "result",
            ],
        )
        df = (
            df.groupby(["task_id", "result"])
            .size()
            .unstack(fill_value=0)
        )
        updated_df = add_missing_result_columns(
            df,
            {
                1: {4, 5},
                2: {6},
                3: set(),
            },
        )
        # Existing columns
        assert list(df.columns) == [1, 2, 3, 5]
        # New columns
        assert list(updated_df.columns) == [1, 2, 3, 4, 5, 6]
        # Existing data
        assert df.to_csv() == (
            'task_id,1,2,3,5\n'
            'project-1-group-1-task-1,1,0,0,1\n'
            'project-1-group-2-task-1,3,0,0,0\n'
            'project-2-group-1-task-1,0,0,1,0\n'
            'project-2-group-3-task-1,0,1,0,0\n'
        )
        # New data
        assert updated_df.to_csv() == (
            'task_id,1,2,3,4,5,6\n'
            'project-1-group-1-task-1,1,0,0,0,1,0\n'
            'project-1-group-2-task-1,3,0,0,0,0,0\n'
            'project-2-group-1-task-1,0,0,1,0,0,0\n'
            'project-2-group-3-task-1,0,1,0,0,0,0\n'
        )

    def test_calc_parent_option_count(self):
        df = pd.DataFrame(
            data=[
                [1, 40, 1, 0, 20, 0, 1, 0],
                [2, 41, 0, 5, 20, 0, 0, 0],
                [3, 42, 10, 10, 20, 0, 0, 1],
                [4, 281, 0, 1, 0, 1, 1, 4],
                [5, 282, 15, 10, 0, 1, 2, 4],
                [1, 283, 2, 20, 0, 1, 0, 0],
            ],
            columns=[
                "taskId",
                "total_count",
                "1_count",
                "2_count",
                "3_count",
                "4_count",  # Child of 1
                "5_count",  # Child of 1
                "6_count",  # Child of 2
            ],
        )
        updated_df = calc_parent_option_count(
            df,
            {
                1: {4, 5},
                2: {6},
                3: set(),
            },
        )
        # Columns without child shouldn't change
        for column in [
            "taskId",
            "total_count",
            "3_count",
            "4_count",
            "5_count",
            "6_count",
        ]:
            assert df[column].compare(updated_df[column]).size == 0
        # Columns with child should change
        for column, updated_index, updated_value in [
            ("1_count", [0, 3, 4, 5], [2, 2, 18, 3]),
            ("2_count", [2, 3, 4], [11, 5, 14]),
        ]:
            compared = df[column].compare(updated_df[column])
            assert list(compared['other'].index) == updated_index
            assert list(compared['other']) == updated_value
190+
191+
192+
if __name__ == "__main__":
    # Allow executing this test module directly (outside the test runner).
    unittest.main()

0 commit comments

Comments
 (0)