From 5c7a47c371756a0099b681701fe1dfbb68adaa17 Mon Sep 17 00:00:00 2001 From: Ryan Shaw Date: Thu, 12 Mar 2015 14:59:21 -0400 Subject: [PATCH 1/3] =?UTF-8?q?Fix=20calculation=20of=20agreement=20for=20?= =?UTF-8?q?=CF=80*,=20maybe.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- segeval/agreement/pi.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/segeval/agreement/pi.py b/segeval/agreement/pi.py index 791b31b..a72062a 100644 --- a/segeval/agreement/pi.py +++ b/segeval/agreement/pi.py @@ -6,6 +6,7 @@ from __future__ import absolute_import, division from decimal import Decimal from segeval.agreement import __fnc_metric__, __actual_agreement_linear__ +from itertools import chain def __fleiss_pi_linear__(dataset, **kwargs): @@ -29,13 +30,10 @@ def __fleiss_pi_linear__(dataset, **kwargs): A_a = Decimal(sum(all_numerators)) / sum(all_denominators) # Calculate Ae p_e_segs = list() - for boundaries_info in coders_boundaries.values(): - for item in boundaries_info: - boundaries, total_boundaries = item - p_e_seg = Decimal(boundaries) / total_boundaries - p_e_segs.append(p_e_seg) + boundary_ratios = chain.from_iterable(coders_boundaries.values()) + b_placed, b_possible = map(sum, zip(*boundary_ratios)) # Calculate P_e_seg - P_e_seg = Decimal(sum(p_e_segs)) / len(p_e_segs) + P_e_seg = Decimal(b_placed) / b_possible A_e = (P_e_seg ** 2) # Calculate pi pi = (A_a - A_e) / (Decimal('1') - A_e) From c46e12d82c3854b77f6db6477c23932a85eb58c3 Mon Sep 17 00:00:00 2001 From: Ryan Shaw Date: Wed, 25 Mar 2015 15:33:14 -0400 Subject: [PATCH 2/3] We are counting (near) matches, not edits, so we want 1-weight. --- segeval/similarity/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/segeval/similarity/__init__.py b/segeval/similarity/__init__.py index 220811b..d1ff32a 100644 --- a/segeval/similarity/__init__.py +++ b/segeval/similarity/__init__.py @@ -103,7 +103,7 @@ def __boundary_confusion_matrix__(*args, **kwargs): # Add weighted near misses for transposition in statistics['transpositions']: match = transposition[2] - matrix[match][match] += fnc_weight_t([transposition], n_t) + matrix[match][match] += (1 - fnc_weight_t([transposition], n_t)) # Add confusion errors for substitution in statistics['substitutions']: hyp, ref = substitution From 7cd095cfd950c53eca24a582456cdffe8401c0dc Mon Sep 17 00:00:00 2001 From: Ryan Shaw Date: Wed, 25 Mar 2015 17:17:52 -0400 Subject: [PATCH 3/3] Hypothesis is on the left (a), reference is on the right (b). --- segeval/similarity/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/segeval/similarity/__init__.py b/segeval/similarity/__init__.py index d1ff32a..185accc 100644 --- a/segeval/similarity/__init__.py +++ b/segeval/similarity/__init__.py @@ -113,11 +113,11 @@ def __boundary_confusion_matrix__(*args, **kwargs): hyp, ref = None, None boundary_type, side = addition if side == 'a': - hyp = None - ref = boundary_type - else: # side == 'b' hyp = boundary_type ref = None + else: # side == 'b' + hyp = None + ref = boundary_type assert side == 'a' or side == 'b' matrix[hyp][ref] += 1 return matrix