evalF1_no_penalty.pl

#!/usr/bin/perl
#
# based on: 
#   Melamed, I. Dan. 1998. 
#   Manual annotation of translational equivalence: The blinker
#   project. Technical Report 98-07, Institute for
#   Research in Cognitive Science, Philadelphia,
#
#   http://repository.upenn.edu/cgi/viewcontent.cgi?article=1055&context=ircs_reports
#
#
#  given: 
#
#    g = gold standard token:token alignments (produced aligning all
#        tokens in chunk:chunk alignments)
#
#    s = system token:token alignments (produced aligning all tokens
#        in chunk:chunk alignments)
#
#  we discard punctuation .,:'`?;"-
#
#  we define
#    precis(g,s) = | overlap(g,s) | / |s|                   (eq. 1 in Melamed 98)
#    recall(g,s) = | overlap(g,s) | / |g|                   (eq. 2 in Melamed 98)
#                                            (F1 is equal to eq. 3 in Melamed 98)
#
#  where overlap returns the number of token:token alignments in
#  common between both sets
#
#  g and s can be fuzzy sets, where each token:token alignment is
#  weighted as follows:
#
#    weight(t1:t2) = 1 / max(fanout(t1),fanout(t2))        (eq. 4 in Melamed 98)
#
#    given directed alignments, fanout(t) is the number of token:token
#    alignments which have their origin in t #
#
#  As we have a different fanout factor in the gold standard pair and
#  in the system pair we use the fanout of the sys to compute overlap
#  and |s| for precision, and the fanout of the gold standard to
#  compute overlap and |g| for recall.  #
#
#  Precision and recall are computed for all alignments of all pairs
#  in one go (i.e. as opposed to aeraging F1 of each sentence pair)
#
# The script provides four evaluation measures:
#
# - F1 where alignment type and score are ignored (F1A)
# - F1 where alignment types need to match, but scores are
#   ignored. Match is quantified using Jaccard, as there can be multiple
#   tags (FACT,POL).  (F1AT)
# - F1 where alignment type is ignored, but each alignment is penalized
#   when scores do not match (F1AS)
# - F1 where alignment types need to match, and each alignment is
#   penalized when scores do not match. Match is quantified using
#   Jaccard, as there can be multiple tags (FACT,POL). In addition the
#   following special cases are catered for:
#   . there is no type penalty between tags {SPE1, SPE2, REL, SIMI} when
#     scores are (0-2] 
#   . there is no type penalty between EQUI and SIMI/SPE with score 4
#     (F1AST) 

#
# When type needs to match, a token:token alignment is in the overlap
# iff the types of the alignment in system and gold standard files is
# the same.
#
# When the scores are taken into account, the weight of the
# token:token alignment is penalized for differenes in score between
# the system and gold-standard alignment, as follows: #
#
#    weight(t1:t2) = 1 / max(fanout(t1),fanout(t2))     (eq. 4 in Melamed 98)
#                  * ( 1 - abs(score(t1:t2,sys) - score(t1:t2)) / 5)
# 
#  Changes:
#  v1 Oct. 16 2014
#  - first release

#  v2 Nov.  7 2014
#  - bug fixed: Bug affecting alignments which had multiple types, as
#    equalset received tags concatenated by _
#  - changes to address the following special case for F1AST: 
#     . there is no type penalty between tags {SPE1, SPE2, REL, SIMI} 
#       when both scores are (0-2]
#     . there is no type penalty between EQUI and SIMI/SPE with score 4.

#  v2 Sep. 7 2015 (minor changes ~ inigo lopez-gazpio)
#  - Do not raise warnings when tokens are used in several distinct alignments (allow M:N alignments)


=head1 $0

=head1 SYNOPSIS

 evalF1.pl gs system --debug=[01]

 Outputs the F1

 Example:

   $ ./evalF1.pl gs sys --debug=1

 Author: Eneko Agirre
          
 Nov. 7, 2014


=cut


use Getopt::Long qw(:config auto_help); 
use Pod::Usage; 
use warnings;
use strict;
use List::Util qw(max) ;
use File::Basename;

my $DBG = 0 ;

GetOptions("debug=i" => \$DBG)
    or
    pod2usage() ;

pod2usage if $#ARGV != 1 ;

my $dir = dirname(__FILE__);
my $incorrect = system "perl $dir/wellformed.pl $ARGV[1] > /dev/null" ;
die "\nSys file $ARGV[1] is not well-formed" if $incorrect ;

print "GS:  $ARGV[0]\n" if $DBG ;
my $gs = loadalignments($ARGV[0],'gold') ;

print "SYS: $ARGV[1]\n" if $DBG ;
my $sys = loadalignments($ARGV[1],'sys') ;

printf " F1 Ali     %6.4f\n", F1($gs,$sys,'') ;
printf " F1 Type    %6.4f\n", F1($gs,$sys,'type') ;
printf " F1 Score   %6.4f\n", F1($gs,$sys,'score') ;
printf " F1 Typ+Sco %6.4f\n", F1($gs,$sys,'typescore') ;


# the type of alignment only influences the following
#  - gs: store string of each token, remove if punctuation
#  - sys: use string of each token, remove if punctuation
sub loadalignments {
    my ($f,$type) = @_ ;
    my $alis = {} ;
    my ($id) ;
    my ($sent1,$sent2) ;
    open(I,$f) or die $! ;
    while (<I>) {
	chomp ;
	# extract pair id, and insert token strings in $alis if gold standard 
	if (/sentence id="([^\"]*)" /) {
	    $id = $1 ;
	    if ($type eq 'gold') {
		$sent1 = <I> ; chomp($sent1); $sent1 =~ s/^\/\/ //; $alis->{$id}{"string1"} = [ split(/ /,$sent1) ] ;
		$sent2 = <I> ; chomp($sent2); $sent2 =~ s/^\/\/ //; $alis->{$id}{"string2"} = [ split(/ /,$sent2) ] ;
	    }
	}

	# parse alignments
	if (/<==>/) {
	    die "contact developer" if not defined $id ;
	    next if not defined $id ;
	    # parse alignment 
	    my ($alignment,$types,$score,$comment) = split(/\/\//,$_) ;
	    my ($tokens1,$tokens2) = split(/<==>/,$alignment) ;
	    $tokens1 =~ s/^\s+// ; $tokens1 =~ s/\s+$// ; 
	    $tokens2 =~ s/^\s+// ; $tokens2 =~ s/\s+$// ; 
	    $score =~ s/^\s+// ; $score =~ s/\s+$// ; 
	    $types =~ s/^\s+// ; $types =~ s/\s+$// ; 
	    my @tokens1 =  split(/\s+/,$tokens1) ;
	    my @tokens2 =  split(/\s+/,$tokens2) ;
	    my @types =    split(/_/,$types) ;

	    # store chunk alignments, including NOALI
	    $alis->{$id}{"segments12"}{$tokens1}{$tokens2} = [ @types ] ;
	    $alis->{$id}{"segments21"}{$tokens2}{$tokens1} = [ @types ] ;

	    # produce token:token alignments, unless NOALI or ALIC (where one chunk is null, represented by 0)
	    next if $tokens1[0] == 0 ;  
	    next if $tokens2[0] == 0 ;   

	    # remove punctuation from evaluation
	    my $tmp ;
	    if ($type eq 'gold'){ $tmp = $alis } else { $tmp = $gs } ;
	    @tokens1 = grep { $tmp->{$id}{"string1"}[$_-1] !~ /^[.,:\'\`?;\"-]$/} @tokens1 ;
	    @tokens2 = grep { $tmp->{$id}{"string2"}[$_-1] !~ /^[.,:\'\`?;\"-]$/} @tokens2 ;
	    next if ! @tokens1 ;
	    next if ! @tokens2 ;

	    # produce token:token alignments and index them by token and by alignmet in both directions 
	    foreach my $t1 (@tokens1) {
		foreach my $t2 (@tokens2) {
		    # store @type separately for all kinds of alignments
		    $alis->{$id}{"tokens12"}{$t1}{$t2} = [ @types ] ;
		    $alis->{$id}{"tokens21"}{$t2}{$t1} = [ @types ] ;
		    $alis->{$id}{"links12"}{"$t1 $t2"} = [ @types ] ;
		    $alis->{$id}{"links21"}{"$t2 $t1"} = [ @types ] ;
		    # store $score separately for all kinds of alignments
		    $alis->{$id}{"tokens12score"}{$t1}{$t2} = $score ;
		    $alis->{$id}{"tokens21score"}{$t2}{$t1} = $score ;
		    $alis->{$id}{"links12score"}{"$t1 $t2"} = $score ;
		    $alis->{$id}{"links21score"}{"$t2 $t1"} = $score ;
		}
	    }
	}
    }
    die "No alignments found in $f, terminating" if (scalar keys %$alis) == 0 ;
    print "   Number of pairs $f: " . (scalar keys %$alis) . "\n" if $DBG ;
    return $alis ;
}


# check set equality for types
sub equalset {
    my ($set1,$set2) = @_ ;
    my $hash1 = { map { ($_, 1) } @$set1 } ;  
    my $hash2 = { map { ($_, 1) } @$set2 } ;  
    my $equal = 1 ;
    foreach my $el1 (keys %$hash1) {
	if (! $hash2->{$el1} ) { $equal = 0 ; last ; } ;
    }
    foreach my $el2 (keys %$hash2) {
	if (! $hash1->{$el2} ) { $equal = 0 ; last ; } ;
    }
    print ("   type mismatch: " . join("_",@$set1) . " and " . join("_",@$set2). "\n") if $DBG and ! $equal ;
    return $equal ;
}

# check jaccard between sets of types, normalizing to lowercase
sub jaccardset {
    my ($set1,$set2) = @_ ;
    my $hash1 = { map { (lc $_, 1) } @$set1 } ;  
    my $hash2 = { map { (lc $_, 1) } @$set2 } ;  
    my $intersect = {} ;
    my $union = {} ;
    foreach my $el1 (keys %$hash1) {
	$union->{$el1} = 1 ;
	$intersect->{$el1} = 1 if $hash2->{$el1} ;
    }
    foreach my $el2 (keys %$hash2) {
	$union->{$el2} = 1 ;
	$intersect->{$el2} = 1 if $hash1->{$el2} ;
    }
    my $jaccard = scalar(keys %$intersect) / scalar(keys %$union) ;
    return $jaccard ;
}


# check jaccard between sets of types, normalizing to lowercase
# no penalty between tags (SPE1, SPE2, REL, SIMI) when score is (0-2] (F1AST)
# no penalty between EQUI and SIMI/SPE with score 4 (F1AST)
# add 1 to intersection, substract 1 to union
sub jaccardsetNOP {
    my ($set1,$set2) = @_  ;		    
    my $hash1 = { map { (lc $_, 1) } @$set1 } ;  
    my $hash2 = { map { (lc $_, 1) } @$set2 } ;  
    my $intersect = {} ;
    my $union = {} ;
    foreach my $el1 (keys %$hash1) {
	$union->{$el1} = 1 ;
	$intersect->{$el1} = 1 if $hash2->{$el1} ;
    }
    foreach my $el2 (keys %$hash2) {
	$union->{$el2} = 1 ;
	$intersect->{$el2} = 1 if $hash1->{$el2} ;
    }
    my $jaccard = (1 + scalar(keys %$intersect)) / (-1 + scalar(keys %$union)) ;
    return $jaccard ;
}


# intersection between two sets
sub intersect {
    my ($set1,$set2) = @_ ;
    my $hash1 = { map { (lc $_, 1) } @$set1 } ;  
    my $hash2 = { map { (lc $_, 1) } @$set2 } ;  
    my $intersect = {} ;
    foreach my $el1 (keys %$hash1) {
	$intersect->{$el1} = 1 if $hash2->{$el1} ;
    }
    foreach my $el2 (keys %$hash2) {
	$intersect->{$el2} = 1 if $hash1->{$el2} ;
    }
    return [ keys %$intersect ] ;
}

sub EQUI {
    my ($typeset)=@_ ;
    foreach my $type (@$typeset) {
	return 1 if $type =~ /^EQUI/i ;
    }
    return 0 ;
}

sub SIMISPE {
    my ($typeset)=@_ ;
    foreach my $type (@$typeset) {
	return 1 if $type =~ /^(SIMI|SPE)/i ;
    }
    return 0 ;
}

sub SIMISPEREL {
    my ($typeset)=@_ ;
    foreach my $type (@$typeset) {
	return 1 if $type =~ /^(SIMI|SPE|REL)/i ;
    }
    return 0 ;
}

# equation 4 needs fanout, the number of token:token alignments per token 
#      [guard]
#      [death camp guard]
#
# e.g. fanout($alis->{"1"}{"tokens12"}{"guard"} = { death => [ equi ], 
#                                                   camp => [ equi ], 
#                                                   guard => [ equi ]} )
#         = 3 ;
# e.g. fanout($alis->{"1"}{"tokens21"}{"guard"} = { death => [ equi ]} )
#         = 1 ;

# number of token-token alignments for a given token in a given alignment direction
sub fanout {
    my ($ali) = @_ ;
    return scalar(keys %$ali) ;
}

# summatory of fan-out factors for all token-token alignments ( eq. 4)
sub countFanOut {
    my ($ali) = @_ ;
    my $count = 0;
    foreach my $token1 (keys %{ $ali->{"tokens12"}}) {
	foreach my $token2 (keys %{ $ali->{"tokens12"}{$token1}}) {
	    $count+= 1/max(fanout($ali->{"tokens12"}{$token1}),fanout($ali->{"tokens21"}{$token2})) ;
	}
    }
    return $count ; 
}

# Main function
sub F1 {
    my ($alisgs,$alissys,$mode) = @_ ;
    my $overlapGS ;
    my $overlapSYS ;
    my $linkssys ;
    print "\n F1 $mode ========\n" if $DBG ;
    print "    recall per pair (gs, sys)\n" if $DBG ;
    foreach my $id (sort {$a <=> $b} keys %$alissys) {
	$linkssys += countFanOut($alissys->{$id}) ; 
	next if not $alisgs->{$id} ;
	$overlapSYS += overlap($alissys->{$id},$alisgs->{$id},$id,$mode) ;
    }
    my $linksgs ;
    print "    precision per pair (sys, gs)\n" if $DBG ;
    foreach my $id (sort {$a <=> $b} keys %$alisgs) {
	$linksgs += countFanOut($alisgs->{$id}) ; 
	next if not $alissys->{$id} ;
	$overlapGS += overlap($alisgs->{$id},$alissys->{$id},$id,$mode) ;
    }
    my $precision = ($linkssys == 0) ? 0 : $overlapSYS / $linkssys ;
    my $recall =    ($linksgs == 0) ? 0 : $overlapGS / $linksgs ;
    my $f1 =        ($precision + $recall == 0) ? 0 : 2*$precision*$recall/($precision + $recall) ;
    print  " F1 overlapSYS:   $overlapSYS\n" if $DBG ;
    print  " F1 system links: $linkssys\n" if $DBG ;
    printf " F1 precision:    %4.2f\n",$precision if $DBG ;
    print  " F1 overlapGS:    $overlapGS\n" if $DBG ;
    print  " F1 gs links:     $linksgs\n" if $DBG ;
    printf " F1 recall:       %4.2f\n",$recall if $DBG ;
    return $f1 ;
}

# Fuzzy overlap using fanout from first alignment set
sub overlap {
    my ($ali1,$ali2,$id,$mode) = @_ ;
    my $overlap = 0;
    foreach my $token1 (keys %{ $ali1->{"tokens12"}}) {
	foreach my $token2 (keys %{ $ali1->{"tokens12"}{$token1}}) {
	    if ($mode eq "") {
		$overlap+= 1/max(fanout($ali1->{"tokens12"}{$token1}),fanout($ali1->{"tokens21"}{$token2}))
		    if $ali2->{"tokens12"}{$token1}{$token2} ;
	    } elsif ($mode eq "type") {
		if ($ali2->{"tokens12"}{$token1}{$token2}) { 		    # and equalset($ali1->{"tokens12"}{$token1}{$token2},$ali2->{"tokens12"}{$token1}{$token2}) ;
		    my $jaccard = jaccardset($ali1->{"tokens12"}{$token1}{$token2},$ali2->{"tokens12"}{$token1}{$token2}) ;
		    print ("        type mismatch: $jaccard " . join("_",@{$ali1->{"tokens12"}{$token1}{$token2}}) . " and " . join("_",@{$ali2->{"tokens12"}{$token1}{$token2}}). "\n") if $DBG and $jaccard != 1;
		    $overlap+= 1/max(fanout($ali1->{"tokens12"}{$token1}),fanout($ali1->{"tokens21"}{$token2}))
			* $jaccard ;
		}
	    } elsif ($mode eq "score") {
		$overlap+= 1/max(fanout($ali1->{"tokens12"}{$token1}),fanout($ali1->{"tokens21"}{$token2}))
		    * (1 - abs($ali1->{"tokens12score"}{$token1}{$token2} - $ali2->{"tokens12score"}{$token1}{$token2}) / 5)  
		    if $ali2->{"tokens12"}{$token1}{$token2} ;
	    } elsif ($mode eq "typescore") {
		next if not $ali2->{"tokens12"}{$token1}{$token2} ;
		my $typeset1 = $ali1->{"tokens12"}{$token1}{$token2} ;
		my $typeset2 = $ali2->{"tokens12"}{$token1}{$token2} ;
		my $score1 = $ali1->{"tokens12score"}{$token1}{$token2};
		my $score2 = $ali2->{"tokens12score"}{$token1}{$token2};
		my $overlapincrease = 1/max(fanout($ali1->{"tokens12"}{$token1}),fanout($ali1->{"tokens21"}{$token2})) 
		    * (1 - abs($score1 - $score2) / 5)  ;
		my $jaccard ;
		if ((EQUI($typeset1) and SIMISPE($typeset2) and ($score2>=4)) 
		    or (EQUI($typeset2) and SIMISPE($typeset1) and ($score1>=4))) {
                    # no type penalty between EQUI and SIMI/SPE with score 4 (F1AST)
		    $jaccard = jaccardset($typeset1,$typeset2) ;		    # jaccardsetNOP($typeset1,$typeset2) ;		    
		}
		elsif (SIMISPEREL($typeset1) and SIMISPEREL($typeset2) and not(SIMISPEREL(intersect($typeset1,$typeset2)))
		       and ($score1 < 3) and  ($score2 < 3)) {
		    # no type penalty between tags (SPE1, SPE2, REL, SIMI) when score is (0-2] (F1AST)
                    # and ali1 and ali2 have different type.
		    $jaccard = jaccardset($typeset1,$typeset2) ; # jaccardsetNOP($typeset1,$typeset2) ;
		}
		else { # standard match
		    $jaccard = jaccardset($typeset1,$typeset2) ;
		    # and equalset($ali1->{"tokens12"}{$token1}{$token2},$ali2->{"tokens12"}{$token1}{$token2}) ;
		}
		print ("        type mismatch: $jaccard " . join("_",@{$typeset1}) . " ($score1) and " . join("_",@{$typeset2}). " ($score2)\n") if $DBG and $jaccard != 1;
		$overlap+= $overlapincrease*$jaccard ;
	    } else { die }
	}
    }
    if ($DBG and $id) {
	my ($total,$precORrecall) ;
	printf "   " ;
	printf "%2d: ",$id ;
	$total = countFanOut($ali1) ; 
        if ($total) {
	    $precORrecall = $overlap / $total ;
	} else {
	    $precORrecall = 0 ;
	}
	printf "ov:%5.2f tot:%5.2f p/r:%4.2f\n",$overlap,$total,$precORrecall ;
    }
    return $overlap ; 
}