-
Notifications
You must be signed in to change notification settings - Fork 0
/
dupplot.pl
executable file
·131 lines (107 loc) · 3.81 KB
/
dupplot.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/usr/bin/perl
# Usage ./dupplot.pl file1 file2 [output_image_file];
# Author Raimon Grau <raimonster@gmail.com>. Artistic License v2.0
# The mechanism to detect similar lines in 2 files is to process both
# files line per line and see if we can consider them equal.
# We should keep in mind that a file might have repeated lines in
# itself, so instead of comparing line numbers which would be
# inefficient, we transform the lines to their md5 and use that as an
# index. That way similar lines will fall into the same bucket by
# design.
# When parsing the first file we create a hash with md5 as keys and a
# list of line numbers as values.
# When parsing the second file we look each line up in the previous hash
# and create a tuple [current_line, line_in_previous_file] for each
# line_in_previous_file with the same md5 as current_line.
# After that, we just launch gnuplot. If there's a 3rd argument, the
# resulting graph is stored as a png file named "<argument>.png".
use strict;
use warnings;
use Data::Dumper;
use Digest::MD5;
use File::Basename;
use File::Temp;
# Print every argument to the currently selected output handle and
# finish the line with a newline.  Returns whatever print returns.
sub say {
    my @items = @_;
    return print(@items, "\n");
}
# Maps a file extension (leading dot included, as produced by
# extension_for) to a normalization routine that is safe for that
# format.  Each routine takes one line and returns the normalized line;
# lines that carry no information are turned into the empty string, and
# process_file skips lines that end up blank.
# The routines work on a lexical copy of their argument, so the global
# $_ of the caller is never modified.
my %sanitizer = (
    ".pl" => sub {
        my $line = shift;
        $line =~ s/^\s*[{}]\s*$//;              # blank out lines holding a single { or }
        $line =~ s/^\s*[}]\s*else\s*[{]\s*$//;  # blank out "} else {" lines
        return $line;
    },
    ".rb" => sub {
        my $line = shift;
        $line =~ s/^\s*end\s*$//;               # blank out lines holding only 'end'
        return $line;
    },
    ".lisp" => sub {
        my $line = shift;
        $line =~ s/;.*//;                       # strip comments
        $line =~ s/\)+$/\)/;                    # collapse trailing closing parens into one
        return $line;
    },
);
# Return the extension of the given file name, leading dot included
# (e.g. ".pl").  Only the last dot-separated component counts; a name
# without any dot yields the empty string.
sub extension_for {
    my ($path) = @_;
    # fileparse returns (name, directory, suffix); only the suffix matters
    my $suffix = (fileparse($path, qr/\.[^.]*/))[2];
    return $suffix;
}
# Read the file $fn line by line and invoke the callback $sub once for
# every significant line.
#
# Each line is chomped and passed through the sanitizer registered for
# the file's extension, if there is one.  Lines that are empty or
# whitespace-only after that step are skipped.  For the remaining lines
# the callback receives two arguments: the md5 hex digest of the
# normalized line and the line number in the file ($.).
#
# Dies when the file cannot be opened or closed.  Returns nothing.
sub process_file {
    my ($fn, $sub) = @_;
    open(my $fh, "<", $fn)
        or die "Can't open < $fn: $!";
    my $ext = extension_for($fn);
    # The sanitizer is the same for every line, so look it up only once
    my $sanitize = $sanitizer{$ext};
    # A lexical line variable keeps the caller's global $_ untouched
    while (my $line = <$fh>) {
        chomp $line;
        $line = $sanitize->($line) if $sanitize;
        next if $line =~ /^\s*$/;
        # $. still refers to $fh here: it is the last handle read from
        $sub->(Digest::MD5::md5_hex($line), $.);
    }
    # Close the handle itself, not the file name
    close $fh
        or die "Can't close $fn: $!";
    return;
}
# Build the gnuplot commands that send the plot to a PNG image instead
# of the interactive window.
#
# Takes the base name of the image; ".png" is appended to it.  Returns
# the empty string when the argument is missing or false, so the result
# can always be prepended to the plot command.
sub output_file {
    my ($output_file) = @_;
    return "" unless $output_file;
    return "set terminal png size 400,300; set output '$output_file.png';";
}
# Entry point: compare two files and plot their matching lines.
#
# Arguments: the first file, the second file and, optionally, the base
# name of a PNG image to write (see output_file).  Every pair of equal
# lines becomes one point whose x coordinate is the line number in the
# second file and whose y coordinate is the line number in the first
# file.  The points are written to a temporary file that gnuplot reads
# and that is removed once gnuplot has finished.
#
# Dies with a usage message when fewer than two files are given, and
# when the temporary file cannot be written or gnuplot fails.  Returns
# the (zero) status of the gnuplot run.
sub main {
    my ($first_file, $second_file, $image_name) = @_;
    die "Usage: $0 file1 file2 [output_image_file]\n"
        unless defined($first_file) && defined($second_file);

    # %h accumulates the lines of the first file: its keys are the md5
    # of a line and its values are lists of the line numbers where that
    # line appears.
    my %h = ();
    # @tuples stores all the points that have to be written in the
    # final plot.
    my @tuples = ();

    process_file($first_file, sub {
        # Add the line number to the list indexed by the md5 of the line
        my ($line, $lnum) = @_;
        push @{ $h{$line} }, $lnum;
    });
    process_file($second_file, sub {
        # For every line that also exists in the first file, make a
        # correspondence with each appearance (cartesian product).
        my ($line, $lnum) = @_;
        # The exists check avoids autovivifying an empty list per miss
        return unless exists $h{$line};
        for my $other_file_line (@{ $h{$line} }) {
            push @tuples, [$lnum, $other_file_line];
        }
    });

    # mkstemp is not among File::Temp's default exports, so it is called
    # by its fully qualified name.
    my ($f_handler, $name) = File::Temp::mkstemp("/tmp/tempXXXXXXXX");
    for my $tuple (@tuples) {
        print {$f_handler} $tuple->[0], " ", $tuple->[1], "\n";
    }
    # Flush everything to disk before gnuplot reads the file
    close $f_handler
        or die "Can't close $name: $!";

    my $file_cmd = output_file($image_name);
    # List form: no shell is involved, so nothing in the command string
    # is reinterpreted on its way to gnuplot.
    my $status = system("gnuplot", "-p", "-e", "$file_cmd plot '$name' notitle");
    my $system_error = $!;
    # gnuplot has read the data by the time it exits
    unlink $name
        or warn "Can't remove $name: $!";

    die "Can't run gnuplot: $system_error\n" if $status == -1;
    die "gnuplot failed (wait status $status)\n" if $status != 0;
    return $status;
}
# Run the script with the command line arguments.
main(@ARGV);