-
Notifications
You must be signed in to change notification settings - Fork 34
/
Copy pathFilterDate.pl
executable file
·123 lines (117 loc) · 3.58 KB
/
FilterDate.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# Perl script to filter a column based on a date
# Modified 2021-01-08 to allow for filtering before and after a certain date
use strict;
use Getopt::Long;
use warnings;
use POSIX qw(strftime);
#use DateTime qw();
#my $date =DateTime->now->strftime('%Y%m%d');
use Time::ParseDate;
use Time::Piece;
my ($help,$in,$out,$before,$after,$colname);
&GetOptions(
'in:s' => \$in, # text-tab file formatted with anonymisation
'out:s' => \$out, # filtered output file
'before:s' => \$before,
'after:s' => \$after,
'colname:s' => \$colname, #the column name that contains the date to fiter
"help:s" => \$help,
);
if ($help||!$in||!$out||!$colname){
print "Usage: perl \n";
print " -in <txt> - text-tab delimited (.tsv) or comma-separate (.csv) file anonymised\n";
print " -out <txt> - tsv or csv as input\n";
print " -after <txt> - keep everything after this date cut-off\n";
print " -before <txt> - keep everything before this date cut-off\n";
print " -colname <txt> - the column name that contains the dates\n";
print " -help - Get this help\n";
exit();
}
open(OUT,">$out")||die "Can't open $out\n";
my ($after_cutoff,$before_cutoff);
if ($after){
$after_cutoff=parsedate($after);
print "Keeping everything after the following date: $after\n";
}
if ($before){
$before_cutoff=parsedate($before);
print "Keeping everything before the following date: $before\n";
}
my $colnumber;
if ($in=~/\.tsv$/){
open(IN,"<$in")||die "Can't open $in\n";
my $header=<IN>;
chomp($header);
print OUT "$header\n";
my @colnames=split(/\t/,$header);
for (my $i=0; $i<scalar(@colnames);$i++){
if ($colname eq $colnames[$i]){
$colnumber=$i;
}
}
print "Using column $colnumber => $colname\n";
}elsif ($in=~/\.csv$/){
open(IN,"<$in")||die "Can't open $in\n";
my $header=<IN>;
chomp($header);
print OUT "$header\n";
my @colnames=split(/,/,$header);
for (my $i=0; $i<scalar(@colnames);$i++){
if ($colname eq $colnames[$i]){
$colnumber=$i;
}
}
print "Using column $colnumber => $colname\n";
}else{
print "The input does not have the appropriate file extension to be processed\n";
exit;
}
my $exc=0;
my $inc=0;
while(<IN>){
chomp($_);
my @col;
if ($in=~/\.tsv$/){
@col=split(/\t/,$_);
}elsif($in=~/\.csv$/){
@col=split(/,/,$_);
}
my $tp;
if ($col[$colnumber]=~/\d{4}-\d{2}-\d{2}/){# best format
#print "Best format >$col[$colnumber]<\n";
#2020-09-24
#2020-09-22
$tp = parsedate($col[$colnumber]);
}elsif ($col[$colnumber]=~/^\d{4}\-\d{2}$/){
$tp = parsedate("$col[$colnumber]"."-01");
#print "Assuming $col[$colnumber] is "."$col[$colnumber]"."-01"."\n";
}elsif ($col[$colnumber]=~/^(\d{4}\-\d{2})-XX$/){
$tp = parsedate("$1"."-01");
#print "Assuming $col[$colnumber] is "."$1"."-01"."\n";
}else{# try your best to parse
#print "Trying my best to parse >$col[$colnumber]<\n";
$tp = parsedate($col[$colnumber]);
}
#print "$col[0]\t$col[$colnumber]\t$tp\t$before\t$before_cutoff\n";
if (!$tp){
print "$_\n";
}elsif ($after && $before){
if ($tp>=$after_cutoff && $tp<=$before_cutoff){
my $date = localtime($tp)->strftime("%m/%d/%Y");
#print $col[$colnumber],"\t",$tp,"\t",$date,"\n";
$inc++;
print OUT "$_\n";
}
}elsif ($after && $tp>=$after_cutoff && !$before){
$inc++;
print OUT "$_\n";
}elsif (!$after && $before && $tp<=$before_cutoff){
$inc++;
print OUT "$_\n";
}else{
#print "$col[$colnumber] is excluded\n";
$exc++;
}
}
print "$exc records are excluded\n";
print "$inc records are included\n";