-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathftdmp-reduce-table-redundancy-by-bucketing
executable file
·144 lines (115 loc) · 3.14 KB
/
ftdmp-reduce-table-redundancy-by-bucketing
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/bin/bash
# Write the usage text to standard error and terminate the script.
# Takes no arguments; always exits with status 1.
print_help_and_exit()
{
  # The delimiter is quoted, so the text below is emitted verbatim
  # (no variable or command expansion inside the here-document).
  cat 1>&2 << 'EOF'
'ftdmp-reduce-table-redundancy-by-bucketing' groups column values and leaves only first-occuring unique rows
Options:
--bucketing-columns string * space-separated list of column names to analyze
--bucket-counts string * space-separated list of bucket counts, one per bucketing column
--max-presence number max number of occurrences for a bucketed vector, default is 1
--help | -h flag to display help message and exit
Standard input:
space-separated table
Standard output:
space-separated table
Examples:
cat scoring_results_table.txt | ftdmp-reduce-table-redundancy-by-bucketing --bucketing-columns "x y z a1 a2 a3" --bucket-counts "20 20 20 20 20 20"
EOF
  exit 1
}
# An absent (or empty) first argument means there is nothing to do:
# show the usage text, which also ends the script.
[[ -n "$1" ]] || print_help_and_exit
# When FTDMPDIR is not provided by the caller, point it at the directory that
# holds this script and put that directory, plus its bundled
# 'core/voronota/expansion_js' subdirectory, at the front of PATH.
if [[ -z "${FTDMPDIR}" ]]; then
  FTDMPDIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
  export FTDMPDIR
  PATH="${FTDMPDIR}/core/voronota/expansion_js:${FTDMPDIR}:${PATH}"
  export PATH
fi
# The actual table processing is delegated to 'voronota-js'; stop early when
# that executable cannot be found through PATH.
if ! command -v voronota-js &> /dev/null; then
  echo >&2 "Error: 'voronota-js' executable not in binaries path"
  exit 1
fi
# Bash maintains COLUMNS itself: while the 'checkwinsize' shell option is on,
# the shell may overwrite COLUMNS (and LINES) with the terminal size after an
# external command finishes. This script keeps the bucketing column names in
# COLUMNS, so the option is switched off to keep that value from being clobbered.
shopt -u checkwinsize

# Settings filled in from the command line. The two lists have no default and
# are checked for emptiness later; the other two start with their defaults.
COLUMNS=""
BUCKETCOUNTS=""
MAX_PRESENCE="1"
HELP_MODE="false"

# Walk over the arguments: a value-taking option consumes two words, a flag one.
# The loop condition is an arithmetic comparison (inside [[ ]] the '>' operator
# compares strings, not numbers).
while (( $# > 0 ))
do
  OPTION="$1"
  OPTARG="$2"
  shift
  case "$OPTION" in
  --bucketing-columns | --bucket-counts | --max-presence)
    # A value-taking option given as the very last word has no value at all;
    # report that instead of silently storing an empty string.
    if (( $# == 0 ))
    then
      echo >&2 "Error: missing value for command line option '$OPTION'"
      exit 1
    fi
    case "$OPTION" in
    --bucketing-columns)
      COLUMNS="$OPTARG"
      ;;
    --bucket-counts)
      BUCKETCOUNTS="$OPTARG"
      ;;
    --max-presence)
      MAX_PRESENCE="$OPTARG"
      ;;
    esac
    shift
    ;;
  -h | --help)
    HELP_MODE="true"
    ;;
  *)
    echo >&2 "Error: invalid command line option '$OPTION'"
    exit 1
    ;;
  esac
done
# A help request takes priority over every other check.
if [[ "$HELP_MODE" == "true" ]]; then
  print_help_and_exit
fi

# Both option lists are mandatory; an empty value is reported as an error.
[[ -n "$COLUMNS" ]] || {
  echo >&2 "Error: no columns specified"
  exit 1
}

[[ -n "$BUCKETCOUNTS" ]] || {
  echo >&2 "Error: no bucket counts specified"
  exit 1
}
# Create a private scratch directory for the intermediate files.
# The assignment is kept separate from 'readonly' so that a failing 'mktemp'
# is detected; otherwise TMPLDIR would be empty and the scratch files would be
# addressed relative to '/'.
TMPLDIR="$(mktemp -d)"
if [ "$?" -ne 0 ] || [ -z "$TMPLDIR" ] || [ ! -d "$TMPLDIR" ]
then
  echo >&2 "Error: failed to create temporary directory"
  exit 1
fi
readonly TMPLDIR

# Remove the scratch directory on every exit path. The trap body is
# single-quoted so the path is expanded, quoted, when the trap runs; '--' stops
# option parsing and ':?' refuses to run 'rm' on an empty path.
trap 'rm -rf -- "${TMPLDIR:?}"' EXIT
# Break the column list into one name per line. Every whitespace character
# becomes a newline and 'grep' then drops the empty lines left by runs of
# whitespace. 'printf' is used instead of 'echo' so a value such as '-n' is
# treated as data, 'tr' avoids the GNU-only '\s' and '\n' escapes of 'sed',
# and 'grep -E' replaces the obsolescent 'egrep'.
printf '%s\n' "$COLUMNS" | tr '[:space:]' '\n' | grep -E '.' \
  > "${TMPLDIR}/columns_info"

N_COLUMNS="$(wc -l < "${TMPLDIR}/columns_info")"

# Same tokenisation for the bucket counts.
printf '%s\n' "$BUCKETCOUNTS" | tr '[:space:]' '\n' | grep -E '.' \
  > "${TMPLDIR}/bucketcounts_info"

N_BUCKETCOUNTS="$(wc -l < "${TMPLDIR}/bucketcounts_info")"

# Each bucketing column needs exactly one bucket count.
if [ "$N_COLUMNS" -ne "$N_BUCKETCOUNTS" ]
then
  echo >&2 "Error: numbers of columns and bucket counts do not match"
  exit 1
fi

# Pair every column name with its bucket count, one pair per line.
paste "${TMPLDIR}/columns_info" "${TMPLDIR}/bucketcounts_info" > "${TMPLDIR}/full_columns_info"
# Assemble the script handed to voronota-js. The pieces are printed one per
# line, the newlines are then turned into spaces so that the whole call ends up
# on a single line, and 'sed' puts '\n' at the end of that line.
{
  printf '%s\n' \
    "voronota_reduce_table_redundancy_by_bucketing('-input-file _stdin -output-file _stdout" \
    "-bucket-columns"
  # First field of every pair: the column names.
  awk '{print $1}' "${TMPLDIR}/full_columns_info"
  printf '%s\n' "-bucket-counts"
  # Second field of every pair: the bucket counts.
  awk '{print $2}' "${TMPLDIR}/full_columns_info"
  printf '%s\n' "-max-presence"
  echo "$MAX_PRESENCE"
  printf '%s\n' \
    "');" \
    "voronota_assert_full_success('Failed to reduce table redundancy by bucketing');"
} | tr '\n' ' ' | sed 's/$/\n/' > "${TMPLDIR}/processing_script"
# Store the whole of standard input in a file so that it can be checked for
# emptiness before it is processed.
cat > "${TMPLDIR}/input"

[[ -s "${TMPLDIR}/input" ]] || {
  echo >&2 "Error: no input data in stdin"
  exit 1
}

# Run the generated script on the stored table, capturing the result in a file.
voronota-js --no-setup-defaults "${TMPLDIR}/processing_script" \
  < "${TMPLDIR}/input" \
  > "${TMPLDIR}/output"

# An empty result file is reported as a failure.
[[ -s "${TMPLDIR}/output" ]] || {
  echo >&2 "Error: no output produced"
  exit 1
}

# Hand the reduced table over to standard output.
cat "${TMPLDIR}/output"