-
Notifications
You must be signed in to change notification settings - Fork 3
/
config_restaurant.txt
executable file
·46 lines (29 loc) · 1.67 KB
/
config_restaurant.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
##############################################################
# Any line containing `#' will be treated as a comment.
# The Typical Format is Variablename = Value
###############################################################
# Choose K and L wisely: start with a higher value of K and work down. Typically in the range 1-5 for light pruning;
# for rigorous pruning, use larger values of K.
K=1
# A larger L increases the recall but also reports more pairs. Less sensitive than K.
L=4
# ngrams length
shingles=2
# Threshold: a pair is only reported if found in at least this many buckets (cancels random noise). If you are missing pairs, decrease this.
Thresh=3
# Give the input CSV file. The first line will be ignored (assumed to be the header). Every other line will be treated as a record.
# The line number of a record will be its ID. That is, the first line after the header is treated as the record with ID 1, etc.
Input=data/restaurant.csv
#Output File: this will contain a pair of record IDs in each line indicating a possible match.
Output=restaurant_pair.csv
##############################################################################
#These are advanced parameters depending on memory
##############################################################################
# Number of cells in each bucket. Decrease if it runs out of memory.
BucketSize=32
# Number of buckets in each table is 2^{this number}. Too small will never finish. Decrease if it runs out of memory. Larger is better. Must be < 27.
RangePow=20
# Increase if MinHashing takes a lot of time. Must be a power of 2.
MinHashChunkSize=32
# Processes this many records in parallel; larger is faster. Decrease if it runs out of memory.
Chunk=500000