This repository has been archived by the owner on Dec 11, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
find_matching_str.R
61 lines (46 loc) · 1.83 KB
/
find_matching_str.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#' Score one string against every entry of a reference table
#'
#' Strips the characters `?`, `!` and `*` from `str_to_check`, then computes
#' the string distance between the cleaned string and the first column of
#' `database` using `stringdist::stringdist()`.
#'
#' @param str_to_check Value to look up; coerced with `as.character()`.
#'   Zero-length input, `NA`, `""`, `"-"` and `"NA"` (after stripping) are all
#'   treated as "nothing to match".
#' @param database Two-dimensional object whose first column holds the
#'   candidate strings. The scores are column-bound to it, so the renaming
#'   below presumes 2 columns for `db = "nmnh"` and 3 columns for
#'   `db = "gbif"` (inferred from the names assigned here; confirm against
#'   the callers).
#' @param method Distance method forwarded to `stringdist::stringdist()`.
#'   When it is `"jw"`, the call additionally passes `p = 0.1`.
#' @param no_cores Forwarded to `stringdist::stringdist()` as `nthread`.
#' @param db Selects the column names of the result: `"nmnh"` gives
#'   `match, ID, score`; `"gbif"` gives `match, no_records, ROWID, score`.
#'   Any other value leaves the names produced by `cbind()` untouched.
#' @return For an empty input, a one-row data frame with columns `match` and
#'   `score`, both `NA`. Otherwise `database` with the distances appended as
#'   the last column.
find_matching_str <- function(str_to_check, database, method = "osa",
                              no_cores = 2, db = "nmnh") {
  # Remove these chars
  this_str <- gsub("[?!*]", "", as.character(str_to_check))

  # Guard order matters: the length and NA checks come first so the sentinel
  # comparison is never evaluated on a zero-length or missing value (which
  # would otherwise hand `if` an NA condition and abort).
  if (length(this_str) == 0L || is.na(this_str) ||
        this_str %in% c("", "-", "NA")) {
    # message() rather than cat(): ends the line and can be suppressed.
    message("Empty string, returning NAs.")
    return(data.frame(match = NA, score = NA))
  }

  if (method == "jw") {
    # Jaro-Winkler distance; p is only passed for this method
    str_matches <- stringdist::stringdist(
      this_str, database[, 1],
      method = method, p = 0.1, nthread = no_cores
    )
  } else {
    str_matches <- stringdist::stringdist(
      this_str, database[, 1],
      method = method, nthread = no_cores
    )
  }

  # Add the scores next to the candidate strings
  results <- cbind(database, str_matches)
  if (db == "nmnh") {
    names(results) <- c("match", "ID", "score")
  } else if (db == "gbif") {
    names(results) <- c("match", "no_records", "ROWID", "score")
  }
  results
}
#' Score one string against a single-column reference table
#'
#' Strips the characters `?`, `!` and `*` from `str_to_check`, then computes
#' the string distance between the cleaned string and the first column of
#' `database` using `stringdist::stringdist()`.
#'
#' @param str_to_check Value to look up; coerced with `as.character()`.
#'   Zero-length input, `NA`, `""`, `"-"` and `"NA"` (after stripping) are all
#'   treated as "nothing to match".
#' @param database Two-dimensional object whose first column holds the
#'   candidate strings. Only two names (`match`, `score`) are assigned to the
#'   result, so a single-column `database` is presumed (inferred from the
#'   names assigned here; confirm against the callers).
#' @param method Distance method forwarded to `stringdist::stringdist()`.
#'   When it is `"jw"`, the call additionally passes `p = 0.1`.
#' @param no_cores Forwarded to `stringdist::stringdist()` as `nthread`.
#' @return For an empty input, a one-row data frame with columns `match` and
#'   `score`, both `NA`. Otherwise a data frame built from `database` with the
#'   distances appended, named `match` and `score`.
find_matching_str2 <- function(str_to_check, database, method = "osa",
                               no_cores = 2) {
  # Remove these chars
  this_str <- gsub("[?!*]", "", as.character(str_to_check))

  # Guard order matters: the length and NA checks come first so the sentinel
  # comparison is never evaluated on a zero-length or missing value (which
  # would otherwise hand `if` an NA condition and abort).
  if (length(this_str) == 0L || is.na(this_str) ||
        this_str %in% c("", "-", "NA")) {
    # message() rather than cat(): ends the line and can be suppressed.
    message("Empty string, returning NAs.")
    return(data.frame(match = NA, score = NA))
  }

  if (method == "jw") {
    # Jaro-Winkler distance; p is only passed for this method
    str_matches <- stringdist::stringdist(
      this_str, database[, 1],
      method = method, p = 0.1, nthread = no_cores
    )
  } else {
    str_matches <- stringdist::stringdist(
      this_str, database[, 1],
      method = method, nthread = no_cores
    )
  }

  # Add the scores next to the candidate strings
  results <- data.frame(cbind(database, str_matches))
  names(results) <- c("match", "score")
  results
}