-
Notifications
You must be signed in to change notification settings - Fork 0
/
Lookup.R
81 lines (64 loc) · 2.16 KB
/
Lookup.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
library(dplyr)
library(tidyr)
library(stringr)
library(jsonlite)
# Build the "top values" table for one column of `df`.
#
# Returns a two-column tibble (`value`, `frac`), both character, holding the
# most frequent values of `col`. A single all-NA row is returned when `mask`
# is TRUE or when no value occurs more than `min_count` times.
.top_values <- function(df, col, min_count, max_values, mask) {
  hidden <- tibble(value = NA_character_, frac = NA_character_)
  if (mask) {
    return(hidden)
  }
  # The column is copied into a fixed name (`value`) so that profiled columns
  # that happen to be called `n` or `frac` cannot collide with the helper
  # columns created below.
  top <- df %>%
    group_by(value = .data[[col]]) %>%
    summarise(n = n(), .groups = "drop") %>%
    filter(n > min_count) %>%
    # NOTE: `frac` is the share among the values that survived the filter
    # above, not the share of all rows in `df`.
    mutate(frac = n / sum(n)) %>%
    arrange(desc(frac)) %>%
    head(max_values) %>%
    mutate(frac = signif(frac, 2)) %>%
    select(value, frac)
  if (nrow(top) < 1) {
    return(hidden)
  }
  top %>% mutate(across(everything(), as.character))
}

#' Write a JSON scan report describing the columns of an RDS data frame
#'
#' Reads `input_file` with `readRDS()`, profiles every column (pillar type
#' summary, numeric range, most frequent values) and writes the result as
#' pretty-printed JSON to `<out_folder>/<input file name without ext>.json`.
#'
#' Tables with more than `sample_size` rows are profiled on a random sample
#' of `sample_size` rows; the reported `nrows` is always the full row count,
#' while `min`, `max` and the value fractions come from the sample.
#'
#' @param input_file Path to an `.rds` file containing a data frame.
#' @param out_folder Folder the JSON file is written to. It is created if it
#'   does not exist. A trailing path separator is optional.
#' @param sample_size Maximum number of rows that are profiled.
#' @param min_count Values occurring `min_count` times or fewer are left out
#'   of the value table.
#' @param max_values Maximum number of values listed per column.
#' @param id_cols Names of columns whose values are never listed (their value
#'   table is a single NA row). Presumably identifier columns -- confirm.
#' @return The path of the JSON file that was written, invisibly.
generate_report <- function(input_file, out_folder,
                            sample_size = 10000,
                            min_count = 10,
                            max_values = 5,
                            id_cols = "EAVE_LINKNO") {
  df <- readRDS(input_file)
  stopifnot(is.data.frame(df))
  nrows <- nrow(df)
  if (nrows > sample_size) {
    df <- df %>% sample_n(sample_size)
  }

  cols <- list()
  for (col in colnames(df)) {
    values <- df[[col]]
    type <- pillar::type_sum(values)

    # Range is reported for plain numeric columns only; everything else (and
    # numeric columns without a single non-missing value) gets ''.
    col_min <- ''
    col_max <- ''
    if (type %in% c('dbl', 'int') && !all(is.na(values))) {
      col_min <- min(values, na.rm = TRUE)
      col_max <- max(values, na.rm = TRUE)
    }

    # A column that cannot be summarised (e.g. a type that cannot be grouped
    # or converted to character) is left out of the report, with a warning
    # instead of a silent skip.
    top <- tryCatch(
      .top_values(df, col, min_count, max_values, mask = col %in% id_cols),
      error = function(e) {
        warning(
          "Skipping column '", col, "' of ", input_file, ": ",
          conditionMessage(e),
          call. = FALSE
        )
        NULL
      }
    )
    if (is.null(top)) {
      next
    }

    cols[[col]] <- list(
      name = col, label = '', description = '', type = type,
      min = col_min, max = col_max, values = top
    )
  }

  finfo <- file.info(input_file) %>%
    as_tibble() %>%
    select(size, mtime, ctime, atime, uname, grname)
  report <- list(
    meta = list(file = input_file, finfo = finfo, nrows = nrows),
    columns = cols
  )

  # Make sure the destination exists and ends in a separator, so the file
  # lands inside the folder whether or not the caller passed a trailing slash.
  if (nzchar(out_folder) && !dir.exists(out_folder)) {
    dir.create(out_folder, recursive = TRUE, showWarnings = FALSE)
  }
  if (nzchar(out_folder) && !grepl("[/\\\\]$", out_folder)) {
    out_folder <- paste0(out_folder, "/")
  }
  out_path <- paste0(
    out_folder, tools::file_path_sans_ext(basename(input_file)), '.json'
  )

  writeLines(toJSON(report, pretty = TRUE, auto_unbox = TRUE), out_path)
  invisible(out_path)
}
# Batch driver ----
# Writes one JSON scan report per .rds lookup file found in `in_folder`, plus
# one for the serology extract, into `out_folder`.
out_folder <- "/home/calumm09/DataDictionary/scan_reports/"
in_folder <- "/conf/EAVE/GPanalysis/data/lookups/"
# `pattern` is a regular expression, not a glob: anchor on the literal
# ".rds" extension so that names merely containing "rds" are not picked up.
files <- list.files(path = in_folder, pattern = "\\.rds$", full.names = TRUE)
print(files)
#generate_report("/conf/EAVE/GPanalysis/data/combined_qcovid_demographics.rds",out_folder)
generate_report(
  "/conf/EAVE/GPanalysis/data/serology_snbts_july22_v3.rds",
  out_folder
)
#data = readRDS('/home/calumm09/DataDictionary/scan_reports/serology_snbts_july22_v3.RData')
for (fname in files) {
  generate_report(fname, out_folder)
}