forked from LangilleLab/microbiome_helper
-
Notifications
You must be signed in to change notification settings - Fork 0
/
merge_logfiles.R
executable file
·126 lines (92 loc) · 3.75 KB
/
merge_logfiles.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#!/usr/bin/env Rscript
# Read in package to read in command-line options.
library("optparse")
# Script version reported by the --version flag.
version <- "1.0"

# Command-line interface: input logfiles, their delimiter, optional per-file
# column prefixes, the output path, and a version flag.
option_list <- list(

  make_option(c("-i", "--input"), type="character", default=NULL,
              help="Comma-delimited list of logfiles to combine (required)." ,
              metavar="path"),

  make_option(c("-d", "--delim"), type="character", default="\t",
              help="Character specifying how logfiles are delimited (default: \"\t\").",
              metavar="path"),

  make_option(c("-n", "--names"), type="character", default=NULL,
              help=paste("Optional comma-delimited strings that should be added to distinguish",
                         "the columns by file (default=NULL)."),
              metavar="path"),

  make_option(c("-o", "--output"), type="character", default="combined_log.txt",
              help="Path to output file (default: \"combined_log.txt\").",
              metavar="path"),

  make_option(c("--version"), action = "store_true", type="logical", default=FALSE,
              help="Print out version number and exit.", metavar = "boolean")
)

opt_parser <- OptionParser(
  option_list = option_list,
  usage = "%prog [options] -i log1.txt,log2.txt",
  description = paste0(
    "Basic script to read in multiple logfiles and to merge them by rows.\n",
    "The first column of each file is assumed to contain the sample names")
)

opt <- parse_args(opt_parser)

# Print out version if --version flag set, then exit successfully. quit() is
# used rather than a silenced stop(): asking for the version is not an error,
# so the process should return status 0.
if (opt$version) {
  cat("Wrapper version:", version, "\n")
  quit(save = "no", status = 0)
}

if (is.null(opt$input)) {
  stop("paths to input logfiles need to be set.")
}

# A tab typed on the command line as \t arrives as the two characters
# backslash + t, which read.table() rejects as a separator. Translate it to a
# real tab so that "-d '\t'" behaves the same as the default.
if (identical(opt$delim, "\\t")) {
  opt$delim <- "\t"
}

# strsplit() returns a list with one element per input string, so the vector
# of file paths is in_files[[1]].
in_files <- strsplit(opt$input, ",")

# Check if only one file given.
if (length(in_files[[1]]) == 1) {
  stop("only one input file given.")
}

# Check if names option given and make sure same length as files if so.
# in_names is only defined when --names was supplied.
if (!is.null(opt$names)) {
  in_names <- strsplit(opt$names, ",")
  if (length(in_files[[1]]) != length(in_names[[1]])) {
    stop("different numbers of infiles and names given.")
  }
}
# Prepend all-NA rows to a data frame.
#
# Args:
#   rownames2add: row names to give the new rows (one new row per name).
#   df_in: data frame to extend.
# Returns:
#   A data frame with the same columns as df_in: the new NA-filled rows first,
#   followed by the original rows of df_in.
add_NA_rows <- function(rownames2add, df_in) {
  n_new <- length(rownames2add)
  n_col <- ncol(df_in)

  # Build the filler block as an NA matrix, then label it like df_in so that
  # rbind() lines the columns up.
  filler <- data.frame(matrix(NA, nrow = n_new, ncol = n_col))
  names(filler) <- names(df_in)
  row.names(filler) <- rownames2add

  rbind(filler, df_in)
}
# Merge two data frames column-wise, matching rows by row name.
#
# Args:
#   df1, df2: data frames whose row names identify the rows to be matched.
# Returns:
#   A data frame holding the columns of df1 followed by the columns of df2,
#   with one row per row name found in either input. Rows present only in df2
#   come first, followed by the rows of df1 in their original order. A row
#   that is missing from one input is all NA in that input's columns.
cbind_w_missing <- function(df1, df2) {

  df1_only <- setdiff(rownames(df1), rownames(df2))
  df2_only <- setdiff(rownames(df2), rownames(df1))

  if (length(df1_only) >= 1) {
    df2 <- add_NA_rows(df1_only, df2)
  }

  if (length(df2_only) >= 1) {
    df1 <- add_NA_rows(df2_only, df1)
  }

  # cbind() pairs data frame rows by position, not by row name. At this point
  # both frames hold the same set of row names but generally in different
  # orders, so df2 is reordered to follow df1 before binding. Integer indices
  # from match() are used to guarantee exact (never partial) name matching.
  df2 <- df2[match(rownames(df1), rownames(df2)), , drop = FALSE]

  cbind(df1, df2)
}
# Read every logfile, optionally prefix its column names, merge all tables by
# sample name, and write the combined table.
input_paths <- in_files[[1]]

# One data frame per input file. The first column of each file becomes the row
# names (sample names). When --names was given, each file's columns are
# prefixed with the matching name so they can be told apart after merging.
tables <- lapply(seq_along(input_paths), function(i) {
  tab <- read.table(input_paths[i], header = TRUE, quote = "", row.names = 1,
                    sep = opt$delim, stringsAsFactors = FALSE)

  if (!is.null(opt$names)) {
    colnames(tab) <- paste(in_names[[1]][i], colnames(tab), sep = ".")
  }

  tab
})

# Fold the tables together left to right: ((t1 + t2) + t3) + ...
combined <- Reduce(cbind_w_missing, tables)

# Put the sample names in an explicit first column. The column is prepended by
# position rather than re-selected by name: selecting by name returns the first
# match for every duplicated name, which would silently replace later files'
# columns with copies of earlier ones when logfiles share column names (or
# contain a column called "sample"). make.unique() then yields the usual
# "name", "name.1", ... header for any duplicates.
combined <- data.frame(sample = rownames(combined), combined,
                       check.names = FALSE, stringsAsFactors = FALSE)
colnames(combined) <- make.unique(colnames(combined))

write.table(x = combined, file = opt$output, quote = FALSE, sep = opt$delim,
            col.names = TRUE, row.names = FALSE)