-
Notifications
You must be signed in to change notification settings - Fork 0
/
params_CB315vMET.R
228 lines (182 loc) · 10.4 KB
/
params_CB315vMET.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
### ========= input parameters section ===========
## rarely changed params:
# === image format parameters ===
# the suffix <FRM> of saveImageAs<FRM> variable stands for
# an image format. The following list is a subset
# of all formats supported by a2ping.
# Here is a complete list of supported a2ping outputs: BMP EPS GIF JPEG PBM PCL5 PDF PDF1 PGM PNG PPM PS TIFF XPM XWD markedEPS markedPS
# If you think that I've ommited some important format,
# please let me know or simply add the appropriate flag (and add it to imageFormat table in core.R)
saveImageAsPDF <- TRUE
saveImageAsPS <- TRUE # this needs to be TRUE for the latex report file
saveImageAsPBM <- FALSE
saveImageAsPGM <- FALSE
saveImageAsPPM <- FALSE
saveImageAsPNG <- FALSE
saveImageAsXWD <- FALSE
saveImageAsBMP <- FALSE
saveImageAsTIFF <- FALSE
saveImageAsJPEG <- FALSE
saveImageAsGIF <- FALSE
saveImageAsXPM <- FALSE
doAbtTables <- FALSE
useHelv <- FALSE
dumpAbuTables <- TRUE
dumpAnnTbl <- FALSE # use annotate_tables.py instead of g3anntbl.csv
dumpBinaryTables <- TRUE
dumpDistTables <- TRUE
#within_between_analysis <- TRUE
alternate_pch_Thld <-1000
## environment/user specific config:
# === output directory ===
# try to avoid dashes in directory names, they are interpolated into dots
dataDir <- "/Users/lalithaviswanathan/Documents/Project/Cubi_LChesnel_EPAN12_0120/PhyCA.Stats.Out" # directory for storing all PhyCA-Stats output data
sysTime <- format(Sys.time(), "%H.%M.%S") # directory for storing all this particular run of PhyCA-Stats
outDir <- paste(dataDir,"/",Sys.Date(),"_",sysTime,".PhyCA.Stats.Out",sep="")
outDir <- gsub("-",".",outDir)
nCPUs <- 3 # number of CPUs
# Note, that right now, snowfall parallel set up for each fragment of the pipeline is done separately. In production, I would consider only one initialization.
# === binaries ===
binExtractAnnotation <- "/Users/lalithaviswanathan/Documents/Project/PhyCA_Stats/code/extractAnnotation.pl" # full path to extractAnnotation.pl script
binComUnderChars <- "/Users/lalithaviswanathan/Documents/Project/PhyCA_Stats/code/commentOutUnderscoreChars.sh" # shell script to comment out underscore characters in latex tabels
binExtHeader <- "/Users/lalithaviswanathan/Documents/Project/PhyCA_Stats/code/extractHeader.pl" # path to perl script extracting headers of data files (formatted for unifrac)
unifracBin <- "macqiime beta_diversity.py" # path to qiime pkg for unifrac, works with qiime svn revision 2507
# Note that ps2pdf is used for ps-to-pdf conversion
# and a2ping for conversion to other formats
# Right now it is assumed that ps2pdf and a2ping are in the PATH variable
# === script paths ===
coreLibPath <- "/Users/lalithaviswanathan/Documents/Project/PhyCA_Stats/code/coreLib.R"
# === input files ===
otuAnnFile <- "/Users/lalithaviswanathan/Documents/Project/Cubi_LChesnel_EPAN12_0120/abund_rep_set.GG98_FL.taxonomy.80.csv" # combo of previous seqTaxFile and seqAnnFile
pathToTree <- "/Users/lalithaviswanathan/Documents/Project/Cubi_LChesnel_EPAN12_0120/gg_97_otus_4feb2011.tre"
## project specific params:
# === input files ===
bFile <- "/Users/lalithaviswanathan/Documents/Project/Cubi_LChesnel_EPAN12_0120/ref97_gg98_filtered.txt" # FULL path to presence/absence table
aFile <- "/Users/lalithaviswanathan/Documents/Project/Cubi_LChesnel_EPAN12_0120/ref97_gg98_filtered.txt" # FULL path to abundance table
mFile <- "/Users/lalithaviswanathan/Documents/Project/Cubi_LChesnel_EPAN12_0120/metadata.csv" # FULL path to metadata table; check the format of this file - get rid of OTU_ID field
# It is assumed that the second column of mFile, short_name, consists of short names of the files - no longer than 10 characters. Also, metadata column names must start with letters not numbers, else R puts an "X" in front.
unequal.scale.profiles <- T # profiles of abundance vs. sample for an OTU are scaled to that OTU's max if TRUE
## phyca-stats run params here to end
# R needs to know types of metadata columns. In particular, categorical variables have to to be identified by the user.
# There are two ways to specify categorical variables
# 1. write the names of categorical variabls into a comma delimeted file (mmFile)
# 2. specify the names of categorical variabls here (mFactors)
# mFactors <- NULL # set mFactors to NULL if the user does not specify categorical variables
mFactors <- c('CB315vMET');
# here define which factor(s) will be used for filtering.
mFilterFactors <- c('CB315vMET') # what is actually used in the anylsis only in DataRedn3 or 5 l 56 ff
# in doDataRedn6 the 'subjects' and 'pairings' columns must be specified here
# they must also be specified in the mFactors list
# currently the t.test is done but wilcox is also available as is FDR adjustment (paramters and conditional statements need to be added)
# subjectsFactor <- c('PatientID') # s1, s2, etc
# pairingsFactor <- c('CvCtrl') # pre, post etc (must be only two levels) # turn on for filter 6 and place in factor
# For bioenv, Mantel and Adonis tests need a selection of continuous
# variables.
# There are two options:
# 1. User specifies only categorical variables and all others are
# treated as continuous (unless R detects a character variable and
# excludes it from the list of continuous variables) This is a
# good choice if the user wants to use bioenv() to identify
# categorical variables associated with distances of environmental
# variables.
# 2. User specifies continuous variables that she wants to use in
# Mantel and Adonis tests.
# Note that contVar has to be defined, even if you don't want to specify any
# continuous variables. In this case, just set
# contVar <- NULL
contVar <- c('gDNAlab')
# === normalization paramters ===
normalize <- T # normalization flag, if set to TRUE, the normalization of all data reduction tables is performed.
normTarget <- 1e6 # target normalization value (used only for totalSum)
whichNorm <- "totalSum" #"totalSum", "rankNorm", "uqNorm"
# === flags for different data reduction methods ===
# Data reduction methods are numbered as in SOW document.
doDataRedn1 <- F
Redn1SampleRemoval <- FALSE # if a sample has zero OTUs in bt1 then remove sample
# may need to set to true if you want bt1 ordinations/clusterings if data set had a clean negative control
# this can create problems with sample labels
doDataRedn2 <- F
doDataRedn3 <- F
permute.filter3 <- TRUE
filter3.num.permutes <- 100
doDataRedn4 <- F
rangeQtThld <- 0.9 # quantile treshold for the max/min ratios; only taxons with the max/min ratios in the 90% quantile will be selected in the data reduction method #4
doDataRedn5 <- TRUE
useANOVA <- TRUE
useWelch <- F
useKW <- F
pValThld <- .05 # threshold of ANOVA type p-values. pValThld <- FALSE will auto-select >= 50 taxa in filter 5
filterMin <- 10 # if pValThld <- FALSE, phyca-stats will select >= this many OTUs, rounding the p value to the nearest greater power of 10
permute.filter5 <- 1 # 0=off, 1=shuffle samples into groups, 2=shuffle all abundandance data
filter5.num.permutes <- 20
doDataRedn6 <- F
doDataRedn7 <- F
doDataRedn8 <- F
doDataRedn9 <- F
# 9 in one per correlated cluster per species/subfamily
# === parameters for distance methods ===
createSorensenDist <- F
createUnifracDist <- T
createWgtdUnifracDist <- T
createBrayCurtisDist <- F
createEcldnDist <- F
# === ordination parameters ===
doPCoA <- T # Principal Coordinate Analysis
doPCA <- FALSE # Principal Component Analysis
doNMDS <- T # Non metric Multi-Dimensional Scaling
hexbinThld <- 330 # threshold for switching to a hexbin plot in ordination scatterplot plotOrd2dAllFtrs() routine
# === biplots ===
doEnvfit <- FALSE # used for biplots
contVar_Envfit <- 'nseqs' # just pick one
ordTrellis1 <- FALSE # if TRUE, single categorical variable trellis plots are generated
ordTrellis2 <- FALSE # if TRUE, two categorical variables trellis plots are generated
# === hierarchical clustering parameters ===
doHCNN <- FALSE # hierarchical clustering using single linkage
doHCAN <- T # hierarchical clustering using average linkage
doHCFN <- FALSE # hierarchical clustering using complete linkage
doHCWard <- FALSE # hierarchical clustering using Ward linkage
sepThickness <- 3;
# === singnificance testing parameters ===
# There are three significance type of tests, all from vegan package:
# bioenv(), mantel() and adonis(). Here are the flags that control
# which of them will be run.
doBioEnv <- FALSE
doMantel <- FALSE
doAdonis <- T
doAdonisPairs <- F
# params used for bioenv and mantel only
# The dissimilarity index used for community data in 'vegdist'
distIndex <- "bray" # possible values (specified in the appropriate
# pull down menu) are: manhattan, euclidean,
# canberra, bray, kulczynski, jaccard, gower,
# altGower, morisita, horn, mountford, raup ,
# binomial or chao.
# used for bioenv and mantel only
# The correlation method used in 'cor', 'bioenv' and 'mantel'subject of unknown paternal ethnic origin
corMethod <- "spearman" # possible values (specified in the
# appropriate pull down menu) : pearson,
# kendall, spearman
# === PAM ===
doPAM <- F # run PAM classifier for each categorical variable to classify samples with missing data within the selected variable
pamMinNoORFs <- 10
pamMaxNoORFs <- 20
pamfilter <- 1 # usually filter 9 is used (feb 2012)
## in PAM plots groups will be ordered alphabeticaly. So if you want to see "pre" before "post" use "1_pre" and "2_post"
# === iTOL ===
# itol requires filter 5, make sure doDataRedn5 <- TRUE
doITOL <- FALSE
itolBaseline <- 'healthy'
itolTaxonLevel <- 'family'
# === compositional, horizontal barcharts ===
doTaxonPropBarCharts <- F # generate bar charts of proportions at different taxonomic levels
doTaxonAbundancePropBarCharts <-TRUE
nTaxaInPropBarCharts <- 8 # maximum number of taxa to display in these bar charts.3
# there will be an extra black category for 'Other', for a total of
# nTaxaInPropBarCharts + 1 elements per bar
dumpBarChartTables <- TRUE
# === richness plots ===
doTaxonRichnessCharts <- F
dumpRichnessChartTables <- FALSE
richnessTaxonLevels <- c("phylum","class","order","family","genus","species")
# full options are c("phylum","class","order","family","genus","species")
### ========= end of input parameters section ===========##