-
Notifications
You must be signed in to change notification settings - Fork 0
/
first_round.R
191 lines (176 loc) · 8.65 KB
/
first_round.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
# To run the workflow, user needs to enter the value of a number of variables, especially the location of the files and their names.
# There is a parameter set for each part of the work flow
# /!\ Never change the *names* of the variables /!\
#######################################################################
# parameter for graphic outputs
# Set to 'FALSE' in case an error occurs when attempting to draw a plot
# problem encountered when combining apple computers and unix workstation
draw_plot=TRUE
# Enter here free comments relative to the present dataset
comment0 <-paste("Data curation for the set of 240+2 individuals from plates 1 to 6. ",
"data are in",
"\\scriptsize D://Mes donnees/sequencage/2017/article sequence/Tassel_to_J/data/",
"\\normalsize" ,sep="\n")
#suggestions for comment variables.
# the following sample text shows how add some latex formatting code to a multiline comment texts.
#•This is normal latex code except that the backslash needs to be doubled,
# otherwise it will be considered as part of a special character.
# comment0<-paste("this is a sample text. \\textit{this is in italics}, this is not. ",
# "\\textbf{some bold text} ",
# "\\footnotesize a bit smaller ",
# "\\scriptsize even smaller ",
# "\\tiny and this is very very small ",
# " ",
# "\\normalsize back to normal font",sep="\n")
# This allows the above to be printed as formatted text in the output document.
#
##########################################################
# Parameters for step 1
##########################################################
# Tassel was executed and data were filtered. The FILTER.tab contains genotyping data with coverage information.
# This step can take one or more input files (eg. because data were treated in parallel).
# For each input file, a genotypic file and a marker info file are created.
##########################################################
# Enter here your free comments for step 1
comment1 <- ""
# place of initial data files
where_in <- "data/set1"
# name of input file.
# Note: to save memory, several files can be passed with syntax c(file1,file2, file3)
file.in <- "first_round.tab"
#place of files produced at step 1
where_out_1 <- "data/set1/step1"
##name of output genotype file
file.value <- "genot_first_round.txt"
# name of marker info file
# Note: to save memory, several files can be passed with syntax c(file1,file2, file3)
file.ident <- "markid_first_round.txt"
# Directories containing the functions
where_functions <- "functions"
#p.value cut-off Suggested something like 1/(2* number of rows). Negative values cancel the filter
P.value.min <- -1
# Cross type: (only BC1 is implemented currently)
type <- "BC1"
# Name the parents. Two values are required (In BC1, the heterozygous parent comes first).
# if one parent was not geotyped, set the correspondig flag to FALSE
parent <- c("P3637","P49")
# flags to activate/deactivate tests one one parent
# at least one of the following should be TRUE
has.hetero <- TRUE # heterozygous parent
has.homo <- TRUE # homozygous parent
#list of excluded genotypes
others <- NULL
#others<-c("DA10D","DegNour")
# names of the columns in the filter.tab file.
marker_column <- "marker"
scaffold_column <- "scaffold"
position_column <- "position"
# locus_column <- "marker"
# scaffold_column<- "chomosome"
# position_column <- "position"
##########################################################
# Parameters for step 2
##########################################################
# At this stage, the genotypes are assigned to the parents.
# Only loci whose segregation is compatible with the mating plan are retained.
# Statistics are computed on the disjunction and the results are written into the locus info file.
# The columns of the genotype table are reordered. First the parents, then the PROGENIES in order of increasing number of missing data.
# All genotyping files and all locus information file are merged at this step.
##########################################################
# uses where_out_1, file.value, file.ident
# Enter here your free comments
comment2 <- ""
# name of input file
# place of files produced at step 2
where_out_2 <-"data/set1/step2"
file.value.2 <- "genot_first_round_2.txt"
file.ident.2 <- "markid_first_round_2.txt"
# Thresholds
# discard individuals having less than 100*cut.ind % good data.
# This parameter should be liberal since more tests will come later.
cut.ind <- 0.5
# The following thresholds flag markers for elimination at a later stage
# minimum frequency of the minority allele
balanced <- .15
# minimum proportion of valid data (excluding missing and unexpected alleles)
few_good_data <- 0.7
# maximum proportion of unexpected alleles
many_wrong <- 0.1
##########################################################
# Parameters for step 3
##########################################################
# Codes the genotype file in a form usable by mapping software. Parents are removed from the output file.
##########################################################
# uses where_out_2, file.value.2 and file.ident.2 for data entry
# Enter here your free comments
comment3 <- ""
#place of final files
where_out_3 <- "data/set1/step3"
# name of coded genotype file
file.value.3 <- "genot_first_round_3.txt"
#name of updated marker info file
file.ident.3 <- "markid_first_round_3.txt"
# Use a four letter code to describe the genotypes (preferred for Joinmap: "ABHU")
# letters are in upper case (at a letter stage, lower cases will be used to identify corrections)
# first letter: a homozygous parental genotype
# Second letter: the other homozygous parental genotype (not used in BC1)
# third letter: the heterozygous genotype
# fourth letter: unknown genotype
# Joinmap might require converting to lower case at the end of the process (to be checked)
genotype_code <- "ABHU"
# do not change the next two lines
code_U <- strsplit(genotype_code,split="")[[1]]
code_L <- tolower(code_U)
##########################################################
# Parameters for step 4
##########################################################
# the purpose of this treatment chain is to automatize the selection of useful markers.
# There are two main issues: redundant markers and outliers ie markers that segregate
# independently from the other loci of the same scaffold (possibly chimeric scaffolds or wrong assignment of marker to scaffold).
##########################################################
# uses where_out_3, file.value.3 and file.ident.3 for data entry
# Enter here your free comments
comment4 <- ""
# place of final files
where_out_4 <- "data/set1/step4"
# names of files
pair_outlier <- "pair_outlier.txt"
pair_redundant <- "pair_redundant.txt"
multiple_outlier <- "multiple_outlier.txt"
multiple_redundant <- "multiple_redundant.txt"
multiple_all <- "multiple_all.txt"
reduced_data <- "reduced_data.txt"
reduced_info <- "reduced_info.txt"
indiv_select <- "indiv_select.txt"
# two distances between loci of the same scaffold are computed
# the first one is the expectation of the *number* of recombinations
# under the hypothesis of no linkage (taking into account missing data)
# It is used to identify outliers
# suggested value: a little less than N/2-sqrt(N) where N is the number of descendents
# This value may need to be adapted based on the histograms produced at this step.
t_outlier <- 60
# the second distance is the observed recombination rate, based only on individuals which have both loci
# issues a warning for large gaps between loci of the same scaffold.
l_gap<-0.05
# in the second step of imputation, chosen individuals are assigned a value based on a majority rule.
# Accept imputation only if the frequency of the most frequent allele is above 'majority'
majority <- 0.8
#Parameters to select individuals and loci
stop_remove <- 20 # do not consider removing more than 'stop_remove' individuals
max_miss_loc <- 0.2 # retain only loci with 'min_miss_loc*100' % missing data.
##########################################################
# Parameters for step 5
##########################################################
# Last step: Examine last figure of previous step and chose the number of individuals to eliminate.
# This step takes one data and one information file.
# Another file set with less individual and less loci (but also less missing data) is produced.
##########################################################
# uses where_out_4,for data entry
# Enter here your free comments
comment5 <- ""
# place of final files
where_out_5 <- "data/set1/step5"
#Number of individuals used for plotting
genot_file_5<-"data_joinmap.txt"
infoloc_5<-"info_loc.txt"
removed <- 18