From 8c36f4718cf4e3775d716b10e749cf85074a7f8c Mon Sep 17 00:00:00 2001 From: Thierry Gosselin Date: Tue, 29 Nov 2016 17:40:44 -0500 Subject: [PATCH] **v.0.4.6** * I'm pleased to announce that `stackr` parallel mode now works with **Windows**! Nothing to install: just choose the number of CPUs and the rest is done automatically. * `haplo2colony` is deprecated. Use the new function called `write_colony`! * `write_colony`: works similarly to the deprecated function `haplo2colony`, * with the major advantage that it's no longer restricted to the STACKS haplotypes file. * The function uses the `tidy_genomic_data` module to import files, so you can choose any of the 10 input file formats supported by `stackr`! * other benefits include the possibility to efficiently test MAF, snp.ld, the haplotypes/snp approach, a whitelist of markers, a blacklist of individuals, a blacklist of genotypes, etc. with the built-in arguments. * the function **keeps only markers in common** between populations/groups and **removes monomorphic markers**. * **Note:** the function has several *defaults* and `COLONY` uses a complicated file format, so please make sure to read the function documentation and the `COLONY` manual. --- DESCRIPTION | 4 +- NAMESPACE | 11 +- R/.DS_Store | Bin 6148 -> 6148 bytes R/write_colony.R | 649 ++++++++++++++++++++++++++++++++++++++++++++ README.Rmd | 26 +- README.md | 17 +- inst/CITATION | 2 +- man/write_colony.Rd | 320 ++++++++++++++++++++++ 8 files changed, 1018 insertions(+), 11 deletions(-) create mode 100644 R/write_colony.R create mode 100644 man/write_colony.Rd diff --git a/DESCRIPTION b/DESCRIPTION index c3e1244..8af42b8 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: stackr Type: Package Title: GBS/RADseq Data Exploration, Manipulation and Visualization using R -Version: 0.4.5 -Date: 2016-11-18 +Version: 0.4.6 +Date: 2016-11-29 Encoding: UTF-8 Authors@R: c( person("Thierry", "Gosselin", email = "thierrygosselin@icloud.com", role = c("aut", "cre")), diff --git a/NAMESPACE b/NAMESPACE index 801a5bc..303d6c7 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -27,7 +27,6 @@ export(filter_snp_number) export(fis_summary) export(genomic_converter) export(genotypes_summary) -export(haplo2colony) export(ibdg_fh) export(individuals2strata) export(keep_common_markers) @@ -79,6 +78,7 @@ export(whitelist_loci_snp) export(whitelist_snp_vcf) export(write_arlequin) export(write_betadiv) +export(write_colony) export(write_genepop) export(write_genind) export(write_genlight) @@ -90,10 +90,8 @@ export(write_vcf) import(adegenet) import(dplyr) import(ggplot2) -import(lazyeval) import(parallel) import(readr) -import(reshape2) import(strataG) import(stringi) importFrom(adegenet,chromosome) @@ -116,6 +114,7 @@ importFrom(dplyr,bind_cols) importFrom(dplyr,bind_rows) importFrom(dplyr,coalesce) importFrom(dplyr,contains) +importFrom(dplyr,count) importFrom(dplyr,distinct) importFrom(dplyr,filter) importFrom(dplyr,filter_) @@ -144,8 +143,12 @@ importFrom(dplyr,ungroup) importFrom(lazyeval,interp) importFrom(magrittr,"%>%") importFrom(methods,new) +importFrom(parallel,clusterExport) importFrom(parallel,detectCores) +importFrom(parallel,makeCluster) importFrom(parallel,mclapply) +importFrom(parallel,parLapply) +importFrom(parallel,stopCluster) importFrom(purrr,detect_index) importFrom(purrr,discard) importFrom(purrr,flatten) @@ -158,6 +161,7 @@ importFrom(readr,read_delim) importFrom(readr,read_lines) importFrom(readr,read_tsv) importFrom(readr,write_delim) +importFrom(readr,write_file) 
importFrom(readr,write_tsv) importFrom(reshape2,dcast) importFrom(reshape2,melt) @@ -202,6 +206,7 @@ importFrom(tidyr,unite_) importFrom(utils,combn) importFrom(utils,count.fields) importFrom(utils,packageVersion) +importFrom(utils,sessionInfo) importFrom(utils,write.table) importFrom(vcfR,extract.gt) importFrom(vcfR,read.vcfR) diff --git a/R/.DS_Store b/R/.DS_Store index 6481cd87c17f2e79436091e9596a987f630f7fa4..1ce8379f50bf6c4e86a03b83a2a50bdd323411c0 100644 GIT binary patch delta 35 qcmZoMXffCj&9-?u8w(>ZA454q5kn?J2}3GF{NxNa{mrx4GX(&yZV7S# delta 35 qcmZoMXffCj&Bn{ekin41P{5GGkk4Q=Ifq?;b2J+Z% + stringi::stri_replace_all_fixed( + str = ., + pattern = c("-", " ", ":"), + replacement = c("", "@", ""), + vectorize_all = FALSE + ) %>% + stringi::stri_sub(str = ., from = 1, to = 13) + ) + + if (!is.null(imputation.method)) { + filename.imp <- stringi::stri_replace_all_fixed( + str = filename, + pattern = "stackr_colony_", + replacement = "stackr_colony_imputed_", + vectorize_all = FALSE + ) + } + } else { + if (!is.null(imputation.method)) { + filename.imp <- stringi::stri_join(filename, "_imputed") + } + } + + # Import---------------------------------------------------------------------- + data.type <- detect_genomic_format(data = data) + message(paste("File type: ", data.type)) + + # Strata argument required for VCF and haplotypes files + if (data.type == "vcf.file" & is.null(strata)) stop("strata argument is required") + + message("Importing data...") + if (data.type == "tbl_df") { + input <- stackr::tidy_wide(data = data, import.metadata = TRUE) + # For long tidy format, switch LOCUS to MARKERS column name, if found MARKERS not found + if (tibble::has_name(input, "LOCUS") && !tibble::has_name(input, "MARKERS")) { + input <- dplyr::rename(.data = input, MARKERS = LOCUS) + } + } else { + input <- suppressMessages( + stackr::tidy_genomic_data( + data = data, + vcf.metadata = FALSE, + blacklist.id = blacklist.id, + blacklist.genotype = blacklist.genotype, + whitelist.markers = whitelist.markers, + monomorphic.out = TRUE, + max.marker = max.marker, + snp.ld = snp.ld, + common.markers = TRUE, + maf.thresholds = maf.thresholds, + maf.pop.num.threshold = maf.pop.num.threshold, + maf.approach = maf.approach, + maf.operator = maf.operator, + strata = strata, + pop.levels = pop.levels, + pop.labels = pop.labels, + pop.select = pop.select, + filename = NULL, + verbose = FALSE + ) + ) + } + + input$GT <- stringi::stri_replace_all_fixed( + str = input$GT, + pattern = c("/", ":", "_", "-", "."), + replacement = c("", "", "", "", ""), + vectorize_all = FALSE + ) + pop.levels <- levels(input$POP_ID) + pop.labels <- pop.levels + + # Subsampling markers -------------------------------------------------------- + if (!is.null(sample.markers)) { + message(stringi::stri_join("Randomly subsampling ", sample.markers, " markers...")) + # sample.markers <- 500 # test + markers.list <- dplyr::distinct(input, MARKERS) %>% + dplyr::sample_n(tbl = ., size = sample.markers, replace = FALSE) + + input <- suppressWarnings(dplyr::semi_join(input, markers.list, by = "MARKERS")) + markers.list <- NULL + } + + # Generate colony without imputations ---------------------------------------- + message("Generating COLONY file without imputations...\n") + stackr_colony(data = input, filename = filename) + + # Imputations----------------------------------------------------------------- + if (!is.null(imputation.method)) { + message("\nMap-independent imputations...\n\n") + input.imp <- stackr::stackr_imputations_module( + data = 
dplyr::select(.data = input, MARKERS, POP_ID, INDIVIDUALS, GT), + imputation.method = imputation.method, + impute = impute, + imputations.group = imputations.group, + num.tree = num.tree, + iteration.rf = iteration.rf, + split.number = split.number, + verbose = verbose, + parallel.core = parallel.core, + filename = NULL + ) + message("\n\nGenerating COLONY file WITH imputations...\n") + stackr_colony(data = input.imp, filename = filename.imp) + + } # End imputations + + message("COLONY file(s) written in the working directory") + res <- "COLONY file(s) written in the working directory" + # Imputations: colony with imputed haplotypes using Random Forest ------------ + timing <- proc.time() - timing + message(stringi::stri_join("Computation time: ", round(timing[[3]]), " sec")) + cat("############################## completed ##############################\n") + return(res) +} + +# stackr_colony internal function ---------------------------------------------- +# @keywords internal +# @param data the tidy data to transform into a colony file +# @param filename name of the file written in the directory +# @param ... other argument passed to the remaining part of the code + +stackr_colony <- function( + data, + filename, + allele.freq = NULL, + inbreeding = 0, + mating.sys.males = 0, + mating.sys.females = 0, + clone = 0, + run.length =2, + analysis = 1, + allelic.dropout = 0, + error.rate = 0.02, + print.all.colony.opt = FALSE, + ...) { + # data <- input #test + + # Separate the alleles ------------------------------------------------------- + input <- dplyr::select(.data = data, MARKERS, POP_ID, INDIVIDUALS, GT) %>% + dplyr::mutate( + A1 = stringi::stri_sub(GT, 1, 3), + A2 = stringi::stri_sub(GT, 4,6) + ) %>% + dplyr::select(-GT) %>% + tidyr::gather(data = ., key = ALLELE_GROUP, value = ALLELES, -c(MARKERS, INDIVIDUALS, POP_ID)) %>% + dplyr::mutate(ALLELES = as.numeric(ALLELES)) %>% + dplyr::arrange(MARKERS, POP_ID, INDIVIDUALS) + + # Allele frequency per locus-------------------------------------------------- + if (!is.null(allele.freq)) { + message("Computing allele frequency") + if (allele.freq != "overall") { + input.alleles <- dplyr::filter(.data = input, POP_ID %in% allele.freq) %>% + dplyr::filter(ALLELES != 0) + } else { + input.alleles <- dplyr::filter(.data = input, ALLELES != 0) + } + + allele.per.locus <- dplyr::distinct(input.alleles, MARKERS, ALLELES) %>% + dplyr::count(x = ., MARKERS) %>% + dplyr::arrange(MARKERS) %>% + dplyr::select(n) %>% + purrr::flatten_chr(.) + + # If someone finds a beter and faster wayto do all this, I'm all in! + freq <- input.alleles %>% + dplyr::group_by(MARKERS, ALLELES) %>% + dplyr::tally(.) %>% + dplyr::ungroup() %>% + dplyr::group_by(MARKERS) %>% + dplyr::mutate(FREQ = round(n/sum(n), 2)) %>% + dplyr::select(MARKERS, ALLELES, FREQ) %>% + dplyr::arrange(MARKERS) %>% + dplyr::mutate(GROUP = seq(1, n(), by = 1)) %>% + dplyr::mutate_all(.tbl = ., .funs = as.character) %>% + tidyr::gather(data = ., key = ALLELES_FREQ, value = VALUE, -c(MARKERS, GROUP)) %>% + dplyr::mutate(ALLELES_FREQ = factor(ALLELES_FREQ, levels = c("ALLELES", "FREQ"), ordered = TRUE)) %>% + dplyr::group_by(MARKERS, ALLELES_FREQ) %>% + tidyr::spread(data = ., key = GROUP, value = VALUE) %>% + dplyr::ungroup(.) 
%>% + tidyr::unite(data = ., col = INFO, -c(MARKERS, ALLELES_FREQ), sep = " ") %>% + dplyr::mutate( + INFO = stringi::stri_replace_all_regex(str = INFO, pattern = "NA", replacement = "", vectorize_all = FALSE), + INFO = stringi::stri_trim_right(str = INFO, pattern = "\\P{Wspace}") + ) %>% + dplyr::select(-MARKERS, -ALLELES_FREQ) + input.alleles <- NULL + } + + markers.name <- tibble::as_data_frame(t(dplyr::distinct(input, MARKERS) %>% dplyr::arrange(MARKERS))) + marker.num <- ncol(markers.name) + + input <- tidyr::unite(data = input, col = MARKERS.ALLELE_GROUP, MARKERS, ALLELE_GROUP, sep = ".") %>% + dplyr::group_by(POP_ID, INDIVIDUALS) %>% + tidyr::spread(data = ., key = MARKERS.ALLELE_GROUP, value = ALLELES) %>% + dplyr::arrange(POP_ID, INDIVIDUALS) %>% + dplyr::ungroup(.) %>% + dplyr::select(-POP_ID) %>% + dplyr::mutate_all(.tbl = ., .funs = as.character) + + # Line 1 = Dataset name------------------------------------------------------- + dataset.opt <- "`My first COLONY run` ! Dataset name\n" + readr::write_file(x = dataset.opt, path = filename, append = FALSE) + + # Line 2 = Output filename---------------------------------------------------- + colony.output.filename <- stringi::stri_replace_all_fixed( + filename, + pattern = ".dat", + replacement = "") + colony.output.filename <- paste(colony.output.filename, " ! Output file name\n") + readr::write_file(x = colony.output.filename, path = filename, append = TRUE) + + # Line 3 = Offspring number--------------------------------------------------- + off.num.opt <- paste(nrow(input), " ! Number of offspring in the sample\n", sep = "") + readr::write_file(x = off.num.opt, path = filename, append = TRUE) + + # Line 4 = Number of loci----------------------------------------------------- + marker.num.opt <- paste(marker.num, " ! Number of loci\n", sep = "") + readr::write_file(x = marker.num.opt, path = filename, append = TRUE) + + # Line 5 = Seed random number generator ------------------------------------- + seed.opt <- "1234 ! Seed for random number generator\n" + readr::write_file(x = seed.opt, path = filename, append = TRUE) + + # Line 6 = Updating allele frequency------------------------------------------ + update.allele.freq.opt <- "0 ! 0/1=Not updating/updating allele frequency\n" + readr::write_file(x = update.allele.freq.opt, path = filename, append = TRUE) + + # Line 7 = Dioecious/Monoecious species--------------------------------------- + dioecious.opt <- "2 ! 2/1=Dioecious/Monoecious species\n" + readr::write_file(x = dioecious.opt, path = filename, append = TRUE) + + # Line 8 = inbreeding--------------------------------------------------------- + inbreeding.opt <- paste(inbreeding, " ! 0/1=No inbreeding/inbreeding\n", sep = "") + readr::write_file(x = inbreeding.opt, path = filename, append = TRUE) + + # Line 9 = Ploidy------------------------------------------------------------ + ploidy.opt <- "0 ! 0/1=Diploid species/HaploDiploid species\n" + readr::write_file(x = ploidy.opt, path = filename, append = TRUE) + + # Line 10 = Mating system (polygamous: 0, monogamous: 1).--------------------- + mating.opt <- paste(mating.sys.males," ", mating.sys.females, " ! 0/1=Polygamy/Monogamy for males & females\n", sep = "") + readr::write_file(x = mating.opt, path = filename, append = TRUE) + + # Line 11 = Clone inference--------------------------------------------------- + clone.opt <- paste(clone, " ! 
0/1=Clone inference =No/Yes\n", sep = "") + readr::write_file(x = clone.opt, path = filename, append = TRUE) + + # Line 12 = Sibship size scaling---------------------------------------------- + sib.size.scal.opt <- "1 ! 0/1=Full sibship size scaling =No/Yes\n" + readr::write_file(x = sib.size.scal.opt, path = filename, append = TRUE) + + # Line 13 = Sibship prior indicator (Integer), average paternal sibship size (Real, optional), average maternal sibship size (Real, optional) + sib.prior.opt <- "0 0 0 ! 0, 1, 2, 3 = No, weak, medium, strong sibship size prior; mean paternal & maternal sibship size\n" + readr::write_file(x = sib.prior.opt, path = filename, append = TRUE) + + # Line 14 = Population allele frequency indicator----------------------------- + if (is.null(allele.freq)) { + allele.freq.ind.opt <- "0 ! 0/1=Unknown/Known population allele frequency\n" + } else { + allele.freq.ind.opt <- "1 ! 0/1=Unknown/Known population allele frequency\n" + } + readr::write_file(x = allele.freq.ind.opt, path = filename, append = TRUE) + + + # Line 15 and more------------------------------------------------------------ + # Numbers of alleles per locus (Integer, optional). + # Required when the Population allele frequency indicator is set to 1 + # Alleles and their frequencies per locus (Integer, Real, optional) + # If someone finds a beter to do all this, I'm all in! + + if (!is.null(allele.freq)) { + readr::write_file(x = paste0(paste0(allele.per.locus, collapse = " "), " !Number of alleles per locus\n"), path = filename, append = TRUE) + utils::write.table(x = freq, file = filename, append = TRUE, quote = FALSE, row.names = FALSE, col.names = FALSE) + } + + # Number of runs-------------------------------------------------------------- + num.run.opt <- "1 ! Number of runs\n" + readr::write_file(x = num.run.opt, path = filename, append = TRUE) + + # Length of run -------------------------------------------------------------- + # give a value of 1, 2, 3, 4 to indicate short, medium, long, very long run + run.length.opt <- paste(run.length, " ! Length of run\n", sep = "") + readr::write_file(x = run.length.opt, path = filename, append = TRUE) + + # Monitor method (Time in second)--------------------------------------------- + monitor.met.opt <- "0 ! 0/1=Monitor method by Iterate\n" + readr::write_file(x = monitor.met.opt, path = filename, append = TRUE) + + + # Monitor interval (Time in second)------------------------------------------- + monitor.int.opt <- "10000 ! Monitor interval in Iterate\n" + readr::write_file(x = monitor.int.opt, path = filename, append = TRUE) + + # WindowsGUI/DOS, 0 when running Colony in DOS mode or on other platforms----- + windows.gui.opt <- "0 ! non-Windows version\n" + readr::write_file(x = windows.gui.opt, path = filename, append = TRUE) + + # Analysis method : ---------------------------------------------------------- + # 0, 1 or 2 for Pairwise-Likelihood score (PLS), full likelihood method (FL), + # or the FL and PLS combined method (FPLS). More on these methods are explained + # above in the Windows GUI data input section. + analysis.opt <- paste(analysis, " ! Analysis 0 (Pairwise-Likelihood Score), 1 (Full Likelihood), 2 (combined Pairwise-Likelihood Score and Full Likelihood)\n", sep = "") + readr::write_file(x = analysis.opt, path = filename, append = TRUE) + + # Precision------------------------------------------------------------------- + precision.opt <- "3 ! 
1/2/3=low/medium/high Precision for Full likelihood\n" + readr::write_file(x = precision.opt, path = filename, append = TRUE) + + # Marker IDs/Names------------------------------------------------------------ + # snp.id <- seq(from = 1, to = marker.number, by = 1) + # markers.name.opt <- as.character(c(markers.name, " ! Marker IDs\n")) + # readr::write_file(x = markers.name.opt, path = filename, append = TRUE) + markers.name.opt <- stringi::stri_join(markers.name, collapse = " ") + markers.name.opt <- stringi::stri_join(markers.name.opt, "! Marker IDs\n") + readr::write_file(x = markers.name.opt, path = filename, append = TRUE) + + # Marker types --------------------------------------------------------------- + # marker.type (codominant/dominant) + marker.type.opt <- stringi::stri_join(rep(0, marker.num), collapse = " ") + marker.type.opt <- stringi::stri_join(marker.type.opt, " ! Marker types, 0/1=Codominant/Dominant\n") + readr::write_file(x = marker.type.opt, path = filename, append = TRUE) + + # Allelic dropout rates------------------------------------------------------- + dropout <- stringi::stri_join(rep(allelic.dropout, marker.num), collapse = " ") + dropout <- stringi::stri_join(dropout, " ! Allelic dropout rate at each locus\n") + readr::write_file(x = dropout, path = filename, append = TRUE) + + # Error rates----------------------------------------------------------------- + error <- stringi::stri_join(rep(error.rate, marker.num), collapse = " ") + error <- stringi::stri_join(error, " ! False allele rate\n\n") + readr::write_file(x = error, path = filename, append = TRUE) + + # Offspring IDs and genotype-------------------------------------------------- + utils::write.table(x = input, file = filename, sep = " ", append = TRUE, col.names = FALSE, row.names = FALSE, quote = FALSE) + + # Probabilities that the father and mother of an offspring are included------- + # in the candidate males and females. The two numbers must be provided even if + # there are no candidate males or/and females. + prob.opt <- "\n\n0 0 ! Prob. of dad/mum included in the candidates\n" + readr::write_file(x = prob.opt, path = filename, append = TRUE) + + # Numbers of candidate males and females-------------------------------------- + candidate.opt <- "0 0 ! Numbers of candidate males & females\n" + readr::write_file(x = candidate.opt, path = filename, append = TRUE) + + # PRINT ALL REMAINING COLONY OPTIONS ----------------------------------------- + if (print.all.colony.opt) { + message("Printing all COLONY options...") + # Candidate male IDs/names and genotypes------------------------------------ + candidate.male.id.geno.opt <- "!Candidate male ID and genotypes\n" + readr::write_file(x = candidate.male.id.geno.opt, path = filename, append = TRUE) + # Candidate female IDs/names and genotypes ---------------------------------- + candidate.female.id.geno.opt <- "!Candidate female ID and genotypes\n" + readr::write_file(x = candidate.female.id.geno.opt, path = filename, append = TRUE) + } + + # Number of offspring with known paternity------------------------------------ + known.paternity.opt <- "0 ! Number of offspring with known father\n" + readr::write_file(x = known.paternity.opt, path = filename, append = TRUE) + + # Known offspring-father dyad------------------------------------------------- + if (print.all.colony.opt) { + known.father.dyad.opt <- "! 
Offspring ID and known father ID (Known offspring-father dyad)\n" + readr::write_file(x = known.father.dyad.opt, path = filename, append = TRUE) + } + + # Number of offspring with known maternity------------------------------------ + known.maternity.opt <- "0 ! Number of offspring with known mother\n" + readr::write_file(x = known.maternity.opt, path = filename, append = TRUE) + + # Known offspring-mother dyad------------------------------------------------- + if (print.all.colony.opt) { + known.mother.dyad.opt <- "! Offspring ID and known mother ID (Known offspring-mother dyad)\n" + readr::write_file(x = known.mother.dyad.opt, path = filename, append = TRUE) + } + + # Number of known paternal sibships (Integer)--------------------------------- + known.paternal.sibships.opt <- "0 ! Number of known paternal sibships\n" + readr::write_file(x = known.paternal.sibships.opt, path = filename, append = TRUE) + + + # Paternal sibship size and members (Integer, String, optional).-------------- + if (print.all.colony.opt) { + known.paternal.sibships.size.opt <- "! Paternal sibship size and members\n" + readr::write_file(x = known.paternal.sibships.size.opt, path = filename, append = TRUE) + } + + # Number of known maternal sibships (Integer)--------------------------------- + known.maternal.sibships.opt <- "0 ! Number of known maternal sibships\n" + readr::write_file(x = known.maternal.sibships.opt, path = filename, append = TRUE) + + + # Maternal sibship size and members (Integer, String, optional).-------------- + if (print.all.colony.opt) { + known.maternal.sibships.size.opt <- "! Maternal sibship size and members\n" + readr::write_file(x = known.maternal.sibships.size.opt, path = filename, append = TRUE) + } + + # Number of offspring with known excluded paternity (Integer). --------------- + offspring.known.excl.paternity.opt <- "0 ! Number of offspring with known excluded fathers\n" + readr::write_file(x = offspring.known.excl.paternity.opt, path = filename, append = TRUE) + + + # Excluded paternity --------------------------------------------------------- + if (print.all.colony.opt) { + excl.paternity.opt <- "! Offspring ID, number of excluded fathers, and excluded father IDs\n" + readr::write_file(x = excl.paternity.opt, path = filename, append = TRUE) + } + + # Number of offspring with known excluded maternity (Integer). --------------- + offspring.known.excl.maternity.opt <- "0 ! Number of offspring with known excluded mothers\n" + readr::write_file(x = offspring.known.excl.maternity.opt, path = filename, append = TRUE) + + + # Excluded maternity---------------------------------------------------------- + if (print.all.colony.opt) { + excl.maternity.opt <- "! Offspring ID, number of excluded mothers, and excluded father IDs\n" + readr::write_file(x = excl.maternity.opt, path = filename, append = TRUE) + } + + # Number of offspring with known excluded paternal sibships------------------- + offspring.known.excl.paternal.sibships.opt <- "0 ! Number of offspring with known excluded paternal sibships\n" + readr::write_file(x = offspring.known.excl.paternal.sibships.opt, path = filename, append = TRUE) + + # Excluded paternal siblings-------------------------------------------------- + if (print.all.colony.opt) { + excluded.paternal.siblings.opt <- "! 
Excluded paternal siblings\n" + readr::write_file(x = excluded.paternal.siblings.opt, path = filename, append = TRUE) + } + + # Number of offspring with known excluded maternal sibships------------------- + offspring.known.excl.maternal.sibships.opt <- "0 ! Number of offspring with known excluded maternal sibships\n" + readr::write_file(x = offspring.known.excl.maternal.sibships.opt, path = filename, append = TRUE) + + # Excluded maternal siblings-------------------------------------------------- + if (print.all.colony.opt) { + excluded.maternal.siblings.opt <- "! Excluded maternal siblings\n" + readr::write_file(x = excluded.maternal.siblings.opt, path = filename, append = TRUE) + } +}# End stackr_colony + + + + + diff --git a/README.Rmd b/README.Rmd index 3788830..7e8f4f8 100644 --- a/README.Rmd +++ b/README.Rmd @@ -57,14 +57,14 @@ library(stackr) # to load | Caracteristics | Description | |:-------------------|:--------------------------------------------------------| | **Import** | Various supported genomic file formats in `tidy_genomic_format` and/or their separate module:
[VCF](https://samtools.github.io/hts-specs/) (Danecek et al., 2011)
[PLINK tped/tfam](http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml#tr) (Purcell et al., 2007)
[genind](https://github.com/thibautjombart/adegenet) (Jombart et al., 2010; Jombart and Ahmed, 2011)
[genlight](https://github.com/thibautjombart/adegenet) (Jombart et al., 2010; Jombart and Ahmed, 2011), also in `tidy_genlight`
[strataG gtypes](https://github.com/EricArcher/strataG) (Archer et al., 2016)
[Genepop](http://genepop.curtin.edu.au) (Raymond and Rousset, 1995; Rousset, 2008), also in `tidy_genepop`
[STACKS haplotype file](http://catchenlab.life.illinois.edu/stacks/) (Catchen et al., 2011, 2013)
[hierfstat](https://github.com/jgx65/hierfstat) (Goudet, 2005), also in `tidy_fstat`
Dataframes of genotypes in wide or long/tidy format, also in `tidy_wide`| -| **Output** |Export genomic data out of **stackr** using `genomic_converter` or the separate modules (12 formats):
`write_vcf`: [VCF](https://samtools.github.io/hts-specs/) (Danecek et al., 2011)
`write_plink`: [PLINK tped/tfam](http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml#tr) (Purcell et al., 2007)
`write_genind`: [adegenet genind and genlight](https://github.com/thibautjombart/adegenet) (Jombart et al., 2010; Jombart and Ahmed, 2011)
`write_genlight`: [genlight](https://github.com/thibautjombart/adegenet) (Jombart et al., 2010; Jombart and Ahmed, 2011)
`write_gtypes`: [strataG gtypes](https://github.com/EricArcher/strataG)
`write_genepop`: [Genepop](http://genepop.curtin.edu.au) (Raymond and Rousset, 1995; Rousset, 2008)
[STACKS haplotype file](http://catchenlab.life.illinois.edu/stacks/) (Catchen et al., 2011, 2013)
`write_betadiv`: [betadiv](http://adn.biol.umontreal.ca/~numericalecology/Rcode/) (Lamy, 2015)
`vcf2dadi`: [δaδi](http://gutengroup.mcb.arizona.edu/software/) (Gutenkunst et al., 2009)
`write_structure`: [structure](http://pritchardlab.stanford.edu/structure.html) (Pritchard et al., 2000)
`write_arlequin`: [Arlequin](http://cmpg.unibe.ch/software/arlequin35/) (Excoffier et al. 2005)
`write_hierfstat`: [hierfstat](https://github.com/jgx65/hierfstat) (Goudet, 2005)
Dataframes of genotypes in wide or long/tidy format| +| **Output** |Export genomic data out of **stackr** using `genomic_converter` or the separate modules (13 formats):
`write_vcf`: [VCF](https://samtools.github.io/hts-specs/) (Danecek et al., 2011)
`write_plink`: [PLINK tped/tfam](http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml#tr) (Purcell et al., 2007)
`write_genind`: [adegenet genind and genlight](https://github.com/thibautjombart/adegenet) (Jombart et al., 2010; Jombart and Ahmed, 2011)
`write_genlight`: [genlight](https://github.com/thibautjombart/adegenet) (Jombart et al., 2010; Jombart and Ahmed, 2011)
`write_gtypes`: [strataG gtypes](https://github.com/EricArcher/strataG)
`write_colony`: [COLONY](https://www.zsl.org/science/software/colony) (Jones and Wang, 2010; Wang, 2012)
`write_genepop`: [Genepop](http://genepop.curtin.edu.au) (Raymond and Rousset, 1995; Rousset, 2008)
[STACKS haplotype file](http://catchenlab.life.illinois.edu/stacks/) (Catchen et al., 2011, 2013)
`write_betadiv`: [betadiv](http://adn.biol.umontreal.ca/~numericalecology/Rcode/) (Lamy, 2015)
`vcf2dadi`: [δaδi](http://gutengroup.mcb.arizona.edu/software/) (Gutenkunst et al., 2009)
`write_structure`: [structure](http://pritchardlab.stanford.edu/structure.html) (Pritchard et al., 2000)
`write_arlequin`: [Arlequin](http://cmpg.unibe.ch/software/arlequin35/) (Excoffier et al. 2005)
`write_hierfstat`: [hierfstat](https://github.com/jgx65/hierfstat) (Goudet, 2005)
Dataframes of genotypes in wide or long/tidy format| |**Conversion function**| `genomic_converter` import/export genomic formats mentioned above. The function is also integrated with usefull filters, blacklist and whitelist.| |**Outliers detection**|`detect_duplicate_genomes`: Detect and remove duplicate individuals from your dataset
`detect_mixed_genomes`: Detect and remove potentially mixed individuals
`summary_haplotype` and `filter_snp_number`: Discard of outlier markers with *de novo* assembly artifact (e.g. markers with an extreme number of SNP per haplotype or with irregular number of alleles)| |**Pattern of missingness**|`missing_visualization`: Visualize patterns of missing data. Find patterns associated with different variables of your study (lanes, chips, sequencers, populations, sample sites, reads/samples, homozygosity, etc)| |**Filters**| Alleles, genotypes, markers, individuals and populations can be filtered and/or selected in several ways:

`filter_coverage`: Discard markers based on read depth (coverage) of alleles and genotypes
`filter_genotype_likelihood`: Discard markers based on genotype likelihood
`filter_individual`: Discard markers based on proportion of genotyped individuals
`filter_population`: Discard markers based on proportion of genotyped populations
`filter_het`: Discard markers based on the observed heterozygosity (Het obs)
`filter_fis`: Discard markers based on the inbreeding coefficient (Fis)
`filter_hw`: Discard markers based on the Hardy-Weinberg Equilibrium expectations (HWE)
`filter_maf`: Discard markers based on local or global (overall) minor allele frequency| |**Imputations**|**Map-independent** imputations of missing genotypes.
Using **Random Forest** or the most frequent category.
Imputations can be conducted **overall samples** or **by populations**.

Imputations are integrated in several functions and in a separate module: `stackr_imputations_module`| |**[ggplot2](http://ggplot2.org)-based plotting**|Visualize distribution of important metric and statistics and create publication-ready figures| -|**Parallel**|Codes designed and optimized for fast computations running imputations, iterations, etc. in parallel| +|**Parallel**|Codes designed and optimized for fast computations running imputations, iterations, etc. in parallel. Works on all OS: Linux, Mac and now Windows!| [More in stackr workflow below](https://github.com/thierrygosselin/stackr#stackr-workflow) @@ -120,6 +120,28 @@ citation("stackr") ## New features Change log, version, new features and bug history now lives in the [NEWS.md file](https://github.com/thierrygosselin/stackr/blob/master/NEWS.md) +**v.0.4.6** + +* I'm pleased to announce that `stackr` parallel mode now works with **Windows**! +Nothing to install: just choose the number of CPUs and +the rest is done automatically. +* `haplo2colony` is deprecated. Use the new function called `write_colony`! +* `write_colony`: works similarly to the deprecated function `haplo2colony`, + * with the major advantage that it's no longer restricted to the STACKS + haplotypes file. + * The function uses the `tidy_genomic_data` module + to import files, so you can choose any of the 10 input file formats supported + by `stackr`! + * other benefits include the possibility to efficiently test MAF, + snp.ld, the haplotypes/snp approach, a whitelist of markers, + a blacklist of individuals, a blacklist of genotypes, etc. with the built-in + arguments. + * the function **keeps only markers in common** between populations/groups + and **removes monomorphic markers**. + * **Note:** the function has several *defaults* and `COLONY` uses + a complicated file format, so please make sure to read the function + documentation and the `COLONY` manual. + **v.0.4.5** diff --git a/README.md b/README.md index 3a6a7c5..4329d81 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ [![Travis-CI Build Status](https://travis-ci.org/thierrygosselin/stackr.svg?branch=master)](https://travis-ci.org/thierrygosselin/stackr) [![AppVeyor Build Status](https://ci.appveyor.com/api/projects/status/github/thierrygosselin/stackr?branch=master&svg=true)](https://ci.appveyor.com/project/thierrygosselin/stackr) [![CRAN\_Status\_Badge](http://www.r-pkg.org/badges/version/stackr)](http://cran.r-project.org/package=stackr) [![Project Status: Active – The project has reached a stable, usable state and is being actively developed.](http://www.repostatus.org/badges/latest/active.svg)](http://www.repostatus.org/#active) [![DOI](https://zenodo.org/badge/14548/thierrygosselin/stackr.svg)](https://zenodo.org/badge/latestdoi/14548/thierrygosselin/stackr) -[![packageversion](https://img.shields.io/badge/Package%20version-0.4.5-orange.svg)](commits/master) [![Last-changedate](https://img.shields.io/badge/last%20change-2016--11--27-brightgreen.svg)](/commits/master) +[![packageversion](https://img.shields.io/badge/Package%20version-0.4.6-orange.svg)](commits/master) [![Last-changedate](https://img.shields.io/badge/last%20change-2016--11--29-brightgreen.svg)](/commits/master) ------------------------------------------------------------------------ @@ -40,7 +40,7 @@ library(stackr) # to load Output -Export genomic data out of stackr using genomic_converter or the separate modules (12 formats):
write_vcf: VCF (Danecek et al., 2011)
write_plink: PLINK tped/tfam (Purcell et al., 2007)
write_genind: adegenet genind and genlight (Jombart et al., 2010; Jombart and Ahmed, 2011)
write_genlight: genlight (Jombart et al., 2010; Jombart and Ahmed, 2011)
write_gtypes: strataG gtypes
write_genepop: Genepop (Raymond and Rousset, 1995; Rousset, 2008)
STACKS haplotype file (Catchen et al., 2011, 2013)
write_betadiv: betadiv (Lamy, 2015)
vcf2dadi: δaδi (Gutenkunst et al., 2009)
write_structure: structure (Pritchard et al., 2000)
write_arlequin: Arlequin (Excoffier et al. 2005)
write_hierfstat: hierfstat (Goudet, 2005)
Dataframes of genotypes in wide or long/tidy format +Export genomic data out of stackr using genomic_converter or the separate modules (13 formats):
write_vcf: VCF (Danecek et al., 2011)
write_plink: PLINK tped/tfam (Purcell et al., 2007)
write_genind: adegenet genind and genlight (Jombart et al., 2010; Jombart and Ahmed, 2011)
write_genlight: genlight (Jombart et al., 2010; Jombart and Ahmed, 2011)
write_gtypes: strataG gtypes
write_colony: COLONY (Jones and Wang, 2010; Wang, 2012)
write_genepop: Genepop (Raymond and Rousset, 1995; Rousset, 2008)
STACKS haplotype file (Catchen et al., 2011, 2013)
write_betadiv: betadiv (Lamy, 2015)
vcf2dadi: δaδi (Gutenkunst et al., 2009)
write_structure: structure (Pritchard et al., 2000)
write_arlequin: Arlequin (Excoffier et al. 2005)
write_hierfstat: hierfstat (Goudet, 2005)
Dataframes of genotypes in wide or long/tidy format Conversion function @@ -68,7 +68,7 @@ library(stackr) # to load Parallel -Codes designed and optimized for fast computations running imputations, iterations, etc. in parallel +Codes designed and optimized for fast computations running imputations, iterations, etc. in parallel. Works on all OS: Linux, Mac and now Windows! @@ -127,6 +127,17 @@ New features Change log, version, new features and bug history now lives in the [NEWS.md file](https://github.com/thierrygosselin/stackr/blob/master/NEWS.md) +**v.0.4.6** + +- I'm pleased to announce that `stackr` parallel mode now works with **Windows**! Nothing to install: just choose the number of CPUs and the rest is done automatically. +- `haplo2colony` is deprecated. Use the new function called `write_colony`! +- `write_colony`: works similarly to the deprecated function `haplo2colony`, + - with the major advantage that it's no longer restricted to the STACKS haplotypes file. + - The function uses the `tidy_genomic_data` module to import files, so you can choose any of the 10 input file formats supported by `stackr`! + - other benefits include the possibility to efficiently test MAF, snp.ld, the haplotypes/snp approach, a whitelist of markers, a blacklist of individuals, a blacklist of genotypes, etc. with the built-in arguments. + - the function **keeps only markers in common** between populations/groups and **removes monomorphic markers**. + - **Note:** the function has several *defaults* and `COLONY` uses a complicated file format, so please make sure to read the function documentation and the `COLONY` manual. + **v.0.4.5** - temporary fix to `tidy_genomic_data` to read unconventional Tassel VCF diff --git a/inst/CITATION b/inst/CITATION index bca4d7e..df0c8c3 100644 --- a/inst/CITATION +++ b/inst/CITATION @@ -7,5 +7,5 @@ year = "2016", author = personList(as.person("Thierry Gosselin"), as.person("Louis Bernatchez")), url = "https://github.com/thierrygosselin/stackr", doi = "http://dx.doi.org/10.5281/zenodo.154432", -textVersion = paste("Thierry Gosselin, Louis Bernatchez, (2016). stackr: GBS/RAD Data Exploration, Manipulation and Visualization using R. R package version 0.4.5 https://github.com/thierrygosselin/stackr. doi : 10.5281/zenodo.154432") +textVersion = paste("Thierry Gosselin, Louis Bernatchez, (2016). stackr: GBS/RAD Data Exploration, Manipulation and Visualization using R. R package version 0.4.6 https://github.com/thierrygosselin/stackr. 
doi : 10.5281/zenodo.154432") ) diff --git a/man/write_colony.Rd b/man/write_colony.Rd new file mode 100644 index 0000000..c2e2754 --- /dev/null +++ b/man/write_colony.Rd @@ -0,0 +1,320 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/write_colony.R +\name{write_colony} +\alias{write_colony} +\title{Write a \code{COLONY} input file from 9 genomic formats} +\usage{ +write_colony(data, strata = NULL, pop.levels = NULL, pop.labels = NULL, + blacklist.id = NULL, blacklist.genotype = NULL, + whitelist.markers = NULL, monomorphic.out = TRUE, snp.ld = NULL, + common.markers = TRUE, maf.thresholds = NULL, maf.pop.num.threshold = 1, + maf.approach = "SNP", maf.operator = "OR", max.marker = NULL, + sample.markers = NULL, pop.select = NULL, allele.freq = NULL, + inbreeding = 0, mating.sys.males = 0, mating.sys.females = 0, + clone = 0, run.length = 2, analysis = 1, allelic.dropout = 0, + error.rate = 0.02, print.all.colony.opt = FALSE, + imputation.method = NULL, impute = "genotype", + imputations.group = "populations", num.tree = 100, iteration.rf = 10, + split.number = 100, verbose = FALSE, + parallel.core = parallel::detectCores() - 1, filename = NULL) +} +\arguments{ +\item{data}{9 options: biallelic and haplotypic vcf +(to make vcf population ready, see details below), +plink, stacks haplotype file, genind (library(adegenet)), +genlight (library(adegenet)), gtypes (library(strataG)), genepop, +and a data frame in wide format. +\emph{See details} of \code{\link{tidy_genomic_data}}.} + +\item{strata}{(optional/required) Required for VCF and haplotypes files, +optional for the other file formats supported. + +The strata file is a tab delimited file with 2 columns with header: +\code{INDIVIDUALS} and \code{STRATA}. With a +data frame of genotypes the strata is the INDIVIDUALS and POP_ID columns, with +PLINK files, the \code{tfam} first 2 columns are used. +If a \code{strata} file is specified, the strata file will have +precedence. The \code{STRATA} column can be any hierarchical grouping. +To create a strata file see \code{\link[stackr]{individuals2strata}}. +If you have already run +\href{http://catchenlab.life.illinois.edu/stacks/}{stacks} on your data, +the strata file is similar to a stacks `population map file`, make sure you +have the required column names (\code{INDIVIDUALS} and \code{STRATA}). +Default: \code{strata = NULL}.} + +\item{pop.levels}{(optional, string) This refers to the levels in a factor. In this +case, the id of the pop. +Use this argument to have the pop ordered your way instead of the default +alphabetical or numerical order. e.g. \code{pop.levels = c("QUE", "ONT", "ALB")} +instead of the default \code{pop.levels = c("ALB", "ONT", "QUE")}. +White spaces in population names are replaced by underscore. +Default: \code{pop.levels = NULL}.} + +\item{pop.labels}{(optional, string) Use this argument to rename/relabel +your pop or combine your pop. e.g. To combine \code{"QUE"} and \code{"ONT"} +into a new pop called \code{"NEW"}: +(1) First, define the levels for your pop with \code{pop.levels} argument: +\code{pop.levels = c("QUE", "ONT", "ALB")}. +(2) then, use \code{pop.labels} argument: +\code{pop.levels = c("NEW", "NEW", "ALB")}. +To rename \code{"QUE"} to \code{"TAS"}: +\code{pop.labels = c("TAS", "ONT", "ALB")}. +Default: \code{pop.labels = NULL}. If you find this too complicated, there is also the +\code{strata} argument that can do the same thing, see below. 
+White spaces in population names are replaced by underscore.} + +\item{blacklist.id}{(optional) A blacklist with individual ID and +a column header 'INDIVIDUALS'. The blacklist is in the working directory +(e.g. "blacklist.txt"). +Default: \code{blacklist.id = NULL}.} + +\item{blacklist.genotype}{(optional) Useful to erase genotype with below +average quality, e.g. genotype with more than 2 alleles in diploid likely +sequencing errors or genotypes with poor genotype likelihood or coverage. +The blacklist has a minimum of 2 column headers (markers and individuals). +Markers can be 1 column (CHROM or LOCUS or POS), +a combination of 2 (e.g. CHROM and POS or CHROM and LOCUS or LOCUS and POS) or +all 3 (CHROM, LOCUS, POS). The markers columns must be designated: CHROM (character +or integer) and/or LOCUS (integer) and/or POS (integer). The id column designated +INDIVIDUALS (character) columns header. The blacklist must be in the working +directory (e.g. "blacklist.genotype.txt"). For de novo VCF, CHROM column +with 'un' need to be changed to 1. +Default: \code{blacklist.genotype = NULL} for no blacklist of +genotypes to erase.} + +\item{whitelist.markers}{(optional) A whitelist containing CHROM (character +or integer) and/or LOCUS (integer) and/or +POS (integer) columns header. To filter by chromosome and/or locus and/or by snp. +The whitelist is in the working directory (e.g. "whitelist.txt"). +de novo CHROM column with 'un' need to be changed to 1. +In the VCF, the column ID is the LOCUS identification. +Default \code{whitelist.markers = NULL} for no whitelist of markers.} + +\item{monomorphic.out}{(optional) Should the monomorphic +markers present in the dataset be filtered out ? +Default: \code{monomorphic.out = TRUE}.} + +\item{snp.ld}{(optional) \strong{For VCF file only}. +SNP short distance linkage disequilibrium pruning. With anonymous markers from +RADseq/GBS de novo discovery, you can minimize linkage disequilibrium (LD) by +choosing among these 3 options: \code{"random"} selection, \code{"first"} or +\code{"last"} SNP on the same short read/haplotype. For long distance linkage +disequilibrium pruning, see details below. +Default: \code{snp.ld = NULL}.} + +\item{common.markers}{(optional) Logical. Default: \code{common.markers = TRUE}, +will only keep markers in common (genotyped) between all the populations.} + +\item{maf.thresholds}{(string, double, optional) String with +local/populations and global/overall maf thresholds, respectively. +e.g. \code{maf.thresholds = c(0.05, 0.1)} for a local maf threshold +of 0.05 and a global threshold of 0.1. Available for VCF, PLINK and data frame +files. +Default: \code{maf.thresholds = NULL}.} + +\item{maf.pop.num.threshold}{(integer, optional) When maf thresholds are used, +this argument is for the number of pop required to pass the maf thresholds +to keep the locus. Default: \code{maf.pop.num.threshold = 1}} + +\item{maf.approach}{(character, optional). +\code{maf.approach = "haplotype"} : looks at the minimum MAF found on the +read/haplotype. Using this option will discard all the markers/snp on +that read based on the thresholds chosen. This method is only available +for VCF and haplotype files, or tidy data frame from those file types. +\code{maf.approach = "SNP"} : treats all the SNP on the same +haplotype/read as independent. Doesn't work with haplotype file, +but does work for all other file type. 
+Default is \code{maf.approach = "SNP"}.} + +\item{maf.operator}{(character, optional) \code{maf.operator = "AND"} or +default \code{maf.operator = "OR"}. +When filtering over LOCUS or SNP, do you want the local \code{"AND"} +global MAF to pass the thresholds, or ... you want the local \code{"OR"} +global MAF to pass the thresholds, to keep the marker?} + +\item{max.marker}{An optional integer useful to subsample marker number in +large PLINK file. e.g. if the data set +contains 200 000 markers and \code{max.marker = 10000} 10000 markers are +subsampled randomly from the 200000 markers. Use \code{whitelist.markers} to +keep specific markers. +Default: \code{max.marker = NULL}.} + +\item{sample.markers}{(number) \code{COLONY} can take a long time to run, +use a random subsample of your markers to speed test \code{COLONY} +e.g. \code{sample.markers = 500} to use only 500 randomly chosen markers. +Default: \code{sample.markers = NULL}, will use all markers.} + +\item{pop.select}{(string, optional) Selected list of populations for +the analysis. e.g. \code{pop.select = c("QUE", "ONT")} to select \code{QUE} +and \code{ONT} population samples (out of 20 pops). +Default: \code{pop.select = NULL}} + +\item{allele.freq}{(optional, string) Allele frequency can be computed from +a select group. +e.g. \code{allele.freq = "QUE"} or \code{allele.freq = c("QUE", "ONT")}. +Using \code{allele.freq = "overall"} will use all the samples to compute the +allele frequency. +Default: \code{allele.freq = NULL}, will not compute allele frequency.} + +\item{inbreeding}{(boolean) 0/1 no inbreeding/inbreeding. +Default: \code{inbreeding = 0}} + +\item{mating.sys.males}{(boolean) Mating system in males. +0/1 polygyny/monogyny. +Default: \code{mating.sys.males = 0}.} + +\item{mating.sys.females}{(boolean) Mating system in females. +0/1 polygyny/monogyny. +Default: \code{mating.sys.females = 0}.} + +\item{clone}{(boolean) Should clones and duplicated individuals be inferred. +0/1, yes/no. Default: \code{clone = 0}.} + +\item{run.length}{(integer) Length of run. 1 (short), 2 (medium), 3 (long), +4 (very long). Start with short or medium run and consider longer run if your +estimates probability are not stable or really good. +Default: \code{run.length = 2}.} + +\item{analysis}{(integer) Analysis method. +0 (Pairwise-Likelihood Score), 1 (Full Likelihood), +2 (combined Pairwise-Likelihood Score and Full Likelihood). +Default: \code{analysis = 1}.} + +\item{allelic.dropout}{Locus allelic dropout rate. +Default : \code{allelic.dropout = 0}.} + +\item{error.rate}{Locus error rate. +Default:\code{error.rate = 0.02}.} + +\item{print.all.colony.opt}{(logical) Should all \code{COLONY} options be printed in the file. + + +\strong{This require manual curation, for the file to work directly with \code{COLONY}}. +Default = \code{print.all.colony.opt = FALSE}.} + +\item{imputation.method}{(character, optional) +Methods available for map-independent imputations of missing genotype: +(1) \code{"max"} to use the most frequent category for imputations. +(2) \code{"rf"} using Random Forest algorithm. +Default: no imputation \code{imputation.method = NULL}.} + +\item{impute}{(character, optional) Imputation on missing genotype +\code{impute = "genotype"} or alleles \code{impute = "allele"}. +Default: \code{"genotype"}.} + +\item{imputations.group}{(character, optional) \code{"global"} or \code{"populations"}. +Should the imputations be computed globally or by populations. 
If you choose +global, turn the verbose to \code{TRUE}, to see progress. +Default = \code{"populations"}.} + +\item{num.tree}{(integer, optional) The number of trees to grow in Random Forest. +Default: \code{num.tree = 100}.} + +\item{iteration.rf}{(integer, optional) The number of iterations of missing data algorithm +in Random Forest. +Default: \code{iteration.rf = 10}.} + +\item{split.number}{(integer, optional) Non-negative integer value used to specify +random splitting in Random Forest. +Default: \code{split.number = 100}.} + +\item{verbose}{(optional, logical) When \code{verbose = TRUE} +the function is a little more chatty during execution. +Default: \code{verbose = FALSE}.} + +\item{parallel.core}{(optional) The number of core for OpenMP shared-memory parallel +programming of Random Forest imputations. For more info on how to install the +OpenMP version see \code{\link[randomForestSRC]{randomForestSRC-package}}. +If not selected \code{detectCores()-1} is used as default.} + +\item{filename}{Name of the acronym for filenaming in the working directory.} +} +\value{ +A \code{COLONY} file in your working directory (2 if you selected imputations arguments...) +} +\description{ +Write a \code{COLONY} input file from several genomic formats, +look into \pkg{stackr} \code{\link{tidy_genomic_data}} for supported input +files. +} +\details{ +\strong{It is highly recommended to read (twice!) the user guide distributed with +\code{COLONY} to find out the details for input and output of the software.} + +Not all options are provided here. + +But to ease the process, all the required options to properly run \code{COLONY} +will be printed in the file written in your working directory. +Change the values accordingly and wisely. + + +\strong{Imputations} +The imputations using Random Forest requires more time to compute +and can take several minutes and hours depending on the size of +the dataset and polymorphism of the species used. e.g. with a +low polymorphic taxa, and a data set containing 30\% missing data, +5 000 haplotypes loci and 500 individuals will require 15 min. +For details about imputations, look into \pkg{stackr} \code{\link{stackr_imputations_module}} +} +\note{ +\code{common.markers} argument should be left to the default: \code{common.markers = TRUE}. + + +\code{monomorphic.out} argument should be left to the default: \code{monomorphic.out = TRUE}. + + +\strong{using a VCF with several SNP per haplotypes/read for input?} + +Please consider keeping just 1 SNP per read, \code{COLONY} doesn't like this +kind of linkage... use the argument \code{snp.ld}. +} +\examples{ +\dontrun{ +tidy.vcf <- tidy_genomic_data( +data = "batch_1.vcf", +whitelist.markers = "whitelist.vcf.txt", +snp.ld = NULL, +common.markers = TRUE, +blacklist.id = "blacklist.id.treefrog.tsv", +strata = "strata.treefrog.tsv", +pop.levels = c("PAN", "COS") +) +} +} +\author{ +Thierry Gosselin \email{thierrygosselin@icloud.com} +} +\references{ +Catchen JM, Amores A, Hohenlohe PA et al. (2011) +Stacks: Building and Genotyping Loci De Novo From Short-Read Sequences. +G3, 1, 171-182. + +Catchen JM, Hohenlohe PA, Bassham S, Amores A, Cresko WA (2013) +Stacks: an analysis tool set for population genomics. +Molecular Ecology, 22, 3124-3140. + +Ishwaran H. and Kogalur U.B. (2015). Random Forests for Survival, + Regression and Classification (RF-SRC), R package version 1.6.1. + +Ishwaran H. and Kogalur U.B. (2007). Random survival forests +for R. R News 7(2), 25-31. + +Ishwaran H., Kogalur U.B., Blackstone E.H. and Lauer M.S. 
(2008). +Random survival forests. Ann. Appl. Statist. 2(3), 841--860. + +Jones OR, Wang J (2010) COLONY: a program for parentage and +sibship inference from multilocus genotype data. +Molecular Ecology Resources, 10, 551–555. + +Wang J (2012) Computationally Efficient Sibship and +Parentage Assignment from Multilocus Marker Data. Genetics, 191, 183–194. +} +\seealso{ +\code{COLONY} is available on Jinliang Wang's website: +\url{http://www.zsl.org/science/software/colony} + +\code{randomForestSRC} is available on +CRAN \url{http://cran.r-project.org/web/packages/randomForestSRC/} +and GitHub \url{https://github.com/ehrlinger/randomForestSRC} +} +
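For readers who want to try the new converter, below is a minimal usage sketch assembled from the \usage and \arguments sections documented above. It is illustrative only: the input VCF and strata file names are borrowed from the example in the help page, the output prefix is a hypothetical placeholder, and the option values shown are documented defaults or documented choices, not recommendations.

# Minimal sketch (not run): write a COLONY input file from a VCF.
# File names are placeholders; see ?write_colony for the full argument list.
library(stackr)
write_colony(
  data = "batch_1.vcf",            # one of the supported input formats
  strata = "strata.treefrog.tsv",  # strata file is required for VCF input
  snp.ld = "first",                # keep 1 SNP per read; COLONY dislikes this kind of linkage
  common.markers = TRUE,           # keep markers genotyped in all populations (default)
  sample.markers = 500,            # random subset of markers to speed-test COLONY
  allele.freq = "overall",         # compute allele frequencies from all samples
  run.length = 2,                  # 1/2/3/4 = short/medium/long/very long run
  imputation.method = NULL,        # no imputation; use "rf" or "max" to impute missing genotypes
  filename = "colony.treefrog"     # name for the COLONY file written in the working directory
)

Starting with a small sample.markers value is a cheap way to confirm that the resulting COLONY input file is accepted by the software before committing to a long, full-marker run.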