diff --git a/.svn/entries b/.svn/entries deleted file mode 100644 index 48082f7..0000000 --- a/.svn/entries +++ /dev/null @@ -1 +0,0 @@ -12 diff --git a/.svn/format b/.svn/format deleted file mode 100644 index 48082f7..0000000 --- a/.svn/format +++ /dev/null @@ -1 +0,0 @@ -12 diff --git a/.svn/pristine/02/02b2e852092a4f453afaa99edb670326549a1ad2.svn-base b/.svn/pristine/02/02b2e852092a4f453afaa99edb670326549a1ad2.svn-base deleted file mode 100644 index bbec726..0000000 --- a/.svn/pristine/02/02b2e852092a4f453afaa99edb670326549a1ad2.svn-base +++ /dev/null @@ -1,44 +0,0 @@ -\name{LogNMulti} -\alias{LogNMulti} -%- Also NEED an '\alias' for EACH other topic documented here. -\title{ -EM algorithm for the NB-beta model in the multiple condition test -} -\description{ -'LogNMulti' specifies the function to run (one round of) the EM algorithm for the NB-beta model in the multiple condition test.} -\usage{ -LogNMulti(Input, InputSP, EmpiricalR, EmpiricalRSP, - NumOfEachGroup, AlphaIn, BetaIn, PIn, - NoneZeroLength, AllParti, Conditions) -} - -\arguments{ - \item{Input, InputSP}{The expressions among all the samples.} - \item{NumOfEachGroup}{Number of genes in each Ng group.} - \item{AlphaIn, PIn, BetaIn, EmpiricalR, EmpiricalRSP}{The parameters from the last EM step.} - \item{NoneZeroLength}{Number of Ng groups.} - \item{AllParti}{The patterns of interests.} - \item{Conditions}{The condition assignment for each sample.} -} - -\references{ -Ning Leng, John A. Dawson, James A. Thomson, Victor Ruotti, Anna I. Rissman, Bart M.G. Smits, Jill D. Haag, Michael N. Gould, Ron M. Stewart, and Christina Kendziorski. EBSeq: An empirical Bayes hierarchical model for inference in RNA-seq experiments. 
Bioinformatics (2013) - -} -\author{ -Ning Leng -} - - -\examples{ -# - -#Input = matrix(rnorm(100,100,1),ncol=10) -#rownames(Input) = paste("g",1:10) -#RIn = matrix(rnorm(100,200,1), ncol=10) -#res = LogNMulti(Input, list(Input[,1:5], Input[,6:10]), -# RIn, list(RIn[,1:5], RIn[,6:10]), 10, .6, .7, -# c(.3,.7), 1, rbind(c(1,1), c(1,2)), -# as.factor(rep(c("C1","C2"), each=5))) -} - diff --git a/.svn/pristine/06/06bac27d56ba50532bc5c061619e0385095bd5cd.svn-base b/.svn/pristine/06/06bac27d56ba50532bc5c061619e0385095bd5cd.svn-base deleted file mode 100644 index 6c36a00..0000000 --- a/.svn/pristine/06/06bac27d56ba50532bc5c061619e0385095bd5cd.svn-base +++ /dev/null @@ -1,51 +0,0 @@ -DenNHist <- -function(EBOut, GeneLevel=F) -{ - if(!"Alpha"%in%names(EBOut))stop("The input doesn't seem like an output from EBTest/EBMultiTest") - maxround=nrow(EBOut$Alpha) - Alpha=EBOut$Alpha[maxround,] - Beta=EBOut$Beta[maxround,] - # Multi - if(!is.null(EBOut$PPpattern)){ - QList=EBOut$QList - for(i in 1:length(EBOut$QList)){ - for(j in 1:length(EBOut$QList[[i]])){ - if(GeneLevel==F)Main=paste("Ig",i,"C",j) - if(GeneLevel==T)Main=paste("Gene","C",j) - hist(QList[[i]][[j]][QList[[i]][[j]]<.98&QList[[i]][[j]]>0], - prob=T,col="blue",breaks=100, - main=Main, - xlim=c(0,1),xlab=paste("Q alpha=",round(Alpha,2), - " beta=",round(Beta[i],2),sep="")) - tmpSize=length(QList[[i]][[j]][QList[[i]][[j]]<.98]) - tmpseq=seq(0.001,1,length=1000) - ll=tmpseq - lines(ll,dbeta(ll,Alpha,Beta[i]),col="green",lwd=2) - legend("topright",c("Data","Fitted density"),col=c("blue","green"),lwd=2) - } - } - } - - if(is.null(EBOut$PPpattern)){ - for(con in 1:2){ - if(con==1)QList=EBOut$QList1 - if(con==2)QList=EBOut$QList2 - if(!is.list(QList)) QList=list(QList) - for (i in 1:length(QList)){ - if(GeneLevel==F)Main=paste("Ig",i,"C",con) - if(GeneLevel==T)Main=paste("Gene","C",con) - hist(QList[[i]][QList[[i]]<.98&QList[[i]]>0], - prob=T,col="blue",breaks=100, - main=Main, - xlim=c(0,1),xlab=paste("Q 
alpha=",round(Alpha,2), - " beta=",round(Beta[i],2),sep="")) - tmpSize=length(QList[[i]][QList[[i]]<.98]) - tmpseq=seq(0.001,1,length=1000) - ll=tmpseq - lines(ll,dbeta(ll,Alpha,Beta[i]),col="green",lwd=2) - legend("topright",c("Data","Fitted density"),col=c("blue","green"),lwd=2) - }} - } - - } - diff --git a/.svn/pristine/07/073be145bab3cd95c45898c7f0d2e23f097d1279.svn-base b/.svn/pristine/07/073be145bab3cd95c45898c7f0d2e23f097d1279.svn-base deleted file mode 100644 index 9fc47ff..0000000 --- a/.svn/pristine/07/073be145bab3cd95c45898c7f0d2e23f097d1279.svn-base +++ /dev/null @@ -1,38 +0,0 @@ -\name{MedianNorm} -\alias{MedianNorm} -\title{ -Median Normalization -} -\description{ -'MedianNorm' specifies the median normalization function from Anders et. al., 2010. -} -\usage{ -MedianNorm(Data) -} -\arguments{ - \item{Data}{The data matrix with transcripts in rows and lanes in columns.} -} - -\value{The function will return a vector contains the normalization factor for each lane.} - -\references{ -Simon Anders and Wolfgang Huber. Differential expression analysis for sequence count data. 
-Genome Biology (2010) 11:R106 (open access) -} -\author{ -Ning Leng -} - - -\seealso{ -QuantileNorm -} -\examples{ -data(GeneMat) -Sizes = MedianNorm(GeneMat) -#EBOut = EBTest(Data = GeneMat, -# Conditions = as.factor(rep(c("C1","C2"), each=5)), -# sizeFactors = Sizes, maxround = 5) - -} -\keyword{ Normalization } diff --git a/.svn/pristine/07/07f79f33a75d66b902c4f2a64e5b782708f17f8e.svn-base b/.svn/pristine/07/07f79f33a75d66b902c4f2a64e5b782708f17f8e.svn-base deleted file mode 100644 index ce2dd7e..0000000 --- a/.svn/pristine/07/07f79f33a75d66b902c4f2a64e5b782708f17f8e.svn-base +++ /dev/null @@ -1,425 +0,0 @@ -EBMultiTest <- -function(Data,NgVector=NULL,Conditions,AllParti=NULL, sizeFactors, maxround, Pool=F, NumBin=1000, ApproxVal=10^-10,PoolLower=.25, PoolUpper=.75,Print=T,Qtrm=.75,QtrmCut=10) -{ - if(!is.factor(Conditions))Conditions=as.factor(Conditions) - if(is.null(rownames(Data)))stop("Please add gene/isoform names to the data matrix") - if(!is.matrix(Data))stop("The input Data is not a matrix") - if(length(Conditions)!=ncol(Data))stop("The number of conditions is not the same as the number of samples! 
") - if(nlevels(Conditions)==2)stop("Only 2 conditions - Please use EBTest() function") - if(nlevels(Conditions)<2)stop("Less than 2 conditions - Please check your input") - if(length(sizeFactors)!=length(Data) & length(sizeFactors)!=ncol(Data)) - stop("The number of library size factors is not the same as the number of samples!") - - - tau=CI=CIthre=NULL - Dataraw=Data - - - #Normalized - DataNorm=GetNormalizedMat(Data, sizeFactors) - - - - QuantileFor0=apply(DataNorm,1,function(i)quantile(i,Qtrm)) - AllZeroNames=which(QuantileFor0<=QtrmCut) - NotAllZeroNames=which(QuantileFor0>QtrmCut) - if(length(AllZeroNames)>0 & Print==T) - cat(paste0("Removing transcripts with ",Qtrm*100, - " th quantile < = ",QtrmCut," \n", - length(NotAllZeroNames)," transcripts will be tested \n")) - if(length(NotAllZeroNames)==0)stop("0 transcript passed") - Data=Data[NotAllZeroNames,] - - if(!is.null(NgVector))NgVector=NgVector[NotAllZeroNames] - if(is.null(NgVector))NgVector=rep(1,nrow(Data)) - - - #ReNameThem - IsoNamesIn=rownames(Data) - Names=paste("I",c(1:dim(Data)[1]),sep="") - names(IsoNamesIn)=Names - rownames(Data)=paste("I",c(1:dim(Data)[1]),sep="") - names(NgVector)=paste("I",c(1:dim(Data)[1]),sep="") - - # If PossibleCond==NULL, use all combinations - NumCond=nlevels(Conditions) - CondLevels=levels(Conditions) - #library(blockmodeling) - if(is.null(AllParti)){ - AllPartiList=sapply(1:NumCond,function(i)nkpartitions(NumCond,i)) - AllParti=do.call(rbind,AllPartiList) - colnames(AllParti)=CondLevels - rownames(AllParti)=paste("Pattern",1:nrow(AllParti),sep="") - } - if(length(sizeFactors)==length(Data)){ - rownames(sizeFactors)=rownames(Data) - colnames(sizeFactors)=Conditions - } - - - NoneZeroLength=nlevels(as.factor(NgVector)) - NameList=sapply(1:NoneZeroLength,function(i)names(NgVector)[NgVector==i],simplify=F) - DataList=sapply(1:NoneZeroLength , function(i) Data[NameList[[i]],],simplify=F) - names(DataList)=names(NameList) - - NumEachGroup=sapply(1:NoneZeroLength , 
function(i)dim(DataList)[i]) - # Unlist - DataList.unlist=do.call(rbind, DataList) - - # Divide by SampleSize factor - - if(length(sizeFactors)==ncol(Data)) - DataList.unlist.dvd=t(t( DataList.unlist)/sizeFactors) - - if(length(sizeFactors)==length(Data)) - DataList.unlist.dvd=DataList.unlist/sizeFactors - - # Pool or Not - if(Pool==T){ - DataforPoolSP.dvd=MeanforPoolSP.dvd=vector("list",NumCond) - for(lv in 1:NumCond){ - DataforPoolSP.dvd[[lv]]=matrix(DataList.unlist.dvd[,Conditions==levels(Conditions)[lv]],nrow=dim(DataList.unlist)[1]) - MeanforPoolSP.dvd[[lv]]=rowMeans(DataforPoolSP.dvd[[lv]]) - } - MeanforPool.dvd=rowMeans(DataList.unlist.dvd) - NumInBin=floor(dim(DataList.unlist)[1]/NumBin) - StartSeq=c(0:(NumBin-1))*NumInBin+1 - EndSeq=c(StartSeq[-1]-1,dim(DataList.unlist)[1]) - MeanforPool.dvd.Sort=sort(MeanforPool.dvd,decreasing=T) - MeanforPool.dvd.Order=order(MeanforPool.dvd,decreasing=T) - PoolGroups=sapply(1:NumBin,function(i)(names(MeanforPool.dvd.Sort)[StartSeq[i]:EndSeq[i]]),simplify=F) - #FCforPool=MeanforPoolSP.dvd1/MeanforPoolSP.dvd2 - # Use GeoMean of every two-group partition - Parti2=nkpartitions(NumCond,2) - FCForPoolList=sapply(1:nrow(Parti2),function(i)rowMeans(do.call(cbind, - MeanforPoolSP.dvd[Parti2[i,]==1]))/ - rowMeans(do.call(cbind,MeanforPoolSP.dvd[Parti2[i,]==2])), - simplify=F) - FCForPoolMat=do.call(cbind,FCForPoolList) - FCforPool=apply(FCForPoolMat,1,function(i)exp(mean(log(i)))) - names(FCforPool)=names(MeanforPool.dvd) - FC_Use=names(FCforPool)[which(FCforPool>=quantile(FCforPool[!is.na(FCforPool)],PoolLower) & FCforPool<=quantile(FCforPool[!is.na(FCforPool)],PoolUpper))] - PoolGroupVar=sapply(1:NumBin,function(i)(mean(apply(matrix(DataList.unlist[PoolGroups[[i]][PoolGroups[[i]]%in%FC_Use],],ncol=ncol(DataList.unlist)),1,var)))) - PoolGroupVarInList=sapply(1:NumBin,function(i)(rep(PoolGroupVar[i],length(PoolGroups[[i]]))),simplify=F) - PoolGroupVarVector=unlist(PoolGroupVarInList) - 
VarPool=PoolGroupVarVector[MeanforPool.dvd.Order] - names(VarPool)=names(MeanforPool.dvd) - } - - DataListSP=vector("list",nlevels(Conditions)) - DataListSP.dvd=vector("list",nlevels(Conditions)) - SizeFSP=DataListSP - MeanSP=DataListSP - VarSP=DataListSP - GetPSP=DataListSP - RSP=DataListSP - CISP=DataListSP - tauSP=DataListSP - - NumEachCondLevel=summary(Conditions) - if(Pool==F & is.null(CI)) CondLevelsUse=CondLevels[NumEachCondLevel>1] - if(Pool==T | !is.null(CI)) CondLevelsUse=CondLevels - NumCondUse=length(CondLevelsUse) - - for (lv in 1:nlevels(Conditions)){ - DataListSP[[lv]]= matrix(DataList.unlist[,Conditions==levels(Conditions)[lv]],nrow=dim(DataList.unlist)[1]) - rownames(DataListSP[[lv]])=rownames(DataList.unlist) - DataListSP.dvd[[lv]]= matrix(DataList.unlist.dvd[,Conditions==levels(Conditions)[lv]],nrow=dim(DataList.unlist.dvd)[1]) - if(ncol(DataListSP[[lv]])==1 & Pool==F & !is.null(CI)){ - CISP[[lv]]=matrix(CI[,Conditions==levels(Conditions)[lv]],nrow=dim(DataList.unlist.dvd)[1]) - tauSP[[lv]]=matrix(tau[,Conditions==levels(Conditions)[lv]],nrow=dim(DataList.unlist.dvd)[1]) - } - # no matter sizeFactors is a vector or a matrix. 
Matrix should be columns are the normalization factors - # may input one for each - if(length(sizeFactors)==ncol(Data))SizeFSP[[lv]]=sizeFactors[Conditions==levels(Conditions)[lv]] - if(length(sizeFactors)==length(Data))SizeFSP[[lv]]=sizeFactors[,Conditions==levels(Conditions)[lv]] - - MeanSP[[lv]]=rowMeans(DataListSP.dvd[[lv]]) - names(MeanSP[[lv]])=rownames(DataListSP[[lv]]) - - if(length(sizeFactors)==ncol(Data))PrePareVar=sapply(1:ncol( DataListSP[[lv]]),function(i)( DataListSP[[lv]][,i]- SizeFSP[[lv]][i]*MeanSP[[lv]])^2 /SizeFSP[[lv]][i]) - if(length(sizeFactors)==length(Data))PrePareVar=sapply(1:ncol( DataListSP[[lv]]),function(i)( DataListSP[[lv]][,i]- SizeFSP[[lv]][,i]*MeanSP[[lv]])^2 /SizeFSP[[lv]][,i]) - - if(ncol(DataListSP[[lv]])==1 & Pool==F & !is.null(CI)) - VarSP[[lv]]=as.vector(((DataListSP[[lv]]/tauSP[[lv]]) * CISP[[lv]]/(CIthre*2))^2) - if( Pool==T){ - VarSP[[lv]]=VarPool - } - if(ncol(DataListSP[[lv]])!=1){ - VarSP[[lv]]=rowSums(PrePareVar)/ncol( DataListSP[[lv]]) - names(VarSP[[lv]])=rownames(DataList.unlist) - GetPSP[[lv]]=MeanSP[[lv]]/VarSP[[lv]] - RSP[[lv]]=MeanSP[[lv]]*GetPSP[[lv]]/(1-GetPSP[[lv]]) - } - names(MeanSP[[lv]])=rownames(DataList.unlist) - } - - # Get Empirical R - # POOL R??? 
- MeanList=rowMeans(DataList.unlist.dvd) - VarList=apply(DataList.unlist.dvd, 1, var) - - if(NumCondUse!=0){ - Varcbind=do.call(cbind,VarSP[CondLevels%in%CondLevelsUse]) - PoolVarSpeedUp_MDFPoi_NoNormVarList=rowMeans(Varcbind) - VarrowMin=apply(Varcbind,1,min) - - } - if(NumCondUse==0) - { - NumFCgp=choose(NumCond,2) - FC_Use_tmp=vector("list",NumFCgp) - aa=1 - for(k1 in 1:(NumCond-1)){ - for(k2 in (k1+1):NumCond){ - FCforPool=DataList.unlist.dvd[,k1]/DataList.unlist.dvd[,k2] - names(FCforPool)=rownames(DataList.unlist.dvd) - FC_Use_tmp[[aa]]=names(FCforPool)[which(FCforPool>=quantile(FCforPool[!is.na(FCforPool)],.25) & - FCforPool<=quantile(FCforPool[!is.na(FCforPool)],.75))] - aa=aa+1 - }} - FC_Use=Reduce(intersect,FC_Use_tmp) - if(length(FC_Use)==0){ - All_candi=unlist(FC_Use_tmp) - FC_Use=names(table(All_candi))[1:3] - - } - Var_FC_Use=apply( DataList.unlist.dvd[FC_Use,],1,var ) - MeanforPool=apply( DataList.unlist.dvd,1,mean ) - Mean_FC_Use=apply( DataList.unlist.dvd[FC_Use,],1,mean ) - FC_Use2=which(Var_FC_Use>=Mean_FC_Use) - Var_FC_Use2=Var_FC_Use[FC_Use2] - Mean_FC_Use2=Mean_FC_Use[FC_Use2] - Phi=mean((Var_FC_Use2-Mean_FC_Use2)/Mean_FC_Use2^2) - VarEst= MeanforPool*(1+MeanforPool*Phi) - if(Print==T)message(paste("No Replicate - estimate phi",round(Phi,5), "\n")) - Varcbind=VarEst - PoolVarSpeedUp_MDFPoi_NoNormVarList=VarEst - VarrowMin=VarEst - - } - - - - GetP=MeanList/PoolVarSpeedUp_MDFPoi_NoNormVarList - - EmpiricalRList=MeanList*GetP/(1-GetP) - # sep - #Rcb=cbind(RSP[[1]],RSP[[2]]) - #Rbest=apply(Rcb,1,function(i)max(i[!is.na(i) & i!=Inf])) - EmpiricalRList[EmpiricalRList==Inf] =max(EmpiricalRList[EmpiricalRList!=Inf]) - # fine - # - GoodData=names(MeanList)[EmpiricalRList>0 & VarrowMin!=0 & EmpiricalRList!=Inf & !is.na(VarrowMin) & !is.na(EmpiricalRList)] - NotIn=names(MeanList)[EmpiricalRList<=0 | VarrowMin==0 | EmpiricalRList==Inf | is.na(VarrowMin) | is.na(EmpiricalRList)] - #NotIn.BestR=Rbest[NotIn.raw] - 
#NotIn.fix=NotIn.BestR[which(NotIn.BestR>0)] - #EmpiricalRList[names(NotIn.fix)]=NotIn.fix - #print(paste("ZeroVar",sum(VarrowMin==0), "InfR", length(which(EmpiricalRList==Inf)), "Poi", length(which(EmpiricalRList<0)), "")) - #GoodData=c(GoodData.raw,names(NotIn.fix)) - #NotIn=NotIn.raw[!NotIn.raw%in%names(NotIn.fix)] - EmpiricalRList.NotIn=EmpiricalRList[NotIn] - EmpiricalRList.Good=EmpiricalRList[GoodData] - EmpiricalRList.Good[EmpiricalRList.Good<1]=1+EmpiricalRList.Good[EmpiricalRList.Good<1] - if(length(sizeFactors)==ncol(Data)) - EmpiricalRList.Good.mat= outer(EmpiricalRList.Good, sizeFactors) - if(length(sizeFactors)==length(Data)) - EmpiricalRList.Good.mat=EmpiricalRList.Good* sizeFactors[GoodData,] - - - # Only Use Data has Good q's - DataList.In=sapply(1:NoneZeroLength, function(i)DataList[[i]][GoodData[GoodData%in%rownames(DataList[[i]])],],simplify=F) - DataList.NotIn=sapply(1:NoneZeroLength, function(i)DataList[[i]][NotIn[NotIn%in%rownames(DataList[[i]])],],simplify=F) - DataListIn.unlist=do.call(rbind, DataList.In) - DataListNotIn.unlist=do.call(rbind, DataList.NotIn) - - DataListSPIn=vector("list",nlevels(Conditions)) - DataListSPNotIn=vector("list",nlevels(Conditions)) - EmpiricalRList.Good.mat.SP=vector("list",nlevels(Conditions)) - for (lv in 1:nlevels(Conditions)){ - DataListSPIn[[lv]]= matrix(DataListIn.unlist[,Conditions==levels(Conditions)[lv]],nrow=dim(DataListIn.unlist)[1]) - if(length(NotIn)>0) DataListSPNotIn[[lv]]= matrix(DataListNotIn.unlist[,Conditions==levels(Conditions)[lv]],nrow=dim(DataListNotIn.unlist)[1]) - rownames(DataListSPIn[[lv]])=rownames(DataListIn.unlist) - if(length(NotIn)>0)rownames(DataListSPNotIn[[lv]])=rownames(DataListNotIn.unlist) - EmpiricalRList.Good.mat.SP[[lv]]=matrix(EmpiricalRList.Good.mat[,Conditions==levels(Conditions)[lv]],nrow=dim(EmpiricalRList.Good.mat)[1]) - } - - NumOfEachGroupIn=sapply(1:NoneZeroLength, function(i)max(0,dim(DataList.In[[i]])[1])) - NumOfEachGroupNotIn=sapply(1:NoneZeroLength, 
function(i)max(0,dim(DataList.NotIn[[i]])[1])) - - #Initialize SigIn & ... - AlphaIn=0.5 - BetaIn=rep(0.5,NoneZeroLength) - PIn=rep(1/nrow(AllParti),nrow(AllParti)) - - ####use while to make an infinity round? - UpdateAlpha=NULL - UpdateBeta=NULL - UpdateP=NULL - UpdatePFromZ=NULL - Timeperround=NULL - for (times in 1:maxround){ - temptime1=proc.time() - UpdateOutput=suppressWarnings(LogNMulti(DataListIn.unlist,DataListSPIn, EmpiricalRList.Good.mat ,EmpiricalRList.Good.mat.SP, - NumOfEachGroupIn, AlphaIn, BetaIn, PIn, NoneZeroLength, AllParti,Conditions)) - message(paste("iteration", times, "done \n",sep=" ")) - AlphaIn=UpdateOutput$AlphaNew - BetaIn=UpdateOutput$BetaNew - PIn=UpdateOutput$PNew - PFromZ=UpdateOutput$PFromZ - FOut=UpdateOutput$FGood - UpdateAlpha=rbind(UpdateAlpha,AlphaIn) - UpdateBeta=rbind(UpdateBeta,BetaIn) - UpdateP=rbind(UpdateP,PIn) - UpdatePFromZ=rbind(UpdatePFromZ,PFromZ) - temptime2=proc.time() - Timeperround=c(Timeperround,temptime2[3]-temptime1[3]) - message(paste("time" ,round(Timeperround[times],2),"\n",sep=" ")) - Z.output=UpdateOutput$ZEachGood - Z.NA.Names=UpdateOutput$zNaNName - } - #Remove this } after testing!! 
- -# if (times!=1){ -# if((UpdateAlpha[times]-UpdateAlpha[times-1])^2+UpdateBeta[times]-UpdateBeta[times-1])^2+UpdateR[times]-UpdateR[times-1])^2+UpdateP[times]-UpdateP[times-1])^2<=10^(-6)){ -# Result=list(Sig=SigIn, Miu=MiuIn, Tau=TauIn) -# break -# } -# } -#} - -##########Change Names############ -## Only z are for Good Ones -## Others are for ALL Data -GoodData=GoodData[!GoodData%in%Z.NA.Names] -IsoNamesIn.Good=as.vector(IsoNamesIn[GoodData]) -RealName.Z.output=Z.output -RealName.F=FOut -rownames(RealName.Z.output)=IsoNamesIn.Good -rownames(RealName.F)=IsoNamesIn.Good - -RealName.EmpiricalRList=sapply(1:NoneZeroLength,function(i)EmpiricalRList[names(EmpiricalRList)%in%NameList[[i]]], simplify=F) -RealName.MeanList=sapply(1:NoneZeroLength,function(i)MeanList[names(MeanList)%in%NameList[[i]]], simplify=F) -RealName.SPMeanList=sapply(1:NoneZeroLength,function(i)sapply(1:length(MeanSP), function(j)MeanSP[[j]][names(MeanSP[[j]])%in%NameList[[i]]],simplify=F), simplify=F) -RealName.SPVarList=sapply(1:NoneZeroLength,function(i)sapply(1:length(VarSP), function(j)VarSP[[j]][names(VarSP[[j]])%in%NameList[[i]]],simplify=F), simplify=F) -RealName.DataList=sapply(1:NoneZeroLength,function(i)DataList[[i]][rownames(DataList[[i]])%in%NameList[[i]],], simplify=F) - -RealName.VarList=sapply(1:NoneZeroLength,function(i)VarList[names(VarList)%in%NameList[[i]]], simplify=F) -RealName.PoolVarList=sapply(1:NoneZeroLength,function(i)PoolVarSpeedUp_MDFPoi_NoNormVarList[names(PoolVarSpeedUp_MDFPoi_NoNormVarList)%in%NameList[[i]]], simplify=F) -RealName.QList=sapply(1:NoneZeroLength,function(i)sapply(1:length(GetPSP), function(j)GetPSP[[j]][names(GetPSP[[j]])%in%NameList[[i]]],simplify=F), simplify=F) - - -for (i in 1:NoneZeroLength){ -tmp=NameList[[i]] -Names=IsoNamesIn[tmp] -RealName.MeanList[[i]]=RealName.MeanList[[i]][NameList[[i]]] -RealName.VarList[[i]]=RealName.VarList[[i]][NameList[[i]]] - for(j in 1:NumCond){ - 
RealName.SPMeanList[[i]][[j]]=RealName.SPMeanList[[i]][[j]][NameList[[i]]] - if(!is.null(RealName.QList[[i]][[j]])){ - RealName.QList[[i]][[j]]=RealName.QList[[i]][[j]][NameList[[i]]] - RealName.SPVarList[[i]][[j]]=RealName.SPVarList[[i]][[j]][NameList[[i]]] - names(RealName.QList[[i]][[j]])=Names - names(RealName.SPVarList[[i]][[j]])=Names - } - names(RealName.SPMeanList[[i]][[j]])=Names - } -RealName.EmpiricalRList[[i]]=RealName.EmpiricalRList[[i]][NameList[[i]]] -RealName.PoolVarList[[i]]=RealName.PoolVarList[[i]][NameList[[i]]] -RealName.DataList[[i]]=RealName.DataList[[i]][NameList[[i]],] - -names(RealName.MeanList[[i]])=Names -names(RealName.VarList[[i]])=Names - -names(RealName.EmpiricalRList[[i]])=Names -names(RealName.PoolVarList[[i]])=Names -rownames(RealName.DataList[[i]])=Names - -} - - -#########posterior part for other data set here later############ -AllNA=unique(c(Z.NA.Names,NotIn)) -AllZ=NULL -AllF=NULL -if(length(AllNA)==0){ - AllZ=RealName.Z.output[IsoNamesIn,] - AllF=RealName.F[IsoNamesIn,] -} -ZEachNA=NULL -if (length(AllNA)>0){ - Ng.NA=NgVector[AllNA] - AllNA.Ngorder=AllNA[order(Ng.NA)] - NumOfEachGroupNA=rep(0,NoneZeroLength) - NumOfEachGroupNA.tmp=tapply(Ng.NA,Ng.NA,length) - names(NumOfEachGroupNA)=c(1:NoneZeroLength) - NumOfEachGroupNA[names(NumOfEachGroupNA.tmp)]=NumOfEachGroupNA.tmp - PNotIn=rep(1-ApproxVal,length(AllNA.Ngorder)) - MeanList.NotIn=MeanList[AllNA.Ngorder] - R.NotIn.raw=MeanList.NotIn*PNotIn/(1-PNotIn) - if(length(sizeFactors)==ncol(Data)) - R.NotIn=matrix(outer(R.NotIn.raw,sizeFactors),nrow=length(AllNA.Ngorder)) - if(length(sizeFactors)==length(Data)) - R.NotIn=matrix(R.NotIn.raw*sizeFactors[NotIn,],nrow=length(AllNA.Ngorder)) - - DataListNotIn.unlistWithZ=matrix(DataList.unlist[AllNA.Ngorder,], - nrow=length(AllNA.Ngorder)) - rownames(DataListNotIn.unlistWithZ)=AllNA.Ngorder - DataListSPNotInWithZ=vector("list",nlevels(Conditions)) - RListSPNotInWithZ=vector("list",nlevels(Conditions)) - for (lv in 1:nlevels(Conditions)) 
{ - DataListSPNotInWithZ[[lv]] = matrix(DataListSP[[lv]][AllNA.Ngorder,],nrow=length(AllNA.Ngorder)) - RListSPNotInWithZ[[lv]]=matrix(R.NotIn[,Conditions==levels(Conditions)[lv]],nrow=length(AllNA.Ngorder)) - } - - FListNA=sapply(1:nrow(AllParti),function(i)sapply(1:nlevels(as.factor(AllParti[i,])), - function(j)f0(do.call(cbind, DataListSPNotInWithZ[AllParti[i,]==j]),AlphaIn, BetaIn, - do.call(cbind,RListSPNotInWithZ[AllParti[i,]==j]), NumOfEachGroupNA, log=T)), - simplify=F) - for(ii in 1:length(FListNA)) - FListNA[[ii]]=matrix(FListNA[[ii]],nrow=length(AllNA.Ngorder)) - FPartiLogNA=matrix(sapply(FListNA,rowSums),nrow=length(AllNA.Ngorder)) - FMatNA=exp(FPartiLogNA+600) - - rownames(FMatNA)=rownames(DataListNotIn.unlistWithZ) - PMatNA=matrix(rep(1,nrow(DataListNotIn.unlistWithZ)),ncol=1)%*%matrix(PIn,nrow=1) - FmultiPNA=matrix(FMatNA*PMatNA,nrow=length(AllNA.Ngorder)) - DenomNA=rowSums(FmultiPNA) - ZEachNA=matrix(apply(FmultiPNA,2,function(i)i/DenomNA),nrow=length(AllNA.Ngorder)) - - rownames(ZEachNA)=IsoNamesIn[AllNA.Ngorder] - - AllZ=rbind(RealName.Z.output,ZEachNA) - AllZ=AllZ[IsoNamesIn,] - - F.NotIn=FPartiLogNA - rownames(F.NotIn)=IsoNamesIn[rownames(FMatNA)] - AllF=rbind(RealName.F,F.NotIn) - AllF=AllF[IsoNamesIn,] - -} -colnames(AllZ)=rownames(AllParti) -colnames(AllF)=rownames(AllParti) -rownames(UpdateAlpha)=paste("iter",1:nrow(UpdateAlpha),sep="") -rownames(UpdateBeta)=paste("iter",1:nrow(UpdateBeta),sep="") -rownames(UpdateP)=paste("iter",1:nrow(UpdateP),sep="") -rownames(UpdatePFromZ)=paste("iter",1:nrow(UpdatePFromZ),sep="") -colnames(UpdateBeta)=paste("Ng",1:ncol(UpdateBeta),sep="") - -CondOut=levels(Conditions) -names(CondOut)=paste("Condition",c(1:length(CondOut)),sep="") - -AllZWith0=matrix(NA,ncol=ncol(AllZ),nrow=nrow(Dataraw)) -rownames(AllZWith0)=rownames(Dataraw) -colnames(AllZWith0)=colnames(AllZ) -if(is.null(AllZeroNames))AllZWith0=AllZ -if(!is.null(AllZeroNames))AllZWith0[names(NotAllZeroNames),]=AllZ[names(NotAllZeroNames),] - 
-#############Result############################ -Result=list(Alpha=UpdateAlpha,Beta=UpdateBeta,P=UpdateP,PFromZ=UpdatePFromZ, - Z=RealName.Z.output,PoissonZ=ZEachNA, RList=RealName.EmpiricalRList, MeanList=RealName.MeanList, - VarList=RealName.VarList, QList=RealName.QList, SPMean=RealName.SPMeanList, SPEstVar=RealName.SPVarList, - PoolVar=RealName.PoolVarList , DataList=RealName.DataList,PPpattern=AllZ,f=AllF, AllParti=AllParti, - PPMat=AllZ,PPMatWith0=AllZWith0, ConditionOrder=CondOut) -} - diff --git a/.svn/pristine/08/0879045f703cdb6886a159b283b0d615f9de3c9e.svn-base b/.svn/pristine/08/0879045f703cdb6886a159b283b0d615f9de3c9e.svn-base deleted file mode 100644 index 27f5a8d..0000000 --- a/.svn/pristine/08/0879045f703cdb6886a159b283b0d615f9de3c9e.svn-base +++ /dev/null @@ -1,36 +0,0 @@ -\name{PlotPattern} -\alias{PlotPattern} -\title{ -Visualize the patterns -} -\description{ -'PlotPattern' generates the visualized patterns before the multiple condition test. -} -\usage{ -PlotPattern(Patterns) -} -\arguments{ - \item{Patterns}{ -The output of GetPatterns function. -} - -} -\value{ -A heatmap to visualize the patterns of interest. -} - -\references{ -Ning Leng, John A. Dawson, James A. Thomson, Victor Ruotti, Anna I. Rissman, Bart M.G. Smits, Jill D. Haag, Michael N. Gould, Ron M. Stewart, and Christina Kendziorski. EBSeq: An empirical Bayes hierarchical model for inference in RNA-seq experiments. 
Bioinformatics (2013) -} - -\author{ -Ning Leng -} - -\examples{ -Conditions = c("C1","C1","C2","C2","C3","C3") -Patterns = GetPatterns(Conditions) -PlotPattern(Patterns) - -} -\keyword{ patterns } diff --git a/.svn/pristine/0b/0b3cfe0c55129fd2ab73e3a72e21ce1db2b5cd8b.svn-base b/.svn/pristine/0b/0b3cfe0c55129fd2ab73e3a72e21ce1db2b5cd8b.svn-base deleted file mode 100644 index e24d90d..0000000 --- a/.svn/pristine/0b/0b3cfe0c55129fd2ab73e3a72e21ce1db2b5cd8b.svn-base +++ /dev/null @@ -1,6 +0,0 @@ -GetNormalizedMat<-function(Data, Sizes){ -if(length(Sizes)!=length(Data) & length(Sizes)!=ncol(Data)) - stop("The number of library size factors is not the same as the number of samples!") -if(length(Sizes)==length(Data))Out=Data/Sizes -if(length(Sizes)==ncol(Data))Out=t(t(Data)/Sizes) -Out} diff --git a/.svn/pristine/0b/0b7ee0767255d354e83a85faf723b3f5b5397886.svn-base b/.svn/pristine/0b/0b7ee0767255d354e83a85faf723b3f5b5397886.svn-base deleted file mode 100644 index de4abf5..0000000 --- a/.svn/pristine/0b/0b7ee0767255d354e83a85faf723b3f5b5397886.svn-base +++ /dev/null @@ -1,58 +0,0 @@ -\name{PostFC} -\alias{PostFC} -\title{ -Calculate the posterior fold change for each transcript across conditions -} -\description{ -'PostFC' calculates the posterior fold change for each transcript across conditions. -} -\usage{ -PostFC(EBoutput, SmallNum = 0.01) -} -\arguments{ - \item{EBoutput}{ -The ourput from function EBTest. -} - -\item{SmallNum}{A small number will be added for each transcript in each condition to avoid Inf and NA. Default is 0.01.} - -} -\value{ -Provide both FC and posterior FC across two conditions. -FC is calculated as (MeanC1+SmallNum)/(MeanC2+SmallNum). 
-And Posterior FC is calculated as: - -# Post alpha P_a_C1 = alpha + r_C1 * n_C1 - -# Post beta P_b_C1 = beta + Mean_C1 * n_C1 - -# P_q_C1 = P_a_C1 / (P_a_C1 + P_b_C1) - -# Post FC = ((1-P_q_C1)/P_q_c1) / ( (1-P_q_c2)/P_q_c2) - -\item{PostFC}{The posterior FC across two conditions.} -\item{RealFC}{The FC across two conditions (adjusted by the normalization factors).} -\item{Direction}{The diretion of FC calculation.} - -} -\references{ -Ning Leng, John A. Dawson, James A. Thomson, Victor Ruotti, Anna I. Rissman, Bart M.G. Smits, Jill D. Haag, Michael N. Gould, Ron M. Stewart, and Christina Kendziorski. EBSeq: An empirical Bayes hierarchical model for inference in RNA-seq experiments. Bioinformatics (2013) -} -\author{ -Ning Leng -} - - -\seealso{ -EBTest, GetMultiFC -} -\examples{ -data(GeneMat) -GeneMat.small = GeneMat[c(500:550),] -Sizes = MedianNorm(GeneMat.small) -EBOut = EBTest(Data = GeneMat.small, - Conditions = as.factor(rep(c("C1","C2"), each=5)), - sizeFactors = Sizes, maxround = 5) -FC=PostFC(EBOut) -} -\keyword{ Fold Change } diff --git a/.svn/pristine/0d/0d785cf6baa284767fd5dc6de5c4f3763dddf924.svn-base b/.svn/pristine/0d/0d785cf6baa284767fd5dc6de5c4f3763dddf924.svn-base deleted file mode 100644 index 7ca78f2..0000000 --- a/.svn/pristine/0d/0d785cf6baa284767fd5dc6de5c4f3763dddf924.svn-base +++ /dev/null @@ -1,52 +0,0 @@ -\name{GetNg} -\alias{GetNg} -\title{ -Ng Vector -} -\description{ -'GetNg' generates the Ng vector for the isoform level data. -(While using the number of isoform in the host gene to define the uncertainty groups.) -} -\usage{ -GetNg(IsoformName, GeneName, TrunThre = 3) -} -%- maybe also 'usage' for other objects documented here. -\arguments{ - \item{IsoformName}{A vector contains the isoform names.} - \item{GeneName}{The gene names of the isoforms in IsoformNames (Should be in the same order).} -\item{TrunThre}{The number of uncertainty groups the user wish to define. 
The default is 3.} -} -\value{ - \item{GeneNg}{The number of isoforms that are contained in each gene. - } - \item{GeneNgTrun}{The truncated Ng of each gene. (The genes contain more than 3 isoforms are with Ng 3.) - } - \item{IsoformNg}{The Ng of each isoform.} - \item{IsoformNgTrun}{The truncated Ng of each isoform (could be used to define the uncertainty group assignment).} -} - -\references{ -Ning Leng, John A. Dawson, James A. Thomson, Victor Ruotti, Anna I. Rissman, Bart M.G. Smits, Jill D. Haag, Michael N. Gould, Ron M. Stewart, and Christina Kendziorski. EBSeq: An empirical Bayes hierarchical model for inference in RNA-seq experiments. Bioinformatics (2013) -} -\author{ -Ning Leng -} - -\examples{ - -data(IsoList) - -IsoMat = IsoList$IsoMat -IsoNames = IsoList$IsoNames -IsosGeneNames = IsoList$IsosGeneNames -IsoSizes = MedianNorm(IsoMat) -NgList = GetNg(IsoNames, IsosGeneNames) - -#IsoNgTrun = NgList$IsoformNgTrun -#IsoEBOut = EBTest(Data = IsoMat, NgVector = IsoNgTrun, -# Conditions = as.factor(rep(c("C1","C2"), each=5)), -# sizeFactors = IsoSizes, maxround = 5) - -} - -\keyword{ Ng } diff --git a/.svn/pristine/0e/0e8be11e6a889a10ffaaf6da9ad5c8ee7616dcd4.svn-base b/.svn/pristine/0e/0e8be11e6a889a10ffaaf6da9ad5c8ee7616dcd4.svn-base deleted file mode 100644 index fd4ee6a..0000000 --- a/.svn/pristine/0e/0e8be11e6a889a10ffaaf6da9ad5c8ee7616dcd4.svn-base +++ /dev/null @@ -1,19 +0,0 @@ -\name{IsoMultiList} -\alias{IsoMultiList} -\docType{data} -\title{ -The simulated data for multiple condition isoform DE analysis -} -\description{ -'IsoMultiList' gives a set of simulated data for multiple condition isoform DE analysis. -} -\usage{data(IsoMultiList)} -\source{ -Ning Leng, John A. Dawson, James A. Thomson, Victor Ruotti, Anna I. Rissman, Bart M.G. Smits, Jill D. Haag, Michael N. Gould, Ron M. Stewart, and Christina Kendziorski. EBSeq: An empirical Bayes hierarchical model for inference in RNA-seq experiments. 
Bioinformatics (2013) -} -\seealso{IsoList -} -\examples{ -data(IsoMultiList) -} -\keyword{datasets} diff --git a/.svn/pristine/10/104d01c6007452890419dd054f0503083aafe689.svn-base b/.svn/pristine/10/104d01c6007452890419dd054f0503083aafe689.svn-base deleted file mode 100644 index 310e363..0000000 --- a/.svn/pristine/10/104d01c6007452890419dd054f0503083aafe689.svn-base +++ /dev/null @@ -1,5 +0,0 @@ -GetPPMat <- function(EBout){ - if(!"PPMat"%in%names(EBout))stop("The input doesn't seem like an output from EBTest") - - PP=EBout$PPMat -} diff --git a/.svn/pristine/11/1176fb86556be29b8c324b3de0b8d48cfe4273ca.svn-base b/.svn/pristine/11/1176fb86556be29b8c324b3de0b8d48cfe4273ca.svn-base deleted file mode 100644 index c954850..0000000 --- a/.svn/pristine/11/1176fb86556be29b8c324b3de0b8d48cfe4273ca.svn-base +++ /dev/null @@ -1,7 +0,0 @@ -PlotPattern<-function(Patterns){ - par(oma=c(3,3,3,3)) - PatternCol=rev(rainbow(ncol(Patterns))) - heatmap(Patterns,col=PatternCol,Colv=NA,Rowv=NA,scale="none") - -} - diff --git a/.svn/pristine/14/14b9888566dfd473965e38b9b0a5077c2ba01a9b.svn-base b/.svn/pristine/14/14b9888566dfd473965e38b9b0a5077c2ba01a9b.svn-base deleted file mode 100644 index b384e0c..0000000 --- a/.svn/pristine/14/14b9888566dfd473965e38b9b0a5077c2ba01a9b.svn-base +++ /dev/null @@ -1,1054 +0,0 @@ -%\VignetteIndexEntry{EBSeq} - -\documentclass{article} -\usepackage{fullpage} -\usepackage{graphicx, graphics, epsfig,setspace,amsmath, amsthm} -\usepackage{hyperref} -\usepackage{natbib} -%\usepackage{listings} -\usepackage{moreverb} -\begin{document} -\title{EBSeq: An R package for differential expression analysis using RNA-seq data} -\author{Ning Leng, John Dawson, and Christina Kendziorski} -\maketitle -\tableofcontents -\setcounter{tocdepth}{2} - -\section{Introduction} -EBSeq may be used to identify differentially expressed (DE) -genes and isoforms in an RNA-Seq experiment. 
As detailed in -Leng {\it et al.}, 2013 \cite{Leng13}, -EBSeq is an empirical Bayesian approach that models a number of features -observed in RNA-seq data. Importantly, for isoform level inference, -EBSeq directly accommodates isoform expression estimation uncertainty by -modeling the differential variability observed in distinct groups of isoforms. -Consider Figure 1, where we have plotted variance against mean -for all isoforms using RNA-Seq expression data from Leng {\it et al.}, 2013 \cite{Leng13}. -Also shown is the fit within three sub-groups of isoforms defined -by the number of constituent isoforms of the parent gene. -An isoform of gene $g$ is assigned to the $I_g=k$ group, where $k=1,2,3$, -if the total number of isoforms from gene $g$ is $k$ (the $I_g=3$ group contains -all isoforms from genes having 3 or more isoforms). -As shown in Figure 1, there is decreased variability in the $I_g=1$ group, -but increased variability in the others, due to the relative increase in -uncertainty inherent in estimating isoform expression when multiple isoforms of a given gene are -present. If this structure is not accommodated, there is reduced power for -identifying isoforms in the $I_g=1$ group (since the true variances in that group are -lower, on average, than that derived from the full collection of isoforms) as well as increased -false discoveries in the $I_g=2$ and $I_g=3$ groups (since the true variances are higher, on average, -than those derived from the full collection). EBSeq directly models differential variability -as a function of $I_g$ providing a powerful approach for isoform level inference. As shown in Leng {\it et al.}, 2013 -\cite{Leng13}, the model is also useful for identifying DE genes. -We will briefly detail the model in Section \ref{sec:model} and then describe -the flow of analysis in Section \ref{sec:quickstart} for both isoform and gene-level inference. 
- -\begin{figure}[t] -\centering -\includegraphics[width=0.6\textwidth]{PlotExample.png} -\label{fig:GouldNg} -\caption{Empirical variance vs. mean for -each isoform profiled in the ESCs vs iPSCs experiment detailed in -the Case Study section of Leng {\it et al.}, 2013 \cite{Leng13}. -A spline fit to all isoforms is shown in red with splines fit within the $I_g=1$, $I_g=2$, and $I_g=3$ isoform groups -shown in yellow, pink, and green, respectively.} -\end{figure} - - -\section{Citing this software} -\label{sec:cite} -Please cite the following article when reporting results from the software. - -\noindent Leng, N., J.A. Dawson, J.A. Thomson, V. Ruotti, A.I. Rissman, -B.M.G. Smits, J.D. Haag, M.N. Gould, R.M. Stewart, and C. Kendziorski. -EBSeq: An empirical Bayes hierarchical model for inference in RNA-seq -experiments, {\it Bioinformatics}, 2013. - -\section{The Model} -\label{sec:model} -\subsection{Two conditions} -\label{sec:twocondmodel} -We let $X_{g_i}^{C1} = X_{g_i,1} ,X_{g_i,2}, ...,X_{g_i,S_1}$ denote data from condition 1 -and $ X_{g_i}^{C2} = X_{g_i,(S_1+1)},X_{g_i,(S_1+2)},...,X_{g_i,S}$ data from condition 2. -We assume that counts within condition $C$ are distributed as Negative Binomial: -$X_{g_i,s}^C|r_{g_i,s}, q_{g_i}^C \sim NB(r_{g_i,s}, q_{g_i}^C)$ where -\begin{equation} -P(X_{g_i,s}|r_{g_i,s},q_{g_i}^C) = {X_{g_i,s}+r_{g_i,s}-1\choose X_{g_i,s}}(1-q_{g_i}^C)^{X_{g_i,s}}(q_{g_i}^C)^{r_{g_i,s}}\label{eq:01} -\end{equation} - -\noindent and $\mu_{g_i,s}^C=r_{g_i,s} (1-q_{g_i}^C)/q_{g_i}^C$; -$(\sigma_{g_i,s}^C)^2=r_{g_i,s} (1-q_{g_i}^C)/(q_{g_i}^C)^2.$ - -\medskip - -We assume a prior distribution on $q_{g_i}^C$: $q_{g_i}^C|\alpha, \beta^{I_g} \sim Beta(\alpha, \beta^{I_g})$. -The hyperparameter $\alpha$ is shared by all the isoforms and $\beta^{I_g}$ is $I_g$ specific (note this is an index, not a power). 
-We further assume that $r_{g_i,s}=r_{g_i,0} l_s$, where $r_{g_i,0}$ is an isoform specific -parameter common across conditions and $r_{g_i,s}$ depends on it through the sample-specific normalization factor $l_s$. -Of interest in this two group comparison is distinguishing between two cases, or what we will refer to subsequently as -two patterns of expression, namely equivalent expression (EE) and differential expression (DE): -\begin{center} -$H_0$ (EE) : $q_{g_i}^{C1}=q_{g_i}^{C2}$ vs $H_1$ (DE) : $q_{g_i}^{C1} \neq q_{g_i}^{C2}$. -\end{center} -Under the null hypothesis (EE), the data $X_{g_i}^{C1,C2} = X_{g_i}^{C1}, X_{g_i}^{C2}$ arises -from the prior predictive distribution $f_0^{I_g}(X_{g_i}^{C1,C2})$: -%\tiny -\begin{equation} -f_0^{I_g}(X_{g_i}^{C1,C2})=\Bigg[\prod_{s=1}^S {X_{g_i,s}+r_{g_i,s}-1\choose X_{g_i,s}}\Bigg] -\frac{Beta(\alpha+\sum_{s=1}^S r_{g_i,s}, \beta^{I_g}+\sum_{s=1}^SX_{g_i,s} )}{Beta(\alpha, \beta^{I_g})}\label{eq:05} -\end{equation} -%\normalsize - -Alternatively (in a DE scenario), $X_{g_i}^{C1,C2}$ follows the prior predictive distribution $f_1^{I_g}(X_{g_i}^{C1,C2})$: -\begin{equation} -f_1^{I_g}(X_{g_i}^{C1,C2})=f_0^{I_g}(X_{g_i}^{C1})f_0^{I_g}(X_{g_i}^{C2}) \label{eq:06} -\end{equation} - -Let the latent variable $Z_{g_i}$ be defined so that $Z_{g_i} = 1$ indicates that -isoform $g_i$ is DE and $Z_{g_i} = 0$ indicates isoform $g_i$ is EE, and -$Z_{g_i} \sim Bernoulli(p)$. -Then, the marginal distribution of $X_{g_i}^{C1,C2}$ and $Z_{g_i}$ is: -\begin{equation} -(1-p)f_0^{I_g}(X_{g_i}^{C1,C2}) + pf_1^{I_g}(X_{g_i}^{C1,C2})\label{eq:07} -\end{equation} - -\noindent The posterior probability of being DE at isoform $g_i$ is obtained by Bayes' rule: -\begin{equation} -\frac{pf_1^{I_g}(X_{g_i}^{C1,C2})}{(1-p)f_0^{I_g}(X_{g_i}^{C1,C2}) + pf_1^{I_g}(X_{g_i}^{C1,C2})}\label{eq:08} -\end{equation} - -%\newpage -\subsection{More than two conditions} -\label{sec:multicondmodel} -EBSeq naturally accommodates multiple condition comparisons. 
-For example, in a study with 3 conditions, there are K=5 possible expression patterns (P1,...,P5), or ways in which -latent levels of expression may vary across conditions: -\begin{align} -\textrm {P1:}& \hspace{0.05in} q_{g_i}^{C1} = q_{g_i}^{C2}=q_{g_i}^{C3} \nonumber \\ -\textrm {P2:}& \hspace{0.05in} q_{g_i}^{C1} = q_{g_i}^{C2} \neq q_{g_i}^{C3} \nonumber \\ -\textrm {P3:}& \hspace{0.05in} q_{g_i}^{C1} = q_{g_i}^{C3} \neq q_{g_i}^{C2} \nonumber \\ -\textrm {P4:}& \hspace{0.05in} q_{g_i}^{C1} \neq q_{g_i}^{C2} = q_{g_i}^{C3} \nonumber \\ -\textrm {P5:}& \hspace{0.05in} q_{g_i}^{C1} \neq q_{g_i}^{C2} \neq -q_{g_i}^{C3} \textrm{ and } q_{g_i}^{C1} \neq q_{g_i}^{C3} \nonumber -\end{align} - -\noindent The prior predictive distributions for these are given, respectively, by: - -\begin{align} -g_1^{I_g}(X_{g_i}^{C1,C2,C3}) &= f_0^{I_g}(X_{g_i}^{C1,C2,C3}) \nonumber \\ -g_2^{I_g}(X_{g_i}^{C1,C2,C3}) &= f_0^{I_g}(X_{g_i}^{C1,C2})f_0^{I_g}(X_{g_i}^{C3}) \nonumber \\ -g_3^{I_g}(X_{g_i}^{C1,C2,C3}) &= f_0^{I_g}(X_{g_i}^{C1,C3})f_0^{I_g}(X_{g_i}^{C2}) \nonumber \\ -g_4^{I_g}(X_{g_i}^{C1,C2,C3}) &= f_0^{I_g}(X_{g_i}^{C1})f_0^{I_g}(X_{g_i}^{C2,C3}) \nonumber \\ -g_5^{I_g}(X_{g_i}^{C1,C2,C3}) &= f_0^{I_g}(X_{g_i}^{C1})f_0^{I_g}(X_{g_i}^{C2})f_0^{I_g}(X_{g_i}^{C3}) \nonumber -\end{align} - -\noindent where $f_0^{I_g}$ is the same as in equation \ref{eq:05}. Then the marginal distribution in -equation \ref{eq:07} becomes: - -\begin{equation} -\sum_{k=1}^5 p_k g_k^{I_g}(X_{g_i}^{C1,C2,C3}) \label{eq:11} -\end{equation} -\noindent where $\sum_{k=1}^5 p_k = 1$. 
Thus, the posterior probability of -isoform $g_i$ coming from pattern $K$ is readily obtained by: - -\begin{equation} -\frac{p_K g_K^{I_g}(X_{g_i}^{C1,C2,C3})}{\sum_{k=1}^5 p_k g_k^{I_g}(X_{g_i}^{C1,C2,C3})} \label{eq:12} -\end{equation} - -\subsection{Getting a false discovery rate (FDR) controlled list of genes or isoforms} -\label{sec:fdrlist} -To obtain a list of DE genes with false discovery rate (FDR) controlled -at $\alpha$ in an experiment comparing two biological conditions, the genes -with posterior probability of being DE (PPDE) greater than 1 - $\alpha$ should be used. -For example, the genes with PPDE>=0.95 make up the list of DE genes with target -FDR controlled at 5\%. With more than two biological conditions, there are multiple -DE patterns (see Section \ref{sec:multicondmodel}). To obtain a list of genes in a specific DE pattern with target -FDR $\alpha$, a user should -take the genes with posterior probability of being in that pattern greater -than 1 - $\alpha$. Isoform-based lists are obtained in the same way. -\newpage -\section{Quick Start} -\label{sec:quickstart} -Before analysis can proceed, the EBSeq package must be loaded into the working space: -<<>>= -library(EBSeq) -@ -\subsection{Gene level DE analysis (two conditions)} -\label{sec:startgenede} -\subsubsection{Required input} -\label{sec:startgenedeinput} -\begin{flushleft} -{\bf Data}: The object \verb+Data+ should be a $G-by-S$ matrix containing the expression values for each gene and each sample, -where $G$ is the number of genes and $S$ is the number of samples. These -values should exhibit raw counts, without normalization -across samples. Counts of this nature may be obtained from RSEM \cite{Li11b}, -Cufflinks \cite{Trapnell12}, or a similar approach. - -\vspace{5 mm} - -{\bf Conditions}: The object \verb+Conditions+ should be a Factor vector of length $S$ that indicates to which condition each sample belongs. 
-For example, if there are two conditions and three samples in each, -$S=6$ and \verb+Conditions+ may be given by - -\verb+as.factor(c("C1","C1","C1","C2","C2","C2"))+ - -\end{flushleft} -\noindent The object \verb+GeneMat+ is a simulated data matrix containing -1,000 rows of genes and 10 columns of samples. The genes are named -\verb+Gene_1, Gene_2 ...+ -<<>>= -data(GeneMat) -str(GeneMat) -@ - -\subsubsection{Library size factor} -\label{sec:startgenedesize} -As detailed in Section \ref{sec:model}, EBSeq requires the library size factor $l_s$ for each sample $s$. -Here, $l_s$ may be obtained via the function \verb+MedianNorm+, which reproduces the median normalization approach -in DESeq \citep{Anders10}. -<<>>= -Sizes=MedianNorm(GeneMat) -@ - -\noindent If quantile normalization is preferred, $l_s$ may be obtained via the function \verb+QuantileNorm+. -(e.g. \verb+QuantileNorm(GeneMat,.75)+ for Upper-Quantile Normalization in \cite{Bullard10}) - -\subsubsection{Running EBSeq on gene expression estimates} -\label{sec:startgenederun} -The function \verb+EBTest+ is used to detect DE genes. -For gene-level data, we don't need to specify the parameter -\verb+NgVector+ since there are no differences in $I_g$ structure among the different genes. -Here, we simulated the first five samples to be in condition 1 and the other five in condition 2, so define: - -\verb+Conditions=as.factor(rep(c("C1","C2"),each=5))+ - -\noindent \verb+sizeFactors+ is used to define the library size factor of each sample. -It could be obtained by summing up the total number of reads within each sample, -Median Normalization \citep{Anders10}, -scaling normalization \citep{Robinson10}, Upper-Quantile Normalization \cite{Bullard10}, -or some other such approach. -These in hand, we run the EM algorithm, setting the number -of iterations to five via \verb+maxround=5+ for demonstration purposes. -However, we note that in practice, -additional iterations are usually required. 
Convergence should always be -checked (see Section \ref{sec:detailedgenedeconverge} for details). -Please note this may take several minutes: -<<>>= -EBOut=EBTest(Data=GeneMat, -Conditions=as.factor(rep(c("C1","C2"),each=5)),sizeFactors=Sizes, maxround=5) -@ -\noindent The posterior probabilities of being DE are obtained as follows, where \verb+PP+ is a matrix containing the posterior probabilities of -being EE or DE for each of the 1,000 simulated genes: -<<>>= -PP=GetPPMat(EBOut) -str(PP) -head(PP) -@ -\noindent The matrix \verb+PP+ contains two columns \verb+PPEE+ and \verb+PPDE+, -corresponding to the posterior probabilities of being EE or DE for each gene. -\verb+PP+ may be used to form an FDR-controlled list of DE genes with a target FDR of 0.05 as follows: -<<>>= -DEfound=rownames(PP)[which(PP[,"PPDE"]>=.95)] -str(DEfound) -@ -\noindent EBSeq found 98 DE genes in total with target FDR 0.05. - -\subsection{Isoform level DE analysis (two conditions)} -\label{sec:startisode} -\subsubsection{Required inputs} -\label{sec:startisodeinput} - -\begin{flushleft} -{\bf Data}: The object \verb+Data+ should be a $I-by-S$ matrix containing the expression values for each isoform and each sample, -where $I$ is the number of isoforms and $S$ is the number of sample. As in the gene-level analysis, these values should exhibit raw data, without normalization -across samples. - -\vspace{5 mm} - -{\bf Conditions}: The object \verb+Conditions+ should be a vector with length $S$ to indicate the condition of each sample. - -\vspace{5 mm} - -{\bf IsoformNames}: The object \verb+IsoformNames+ should be a vector with length $I$ to indicate the isoform names. - -\vspace{5 mm} - -{\bf IsosGeneNames}: The object \verb+IsosGeneNames+ should be a vector with length $I$ to indicate the gene name of each isoform. -(in the same order as \verb+IsoformNames+.) -\end{flushleft} - -\noindent \verb+IsoList+ contains 1,200 simulated isoforms. 
-In which \verb+IsoList$IsoMat+ is a data matrix containing -1,200 rows of isoforms and 10 columns of samples; -\verb+IsoList$IsoNames+ contains the isoform names; -\verb+IsoList$IsosGeneNames+ contains the names of the genes the isoforms belong to. - -<<>>= -data(IsoList) -str(IsoList) -IsoMat=IsoList$IsoMat -str(IsoMat) -IsoNames=IsoList$IsoNames -IsosGeneNames=IsoList$IsosGeneNames -@ - -\subsubsection{Library size factor} -\label{sec:startisodesize} -Similar to the gene-level analysis presented above, we may obtain the isoform-level -library size factors via \verb+MedianNorm+: -<<>>= -IsoSizes=MedianNorm(IsoMat) -@ - -\subsubsection{The $I_g$ vector} -\label{sec:startisodeNg} - -While working on isoform level data, EBSeq fits different prior -parameters for different uncertainty groups (defined as $I_g$ groups). -The default setting to define the uncertainty groups consists of using -the number of isoforms the host gene contains ($N_g$) for each isoform. -The default settings will provide three uncertainty groups: - -$I_g=1$ group: Isoforms with $N_g=1$; - -$I_g=2$ group: Isoforms with $N_g=2$; - -$I_g=3$ group: Isoforms with $N_g \geq 3$. - -The $N_g$ and $I_g$ group assignment can be obtained using the function \verb+GetNg+. -The required inputs of \verb+GetNg+ are the isoform names (\verb+IsoformNames+) and -their corresponding gene names (\verb+IsosGeneNames+). -<<>>= -NgList=GetNg(IsoNames, IsosGeneNames) -IsoNgTrun=NgList$IsoformNgTrun -IsoNgTrun[c(1:3,201:203,601:603)] -@ -More details could be found in Section \ref{sec:detailedisode}. - -\subsubsection{Running EBSeq on isoform expression estimates} -\label{sec:startisoderun} -The \verb+EBTest+ function is also used to run EBSeq for two condition comparisons -on isoform-level data. -Below we use 5 iterations to demonstrate. However, as -in the gene level analysis, we advise that additional iterations will likely be -required in practice (see Section \ref{sec:detailedisodeconverge} for details). 
- -<<>>= -IsoEBOut=EBTest(Data=IsoMat, NgVector=IsoNgTrun, -Conditions=as.factor(rep(c("C1","C2"),each=5)),sizeFactors=IsoSizes, maxround=5) -IsoPP=GetPPMat(IsoEBOut) -str(IsoPP) -head(IsoPP) -IsoDE=rownames(IsoPP)[which(IsoPP[,"PPDE"]>=.95)] -str(IsoDE) -@ -\noindent We see that EBSeq found 105 DE isoforms at the target FDR of 0.05. - -\subsection{Gene level DE analysis (more than two conditions)} -\label{sec:startmulticond} -\noindent The object \verb+MultiGeneMat+ is a matrix containing -500 simulated genes with 6 samples: -the first two samples are from condition 1; the second and the third sample are -from condition 2; the last two samples are from condition 3. - -<<>>= -data(MultiGeneMat) -str(MultiGeneMat) -@ -In analysis where the data are spread over more than two conditions, -the set of possible patterns for each gene is more complicated -than simply EE and DE. As noted in Section \ref{sec:model}, when we have 3 conditions, there are 5 expression -patterns to consider. In the simulated data, we have 6 samples, 2 in each of 3 conditions. -The function \verb+GetPatterns+ allows the user to generate all possible patterns given the conditions. For example: - -<<>>= -Conditions=c("C1","C1","C2","C2","C3","C3") -PosParti=GetPatterns(Conditions) -PosParti -@ - -\noindent where the first row means all three conditions have the same latent mean expression level; -the second row means C1 and C2 have the same latent mean expression level but that of C3 is different; -and the last row corresponds to the case where the three conditions all have different latent mean expression levels. -The user may use all or only some of these possible patterns as an input to \verb+EBMultiTest+. -For example, if we were interested in Patterns 1, 2, 4 and 5 only, we'd define: -<<>>= -Parti=PosParti[-3,] -Parti -@ - -Moving on to the analysis, \verb+MedianNorm+ or one of its competitors should be used to determine the normalization factors. 
-Once this is done, the formal test is performed by \verb+EBMultiTest+. -<<>>= -MultiSize=MedianNorm(MultiGeneMat) -MultiOut=EBMultiTest(MultiGeneMat,NgVector=NULL,Conditions=Conditions, -AllParti=Parti, sizeFactors=MultiSize, maxround=5) -@ -\noindent The posterior probability of being in each pattern for every gene is obtained by using the -function \verb+GetMultiPP+: -<<>>= -MultiPP=GetMultiPP(MultiOut) -names(MultiPP) -MultiPP$PP[1:10,] -MultiPP$MAP[1:10] -MultiPP$Patterns -@ -\noindent where \verb+MultiPP$PP+ provides the posterior probability of being in each pattern for every gene. -\verb+MultiPP$MAP+ provides the most likely pattern of each gene based on the posterior -probabilities. \verb+MultiPP$Patterns+ provides the details of the patterns. - - - -\subsection{Isoform level DE analysis (more than two conditions)} -\label{sec:startisomulticond} -\noindent Similar to \verb+IsoList+, -the object \verb+IsoMultiList+ is an object containing the isoform expression estimates matrix, the isoform -names, and the gene names of the isoforms' host genes. -\verb+IsoMultiList$IsoMultiMat+ contains 300 simulated isoforms with 8 samples. -The first two samples are from condition 1; the second and the third sample are -from condition 2; the fifth and sixth sample are from condition 3; -the last two samples are from condition 4. -Similar to Section \ref{sec:startisode}, the function \verb+MedianNorm+ and \verb+GetNg+ could be used for normalization -and calculating the $N_g$'s. -<<>>= -data(IsoMultiList) -IsoMultiMat=IsoMultiList[[1]] -IsoNames.Multi=IsoMultiList$IsoNames -IsosGeneNames.Multi=IsoMultiList$IsosGeneNames -IsoMultiSize=MedianNorm(IsoMultiMat) -NgList.Multi=GetNg(IsoNames.Multi, IsosGeneNames.Multi) -IsoNgTrun.Multi=NgList.Multi$IsoformNgTrun -Conditions=c("C1","C1","C2","C2","C3","C3","C4","C4") -@ -Here we have 4 conditions, there are 15 expression -patterns to consider. 
-The function \verb+GetPatterns+ allows the user to generate all possible patterns given the conditions. For example: - -<<>>= -PosParti.4Cond=GetPatterns(Conditions) -PosParti.4Cond -@ - -\noindent -If we were interested in Patterns 1, 2, 3, 8 and 15 only, we'd define: -<<>>= -Parti.4Cond=PosParti.4Cond[c(1,2,3,8,15),] -Parti.4Cond -@ - -\noindent -Moving on to the analysis, \verb+EBMultiTest+ could be used to perform the test: -<<>>= -IsoMultiOut=EBMultiTest(IsoMultiMat, -NgVector=IsoNgTrun.Multi,Conditions=Conditions, -AllParti=Parti.4Cond, sizeFactors=IsoMultiSize, -maxround=5) -@ -\noindent The posterior probability of being in each pattern for every gene is obtained by using the -function \verb+GetMultiPP+: -<<>>= -IsoMultiPP=GetMultiPP(IsoMultiOut) -names(MultiPP) -IsoMultiPP$PP[1:10,] -IsoMultiPP$MAP[1:10] -IsoMultiPP$Patterns -@ -\noindent where \verb+MultiPP$PP+ provides the posterior probability of being in each pattern for every gene. -\verb+MultiPP$MAP+ provides the most likely pattern of each gene based on the posterior -probabilities. \verb+MultiPP$Patterns+ provides the details of the patterns. - - - -\newpage -\section{More detailed examples} -\label{sec:detailed} -\subsection{Gene level DE analysis (two conditions)} -\label{sec:detailedgenede} - -\subsubsection{Running EBSeq on simulated gene expression estimates} -\label{sec:detailedgenederun} -EBSeq is applied as described in Section \ref{sec:startgenederun}. -<>= -data(GeneMat) -Sizes=MedianNorm(GeneMat) -EBOut=EBTest(Data=GeneMat, -Conditions=as.factor(rep(c("C1","C2"),each=5)),sizeFactors=Sizes, maxround=5) -PP=GetPPMat(EBOut) -@ -<<>>= -str(PP) -head(PP) -DEfound=rownames(PP)[which(PP[,"PPDE"]>=.95)] -str(DEfound) -@ -\noindent EBSeq found 98 DE genes at a target FDR of 0.05.\\ - -\subsubsection{Calculating FC} -\label{sec:detailedgenedefc} -The function \verb+PostFC+ may be used to calculate the Fold Change (FC) -of the raw data as well as the posterior FC of the normalized data. 
-\begin{figure}[h!] -\centering -<>= -GeneFC=PostFC(EBOut) -str(GeneFC) -PlotPostVsRawFC(EBOut,GeneFC) -@ -\caption{ -FC vs. Posterior FC for 1,000 gene expression estimates} -\label{fig:GeneFC} -\end{figure} -Figure \ref{fig:GeneFC} shows the FC vs. Posterior FC on 1,000 gene expression estimates. -The genes are ranked by their cross-condition mean (adjusted by the normalization factors). -The posterior FC tends to shrink genes with low expressions (small rank); in this case the differences -are minor. - - -\newpage - -\subsubsection{Checking convergence} -\label{sec:detailedgenedeconverge} -As detailed in Section \ref{sec:model}, we assume the prior distribution of $q_g^C$ is -$Beta(\alpha,\beta)$. The EM algorithm is used to estimate the -hyper-parameters $\alpha,\beta$ and the mixture parameter $p$. -The optimized parameters at each iteration may be obtained as follows (recall -we are using 5 iterations for demonstration purposes): -<<>>= -EBOut$Alpha -EBOut$Beta -EBOut$P -@ -In this case the differences between the 4th and 5th iterations are always less -than 0.01. - - -\subsubsection{Checking the model fit and other diagnostics} -\label{sec:detailedgenedeplot} -As noted in Leng {\it et al.}, 2013 \cite{Leng13}, EBSeq relies on parametric assumptions that should -be checked following each analysis. -The \verb+QQP+ function may be used to assess prior assumptions. -In practice, \verb+QQP+ generates the Q-Q plot of the empirical $q$'s -vs. the simulated $q$'s from the Beta prior distribution with -estimated hyper-parameters. Figure \ref{fig:GeneQQ} shows that the -data points lie on the $y=x$ line for both conditions, which indicates -that the Beta prior is appropriate. - -\begin{figure}[h!] 
-\centering -<>= -par(mfrow=c(1,2)) -QQP(EBOut) -@ -\caption{QQ-plots for checking the assumption of a Beta prior (upper panels) as well as the -model fit using data from condition 1 and condition 2 (lower panels)} -\label{fig:GeneQQ} -\end{figure} - -\newpage -\noindent -Likewise, the \verb+DenNHist+ function may be used to check the density plot of empirical $q$'s vs the simulated -$q$'s from the fitted Beta prior distribution. -Figure \ref{fig:GeneDenNHist} also shows our estimated distribution fits the -data very well. - -\begin{figure}[h!] -\centering -<>= -par(mfrow=c(1,2)) -DenNHist(EBOut) -@ -\caption{Density plots for checking the model fit using data from condition 1 and condition 2} -\label{fig:GeneDenNHist} -\end{figure} - -\newpage -\subsection{Isoform level DE analysis (two conditions)} -\label{sec:detailedisode} -\subsubsection{The $I_g$ vector} -\label{sec:detailedisodeNg} -Since EBSeq fits rely on $I_g$, -we need to obtain the $I_g$ for each isoform. This can be done using the -function \verb+GetNg+. -The required inputs of \verb+GetNg+ are the isoform names (\verb+IsoformNames+) and -their corresponding gene names (\verb+IsosGeneNames+), described above. -In the simulated data, we assume that the isoforms in the $I_g=1$ group belong to genes \verb+Gene_1, ... , Gene_200+; -The isoforms in the $I_g=2$ group belong to genes -\verb+Gene_201, ..., Gene_400+; and isoforms in the $I_g=3$ group -belong to \verb+Gene_401, ..., Gene_600+. - -<>= -data(IsoList) -IsoMat=IsoList$IsoMat -IsoNames=IsoList$IsoNames -IsosGeneNames=IsoList$IsosGeneNames -NgList=GetNg(IsoNames, IsosGeneNames, TrunThre=3) -@ -<<>>= -names(NgList) -IsoNgTrun=NgList$IsoformNgTrun -IsoNgTrun[c(1:3,201:203,601:603)] -@ - -The output of \verb+GetNg+ contains 4 vectors. \verb+GeneNg+ (\verb+IsoformNg+) provides -the number of isoforms $N_g$ within each gene (within each isoform's host gene). -\verb+GeneNgTrun+ (\verb+IsoformNgTrun+) provides the $I_g$ group assignments. 
-The default number of groups is 3, which means the isoforms -with $N_g$ greater than 3 will be assigned to $I_g=3$ group. -We use 3 in the case studies -since the number of isoforms with $N_g$ larger than 3 is relatively small and -the small sample size may induce poor parameter fitting if we treat them -as separate groups. -In practice, if there is evidence that the $N_g=4,5,6...$ groups should be -treated as separate groups, a user can change \verb+TrunThre+ to define -a different truncation threshold. - -\subsubsection{Using mappability ambiguity clusters instead of -the $I_g$ vector when the gene-isoform relationship is unknown} -\label{sec:detailedisodeNoNg} -When working with a de-novo assembled transcriptome, in which case the gene-isoform -relationship is unknown, -a user can use read mapping ambiguity cluster information instead of Ng, -as provided by RSEM \cite{Li11b} in the -output file \verb+output_name.ngvec+. The file contains a vector with the same -length as the total number of transcripts. -Each transcript has been assigned to one of 3 levels -(1, 2, or 3) to indicate the mapping uncertainty level of that transcript. -The mapping ambiguity clusters are partitioned via a k-means algorithm on the unmapability -scores that are provided by RSEM. A user can read in the mapping ambiguity cluster information -using: - -<>= -IsoNgTrun = scan(file="output_name.ngvec", what=0, sep="\n") -@\\ -Where \verb+"output_name.ngvec"+ is the output file obtained from RSEM function rsem-generate-ngvector. -More details on using the RSEM-EBSeq pipeline -on de novo assembled transcriptomes can be found -at \url{http://deweylab.biostat.wisc.edu/rsem/README.html#de}. - -Other unmappability scores and other cluster methods (e.g. Gaussian Mixed Model) -could also be used to form the uncertainty clusters. 
- -\subsubsection{Running EBSeq on simulated isoform expression estimates} -\label{sec:detailedisoderun} -EBSeq can be applied as described in Section \ref{sec:startisoderun}. -<>= -IsoSizes=MedianNorm(IsoMat) -IsoEBOut=EBTest(Data=IsoMat, NgVector=IsoNgTrun, -Conditions=as.factor(rep(c("C1","C2"),each=5)),sizeFactors=IsoSizes, maxround=5) -IsoPP=GetPPMat(IsoEBOut) -IsoDE=rownames(IsoPP)[which(IsoPP[,"PPDE"]>=.95)] -@ -<<>>= -str(IsoDE) -@ -\noindent We see that EBSeq found 105 DE isoforms at a target FDR of 0.05. -The function \verb+PostFC+ could also be used here to calculate the Fold Change (FC) -as well as the posterior FC on the normalization factor adjusted data. -<<>>= -IsoFC=PostFC(IsoEBOut) -str(IsoFC) -@ - - -\subsubsection{Checking convergence} -\label{sec:detailedisodeconverge} -For isoform level data, we assume the prior distribution of $q_{gi}^C$ is -$Beta(\alpha,\beta^{I_g})$. -As in Section \ref{sec:detailedgenedeconverge}, the optimized parameters at each iteration -may be obtained as follows (recall -we are using 5 iterations for demonstration purposes): -<<>>= -IsoEBOut$Alpha -IsoEBOut$Beta -IsoEBOut$P -@ -Here we have 3 $\beta$'s in each iteration corresponding to -$\beta^{I_g=1},\beta^{I_g=2},\beta^{I_g=3}$. -We see that parameters are changing less than $10^{-2}$ or $10^{-3}$. -In practice, we require changes less than $10^{-3}$ to declare convergence. - -\subsubsection{Checking the model fit and other diagnostics} -\label{sec:detailedisodeplot} -In Leng {\it et al.}, 2013\citep{Leng13}, we showed the mean-variance differences across different -isoform groups on multiple data sets. -In practice, if it is of interest to check differences among -isoform groups defined by truncated $I_g$ (such as those shown here -in Figure 1), the function \verb+PolyFitPlot+ may be used. 
-The following code generates the three -panels shown in Figure \ref{fig:IsoSimuNgEach} -(if condition 2 is of interest, a user could change each \verb+C1+ to \verb+C2+.): -\begin{figure}[h!] -\centering -<>= -par(mfrow=c(2,2)) -PolyFitValue=vector("list",3) -for(i in 1:3) - PolyFitValue[[i]]=PolyFitPlot(IsoEBOut$C1Mean[[i]], - IsoEBOut$C1EstVar[[i]],5) -@ -\caption{ The mean-variance fitting plot for each Ng group} -\label{fig:IsoSimuNgEach} -\end{figure} - -\newpage -Superimposing all $I_g$ groups using the code below will generate the figure (shown -here in Figure \ref{fig:IsoSimuNg}), which is similar in structure to Figure 1: - - -\begin{figure}[h!] -\centering -<>= -PolyAll=PolyFitPlot(unlist(IsoEBOut$C1Mean), unlist(IsoEBOut$C1EstVar),5) -lines(log10(IsoEBOut$C1Mean[[1]][PolyFitValue[[1]]$sort]), -PolyFitValue[[1]]$fit[PolyFitValue[[1]]$sort],col="yellow",lwd=2) -lines(log10(IsoEBOut$C1Mean[[2]][PolyFitValue[[2]]$sort]), -PolyFitValue[[2]]$fit[PolyFitValue[[2]]$sort],col="pink",lwd=2) -lines(log10(IsoEBOut$C1Mean[[3]][PolyFitValue[[3]]$sort]), -PolyFitValue[[3]]$fit[PolyFitValue[[3]]$sort],col="green",lwd=2) -legend("topleft",c("All Isoforms","Ng = 1","Ng = 2","Ng = 3"), -col=c("red","yellow","pink","green"),lty=1,lwd=3,box.lwd=2) -@ -\caption{The mean-variance plot for each Ng group} -\label{fig:IsoSimuNg} -\end{figure} - - -\newpage -\noindent To generate a QQ-plot of the fitted Beta prior distribution -and the $\hat{q}^C$'s within condition, a user may -use the following code to generate 6 panels (as shown in Figure \ref{fig:IsoQQ}). -\begin{figure}[h!] 
-\centering -<>= -par(mfrow=c(2,3)) -QQP(IsoEBOut) -@ -\caption{ QQ-plots of the fitted prior distributions within each condition and each Ig group} -\label{fig:IsoQQ} -\end{figure} - -\newpage -\noindent And in order to produce the plot of the fitted Beta prior densities -and the histograms of $\hat{q}^C$'s within each condition, -the following may be used (it generates Figure \ref{fig:IsoDenNHist}): -\begin{figure}[h] -\centering -<>= -par(mfrow=c(2,3)) -DenNHist(IsoEBOut) -@ -\caption{ Prior distribution fit within each condition and each Ig group. -(Note only a small set of isoforms are considered here for demonstration. -Better fitting should be expected while using full set of isoforms.)} -\label{fig:IsoDenNHist} -\end{figure} - -\clearpage -\subsection{Gene level DE analysis (more than two conditions)} -\label{sec:detailedmulticond} -As described in Section \ref{sec:startmulticond}, -the function \verb+GetPatterns+ allows the user to generate all possible patterns given the conditions. -To visualize the patterns, the function \verb+PlotPattern+ may be used. - -\begin{figure}[h!] -\centering -<>= -Conditions=c("C1","C1","C2","C2","C3","C3") -PosParti=GetPatterns(Conditions) -PosParti -PlotPattern(PosParti) -@ -\caption{ All possible patterns} -\label{fig:Patterns} -\end{figure} -\newpage -\noindent If we were interested in Patterns 1, 2, 4 and 5 only, we'd define: -<<>>= -Parti=PosParti[-3,] -Parti -@ - -\noindent -Moving on to the analysis, \verb+MedianNorm+ or one of its competitors should be used to determine the normalization factors. -Once this is done, the formal test is performed by \verb+EBMultiTest+. 
-<>= -data(MultiGeneMat) -MultiSize=MedianNorm(MultiGeneMat) -MultiOut=EBMultiTest(MultiGeneMat, -NgVector=NULL,Conditions=Conditions, -AllParti=Parti, sizeFactors=MultiSize, -maxround=5) -@ -\noindent The posterior probability of being in each pattern for every gene is obtained using the -function \verb+GetMultiPP+: -<<>>= -MultiPP=GetMultiPP(MultiOut) -names(MultiPP) -MultiPP$PP[1:10,] -MultiPP$MAP[1:10] -MultiPP$Patterns -@ -\noindent where \verb+MultiPP$PP+ provides the posterior probability of being in each pattern for every gene. -\verb+MultiPP$MAP+ provides the most likely pattern of each gene based on the posterior -probabilities. \verb+MultiPP$Patterns+ provides the details of the patterns. The FC and posterior FC for multiple condition data can -be obtained by the function \verb+GetMultiFC+: - -<<>>= -MultiFC=GetMultiFC(MultiOut) -str(MultiFC) -@ - -\noindent To generate a QQ-plot of the fitted Beta prior distribution -and the $\hat{q}^C$'s within condition, a user could also use function -\verb+DenNHist+ and \verb+QQP+. - -\begin{figure}[h!] -\centering -<>= -par(mfrow=c(2,2)) -QQP(MultiOut) -@ -\caption{ QQ-plots of the fitted prior distributions within each condition and each Ig group} -\label{fig:GeneMultiQQ} -\end{figure} - -\begin{figure}[h] -\centering -<>= -par(mfrow=c(2,2)) -DenNHist(MultiOut) -@ -\caption{ Prior distributions fit within each condition. -(Note only a small set of genes are considered here for demonstration. -Better fitting should be expected while using full set of genes.)} -\label{fig:GeneMultiDenNHist} -\end{figure} -\newpage -\clearpage -\newpage -\subsection{Isoform level DE analysis (more than two conditions)} -\label{sec:detailedisomulticond} -Similar to Section \ref{sec:startmulticond}, -the function \verb+GetPatterns+ allows a user to generate all possible patterns given the conditions. -To visualize the patterns, the function \verb+PlotPattern+ may be used. 
-<<>>= -Conditions=c("C1","C1","C2","C2","C3","C3","C4","C4") -PosParti.4Cond=GetPatterns(Conditions) -PosParti.4Cond -@ -\newpage -\begin{figure}[h!] -\centering -<>= -PlotPattern(PosParti.4Cond) -Parti.4Cond=PosParti.4Cond[c(1,2,3,8,15),] -Parti.4Cond -@ -\caption{All possible patterns for 4 conditions} -\label{fig:Patterns4Cond} -\end{figure} - -\newpage -<>= -data(IsoMultiList) -IsoMultiMat=IsoMultiList[[1]] -IsoNames.Multi=IsoMultiList$IsoNames -IsosGeneNames.Multi=IsoMultiList$IsosGeneNames -IsoMultiSize=MedianNorm(IsoMultiMat) -NgList.Multi=GetNg(IsoNames.Multi, IsosGeneNames.Multi) -IsoNgTrun.Multi=NgList.Multi$IsoformNgTrun -IsoMultiOut=EBMultiTest(IsoMultiMat,NgVector=IsoNgTrun.Multi,Conditions=Conditions, -AllParti=Parti.4Cond, -sizeFactors=IsoMultiSize, maxround=5) -IsoMultiPP=GetMultiPP(IsoMultiOut) -@ -<<>>= -names(MultiPP) -IsoMultiPP$PP[1:10,] -IsoMultiPP$MAP[1:10] -IsoMultiPP$Patterns -IsoMultiFC=GetMultiFC(IsoMultiOut) -@ -The FC and posterior FC for multiple condition data can be obtained by the function \verb+GetMultiFC+: - - -\noindent To generate a QQ-plot of the fitted Beta prior distribution -and the $\hat{q}^C$'s within condition, a user could also use the functions -\verb+DenNHist+ and \verb+QQP+. -\newpage -\begin{figure}[h!] -\centering -<>= -par(mfrow=c(3,4)) -QQP(IsoMultiOut) - -@ -\caption{ QQ-plots of the fitted prior distributions within each condition and Ig group. -(Note only a small set of isoforms are considered here for demonstration. -Better fitting should be expected while using full set of isoforms.)} -\label{fig:IsoMultiQQ} -\end{figure} - -\begin{figure}[h] -\centering -<>= -par(mfrow=c(3,4)) -DenNHist(IsoMultiOut) -@ -\caption{ Prior distributions fit within each condition and Ig group. -(Note only a small set of isoforms are considered here for demonstration. 
-Better fitting should be expected while using full set of isoforms.)} -\label{fig:IsoMultiDenNHist} -\end{figure} -\clearpage -\newpage - - -\newpage -\subsection{Working without replicates} -When replicates are not available, it is difficult to estimate the transcript specific variance. -In this case, EBSeq estimates the variance by pooling similar genes together. -Specifically, we take genes with FC in the 25\% - 75\% quantile of all FC's as -candidate genes. By defining \verb+NumBin = 1000+ (default in \verb+EBTest+), EBSeq -will group genes with similar means into 1,000 bins. -For each candidate gene, we use the across-condition variance estimate as its variance estimate. -For each bin, the bin-wise variance estimation is taken to be the median of the -across-condition variance estimates of the candidate genes within that bin. -For each non-candidate gene, we use the bin-wise variance estimate of the host bin (the bin containing this gene) -as its variance estimate. -This approach works well when there are no more than 50\% DE genes in the data set. - -\subsubsection{Gene counts with two conditions} -\label{sec:norepgenede} - -To generate a data set with no replicates, we take the first sample of each condition. -For example, using the data from Section \ref{sec:detailedgenede}, we take sample 1 from condition 1 and -sample 6 from condition 2. Functions \verb+MedianNorm+, \verb+GetPPMat+ and -\verb+PostFC+ may be used on data without replicates. 
-<<>>= -data(GeneMat) -GeneMat.norep=GeneMat[,c(1,6)] -Sizes.norep=MedianNorm(GeneMat.norep) -EBOut.norep=EBTest(Data=GeneMat.norep, -Conditions=as.factor(rep(c("C1","C2"))), -sizeFactors=Sizes.norep, maxround=5) -PP.norep=GetPPMat(EBOut.norep) -DEfound.norep=rownames(PP.norep)[which(PP.norep[,"PPDE"]>=.95)] -GeneFC.norep=PostFC(EBOut.norep) -@ - -\subsubsection{Isoform counts with two conditions} -\label{norepisode} -To generate an isoform level data set with no replicates, we -also take sample 1 and sample 6 in the data we used in Section -\ref{sec:detailedisode}. -Example codes are shown below. - -<<>>= -data(IsoList) -IsoMat=IsoList$IsoMat -IsoNames=IsoList$IsoNames -IsosGeneNames=IsoList$IsosGeneNames -NgList=GetNg(IsoNames, IsosGeneNames) -IsoNgTrun=NgList$IsoformNgTrun -IsoMat.norep=IsoMat[,c(1,6)] -IsoSizes.norep=MedianNorm(IsoMat.norep) -IsoEBOut.norep=EBTest(Data=IsoMat.norep, NgVector=IsoNgTrun, -Conditions=as.factor(c("C1","C2")), -sizeFactors=IsoSizes.norep, maxround=5) -IsoPP.norep=GetPPMat(IsoEBOut.norep) -IsoDE.norep=rownames(IsoPP.norep)[which(IsoPP.norep[,"PPDE"]>=.95)] -IsoFC.norep=PostFC(IsoEBOut.norep) -@ - -\subsubsection{Gene counts with more than two conditions} -\label{norepisode} -To generate a data set with multiple conditions and no replicates, -we take the first sample from each condition (sample 1, 3 and 5) in the data we used -in Section \ref{sec:detailedmulticond}. -Example codes are shown below. 
-<<>>= -data(MultiGeneMat) -MultiGeneMat.norep=MultiGeneMat[,c(1,3,5)] -Conditions=c("C1","C2","C3") -PosParti=GetPatterns(Conditions) -Parti=PosParti[-3,] -MultiSize.norep=MedianNorm(MultiGeneMat.norep) -MultiOut.norep=EBMultiTest(MultiGeneMat.norep, -NgVector=NULL,Conditions=Conditions, -AllParti=Parti, sizeFactors=MultiSize.norep, -maxround=5) -MultiPP.norep=GetMultiPP(MultiOut.norep) -MultiFC.norep=GetMultiFC(MultiOut.norep) -@ - -\subsubsection{Isoform counts with more than two conditions} -\label{sec:norepmulticond} -To generate an isoform level data set with multiple conditions and no replicates, -we take the first sample from each condition (sample 1, 3, 5 and 7) in the data we used -in Section \ref{sec:detailedisomulticond}. -Example codes are shown below. - - - -<<>>= -data(IsoMultiList) -IsoMultiMat=IsoMultiList[[1]] -IsoNames.Multi=IsoMultiList$IsoNames -IsosGeneNames.Multi=IsoMultiList$IsosGeneNames -IsoMultiMat.norep=IsoMultiMat[,c(1,3,5,7)] -IsoMultiSize.norep=MedianNorm(IsoMultiMat.norep) -NgList.Multi=GetNg(IsoNames.Multi, IsosGeneNames.Multi) -IsoNgTrun.Multi=NgList.Multi$IsoformNgTrun -Conditions=c("C1","C2","C3","C4") -PosParti.4Cond=GetPatterns(Conditions) -PosParti.4Cond -Parti.4Cond=PosParti.4Cond[c(1,2,3,8,15),] -Parti.4Cond -IsoMultiOut.norep=EBMultiTest(IsoMultiMat.norep, -NgVector=IsoNgTrun.Multi,Conditions=Conditions, -AllParti=Parti.4Cond, sizeFactors=IsoMultiSize.norep, -maxround=5) -IsoMultiPP.norep=GetMultiPP(IsoMultiOut.norep) -IsoMultiFC.norep=GetMultiFC(IsoMultiOut.norep) -@ - -\section{EBSeq pipelines and extensions} -\subsection{RSEM-EBSeq pipeline: from raw reads to differential expression analysis results} -EBSeq is coupled with RSEM \cite{Li11b} as an RSEM-EBSeq pipeline which provides -quantification and DE testing on both gene and isoform levels. 
- -For more details, see -\url{http://deweylab.biostat.wisc.edu/rsem/README.html#de} - -\subsection{EBSeq interface: A user-friendly graphical interface for differetial expression analysis} -EBSeq interface provides a graphical interface implementation for users who are not familiar with the R -programming language. It takes .xls, .xlsx and .csv files as input. -Additional packages need be downloaded; they may be found at -\url{http://www.biostat.wisc.edu/~ningleng/EBSeq_Package/EBSeq_Interface/} - -\subsection{EBSeq Galaxy tool shed} -EBSeq tool shed contains EBSeq wrappers for a local Galaxy implementation. -For more details, see -\url{http://www.biostat.wisc.edu/~ningleng/EBSeq_Package/EBSeq_Galaxy_toolshed/} - -\section{Acknowledgment} -We would like to thank Haolin Xu for checking the package and -proofreading the vignette. - -\section{News} -2014-1-30: In EBSeq 1.3.3, the default setting of EBTest function will remove -low expressed genes (genes whose 75th quantile of normalized counts is less -than 10) before identifying DE genes. -These two thresholds can be changed in EBTest function. -We found that low expressed genes are more easily to be affected by noises. -Removing these genes prior to downstream analyses can improve the -model fitting and reduce impacts of noisy genes (e.g. genes with outliers). - -2014-5-22: In EBSeq 1.5.2, numerical approximations are implemented to deal with -underflow. The underflow is likely due to large number of samples. 
-\pagebreak -\bibliographystyle{plain} -\bibliography{lengetal} - -\end{document} - diff --git a/.svn/pristine/15/15494b558fd6408f90df6ed5893c0294ee8b94bd.svn-base b/.svn/pristine/15/15494b558fd6408f90df6ed5893c0294ee8b94bd.svn-base deleted file mode 100644 index 1f160a0..0000000 --- a/.svn/pristine/15/15494b558fd6408f90df6ed5893c0294ee8b94bd.svn-base +++ /dev/null @@ -1,10 +0,0 @@ -f1 <- -function(Input1, Input2, AlphaIn, BetaIn, EmpiricalRSP1,EmpiricalRSP2,NumOfGroup, log){ - F0.1=f0(Input1, AlphaIn, BetaIn, EmpiricalRSP1, NumOfGroup, log=log) - F0.2=f0(Input2, AlphaIn, BetaIn, EmpiricalRSP2, NumOfGroup, log=log) - - if (log==F) Result=F0.1*F0.2 - if (log==T) Result=F0.1+F0.2 - Result -} - diff --git a/.svn/pristine/1a/1ad6b6f0f8f5f4f883f8fb0dac75c99baab89340.svn-base b/.svn/pristine/1a/1ad6b6f0f8f5f4f883f8fb0dac75c99baab89340.svn-base deleted file mode 100644 index 3a0b5b2..0000000 --- a/.svn/pristine/1a/1ad6b6f0f8f5f4f883f8fb0dac75c99baab89340.svn-base +++ /dev/null @@ -1,12 +0,0 @@ -GetPatterns<-function(Conditions){ - if(!is.factor(Conditions))Conditions=as.factor(Conditions) - NumCond=nlevels(Conditions) - if(NumCond<3)stop("Less than 3 conditions!") - CondLevels=levels(Conditions) - AllPartiList=sapply(1:NumCond,function(i)nkpartitions(NumCond,i)) - AllParti=do.call(rbind,AllPartiList) - colnames(AllParti)=CondLevels - rownames(AllParti)=paste("Pattern",1:nrow(AllParti),sep="") - AllParti - -} diff --git a/.svn/pristine/1b/1b43170d3ae12cb3f7bff2ecceecc0e9eef2d0aa.svn-base b/.svn/pristine/1b/1b43170d3ae12cb3f7bff2ecceecc0e9eef2d0aa.svn-base deleted file mode 100644 index 91d9e8b..0000000 Binary files a/.svn/pristine/1b/1b43170d3ae12cb3f7bff2ecceecc0e9eef2d0aa.svn-base and /dev/null differ diff --git a/.svn/pristine/1c/1cfe9e547ce94d5943d38162123614c9a8184649.svn-base b/.svn/pristine/1c/1cfe9e547ce94d5943d38162123614c9a8184649.svn-base deleted file mode 100644 index 59fd29c..0000000 --- a/.svn/pristine/1c/1cfe9e547ce94d5943d38162123614c9a8184649.svn-base +++ 
/dev/null @@ -1,44 +0,0 @@ -PolyFitPlot <- -function(X , Y , nterms , xname="Estimated Mean", yname="Estimated Var", pdfname="", xlim=c(-1,5), ylim=c(-1,7), ChangeXY=F,col="red"){ - - b=rep(NA,nterms) - logX=matrix(rep(X, nterms),ncol=nterms, byrow=T) - for (i in 1:nterms) - logX[,i]=(log10(X))^i - colnames(logX)=paste("logmu^",c(1:nterms)) - rownames(logX)=names(X) - NotUse=c(names(X)[X==0],names(Y)[Y==0],names(X)[rowMeans(logX)==-Inf],names(X)[rowMeans(logX)==Inf]) - Use=names(X[!names(X)%in%NotUse]) - Lm=lm(log10(Y[Use])~logX[Use,1:nterms]) - b=summary(Lm)$coefficients[2:(nterms+1),1] - d=summary(Lm)$coefficients[1,1] - bvec=matrix(rep(b,length(X)),ncol=nterms,byrow=T) - fit=rowSums(logX*bvec)+d - main2=NULL - if (ChangeXY==T){ - X.plot=log10(Y) - Y.plot=log10(X) - fit.X.plot=fit - fit.Y.plot=log10(X) - } - else{ - X.plot=log10(X) - Y.plot=log10(Y) - fit.X.plot=log10(X) - fit.Y.plot=fit - } - - for (i in 1:nterms) - main2=paste(main2,round(b[i],2),"*log(",xname,")^",i,"+") - main=pdfname - - smoothScatter(X.plot, Y.plot ,main=main,xlim=xlim,ylim=ylim,xlab=xname,ylab=yname,axes=F) - axis(1,at=seq(xlim[1],xlim[2],by=1), 10^seq(xlim[1],xlim[2],by=1)) - axis(2,at=seq(ylim[1],ylim[2],by=2), 10^seq(ylim[1],ylim[2],by=2)) - Sortit=order(fit.X.plot) - lines(fit.X.plot[Sortit],fit.Y.plot[Sortit],col=col,lwd=3) - output=list(b=b,d=d,lm=Lm,fit=fit,sort=Sortit) - names(output$b)=paste(xname,"^",c(1:length(output$b))) - output -} - diff --git a/.svn/pristine/1d/1d26788328ff2bbcf677f3e459310e98c0da8e0f.svn-base b/.svn/pristine/1d/1d26788328ff2bbcf677f3e459310e98c0da8e0f.svn-base deleted file mode 100644 index f750ee7..0000000 --- a/.svn/pristine/1d/1d26788328ff2bbcf677f3e459310e98c0da8e0f.svn-base +++ /dev/null @@ -1,76 +0,0 @@ -GetDEResults<-function(EBPrelim, FDR=0.05, Method="robust", - FDRMethod="hard", Threshold_FC=0.7, - Threshold_FCRatio=0.3, SmallNum=0.01) -{ - if(!"PPDE"%in%names(EBPrelim))stop("The input doesn't seem like an output from EBTest") - - 
################# - Conditions = EBPrelim$Conditions - Levels = levels(as.factor(Conditions)) - PPcut=FDR - # normalized data - GeneMat=EBPrelim$DataNorm - - - ###Get DEfound by FDRMethod type - PP=GetPPMat(EBPrelim) - if(FDRMethod=="hard") - {DEfound=rownames(PP)[which(PP[,"PPDE"]>=(1-PPcut))]} - else{SoftThre=crit_fun(PP[,"PPEE"],PPcut) - DEfound=rownames(PP)[which(PP[,"PPDE"]>=SoftThre)]} - - # classic - if(Method=="classic"){ - Gene_status=rep("EE",dim(GeneMat)[1]) - names(Gene_status)=rownames(GeneMat) - Gene_status[DEfound]="DE" - NoTest_genes=rownames(GeneMat)[!(rownames(GeneMat)%in%rownames(PP))] - Gene_status[NoTest_genes]="Filtered: Low Expression" - - PPMatWith0=EBPrelim$PPMatWith0 - PPMatWith0[NoTest_genes,]=c(NA,NA) - - return(list(DEfound=DEfound,PPMat=PPMatWith0,Status=Gene_status)) - } - else{ - ###Post_Foldchange - PostFoldChange=PostFC(EBPrelim) - PPFC=PostFoldChange$PostFC - - OldPPFC=PPFC[DEfound] - OldPPFC[which(OldPPFC>1)]=1/OldPPFC[which(OldPPFC>1)] - - FilterFC=names(OldPPFC)[which(OldPPFC>Threshold_FC)] - - ###New Fold Change - NewFC1=apply(matrix(GeneMat[DEfound,which(Conditions==Levels[[1]])]+SmallNum, - nrow=length(DEfound)),1,median) - NewFC2=apply(matrix(GeneMat[DEfound,which(Conditions==Levels[[2]])]+SmallNum, - nrow=length(DEfound)),1,median) - NewFC=NewFC1/NewFC2 - NewFC[which(NewFC>1)]=1/NewFC[which(NewFC>1)] - - ###FC Ratio - FCRatio=NewFC/OldPPFC - FCRatio[which(OldPPFCQtrmCut) - if(length(AllZeroNames)>0 & Print==T) - cat(paste0("Removing transcripts with ",Qtrm*100, - " th quantile < = ",QtrmCut," \n", - length(NotAllZeroNames)," transcripts will be tested \n")) - if(length(NotAllZeroNames)==0)stop("0 transcript passed") - Data=Data[NotAllZeroNames,] - - if(!is.null(NgVector))NgVector=NgVector[NotAllZeroNames] - if(is.null(NgVector))NgVector=rep(1,nrow(Data)) - - - #ReNameThem - IsoNamesIn=rownames(Data) - Names=paste("I",c(1:dim(Data)[1]),sep="") - names(IsoNamesIn)=Names - rownames(Data)=paste("I",c(1:dim(Data)[1]),sep="") - 
names(NgVector)=paste("I",c(1:dim(Data)[1]),sep="") - - # If PossibleCond==NULL, use all combinations - NumCond=nlevels(Conditions) - CondLevels=levels(Conditions) - #library(blockmodeling) - if(is.null(AllParti)){ - AllPartiList=sapply(1:NumCond,function(i)nkpartitions(NumCond,i)) - AllParti=do.call(rbind,AllPartiList) - colnames(AllParti)=CondLevels - rownames(AllParti)=paste("Pattern",1:nrow(AllParti),sep="") - } - if(length(sizeFactors)==length(Data)){ - rownames(sizeFactors)=rownames(Data) - colnames(sizeFactors)=Conditions - } - - - NoneZeroLength=nlevels(as.factor(NgVector)) - NameList=sapply(1:NoneZeroLength,function(i)names(NgVector)[NgVector==i],simplify=F) - DataList=sapply(1:NoneZeroLength , function(i) Data[NameList[[i]],],simplify=F) - names(DataList)=names(NameList) - - NumEachGroup=sapply(1:NoneZeroLength , function(i)dim(DataList)[i]) - # Unlist - DataList.unlist=do.call(rbind, DataList) - - # Divide by SampleSize factor - - if(length(sizeFactors)==ncol(Data)) - DataList.unlist.dvd=t(t( DataList.unlist)/sizeFactors) - - if(length(sizeFactors)==length(Data)) - DataList.unlist.dvd=DataList.unlist/sizeFactors - - # Pool or Not - if(Pool==T){ - DataforPoolSP.dvd=MeanforPoolSP.dvd=vector("list",NumCond) - for(lv in 1:NumCond){ - DataforPoolSP.dvd[[lv]]=matrix(DataList.unlist.dvd[,Conditions==levels(Conditions)[lv]],nrow=dim(DataList.unlist)[1]) - MeanforPoolSP.dvd[[lv]]=rowMeans(DataforPoolSP.dvd[[lv]]) - } - MeanforPool.dvd=rowMeans(DataList.unlist.dvd) - NumInBin=floor(dim(DataList.unlist)[1]/NumBin) - StartSeq=c(0:(NumBin-1))*NumInBin+1 - EndSeq=c(StartSeq[-1]-1,dim(DataList.unlist)[1]) - MeanforPool.dvd.Sort=sort(MeanforPool.dvd,decreasing=T) - MeanforPool.dvd.Order=order(MeanforPool.dvd,decreasing=T) - PoolGroups=sapply(1:NumBin,function(i)(names(MeanforPool.dvd.Sort)[StartSeq[i]:EndSeq[i]]),simplify=F) - #FCforPool=MeanforPoolSP.dvd1/MeanforPoolSP.dvd2 - # Use GeoMean of every two-group partition - Parti2=nkpartitions(NumCond,2) - 
FCForPoolList=sapply(1:nrow(Parti2),function(i)rowMeans(do.call(cbind, - MeanforPoolSP.dvd[Parti2[i,]==1]))/ - rowMeans(do.call(cbind,MeanforPoolSP.dvd[Parti2[i,]==2])), - simplify=F) - FCForPoolMat=do.call(cbind,FCForPoolList) - FCforPool=apply(FCForPoolMat,1,function(i)exp(mean(log(i)))) - names(FCforPool)=names(MeanforPool.dvd) - FC_Use=names(FCforPool)[which(FCforPool>=quantile(FCforPool[!is.na(FCforPool)],PoolLower) & FCforPool<=quantile(FCforPool[!is.na(FCforPool)],PoolUpper))] - PoolGroupVar=sapply(1:NumBin,function(i)(mean(apply(matrix(DataList.unlist[PoolGroups[[i]][PoolGroups[[i]]%in%FC_Use],],ncol=ncol(DataList.unlist)),1,var)))) - PoolGroupVarInList=sapply(1:NumBin,function(i)(rep(PoolGroupVar[i],length(PoolGroups[[i]]))),simplify=F) - PoolGroupVarVector=unlist(PoolGroupVarInList) - VarPool=PoolGroupVarVector[MeanforPool.dvd.Order] - names(VarPool)=names(MeanforPool.dvd) - } - - DataListSP=vector("list",nlevels(Conditions)) - DataListSP.dvd=vector("list",nlevels(Conditions)) - SizeFSP=DataListSP - MeanSP=DataListSP - VarSP=DataListSP - GetPSP=DataListSP - RSP=DataListSP - CISP=DataListSP - tauSP=DataListSP - - NumEachCondLevel=summary(Conditions) - if(Pool==F & is.null(CI)) CondLevelsUse=CondLevels[NumEachCondLevel>1] - if(Pool==T | !is.null(CI)) CondLevelsUse=CondLevels - NumCondUse=length(CondLevelsUse) - - for (lv in 1:nlevels(Conditions)){ - DataListSP[[lv]]= matrix(DataList.unlist[,Conditions==levels(Conditions)[lv]],nrow=dim(DataList.unlist)[1]) - rownames(DataListSP[[lv]])=rownames(DataList.unlist) - DataListSP.dvd[[lv]]= matrix(DataList.unlist.dvd[,Conditions==levels(Conditions)[lv]],nrow=dim(DataList.unlist.dvd)[1]) - if(ncol(DataListSP[[lv]])==1 & Pool==F & !is.null(CI)){ - CISP[[lv]]=matrix(CI[,Conditions==levels(Conditions)[lv]],nrow=dim(DataList.unlist.dvd)[1]) - tauSP[[lv]]=matrix(tau[,Conditions==levels(Conditions)[lv]],nrow=dim(DataList.unlist.dvd)[1]) - } - # no matter sizeFactors is a vector or a matrix. 
Matrix should be columns are the normalization factors - # may input one for each - if(length(sizeFactors)==ncol(Data))SizeFSP[[lv]]=sizeFactors[Conditions==levels(Conditions)[lv]] - if(length(sizeFactors)==length(Data))SizeFSP[[lv]]=sizeFactors[,Conditions==levels(Conditions)[lv]] - - MeanSP[[lv]]=rowMeans(DataListSP.dvd[[lv]]) - names(MeanSP[[lv]])=rownames(DataListSP[[lv]]) - - if(length(sizeFactors)==ncol(Data))PrePareVar=sapply(1:ncol( DataListSP[[lv]]),function(i)( DataListSP[[lv]][,i]- SizeFSP[[lv]][i]*MeanSP[[lv]])^2 /SizeFSP[[lv]][i]) - if(length(sizeFactors)==length(Data))PrePareVar=sapply(1:ncol( DataListSP[[lv]]),function(i)( DataListSP[[lv]][,i]- SizeFSP[[lv]][,i]*MeanSP[[lv]])^2 /SizeFSP[[lv]][,i]) - - if(ncol(DataListSP[[lv]])==1 & Pool==F & !is.null(CI)) - VarSP[[lv]]=as.vector(((DataListSP[[lv]]/tauSP[[lv]]) * CISP[[lv]]/(CIthre*2))^2) - if( Pool==T){ - VarSP[[lv]]=VarPool - } - if(ncol(DataListSP[[lv]])!=1){ - VarSP[[lv]]=rowSums(PrePareVar)/ncol( DataListSP[[lv]]) - names(VarSP[[lv]])=rownames(DataList.unlist) - GetPSP[[lv]]=MeanSP[[lv]]/VarSP[[lv]] - RSP[[lv]]=MeanSP[[lv]]*GetPSP[[lv]]/(1-GetPSP[[lv]]) - } - names(MeanSP[[lv]])=rownames(DataList.unlist) - } - - # Get Empirical R - # POOL R??? 
- MeanList=rowMeans(DataList.unlist.dvd) - VarList=apply(DataList.unlist.dvd, 1, var) - - if(NumCondUse!=0){ - Varcbind=do.call(cbind,VarSP[CondLevels%in%CondLevelsUse]) - PoolVarSpeedUp_MDFPoi_NoNormVarList=rowMeans(Varcbind) - VarrowMin=apply(Varcbind,1,min) - - } - if(NumCondUse==0) - { - NumFCgp=choose(NumCond,2) - FC_Use_tmp=vector("list",NumFCgp) - aa=1 - for(k1 in 1:(NumCond-1)){ - for(k2 in (k1+1):NumCond){ - FCforPool=DataList.unlist.dvd[,k1]/DataList.unlist.dvd[,k2] - names(FCforPool)=rownames(DataList.unlist.dvd) - FC_Use_tmp[[aa]]=names(FCforPool)[which(FCforPool>=quantile(FCforPool[!is.na(FCforPool)],.25) & - FCforPool<=quantile(FCforPool[!is.na(FCforPool)],.75))] - aa=aa+1 - }} - FC_Use=Reduce(intersect,FC_Use_tmp) - if(length(FC_Use)==0){ - All_candi=unlist(FC_Use_tmp) - FC_Use=names(table(All_candi))[1:3] - - } - Var_FC_Use=apply( DataList.unlist.dvd[FC_Use,],1,var ) - MeanforPool=apply( DataList.unlist.dvd,1,mean ) - Mean_FC_Use=apply( DataList.unlist.dvd[FC_Use,],1,mean ) - FC_Use2=which(Var_FC_Use>=Mean_FC_Use) - Var_FC_Use2=Var_FC_Use[FC_Use2] - Mean_FC_Use2=Mean_FC_Use[FC_Use2] - Phi=mean((Var_FC_Use2-Mean_FC_Use2)/Mean_FC_Use2^2) - VarEst= MeanforPool*(1+MeanforPool*Phi) - if(Print==T)message(paste("No Replicate - estimate phi",round(Phi,5), "\n")) - Varcbind=VarEst - PoolVarSpeedUp_MDFPoi_NoNormVarList=VarEst - VarrowMin=VarEst - - } - - - - GetP=MeanList/PoolVarSpeedUp_MDFPoi_NoNormVarList - - EmpiricalRList=MeanList*GetP/(1-GetP) - # sep - #Rcb=cbind(RSP[[1]],RSP[[2]]) - #Rbest=apply(Rcb,1,function(i)max(i[!is.na(i) & i!=Inf])) - EmpiricalRList[EmpiricalRList==Inf] =max(EmpiricalRList[EmpiricalRList!=Inf]) - # fine - # - GoodData=names(MeanList)[EmpiricalRList>0 & VarrowMin!=0 & EmpiricalRList!=Inf & !is.na(VarrowMin) & !is.na(EmpiricalRList)] - NotIn=names(MeanList)[EmpiricalRList<=0 | VarrowMin==0 | EmpiricalRList==Inf | is.na(VarrowMin) | is.na(EmpiricalRList)] - #NotIn.BestR=Rbest[NotIn.raw] - 
#NotIn.fix=NotIn.BestR[which(NotIn.BestR>0)] - #EmpiricalRList[names(NotIn.fix)]=NotIn.fix - #print(paste("ZeroVar",sum(VarrowMin==0), "InfR", length(which(EmpiricalRList==Inf)), "Poi", length(which(EmpiricalRList<0)), "")) - #GoodData=c(GoodData.raw,names(NotIn.fix)) - #NotIn=NotIn.raw[!NotIn.raw%in%names(NotIn.fix)] - EmpiricalRList.NotIn=EmpiricalRList[NotIn] - EmpiricalRList.Good=EmpiricalRList[GoodData] - EmpiricalRList.Good[EmpiricalRList.Good<1]=1+EmpiricalRList.Good[EmpiricalRList.Good<1] - if(length(sizeFactors)==ncol(Data)) - EmpiricalRList.Good.mat= outer(EmpiricalRList.Good, sizeFactors) - if(length(sizeFactors)==length(Data)) - EmpiricalRList.Good.mat=EmpiricalRList.Good* sizeFactors[GoodData,] - - - # Only Use Data has Good q's - DataList.In=sapply(1:NoneZeroLength, function(i)DataList[[i]][GoodData[GoodData%in%rownames(DataList[[i]])],],simplify=F) - DataList.NotIn=sapply(1:NoneZeroLength, function(i)DataList[[i]][NotIn[NotIn%in%rownames(DataList[[i]])],],simplify=F) - DataListIn.unlist=do.call(rbind, DataList.In) - DataListNotIn.unlist=do.call(rbind, DataList.NotIn) - - DataListSPIn=vector("list",nlevels(Conditions)) - DataListSPNotIn=vector("list",nlevels(Conditions)) - EmpiricalRList.Good.mat.SP=vector("list",nlevels(Conditions)) - for (lv in 1:nlevels(Conditions)){ - DataListSPIn[[lv]]= matrix(DataListIn.unlist[,Conditions==levels(Conditions)[lv]],nrow=dim(DataListIn.unlist)[1]) - if(length(NotIn)>0) DataListSPNotIn[[lv]]= matrix(DataListNotIn.unlist[,Conditions==levels(Conditions)[lv]],nrow=dim(DataListNotIn.unlist)[1]) - rownames(DataListSPIn[[lv]])=rownames(DataListIn.unlist) - if(length(NotIn)>0)rownames(DataListSPNotIn[[lv]])=rownames(DataListNotIn.unlist) - EmpiricalRList.Good.mat.SP[[lv]]=matrix(EmpiricalRList.Good.mat[,Conditions==levels(Conditions)[lv]],nrow=dim(EmpiricalRList.Good.mat)[1]) - } - - NumOfEachGroupIn=sapply(1:NoneZeroLength, function(i)max(0,dim(DataList.In[[i]])[1])) - NumOfEachGroupNotIn=sapply(1:NoneZeroLength, 
function(i)max(0,dim(DataList.NotIn[[i]])[1])) - - #Initialize SigIn & ... - AlphaIn=0.5 - BetaIn=rep(0.5,NoneZeroLength) - PIn=rep(1/nrow(AllParti),nrow(AllParti)) - - ####use while to make an infinity round? - UpdateAlpha=NULL - UpdateBeta=NULL - UpdateP=NULL - UpdatePFromZ=NULL - Timeperround=NULL - for (times in 1:maxround){ - temptime1=proc.time() - UpdateOutput=suppressWarnings(LogNMulti(DataListIn.unlist,DataListSPIn, EmpiricalRList.Good.mat ,EmpiricalRList.Good.mat.SP, - NumOfEachGroupIn, AlphaIn, BetaIn, PIn, NoneZeroLength, AllParti,Conditions)) - message(paste("iteration", times, "done \n",sep=" ")) - AlphaIn=UpdateOutput$AlphaNew - BetaIn=UpdateOutput$BetaNew - PIn=UpdateOutput$PNew - PFromZ=UpdateOutput$PFromZ - FOut=UpdateOutput$FGood - UpdateAlpha=rbind(UpdateAlpha,AlphaIn) - UpdateBeta=rbind(UpdateBeta,BetaIn) - UpdateP=rbind(UpdateP,PIn) - UpdatePFromZ=rbind(UpdatePFromZ,PFromZ) - temptime2=proc.time() - Timeperround=c(Timeperround,temptime2[3]-temptime1[3]) - message(paste("time" ,round(Timeperround[times],2),"\n",sep=" ")) - Z.output=UpdateOutput$ZEachGood - Z.NA.Names=UpdateOutput$zNaNName - } - #Remove this } after testing!! 
- -# if (times!=1){ -# if((UpdateAlpha[times]-UpdateAlpha[times-1])^2+UpdateBeta[times]-UpdateBeta[times-1])^2+UpdateR[times]-UpdateR[times-1])^2+UpdateP[times]-UpdateP[times-1])^2<=10^(-6)){ -# Result=list(Sig=SigIn, Miu=MiuIn, Tau=TauIn) -# break -# } -# } -#} - -##########Change Names############ -## Only z are for Good Ones -## Others are for ALL Data -GoodData=GoodData[!GoodData%in%Z.NA.Names] -IsoNamesIn.Good=as.vector(IsoNamesIn[GoodData]) -RealName.Z.output=Z.output -RealName.F=FOut -rownames(RealName.Z.output)=IsoNamesIn.Good -rownames(RealName.F)=IsoNamesIn.Good - -RealName.EmpiricalRList=sapply(1:NoneZeroLength,function(i)EmpiricalRList[names(EmpiricalRList)%in%NameList[[i]]], simplify=F) -RealName.MeanList=sapply(1:NoneZeroLength,function(i)MeanList[names(MeanList)%in%NameList[[i]]], simplify=F) -RealName.SPMeanList=sapply(1:NoneZeroLength,function(i)sapply(1:length(MeanSP), function(j)MeanSP[[j]][names(MeanSP[[j]])%in%NameList[[i]]],simplify=F), simplify=F) -RealName.SPVarList=sapply(1:NoneZeroLength,function(i)sapply(1:length(VarSP), function(j)VarSP[[j]][names(VarSP[[j]])%in%NameList[[i]]],simplify=F), simplify=F) -RealName.DataList=sapply(1:NoneZeroLength,function(i)DataList[[i]][rownames(DataList[[i]])%in%NameList[[i]],], simplify=F) - -RealName.VarList=sapply(1:NoneZeroLength,function(i)VarList[names(VarList)%in%NameList[[i]]], simplify=F) -RealName.PoolVarList=sapply(1:NoneZeroLength,function(i)PoolVarSpeedUp_MDFPoi_NoNormVarList[names(PoolVarSpeedUp_MDFPoi_NoNormVarList)%in%NameList[[i]]], simplify=F) -RealName.QList=sapply(1:NoneZeroLength,function(i)sapply(1:length(GetPSP), function(j)GetPSP[[j]][names(GetPSP[[j]])%in%NameList[[i]]],simplify=F), simplify=F) - - -for (i in 1:NoneZeroLength){ -tmp=NameList[[i]] -Names=IsoNamesIn[tmp] -RealName.MeanList[[i]]=RealName.MeanList[[i]][NameList[[i]]] -RealName.VarList[[i]]=RealName.VarList[[i]][NameList[[i]]] - for(j in 1:NumCond){ - 
RealName.SPMeanList[[i]][[j]]=RealName.SPMeanList[[i]][[j]][NameList[[i]]] - if(!is.null(RealName.QList[[i]][[j]])){ - RealName.QList[[i]][[j]]=RealName.QList[[i]][[j]][NameList[[i]]] - RealName.SPVarList[[i]][[j]]=RealName.SPVarList[[i]][[j]][NameList[[i]]] - names(RealName.QList[[i]][[j]])=Names - names(RealName.SPVarList[[i]][[j]])=Names - } - names(RealName.SPMeanList[[i]][[j]])=Names - } -RealName.EmpiricalRList[[i]]=RealName.EmpiricalRList[[i]][NameList[[i]]] -RealName.PoolVarList[[i]]=RealName.PoolVarList[[i]][NameList[[i]]] -RealName.DataList[[i]]=RealName.DataList[[i]][NameList[[i]],] - -names(RealName.MeanList[[i]])=Names -names(RealName.VarList[[i]])=Names - -names(RealName.EmpiricalRList[[i]])=Names -names(RealName.PoolVarList[[i]])=Names -rownames(RealName.DataList[[i]])=Names - -} - - -#########posterior part for other data set here later############ -AllNA=unique(c(Z.NA.Names,NotIn)) -AllZ=NULL -AllF=NULL -if(length(AllNA)==0){ - AllZ=RealName.Z.output[IsoNamesIn,] - AllF=RealName.F[IsoNamesIn,] -} -ZEachNA=NULL -if (length(AllNA)>0){ - Ng.NA=NgVector[AllNA] - AllNA.Ngorder=AllNA[order(Ng.NA)] - NumOfEachGroupNA=rep(0,NoneZeroLength) - NumOfEachGroupNA.tmp=tapply(Ng.NA,Ng.NA,length) - names(NumOfEachGroupNA)=c(1:NoneZeroLength) - NumOfEachGroupNA[names(NumOfEachGroupNA.tmp)]=NumOfEachGroupNA.tmp - PNotIn=rep(1-ApproxVal,length(AllNA.Ngorder)) - MeanList.NotIn=MeanList[AllNA.Ngorder] - R.NotIn.raw=MeanList.NotIn*PNotIn/(1-PNotIn) - if(length(sizeFactors)==ncol(Data)) - R.NotIn=matrix(outer(R.NotIn.raw,sizeFactors),nrow=length(AllNA.Ngorder)) - if(length(sizeFactors)==length(Data)) - R.NotIn=matrix(R.NotIn.raw*sizeFactors[NotIn,],nrow=length(AllNA.Ngorder)) - - DataListNotIn.unlistWithZ=matrix(DataList.unlist[AllNA.Ngorder,], - nrow=length(AllNA.Ngorder)) - rownames(DataListNotIn.unlistWithZ)=AllNA.Ngorder - DataListSPNotInWithZ=vector("list",nlevels(Conditions)) - RListSPNotInWithZ=vector("list",nlevels(Conditions)) - for (lv in 1:nlevels(Conditions)) 
{ - DataListSPNotInWithZ[[lv]] = matrix(DataListSP[[lv]][AllNA.Ngorder,],nrow=length(AllNA.Ngorder)) - RListSPNotInWithZ[[lv]]=matrix(R.NotIn[,Conditions==levels(Conditions)[lv]],nrow=length(AllNA.Ngorder)) - } - - FListNA=sapply(1:nrow(AllParti),function(i)sapply(1:nlevels(as.factor(AllParti[i,])), - function(j)f0(do.call(cbind, DataListSPNotInWithZ[AllParti[i,]==j]),AlphaIn, BetaIn, - do.call(cbind,RListSPNotInWithZ[AllParti[i,]==j]), NumOfEachGroupNA, log=T)), - simplify=F) - for(ii in 1:length(FListNA)) - FListNA[[ii]]=matrix(FListNA[[ii]],nrow=length(AllNA.Ngorder)) - FPartiLogNA=matrix(sapply(FListNA,rowSums),nrow=length(AllNA.Ngorder)) - FMatNA=exp(FPartiLogNA+600) - - rownames(FMatNA)=rownames(DataListNotIn.unlistWithZ) - PMatNA=matrix(rep(1,nrow(DataListNotIn.unlistWithZ)),ncol=1)%*%matrix(PIn,nrow=1) - FmultiPNA=matrix(FMatNA*PMatNA,nrow=length(AllNA.Ngorder)) - DenomNA=rowSums(FmultiPNA) - ZEachNA=matrix(apply(FmultiPNA,2,function(i)i/DenomNA),nrow=length(AllNA.Ngorder)) - - rownames(ZEachNA)=IsoNamesIn[AllNA.Ngorder] - - AllZ=rbind(RealName.Z.output,ZEachNA) - AllZ=AllZ[IsoNamesIn,] - - F.NotIn=FPartiLogNA - rownames(F.NotIn)=IsoNamesIn[rownames(FMatNA)] - AllF=rbind(RealName.F,F.NotIn) - AllF=AllF[IsoNamesIn,] - -} -colnames(AllZ)=rownames(AllParti) -colnames(AllF)=rownames(AllParti) -rownames(UpdateAlpha)=paste("iter",1:nrow(UpdateAlpha),sep="") -rownames(UpdateBeta)=paste("iter",1:nrow(UpdateBeta),sep="") -rownames(UpdateP)=paste("iter",1:nrow(UpdateP),sep="") -rownames(UpdatePFromZ)=paste("iter",1:nrow(UpdatePFromZ),sep="") -colnames(UpdateBeta)=paste("Ng",1:ncol(UpdateBeta),sep="") - -CondOut=levels(Conditions) -names(CondOut)=paste("Condition",c(1:length(CondOut)),sep="") - -AllZWith0=matrix(NA,ncol=ncol(AllZ),nrow=nrow(Dataraw)) -rownames(AllZWith0)=rownames(Dataraw) -colnames(AllZWith0)=colnames(AllZ) -if(is.null(AllZeroNames))AllZWith0=AllZ -if(!is.null(AllZeroNames))AllZWith0[names(NotAllZeroNames),]=AllZ[names(NotAllZeroNames),] - 
-#############Result############################ -Result=list(Alpha=UpdateAlpha,Beta=UpdateBeta,P=UpdateP,PFromZ=UpdatePFromZ, - Z=RealName.Z.output,PoissonZ=ZEachNA, RList=RealName.EmpiricalRList, MeanList=RealName.MeanList, - VarList=RealName.VarList, QList=RealName.QList, SPMean=RealName.SPMeanList, SPEstVar=RealName.SPVarList, - PoolVar=RealName.PoolVarList , DataList=RealName.DataList,PPpattern=AllZ,f=AllF, AllParti=AllParti, - PPMat=AllZ,PPMatWith0=AllZWith0, ConditionOrder=CondOut) -} - diff --git a/.svn/pristine/44/4437fd66dbf5a6e76f2e00b64338c809274f4832.svn-base b/.svn/pristine/44/4437fd66dbf5a6e76f2e00b64338c809274f4832.svn-base deleted file mode 100644 index ec4cee0..0000000 --- a/.svn/pristine/44/4437fd66dbf5a6e76f2e00b64338c809274f4832.svn-base +++ /dev/null @@ -1,101 +0,0 @@ -\name{PolyFitPlot} -\alias{PolyFitPlot} -\title{ -Fit the mean-var relationship using polynomial regression -} -\description{ -'PolyFitPlot' fits the mean-var relationship using polynomial regression. -} - -\usage{ -PolyFitPlot(X, Y, nterms, xname = "Estimated Mean", - yname = "Estimated Var", pdfname = "", - xlim = c(-1,5), ylim = c(-1,7), ChangeXY = F, - col = "red") -} - -%- maybe also 'usage' for other objects documented here. -\arguments{ - \item{X}{ -The first group of values want to be fitted by the polynomial regression (e.g Mean of the data). -} - \item{Y}{ -The second group of values want to be fitted by the polynomial regression (e.g. variance of the data). The length of Y should be the same as the length of X. -} - \item{nterms}{ -How many polynomial terms want to be used. -} - \item{xname}{ -Name of the x axis. -} - \item{yname}{ -Name of the y axis. -} - \item{pdfname}{ -Name of the plot. -} - \item{xlim}{ -The x limits of the plot. -} - \item{ylim}{ -The y limits of the plot. - -} - \item{ChangeXY}{ -If ChangeXY is setted to be TRUE, X will be treated as the dependent variable and Y will be treated as the independent one. Default is FALSE. 
-} - \item{col}{ -Color of the fitted line. -} -} -\value{The PolyFitPlot function provides a smooth scatter plot of two variables and their best fitting line of polynomial regression. -} - -\references{ -Ning Leng, John A. Dawson, James A. Thomson, Victor Ruotti, Anna I. Rissman, Bart M.G. Smits, Jill D. Haag, Michael N. Gould, Ron M. Stewart, and Christina Kendziorski. EBSeq: An empirical Bayes hierarchical model for inference in RNA-seq experiments. Bioinformatics (2013) -} -\author{ -Ning Leng -} - - -\examples{ -data(IsoList) -str(IsoList) -IsoMat = IsoList$IsoMat -IsoNames = IsoList$IsoNames -IsosGeneNames = IsoList$IsosGeneNames -IsoSizes = MedianNorm(IsoMat) -NgList = GetNg(IsoNames, IsosGeneNames) - -IsoNgTrun = NgList$IsoformNgTrun -#IsoEBOut = EBTest(Data = IsoMat.small, -# NgVector = IsoNgTrun, -# Conditions = as.factor(rep(c("C1","C2"), each=5)), -# sizeFactors = IsoSizes, maxround = 5) - -#par(mfrow=c(2,2)) -#PolyFitValue = vector("list",3) - -#for(i in 1:3) -# PolyFitValue[[i]] = PolyFitPlot(IsoEBOut$C1Mean[[i]], -# IsoEBOut$C1EstVar[[i]], 5) - -#PolyAll = PolyFitPlot(unlist(IsoEBOut$C1Mean), -# unlist(IsoEBOut$C1EstVar), 5) - -#lines(log10(IsoEBOut$C1Mean[[1]][PolyFitValue[[1]]$sort]), -# PolyFitValue[[1]]$fit[PolyFitValue[[1]]$sort], -# col="yellow", lwd=2) -#lines(log10(IsoEBOut$C1Mean[[2]][PolyFitValue[[2]]$sort]), -# PolyFitValue[[2]]$fit[PolyFitValue[[2]]$sort], -# col="pink", lwd=2) -#lines(log10(IsoEBOut$C1Mean[[3]][PolyFitValue[[3]]$sort]), -# PolyFitValue[[3]]$fit[PolyFitValue[[3]]$sort], -# col="green", lwd=2) - -#legend("topleft",c("All Isoforms","Ng = 1","Ng = 2","Ng = 3"), -# col = c("red","yellow","pink","green"), -# lty=1, lwd=3, box.lwd=2) - -} diff --git a/.svn/pristine/45/45577ba2ca526584ef358086de2c264bc1d5e5eb.svn-base b/.svn/pristine/45/45577ba2ca526584ef358086de2c264bc1d5e5eb.svn-base deleted file mode 100644 index 11215cc..0000000 --- a/.svn/pristine/45/45577ba2ca526584ef358086de2c264bc1d5e5eb.svn-base +++ /dev/null @@ -1,13 
+0,0 @@ - -RankNorm=function(Data){ - if(ncol(Data)==1)stop("Only 1 sample!") - - RankData=apply(Data, 2, rank) - SortData=apply(Data, 2, sort) - SortMean=rowMeans(SortData) - SortMean[SortMean==0]=1 - NormMatrix=sapply(1:ncol(Data), function(i)Data[,i]/(SortMean[RankData[,i]])) - NormMatrix[NormMatrix==0]=1 - NormMatrix - } - diff --git a/.svn/pristine/4a/4ac14ffeba216dd2661b49ebc82f719baf39f56c.svn-base b/.svn/pristine/4a/4ac14ffeba216dd2661b49ebc82f719baf39f56c.svn-base deleted file mode 100644 index 97e4d39..0000000 --- a/.svn/pristine/4a/4ac14ffeba216dd2661b49ebc82f719baf39f56c.svn-base +++ /dev/null @@ -1,103 +0,0 @@ -\name{EBTest} -\alias{EBTest} -%- Also NEED an '\alias' for EACH other topic documented here. -\title{ -Using EM algorithm to calculate the posterior probabilities of being DE -} -\description{ -Base on the assumption of NB-Beta Empirical Bayes model, the EM algorithm is used to get the posterior probability of being DE. -} -\usage{ -EBTest(Data, NgVector = NULL, Conditions, sizeFactors, maxround, - Pool = F, NumBin = 1000, ApproxVal = 10^-10, Alpha = NULL, - Beta = NULL, PInput = NULL, RInput = NULL, - PoolLower = .25, PoolUpper = .75, Print = T, Qtrm = .75,QtrmCut=10) -} -\arguments{ - - \item{Data}{A data matrix contains expression values for each transcript (gene or isoform level). In which rows should be transcripts and columns should be samples.} - \item{NgVector}{A vector indicates the uncertainty group assignment of each isoform. -e.g. if we use number of isoforms in the host gene to define the uncertainty groups, suppose the isoform is in a gene with 2 isoforms, Ng of this isoform should be 2. The length of this vector should be the same as the number of rows in Data. If it's gene level data, Ngvector could be left as NULL.} - \item{Conditions}{A factor indicates the condition which each sample belongs to. } - \item{sizeFactors}{The normalization factors. 
It should be a vector with lane specific numbers (the length of the vector should be the same as the number of samples, with the same order as the columns of Data).} - \item{maxround}{Number of iterations. The default value is 5. Users should always check the convergency by looking at the Alpha and Beta in output. If the hyper-parameter estimations are not converged in 5 iterations, larger number is suggested.} -\item{Pool}{While working without replicates, user could define the Pool = TRUE in the EBTest function to enable pooling.} -\item{NumBin}{By defining NumBin = 1000, EBSeq will group the genes with similar means together into 1,000 bins.} -\item{PoolLower, PoolUpper}{ -With the assumption that only subset of the genes are DE in the data set, we take genes whose FC are in the PoolLower - PoolUpper quantile of the FC's as the candidate genes (default is 25\%-75\%). - -For each bin, the bin-wise variance estimation is defined as the median of the cross condition variance estimations of the candidate genes within that bin. - -We use the cross condition variance estimations for the candidate genes and the bin-wise variance estimations of the host bin for the non-candidate genes. -} - -\item{ApproxVal}{The variances of the transcripts with mean < var will be approximated as mean/(1-ApproxVal). } - -\item{Alpha, Beta, PInput, RInput}{If the parameters are known and the user doesn't want to estimate them from the data, user could specify them here.} -\item{Print}{Whether print the elapsed-time while running the test.} -\item{Qtrm, QtrmCut}{ -Transcripts with Qtrm th quantile < = QtrmCut will be removed before testing. The default value is Qtrm = 0.75 and QtrmCut=10. -By default setting, transcripts that have >75\% of the samples with expression less than 10 -won't be tested. 
-} -} - -\details{For each transcript gi within condition, the model assumes: -X_{gis}|mu_{gi} ~ NB (r_{gi0} * l_s, q_{gi}) -q_gi|alpha, beta^N_g ~ Beta (alpha, beta^N_g) -In which the l_s is the sizeFactors of samples. - -The function will test "H0: q_{gi}^{C1} = q_{gi}^{C2}" and "H1: q_{gi}^{C1} != q_{gi}^{C2}." -} -\value{ -\item{Alpha}{Fitted parameter alpha of the prior beta distribution. Rows are the values for each iteration.} -\item{Beta}{Fitted parameter beta of the prior beta distribution. Rows are the values for each iteration.} -\item{P, PFromZ}{The bayes estimator of being DE. Rows are the values for each iteration.} -\item{Z, PoissonZ}{The Posterior Probability of being DE for each transcript(Maybe not in the same order of input). } -\item{RList}{The fitted values of r for each transcript.} -\item{MeanList}{The mean of each transcript (across conditions).} -\item{VarList}{The variance of each transcript (across conditions).} -\item{QListi1}{The fitted q values of each transcript within condition 1.} -\item{QListi2}{The fitted q values of each transcript within condition 2.} -\item{C1Mean}{The mean of each transcript within Condition 1 (adjusted by normalization factors).} -\item{C2Mean}{The mean of each transcript within Condition 2 (adjusted by normalization factors).} -\item{C1EstVar}{The estimated variance of each transcript within Condition 1 (adjusted by normalization factors).} -\item{C2EstVar}{The estimated variance of each transcript within Condition 2 (adjusted by normalization factors).} -\item{PoolVar}{The variance of each transcript (The pooled value of within condition EstVar).} -\item{DataList}{A List of data that grouped with Ng.} -\item{PPDE}{The Posterior Probability of being DE for each transcript (The same order of input).} -\item{f0,f1}{The likelihood of the prior predictive distribution of being EE or DE (in log scale).} -\item{AllZeroIndex}{The transcript with expression 0 for all samples (which are not tested).} -\item{PPMat}{A 
matrix contains posterior probabilities of being EE (the first column) or DE (the second column). -Rows are transcripts. -Transcripts with expression 0 for all samples are not shown in this matrix.} -\item{PPMatWith0}{A matrix contains posterior probabilities of being EE (the first column) or DE (the second column). -Rows are transcripts. -Transcripts with expression 0 for all samples are shown as PP(EE) = PP(DE) = NA in this matrix. -The transcript order is exactly the same as the order of the input data.} -\item{ConditionOrder}{The condition assignment for C1Mean, C2Mean, etc.} -\item{Conditions}{The input conditions.} -} -\references{ -Ning Leng, John A. Dawson, James A. Thomson, Victor Ruotti, Anna I. Rissman, Bart M.G. Smits, Jill D. Haag, Michael N. Gould, Ron M. Stewart, and Christina Kendziorski. EBSeq: An empirical Bayes hierarchical model for inference in RNA-seq experiments. Bioinformatics (2013) -} -\author{ -Ning Leng -} - - -\seealso{ -EBMultiTest, PostFC, GetPPMat -} -\examples{ -data(GeneMat) -str(GeneMat) -GeneMat.small = GeneMat[c(1:10,511:550),] -Sizes = MedianNorm(GeneMat.small) -EBOut = EBTest(Data = GeneMat.small, - Conditions = as.factor(rep(c("C1","C2"), each = 5)), - sizeFactors = Sizes, maxround = 5) -PP = GetPPMat(EBOut) -} -\keyword{ DE } -\keyword{ Two condition }% __ONLY ONE__ keyword per line diff --git a/.svn/pristine/4b/4b005a6ce54d36871757440b4b936825b3e086b6.svn-base b/.svn/pristine/4b/4b005a6ce54d36871757440b4b936825b3e086b6.svn-base deleted file mode 100644 index 5048e92..0000000 --- a/.svn/pristine/4b/4b005a6ce54d36871757440b4b936825b3e086b6.svn-base +++ /dev/null @@ -1,36 +0,0 @@ -\name{Likefun} -\alias{Likefun} -\title{ -Likelihood Function of the NB-Beta Model -} -\description{ -'Likefun' specifies the Likelihood Function of the NB-Beta Model. 
-} -\usage{ -Likefun(ParamPool, InputPool) -} -\arguments{ - \item{ParamPool}{The parameters that will be estimated in EM.} - \item{InputPool}{The control parameters that will not be estimated in EM.} -} - -\value{The function will return the log-likelihood. -} - -\references{ -Ning Leng, John A. Dawson, James A. Thomson, Victor Ruotti, Anna I. Rissman, Bart M.G. Smits, Jill D. Haag, Michael N. Gould, Ron M. Stewart, and Christina Kendziorski. EBSeq: An empirical Bayes hierarchical model for inference in RNA-seq experiments. Bioinformatics (2013) -} - -\author{ -Ning Leng -} - - -\examples{ -#x1 = c(.6,.7,.3) -#Input = matrix(rnorm(100,100,1), ncol=10) -#RIn = matrix(rnorm(100,200,1), ncol=10) -#InputPool = list(Input[,1:5], Input[,6:10], Input, -# rep(.1,100), 1, RIn, RIn[,1:5], RIn[,6:10], 100) -#Likefun(x1, InputPool) -} diff --git a/.svn/pristine/4d/4d83f4511ff15ce4e6133961f6b85b6d1a8af0a3.svn-base b/.svn/pristine/4d/4d83f4511ff15ce4e6133961f6b85b6d1a8af0a3.svn-base deleted file mode 100644 index 9b63076..0000000 --- a/.svn/pristine/4d/4d83f4511ff15ce4e6133961f6b85b6d1a8af0a3.svn-base +++ /dev/null @@ -1,5 +0,0 @@ -export(crit_fun, DenNHist, EBTest, GetNg, GetPP, MedianNorm, -PolyFitPlot, PostFC, QQP, QuantileNorm, RankNorm, EBMultiTest, -GetMultiPP, GetPatterns, PlotPattern, GetPPMat, GetMultiFC, PlotPostVsRawFC, -GetNormalizedMat,f0,f1,LogN,LogNMulti) - diff --git a/.svn/pristine/56/56385a0ff28ccc359587f57038fcc989160de852.svn-base b/.svn/pristine/56/56385a0ff28ccc359587f57038fcc989160de852.svn-base deleted file mode 100644 index dc33015..0000000 --- a/.svn/pristine/56/56385a0ff28ccc359587f57038fcc989160de852.svn-base +++ /dev/null @@ -1,33 +0,0 @@ -\name{GetNormalizedMat} -\alias{GetNormalizedMat} -\title{ -Calculate normalized expression matrix -} -\description{ -'GetNormalizedMat' calculates the normalized expression matrix. -(Note: this matrix is only used for visualization etc. 
EBTes and EBMultiTest request *un-adjusted* expressions and normalization factors.) -} -\usage{ -GetNormalizedMat(Data, Sizes) -} -\arguments{ - - \item{Data}{The data matrix with transcripts in rows and lanes in columns.} -\item{Sizes}{A vector contains the normalization factor for each lane.} -} -\value{The function will return a normalized matrix.} -\references{ -Ning Leng, John A. Dawson, James A. Thomson, Victor Ruotti, Anna I. Rissman, Bart M.G. Smits, Jill D. Haag, Michael N. Gould, Ron M. Stewart, and Christina Kendziorski. EBSeq: An empirical Bayes hierarchical model for inference in RNA-seq experiments. Bioinformatics (2013) -} -\author{ -Ning Leng -} - -\examples{ - -data(GeneMat) -str(GeneMat) -Sizes = MedianNorm(GeneMat) -NormData = GetNormalizedMat(GeneMat, Sizes) -} -\keyword{ Normalization }% __ONLY ONE__ keyword per line diff --git a/.svn/pristine/57/577a293b2e1d563e87d92779954895a644adf0c2.svn-base b/.svn/pristine/57/577a293b2e1d563e87d92779954895a644adf0c2.svn-base deleted file mode 100644 index c96ea1e..0000000 --- a/.svn/pristine/57/577a293b2e1d563e87d92779954895a644adf0c2.svn-base +++ /dev/null @@ -1,10 +0,0 @@ - -QuantileNorm=function(Data, Quantile){ - if(ncol(Data)==1)stop("Only 1 sample!") - - QtilePt=apply(Data, 2, function(i)quantile(i, Quantile)) -# Size= QtilePt * prod(QtilePt) ^ (-1/ncol(Data)) - Size=10^(log10(QtilePt)-sum(log10(QtilePt))*(1/ncol(Data)) ) - Size - } - diff --git a/.svn/pristine/5a/5a5f48d3517f8a68890287fc46595ee263210ef1.svn-base b/.svn/pristine/5a/5a5f48d3517f8a68890287fc46595ee263210ef1.svn-base deleted file mode 100644 index ac9505f..0000000 --- a/.svn/pristine/5a/5a5f48d3517f8a68890287fc46595ee263210ef1.svn-base +++ /dev/null @@ -1,39 +0,0 @@ -\name{GetPPMat} -\alias{GetPPMat} -\title{ -Posterior Probability of Transcripts -} -\description{ -'GetPPMat' generates the Posterior Probability of being each pattern of each transcript based on the EBTest output. 
-} -\usage{ -GetPPMat(EBout) -} -\arguments{ - \item{EBout}{The output of EBTest function.} - -} -\value{The poster probabilities of being EE (first column) and DE (second column). -} -\references{ -Ning Leng, John A. Dawson, James A. Thomson, Victor Ruotti, Anna I. Rissman, Bart M.G. Smits, Jill D. Haag, Michael N. Gould, Ron M. Stewart, and Christina Kendziorski. EBSeq: An empirical Bayes hierarchical model for inference in RNA-seq experiments. Bioinformatics (2013) -} -\author{ -Ning Leng -} - -\examples{ - -data(GeneMat) -GeneMat.small = GeneMat[c(500:550),] -Sizes = MedianNorm(GeneMat.small) -EBOut = EBTest(Data = GeneMat.small, - Conditions = as.factor(rep(c("C1","C2"), each=5)), - sizeFactors = Sizes, maxround = 5) -PP = GetPPMat(EBOut) -str(PP) -head(PP) -} -% Add one or more standard keywords, see file 'KEYWORDS' in the -% R documentation directory. -\keyword{ Posterior Probability } diff --git a/.svn/pristine/61/61bef68fab225a62acd5fb5506edfa40ae1f2819.svn-base b/.svn/pristine/61/61bef68fab225a62acd5fb5506edfa40ae1f2819.svn-base deleted file mode 100644 index aa93979..0000000 --- a/.svn/pristine/61/61bef68fab225a62acd5fb5506edfa40ae1f2819.svn-base +++ /dev/null @@ -1,40 +0,0 @@ -\name{LikefunMulti} -\alias{LikefunMulti} -\title{ -Likelihood Function of the NB-Beta Model In Multiple Condition Test -} -\description{ -'LikefunMulti' specifies the Likelihood Function of the NB-Beta Model In Multiple Condition Test. -} -\usage{ -LikefunMulti(ParamPool, InputPool) -} - -\arguments{ - \item{ParamPool}{The parameters that will be estimated in EM.} - \item{InputPool}{The control parameters that will not be estimated in EM.} -} - -\value{The function will return the log-likelihood.} - -\references{ -Ning Leng, John A. Dawson, James A. Thomson, Victor Ruotti, Anna I. Rissman, Bart M.G. Smits, Jill D. Haag, Michael N. Gould, Ron M. Stewart, and Christina Kendziorski. EBSeq: An empirical Bayes hierarchical model for inference in RNA-seq experiments. 
Bioinformatics (2013) -} - -\author{ -Ning Leng -} - -\examples{ -#x1 = c(.6,.7,.3) -#Input = matrix(rnorm(100,100,1),ncol=10) -#RIn = matrix(rnorm(100,200,1),ncol=10) -#InputPool = list(list(Input[,1:5],Input[,6:10]), -# Input, cbind(rep(.1, 10), rep(.9,10)), 1, -# RIn, list(RIn[,1:5],RIn[,6:10]), -# 10, rbind(c(1,1),c(1,2))) -#LikefunMulti(x1, InputPool) - -} - - diff --git a/.svn/pristine/67/670a6ee7b49bec648a6bbe7cd6fe01dfb46150fa.svn-base b/.svn/pristine/67/670a6ee7b49bec648a6bbe7cd6fe01dfb46150fa.svn-base deleted file mode 100644 index 3d5024d..0000000 --- a/.svn/pristine/67/670a6ee7b49bec648a6bbe7cd6fe01dfb46150fa.svn-base +++ /dev/null @@ -1,250 +0,0 @@ -library(EBSeq) -# 3.1 -data(GeneMat) -str(GeneMat) -Sizes=MedianNorm(GeneMat) -EBOut=EBTest(Data=GeneMat, - Conditions=as.factor(rep(c("C1","C2"),each=5)),sizeFactors=Sizes, maxround=5) -PP=GetPPMat(EBOut) -str(PP) -head(PP) -DEfound=rownames(PP)[which(PP[,"PPDE"]>=.95)] -str(DEfound) -#3.2 -data(IsoList) -str(IsoList) -IsoMat=IsoList$IsoMat -str(IsoMat) -IsoNames=IsoList$IsoNames -IsosGeneNames=IsoList$IsosGeneNames -IsoSizes=MedianNorm(IsoMat) -NgList=GetNg(IsoNames, IsosGeneNames) -IsoNgTrun=NgList$IsoformNgTrun -IsoNgTrun[c(1:3,201:203,601:603)] -IsoEBOut=EBTest(Data=IsoMat, NgVector=IsoNgTrun, - Conditions=as.factor(rep(c("C1","C2"),each=5)),sizeFactors=IsoSizes, maxround=5) -IsoPP=GetPPMat(IsoEBOut) -str(IsoPP) -head(IsoPP) -IsoDE=rownames(IsoPP)[which(IsoPP[,"PPDE"]>=.95)] -str(IsoDE) -#3.3 -data(MultiGeneMat) -str(MultiGeneMat) -Conditions=c("C1","C1","C2","C2","C3","C3") -PosParti=GetPatterns(Conditions) -PosParti -Parti=PosParti[-3,] -Parti -MultiSize=MedianNorm(MultiGeneMat) -MultiOut=EBMultiTest(MultiGeneMat,NgVector=NULL,Conditions=Conditions, - AllParti=Parti, sizeFactors=MultiSize, maxround=5) -MultiPP=GetMultiPP(MultiOut) -names(MultiPP) -MultiPP$PP[1:10,] -MultiPP$MAP[1:10] -MultiPP$Patterns - -#3.4 -data(IsoMultiList) -IsoMultiMat=IsoMultiList[[1]] -IsoNames.Multi=IsoMultiList$IsoNames 
-IsosGeneNames.Multi=IsoMultiList$IsosGeneNames -IsoMultiSize=MedianNorm(IsoMultiMat) -NgList.Multi=GetNg(IsoNames.Multi, IsosGeneNames.Multi) -IsoNgTrun.Multi=NgList.Multi$IsoformNgTrun -Conditions=c("C1","C1","C2","C2","C3","C3","C4","C4") -PosParti.4Cond=GetPatterns(Conditions) -PosParti.4Cond -Parti.4Cond=PosParti.4Cond[c(1,2,3,8,15),] -Parti.4Cond -IsoMultiOut=EBMultiTest(IsoMultiMat,NgVector=IsoNgTrun.Multi,Conditions=Conditions, - AllParti=Parti.4Cond, sizeFactors=IsoMultiSize, maxround=5) -IsoMultiPP=GetMultiPP(IsoMultiOut) -names(MultiPP) -IsoMultiPP$PP[1:10,] -IsoMultiPP$MAP[1:10] -IsoMultiPP$Patterns - - -#4.1 -data(GeneMat) -str(GeneMat) -Sizes=MedianNorm(GeneMat) -EBOut=EBTest(Data=GeneMat, - Conditions=as.factor(rep(c("C1","C2"),each=5)),sizeFactors=Sizes, maxround=5) -PP=GetPPMat(EBOut) -str(PP) -head(PP) -DEfound=rownames(PP)[which(PP[,"PPDE"]>=.95)] -str(DEfound) -EBOut$Alpha -EBOut$Beta -EBOut$P -GeneFC=PostFC(EBOut) -str(GeneFC) -par(mfrow=c(2,2)) -QQP(EBOut) -par(mfrow=c(2,2)) -DenNHist(EBOut) -PlotPostVsRawFC(EBOut,GeneFC) - -#4.2 -data(IsoList) -str(IsoList) -IsoMat=IsoList$IsoMat -str(IsoMat) -IsoNames=IsoList$IsoNames -IsosGeneNames=IsoList$IsosGeneNames -IsoSizes=MedianNorm(IsoMat) -NgList=GetNg(IsoNames, IsosGeneNames) -IsoNgTrun=NgList$IsoformNgTrun -IsoNgTrun[c(1:3,201:203,601:603)] -IsoEBOut=EBTest(Data=IsoMat, NgVector=IsoNgTrun, - Conditions=as.factor(rep(c("C1","C2"),each=5)),sizeFactors=IsoSizes, maxround=5) -IsoPP=GetPPMat(IsoEBOut) -str(IsoPP) -IsoDE=rownames(IsoPP)[which(IsoPP[,"PPDE"]>=.95)] -str(IsoDE) -IsoEBOut$Alpha -IsoEBOut$Beta -IsoEBOut$P -IsoFC=PostFC(IsoEBOut) -str(IsoFC) -PlotPostVsRawFC(IsoEBOut,IsoFC) - -par(mfrow=c(2,2)) -PolyFitValue=vector("list",3) -for(i in 1:3) - PolyFitValue[[i]]=PolyFitPlot(IsoEBOut$C1Mean[[i]], - IsoEBOut$C1EstVar[[i]],5) -PolyAll=PolyFitPlot(unlist(IsoEBOut$C1Mean), unlist(IsoEBOut$C1EstVar),5) -lines(log10(IsoEBOut$C1Mean[[1]][PolyFitValue[[1]]$sort]), - 
PolyFitValue[[1]]$fit[PolyFitValue[[1]]$sort],col="yellow",lwd=2) -lines(log10(IsoEBOut$C1Mean[[2]][PolyFitValue[[2]]$sort]), - PolyFitValue[[2]]$fit[PolyFitValue[[2]]$sort],col="pink",lwd=2) -lines(log10(IsoEBOut$C1Mean[[3]][PolyFitValue[[3]]$sort]), - PolyFitValue[[3]]$fit[PolyFitValue[[3]]$sort],col="green",lwd=2) -legend("topleft",c("All Isoforms","Ig = 1","Ig = 2","Ig = 3"), - col=c("red","yellow","pink","green"),lty=1,lwd=3,box.lwd=2) -par(mfrow=c(2,3)) -QQP(IsoEBOut) -par(mfrow=c(2,3)) -DenNHist(IsoEBOut) - - -#4.3 -data(MultiGeneMat) -str(MultiGeneMat) -Conditions=c("C1","C1","C2","C2","C3","C3") -PosParti=GetPatterns(Conditions) -PosParti -PlotPattern(PosParti) -Parti=PosParti[-3,] -Parti -MultiSize=MedianNorm(MultiGeneMat) -MultiOut=EBMultiTest(MultiGeneMat,NgVector=NULL,Conditions=Conditions, - AllParti=Parti, sizeFactors=MultiSize, maxround=5) -MultiPP=GetMultiPP(MultiOut) -names(MultiPP) -MultiPP$PP[1:10,] -MultiPP$MAP[1:10] -MultiPP$Patterns -MultiFC=GetMultiFC(MultiOut) -str(MultiFC) -par(mfrow=c(2,2)) -DenNHist(MultiOut) -par(mfrow=c(2,2)) -QQP(MultiOut) - -#4.4 -data(IsoMultiList) -IsoMultiMat=IsoMultiList[[1]] -IsoNames.Multi=IsoMultiList$IsoNames -IsosGeneNames.Multi=IsoMultiList$IsosGeneNames -IsoMultiSize=MedianNorm(IsoMultiMat) -NgList.Multi=GetNg(IsoNames.Multi, IsosGeneNames.Multi) -IsoNgTrun.Multi=NgList.Multi$IsoformNgTrun -Conditions=c("C1","C1","C2","C2","C3","C3","C4","C4") -PosParti.4Cond=GetPatterns(Conditions) -PosParti.4Cond -PlotPattern(PosParti.4Cond) -Parti.4Cond=PosParti.4Cond[c(1,2,3,8,15),] -Parti.4Cond -IsoMultiOut=EBMultiTest(IsoMultiMat,NgVector=IsoNgTrun.Multi,Conditions=Conditions, - AllParti=Parti.4Cond, sizeFactors=IsoMultiSize, maxround=5) -IsoMultiPP=GetMultiPP(IsoMultiOut) -names(MultiPP) -IsoMultiPP$PP[1:10,] -IsoMultiPP$MAP[1:10] -IsoMultiPP$Patterns -IsoMultiFC=GetMultiFC(IsoMultiOut) -str(IsoMultiFC) -par(mfrow=c(3,4)) -DenNHist(IsoMultiOut) -par(mfrow=c(3,4)) -QQP(IsoMultiOut) -IsoMultiFC=GetMultiFC(IsoMultiOut) 
- - - -#4.5 -data(GeneMat) -GeneMat.norep=GeneMat[,c(1,6)] -Sizes.norep=MedianNorm(GeneMat.norep) -EBOut.norep=EBTest(Data=GeneMat.norep, - Conditions=as.factor(rep(c("C1","C2"))),sizeFactors=Sizes.norep, maxround=5) -PP.norep=GetPPMat(EBOut.norep) -DEfound.norep=rownames(PP.norep)[which(PP.norep[,"PPDE"]>=.95)] -GeneFC.norep=PostFC(EBOut.norep) - - -#4.6 -data(IsoList) -IsoMat=IsoList$IsoMat -IsoNames=IsoList$IsoNames -IsosGeneNames=IsoList$IsosGeneNames -NgList=GetNg(IsoNames, IsosGeneNames) -IsoNgTrun=NgList$IsoformNgTrun -IsoMat.norep=IsoMat[,c(1,6)] -IsoSizes.norep=MedianNorm(IsoMat.norep) -IsoEBOut.norep=EBTest(Data=IsoMat.norep, NgVector=IsoNgTrun, - Conditions=as.factor(c("C1","C2")),sizeFactors=IsoSizes.norep, maxround=5) -IsoPP.norep=GetPPMat(IsoEBOut.norep) -IsoDE.norep=rownames(IsoPP.norep)[which(IsoPP.norep[,"PPDE"]>=.95)] -IsoFC.norep=PostFC(IsoEBOut.norep) - - -#4.7 -data(MultiGeneMat) -MultiGeneMat.norep=MultiGeneMat[,c(1,3,5)] -Conditions=c("C1","C2","C3") -PosParti=GetPatterns(Conditions) -Parti=PosParti[-3,] -MultiSize.norep=MedianNorm(MultiGeneMat.norep) -MultiOut.norep=EBMultiTest(MultiGeneMat.norep,NgVector=NULL,Conditions=Conditions, - AllParti=Parti, sizeFactors=MultiSize.norep, maxround=5) -MultiPP.norep=GetMultiPP(MultiOut.norep) -MultiFC.norep=GetMultiFC(MultiOut.norep) - -#4.8 -data(IsoMultiList) -IsoMultiMat=IsoMultiList[[1]] -IsoNames.Multi=IsoMultiList$IsoNames -IsosGeneNames.Multi=IsoMultiList$IsosGeneNames -IsoMultiMat.norep=IsoMultiMat[,c(1,3,5,7)] -IsoMultiSize.norep=MedianNorm(IsoMultiMat.norep) -NgList.Multi=GetNg(IsoNames.Multi, IsosGeneNames.Multi) -IsoNgTrun.Multi=NgList.Multi$IsoformNgTrun -Conditions=c("C1","C2","C3","C4") -PosParti.4Cond=GetPatterns(Conditions) -PosParti.4Cond -Parti.4Cond=PosParti.4Cond[c(1,2,3,8,15),] -Parti.4Cond -IsoMultiOut.norep=EBMultiTest(IsoMultiMat.norep,NgVector=IsoNgTrun.Multi,Conditions=Conditions, - AllParti=Parti.4Cond, sizeFactors=IsoMultiSize.norep, maxround=5) 
-IsoMultiPP.norep=GetMultiPP(IsoMultiOut.norep) -IsoMultiFC.norep=GetMultiFC(IsoMultiOut.norep) - - -# EOF diff --git a/.svn/pristine/71/71880b01755275477d0349e41e9b08592a986198.svn-base b/.svn/pristine/71/71880b01755275477d0349e41e9b08592a986198.svn-base deleted file mode 100644 index bcac162..0000000 --- a/.svn/pristine/71/71880b01755275477d0349e41e9b08592a986198.svn-base +++ /dev/null @@ -1,42 +0,0 @@ -\name{GetPP} -\alias{GetPP} -%- Also NEED an '\alias' for EACH other topic documented here. -\title{ -Generate the Posterior Probability of each transcript. -} -\description{ -'GetPP' generates the Posterior Probability of being DE of each transcript based on the EBTest output. -} -\usage{ -GetPP(EBout) -} -%- maybe also 'usage' for other objects documented here. -\arguments{ - \item{EBout}{The output of EBTest function.} -} - -\value{The poster probabilities of being DE. -} -\references{ -Ning Leng, John A. Dawson, James A. Thomson, Victor Ruotti, Anna I. Rissman, Bart M.G. Smits, Jill D. Haag, Michael N. Gould, Ron M. Stewart, and Christina Kendziorski. EBSeq: An empirical Bayes hierarchical model for inference in RNA-seq experiments. 
Bioinformatics (2013) -} -\author{ -Ning Leng -} - -%% ~Make other sections like Warning with \section{Warning }{....} ~ - -\seealso{GetPPMat -} -\examples{ -data(GeneMat) -GeneMat.small = GeneMat[c(1:10,500:550),] -Sizes = MedianNorm(GeneMat.small) -EBOut = EBTest(Data = GeneMat.small, - Conditions = as.factor(rep(c("C1","C2"), each=5)), - sizeFactors = Sizes, maxround = 5) -PPDE = GetPP(EBOut) -str(PPDE) -head(PPDE) -} -\keyword{ Posterior Probability } diff --git a/.svn/pristine/76/76a56499d933c6772f6de1ef00324f6ff5eb78e9.svn-base b/.svn/pristine/76/76a56499d933c6772f6de1ef00324f6ff5eb78e9.svn-base deleted file mode 100644 index f5d8fef..0000000 --- a/.svn/pristine/76/76a56499d933c6772f6de1ef00324f6ff5eb78e9.svn-base +++ /dev/null @@ -1,37 +0,0 @@ -\name{f0} -\alias{f0} -%- Also NEED an '\alias' for EACH other topic documented here. -\title{ -The Prior Predictive Distribution of being EE -} -\description{ -'f0' gives the Prior Predictive Distribution of being EE. -} -\usage{ -f0(Input, AlphaIn, BetaIn, EmpiricalR, NumOfGroups, log) -} -\arguments{ - \item{Input}{Expression Values.} - \item{AlphaIn, BetaIn, EmpiricalR}{The parameters estimated from last iteration of EM.} - \item{NumOfGroups}{How many transcripts within each Ng group.} - \item{log}{If true, will give the log of the output.} -} -\value{ -The function will return the prior predictive distribution values of being EE. -} - -\references{ -Ning Leng, John A. Dawson, James A. Thomson, Victor Ruotti, Anna I. Rissman, Bart M.G. Smits, Jill D. Haag, Michael N. Gould, Ron M. Stewart, and Christina Kendziorski. EBSeq: An empirical Bayes hierarchical model for inference in RNA-seq experiments. 
Bioinformatics (2013) -} -\author{ -Ning Leng -} - -\seealso{ -f1 -} -\examples{ -# -#f0(matrix(rnorm(100,100,1),ncol=10), .5, .6, -# matrix(rnorm(100,200,1),ncol=10), 100, TRUE) -} diff --git a/.svn/pristine/78/785b9b5cd5d070ff05c452b47d931389e2298fcd.svn-base b/.svn/pristine/78/785b9b5cd5d070ff05c452b47d931389e2298fcd.svn-base deleted file mode 100644 index 22d9866..0000000 --- a/.svn/pristine/78/785b9b5cd5d070ff05c452b47d931389e2298fcd.svn-base +++ /dev/null @@ -1,104 +0,0 @@ -\name{GetDEResults} -\alias{GetDEResults} -\title{ -Obtain Differential Expression Analysis Results in a Two-condition Test -} -\description{ -Obtain DE analysis results in a two-condition test using the output of EBTest() -} -\usage{ -GetDEResults(EBPrelim, FDR=0.05, Method="robust", - FDRMethod="hard", Threshold_FC=0.7, - Threshold_FCRatio=0.3, SmallNum=0.01) -} -\arguments{ - \item{EBPrelim}{Output from the function EBTest().} - \item{FDR}{Target FDR, defaut is 0.05.} - \item{FDRMethod}{"hard" or "soft". - Giving a target FDR alpha, either hard threshold and soft - threshold may be used. If the hard threshold is preferred, DE transcripts are - defined as the the transcripts with PP(DE) greater than - (1-alpha). Using the hard threshold, any DE transcript in the list - has FDR <= alpha. - - If the soft threshold is preferred, the DE transcripts are defined as the - transcripts with PP(DE) greater than crit_fun(PPEE, alpha). Using - the soft threshold, the list of DE transcripts has average FDR - alpha. - - Based on results from our simulation studies, hard thresholds provide a better-controlled - empirical FDR when sample size is relatively small(Less than 10 samples in each condition). - User may consider the soft threshold when sample size is large to improve power.} - \item{Method}{"robust" or "classic". - Using the "robust" option, EBSeq is more robust to genes with outliers and - genes with extremely small variances. 
- Using the "classic" option, the results will be more comparable to those obtained - by using the GetPPMat() function from earlier version (<= 1.7.0) of EBSeq. - Default is "robust".} - \item{Threshold_FC}{Threshold for the fold change (FC) statistics. - The default is 0.7. The FC statistics are calculated as follows. - First the posterior FC estimates are calculated using PostFC() function. - The FC statistics is defined as exp(log(-|posterior FC|)) and therefore is always less than - or equal to 1. - The default threshold was selected as the optimal threshold learned from our simulation studies. By setting the - threshold as 0.7, the expected FC for a DE transcript is less than 0.7 - (or greater than 1/0.7=1.4). - User may specify their own threshold here. A higher (less conservative) threshold - may be used here when sample size is large. Our simulation results - indicated that when there are more than or equal to 5 samples in each condition, - a less conservative threshold will improve the power when the FDR is still well-controlled. - The parameter will be ignored if Method is set as "classic".} - \item{Threshold_FCRatio}{Threshold for the fold change ratio (FCRatio) statistics. - The default is 0.3. The FCRatio statistics are calculated as follows. - First we get another revised fold change - statistic called Median-FC statistic for each transcript. - For each transcript, we calculate the median of - normalized expression values within each condition. - The MedianFC is defined as exp(log(-|(C1Median+SmallNum)/(C2Median+SmallNum)|)). - Note a small number is added to avoid Inf and NA. See SmallNum for more details. - The FCRatio is calculated as exp(log(-|FCstatistics/MedianFC|)). - Therefore it is always less than or equal to 1. - The default threshold was selected as the optimal threshold learned from our simulation studies. - By setting the threshold as 0.3, the FCRatio for a DE transcript is - expected to be larger than 0.3. 
- } - \item{SmallNum}{When calculating the FCRatio (or Median-FC), a small number is added for each transcript in each - condition to avoid Inf and NA. Default is 0.01.} -} -\details{ -GetDEResults() function takes output from EBTest() function and output a list of -DE transcripts under a target FDR. It also provides posterior probability estimates for each -transcript. -} -\value{ - \item{DEfound}{A list of DE transcripts.} - \item{PPMat}{Posterior probability matrix. Transcripts are following the same order as - in the input matrix. - Transcripts that were filtered by magnitude (in EBTest function), FC, or FCR - are assigned with NA for both PPDE and PPEE.} - \item{Status}{Each transcript will be assigned with one of the following - values: "DE", "EE", "Filtered: Low Expression", - "Filtered: Fold Change" and "Filtered: Fold Change Ratio". - Transcripts are following the same order as in the input matrix.} -} -\references{ -Ning Leng, John A. Dawson, James A. Thomson, Victor Ruotti, Anna I. Rissman, Bart M.G. Smits, Jill D. Haag, Michael N. Gould, Ron M. Stewart, and Christina Kendziorski. EBSeq: An empirical Bayes hierarchical model for inference in RNA-seq experiments. 
Bioinformatics (2013) -} -\author{ -Ning Leng, Yuan Li -} -\seealso{ -EBTest -} -\examples{ -data(GeneMat) -str(GeneMat) -GeneMat.small = GeneMat[c(1:10,511:550),] -Sizes = MedianNorm(GeneMat.small) -EBOut = EBTest(Data = GeneMat.small, - Conditions = as.factor(rep(c("C1","C2"), each = 5)), - sizeFactors = Sizes, maxround = 5) -Out = GetDEResults(EBOut) -} -\keyword{ DE } -\keyword{ Two condition } diff --git a/.svn/pristine/7d/7da20f3778c497adc7c5b7e7f1d08fa14333c46c.svn-base b/.svn/pristine/7d/7da20f3778c497adc7c5b7e7f1d08fa14333c46c.svn-base deleted file mode 100644 index 510cb1e..0000000 --- a/.svn/pristine/7d/7da20f3778c497adc7c5b7e7f1d08fa14333c46c.svn-base +++ /dev/null @@ -1,28 +0,0 @@ -LikefunMulti <- -function(ParamPool, InputPool) -{ - -NoneZeroLength=InputPool[[4]] -AlphaIn=ParamPool[1] -BetaIn=ParamPool[2:(1+NoneZeroLength)] -PIn=ParamPool[(2+NoneZeroLength):length(ParamPool)] -PInAll=c(1-sum(PIn),PIn) -ZIn=InputPool[[3]] -Input=InputPool[[2]] -InputSP=InputPool[[1]] -RIn=InputPool[[5]] -RInSP=InputPool[[6]] -NumIn=InputPool[[7]] -AllParti=InputPool[[8]] -PInMat=matrix(rep(1,nrow(Input)),ncol=1)%*%matrix(PInAll,nrow=1) -##Function here -FList=sapply(1:nrow(AllParti),function(i)sapply(1:nlevels(as.factor(AllParti[i,])), - function(j)f0(do.call(cbind,InputSP[AllParti[i,]==j]),AlphaIn, BetaIn, - do.call(cbind,RInSP[AllParti[i,]==j]), NumIn, log=T)), - simplify=F) -FPartiLog=sapply(FList,rowSums) -#FMat=exp(FPartiLog) -FMat=FPartiLog --sum(ZIn*(FMat+log(PInMat))) -} - diff --git a/.svn/pristine/85/850e65827f270b337d013356cfff71b80f2f7274.svn-base b/.svn/pristine/85/850e65827f270b337d013356cfff71b80f2f7274.svn-base deleted file mode 100644 index 11977fd..0000000 --- a/.svn/pristine/85/850e65827f270b337d013356cfff71b80f2f7274.svn-base +++ /dev/null @@ -1,46 +0,0 @@ -\name{GetMultiPP} -\alias{GetMultiPP} -\title{ -Posterior Probability of Each Transcript -} -\description{ -'GetMultiPP' generates the Posterior Probability of being each pattern of each 
transcript based on the EBMultiTest output. -} -\usage{ -GetMultiPP(EBout) -} -%- maybe also 'usage' for other objects documented here. -\arguments{ - \item{EBout}{The output of EBMultiTest function.} - -} -\value{ -\item{PP}{The posterior probabilities of being each pattern.} -\item{MAP}{Gives the most likely pattern.} -\item{Patterns}{The Patterns.} -} -\references{ -Ning Leng, John A. Dawson, James A. Thomson, Victor Ruotti, Anna I. Rissman, Bart M.G. Smits, Jill D. Haag, Michael N. Gould, Ron M. Stewart, and Christina Kendziorski. EBSeq: An empirical Bayes hierarchical model for inference in RNA-seq experiments. Bioinformatics (2013) -} -\author{ -Ning Leng -} - - -\seealso{GetPPMat} -\examples{ -data(MultiGeneMat) -MultiGeneMat.small = MultiGeneMat[201:210,] - -Conditions = c("C1","C1","C2","C2","C3","C3") -PosParti = GetPatterns(Conditions) -Parti = PosParti[-3,] -MultiSize = MedianNorm(MultiGeneMat.small) - -MultiOut = EBMultiTest(MultiGeneMat.small, - NgVector=NULL, Conditions=Conditions, - AllParti=Parti, sizeFactors=MultiSize, - maxround=5) -MultiPP = GetMultiPP(MultiOut) -} -\keyword{ Posterior Probability } diff --git a/.svn/pristine/88/888a0b582be2e1727c2ca89ef7de73e1fa793259.svn-base b/.svn/pristine/88/888a0b582be2e1727c2ca89ef7de73e1fa793259.svn-base deleted file mode 100644 index 05a4c66..0000000 --- a/.svn/pristine/88/888a0b582be2e1727c2ca89ef7de73e1fa793259.svn-base +++ /dev/null @@ -1,40 +0,0 @@ -\name{RankNorm} -\alias{RankNorm} -\title{ -Rank Normalization -} -\description{ -'RankNorm' gives the rank normalization. -} -\usage{ -RankNorm(Data) -} -\arguments{ - \item{Data}{ -The data matrix with transcripts in rows and lanes in columns. -} -} - -\value{ -The function will return a matrix that contains the normalization factor for each lane and each transcript. 
-} - -\author{ -Ning Leng -} - - -\seealso{ -MedianNorm, QuantileNorm -} - -\examples{ -data(GeneMat) -Sizes = RankNorm(GeneMat) -# Run EBSeq -# EBres = EBTest(Data = GeneData, NgVector = rep(1,10^4), -# Vect5End = rep(1,10^4), Vect3End = rep(1,10^4), -# Conditions = as.factor(rep(c(1,2), each=5)), -# sizeFactors = Sizes, maxround=5) -} -\keyword{ Normalization } diff --git a/.svn/pristine/88/88e2e312265521a6f4476f676db230449ac916b3.svn-base b/.svn/pristine/88/88e2e312265521a6f4476f676db230449ac916b3.svn-base deleted file mode 100644 index 1dcabca..0000000 --- a/.svn/pristine/88/88e2e312265521a6f4476f676db230449ac916b3.svn-base +++ /dev/null @@ -1,12 +0,0 @@ -GetNg<- function(IsoformName, GeneName, TrunThre=3){ - if(length(IsoformName)!=length(GeneName))stop("The length of IsoformName is not the same as the length of GeneName") - GeneNg = tapply(IsoformName, GeneName, length) - if(max(GeneNg)TrunThre]=TrunThre - IsoformNgTrun=IsoformNg - IsoformNgTrun[IsoformNgTrun>TrunThre]=TrunThre - out=list( GeneNg=GeneNg, GeneNgTrun=GeneNgTrun, IsoformNg=IsoformNg, IsoformNgTrun=IsoformNgTrun) - } diff --git a/.svn/pristine/89/89eee87f4f1d178d2fa866c03dc89d4779f25578.svn-base b/.svn/pristine/89/89eee87f4f1d178d2fa866c03dc89d4779f25578.svn-base deleted file mode 100644 index ebe05ab..0000000 --- a/.svn/pristine/89/89eee87f4f1d178d2fa866c03dc89d4779f25578.svn-base +++ /dev/null @@ -1,26 +0,0 @@ -Likefun <- -function(ParamPool, InputPool) -{ - -NoneZeroLength=InputPool[[5]] -AlphaIn=ParamPool[1] -BetaIn=ParamPool[2:(1+NoneZeroLength)] -PIn=ParamPool[2+NoneZeroLength] -ZIn=InputPool[[4]] -Input=InputPool[[3]] -Input1=matrix(InputPool[[1]],nrow=nrow(Input)) -Input2=matrix(InputPool[[2]],nrow=nrow(Input)) -RIn=InputPool[[6]] -RInSP1=matrix(InputPool[[7]],nrow=nrow(Input)) -RInSP2=matrix(InputPool[[8]],nrow=nrow(Input)) -NumIn=InputPool[[9]] -##Function here -#LikelihoodFunction<- function(NoneZeroLength){ - F0=f0(Input, AlphaIn, BetaIn, RIn, NumIn, log=T) - F1=f1(Input1, Input2, 
AlphaIn, BetaIn, RInSP1,RInSP2, NumIn, log=T) - F0[F0==Inf]=min(!is.na(F0[F0!=Inf])) - F1[F1==Inf]=min(!is.na(F1[F1!=Inf])) - - -sum((1-ZIn)*F0+ (1-ZIn)* log(1-PIn) + ZIn*F1 + ZIn*log(PIn)) -} - diff --git a/.svn/pristine/8a/8a47f9c2d5ec241f788310030816dc671ce00b69.svn-base b/.svn/pristine/8a/8a47f9c2d5ec241f788310030816dc671ce00b69.svn-base deleted file mode 100644 index 3f90a65..0000000 --- a/.svn/pristine/8a/8a47f9c2d5ec241f788310030816dc671ce00b69.svn-base +++ /dev/null @@ -1,21 +0,0 @@ -Package: EBSeq -Type: Package -Title:An R package for gene and isoform differential expression analysis of RNA-seq data -Version: 1.7.0 -Date: 2014-9-17 -Author: Ning Leng, Christina Kendziorski -Maintainer: Ning Leng -Depends: blockmodeling, gplots, R (>= 3.0.0) -Description: Differential Expression analysis at both gene and isoform - level using RNA-seq data -License: Artistic-2.0 -LazyLoad: yes -Collate: 'MedianNorm.R' 'GetNg.R' 'beta.mom.R' 'f0.R' 'f1.R' - 'Likefun.R' 'LogN.R' 'LogNMulti.R' 'LikefunMulti.R' 'EBTest.R' - 'GetPatterns.R' 'EBMultiTest.R' 'GetPP.R' 'PostFC.R' - 'GetPPMat.R' 'GetMultiPP.R' 'GetMultiFC.R' 'PlotPostVsRawFC.R' - 'crit_fun.R' 'DenNHist.R' 'GetNormalizedMat.R' 'PlotPattern.R' - 'PolyFitPlot.R' 'QQP.R' 'QuantileNorm.R' 'RankNorm.R' -BuildVignettes: yes -biocViews: StatisticalMethod, DifferentialExpression, - MultipleComparison, RNASeq, Sequencing diff --git a/.svn/pristine/8d/8d58cef114d8f400111095282b197ab553f2f3c8.svn-base b/.svn/pristine/8d/8d58cef114d8f400111095282b197ab553f2f3c8.svn-base deleted file mode 100644 index 9924b3e..0000000 --- a/.svn/pristine/8d/8d58cef114d8f400111095282b197ab553f2f3c8.svn-base +++ /dev/null @@ -1,45 +0,0 @@ -\name{PlotPostVsRawFC} -\alias{PlotPostVsRawFC} -\title{ -Plot Posterior FC vs FC -} -\description{ -'PlotPostVsRawFC' helps the users visualize the posterior FC vs FC in a two condition study. -} -\usage{ -PlotPostVsRawFC(EBOut, FCOut) -} -\arguments{ - \item{EBOut}{ -The output of EBTest function. 
-} - \item{FCOut}{The output of PostFC function.} - -} -\value{ -A figure shows fold change vs posterior fold change. -} - -\references{ -Ning Leng, John A. Dawson, James A. Thomson, Victor Ruotti, Anna I. Rissman, Bart M.G. Smits, Jill D. Haag, Michael N. Gould, Ron M. Stewart, and Christina Kendziorski. EBSeq: An empirical Bayes hierarchical model for inference in RNA-seq experiments. Bioinformatics (2013) -} - -\author{ -Ning Leng -} - -\seealso{ -PostFC -} -\examples{ -data(GeneMat) -GeneMat.small = GeneMat[c(500:600),] -Sizes = MedianNorm(GeneMat.small) -EBOut = EBTest(Data = GeneMat.small, - Conditions = as.factor(rep(c("C1","C2"), each=5)), - sizeFactors = Sizes, maxround = 5) -FC = PostFC(EBOut) -PlotPostVsRawFC(EBOut,FC) - -} -\keyword{ Posterior Probability } diff --git a/.svn/pristine/90/902b6e424620694b60ad8d9df63c70c47ff34018.svn-base b/.svn/pristine/90/902b6e424620694b60ad8d9df63c70c47ff34018.svn-base deleted file mode 100644 index 39fc8ee..0000000 --- a/.svn/pristine/90/902b6e424620694b60ad8d9df63c70c47ff34018.svn-base +++ /dev/null @@ -1,51 +0,0 @@ -\name{crit_fun} -\alias{crit_fun} -\title{ -Calculate the soft threshold for a target FDR -} -\description{ -'crit_fun' calculates the soft threshold for a target FDR. -} -\usage{ -crit_fun(PPEE, thre) -} -\arguments{ - \item{PPEE}{The posterior probabilities of being EE.} - \item{thre}{The target FDR.} - -} -\details{ -Regarding a target FDR alpha, both hard threshold and soft threshold could be used. -If the hard threshold is preferred, user could simply take the transcripts with -PP(DE) greater than (1-alpha). Using the hard threshold, any DE transcript in the -list is with FDR <= alpha. - -If the soft threshold is preferred, user could take the transcripts with PP(DE) -greater than crit_fun(PPEE, alpha). Using the soft threshold, the list of -DE transcripts is with average FDR alpha. -} -\value{ -The adjusted FDR threshold of target FDR. -} -\references{ -Ning Leng, John A. Dawson, James A. 
Thomson, Victor Ruotti, Anna I. Rissman, Bart M.G. Smits, Jill D. Haag, Michael N. Gould, Ron M. Stewart, and Christina Kendziorski. EBSeq: An empirical Bayes hierarchical model for inference in RNA-seq experiments. Bioinformatics (2013) -} -\author{ -Ning Leng -} - -\examples{ -data(GeneMat) -GeneMat.small = GeneMat[c(1:10, 500:600),] -Sizes = MedianNorm(GeneMat.small) -EBOut = EBTest(Data = GeneMat.small, - Conditions = as.factor(rep(c("C1","C2"), each=5)), - sizeFactors = Sizes, maxround = 5) -PP = GetPPMat(EBOut) -DEfound = rownames(PP)[which(PP[,"PPDE"] >= 0.95)] -str(DEfound) - -SoftThre = crit_fun(PP[,"PPEE"], 0.05) -DEfound_soft = rownames(PP)[which(PP[,"PPDE"] >= SoftThre)] -} -\keyword{ FDR } diff --git a/.svn/pristine/92/92f39a873591cb9b212ec520575582c90f39ab42.svn-base b/.svn/pristine/92/92f39a873591cb9b212ec520575582c90f39ab42.svn-base deleted file mode 100644 index 0424f32..0000000 --- a/.svn/pristine/92/92f39a873591cb9b212ec520575582c90f39ab42.svn-base +++ /dev/null @@ -1,40 +0,0 @@ -CHANGES IN VERSION 1.5.4 ------------------------- - - - o An extra numerical approximation step is implemented - in EBMultiTest() function to avoid - underflow. The underflow is likely due to large number of samples. - A bug in EBMultiTest() is fixed. The bug will cause error when there is - exactly 1 gene/isoform that needs numerical approximation. - -CHANGES IN VERSION 1.5.3 -------------------------- - -BUG FIXES - - o Fixed a bug that may generate NA FC estimates when there are no replicates. - -CHANGES IN VERSION 1.5.2 ------------------------- - -NEW FEATURES - - o An extra numerical approximation step is - implemented in EBTest() function to avoid - underflow. The underflow is likely due to large number of samples. 
- - -CHANGES IN VERSION 1.3.3 ------------------------- - -NEW FEATURES - - o In EBSeq 1.3.3, the default setting of EBTest function will remove - low expressed genes (genes whose 75th quantile of normalized counts is less - than 10) before identifying DE genes. - These two thresholds can be changed in EBTest function. - We found that low expressed genes are more easily affected by noise. - Removing these genes prior to downstream analyses can improve the - model fitting and reduce impacts of noisy genes (e.g. genes with outliers). - diff --git a/.svn/pristine/94/9436e9bdb996e25c6f99394bf743d3e80ae181a8.svn-base b/.svn/pristine/94/9436e9bdb996e25c6f99394bf743d3e80ae181a8.svn-base deleted file mode 100644 index afd7b37..0000000 --- a/.svn/pristine/94/9436e9bdb996e25c6f99394bf743d3e80ae181a8.svn-base +++ /dev/null @@ -1,5 +0,0 @@ -MedianNorm=function(Data){ - if(ncol(Data)==1)stop("Only 1 sample!") - geomeans <- exp(rowMeans(log(Data))) - apply(Data, 2, function(cnts) median((cnts/geomeans)[geomeans > 0])) -} diff --git a/.svn/pristine/94/94ed4a00687fa68f900e78185722da17a5342d0f.svn-base b/.svn/pristine/94/94ed4a00687fa68f900e78185722da17a5342d0f.svn-base deleted file mode 100644 index 62f2e26..0000000 --- a/.svn/pristine/94/94ed4a00687fa68f900e78185722da17a5342d0f.svn-base +++ /dev/null @@ -1,40 +0,0 @@ -\name{EBSeq_NingLeng-package} -\alias{EBSeq_NingLeng-package} -\alias{EBSeq_NingLeng} -\docType{package} -\title{ -EBSeq: RNA-Seq Differential Expression Analysis on both gene and isoform level -} -\description{ -In 'EBSeq_NingLeng-package,' a Negative Binomial-beta model was built to analyze the RNASeq data. We used the empirical Bayes method and the EM algorithm. 
-} -\details{ -\tabular{ll}{ -Package: \tab EBSeq_NingLeng\cr -Type: \tab Package\cr -Version: \tab 1.0\cr -Date: \tab 2011-06-13\cr -License: \tab What license is it under?\cr -LazyLoad: \tab yes\cr -} -} -\author{ -Ning Leng, Christina Kendziorski - -Maintainer: Ning Leng -} -\references{ -Ning Leng, John A. Dawson, James A. Thomson, Victor Ruotti, Anna I. Rissman, Bart M.G. Smits, Jill D. Haag, Michael N. Gould, Ron M. Stewart, and Christina Kendziorski. EBSeq: An empirical Bayes hierarchical model for inference in RNA-seq experiments. Bioinformatics (2013) -} -\keyword{ package } -\seealso{ -EBTest, EBMultiTest -} -\examples{ -data(GeneMat) -GeneMat.small = GeneMat[c(1:10,511:550),] -Sizes = MedianNorm(GeneMat.small) -EBOut = EBTest(Data=GeneMat.small, - Conditions=as.factor(rep(c("C1","C2"), each=5)), - sizeFactors=Sizes, maxround=5) -} diff --git a/.svn/pristine/97/976690b470fec3cc7bac0278fff7f24dced23899.svn-base b/.svn/pristine/97/976690b470fec3cc7bac0278fff7f24dced23899.svn-base deleted file mode 100644 index a8a03a1..0000000 --- a/.svn/pristine/97/976690b470fec3cc7bac0278fff7f24dced23899.svn-base +++ /dev/null @@ -1,1088 +0,0 @@ -%\VignetteIndexEntry{EBSeq Vignette} - -\documentclass{article} -\usepackage{fullpage} -\usepackage{graphicx, graphics, epsfig,setspace,amsmath, amsthm} -\usepackage{hyperref} -\usepackage{natbib} -%\usepackage{listings} -\usepackage{moreverb} -\begin{document} -\title{EBSeq: An R package for differential expression analysis using RNA-seq data} -\author{Ning Leng, John Dawson, and Christina Kendziorski} -\maketitle -\tableofcontents -\setcounter{tocdepth}{2} - -\section{Introduction} -EBSeq may be used to identify differentially expressed (DE) -genes and isoforms in an RNA-Seq experiment. As detailed in -Leng {\it et al.}, 2013 \cite{Leng13}, -EBSeq is an empirical Bayesian approach that models a number of features -observed in RNA-seq data. 
Importantly, for isoform level inference, -EBSeq directly accommodates isoform expression estimation uncertainty by -modeling the differential variability observed in distinct groups of isoforms. -Consider Figure 1, where we have plotted variance against mean -for all isoforms using RNA-Seq expression data from Leng {\it et al.}, 2013 \cite{Leng13}. -Also shown is the fit within three sub-groups of isoforms defined -by the number of constituent isoforms of the parent gene. -An isoform of gene $g$ is assigned to the $I_g=k$ group, where $k=1,2,3$, -if the total number of isoforms from gene $g$ is $k$ (the $I_g=3$ group contains -all isoforms from genes having 3 or more isoforms). -As shown in Figure 1, there is decreased variability in the $I_g=1$ group, -but increased variability in the others, due to the relative increase in -uncertainty inherent in estimating isoform expression when multiple isoforms of a given gene are -present. If this structure is not accommodated, there is reduced power for -identifying isoforms in the $I_g=1$ group (since the true variances in that group are -lower, on average, than that derived from the full collection of isoforms) as well as increased -false discoveries in the $I_g=2$ and $I_g=3$ groups (since the true variances are higher, on average, -than those derived from the full collection). EBSeq directly models differential variability -as a function of $I_g$ providing a powerful approach for isoform level inference. As shown in Leng {\it et al.}, 2013 -\cite{Leng13}, the model is also useful for identifying DE genes. -We will briefly detail the model in Section \ref{sec:model} and then describe -the flow of analysis in Section \ref{sec:quickstart} for both isoform and gene-level inference. - -\begin{figure}[t] -\centering -\includegraphics[width=0.6\textwidth]{PlotExample.png} -\label{fig:GouldNg} -\caption{Empirical variance vs. 
mean for -each isoform profiled in the ESCs vs iPSCs experiment detailed in -the Case Study section of Leng {\it et al.}, 2013 \cite{Leng13}. -A spline fit to all isoforms is shown in red with splines fit within the $I_g=1$, $I_g=2$, and $I_g=3$ isoform groups -shown in yellow, pink, and green, respectively.} -\end{figure} - - -\section{Citing this software} -\label{sec:cite} -Please cite the following article when reporting results from the software. - -\noindent Leng, N., J.A. Dawson, J.A. Thomson, V. Ruotti, A.I. Rissman, -B.M.G. Smits, J.D. Haag, M.N. Gould, R.M. Stewart, and C. Kendziorski. -EBSeq: An empirical Bayes hierarchical model for inference in RNA-seq -experiments, {\it Bioinformatics}, 2013. - -\section{The Model} -\label{sec:model} -\subsection{Two conditions} -\label{sec:twocondmodel} -We let $X_{g_i}^{C1} = X_{g_i,1} ,X_{g_i,2}, ...,X_{g_i,S_1}$ denote data from condition 1 -and $ X_{g_i}^{C2} = X_{g_i,(S_1+1)},X_{g_i,(S_1+2)},...,X_{g_i,S}$ data from condition 2. -We assume that counts within condition $C$ are distributed as Negative Binomial: -$X_{g_i,s}^C|r_{g_i,s}, q_{g_i}^C \sim NB(r_{g_i,s}, q_{g_i}^C)$ where -\begin{equation} -P(X_{g_i,s}|r_{g_i,s},q_{g_i}^C) = {X_{g_i,s}+r_{g_i,s}-1\choose X_{g_i,s}}(1-q_{g_i}^C)^{X_{g_i,s}}(q_{g_i}^C)^{r_{g_i,s}}\label{eq:01} -\end{equation} - -\noindent and $\mu_{g_i,s}^C=r_{g_i,s} (1-q_{g_i}^C)/q_{g_i}^C$; -$(\sigma_{g_i,s}^C)^2=r_{g_i,s} (1-q_{g_i}^C)/(q_{g_i}^C)^2.$ - -\medskip - -We assume a prior distribution on $q_{g_i}^C$: $q_{g_i}^C|\alpha, \beta^{I_g} \sim Beta(\alpha, \beta^{I_g})$. -The hyperparameter $\alpha$ is shared by all the isoforms and $\beta^{I_g}$ is $I_g$ specific (note this is an index, not a power). -We further assume that $r_{g_i,s}=r_{g_i,0} l_s$, where $r_{g_i,0}$ is an isoform specific -parameter common across conditions and $r_{g_i,s}$ depends on it through the sample-specific normalization factor $l_s$. 
-Of interest in this two group comparison is distinguishing between two cases, or what we will refer to subsequently as -two patterns of expression, namely equivalent expression (EE) and differential expression (DE): -\begin{center} -$H_0$ (EE) : $q_{g_i}^{C1}=q_{g_i}^{C2}$ vs $H_1$ (DE) : $q_{g_i}^{C1} \neq q_{g_i}^{C2}$. -\end{center} -Under the null hypothesis (EE), the data $X_{g_i}^{C1,C2} = X_{g_i}^{C1}, X_{g_i}^{C2}$ arises -from the prior predictive distribution $f_0^{I_g}(X_{g_i}^{C1,C2})$: -%\tiny -\begin{equation} -f_0^{I_g}(X_{g_i}^{C1,C2})=\Bigg[\prod_{s=1}^S {X_{g_i,s}+r_{g_i,s}-1\choose X_{g_i,s}}\Bigg] -\frac{Beta(\alpha+\sum_{s=1}^S r_{g_i,s}, \beta^{I_g}+\sum_{s=1}^SX_{g_i,s} )}{Beta(\alpha, \beta^{I_g})}\label{eq:05} -\end{equation} -%\normalsize - -Alternatively (in a DE scenario), $X_{g_i}^{C1,C2}$ follows the prior predictive distribution $f_1^{I_g}(X_{g_i}^{C1,C2})$: -\begin{equation} -f_1^{I_g}(X_{g_i}^{C1,C2})=f_0^{I_g}(X_{g_i}^{C1})f_0^{I_g}(X_{g_i}^{C2}) \label{eq:06} -\end{equation} - -Let the latent variable $Z_{g_i}$ be defined so that $Z_{g_i} = 1$ indicates that -isoform $g_i$ is DE and $Z_{g_i} = 0$ indicates isoform $g_i$ is EE, and -$Z_{g_i} \sim Bernoulli(p)$. -Then, the marginal distribution of $X_{g_i}^{C1,C2}$ and $Z_{g_i}$ is: -\begin{equation} -(1-p)f_0^{I_g}(X_{g_i}^{C1,C2}) + pf_1^{I_g}(X_{g_i}^{C1,C2})\label{eq:07} -\end{equation} - -\noindent The posterior probability of being DE at isoform $g_i$ is obtained by Bayes' rule: -\begin{equation} -\frac{pf_1^{I_g}(X_{g_i}^{C1,C2})}{(1-p)f_0^{I_g}(X_{g_i}^{C1,C2}) + pf_1^{I_g}(X_{g_i}^{C1,C2})}\label{eq:08} -\end{equation} - -%\newpage -\subsection{More than two conditions} -\label{sec:multicondmodel} -EBSeq naturally accommodates multiple condition comparisons. 
-For example, in a study with 3 conditions, there are K=5 possible expression patterns (P1,...,P5), or ways in which -latent levels of expression may vary across conditions: -\begin{align} -\textrm {P1:}& \hspace{0.05in} q_{g_i}^{C1} = q_{g_i}^{C2}=q_{g_i}^{C3} \nonumber \\ -\textrm {P2:}& \hspace{0.05in} q_{g_i}^{C1} = q_{g_i}^{C2} \neq q_{g_i}^{C3} \nonumber \\ -\textrm {P3:}& \hspace{0.05in} q_{g_i}^{C1} = q_{g_i}^{C3} \neq q_{g_i}^{C2} \nonumber \\ -\textrm {P4:}& \hspace{0.05in} q_{g_i}^{C1} \neq q_{g_i}^{C2} = q_{g_i}^{C3} \nonumber \\ -\textrm {P5:}& \hspace{0.05in} q_{g_i}^{C1} \neq q_{g_i}^{C2} \neq -q_{g_i}^{C3} \textrm{ and } q_{g_i}^{C1} \neq q_{g_i}^{C3} \nonumber -\end{align} - -\noindent The prior predictive distributions for these are given, respectively, by: - -\begin{align} -g_1^{I_g}(X_{g_i}^{C1,C2,C3}) &= f_0^{I_g}(X_{g_i}^{C1,C2,C3}) \nonumber \\ -g_2^{I_g}(X_{g_i}^{C1,C2,C3}) &= f_0^{I_g}(X_{g_i}^{C1,C2})f_0^{I_g}(X_{g_i}^{C3}) \nonumber \\ -g_3^{I_g}(X_{g_i}^{C1,C2,C3}) &= f_0^{I_g}(X_{g_i}^{C1,C3})f_0^{I_g}(X_{g_i}^{C2}) \nonumber \\ -g_4^{I_g}(X_{g_i}^{C1,C2,C3}) &= f_0^{I_g}(X_{g_i}^{C1})f_0^{I_g}(X_{g_i}^{C2,C3}) \nonumber \\ -g_5^{I_g}(X_{g_i}^{C1,C2,C3}) &= f_0^{I_g}(X_{g_i}^{C1})f_0^{I_g}(X_{g_i}^{C2})f_0^{I_g}(X_{g_i}^{C3}) \nonumber -\end{align} - -\noindent where $f_0^{I_g}$ is the same as in equation \ref{eq:05}. Then the marginal distribution in -equation \ref{eq:07} becomes: - -\begin{equation} -\sum_{k=1}^5 p_k g_k^{I_g}(X_{g_i}^{C1,C2,C3}) \label{eq:11} -\end{equation} -\noindent where $\sum_{k=1}^5 p_k = 1$. 
Thus, the posterior probability of -isoform $g_i$ coming from pattern $K$ is readily obtained by: - -\begin{equation} -\frac{p_K g_K^{I_g}(X_{g_i}^{C1,C2,C3})}{\sum_{k=1}^5 p_k g_k^{I_g}(X_{g_i}^{C1,C2,C3})} \label{eq:12} -\end{equation} - -\subsection{Getting a false discovery rate (FDR) controlled list of genes or isoforms} -\label{sec:fdrlist} -To obtain a list of DE genes with false discovery rate (FDR) controlled -at $\alpha$ in an experiment comparing two biological conditions, the genes -with posterior probability of being DE (PPDE) greater than 1 - $\alpha$ should be used. -For example, the genes with PPDE>=0.95 make up the list of DE genes with target -FDR controlled at 5\%. With more than two biological conditions, there are multiple -DE patterns (see Section \ref{sec:multicondmodel}). To obtain a list of genes in a specific DE pattern with target -FDR $\alpha$, a user should -take the genes with posterior probability of being in that pattern greater -than 1 - $\alpha$. Isoform-based lists are obtained in the same way. -\newpage -\section{Quick Start} -\label{sec:quickstart} -Before analysis can proceed, the EBSeq package must be loaded into the working space: -<<>>= -library(EBSeq) -@ -\subsection{Gene level DE analysis (two conditions)} -\label{sec:startgenede} -\subsubsection{Required input} -\label{sec:startgenedeinput} -\begin{flushleft} -{\bf Data}: The object \verb+Data+ should be a $G-by-S$ matrix containing the expression values for each gene and each sample, -where $G$ is the number of genes and $S$ is the number of samples. These -values should exhibit raw counts, without normalization -across samples. Counts of this nature may be obtained from RSEM \cite{Li11b}, -Cufflinks \cite{Trapnell12}, or a similar approach. - -\vspace{5 mm} - -{\bf Conditions}: The object \verb+Conditions+ should be a Factor vector of length $S$ that indicates to which condition each sample belongs. 
-For example, if there are two conditions and three samples in each, -$S=6$ and \verb+Conditions+ may be given by - -\verb+as.factor(c("C1","C1","C1","C2","C2","C2"))+ - -\end{flushleft} -\noindent The object \verb+GeneMat+ is a simulated data matrix containing -1,000 rows of genes and 10 columns of samples. The genes are named -\verb+Gene_1, Gene_2 ...+ -<<>>= -data(GeneMat) -str(GeneMat) -@ - -\subsubsection{Library size factor} -\label{sec:startgenedesize} -As detailed in Section \ref{sec:model}, EBSeq requires the library size factor $l_s$ for each sample $s$. -Here, $l_s$ may be obtained via the function \verb+MedianNorm+, which reproduces the median normalization approach -in DESeq \citep{Anders10}. -<<>>= -Sizes=MedianNorm(GeneMat) -@ - -\noindent If quantile normalization is preferred, $l_s$ may be obtained via the function \verb+QuantileNorm+. -(e.g. \verb+QuantileNorm(GeneMat,.75)+ for Upper-Quantile Normalization in \cite{Bullard10}) - -\subsubsection{Running EBSeq on gene expression estimates} -\label{sec:startgenederun} -The function \verb+EBTest+ is used to detect DE genes. -For gene-level data, we don't need to specify the parameter -\verb+NgVector+ since there are no differences in $I_g$ structure among the different genes. -Here, we simulated the first five samples to be in condition 1 and the other five in condition 2, so define: - -\verb+Conditions=as.factor(rep(c("C1","C2"),each=5))+ - -\noindent \verb+sizeFactors+ is used to define the library size factor of each sample. -It could be obtained by summing up the total number of reads within each sample, -Median Normalization \citep{Anders10}, -scaling normalization \citep{Robinson10}, Upper-Quantile Normalization \cite{Bullard10}, -or some other such approach. -These in hand, we run the EM algorithm, setting the number -of iterations to five via \verb+maxround=5+ for demonstration purposes. -However, we note that in practice, -additional iterations are usually required. 
Convergence should always be -checked (see Section \ref{sec:detailedgenedeconverge} for details). -Please note this may take several minutes: -<<>>= -EBOut=EBTest(Data=GeneMat, -Conditions=as.factor(rep(c("C1","C2"),each=5)),sizeFactors=Sizes, maxround=5) -@ -\noindent The list of DE genes and the posterior probabilities of being DE are obtained as follows -<<>>= -EBDERes=GetDEResults(EBOut, FDR=0.05) -str(EBDERes$DEfound) -head(EBDERes$PPMat) -str(EBDERes$Status) -@ -\noindent \verb+EBDERes$DEfound+ is a list of genes identified with 5\% FDR. EBSeq found -95 genes. The matrix \verb+EBDERes$PPMat+ contains two columns \verb+PPEE+ and \verb+PPDE+, -corresponding to the posterior probabilities of being EE or DE for each gene. -\verb+EBDERes$Status+ contains each gene's status called by EBSeq. - -\noindent Note the \verb+GetDEResults()+ was incorporated in EBSeq since version 1.7.1. -By using the default settings, the number of genes identified in any given analysis may -differ slightly from the previous version. The updated algorithm is more robust to outliers -and transcripts with low variance. To obtain results that are comparable -to results from earlier versions of EBSeq ($\le$ 1.7.0), a user may set -\verb+Method="classic"+ in \verb+GetDEResults()+ function, or use the \verb+GetPPMat()+ function. - - -\subsection{Isoform level DE analysis (two conditions)} -\label{sec:startisode} -\subsubsection{Required inputs} -\label{sec:startisodeinput} - -\begin{flushleft} -{\bf Data}: The object \verb+Data+ should be a $I-by-S$ matrix containing the expression values for each isoform and each sample, -where $I$ is the number of isoforms and $S$ is the number of sample. As in the gene-level analysis, these values should exhibit raw data, without normalization -across samples. - -\vspace{5 mm} - -{\bf Conditions}: The object \verb+Conditions+ should be a vector with length $S$ to indicate the condition of each sample. 
- -\vspace{5 mm} - -{\bf IsoformNames}: The object \verb+IsoformNames+ should be a vector with length $I$ to indicate the isoform names. - -\vspace{5 mm} - -{\bf IsosGeneNames}: The object \verb+IsosGeneNames+ should be a vector with length $I$ to indicate the gene name of each isoform. -(in the same order as \verb+IsoformNames+.) -\end{flushleft} - -\noindent \verb+IsoList+ contains 1,200 simulated isoforms. -In which \verb+IsoList$IsoMat+ is a data matrix containing -1,200 rows of isoforms and 10 columns of samples; -\verb+IsoList$IsoNames+ contains the isoform names; -\verb+IsoList$IsosGeneNames+ contains the names of the genes the isoforms belong to. - -<<>>= -data(IsoList) -str(IsoList) -IsoMat=IsoList$IsoMat -str(IsoMat) -IsoNames=IsoList$IsoNames -IsosGeneNames=IsoList$IsosGeneNames -@ - -\subsubsection{Library size factor} -\label{sec:startisodesize} -Similar to the gene-level analysis presented above, we may obtain the isoform-level -library size factors via \verb+MedianNorm+: -<<>>= -IsoSizes=MedianNorm(IsoMat) -@ - -\subsubsection{The $I_g$ vector} -\label{sec:startisodeNg} - -While working on isoform level data, EBSeq fits different prior -parameters for different uncertainty groups (defined as $I_g$ groups). -The default setting to define the uncertainty groups consists of using -the number of isoforms the host gene contains ($N_g$) for each isoform. -The default settings will provide three uncertainty groups: - -$I_g=1$ group: Isoforms with $N_g=1$; - -$I_g=2$ group: Isoforms with $N_g=2$; - -$I_g=3$ group: Isoforms with $N_g \geq 3$. - -The $N_g$ and $I_g$ group assignment can be obtained using the function \verb+GetNg+. -The required inputs of \verb+GetNg+ are the isoform names (\verb+IsoformNames+) and -their corresponding gene names (\verb+IsosGeneNames+). -<<>>= -NgList=GetNg(IsoNames, IsosGeneNames) -IsoNgTrun=NgList$IsoformNgTrun -IsoNgTrun[c(1:3,201:203,601:603)] -@ -More details could be found in Section \ref{sec:detailedisode}. 
- -\subsubsection{Running EBSeq on isoform expression estimates} -\label{sec:startisoderun} -The \verb+EBTest+ function is also used to run EBSeq for two condition comparisons -on isoform-level data. -Below we use 5 iterations to demonstrate. However, as -in the gene level analysis, we advise that additional iterations will likely be -required in practice (see Section \ref{sec:detailedisodeconverge} for details). - -<<>>= -IsoEBOut=EBTest(Data=IsoMat, NgVector=IsoNgTrun, -Conditions=as.factor(rep(c("C1","C2"),each=5)),sizeFactors=IsoSizes, maxround=5) -IsoEBDERes=GetDEResults(IsoEBOut, FDR=0.05) -str(IsoEBDERes$DEfound) -head(IsoEBDERes$PPMat) -str(IsoEBDERes$Status) -@ -\noindent We see that EBSeq found 104 DE isoforms at the target FDR of 0.05. - -\noindent Note the \verb+GetDEResults()+ was incorporated in EBSeq since version 1.7.1. -By using the default settings, the number of transcripts identified in any given analysis may -differ slightly from the previous version. The updated algorithm is more robust to outliers -and transcripts with low variance. To obtain results that are comparable -to results from earlier versions of EBSeq ($\le$ 1.7.0), a user may set -\verb+Method="classic"+ in \verb+GetDEResults()+ function, or use the \verb+GetPPMat()+ function. - - - - - - -\subsection{Gene level DE analysis (more than two conditions)} -\label{sec:startmulticond} -\noindent The object \verb+MultiGeneMat+ is a matrix containing -500 simulated genes with 6 samples: -the first two samples are from condition 1; the third and the fourth samples are -from condition 2; the last two samples are from condition 3. - -<<>>= -data(MultiGeneMat) -str(MultiGeneMat) -@ -In analysis where the data are spread over more than two conditions, -the set of possible patterns for each gene is more complicated -than simply EE and DE. As noted in Section \ref{sec:model}, when we have 3 conditions, there are 5 expression -patterns to consider. 
In the simulated data, we have 6 samples, 2 in each of 3 conditions. -The function \verb+GetPatterns+ allows the user to generate all possible patterns given the conditions. For example: - -<<>>= -Conditions=c("C1","C1","C2","C2","C3","C3") -PosParti=GetPatterns(Conditions) -PosParti -@ - -\noindent where the first row means all three conditions have the same latent mean expression level; -the second row means C1 and C2 have the same latent mean expression level but that of C3 is different; -and the last row corresponds to the case where the three conditions all have different latent mean expression levels. -The user may use all or only some of these possible patterns as an input to \verb+EBMultiTest+. -For example, if we were interested in Patterns 1, 2, 4 and 5 only, we'd define: -<<>>= -Parti=PosParti[-3,] -Parti -@ - -Moving on to the analysis, \verb+MedianNorm+ or one of its competitors should be used to determine the normalization factors. -Once this is done, the formal test is performed by \verb+EBMultiTest+. -<<>>= -MultiSize=MedianNorm(MultiGeneMat) -MultiOut=EBMultiTest(MultiGeneMat,NgVector=NULL,Conditions=Conditions, -AllParti=Parti, sizeFactors=MultiSize, maxround=5) -@ -\noindent The posterior probability of being in each pattern for every gene is obtained by using the -function \verb+GetMultiPP+: -<<>>= -MultiPP=GetMultiPP(MultiOut) -names(MultiPP) -MultiPP$PP[1:10,] -MultiPP$MAP[1:10] -MultiPP$Patterns -@ -\noindent where \verb+MultiPP$PP+ provides the posterior probability of being in each pattern for every gene. -\verb+MultiPP$MAP+ provides the most likely pattern of each gene based on the posterior -probabilities. \verb+MultiPP$Patterns+ provides the details of the patterns. 
- - - -\subsection{Isoform level DE analysis (more than two conditions)} -\label{sec:startisomulticond} -\noindent Similar to \verb+IsoList+, -the object \verb+IsoMultiList+ is an object containing the isoform expression estimates matrix, the isoform -names, and the gene names of the isoforms' host genes. -\verb+IsoMultiList$IsoMultiMat+ contains 300 simulated isoforms with 8 samples. -The first two samples are from condition 1; the third and the fourth samples are -from condition 2; the fifth and sixth samples are from condition 3; -the last two samples are from condition 4. -Similar to Section \ref{sec:startisode}, the function \verb+MedianNorm+ and \verb+GetNg+ could be used for normalization -and calculating the $N_g$'s. -<<>>= -data(IsoMultiList) -IsoMultiMat=IsoMultiList[[1]] -IsoNames.Multi=IsoMultiList$IsoNames -IsosGeneNames.Multi=IsoMultiList$IsosGeneNames -IsoMultiSize=MedianNorm(IsoMultiMat) -NgList.Multi=GetNg(IsoNames.Multi, IsosGeneNames.Multi) -IsoNgTrun.Multi=NgList.Multi$IsoformNgTrun -Conditions=c("C1","C1","C2","C2","C3","C3","C4","C4") -@ -Here we have 4 conditions, so there are 15 expression -patterns to consider. -The function \verb+GetPatterns+ allows the user to generate all possible patterns given the conditions. 
For example: - -<<>>= -PosParti.4Cond=GetPatterns(Conditions) -PosParti.4Cond -@ - -\noindent -If we were interested in Patterns 1, 2, 3, 8 and 15 only, we'd define: -<<>>= -Parti.4Cond=PosParti.4Cond[c(1,2,3,8,15),] -Parti.4Cond -@ - -\noindent -Moving on to the analysis, \verb+EBMultiTest+ could be used to perform the test: -<<>>= -IsoMultiOut=EBMultiTest(IsoMultiMat, -NgVector=IsoNgTrun.Multi,Conditions=Conditions, -AllParti=Parti.4Cond, sizeFactors=IsoMultiSize, -maxround=5) -@ -\noindent The posterior probability of being in each pattern for every gene is obtained by using the -function \verb+GetMultiPP+: -<<>>= -IsoMultiPP=GetMultiPP(IsoMultiOut) -names(MultiPP) -IsoMultiPP$PP[1:10,] -IsoMultiPP$MAP[1:10] -IsoMultiPP$Patterns -@ -\noindent where \verb+MultiPP$PP+ provides the posterior probability of being in each pattern for every gene. -\verb+MultiPP$MAP+ provides the most likely pattern of each gene based on the posterior -probabilities. \verb+MultiPP$Patterns+ provides the details of the patterns. - - - -\newpage -\section{More detailed examples} -\label{sec:detailed} -\subsection{Gene level DE analysis (two conditions)} -\label{sec:detailedgenede} - -\subsubsection{Running EBSeq on simulated gene expression estimates} -\label{sec:detailedgenederun} -EBSeq is applied as described in Section \ref{sec:startgenederun}. -<>= -data(GeneMat) -Sizes=MedianNorm(GeneMat) -EBOut=EBTest(Data=GeneMat, -Conditions=as.factor(rep(c("C1","C2"),each=5)),sizeFactors=Sizes, maxround=5) -EBDERes=GetDEResults(EBOut, FDR=0.05) -@ -<<>>= -EBDERes=GetDEResults(EBOut, FDR=0.05) -str(EBDERes$DEfound) -head(EBDERes$PPMat) -str(EBDERes$Status) -@ -\noindent EBSeq found 95 DE genes at a target FDR of 0.05.\\ - -\subsubsection{Calculating FC} -\label{sec:detailedgenedefc} -The function \verb+PostFC+ may be used to calculate the Fold Change (FC) -of the raw data as well as the posterior FC of the normalized data. -\begin{figure}[h!] 
-\centering -<>= -GeneFC=PostFC(EBOut) -str(GeneFC) -PlotPostVsRawFC(EBOut,GeneFC) -@ -\caption{ -FC vs. Posterior FC for 1,000 gene expression estimates} -\label{fig:GeneFC} -\end{figure} -Figure \ref{fig:GeneFC} shows the FC vs. Posterior FC on 1,000 gene expression estimates. -The genes are ranked by their cross-condition mean (adjusted by the normalization factors). -The posterior FC tends to shrink genes with low expressions (small rank); in this case the differences -are minor. - - -\newpage - -\subsubsection{Checking convergence} -\label{sec:detailedgenedeconverge} -As detailed in Section \ref{sec:model}, we assume the prior distribution of $q_g^C$ is -$Beta(\alpha,\beta)$. The EM algorithm is used to estimate the -hyper-parameters $\alpha,\beta$ and the mixture parameter $p$. -The optimized parameters at each iteration may be obtained as follows (recall -we are using 5 iterations for demonstration purposes): -<<>>= -EBOut$Alpha -EBOut$Beta -EBOut$P -@ -In this case the differences between the 4th and 5th iterations are always less -than 0.01. - - -\subsubsection{Checking the model fit and other diagnostics} -\label{sec:detailedgenedeplot} -As noted in Leng {\it et al.}, 2013 \cite{Leng13}, EBSeq relies on parametric assumptions that should -be checked following each analysis. -The \verb+QQP+ function may be used to assess prior assumptions. -In practice, \verb+QQP+ generates the Q-Q plot of the empirical $q$'s -vs. the simulated $q$'s from the Beta prior distribution with -estimated hyper-parameters. Figure \ref{fig:GeneQQ} shows that the -data points lie on the $y=x$ line for both conditions, which indicates -that the Beta prior is appropriate. - -\begin{figure}[h!] 
-\centering -<>= -par(mfrow=c(1,2)) -QQP(EBOut) -@ -\caption{QQ-plots for checking the assumption of a Beta prior (upper panels) as well as the -model fit using data from condition 1 and condition 2 (lower panels)} -\label{fig:GeneQQ} -\end{figure} - -\newpage -\noindent -Likewise, the \verb+DenNHist+ function may be used to check the density plot of empirical $q$'s vs the simulated -$q$'s from the fitted Beta prior distribution. -Figure \ref{fig:GeneDenNHist} also shows our estimated distribution fits the -data very well. - -\begin{figure}[h!] -\centering -<>= -par(mfrow=c(1,2)) -DenNHist(EBOut) -@ -\caption{Density plots for checking the model fit using data from condition 1 and condition 2} -\label{fig:GeneDenNHist} -\end{figure} - -\newpage -\subsection{Isoform level DE analysis (two conditions)} -\label{sec:detailedisode} -\subsubsection{The $I_g$ vector} -\label{sec:detailedisodeNg} -Since EBSeq fits rely on $I_g$, -we need to obtain the $I_g$ for each isoform. This can be done using the -function \verb+GetNg+. -The required inputs of \verb+GetNg+ are the isoform names (\verb+IsoformNames+) and -their corresponding gene names (\verb+IsosGeneNames+), described above. -In the simulated data, we assume that the isoforms in the $I_g=1$ group belong to genes \verb+Gene_1, ... , Gene_200+; -The isoforms in the $I_g=2$ group belong to genes -\verb+Gene_201, ..., Gene_400+; and isoforms in the $I_g=3$ group -belong to \verb+Gene_401, ..., Gene_600+. - -<>= -data(IsoList) -IsoMat=IsoList$IsoMat -IsoNames=IsoList$IsoNames -IsosGeneNames=IsoList$IsosGeneNames -NgList=GetNg(IsoNames, IsosGeneNames, TrunThre=3) -@ -<<>>= -names(NgList) -IsoNgTrun=NgList$IsoformNgTrun -IsoNgTrun[c(1:3,201:203,601:603)] -@ - -The output of \verb+GetNg+ contains 4 vectors. \verb+GeneNg+ (\verb+IsoformNg+) provides -the number of isoforms $N_g$ within each gene (within each isoform's host gene). -\verb+GeneNgTrun+ (\verb+IsoformNgTrun+) provides the $I_g$ group assignments. 
-The default number of groups is 3, which means the isoforms -with $N_g$ greater than 3 will be assigned to $I_g=3$ group. -We use 3 in the case studies -since the number of isoforms with $N_g$ larger than 3 is relatively small and -the small sample size may induce poor parameter fitting if we treat them -as separate groups. -In practice, if there is evidence that the $N_g=4,5,6...$ groups should be -treated as separate groups, a user can change \verb+TrunThre+ to define -a different truncation threshold. - -\subsubsection{Using mappability ambiguity clusters instead of -the $I_g$ vector when the gene-isoform relationship is unknown} -\label{sec:detailedisodeNoNg} -When working with a de-novo assembled transcriptome, in which case the gene-isoform -relationship is unknown, -a user can use read mapping ambiguity cluster information instead of Ng, -as provided by RSEM \cite{Li11b} in the -output file \verb+output_name.ngvec+. The file contains a vector with the same -length as the total number of transcripts. -Each transcript has been assigned to one of 3 levels -(1, 2, or 3) to indicate the mapping uncertainty level of that transcript. -The mapping ambiguity clusters are partitioned via a k-means algorithm on the unmapability -scores that are provided by RSEM. A user can read in the mapping ambiguity cluster information -using: - -<>= -IsoNgTrun = scan(file="output_name.ngvec", what=0, sep="\n") -@\\ -Where \verb+"output_name.ngvec"+ is the output file obtained from RSEM function rsem-generate-ngvector. -More details on using the RSEM-EBSeq pipeline -on de novo assembled transcriptomes can be found -at \url{http://deweylab.biostat.wisc.edu/rsem/README.html#de}. - -Other unmappability scores and other cluster methods (e.g. Gaussian Mixed Model) -could also be used to form the uncertainty clusters. 
- -\subsubsection{Running EBSeq on simulated isoform expression estimates} -\label{sec:detailedisoderun} -EBSeq can be applied as described in Section \ref{sec:startisoderun}. -<>= -IsoSizes=MedianNorm(IsoMat) -IsoEBOut=EBTest(Data=IsoMat, NgVector=IsoNgTrun, -Conditions=as.factor(rep(c("C1","C2"),each=5)),sizeFactors=IsoSizes, maxround=5) -IsoEBDERes=GetDEResults(IsoEBOut, FDR=0.05) -@ -<<>>= -str(IsoEBDERes) -@ -\noindent We see that EBSeq found 104 DE isoforms at a target FDR of 0.05. -The function \verb+PostFC+ could also be used here to calculate the Fold Change (FC) -as well as the posterior FC on the normalization factor adjusted data. -<<>>= -IsoFC=PostFC(IsoEBOut) -str(IsoFC) -@ - - -\subsubsection{Checking convergence} -\label{sec:detailedisodeconverge} -For isoform level data, we assume the prior distribution of $q_{gi}^C$ is -$Beta(\alpha,\beta^{I_g})$. -As in Section \ref{sec:detailedgenedeconverge}, the optimized parameters at each iteration -may be obtained as follows (recall -we are using 5 iterations for demonstration purposes): -<<>>= -IsoEBOut$Alpha -IsoEBOut$Beta -IsoEBOut$P -@ -Here we have 3 $\beta$'s in each iteration corresponding to -$\beta^{I_g=1},\beta^{I_g=2},\beta^{I_g=3}$. -We see that parameters are changing less than $10^{-2}$ or $10^{-3}$. -In practice, we require changes less than $10^{-3}$ to declare convergence. - -\subsubsection{Checking the model fit and other diagnostics} -\label{sec:detailedisodeplot} -In Leng {\it et al.}, 2013\citep{Leng13}, we showed the mean-variance differences across different -isoform groups on multiple data sets. -In practice, if it is of interest to check differences among -isoform groups defined by truncated $I_g$ (such as those shown here -in Figure 1), the function \verb+PolyFitPlot+ may be used. -The following code generates the three -panels shown in Figure \ref{fig:IsoSimuNgEach} -(if condition 2 is of interest, a user could change each \verb+C1+ to \verb+C2+.): -\begin{figure}[h!] 
-\centering -<>= -par(mfrow=c(2,2)) -PolyFitValue=vector("list",3) -for(i in 1:3) - PolyFitValue[[i]]=PolyFitPlot(IsoEBOut$C1Mean[[i]], - IsoEBOut$C1EstVar[[i]],5) -@ -\caption{ The mean-variance fitting plot for each Ng group} -\label{fig:IsoSimuNgEach} -\end{figure} - -\newpage -Superimposing all $I_g$ groups using the code below will generate the figure (shown -here in Figure \ref{fig:IsoSimuNg}), which is similar in structure to Figure 1: - - -\begin{figure}[h!] -\centering -<>= -PolyAll=PolyFitPlot(unlist(IsoEBOut$C1Mean), unlist(IsoEBOut$C1EstVar),5) -lines(log10(IsoEBOut$C1Mean[[1]][PolyFitValue[[1]]$sort]), -PolyFitValue[[1]]$fit[PolyFitValue[[1]]$sort],col="yellow",lwd=2) -lines(log10(IsoEBOut$C1Mean[[2]][PolyFitValue[[2]]$sort]), -PolyFitValue[[2]]$fit[PolyFitValue[[2]]$sort],col="pink",lwd=2) -lines(log10(IsoEBOut$C1Mean[[3]][PolyFitValue[[3]]$sort]), -PolyFitValue[[3]]$fit[PolyFitValue[[3]]$sort],col="green",lwd=2) -legend("topleft",c("All Isoforms","Ng = 1","Ng = 2","Ng = 3"), -col=c("red","yellow","pink","green"),lty=1,lwd=3,box.lwd=2) -@ -\caption{The mean-variance plot for each Ng group} -\label{fig:IsoSimuNg} -\end{figure} - - -\newpage -\noindent To generate a QQ-plot of the fitted Beta prior distribution -and the $\hat{q}^C$'s within condition, a user may -use the following code to generate 6 panels (as shown in Figure \ref{fig:IsoQQ}). -\begin{figure}[h!] -\centering -<>= -par(mfrow=c(2,3)) -QQP(IsoEBOut) -@ -\caption{ QQ-plots of the fitted prior distributions within each condition and each Ig group} -\label{fig:IsoQQ} -\end{figure} - -\newpage -\noindent And in order to produce the plot of the fitted Beta prior densities -and the histograms of $\hat{q}^C$'s within each condition, -the following may be used (it generates Figure \ref{fig:IsoDenNHist}): -\begin{figure}[h] -\centering -<>= -par(mfrow=c(2,3)) -DenNHist(IsoEBOut) -@ -\caption{ Prior distribution fit within each condition and each Ig group. 
-(Note only a small set of isoforms are considered here for demonstration. -Better fitting should be expected while using full set of isoforms.)} -\label{fig:IsoDenNHist} -\end{figure} - -\clearpage -\subsection{Gene level DE analysis (more than two conditions)} -\label{sec:detailedmulticond} -As described in Section \ref{sec:startmulticond}, -the function \verb+GetPatterns+ allows the user to generate all possible patterns given the conditions. -To visualize the patterns, the function \verb+PlotPattern+ may be used. - -\begin{figure}[h!] -\centering -<>= -Conditions=c("C1","C1","C2","C2","C3","C3") -PosParti=GetPatterns(Conditions) -PosParti -PlotPattern(PosParti) -@ -\caption{ All possible patterns} -\label{fig:Patterns} -\end{figure} -\newpage -\noindent If we were interested in Patterns 1, 2, 4 and 5 only, we'd define: -<<>>= -Parti=PosParti[-3,] -Parti -@ - -\noindent -Moving on to the analysis, \verb+MedianNorm+ or one of its competitors should be used to determine the normalization factors. -Once this is done, the formal test is performed by \verb+EBMultiTest+. -<>= -data(MultiGeneMat) -MultiSize=MedianNorm(MultiGeneMat) -MultiOut=EBMultiTest(MultiGeneMat, -NgVector=NULL,Conditions=Conditions, -AllParti=Parti, sizeFactors=MultiSize, -maxround=5) -@ -\noindent The posterior probability of being in each pattern for every gene is obtained using the -function \verb+GetMultiPP+: -<<>>= -MultiPP=GetMultiPP(MultiOut) -names(MultiPP) -MultiPP$PP[1:10,] -MultiPP$MAP[1:10] -MultiPP$Patterns -@ -\noindent where \verb+MultiPP$PP+ provides the posterior probability of being in each pattern for every gene. -\verb+MultiPP$MAP+ provides the most likely pattern of each gene based on the posterior -probabilities. \verb+MultiPP$Patterns+ provides the details of the patterns. 
The FC and posterior FC for multiple condition data can -be obtained by the function \verb+GetMultiFC+: - -<<>>= -MultiFC=GetMultiFC(MultiOut) -str(MultiFC) -@ - -\noindent To generate a QQ-plot of the fitted Beta prior distribution -and the $\hat{q}^C$'s within condition, a user could also use function -\verb+DenNHist+ and \verb+QQP+. - -\begin{figure}[h!] -\centering -<>= -par(mfrow=c(2,2)) -QQP(MultiOut) -@ -\caption{ QQ-plots of the fitted prior distributions within each condition and each Ig group} -\label{fig:GeneMultiQQ} -\end{figure} - -\begin{figure}[h] -\centering -<>= -par(mfrow=c(2,2)) -DenNHist(MultiOut) -@ -\caption{ Prior distributions fit within each condition. -(Note only a small set of genes are considered here for demonstration. -Better fitting should be expected while using full set of genes.)} -\label{fig:GeneMultiDenNHist} -\end{figure} -\newpage -\clearpage -\newpage -\subsection{Isoform level DE analysis (more than two conditions)} -\label{sec:detailedisomulticond} -Similar to Section \ref{sec:startmulticond}, -the function \verb+GetPatterns+ allows a user to generate all possible patterns given the conditions. -To visualize the patterns, the function \verb+PlotPattern+ may be used. -<<>>= -Conditions=c("C1","C1","C2","C2","C3","C3","C4","C4") -PosParti.4Cond=GetPatterns(Conditions) -PosParti.4Cond -@ -\newpage -\begin{figure}[h!] 
-\centering -<>= -PlotPattern(PosParti.4Cond) -Parti.4Cond=PosParti.4Cond[c(1,2,3,8,15),] -Parti.4Cond -@ -\caption{All possible patterns for 4 conditions} -\label{fig:Patterns4Cond} -\end{figure} - -\newpage -<>= -data(IsoMultiList) -IsoMultiMat=IsoMultiList[[1]] -IsoNames.Multi=IsoMultiList$IsoNames -IsosGeneNames.Multi=IsoMultiList$IsosGeneNames -IsoMultiSize=MedianNorm(IsoMultiMat) -NgList.Multi=GetNg(IsoNames.Multi, IsosGeneNames.Multi) -IsoNgTrun.Multi=NgList.Multi$IsoformNgTrun -IsoMultiOut=EBMultiTest(IsoMultiMat,NgVector=IsoNgTrun.Multi,Conditions=Conditions, -AllParti=Parti.4Cond, -sizeFactors=IsoMultiSize, maxround=5) -IsoMultiPP=GetMultiPP(IsoMultiOut) -@ -<<>>= -names(MultiPP) -IsoMultiPP$PP[1:10,] -IsoMultiPP$MAP[1:10] -IsoMultiPP$Patterns -IsoMultiFC=GetMultiFC(IsoMultiOut) -@ -The FC and posterior FC for multiple condition data can be obtained by the function \verb+GetMultiFC+: - - -\noindent To generate a QQ-plot of the fitted Beta prior distribution -and the $\hat{q}^C$'s within condition, a user could also use the functions -\verb+DenNHist+ and \verb+QQP+. -\newpage -\begin{figure}[h!] -\centering -<>= -par(mfrow=c(3,4)) -QQP(IsoMultiOut) - -@ -\caption{ QQ-plots of the fitted prior distributions within each condition and Ig group. -(Note only a small set of isoforms are considered here for demonstration. -Better fitting should be expected while using full set of isoforms.)} -\label{fig:IsoMultiQQ} -\end{figure} - -\begin{figure}[h] -\centering -<>= -par(mfrow=c(3,4)) -DenNHist(IsoMultiOut) -@ -\caption{ Prior distributions fit within each condition and Ig group. -(Note only a small set of isoforms are considered here for demonstration. -Better fitting should be expected while using full set of isoforms.)} -\label{fig:IsoMultiDenNHist} -\end{figure} -\clearpage -\newpage - - -\newpage -\subsection{Working without replicates} -When replicates are not available, it is difficult to estimate the transcript specific variance. 
-In this case, EBSeq estimates the variance by pooling similar genes together. -Specifically, we take genes with FC in the 25\% - 75\% quantile of all FC's as -candidate genes. By defining \verb+NumBin = 1000+ (default in \verb+EBTest+), EBSeq -will group genes with similar means into 1,000 bins. -For each candidate gene, we use the across-condition variance estimate as its variance estimate. -For each bin, the bin-wise variance estimation is taken to be the median of the -across-condition variance estimates of the candidate genes within that bin. -For each non-candidate gene, we use the bin-wise variance estimate of the host bin (the bin containing this gene) -as its variance estimate. -This approach works well when there are no more than 50\% DE genes in the data set. - -\subsubsection{Gene counts with two conditions} -\label{sec:norepgenede} - -To generate a data set with no replicates, we take the first sample of each condition. -For example, using the data from Section \ref{sec:detailedgenede}, we take sample 1 from condition 1 and -sample 6 from condition 2. Functions \verb+MedianNorm+, \verb+GetDEResults+ and -\verb+PostFC+ may be used on data without replicates. -<<>>= -data(GeneMat) -GeneMat.norep=GeneMat[,c(1,6)] -Sizes.norep=MedianNorm(GeneMat.norep) -EBOut.norep=EBTest(Data=GeneMat.norep, -Conditions=as.factor(rep(c("C1","C2"))), -sizeFactors=Sizes.norep, maxround=5) -EBDERes.norep=GetDEResults(EBOut.norep) -GeneFC.norep=PostFC(EBOut.norep) -@ - -\subsubsection{Isoform counts with two conditions} -\label{norepisode} -To generate an isoform level data set with no replicates, we -also take sample 1 and sample 6 in the data we used in Section -\ref{sec:detailedisode}. -Example codes are shown below. 
- -<<>>= -data(IsoList) -IsoMat=IsoList$IsoMat -IsoNames=IsoList$IsoNames -IsosGeneNames=IsoList$IsosGeneNames -NgList=GetNg(IsoNames, IsosGeneNames) -IsoNgTrun=NgList$IsoformNgTrun -IsoMat.norep=IsoMat[,c(1,6)] -IsoSizes.norep=MedianNorm(IsoMat.norep) -IsoEBOut.norep=EBTest(Data=IsoMat.norep, NgVector=IsoNgTrun, -Conditions=as.factor(c("C1","C2")), -sizeFactors=IsoSizes.norep, maxround=5) -IsoEBDERes.norep=GetDEResults(IsoEBOut.norep) -IsoFC.norep=PostFC(IsoEBOut.norep) -@ - -\subsubsection{Gene counts with more than two conditions} -\label{norepisode} -To generate a data set with multiple conditions and no replicates, -we take the first sample from each condition (sample 1, 3 and 5) in the data we used -in Section \ref{sec:detailedmulticond}. -Example codes are shown below. -<<>>= -data(MultiGeneMat) -MultiGeneMat.norep=MultiGeneMat[,c(1,3,5)] -Conditions=c("C1","C2","C3") -PosParti=GetPatterns(Conditions) -Parti=PosParti[-3,] -MultiSize.norep=MedianNorm(MultiGeneMat.norep) -MultiOut.norep=EBMultiTest(MultiGeneMat.norep, -NgVector=NULL,Conditions=Conditions, -AllParti=Parti, sizeFactors=MultiSize.norep, -maxround=5) -MultiPP.norep=GetMultiPP(MultiOut.norep) -MultiFC.norep=GetMultiFC(MultiOut.norep) -@ - -\subsubsection{Isoform counts with more than two conditions} -\label{sec:norepmulticond} -To generate an isoform level data set with multiple conditions and no replicates, -we take the first sample from each condition (sample 1, 3, 5 and 7) in the data we used -in Section \ref{sec:detailedisomulticond}. -Example codes are shown below. 
- - - -<<>>= -data(IsoMultiList) -IsoMultiMat=IsoMultiList[[1]] -IsoNames.Multi=IsoMultiList$IsoNames -IsosGeneNames.Multi=IsoMultiList$IsosGeneNames -IsoMultiMat.norep=IsoMultiMat[,c(1,3,5,7)] -IsoMultiSize.norep=MedianNorm(IsoMultiMat.norep) -NgList.Multi=GetNg(IsoNames.Multi, IsosGeneNames.Multi) -IsoNgTrun.Multi=NgList.Multi$IsoformNgTrun -Conditions=c("C1","C2","C3","C4") -PosParti.4Cond=GetPatterns(Conditions) -PosParti.4Cond -Parti.4Cond=PosParti.4Cond[c(1,2,3,8,15),] -Parti.4Cond -IsoMultiOut.norep=EBMultiTest(IsoMultiMat.norep, -NgVector=IsoNgTrun.Multi,Conditions=Conditions, -AllParti=Parti.4Cond, sizeFactors=IsoMultiSize.norep, -maxround=5) -IsoMultiPP.norep=GetMultiPP(IsoMultiOut.norep) -IsoMultiFC.norep=GetMultiFC(IsoMultiOut.norep) -@ - -\section{EBSeq pipelines and extensions} -\subsection{RSEM-EBSeq pipeline: from raw reads to differential expression analysis results} -EBSeq is coupled with RSEM \cite{Li11b} as an RSEM-EBSeq pipeline which provides -quantification and DE testing on both gene and isoform levels. - -For more details, see -\url{http://deweylab.biostat.wisc.edu/rsem/README.html#de} - -\subsection{EBSeq interface: A user-friendly graphical interface for differetial expression analysis} -EBSeq interface provides a graphical interface implementation for users who are not familiar with the R -programming language. It takes .xls, .xlsx and .csv files as input. -Additional packages need be downloaded; they may be found at -\url{http://www.biostat.wisc.edu/~ningleng/EBSeq_Package/EBSeq_Interface/} - -\subsection{EBSeq Galaxy tool shed} -EBSeq tool shed contains EBSeq wrappers for a local Galaxy implementation. -For more details, see -\url{http://www.biostat.wisc.edu/~ningleng/EBSeq_Package/EBSeq_Galaxy_toolshed/} - -\section{Acknowledgment} -We would like to thank Haolin Xu for checking the package and -proofreading the vignette. 
- -\section{News} -2014-1-30: In EBSeq 1.3.3, the default setting of EBTest function will remove -low expressed genes (genes whose 75th quantile of normalized counts is less -than 10) before identifying DE genes. -These two thresholds can be changed in EBTest function. -Because low expressed genes are disproportionately noisy, -removing these genes prior to downstream analyses can improve model fitting and increase robustness -(e.g. by removing outliers). - -2014-5-22: In EBSeq 1.5.2, numerical approximations are implemented to deal with -underflow. The underflow is likely due to large number of samples. - -2015-1-29: In EBSeq 1.7.1, EBSeq incorporates a new function -GetDEResults() which may be used to obtain a list of transcripts under a target FDR in a two-condition experiment. -The results obtained by applying this function with its default setting will be -more robust to transcripts with low variance and potential outliers. -By using the default settings in this function, -the number of genes identified in any given analysis may -differ slightly from the previous version (1.7.0 or order). -To obtain results that are comparable -to results from earlier versions of EBSeq (1.7.0 or older), a user may set -Method="classic" in GetDEResults() function, or use the original GetPPMat() function. -The GeneDEResults() function also allows a user to modify thresholds to -target genes/isoforms with a pre-specified posterior fold change. - -Also, in EBSeq 1.7.1, the default settings in EBTest() and EBMultiTest() function -will only remove transcripts with all 0's (instead of removing transcripts with -75th quantile less than 10 in version 1.3.3-1.7.0). -To obtain a list of transcripts comparable to the results generated by EBSeq version 1.3.3-1.7.0, a user may change Qtrm = 0.75 and QtrmCut = 10 -when applying EBTest() or EBMultiTest() function. 
- - - -\pagebreak -\bibliographystyle{plain} -\bibliography{lengetal} - -\end{document} - diff --git a/.svn/pristine/97/97dfca632fc3a193eef0af74113c51be685276e7.svn-base b/.svn/pristine/97/97dfca632fc3a193eef0af74113c51be685276e7.svn-base deleted file mode 100644 index e17f6c1..0000000 --- a/.svn/pristine/97/97dfca632fc3a193eef0af74113c51be685276e7.svn-base +++ /dev/null @@ -1,20 +0,0 @@ -\name{GeneMat} -\alias{GeneMat} -\docType{data} -\title{ -The simulated data for two condition gene DE analysis -} -\description{ -'GeneMat' gives the simulated data for two condition gene DE analysis. -} -\usage{data(GeneMat)} -\source{ -Ning Leng, John A. Dawson, James A. Thomson, Victor Ruotti, Anna I. Rissman, Bart M.G. Smits, Jill D. Haag, Michael N. Gould, Ron M. Stewart, and Christina Kendziorski. EBSeq: An empirical Bayes hierarchical model for inference in RNA-seq experiments. Bioinformatics (2013) -} -\seealso{ -IsoList -} -\examples{ -data(GeneMat) -} -\keyword{datasets} diff --git a/.svn/pristine/a1/a1dcd7555035735fb974523563d4dd3d934ef3cb.svn-base b/.svn/pristine/a1/a1dcd7555035735fb974523563d4dd3d934ef3cb.svn-base deleted file mode 100644 index 3fa12fb..0000000 --- a/.svn/pristine/a1/a1dcd7555035735fb974523563d4dd3d934ef3cb.svn-base +++ /dev/null @@ -1,39 +0,0 @@ -\name{beta.mom} -\alias{beta.mom} - -\title{ -Fit the beta distribution by method of moments -} - -\description{ -'beta.mom' fits the beta distribution by method of moments. -} - -\usage{ -beta.mom(qs.in) -} - -\arguments{ - \item{qs.in}{A vector contains the numbers that are assumed to follow a beta distribution.} -} - -\value{ - \item{alpha.hat}{Returns the estimation of alpha.} - \item{beta.hat}{Returns the estimation of beta.} -} -\references{ -Ning Leng, John A. Dawson, James A. Thomson, Victor Ruotti, Anna I. Rissman, Bart M.G. Smits, Jill D. Haag, Michael N. Gould, Ron M. Stewart, and Christina Kendziorski. EBSeq: An empirical Bayes hierarchical model for inference in RNA-seq experiments. 
Bioinformatics (2013) -} -\author{ -Ning Leng -} - - -\seealso{ -DenNHist, DenNHistTable -} -\examples{ -#tmp = rbeta(5, 5, 100) -#param = beta.mom(tmp) -} -\keyword{ beta } diff --git a/.svn/pristine/a6/a662be0c35c8fe64bc21468fae56582237fddb95.svn-base b/.svn/pristine/a6/a662be0c35c8fe64bc21468fae56582237fddb95.svn-base deleted file mode 100644 index e10df33..0000000 --- a/.svn/pristine/a6/a662be0c35c8fe64bc21468fae56582237fddb95.svn-base +++ /dev/null @@ -1,79 +0,0 @@ -LogNMulti <- -function(Input, InputSP, EmpiricalR, EmpiricalRSP, NumOfEachGroup, AlphaIn, BetaIn, PIn, NoneZeroLength, AllParti, Conditions) -{ - - #For each gene (m rows of Input---m genes) - #Save each gene's F0, F1 for further likelihood calculation. - FList=sapply(1:nrow(AllParti),function(i)sapply(1:nlevels(as.factor(AllParti[i,])), - function(j)f0(do.call(cbind,InputSP[AllParti[i,]==j]),AlphaIn, BetaIn, - do.call(cbind,EmpiricalRSP[AllParti[i,]==j]), NumOfEachGroup, log=T)), - simplify=F) - FPartiLog=sapply(FList,rowSums) - FMat=exp(FPartiLog+600) - rownames(FMat)=rownames(FPartiLog)=rownames(Input) - #Get z - #Use data.list in logfunction - PInMat=matrix(rep(1,nrow(Input)),ncol=1)%*%matrix(PIn,nrow=1) - FmultiP=FMat*PInMat - Denom=rowSums(FmultiP) - ZEach=apply(FmultiP,2,function(i)i/Denom) - zNaNName1=names(Denom)[is.na(Denom)] - # other NAs in LikeFun - LF=ZEach*(log(FmultiP)) - zNaNMore=rownames(LF)[which(is.na(rowSums(LF)))] - zNaNName=unique(c(zNaNName1,zNaNMore)) - zGood=which(!rownames(LF)%in%zNaNName) - - - if(length(zGood)==0){ - #Min=min(min(F0Log[which(F0Log!=-Inf)]), - # min(F1Log[which(F1Log!=-Inf)])) - tmpMat=FPartiLog - tmpMean=apply(tmpMat,1,mean) - FLogMdf=FPartiLog-tmpMean - FMdf=exp(FLogMdf) - - FmultiPMdf=FMdf*PInMat - DenomMdf=rowSums(FmultiPMdf) - ZEach=apply(FmultiPMdf,2,function(i)i/DenomMdf) - zNaNName1Mdf=names(DenomMdf)[is.na(DenomMdf)] - # other NAs in LikeFun - LFMdf=ZEach*(log(FmultiPMdf)) - zNaNMoreMdf=rownames(LFMdf)[which(is.na(rowSums(LFMdf)))] - 
zNaNNameMdf=unique(c(zNaNName1Mdf,zNaNMoreMdf)) - zGood=which(!rownames(LFMdf)%in%zNaNNameMdf) - - - - - } - - ZEachGood=ZEach[zGood,] - ###Update P - PFromZ=colSums(ZEach[zGood,])/length(zGood) - FGood=FPartiLog[zGood,] - ### MLE Part #### - # Since we dont wanna update p and Z in this step - # Each Ng for one row - - NumGroupVector=rep(c(1:NoneZeroLength),NumOfEachGroup) - - NumGroupVector.zGood=NumGroupVector[zGood] - NumOfEachGroup.zGood=tapply(NumGroupVector.zGood,NumGroupVector.zGood,length) - - StartValue=c(AlphaIn, BetaIn,PIn[-1]) - InputSPGood=sapply(1:length(InputSP),function(i)InputSP[[i]][zGood,],simplify=F) - EmpiricalRSPGood=sapply(1:length(EmpiricalRSP),function(i)EmpiricalRSP[[i]][zGood,],simplify=F) - - Result<-optim(StartValue,LikefunMulti,InputPool=list(InputSPGood,Input[zGood,],ZEach[zGood,], - NoneZeroLength,EmpiricalR[zGood, ],EmpiricalRSPGood, NumOfEachGroup.zGood, AllParti)) - AlphaNew= Result$par[1] - BetaNew=Result$par[2:(1+NoneZeroLength)] - PNewNo1=Result$par[(2+NoneZeroLength):length(Result$par)] - PNew=c(1-sum(PNewNo1),PNewNo1) - ## - Output=list(AlphaNew=AlphaNew,BetaNew=BetaNew,PNew=PNew,ZEachNew=ZEach, ZEachGood=ZEachGood, - PFromZ=PFromZ, zGood=zGood, zNaNName=zNaNName,FGood=FGood) - Output - } - diff --git a/.svn/pristine/af/afb666c1dd849ff38ebb7834842db611e93fb963.svn-base b/.svn/pristine/af/afb666c1dd849ff38ebb7834842db611e93fb963.svn-base deleted file mode 100644 index 02efe8f..0000000 --- a/.svn/pristine/af/afb666c1dd849ff38ebb7834842db611e93fb963.svn-base +++ /dev/null @@ -1,65 +0,0 @@ -GetMultiFC=function(EBMultiOut,SmallNum=.01){ - if(!"PPpattern"%in%names(EBMultiOut))stop("The input doesn't seem like an output from EBMultiTest") - - NumNgGroup=length(EBMultiOut$DataList) - OutNames=rownames(EBMultiOut$PPMat) - NumCondition=length(EBMultiOut$SPMean[[1]]) - ConditionNames=colnames(EBMultiOut$AllParti) - CondMeans=sapply(1:NumCondition, - function(i){ - if (NumNgGroup==1) - out=EBMultiOut$SPMean[[1]][[i]][OutNames] - 
if(NumNgGroup>1) - out=unlist(sapply(1:NumNgGroup, - function(j)EBMultiOut$SPMean[[j]][[i]]))[OutNames] - out} - ) - colnames(CondMeans)=ConditionNames - CondMeansPlus=CondMeans+SmallNum - - GeneRealMean=rowMeans(CondMeans) - GeneR=unlist(EBMultiOut$RList) - GeneR[GeneR<=0 | is.na(GeneR)]=GeneRealMean[GeneR<=0 | is.na(GeneR)]*.99/.01 - - GeneAlpha=EBMultiOut[[1]][nrow(EBMultiOut[[1]]),] - GeneBeta=unlist(sapply(1:length(EBMultiOut$DataList), - function(i)rep(EBMultiOut[[2]][nrow(EBMultiOut[[1]]),i], - nrow(EBMultiOut$DataList[[i]])))) - GeneBeta=as.vector(GeneBeta) - - - FCMat=PostFCMat=matrix(0,ncol=choose(NumCondition,2),nrow=length(OutNames)) - rownames(FCMat)=rownames(PostFCMat)=OutNames - k=1 - ColNames=rep(NA,choose(NumCondition,2)) - for(i in 1:(NumCondition-1)){ - for(j in (i+1):NumCondition) - { - ColNames[k]=paste(ConditionNames[i],"Over",ConditionNames[j],sep="") - FCMat[,k]=CondMeansPlus[,i]/CondMeansPlus[,j] - - - nC1=sum(EBMultiOut$ConditionOrder==ConditionNames[i]) - nC2=sum(EBMultiOut$ConditionOrder==ConditionNames[j]) - GenePostAlphaC1=GeneAlpha+nC1*GeneR - GenePostAlphaC2=GeneAlpha+nC2*GeneR - GenePostBetaC1=GeneBeta+nC1*CondMeans[,i] - GenePostBetaC2=GeneBeta+nC2*CondMeans[,j] - GenePostQC1=GenePostAlphaC1/(GenePostAlphaC1+GenePostBetaC1) - GenePostQC2=GenePostAlphaC2/(GenePostAlphaC2+GenePostBetaC2) - - GenePostFC=((1-GenePostQC1)/(1-GenePostQC2))*(GenePostQC2/GenePostQC1) - PostFCMat[,k]= GenePostFC - - k=k+1 - } - } - colnames(FCMat)=colnames(PostFCMat)=ColNames - Log2FCMat=log2(FCMat) - Log2PostFCMat=log2(PostFCMat) - Out=list(FCMat=FCMat,Log2FCMat=Log2FCMat, - PostFCMat=PostFCMat, Log2PostFCMat=Log2PostFCMat, - CondMeans=CondMeans, - ConditionOrder=EBMultiOut$ConditionOrder) -} - diff --git a/.svn/pristine/bf/bf4d7cc03c36e94a961d04dd4830132fdcbf93de.svn-base b/.svn/pristine/bf/bf4d7cc03c36e94a961d04dd4830132fdcbf93de.svn-base deleted file mode 100644 index 704dfd1..0000000 --- 
a/.svn/pristine/bf/bf4d7cc03c36e94a961d04dd4830132fdcbf93de.svn-base +++ /dev/null @@ -1,29 +0,0 @@ -\name{GetPatterns} -\alias{GetPatterns} -\title{ -Generate all possible patterns in a multiple condition study -} -\description{ -'GetPatterns' generates all possible patterns in a multiple condition study. -} -\usage{ -GetPatterns(Conditions) -} -\arguments{ - \item{Conditions}{The names of the Conditions in the study.} -} - -\value{A matrix describe all possible patterns. } - -\references{ -Ning Leng, John A. Dawson, James A. Thomson, Victor Ruotti, Anna I. Rissman, Bart M.G. Smits, Jill D. Haag, Michael N. Gould, Ron M. Stewart, and Christina Kendziorski. EBSeq: An empirical Bayes hierarchical model for inference in RNA-seq experiments. Bioinformatics (2013) -} - -\author{ -Ning Leng -} - -\examples{ -Conditions = c("C1","C1","C2","C2","C3","C3") -PosParti = GetPatterns(Conditions) -} diff --git a/.svn/pristine/c0/c02b09c234ceb30367cca3e38778c7074da3f373.svn-base b/.svn/pristine/c0/c02b09c234ceb30367cca3e38778c7074da3f373.svn-base deleted file mode 100644 index 4453909..0000000 --- a/.svn/pristine/c0/c02b09c234ceb30367cca3e38778c7074da3f373.svn-base +++ /dev/null @@ -1,592 +0,0 @@ -EBTest <- -function(Data,NgVector=NULL,Conditions, sizeFactors, maxround, Pool=F, NumBin=1000,ApproxVal=10^-10, Alpha=NULL, Beta=NULL,PInput=NULL,RInput=NULL,PoolLower=.25, PoolUpper=.75,Print=T, Qtrm=.75,QtrmCut=10) -{ - if(!is.factor(Conditions))Conditions=as.factor(Conditions) - if(is.null(rownames(Data)))stop("Please add gene/isoform names to the data matrix") - - if(!is.matrix(Data))stop("The input Data is not a matrix") - if(length(Conditions)!=ncol(Data))stop("The number of conditions is not the same as the number of samples! ") - if(nlevels(Conditions)>2)stop("More than 2 conditions! 
Please use EBMultiTest() function") - if(nlevels(Conditions)<2)stop("Less than 2 conditions - Please check your input") - if(length(sizeFactors)!=length(Data) & length(sizeFactors)!=ncol(Data)) - stop("The number of library size factors is not the same as the number of samples!") - - Conditions=as.factor(Conditions) - Vect5End=Vect3End=CI=CIthre=tau=NULL - Dataraw=Data - - #Normalized - DataNorm=GetNormalizedMat(Data, sizeFactors) - Levels=levels(as.factor(Conditions)) - - # Dixon Statistics -# library(outliers) -# normalized matrix for each condition -# matC=sapply(1:length(Levels),function(i)DataNorm[,which(Conditions==Levels[i])]) -# run dixon test for each isoform within condition -# DixonP=sapply(1:length(matC),function(j) -# apply(DataNorm,1,function(i){ -# if(mean(i)==0)out=NA -# else out=dixon.test(i)$p.value -# out})) - - - QuantileFor0=apply(DataNorm,1,function(i)quantile(i,Qtrm)) - AllZeroNames=which(QuantileFor0<=QtrmCut) - NotAllZeroNames=which(QuantileFor0>QtrmCut) - if(length(AllZeroNames)>0 & Print==T) - cat(paste0("Removing transcripts with ",Qtrm*100, - " th quantile < = ",QtrmCut," \n", - length(NotAllZeroNames)," transcripts will be tested\n")) - if(length(NotAllZeroNames)==0)stop("0 transcript passed") - Data=Data[NotAllZeroNames,] - if(!is.null(NgVector))NgVector=NgVector[NotAllZeroNames] - if(length(sizeFactors)==length(Data))sizeFactors=sizeFactors[NotAllZeroNames,] - if(is.null(NgVector))NgVector=rep(1,nrow(Data)) - - #Rename Them - IsoNamesIn=rownames(Data) - Names=paste("I",c(1:dim(Data)[1]),sep="") - names(IsoNamesIn)=Names - rownames(Data)=paste("I",c(1:dim(Data)[1]),sep="") - names(NgVector)=paste("I",c(1:dim(Data)[1]),sep="") - - - if(length(sizeFactors)==length(Data)){ - rownames(sizeFactors)=rownames(Data) - colnames(sizeFactors)=Conditions - } - - NumOfNg=nlevels(as.factor(NgVector)) - NameList=sapply(1:NumOfNg,function(i)Names[NgVector==i],simplify=F) - names(NameList)=paste("Ng",c(1:NumOfNg),sep="") - NotNone=NULL - for (i in 
1:NumOfNg) { - if (length(NameList[[i]])!=0) - NotNone=c(NotNone,names(NameList)[i]) - } - NameList=NameList[NotNone] - - NoneZeroLength=length(NameList) - DataList=vector("list",NoneZeroLength) - DataList=sapply(1:NoneZeroLength , function(i) Data[NameList[[i]],],simplify=F) - names(DataList)=names(NameList) - - NumEachGroup=sapply(1:NoneZeroLength , function(i)dim(DataList[[i]])[1]) - # Unlist - DataList.unlist=do.call(rbind, DataList) - - # Divide by SampleSize factor - - if(length(sizeFactors)==ncol(Data)) - DataList.unlist.dvd=t(t( DataList.unlist)/sizeFactors) - - if(length(sizeFactors)==length(Data)) - DataList.unlist.dvd=DataList.unlist/sizeFactors - - MeanList=rowMeans(DataList.unlist.dvd) - -############### -# Input R -############### - if (!is.null(RInput)){ - - RNoZero=RInput[NotAllZeroNames] - names(RNoZero)=rownames(Data) - RNoZero.order=RNoZero[rownames(DataList.unlist)] - if(length(sizeFactors)==ncol(Data)){ - RMat= outer(RNoZero.order, sizeFactors) - } - if(length(sizeFactors)==length(Data)){ - RMat= RNoZero.order* sizeFactors - } - - DataListSP=vector("list",nlevels(Conditions)) - RMatSP=vector("list",nlevels(Conditions)) - - for (lv in 1:nlevels(Conditions)){ - DataListSP[[lv]]= matrix(DataList.unlist[,Conditions==levels(Conditions)[lv]],nrow=dim(DataList.unlist)[1]) - rownames(DataListSP[[lv]])=rownames(DataList.unlist) - RMatSP[[lv]]= matrix(RMat[,Conditions==levels(Conditions)[lv]],nrow=dim(RMat)[1]) - rownames(RMatSP[[lv]])=rownames(RMat) - - } - - F0Log=f0(Input=DataList.unlist, AlphaIn=Alpha, BetaIn=Beta, - EmpiricalR=RMat, NumOfGroups=NumEachGroup, log=T) - F1Log=f1(Input1=DataListSP[[1]], Input2=DataListSP[[2]], - AlphaIn=Alpha, BetaIn=Beta, EmpiricalRSP1=RMatSP[[1]], - EmpiricalRSP2=RMatSP[[2]], NumOfGroup=NumEachGroup, log=T) - F0LogMdf=F0Log+600 - F1LogMdf=F1Log+600 - F0Mdf=exp(F0LogMdf) - F1Mdf=exp(F1LogMdf) - if(!is.null(PInput)){ - z.list=PInput*F1Mdf/(PInput*F1Mdf+(1-PInput)*F0Mdf) - PIn=PInput - } - if(is.null(PInput)){ - PIn=.5 - 
PInput=rep(NULL,maxround) - for(i in 1:maxround){ - z.list=PIn*F1Mdf/(PIn*F1Mdf+(1-PIn)*F0Mdf) - zNaNName=names(z.list)[is.na(z.list)] - zGood=which(!is.na(z.list)) - PIn=sum(z.list[zGood])/length(z.list[zGood]) - PInput[i]=PIn - } - - zNaNName=names(z.list)[is.na(z.list)] - if(length(zNaNName)!=0){ - PNotIn=rep(1-ApproxVal,length(zNaNName)) - MeanList.NotIn=MeanList[zNaNName] - R.NotIn.raw=MeanList.NotIn*PNotIn/(1-PNotIn) - if(length(sizeFactors)==ncol(Data)) - R.NotIn=outer(R.NotIn.raw,sizeFactors) - if(length(sizeFactors)==length(Data)) - R.NotIn=R.NotIn.raw*sizeFactors[zNaNName,] - R.NotIn1=matrix(R.NotIn[,Conditions==levels(Conditions)[1]],nrow=nrow(R.NotIn)) - R.NotIn2=matrix(R.NotIn[,Conditions==levels(Conditions)[2]],nrow=nrow(R.NotIn)) - NumOfEachGroupNA=sapply(1:NoneZeroLength, function(i)sum(zNaNName%in%rownames(DataList[[i]]))) - F0LogNA=f0(matrix(DataList.unlist[zNaNName,],ncol=ncol(DataList.unlist)), Alpha, Beta, R.NotIn, NumOfEachGroupNA, log=T) - F1LogNA=f1(matrix(DataListSP[[1]][zNaNName,],ncol=ncol(DataListSP[[1]])), - matrix(DataListSP[[2]][zNaNName,],ncol=ncol(DataListSP[[2]])), - Alpha, Beta, R.NotIn1,R.NotIn2, NumOfEachGroupNA, log=T) - F0LogMdfNA=F0LogNA+600 - F1LogMdfNA=F1LogNA+600 - F0MdfNA=exp(F0LogMdfNA) - F1MdfNA=exp(F1LogMdfNA) - z.list.NotIn=PIn*F1MdfNA/(PIn*F1MdfNA+(1-PIn)*F0MdfNA) - z.list[zNaNName]=z.list.NotIn - F0Log[zNaNName]=F0LogNA - F1Log[zNaNName]=F1LogNA - } - } - RealName.Z.output=z.list - RealName.F0=F0Log - RealName.F1=F1Log - names(RealName.Z.output)=IsoNamesIn - names(RealName.F0)=IsoNamesIn - names(RealName.F1)=IsoNamesIn - - - output=list(Alpha=Alpha,Beta=Beta,P=PInput, Z=RealName.Z.output, - PPDE=RealName.Z.output,f0=RealName.F0, f1=RealName.F1) - return(output) - - } - - - # Get FC and VarPool for pooling - Only works on 2 conditions - if(ncol(Data)==2){ - DataforPoolSP.dvd1=matrix(DataList.unlist.dvd[,Conditions==levels(Conditions)[1]],nrow=dim(DataList.unlist)[1]) - 
DataforPoolSP.dvd2=matrix(DataList.unlist.dvd[,Conditions==levels(Conditions)[2]],nrow=dim(DataList.unlist)[1]) - MeanforPoolSP.dvd1=rowMeans(DataforPoolSP.dvd1) - MeanforPoolSP.dvd2=rowMeans(DataforPoolSP.dvd2) - FCforPool=MeanforPoolSP.dvd1/MeanforPoolSP.dvd2 - names(FCforPool)=rownames(Data) - FC_Use=which(FCforPool>=quantile(FCforPool[!is.na(FCforPool)],PoolLower) & - FCforPool<=quantile(FCforPool[!is.na(FCforPool)],PoolUpper)) - - Var_FC_Use=apply( DataList.unlist.dvd[FC_Use,],1,var ) - Mean_FC_Use=(MeanforPoolSP.dvd1[FC_Use]+MeanforPoolSP.dvd2[FC_Use])/2 - MeanforPool=(MeanforPoolSP.dvd1+MeanforPoolSP.dvd2)/2 - FC_Use2=which(Var_FC_Use>=Mean_FC_Use) - Var_FC_Use2=Var_FC_Use[FC_Use2] - Mean_FC_Use2=Mean_FC_Use[FC_Use2] - Phi=mean((Var_FC_Use2-Mean_FC_Use2)/Mean_FC_Use2^2) - VarEst= MeanforPool*(1+MeanforPool*Phi) - if(Print==T)message(paste("No Replicate - estimate phi",round(Phi,5), "\n")) - names(VarEst)=names(MeanforPoolSP.dvd1)= - names(MeanforPoolSP.dvd2)=rownames(DataList.unlist.dvd) - } - - #DataListSP Here also unlist.. 
Only two lists - DataListSP=vector("list",nlevels(Conditions)) - DataListSP.dvd=vector("list",nlevels(Conditions)) - SizeFSP=DataListSP - MeanSP=DataListSP - VarSP=DataListSP - GetPSP=DataListSP - RSP=DataListSP - CISP=DataListSP - tauSP=DataListSP - NumSampleEachCon=rep(NULL,nlevels(Conditions)) - - for (lv in 1:nlevels(Conditions)){ - DataListSP[[lv]]= matrix(DataList.unlist[,Conditions==levels(Conditions)[lv]],nrow=dim(DataList.unlist)[1]) - rownames(DataListSP[[lv]])=rownames(DataList.unlist) - DataListSP.dvd[[lv]]= matrix(DataList.unlist.dvd[,Conditions==levels(Conditions)[lv]],nrow=dim(DataList.unlist.dvd)[1]) - NumSampleEachCon[lv]=ncol(DataListSP[[lv]]) - - if(ncol(DataListSP[[lv]])==1 & !is.null(CI)){ - CISP[[lv]]=matrix(CI[,Conditions==levels(Conditions)[lv]],nrow=dim(DataList.unlist.dvd)[1]) - tauSP[[lv]]=matrix(tau[,Conditions==levels(Conditions)[lv]],nrow=dim(DataList.unlist.dvd)[1]) - } - # no matter sizeFactors is a vector or a matrix. Matrix should be columns are the normalization factors - # may input one for each - if(length(sizeFactors)==ncol(Data))SizeFSP[[lv]]=sizeFactors[Conditions==levels(Conditions)[lv]] - if(length(sizeFactors)==length(Data))SizeFSP[[lv]]=sizeFactors[,Conditions==levels(Conditions)[lv]] - - - MeanSP[[lv]]=rowMeans(DataListSP.dvd[[lv]]) - names(MeanSP[[lv]])=rownames(DataListSP[[lv]]) - - if(length(sizeFactors)==ncol(Data))PrePareVar=sapply(1:ncol( DataListSP[[lv]]),function(i)( DataListSP[[lv]][,i]- SizeFSP[[lv]][i]*MeanSP[[lv]])^2 /SizeFSP[[lv]][i]) - if(length(sizeFactors)==length(Data))PrePareVar=sapply(1:ncol( DataListSP[[lv]]),function(i)( DataListSP[[lv]][,i]- SizeFSP[[lv]][,i]*MeanSP[[lv]])^2 /SizeFSP[[lv]][,i]) - - if(ncol(DataListSP[[lv]])==1 & !is.null(CI)) - VarSP[[lv]]=as.vector(((DataListSP[[lv]]/tauSP[[lv]]) * CISP[[lv]]/(CIthre*2))^2) - if(ncol(DataListSP[[lv]])!=1){ - VarSP[[lv]]=rowSums(PrePareVar)/ncol( DataListSP[[lv]]) - names(MeanSP[[lv]])=rownames(DataList.unlist) - 
names(VarSP[[lv]])=rownames(DataList.unlist) - GetPSP[[lv]]=MeanSP[[lv]]/VarSP[[lv]] - RSP[[lv]]=MeanSP[[lv]]*GetPSP[[lv]]/(1-GetPSP[[lv]]) - } -} - - - VarList=apply(DataList.unlist.dvd, 1, var) - if(ncol(Data)==2){ - PoolVar=VarEst - VarSP[[1]]=VarSP[[2]]=VarEst - GetPSP[[1]]=MeanSP[[1]]/VarEst - GetPSP[[2]]=MeanSP[[2]]/VarEst - - } - if(!ncol(Data)==2){ - CondWithRep=which(NumSampleEachCon>1) - VarCondWithRep=do.call(cbind,VarSP[CondWithRep]) - PoolVar=rowMeans(VarCondWithRep) - - } - GetP=MeanList/PoolVar - - EmpiricalRList=MeanList*GetP/(1-GetP) - EmpiricalRList[EmpiricalRList==Inf] =max(EmpiricalRList[EmpiricalRList!=Inf]) -##################### - if(ncol(Data)!=2){ - Varcbind=do.call(cbind,VarSP) - VarrowMin=apply(Varcbind,1,min) - } - - if(ncol(Data)==2){ - Varcbind=VarEst - VarrowMin=VarEst - VarSP[[1]]=VarSP[[2]]=VarEst - names(MeanSP[[1]])=names(VarSP[[1]]) - names(MeanSP[[2]])=names(VarSP[[2]]) - } - # - # - GoodData=names(MeanList)[EmpiricalRList>0 & VarrowMin!=0 & EmpiricalRList!=Inf & !is.na(VarrowMin) & !is.na(EmpiricalRList)] - NotIn=names(MeanList)[EmpiricalRList<=0 | VarrowMin==0 | EmpiricalRList==Inf | is.na(VarrowMin) | is.na(EmpiricalRList)] - #print(paste("ZeroVar",sum(VarrowMin==0), "InfR", length(which(EmpiricalRList==Inf)), "Poi", length(which(EmpiricalRList<0)), "")) - EmpiricalRList.NotIn=EmpiricalRList[NotIn] - EmpiricalRList.Good=EmpiricalRList[GoodData] - EmpiricalRList.Good[EmpiricalRList.Good<1]=1+EmpiricalRList.Good[EmpiricalRList.Good<1] - if(length(sizeFactors)==ncol(Data)){ - EmpiricalRList.Good.mat= outer(EmpiricalRList.Good, sizeFactors) - EmpiricalRList.mat= outer(EmpiricalRList, sizeFactors) - } - if(length(sizeFactors)==length(Data)){ - EmpiricalRList.Good.mat=EmpiricalRList.Good* sizeFactors[GoodData,] - EmpiricalRList.mat=EmpiricalRList* sizeFactors - } - - # Only Use Data has Good q's - DataList.In=sapply(1:NoneZeroLength, function(i)DataList[[i]][GoodData[GoodData%in%rownames(DataList[[i]])],],simplify=F) - 
DataList.NotIn=sapply(1:NoneZeroLength, function(i)DataList[[i]][NotIn[NotIn%in%rownames(DataList[[i]])],],simplify=F) - DataListIn.unlist=do.call(rbind, DataList.In) - DataListNotIn.unlist=do.call(rbind, DataList.NotIn) - - DataListSPIn=vector("list",nlevels(Conditions)) - DataListSPNotIn=vector("list",nlevels(Conditions)) - EmpiricalRList.Good.mat.SP=EmpiricalRList.mat.SP=vector("list",nlevels(Conditions)) - for (lv in 1:nlevels(Conditions)){ - DataListSPIn[[lv]]= matrix(DataListIn.unlist[,Conditions==levels(Conditions)[lv]],nrow=dim(DataListIn.unlist)[1]) - if(length(NotIn)>0){ - DataListSPNotIn[[lv]]= matrix(DataListNotIn.unlist[,Conditions==levels(Conditions)[lv]],nrow=dim(DataListNotIn.unlist)[1]) - rownames(DataListSPNotIn[[lv]])=rownames(DataListNotIn.unlist) - } - rownames(DataListSPIn[[lv]])=rownames(DataListIn.unlist) - EmpiricalRList.Good.mat.SP[[lv]]=matrix(EmpiricalRList.Good.mat[,Conditions==levels(Conditions)[lv]],nrow=dim(EmpiricalRList.Good.mat)[1]) - EmpiricalRList.mat.SP[[lv]]=matrix(EmpiricalRList.mat[,Conditions==levels(Conditions)[lv]],nrow=dim(EmpiricalRList.mat)[1]) - } - - NumOfEachGroupIn=sapply(1:NoneZeroLength, function(i)max(0,dim(DataList.In[[i]])[1])) - NumOfEachGroupNotIn=sapply(1:NoneZeroLength, function(i)max(0,dim(DataList.NotIn[[i]])[1])) - - -################# -# For output -################# -RealName.EmpiricalRList=sapply(1:NoneZeroLength,function(i)EmpiricalRList[names(EmpiricalRList)%in%NameList[[i]]], simplify=F) -RealName.MeanList=sapply(1:NoneZeroLength,function(i)MeanList[names(MeanList)%in%NameList[[i]]], simplify=F) -RealName.C1MeanList=sapply(1:NoneZeroLength,function(i)MeanSP[[1]][names(MeanSP[[1]])%in%NameList[[i]]], simplify=F) -RealName.C2MeanList=sapply(1:NoneZeroLength,function(i)MeanSP[[2]][names(MeanSP[[2]])%in%NameList[[i]]], simplify=F) -RealName.C1VarList=sapply(1:NoneZeroLength,function(i)VarSP[[1]][names(VarSP[[1]])%in%NameList[[i]]], simplify=F) 
-RealName.C2VarList=sapply(1:NoneZeroLength,function(i)VarSP[[2]][names(VarSP[[2]])%in%NameList[[i]]], simplify=F) -RealName.DataList=sapply(1:NoneZeroLength,function(i)DataList[[i]][rownames(DataList[[i]])%in%NameList[[i]],], simplify=F) - - - -RealName.VarList=sapply(1:NoneZeroLength,function(i)VarList[names(VarList)%in%NameList[[i]]], simplify=F) -RealName.PoolVarList=sapply(1:NoneZeroLength,function(i)PoolVar[names(PoolVar)%in%NameList[[i]]], simplify=F) - - -RealName.QList1=sapply(1:NoneZeroLength,function(i)GetPSP[[1]][names(GetPSP[[1]])%in%NameList[[i]]], simplify=F) -RealName.QList2=sapply(1:NoneZeroLength,function(i)GetPSP[[2]][names(GetPSP[[2]])%in%NameList[[i]]], simplify=F) - - -for (i in 1:NoneZeroLength){ -tmp=NameList[[i]] -names=IsoNamesIn[tmp] - -RealName.MeanList[[i]]=RealName.MeanList[[i]][NameList[[i]]] -RealName.VarList[[i]]=RealName.VarList[[i]][NameList[[i]]] -RealName.QList1[[i]]=RealName.QList1[[i]][NameList[[i]]] -RealName.QList2[[i]]=RealName.QList2[[i]][NameList[[i]]] -RealName.EmpiricalRList[[i]]=RealName.EmpiricalRList[[i]][NameList[[i]]] -RealName.C1MeanList[[i]]=RealName.C1MeanList[[i]][NameList[[i]]] -RealName.C2MeanList[[i]]=RealName.C2MeanList[[i]][NameList[[i]]] -RealName.PoolVarList[[i]]=RealName.PoolVarList[[i]][NameList[[i]]] -RealName.C1VarList[[i]]=RealName.C1VarList[[i]][NameList[[i]]] -RealName.C2VarList[[i]]=RealName.C2VarList[[i]][NameList[[i]]] -RealName.DataList[[i]]=RealName.DataList[[i]][NameList[[i]],] - -names(RealName.MeanList[[i]])=names -names(RealName.VarList[[i]])=names -if(ncol(DataListSP[[1]])!=1){ - names(RealName.QList1[[i]])=names - names(RealName.C1VarList[[i]])=names -} -if(ncol(DataListSP[[2]])!=1){ - names(RealName.QList2[[i]])=names - names(RealName.C2VarList[[i]])=names -} - -names(RealName.EmpiricalRList[[i]])=names -names(RealName.C1MeanList[[i]])=names -names(RealName.C2MeanList[[i]])=names -names(RealName.PoolVarList[[i]])=names -rownames(RealName.DataList[[i]])=names - - -} - 
-##################### -# If Don need EM -##################### - if(!is.null(Alpha)&!is.null(Beta)){ - F0Log=f0(Input=DataList.unlist, AlphaIn=Alpha, BetaIn=Beta, - EmpiricalR=EmpiricalRList.mat, NumOfGroups=NumEachGroup, log=T) - F1Log=f1(Input1=DataListSP[[1]], Input2=DataListSP[[2]], - AlphaIn=Alpha, BetaIn=Beta, EmpiricalRSP1=EmpiricalRList.mat.SP[[1]], - EmpiricalRSP2=EmpiricalRList.mat.SP[[2]], NumOfGroup=NumEachGroup, log=T) - F0LogMdf=F0Log+600 - F1LogMdf=F1Log+600 - F0Mdf=exp(F0LogMdf) - F1Mdf=exp(F1LogMdf) - if(!is.null(PInput)){ - z.list=PInput*F1Mdf/(PInput*F1Mdf+(1-PInput)*F0Mdf) - PIn=PInput - } - if(is.null(PInput)){ - PIn=.5 - PInput=rep(NULL,maxround) - for(i in 1:maxround){ - z.list=PIn*F1Mdf/(PIn*F1Mdf+(1-PIn)*F0Mdf) - zNaNName=names(z.list)[is.na(z.list)] - zGood=which(!is.na(z.list)) - PIn=sum(z.list[zGood])/length(z.list[zGood]) - PInput[i]=PIn - } - - zNaNName=names(z.list)[is.na(z.list)] - if(length(zNaNName)!=0){ - PNotIn=rep(1-ApproxVal,length(zNaNName)) - MeanList.NotIn=MeanList[zNaNName] - R.NotIn.raw=MeanList.NotIn*PNotIn/(1-PNotIn) - if(length(sizeFactors)==ncol(Data)) - R.NotIn=outer(R.NotIn.raw,sizeFactors) - if(length(sizeFactors)==length(Data)) - R.NotIn=R.NotIn.raw*sizeFactors[zNaNName,] - R.NotIn1=matrix(R.NotIn[,Conditions==levels(Conditions)[1]],nrow=nrow(R.NotIn)) - R.NotIn2=matrix(R.NotIn[,Conditions==levels(Conditions)[2]],nrow=nrow(R.NotIn)) - NumOfEachGroupNA=sapply(1:NoneZeroLength, function(i)sum(zNaNName%in%rownames(DataList[[i]]))) - F0LogNA=f0(matrix(DataList.unlist[zNaNName,], ncol=ncol(DataList.unlist)), Alpha, Beta, R.NotIn, NumOfEachGroupNA, log=T) - F1LogNA=f1(matrix(DataListSP[[1]][zNaNName,],ncol=ncol(DataListSP[[1]])), - matrix(DataListSP[[2]][zNaNName,],ncol=ncol(DataListSP[[2]])), - Alpha, Beta, R.NotIn1,R.NotIn2, NumOfEachGroupNA, log=T) - F0LogMdfNA=F0LogNA+600 - F1LogMdfNA=F1LogNA+600 - F0MdfNA=exp(F0LogMdfNA) - F1MdfNA=exp(F1LogMdfNA) - z.list.NotIn=PIn*F1MdfNA/(PIn*F1MdfNA+(1-PIn)*F0MdfNA) - 
z.list[zNaNName]=z.list.NotIn - F0Log[zNaNName]=F0LogNA - F1Log[zNaNName]=F1LogNA - } - } - RealName.Z.output=z.list - RealName.F0=F0Log - RealName.F1=F1Log - names(RealName.Z.output)=IsoNamesIn - names(RealName.F0)=IsoNamesIn - names(RealName.F1)=IsoNamesIn - - - output=list(Alpha=Alpha,Beta=Beta,P=PInput, Z=RealName.Z.output, - RList=RealName.EmpiricalRList, MeanList=RealName.MeanList, - VarList=RealName.VarList, QList1=RealName.QList1, QList2=RealName.QList2, - C1Mean=RealName.C1MeanList, C2Mean=RealName.C2MeanList, - C1EstVar=RealName.C1VarList, C2EstVar=RealName.C2VarList, - PoolVar=RealName.PoolVarList , DataList=RealName.DataList, - PPDE=RealName.Z.output,f0=RealName.F0, f1=RealName.F1) - return(output) - } - - -##################### -#Initialize SigIn & ... -##################### - AlphaIn=0.5 - BetaIn=rep(0.5,NoneZeroLength) - PIn=0.5 - - -##################### -# EM -##################### - UpdateAlpha=NULL - UpdateBeta=NULL - UpdateP=NULL - UpdatePFromZ=NULL - Timeperround=NULL - for (times in 1:maxround){ - temptime1=proc.time() - UpdateOutput=suppressWarnings(LogN(DataListIn.unlist,DataListSPIn, EmpiricalRList.Good.mat ,EmpiricalRList.Good.mat.SP, NumOfEachGroupIn, AlphaIn, BetaIn, PIn, NoneZeroLength)) - message(paste("iteration", times, "done \n",sep=" ")) - AlphaIn=UpdateOutput$AlphaNew - BetaIn=UpdateOutput$BetaNew - PIn=UpdateOutput$PNew - PFromZ=UpdateOutput$PFromZ - F0Out=UpdateOutput$F0Out - F1Out=UpdateOutput$F1Out - UpdateAlpha=rbind(UpdateAlpha,AlphaIn) - UpdateBeta=rbind(UpdateBeta,BetaIn) - UpdateP=rbind(UpdateP,PIn) - UpdatePFromZ=rbind(UpdatePFromZ,PFromZ) - temptime2=proc.time() - Timeperround=c(Timeperround,temptime2[3]-temptime1[3]) - message(paste("time" ,round(Timeperround[times],2),"\n",sep=" ")) - Z.output=UpdateOutput$ZNew.list[!is.na(UpdateOutput$ZNew.list)] - Z.NA.Names=UpdateOutput$zNaNName - } - #Remove this } after testing!! 
- -# if (times!=1){ -# if((UpdateAlpha[times]-UpdateAlpha[times-1])^2+UpdateBeta[times]-UpdateBeta[times-1])^2+UpdateR[times]-UpdateR[times-1])^2+UpdateP[times]-UpdateP[times-1])^2<=10^(-6)){ -# Result=list(Sig=SigIn, Miu=MiuIn, Tau=TauIn) -# break -# } -# } -#} - -##########Change Names############ -## Only z are for Good Ones -GoodData=GoodData[!GoodData%in%Z.NA.Names] -IsoNamesIn.Good=IsoNamesIn[GoodData] -RealName.Z.output=Z.output -RealName.F0=F0Out -RealName.F1=F1Out -names(RealName.Z.output)=IsoNamesIn.Good -names(RealName.F0)=IsoNamesIn.Good -names(RealName.F1)=IsoNamesIn.Good - - - -#########posterior part for other data set here later############ -AllNA=unique(c(Z.NA.Names,NotIn)) -z.list.NotIn=NULL -AllF0=c(RealName.F0) -AllF1=c(RealName.F1) -AllZ=RealName.Z.output - -if (length(AllNA)>0){ - Ng.NA=NgVector[AllNA] - AllNA.Ngorder=AllNA[order(Ng.NA)] - NumOfEachGroupNA=rep(0,NoneZeroLength) - NumOfEachGroupNA.tmp=tapply(Ng.NA,Ng.NA,length) - names(NumOfEachGroupNA)=c(1:NoneZeroLength) - NumOfEachGroupNA[names(NumOfEachGroupNA.tmp)]=NumOfEachGroupNA.tmp - PNotIn=rep(1-ApproxVal,length(AllNA.Ngorder)) - MeanList.NotIn=MeanList[AllNA.Ngorder] - R.NotIn.raw=MeanList.NotIn*PNotIn/(1-PNotIn) - if(length(sizeFactors)==ncol(Data)) - R.NotIn=outer(R.NotIn.raw,sizeFactors) - if(length(sizeFactors)==length(Data)) - R.NotIn=R.NotIn.raw*sizeFactors[NotIn,] - R.NotIn1=matrix(R.NotIn[,Conditions==levels(Conditions)[1]],nrow=nrow(R.NotIn)) - R.NotIn2=matrix(R.NotIn[,Conditions==levels(Conditions)[2]],nrow=nrow(R.NotIn)) - - DataListNotIn.unlistWithZ=matrix(DataList.unlist[AllNA.Ngorder,],nrow=length(AllNA.Ngorder)) - DataListSPNotInWithZ=vector("list",nlevels(Conditions)) - for (lv in 1:nlevels(Conditions)) - DataListSPNotInWithZ[[lv]] = matrix(DataListSP[[lv]][AllNA.Ngorder,],nrow=length(AllNA.Ngorder)) - F0Log=f0(DataListNotIn.unlistWithZ, AlphaIn, BetaIn, R.NotIn, NumOfEachGroupNA, log=T) - F1Log=f1(DataListSPNotInWithZ[[1]], DataListSPNotInWithZ[[2]], AlphaIn, BetaIn, 
R.NotIn1,R.NotIn2, NumOfEachGroupNA, log=T) - F0LogMdf=F0Log+600 - F1LogMdf=F1Log+600 - F0Mdf=exp(F0LogMdf) - F1Mdf=exp(F1LogMdf) - z.list.NotIn=PIn*F1Mdf/(PIn*F1Mdf+(1-PIn)*F0Mdf) -# names(z.list.NotIn)=IsoNamesIn.Good=IsoNamesIn[which(Names%in%NotIn)] - names(z.list.NotIn)=IsoNamesIn[AllNA.Ngorder] - - AllZ=c(RealName.Z.output,z.list.NotIn) - AllZ=AllZ[IsoNamesIn] - AllZ[is.na(AllZ)]=0 - F0.NotIn=F0Log - F1.NotIn=F1Log - names(F0.NotIn)=IsoNamesIn[names(F0Log)] - names(F1.NotIn)=IsoNamesIn[names(F1Log)] - AllF0=c(RealName.F0,F0.NotIn) - AllF1=c(RealName.F1,F1.NotIn) - AllF0=AllF0[IsoNamesIn] - AllF1=AllF1[IsoNamesIn] - AllF0[is.na(AllF0)]=0 - AllF1[is.na(AllF1)]=0 -} -PPMatNZ=cbind(1-AllZ,AllZ) -colnames(PPMatNZ)=c("PPEE","PPDE") -rownames(UpdateAlpha)=paste("iter",1:nrow(UpdateAlpha),sep="") -rownames(UpdateBeta)=paste("iter",1:nrow(UpdateBeta),sep="") -rownames(UpdateP)=paste("iter",1:nrow(UpdateP),sep="") -rownames(UpdatePFromZ)=paste("iter",1:nrow(UpdatePFromZ),sep="") -colnames(UpdateBeta)=paste("Ng",1:ncol(UpdateBeta),sep="") - -CondOut=levels(Conditions) -names(CondOut)=paste("Condition",c(1:length(CondOut)),sep="") - -PPMat=matrix(NA,ncol=2,nrow=nrow(Dataraw)) -rownames(PPMat)=rownames(Dataraw) -colnames(PPMat)=c("PPEE","PPDE") -if(is.null(AllZeroNames))PPMat=PPMatNZ -if(!is.null(AllZeroNames))PPMat[names(NotAllZeroNames),]=PPMatNZ[names(NotAllZeroNames),] - - -#############Result############################ -Result=list(Alpha=UpdateAlpha,Beta=UpdateBeta,P=UpdateP, - PFromZ=UpdatePFromZ, Z=RealName.Z.output,PoissonZ=z.list.NotIn, - RList=RealName.EmpiricalRList, MeanList=RealName.MeanList, - VarList=RealName.VarList, QList1=RealName.QList1, QList2=RealName.QList2, - C1Mean=RealName.C1MeanList, C2Mean=RealName.C2MeanList,C1EstVar=RealName.C1VarList, - C2EstVar=RealName.C2VarList, PoolVar=RealName.PoolVarList , - DataList=RealName.DataList,PPDE=AllZ,f0=AllF0, f1=AllF1, - AllZeroIndex=AllZeroNames,PPMat=PPMatNZ, PPMatWith0=PPMat, - ConditionOrder=CondOut, 
Conditions=Conditions) -} - diff --git a/.svn/pristine/c7/c779b9b915a419641ffc480470360b635903584f.svn-base b/.svn/pristine/c7/c779b9b915a419641ffc480470360b635903584f.svn-base deleted file mode 100644 index 9c74bdf..0000000 --- a/.svn/pristine/c7/c779b9b915a419641ffc480470360b635903584f.svn-base +++ /dev/null @@ -1,592 +0,0 @@ -EBTest <- -function(Data,NgVector=NULL,Conditions, sizeFactors, maxround, Pool=F, NumBin=1000,ApproxVal=10^-10, Alpha=NULL, Beta=NULL,PInput=NULL,RInput=NULL,PoolLower=.25, PoolUpper=.75,Print=T, Qtrm=1,QtrmCut=0) -{ - if(!is.factor(Conditions))Conditions=as.factor(Conditions) - if(is.null(rownames(Data)))stop("Please add gene/isoform names to the data matrix") - - if(!is.matrix(Data))stop("The input Data is not a matrix") - if(length(Conditions)!=ncol(Data))stop("The number of conditions is not the same as the number of samples! ") - if(nlevels(Conditions)>2)stop("More than 2 conditions! Please use EBMultiTest() function") - if(nlevels(Conditions)<2)stop("Less than 2 conditions - Please check your input") - if(length(sizeFactors)!=length(Data) & length(sizeFactors)!=ncol(Data)) - stop("The number of library size factors is not the same as the number of samples!") - - Conditions=as.factor(Conditions) - Vect5End=Vect3End=CI=CIthre=tau=NULL - Dataraw=Data - - #Normalized - DataNorm=GetNormalizedMat(Data, sizeFactors) - Levels=levels(as.factor(Conditions)) - - # Dixon Statistics -# library(outliers) -# normalized matrix for each condition -# matC=sapply(1:length(Levels),function(i)DataNorm[,which(Conditions==Levels[i])]) -# run dixon test for each isoform within condition -# DixonP=sapply(1:length(matC),function(j) -# apply(DataNorm,1,function(i){ -# if(mean(i)==0)out=NA -# else out=dixon.test(i)$p.value -# out})) - - - QuantileFor0=apply(DataNorm,1,function(i)quantile(i,Qtrm)) - AllZeroNames=which(QuantileFor0<=QtrmCut) - NotAllZeroNames=which(QuantileFor0>QtrmCut) - if(length(AllZeroNames)>0 & Print==T) - cat(paste0("Removing transcripts 
with ",Qtrm*100, - " th quantile < = ",QtrmCut," \n", - length(NotAllZeroNames)," transcripts will be tested\n")) - if(length(NotAllZeroNames)==0)stop("0 transcript passed") - Data=Data[NotAllZeroNames,] - if(!is.null(NgVector))NgVector=NgVector[NotAllZeroNames] - if(length(sizeFactors)==length(Data))sizeFactors=sizeFactors[NotAllZeroNames,] - if(is.null(NgVector))NgVector=rep(1,nrow(Data)) - - #Rename Them - IsoNamesIn=rownames(Data) - Names=paste("I",c(1:dim(Data)[1]),sep="") - names(IsoNamesIn)=Names - rownames(Data)=paste("I",c(1:dim(Data)[1]),sep="") - names(NgVector)=paste("I",c(1:dim(Data)[1]),sep="") - - - if(length(sizeFactors)==length(Data)){ - rownames(sizeFactors)=rownames(Data) - colnames(sizeFactors)=Conditions - } - - NumOfNg=nlevels(as.factor(NgVector)) - NameList=sapply(1:NumOfNg,function(i)Names[NgVector==i],simplify=F) - names(NameList)=paste("Ng",c(1:NumOfNg),sep="") - NotNone=NULL - for (i in 1:NumOfNg) { - if (length(NameList[[i]])!=0) - NotNone=c(NotNone,names(NameList)[i]) - } - NameList=NameList[NotNone] - - NoneZeroLength=length(NameList) - DataList=vector("list",NoneZeroLength) - DataList=sapply(1:NoneZeroLength , function(i) Data[NameList[[i]],],simplify=F) - names(DataList)=names(NameList) - - NumEachGroup=sapply(1:NoneZeroLength , function(i)dim(DataList[[i]])[1]) - # Unlist - DataList.unlist=do.call(rbind, DataList) - - # Divide by SampleSize factor - - if(length(sizeFactors)==ncol(Data)) - DataList.unlist.dvd=t(t( DataList.unlist)/sizeFactors) - - if(length(sizeFactors)==length(Data)) - DataList.unlist.dvd=DataList.unlist/sizeFactors - - MeanList=rowMeans(DataList.unlist.dvd) - -############### -# Input R -############### - if (!is.null(RInput)){ - - RNoZero=RInput[NotAllZeroNames] - names(RNoZero)=rownames(Data) - RNoZero.order=RNoZero[rownames(DataList.unlist)] - if(length(sizeFactors)==ncol(Data)){ - RMat= outer(RNoZero.order, sizeFactors) - } - if(length(sizeFactors)==length(Data)){ - RMat= RNoZero.order* sizeFactors - } - - 
DataListSP=vector("list",nlevels(Conditions)) - RMatSP=vector("list",nlevels(Conditions)) - - for (lv in 1:nlevels(Conditions)){ - DataListSP[[lv]]= matrix(DataList.unlist[,Conditions==levels(Conditions)[lv]],nrow=dim(DataList.unlist)[1]) - rownames(DataListSP[[lv]])=rownames(DataList.unlist) - RMatSP[[lv]]= matrix(RMat[,Conditions==levels(Conditions)[lv]],nrow=dim(RMat)[1]) - rownames(RMatSP[[lv]])=rownames(RMat) - - } - - F0Log=f0(Input=DataList.unlist, AlphaIn=Alpha, BetaIn=Beta, - EmpiricalR=RMat, NumOfGroups=NumEachGroup, log=T) - F1Log=f1(Input1=DataListSP[[1]], Input2=DataListSP[[2]], - AlphaIn=Alpha, BetaIn=Beta, EmpiricalRSP1=RMatSP[[1]], - EmpiricalRSP2=RMatSP[[2]], NumOfGroup=NumEachGroup, log=T) - F0LogMdf=F0Log+600 - F1LogMdf=F1Log+600 - F0Mdf=exp(F0LogMdf) - F1Mdf=exp(F1LogMdf) - if(!is.null(PInput)){ - z.list=PInput*F1Mdf/(PInput*F1Mdf+(1-PInput)*F0Mdf) - PIn=PInput - } - if(is.null(PInput)){ - PIn=.5 - PInput=rep(NULL,maxround) - for(i in 1:maxround){ - z.list=PIn*F1Mdf/(PIn*F1Mdf+(1-PIn)*F0Mdf) - zNaNName=names(z.list)[is.na(z.list)] - zGood=which(!is.na(z.list)) - PIn=sum(z.list[zGood])/length(z.list[zGood]) - PInput[i]=PIn - } - - zNaNName=names(z.list)[is.na(z.list)] - if(length(zNaNName)!=0){ - PNotIn=rep(1-ApproxVal,length(zNaNName)) - MeanList.NotIn=MeanList[zNaNName] - R.NotIn.raw=MeanList.NotIn*PNotIn/(1-PNotIn) - if(length(sizeFactors)==ncol(Data)) - R.NotIn=outer(R.NotIn.raw,sizeFactors) - if(length(sizeFactors)==length(Data)) - R.NotIn=R.NotIn.raw*sizeFactors[zNaNName,] - R.NotIn1=matrix(R.NotIn[,Conditions==levels(Conditions)[1]],nrow=nrow(R.NotIn)) - R.NotIn2=matrix(R.NotIn[,Conditions==levels(Conditions)[2]],nrow=nrow(R.NotIn)) - NumOfEachGroupNA=sapply(1:NoneZeroLength, function(i)sum(zNaNName%in%rownames(DataList[[i]]))) - F0LogNA=f0(matrix(DataList.unlist[zNaNName,],ncol=ncol(DataList.unlist)), Alpha, Beta, R.NotIn, NumOfEachGroupNA, log=T) - F1LogNA=f1(matrix(DataListSP[[1]][zNaNName,],ncol=ncol(DataListSP[[1]])), - 
matrix(DataListSP[[2]][zNaNName,],ncol=ncol(DataListSP[[2]])), - Alpha, Beta, R.NotIn1,R.NotIn2, NumOfEachGroupNA, log=T) - F0LogMdfNA=F0LogNA+600 - F1LogMdfNA=F1LogNA+600 - F0MdfNA=exp(F0LogMdfNA) - F1MdfNA=exp(F1LogMdfNA) - z.list.NotIn=PIn*F1MdfNA/(PIn*F1MdfNA+(1-PIn)*F0MdfNA) - z.list[zNaNName]=z.list.NotIn - F0Log[zNaNName]=F0LogNA - F1Log[zNaNName]=F1LogNA - } - } - RealName.Z.output=z.list - RealName.F0=F0Log - RealName.F1=F1Log - names(RealName.Z.output)=IsoNamesIn - names(RealName.F0)=IsoNamesIn - names(RealName.F1)=IsoNamesIn - - - output=list(Alpha=Alpha,Beta=Beta,P=PInput, Z=RealName.Z.output, - PPDE=RealName.Z.output,f0=RealName.F0, f1=RealName.F1) - return(output) - - } - - - # Get FC and VarPool for pooling - Only works on 2 conditions - if(ncol(Data)==2){ - DataforPoolSP.dvd1=matrix(DataList.unlist.dvd[,Conditions==levels(Conditions)[1]],nrow=dim(DataList.unlist)[1]) - DataforPoolSP.dvd2=matrix(DataList.unlist.dvd[,Conditions==levels(Conditions)[2]],nrow=dim(DataList.unlist)[1]) - MeanforPoolSP.dvd1=rowMeans(DataforPoolSP.dvd1) - MeanforPoolSP.dvd2=rowMeans(DataforPoolSP.dvd2) - FCforPool=MeanforPoolSP.dvd1/MeanforPoolSP.dvd2 - names(FCforPool)=rownames(Data) - FC_Use=which(FCforPool>=quantile(FCforPool[!is.na(FCforPool)],PoolLower) & - FCforPool<=quantile(FCforPool[!is.na(FCforPool)],PoolUpper)) - - Var_FC_Use=apply( DataList.unlist.dvd[FC_Use,],1,var ) - Mean_FC_Use=(MeanforPoolSP.dvd1[FC_Use]+MeanforPoolSP.dvd2[FC_Use])/2 - MeanforPool=(MeanforPoolSP.dvd1+MeanforPoolSP.dvd2)/2 - FC_Use2=which(Var_FC_Use>=Mean_FC_Use) - Var_FC_Use2=Var_FC_Use[FC_Use2] - Mean_FC_Use2=Mean_FC_Use[FC_Use2] - Phi=mean((Var_FC_Use2-Mean_FC_Use2)/Mean_FC_Use2^2) - VarEst= MeanforPool*(1+MeanforPool*Phi) - if(Print==T)message(paste("No Replicate - estimate phi",round(Phi,5), "\n")) - names(VarEst)=names(MeanforPoolSP.dvd1)= - names(MeanforPoolSP.dvd2)=rownames(DataList.unlist.dvd) - } - - #DataListSP Here also unlist.. 
Only two lists - DataListSP=vector("list",nlevels(Conditions)) - DataListSP.dvd=vector("list",nlevels(Conditions)) - SizeFSP=DataListSP - MeanSP=DataListSP - VarSP=DataListSP - GetPSP=DataListSP - RSP=DataListSP - CISP=DataListSP - tauSP=DataListSP - NumSampleEachCon=rep(NULL,nlevels(Conditions)) - - for (lv in 1:nlevels(Conditions)){ - DataListSP[[lv]]= matrix(DataList.unlist[,Conditions==levels(Conditions)[lv]],nrow=dim(DataList.unlist)[1]) - rownames(DataListSP[[lv]])=rownames(DataList.unlist) - DataListSP.dvd[[lv]]= matrix(DataList.unlist.dvd[,Conditions==levels(Conditions)[lv]],nrow=dim(DataList.unlist.dvd)[1]) - NumSampleEachCon[lv]=ncol(DataListSP[[lv]]) - - if(ncol(DataListSP[[lv]])==1 & !is.null(CI)){ - CISP[[lv]]=matrix(CI[,Conditions==levels(Conditions)[lv]],nrow=dim(DataList.unlist.dvd)[1]) - tauSP[[lv]]=matrix(tau[,Conditions==levels(Conditions)[lv]],nrow=dim(DataList.unlist.dvd)[1]) - } - # no matter sizeFactors is a vector or a matrix. Matrix should be columns are the normalization factors - # may input one for each - if(length(sizeFactors)==ncol(Data))SizeFSP[[lv]]=sizeFactors[Conditions==levels(Conditions)[lv]] - if(length(sizeFactors)==length(Data))SizeFSP[[lv]]=sizeFactors[,Conditions==levels(Conditions)[lv]] - - - MeanSP[[lv]]=rowMeans(DataListSP.dvd[[lv]]) - names(MeanSP[[lv]])=rownames(DataListSP[[lv]]) - - if(length(sizeFactors)==ncol(Data))PrePareVar=sapply(1:ncol( DataListSP[[lv]]),function(i)( DataListSP[[lv]][,i]- SizeFSP[[lv]][i]*MeanSP[[lv]])^2 /SizeFSP[[lv]][i]) - if(length(sizeFactors)==length(Data))PrePareVar=sapply(1:ncol( DataListSP[[lv]]),function(i)( DataListSP[[lv]][,i]- SizeFSP[[lv]][,i]*MeanSP[[lv]])^2 /SizeFSP[[lv]][,i]) - - if(ncol(DataListSP[[lv]])==1 & !is.null(CI)) - VarSP[[lv]]=as.vector(((DataListSP[[lv]]/tauSP[[lv]]) * CISP[[lv]]/(CIthre*2))^2) - if(ncol(DataListSP[[lv]])!=1){ - VarSP[[lv]]=rowSums(PrePareVar)/ncol( DataListSP[[lv]]) - names(MeanSP[[lv]])=rownames(DataList.unlist) - 
names(VarSP[[lv]])=rownames(DataList.unlist) - GetPSP[[lv]]=MeanSP[[lv]]/VarSP[[lv]] - RSP[[lv]]=MeanSP[[lv]]*GetPSP[[lv]]/(1-GetPSP[[lv]]) - } -} - - - VarList=apply(DataList.unlist.dvd, 1, var) - if(ncol(Data)==2){ - PoolVar=VarEst - VarSP[[1]]=VarSP[[2]]=VarEst - GetPSP[[1]]=MeanSP[[1]]/VarEst - GetPSP[[2]]=MeanSP[[2]]/VarEst - - } - if(!ncol(Data)==2){ - CondWithRep=which(NumSampleEachCon>1) - VarCondWithRep=do.call(cbind,VarSP[CondWithRep]) - PoolVar=rowMeans(VarCondWithRep) - - } - GetP=MeanList/PoolVar - - EmpiricalRList=MeanList*GetP/(1-GetP) - EmpiricalRList[EmpiricalRList==Inf] =max(EmpiricalRList[EmpiricalRList!=Inf]) -##################### - if(ncol(Data)!=2){ - Varcbind=do.call(cbind,VarSP) - VarrowMin=apply(Varcbind,1,min) - } - - if(ncol(Data)==2){ - Varcbind=VarEst - VarrowMin=VarEst - VarSP[[1]]=VarSP[[2]]=VarEst - names(MeanSP[[1]])=names(VarSP[[1]]) - names(MeanSP[[2]])=names(VarSP[[2]]) - } - # - # - GoodData=names(MeanList)[EmpiricalRList>0 & VarrowMin!=0 & EmpiricalRList!=Inf & !is.na(VarrowMin) & !is.na(EmpiricalRList)] - NotIn=names(MeanList)[EmpiricalRList<=0 | VarrowMin==0 | EmpiricalRList==Inf | is.na(VarrowMin) | is.na(EmpiricalRList)] - #print(paste("ZeroVar",sum(VarrowMin==0), "InfR", length(which(EmpiricalRList==Inf)), "Poi", length(which(EmpiricalRList<0)), "")) - EmpiricalRList.NotIn=EmpiricalRList[NotIn] - EmpiricalRList.Good=EmpiricalRList[GoodData] - EmpiricalRList.Good[EmpiricalRList.Good<1]=1+EmpiricalRList.Good[EmpiricalRList.Good<1] - if(length(sizeFactors)==ncol(Data)){ - EmpiricalRList.Good.mat= outer(EmpiricalRList.Good, sizeFactors) - EmpiricalRList.mat= outer(EmpiricalRList, sizeFactors) - } - if(length(sizeFactors)==length(Data)){ - EmpiricalRList.Good.mat=EmpiricalRList.Good* sizeFactors[GoodData,] - EmpiricalRList.mat=EmpiricalRList* sizeFactors - } - - # Only Use Data has Good q's - DataList.In=sapply(1:NoneZeroLength, function(i)DataList[[i]][GoodData[GoodData%in%rownames(DataList[[i]])],],simplify=F) - 
DataList.NotIn=sapply(1:NoneZeroLength, function(i)DataList[[i]][NotIn[NotIn%in%rownames(DataList[[i]])],],simplify=F) - DataListIn.unlist=do.call(rbind, DataList.In) - DataListNotIn.unlist=do.call(rbind, DataList.NotIn) - - DataListSPIn=vector("list",nlevels(Conditions)) - DataListSPNotIn=vector("list",nlevels(Conditions)) - EmpiricalRList.Good.mat.SP=EmpiricalRList.mat.SP=vector("list",nlevels(Conditions)) - for (lv in 1:nlevels(Conditions)){ - DataListSPIn[[lv]]= matrix(DataListIn.unlist[,Conditions==levels(Conditions)[lv]],nrow=dim(DataListIn.unlist)[1]) - if(length(NotIn)>0){ - DataListSPNotIn[[lv]]= matrix(DataListNotIn.unlist[,Conditions==levels(Conditions)[lv]],nrow=dim(DataListNotIn.unlist)[1]) - rownames(DataListSPNotIn[[lv]])=rownames(DataListNotIn.unlist) - } - rownames(DataListSPIn[[lv]])=rownames(DataListIn.unlist) - EmpiricalRList.Good.mat.SP[[lv]]=matrix(EmpiricalRList.Good.mat[,Conditions==levels(Conditions)[lv]],nrow=dim(EmpiricalRList.Good.mat)[1]) - EmpiricalRList.mat.SP[[lv]]=matrix(EmpiricalRList.mat[,Conditions==levels(Conditions)[lv]],nrow=dim(EmpiricalRList.mat)[1]) - } - - NumOfEachGroupIn=sapply(1:NoneZeroLength, function(i)max(0,dim(DataList.In[[i]])[1])) - NumOfEachGroupNotIn=sapply(1:NoneZeroLength, function(i)max(0,dim(DataList.NotIn[[i]])[1])) - - -################# -# For output -################# -RealName.EmpiricalRList=sapply(1:NoneZeroLength,function(i)EmpiricalRList[names(EmpiricalRList)%in%NameList[[i]]], simplify=F) -RealName.MeanList=sapply(1:NoneZeroLength,function(i)MeanList[names(MeanList)%in%NameList[[i]]], simplify=F) -RealName.C1MeanList=sapply(1:NoneZeroLength,function(i)MeanSP[[1]][names(MeanSP[[1]])%in%NameList[[i]]], simplify=F) -RealName.C2MeanList=sapply(1:NoneZeroLength,function(i)MeanSP[[2]][names(MeanSP[[2]])%in%NameList[[i]]], simplify=F) -RealName.C1VarList=sapply(1:NoneZeroLength,function(i)VarSP[[1]][names(VarSP[[1]])%in%NameList[[i]]], simplify=F) 
-RealName.C2VarList=sapply(1:NoneZeroLength,function(i)VarSP[[2]][names(VarSP[[2]])%in%NameList[[i]]], simplify=F) -RealName.DataList=sapply(1:NoneZeroLength,function(i)DataList[[i]][rownames(DataList[[i]])%in%NameList[[i]],], simplify=F) - - - -RealName.VarList=sapply(1:NoneZeroLength,function(i)VarList[names(VarList)%in%NameList[[i]]], simplify=F) -RealName.PoolVarList=sapply(1:NoneZeroLength,function(i)PoolVar[names(PoolVar)%in%NameList[[i]]], simplify=F) - - -RealName.QList1=sapply(1:NoneZeroLength,function(i)GetPSP[[1]][names(GetPSP[[1]])%in%NameList[[i]]], simplify=F) -RealName.QList2=sapply(1:NoneZeroLength,function(i)GetPSP[[2]][names(GetPSP[[2]])%in%NameList[[i]]], simplify=F) - - -for (i in 1:NoneZeroLength){ -tmp=NameList[[i]] -names=IsoNamesIn[tmp] - -RealName.MeanList[[i]]=RealName.MeanList[[i]][NameList[[i]]] -RealName.VarList[[i]]=RealName.VarList[[i]][NameList[[i]]] -RealName.QList1[[i]]=RealName.QList1[[i]][NameList[[i]]] -RealName.QList2[[i]]=RealName.QList2[[i]][NameList[[i]]] -RealName.EmpiricalRList[[i]]=RealName.EmpiricalRList[[i]][NameList[[i]]] -RealName.C1MeanList[[i]]=RealName.C1MeanList[[i]][NameList[[i]]] -RealName.C2MeanList[[i]]=RealName.C2MeanList[[i]][NameList[[i]]] -RealName.PoolVarList[[i]]=RealName.PoolVarList[[i]][NameList[[i]]] -RealName.C1VarList[[i]]=RealName.C1VarList[[i]][NameList[[i]]] -RealName.C2VarList[[i]]=RealName.C2VarList[[i]][NameList[[i]]] -RealName.DataList[[i]]=RealName.DataList[[i]][NameList[[i]],] - -names(RealName.MeanList[[i]])=names -names(RealName.VarList[[i]])=names -if(ncol(DataListSP[[1]])!=1){ - names(RealName.QList1[[i]])=names - names(RealName.C1VarList[[i]])=names -} -if(ncol(DataListSP[[2]])!=1){ - names(RealName.QList2[[i]])=names - names(RealName.C2VarList[[i]])=names -} - -names(RealName.EmpiricalRList[[i]])=names -names(RealName.C1MeanList[[i]])=names -names(RealName.C2MeanList[[i]])=names -names(RealName.PoolVarList[[i]])=names -rownames(RealName.DataList[[i]])=names - - -} - 
-##################### -# If Don need EM -##################### - if(!is.null(Alpha)&!is.null(Beta)){ - F0Log=f0(Input=DataList.unlist, AlphaIn=Alpha, BetaIn=Beta, - EmpiricalR=EmpiricalRList.mat, NumOfGroups=NumEachGroup, log=T) - F1Log=f1(Input1=DataListSP[[1]], Input2=DataListSP[[2]], - AlphaIn=Alpha, BetaIn=Beta, EmpiricalRSP1=EmpiricalRList.mat.SP[[1]], - EmpiricalRSP2=EmpiricalRList.mat.SP[[2]], NumOfGroup=NumEachGroup, log=T) - F0LogMdf=F0Log+600 - F1LogMdf=F1Log+600 - F0Mdf=exp(F0LogMdf) - F1Mdf=exp(F1LogMdf) - if(!is.null(PInput)){ - z.list=PInput*F1Mdf/(PInput*F1Mdf+(1-PInput)*F0Mdf) - PIn=PInput - } - if(is.null(PInput)){ - PIn=.5 - PInput=rep(NULL,maxround) - for(i in 1:maxround){ - z.list=PIn*F1Mdf/(PIn*F1Mdf+(1-PIn)*F0Mdf) - zNaNName=names(z.list)[is.na(z.list)] - zGood=which(!is.na(z.list)) - PIn=sum(z.list[zGood])/length(z.list[zGood]) - PInput[i]=PIn - } - - zNaNName=names(z.list)[is.na(z.list)] - if(length(zNaNName)!=0){ - PNotIn=rep(1-ApproxVal,length(zNaNName)) - MeanList.NotIn=MeanList[zNaNName] - R.NotIn.raw=MeanList.NotIn*PNotIn/(1-PNotIn) - if(length(sizeFactors)==ncol(Data)) - R.NotIn=outer(R.NotIn.raw,sizeFactors) - if(length(sizeFactors)==length(Data)) - R.NotIn=R.NotIn.raw*sizeFactors[zNaNName,] - R.NotIn1=matrix(R.NotIn[,Conditions==levels(Conditions)[1]],nrow=nrow(R.NotIn)) - R.NotIn2=matrix(R.NotIn[,Conditions==levels(Conditions)[2]],nrow=nrow(R.NotIn)) - NumOfEachGroupNA=sapply(1:NoneZeroLength, function(i)sum(zNaNName%in%rownames(DataList[[i]]))) - F0LogNA=f0(matrix(DataList.unlist[zNaNName,], ncol=ncol(DataList.unlist)), Alpha, Beta, R.NotIn, NumOfEachGroupNA, log=T) - F1LogNA=f1(matrix(DataListSP[[1]][zNaNName,],ncol=ncol(DataListSP[[1]])), - matrix(DataListSP[[2]][zNaNName,],ncol=ncol(DataListSP[[2]])), - Alpha, Beta, R.NotIn1,R.NotIn2, NumOfEachGroupNA, log=T) - F0LogMdfNA=F0LogNA+600 - F1LogMdfNA=F1LogNA+600 - F0MdfNA=exp(F0LogMdfNA) - F1MdfNA=exp(F1LogMdfNA) - z.list.NotIn=PIn*F1MdfNA/(PIn*F1MdfNA+(1-PIn)*F0MdfNA) - 
z.list[zNaNName]=z.list.NotIn - F0Log[zNaNName]=F0LogNA - F1Log[zNaNName]=F1LogNA - } - } - RealName.Z.output=z.list - RealName.F0=F0Log - RealName.F1=F1Log - names(RealName.Z.output)=IsoNamesIn - names(RealName.F0)=IsoNamesIn - names(RealName.F1)=IsoNamesIn - - - output=list(Alpha=Alpha,Beta=Beta,P=PInput, Z=RealName.Z.output, - RList=RealName.EmpiricalRList, MeanList=RealName.MeanList, - VarList=RealName.VarList, QList1=RealName.QList1, QList2=RealName.QList2, - C1Mean=RealName.C1MeanList, C2Mean=RealName.C2MeanList, - C1EstVar=RealName.C1VarList, C2EstVar=RealName.C2VarList, - PoolVar=RealName.PoolVarList , DataList=RealName.DataList, - PPDE=RealName.Z.output,f0=RealName.F0, f1=RealName.F1) - return(output) - } - - -##################### -#Initialize SigIn & ... -##################### - AlphaIn=0.5 - BetaIn=rep(0.5,NoneZeroLength) - PIn=0.5 - - -##################### -# EM -##################### - UpdateAlpha=NULL - UpdateBeta=NULL - UpdateP=NULL - UpdatePFromZ=NULL - Timeperround=NULL - for (times in 1:maxround){ - temptime1=proc.time() - UpdateOutput=suppressWarnings(LogN(DataListIn.unlist,DataListSPIn, EmpiricalRList.Good.mat ,EmpiricalRList.Good.mat.SP, NumOfEachGroupIn, AlphaIn, BetaIn, PIn, NoneZeroLength)) - message(paste("iteration", times, "done \n",sep=" ")) - AlphaIn=UpdateOutput$AlphaNew - BetaIn=UpdateOutput$BetaNew - PIn=UpdateOutput$PNew - PFromZ=UpdateOutput$PFromZ - F0Out=UpdateOutput$F0Out - F1Out=UpdateOutput$F1Out - UpdateAlpha=rbind(UpdateAlpha,AlphaIn) - UpdateBeta=rbind(UpdateBeta,BetaIn) - UpdateP=rbind(UpdateP,PIn) - UpdatePFromZ=rbind(UpdatePFromZ,PFromZ) - temptime2=proc.time() - Timeperround=c(Timeperround,temptime2[3]-temptime1[3]) - message(paste("time" ,round(Timeperround[times],2),"\n",sep=" ")) - Z.output=UpdateOutput$ZNew.list[!is.na(UpdateOutput$ZNew.list)] - Z.NA.Names=UpdateOutput$zNaNName - } - #Remove this } after testing!! 
- -# if (times!=1){ -# if((UpdateAlpha[times]-UpdateAlpha[times-1])^2+UpdateBeta[times]-UpdateBeta[times-1])^2+UpdateR[times]-UpdateR[times-1])^2+UpdateP[times]-UpdateP[times-1])^2<=10^(-6)){ -# Result=list(Sig=SigIn, Miu=MiuIn, Tau=TauIn) -# break -# } -# } -#} - -##########Change Names############ -## Only z are for Good Ones -GoodData=GoodData[!GoodData%in%Z.NA.Names] -IsoNamesIn.Good=IsoNamesIn[GoodData] -RealName.Z.output=Z.output -RealName.F0=F0Out -RealName.F1=F1Out -names(RealName.Z.output)=IsoNamesIn.Good -names(RealName.F0)=IsoNamesIn.Good -names(RealName.F1)=IsoNamesIn.Good - - - -#########posterior part for other data set here later############ -AllNA=unique(c(Z.NA.Names,NotIn)) -z.list.NotIn=NULL -AllF0=c(RealName.F0) -AllF1=c(RealName.F1) -AllZ=RealName.Z.output - -if (length(AllNA)>0){ - Ng.NA=NgVector[AllNA] - AllNA.Ngorder=AllNA[order(Ng.NA)] - NumOfEachGroupNA=rep(0,NoneZeroLength) - NumOfEachGroupNA.tmp=tapply(Ng.NA,Ng.NA,length) - names(NumOfEachGroupNA)=c(1:NoneZeroLength) - NumOfEachGroupNA[names(NumOfEachGroupNA.tmp)]=NumOfEachGroupNA.tmp - PNotIn=rep(1-ApproxVal,length(AllNA.Ngorder)) - MeanList.NotIn=MeanList[AllNA.Ngorder] - R.NotIn.raw=MeanList.NotIn*PNotIn/(1-PNotIn) - if(length(sizeFactors)==ncol(Data)) - R.NotIn=outer(R.NotIn.raw,sizeFactors) - if(length(sizeFactors)==length(Data)) - R.NotIn=R.NotIn.raw*sizeFactors[NotIn,] - R.NotIn1=matrix(R.NotIn[,Conditions==levels(Conditions)[1]],nrow=nrow(R.NotIn)) - R.NotIn2=matrix(R.NotIn[,Conditions==levels(Conditions)[2]],nrow=nrow(R.NotIn)) - - DataListNotIn.unlistWithZ=matrix(DataList.unlist[AllNA.Ngorder,],nrow=length(AllNA.Ngorder)) - DataListSPNotInWithZ=vector("list",nlevels(Conditions)) - for (lv in 1:nlevels(Conditions)) - DataListSPNotInWithZ[[lv]] = matrix(DataListSP[[lv]][AllNA.Ngorder,],nrow=length(AllNA.Ngorder)) - F0Log=f0(DataListNotIn.unlistWithZ, AlphaIn, BetaIn, R.NotIn, NumOfEachGroupNA, log=T) - F1Log=f1(DataListSPNotInWithZ[[1]], DataListSPNotInWithZ[[2]], AlphaIn, BetaIn, 
R.NotIn1,R.NotIn2, NumOfEachGroupNA, log=T) - F0LogMdf=F0Log+600 - F1LogMdf=F1Log+600 - F0Mdf=exp(F0LogMdf) - F1Mdf=exp(F1LogMdf) - z.list.NotIn=PIn*F1Mdf/(PIn*F1Mdf+(1-PIn)*F0Mdf) -# names(z.list.NotIn)=IsoNamesIn.Good=IsoNamesIn[which(Names%in%NotIn)] - names(z.list.NotIn)=IsoNamesIn[AllNA.Ngorder] - - AllZ=c(RealName.Z.output,z.list.NotIn) - AllZ=AllZ[IsoNamesIn] - AllZ[is.na(AllZ)]=0 - F0.NotIn=F0Log - F1.NotIn=F1Log - names(F0.NotIn)=IsoNamesIn[names(F0Log)] - names(F1.NotIn)=IsoNamesIn[names(F1Log)] - AllF0=c(RealName.F0,F0.NotIn) - AllF1=c(RealName.F1,F1.NotIn) - AllF0=AllF0[IsoNamesIn] - AllF1=AllF1[IsoNamesIn] - AllF0[is.na(AllF0)]=0 - AllF1[is.na(AllF1)]=0 -} -PPMatNZ=cbind(1-AllZ,AllZ) -colnames(PPMatNZ)=c("PPEE","PPDE") -rownames(UpdateAlpha)=paste("iter",1:nrow(UpdateAlpha),sep="") -rownames(UpdateBeta)=paste("iter",1:nrow(UpdateBeta),sep="") -rownames(UpdateP)=paste("iter",1:nrow(UpdateP),sep="") -rownames(UpdatePFromZ)=paste("iter",1:nrow(UpdatePFromZ),sep="") -colnames(UpdateBeta)=paste("Ng",1:ncol(UpdateBeta),sep="") - -CondOut=levels(Conditions) -names(CondOut)=paste("Condition",c(1:length(CondOut)),sep="") - -PPMat=matrix(NA,ncol=2,nrow=nrow(Dataraw)) -rownames(PPMat)=rownames(Dataraw) -colnames(PPMat)=c("PPEE","PPDE") -if(is.null(AllZeroNames))PPMat=PPMatNZ -if(!is.null(AllZeroNames))PPMat[names(NotAllZeroNames),]=PPMatNZ[names(NotAllZeroNames),] - - -#############Result############################ -Result=list(Alpha=UpdateAlpha,Beta=UpdateBeta,P=UpdateP, - PFromZ=UpdatePFromZ, Z=RealName.Z.output,PoissonZ=z.list.NotIn, - RList=RealName.EmpiricalRList, MeanList=RealName.MeanList, - VarList=RealName.VarList, QList1=RealName.QList1, QList2=RealName.QList2, - C1Mean=RealName.C1MeanList, C2Mean=RealName.C2MeanList,C1EstVar=RealName.C1VarList, - C2EstVar=RealName.C2VarList, PoolVar=RealName.PoolVarList , - DataList=RealName.DataList,PPDE=AllZ,f0=AllF0, f1=AllF1, - AllZeroIndex=AllZeroNames,PPMat=PPMatNZ, PPMatWith0=PPMat, - ConditionOrder=CondOut, 
Conditions=Conditions, DataNorm=DataNorm) -} - diff --git a/.svn/pristine/cd/cd27a6060166d64f6efe3e0af69292442b01cc03.svn-base b/.svn/pristine/cd/cd27a6060166d64f6efe3e0af69292442b01cc03.svn-base deleted file mode 100644 index 8371a47..0000000 --- a/.svn/pristine/cd/cd27a6060166d64f6efe3e0af69292442b01cc03.svn-base +++ /dev/null @@ -1,21 +0,0 @@ -Package: EBSeq -Type: Package -Title:An R package for gene and isoform differential expression analysis of RNA-seq data -Version: 1.7.1 -Date: 2015-1-29 -Author: Ning Leng, Christina Kendziorski -Maintainer: Ning Leng -Depends: blockmodeling, gplots, R (>= 3.0.0) -Description: Differential Expression analysis at both gene and isoform - level using RNA-seq data -License: Artistic-2.0 -LazyLoad: yes -Collate: 'MedianNorm.R' 'GetNg.R' 'beta.mom.R' 'f0.R' 'f1.R' - 'Likefun.R' 'LogN.R' 'LogNMulti.R' 'LikefunMulti.R' 'EBTest.R' - 'GetPatterns.R' 'EBMultiTest.R' 'GetPP.R' 'PostFC.R' - 'GetPPMat.R' 'GetMultiPP.R' 'GetMultiFC.R' 'PlotPostVsRawFC.R' - 'crit_fun.R' 'DenNHist.R' 'GetNormalizedMat.R' 'PlotPattern.R' - 'PolyFitPlot.R' 'QQP.R' 'QuantileNorm.R' 'RankNorm.R' 'GetDEResults.R' -BuildVignettes: yes -biocViews: StatisticalMethod, DifferentialExpression, - MultipleComparison, RNASeq, Sequencing diff --git a/.svn/pristine/cf/cfc4b390856bc5ab359843d2250f72f641e0a0f2.svn-base b/.svn/pristine/cf/cfc4b390856bc5ab359843d2250f72f641e0a0f2.svn-base deleted file mode 100644 index 64ba2b3..0000000 --- a/.svn/pristine/cf/cfc4b390856bc5ab359843d2250f72f641e0a0f2.svn-base +++ /dev/null @@ -1,21 +0,0 @@ -PlotPostVsRawFC<-function(EBOut,FCOut){ -library(gplots) - -par(fig=c(0,.8,0,1), new=F) -RainbowColor=rev(redgreen(length(FCOut$PostFC))) -par(oma=c(0,1,1,0),cex=1.3) -plot(FCOut$PostFC,FCOut$RealFC, -log="xy",col=RainbowColor[rank(unlist(EBOut$MeanList)[names(FCOut$PostFC)])], xlab="Posterior FC", -ylab="FC",pch=21) -abline(h=1, v=1, col="gray") -#legend("topleft",col=c("green","black","red"),legend=c("Low Expression","Median 
Expression","High Expression"),pch=21) - -par(fig=c(.7,1,0,1), new=TRUE) -Seq=1:ceiling(length(RainbowColor)/100) -plot(c(0,10), c(1,length(RainbowColor)), type='n', bty='n', xaxt='n', xlab='Rank', ylab='') -for (i in 1:length(Seq)) { -rect(0,(i-1)*100,10,i*100, col=RainbowColor[(i-1)*100], border=NA) -} - - -} diff --git a/.svn/pristine/d0/d06b333af872264aa6783a2d7eaeb8e5481d4746.svn-base b/.svn/pristine/d0/d06b333af872264aa6783a2d7eaeb8e5481d4746.svn-base deleted file mode 100644 index 8c95ebf..0000000 --- a/.svn/pristine/d0/d06b333af872264aa6783a2d7eaeb8e5481d4746.svn-base +++ /dev/null @@ -1,5 +0,0 @@ -export(crit_fun, DenNHist, EBTest, GetNg, GetPP, MedianNorm, -PolyFitPlot, PostFC, QQP, QuantileNorm, RankNorm, EBMultiTest, -GetMultiPP, GetPatterns, PlotPattern, GetPPMat, GetMultiFC, PlotPostVsRawFC, -GetNormalizedMat,f0,f1,LogN,LogNMulti, GetDEResults) - diff --git a/.svn/pristine/da/da3c95de58674987efc83fa6e06948b4fe9fa61e.svn-base b/.svn/pristine/da/da3c95de58674987efc83fa6e06948b4fe9fa61e.svn-base deleted file mode 100644 index 28f23f4..0000000 --- a/.svn/pristine/da/da3c95de58674987efc83fa6e06948b4fe9fa61e.svn-base +++ /dev/null @@ -1,104 +0,0 @@ -\name{EBTest} -\alias{EBTest} -%- Also NEED an '\alias' for EACH other topic documented here. -\title{ -Using EM algorithm to calculate the posterior probabilities of being DE -} -\description{ -Base on the assumption of NB-Beta Empirical Bayes model, the EM algorithm is used to get the posterior probability of being DE. -} -\usage{ -EBTest(Data, NgVector = NULL, Conditions, sizeFactors, maxround, - Pool = F, NumBin = 1000, ApproxVal = 10^-10, Alpha = NULL, - Beta = NULL, PInput = NULL, RInput = NULL, - PoolLower = .25, PoolUpper = .75, Print = T, Qtrm = 1,QtrmCut=0) -} -\arguments{ - - \item{Data}{A data matrix contains expression values for each transcript (gene or isoform level). 
In which rows should be transcripts and columns should be samples.} - \item{NgVector}{A vector indicates the uncertainty group assignment of each isoform. -e.g. if we use number of isoforms in the host gene to define the uncertainty groups, suppose the isoform is in a gene with 2 isoforms, Ng of this isoform should be 2. The length of this vector should be the same as the number of rows in Data. If it's gene level data, Ngvector could be left as NULL.} - \item{Conditions}{A factor indicates the condition which each sample belongs to. } - \item{sizeFactors}{The normalization factors. It should be a vector with lane specific numbers (the length of the vector should be the same as the number of samples, with the same order as the columns of Data).} - \item{maxround}{Number of iterations. The default value is 5. Users should always check the convergency by looking at the Alpha and Beta in output. If the hyper-parameter estimations are not converged in 5 iterations, larger number is suggested.} -\item{Pool}{While working without replicates, user could define the Pool = TRUE in the EBTest function to enable pooling.} -\item{NumBin}{By defining NumBin = 1000, EBSeq will group the genes with similar means together into 1,000 bins.} -\item{PoolLower, PoolUpper}{ -With the assumption that only subset of the genes are DE in the data set, we take genes whose FC are in the PoolLower - PoolUpper quantile of the FC's as the candidate genes (default is 25\%-75\%). - -For each bin, the bin-wise variance estimation is defined as the median of the cross condition variance estimations of the candidate genes within that bin. - -We use the cross condition variance estimations for the candidate genes and the bin-wise variance estimations of the host bin for the non-candidate genes. -} - -\item{ApproxVal}{The variances of the transcripts with mean < var will be approximated as mean/(1-ApproxVal). 
} - -\item{Alpha, Beta, PInput, RInput}{If the parameters are known and the user doesn't want to estimate them from the data, user could specify them here.} -\item{Print}{Whether print the elapsed-time while running the test.} -\item{Qtrm, QtrmCut}{ -Transcripts with Qtrm th quantile < = QtrmCut will be removed before testing. The default value is Qtrm = 1 and QtrmCut=0. -By default setting, transcripts with all 0's -won't be tested. -} -} - -\details{For each transcript gi within condition, the model assumes: -X_{gis}|mu_{gi} ~ NB (r_{gi0} * l_s, q_{gi}) -q_gi|alpha, beta^N_g ~ Beta (alpha, beta^N_g) -In which the l_s is the sizeFactors of samples. - -The function will test "H0: q_{gi}^{C1} = q_{gi}^{C2}" and "H1: q_{gi}^{C1} != q_{gi}^{C2}." -} -\value{ -\item{Alpha}{Fitted parameter alpha of the prior beta distribution. Rows are the values for each iteration.} -\item{Beta}{Fitted parameter beta of the prior beta distribution. Rows are the values for each iteration.} -\item{P, PFromZ}{The bayes estimator of being DE. Rows are the values for each iteration.} -\item{Z, PoissonZ}{The Posterior Probability of being DE for each transcript(Maybe not in the same order of input). 
} -\item{RList}{The fitted values of r for each transcript.} -\item{MeanList}{The mean of each transcript (across conditions).} -\item{VarList}{The variance of each transcript (across conditions).} -\item{QListi1}{The fitted q values of each transcript within condition 1.} -\item{QListi2}{The fitted q values of each transcript within condition 2.} -\item{C1Mean}{The mean of each transcript within Condition 1 (adjusted by normalization factors).} -\item{C2Mean}{The mean of each transcript within Condition 2 (adjusted by normalization factors).} -\item{C1EstVar}{The estimated variance of each transcript within Condition 1 (adjusted by normalization factors).} -\item{C2EstVar}{The estimated variance of each transcript within Condition 2 (adjusted by normalization factors).} -\item{PoolVar}{The variance of each transcript (The pooled value of within condition EstVar).} -\item{DataList}{A List of data that grouped with Ng.} -\item{PPDE}{The Posterior Probability of being DE for each transcript (The same order of input).} -\item{f0,f1}{The likelihood of the prior predictive distribution of being EE or DE (in log scale).} -\item{AllZeroIndex}{The transcript with expression 0 for all samples (which are not tested).} -\item{PPMat}{A matrix contains posterior probabilities of being EE (the first column) or DE (the second column). -Rows are transcripts. -Transcripts with expression 0 for all samples are not shown in this matrix.} -\item{PPMatWith0}{A matrix contains posterior probabilities of being EE (the first column) or DE (the second column). -Rows are transcripts. -Transcripts with expression 0 for all samples are shown as PP(EE) = PP(DE) = NA in this matrix. -The transcript order is exactly the same as the order of the input data.} -\item{ConditionOrder}{The condition assignment for C1Mean, C2Mean, etc.} -\item{Conditions}{The input conditions.} -\item{DataNorm}{Normalized expression matrix.} -} -\references{ -Ning Leng, John A. Dawson, James A. 
Thomson, Victor Ruotti, Anna I. Rissman, Bart M.G. Smits, Jill D. Haag, Michael N. Gould, Ron M. Stewart, and Christina Kendziorski. EBSeq: An empirical Bayes hierarchical model for inference in RNA-seq experiments. Bioinformatics (2013) -} -\author{ -Ning Leng -} - - -\seealso{ -EBMultiTest, PostFC, GetPPMat -} -\examples{ -data(GeneMat) -str(GeneMat) -GeneMat.small = GeneMat[c(1:10,511:550),] -Sizes = MedianNorm(GeneMat.small) -EBOut = EBTest(Data = GeneMat.small, - Conditions = as.factor(rep(c("C1","C2"), each = 5)), - sizeFactors = Sizes, maxround = 5) -PP = GetPPMat(EBOut) -} -\keyword{ DE } -\keyword{ Two condition }% __ONLY ONE__ keyword per line diff --git a/.svn/pristine/dc/dcd4b44ec559a4e0b93a9aec104bf9a0c06477bb.svn-base b/.svn/pristine/dc/dcd4b44ec559a4e0b93a9aec104bf9a0c06477bb.svn-base deleted file mode 100644 index 7b93b9d..0000000 --- a/.svn/pristine/dc/dcd4b44ec559a4e0b93a9aec104bf9a0c06477bb.svn-base +++ /dev/null @@ -1,97 +0,0 @@ -\name{EBMultiTest} -\alias{EBMultiTest} -\title{ -Using EM algorithm to calculate the posterior probabilities -of interested patterns in a multiple condition study -} -\description{ -'EBMultiTest' is built based on the assumption of NB-Beta Empirical Bayes model. It utilizes the EM algorithm to give the posterior probability of the interested patterns. -} -\usage{ -EBMultiTest(Data, NgVector = NULL, Conditions, AllParti = NULL, - sizeFactors, maxround, Pool = F, NumBin = 1000, - ApproxVal=10^-10, PoolLower=.25, PoolUpper = .75, Print=T,Qtrm=1,QtrmCut=0) -} -\arguments{ - - \item{Data}{A data matrix contains expression values for each transcript (gene or isoform level). In which rows should be transcripts and columns should be samples.} - \item{NgVector}{A vector indicates the uncertainty group assignment of each isoform. -e.g. if we use number of isoforms in the host gene to define the uncertainty groups, -suppose the isoform is in a gene with 2 isoforms, Ng of this isoform should be 2. 
-The length of this vector should be the same as the number of rows in Data. -If it's gene level data, Ngvector could be left as NULL.} - \item{Conditions}{A vector indicates the condition in which each sample belongs to. } - - \item{AllParti}{A matrix indicates the interested patterns. Columns shoule be conditions and rows should be patterns. The matrix could be obtained by the GetPatterns function. If AllParti=NULL, all possible patterns will be used.} - - \item{sizeFactors}{The normalization factors. It should be a vector with lane specific numbers (the length of the vector should be the same as the number of samples, with the same order as the columns of Data).} - \item{maxround}{Number of iterations. The default value is 5. -Users should always check the convergency by looking at the Alpha and -Beta in output. If the hyper-parameter estimations are not converged -in 5 iterations, larger number is suggested.} - - -\item{Pool}{While working without replicates, user could define the Pool = TRUE in the EBTest function to enable pooling.} -\item{NumBin}{By defining NumBin = 1000, EBSeq will group the genes with similar means together into 1,000 bins.} -\item{PoolLower, PoolUpper}{ -With the assumption that only subset of the genes are DE in the data set, we take genes whose FC are in the PoolLower - PoolUpper quantile of the FC's as the candidate genes (default is 25\%-75\%). - -For each bin, the bin-wise variance estimation is defined as the median of the cross condition variance estimations of the candidate genes within that bin. 
- -We use the cross condition variance estimations for the candidate genes and the bin-wise variance estimations of the host bin for the non-candidate genes.} - -\item{ApproxVal}{The variances of the transcripts with mean < var will be approximated as mean/(1-ApproxVal).} - -\item{Print}{Whether print the elapsed-time while running the test.} - -\item{Qtrm, QtrmCut}{ -Transcripts with Qtrm th quantile < = QtrmCut will be removed before testing. The default value is Qtrm = 1 and QtrmCut=0. -By default setting, transcripts with all 0's -won't be tested. -} -} - - -\value{ -\item{Alpha}{Fitted parameter alpha of the prior beta distribution. Rows are the values for each iteration.} -\item{Beta}{Fitted parameter beta of the prior beta distribution. Rows are the values for each iteration.} -\item{P, PFromZ}{The bayes estimator of following each pattern of interest. Rows are the values for each iteration.} -\item{Z, PoissonZ}{The Posterior Probability of following each pattern of interest for each transcript. (Maybe not in the same order of input).} -\item{RList}{The fitted values of r for each transcript.} -\item{MeanList}{The mean of each transcript. (across conditions).} -\item{VarList}{The variance of each transcript. (across conditions).} -\item{QList}{The fitted q values of each transcript within each condition.} -\item{SPMean}{The mean of each transcript within each condition (adjusted by the normalization factors).} -\item{SPEstVar}{The estimated variance of each transcript within each condition (adjusted by the normalization factors).} -\item{PoolVar}{The variance of each transcript (The pooled value of within condition EstVar).} -\item{DataList}{A List of data that grouped with Ng and bias.} -\item{PPpattern}{The Posterior Probability of following each pattern (columns) for each transcript (rows). 
Transcripts with expression 0 for all samples are not shown in this matrix.} -\item{f}{The likelihood of likelihood of prior predictive distribution of being each pattern for each transcript. } -\item{AllParti}{The matrix describe the patterns.} -\item{PPpatternWith0}{The Posterior Probability of following each pattern (columns) for each transcript (rows). Transcripts with expression 0 for all samples are shown in this matrix with PP(any_pattrn)=NA.} -\item{ConditionOrder}{The condition assignment for C1Mean, C2Mean, etc.} -} -\references{ -Ning Leng, John A. Dawson, James A. Thomson, Victor Ruotti, Anna I. Rissman, Bart M.G. Smits, Jill D. Haag, Michael N. Gould, Ron M. Stewart, and Christina Kendziorski. EBSeq: An empirical Bayes hierarchical model for inference in RNA-seq experiments. Bioinformatics (2013) -} -\author{ -Ning Leng -} - -\seealso{ -EBTest, GetMultiPP, GetMultiFC -} -\examples{ -data(MultiGeneMat) -MultiGeneMat.small = MultiGeneMat[201:210,] -Conditions = c("C1","C1","C2","C2","C3","C3") -PosParti = GetPatterns(Conditions) -Parti = PosParti[-3,] -MultiSize = MedianNorm(MultiGeneMat.small) -MultiOut = EBMultiTest(MultiGeneMat.small, NgVector = NULL, - Conditions = Conditions, AllParti = Parti, - sizeFactors = MultiSize, maxround = 5) -MultiPP = GetMultiPP(MultiOut) -} -\keyword{ DE } -\keyword{ Multiple Condition }% __ONLY ONE__ keyword per line diff --git a/.svn/pristine/dd/ddf14937fbad8d68259ce041d3629a0b8f0fa687.svn-base b/.svn/pristine/dd/ddf14937fbad8d68259ce041d3629a0b8f0fa687.svn-base deleted file mode 100644 index e68ed4b..0000000 --- a/.svn/pristine/dd/ddf14937fbad8d68259ce041d3629a0b8f0fa687.svn-base +++ /dev/null @@ -1,15 +0,0 @@ -crit_fun<-function (PPEE, thre) -{ - y <- cumsum(sort(PPEE))/(1:length(PPEE)) - mm <- y < thre - index <- sum(mm) - if (index > 0) { - out <- 1 - sort(PPEE)[index] - } - if (index == 0) { - out <- 1 - } - names(out) <- NULL - return(out) -} - diff --git 
a/.svn/pristine/df/df355f79c2d05ab43290917df4b1a9936944103a.svn-base b/.svn/pristine/df/df355f79c2d05ab43290917df4b1a9936944103a.svn-base deleted file mode 100644 index 3e08019..0000000 --- a/.svn/pristine/df/df355f79c2d05ab43290917df4b1a9936944103a.svn-base +++ /dev/null @@ -1,19 +0,0 @@ -\name{IsoList} -\alias{IsoList} -\docType{data} -\title{ -The simulated data for two condition isoform DE analysis -} -\description{ -'IsoList' gives the simulated data for two condition isoform DE analysis. -} -\usage{data(IsoList)} -\source{ -Ning Leng, John A. Dawson, James A. Thomson, Victor Ruotti, Anna I. Rissman, Bart M.G. Smits, Jill D. Haag, Michael N. Gould, Ron M. Stewart, and Christina Kendziorski. EBSeq: An empirical Bayes hierarchical model for inference in RNA-seq experiments. Bioinformatics (2013) -} -\seealso{GeteMat} - -\examples{ -data(IsoList) -} -\keyword{datasets} diff --git a/.svn/pristine/e5/e58205be682d04cda8fcb99857e625f76ce4f626.svn-base b/.svn/pristine/e5/e58205be682d04cda8fcb99857e625f76ce4f626.svn-base deleted file mode 100644 index 24bc98e..0000000 --- a/.svn/pristine/e5/e58205be682d04cda8fcb99857e625f76ce4f626.svn-base +++ /dev/null @@ -1 +0,0 @@ -EBSeq demo diff --git a/.svn/pristine/e6/e604cb18ceca9728eb435520142fa23b71f1b3b3.svn-base b/.svn/pristine/e6/e604cb18ceca9728eb435520142fa23b71f1b3b3.svn-base deleted file mode 100644 index 7dc1021..0000000 --- a/.svn/pristine/e6/e604cb18ceca9728eb435520142fa23b71f1b3b3.svn-base +++ /dev/null @@ -1,47 +0,0 @@ -\name{QuantileNorm} -\alias{QuantileNorm} -\title{ -Quantile Normalization -} -\description{ -'QuantileNorm' gives the quantile normalization. -} -\usage{ -QuantileNorm(Data, Quantile) -} -\arguments{ - - \item{Data}{ -The data matrix with transcripts in rows and lanes in columns. -} -\item{Quantile}{ -The quantile the user wishs to use. Should be a number between 0 and 1. -} -} -\details{ -Use a quantile point to normalize the data. 
-} -\value{ -The function will return a vector contains the normalization factor for each lane. -% ... -} -\references{ -Bullard, James H., et al. Evaluation of statistical methods for normalization and differential expression in mRNA-Seq experiments. BMC bioinformatics 11.1 (2010): 94. -} -\author{ -Ning Leng -} - - -\seealso{ -MedianNorm -} -\examples{ -data(GeneMat) -Sizes = QuantileNorm(GeneMat,.75) -#EBOut = EBTest(Data = GeneMat, -# Conditions = as.factor(rep(c("C1","C2"), each=5)), -# sizeFactors = Sizes, maxround = 5) - -} -\keyword{ Normalization }% __ONLY ONE__ keyword per line diff --git a/.svn/pristine/e7/e7fc7ef7aa1d69813f6b7be35ae17c1cf3deec02.svn-base b/.svn/pristine/e7/e7fc7ef7aa1d69813f6b7be35ae17c1cf3deec02.svn-base deleted file mode 100644 index ddd2478..0000000 --- a/.svn/pristine/e7/e7fc7ef7aa1d69813f6b7be35ae17c1cf3deec02.svn-base +++ /dev/null @@ -1,64 +0,0 @@ -\name{GetMultiFC} -\alias{GetMultiFC} -\title{ -Calculate the Fold Changes for Multiple Conditions -} -\description{ -'GetMultiFC' calculates the Fold Changes for each pair of conditions in a multiple condition study.} -\usage{ -GetMultiFC(EBMultiOut, SmallNum = 0.01) -} -\arguments{ - \item{EBMultiOut}{The output of EBMultiTest function.} -\item{SmallNum}{A small number will be added for each transcript in each condition to avoid Inf and NA. -Default is 0.01.} -} -\details{ -Provide the FC (adjusted by the normalization factors) for each pair of comparisons. -A small number will be added for each transcript in each condition to avoid Inf and NA. -Default is set to be 0.01. 
-} -\value{ -\item{FCMat}{The FC of each pair of comparison (adjusted by the normalization factors).} -\item{Log2FCMat}{The log 2 FC of each pair of comparison (adjusted by the normalization factors).} -\item{PostFCMat}{The posterior FC of each pair of comparison.} -\item{Log2PostFCMat}{The log 2 posterior FC of each pair of comparison.} -\item{CondMean}{The mean of each transcript within each condition (adjusted by the normalization factors).} -\item{ConditionOrder}{The condition assignment for C1Mean, C2Mean, etc.} -} - - -\references{ -Ning Leng, John A. Dawson, James A. Thomson, Victor Ruotti, Anna I. Rissman, Bart M.G. Smits, Jill D. Haag, Michael N. Gould, Ron M. Stewart, and Christina Kendziorski. EBSeq: An empirical Bayes hierarchical model for inference in RNA-seq experiments. Bioinformatics (2013) -} - -\author{ -Ning Leng -} - - -\seealso{ -EBMultiTest, PostFC -} -\examples{ -data(MultiGeneMat) -MultiGeneMat.small = MultiGeneMat[201:210,] - -Conditions = c("C1","C1","C2","C2","C3","C3") - -PosParti = GetPatterns(Conditions) -Parti = PosParti[-3,] - -MultiSize = MedianNorm(MultiGeneMat.small) - -MultiOut = EBMultiTest(MultiGeneMat.small, - NgVector=NULL, Conditions=Conditions, - AllParti=Parti, sizeFactors=MultiSize, - maxround=5) - -MultiFC = GetMultiFC(MultiOut) - -} -% Add one or more standard keywords, see file 'KEYWORDS' in the -% R documentation directory. 
-\keyword{ Posterior Probability } diff --git a/.svn/pristine/ee/eea74c777b7fe7d909f049de29ed61e9e58fbfa5.svn-base b/.svn/pristine/ee/eea74c777b7fe7d909f049de29ed61e9e58fbfa5.svn-base deleted file mode 100644 index e5d8901..0000000 --- a/.svn/pristine/ee/eea74c777b7fe7d909f049de29ed61e9e58fbfa5.svn-base +++ /dev/null @@ -1,27 +0,0 @@ -f0 <- -function(Input, AlphaIn, BetaIn, EmpiricalR, NumOfGroups, log) -{ - BetaVect=do.call(c,sapply(1:length(BetaIn),function(i)rep(BetaIn[i],NumOfGroups[i]),simplify=F)) - SampleNum=dim(Input)[2] - #Product part - ChooseParam1=round(Input+EmpiricalR-1) - roundInput=round(Input) - EachChoose0=matrix(sapply(1:SampleNum, function(i)lchoose(ChooseParam1[,i], roundInput[,i])),ncol=SampleNum) - # numerical approximation to rescue -Inf ones - NoNegInfMin=min(EachChoose0[which(EachChoose0!=-Inf)]) - NoPosInfMax=max(EachChoose0[which(EachChoose0!=Inf)]) - EachChoose=EachChoose0 - EachChoose[which(EachChoose0==-Inf, arr.ind=T)]=NoNegInfMin - EachChoose[which(EachChoose0==Inf, arr.ind=T)]=NoPosInfMax - - SumEachIso=rowSums(Input) - param1=AlphaIn + rowSums(EmpiricalR) - param2=BetaVect + SumEachIso - LogConst=rowSums(EachChoose)+lbeta(param1, param2)-lbeta(AlphaIn, BetaVect) - - - if (log==F) FinalResult=exp(LogConst) - if (log==T) FinalResult=LogConst - FinalResult -} - diff --git a/.svn/pristine/f0/f0f808b13cdd6c176e4ea154987ce4b6d7002d6a.svn-base b/.svn/pristine/f0/f0f808b13cdd6c176e4ea154987ce4b6d7002d6a.svn-base deleted file mode 100644 index 28a5c34..0000000 --- a/.svn/pristine/f0/f0f808b13cdd6c176e4ea154987ce4b6d7002d6a.svn-base +++ /dev/null @@ -1,97 +0,0 @@ -\name{EBMultiTest} -\alias{EBMultiTest} -\title{ -Using EM algorithm to calculate the posterior probabilities -of interested patterns in a multiple condition study -} -\description{ -'EBMultiTest' is built based on the assumption of NB-Beta Empirical Bayes model. It utilizes the EM algorithm to give the posterior probability of the interested patterns. 
-} -\usage{ -EBMultiTest(Data, NgVector = NULL, Conditions, AllParti = NULL, - sizeFactors, maxround, Pool = F, NumBin = 1000, - ApproxVal=10^-10, PoolLower=.25, PoolUpper = .75, Print=T,Qtrm=.75,QtrmCut=10) -} -\arguments{ - - \item{Data}{A data matrix contains expression values for each transcript (gene or isoform level). In which rows should be transcripts and columns should be samples.} - \item{NgVector}{A vector indicates the uncertainty group assignment of each isoform. -e.g. if we use number of isoforms in the host gene to define the uncertainty groups, -suppose the isoform is in a gene with 2 isoforms, Ng of this isoform should be 2. -The length of this vector should be the same as the number of rows in Data. -If it's gene level data, Ngvector could be left as NULL.} - \item{Conditions}{A vector indicates the condition in which each sample belongs to. } - - \item{AllParti}{A matrix indicates the interested patterns. Columns shoule be conditions and rows should be patterns. The matrix could be obtained by the GetPatterns function. If AllParti=NULL, all possible patterns will be used.} - - \item{sizeFactors}{The normalization factors. It should be a vector with lane specific numbers (the length of the vector should be the same as the number of samples, with the same order as the columns of Data).} - \item{maxround}{Number of iterations. The default value is 5. -Users should always check the convergency by looking at the Alpha and -Beta in output. 
If the hyper-parameter estimations are not converged -in 5 iterations, larger number is suggested.} - - -\item{Pool}{While working without replicates, user could define the Pool = TRUE in the EBTest function to enable pooling.} -\item{NumBin}{By defining NumBin = 1000, EBSeq will group the genes with similar means together into 1,000 bins.} -\item{PoolLower, PoolUpper}{ -With the assumption that only subset of the genes are DE in the data set, we take genes whose FC are in the PoolLower - PoolUpper quantile of the FC's as the candidate genes (default is 25\%-75\%). - -For each bin, the bin-wise variance estimation is defined as the median of the cross condition variance estimations of the candidate genes within that bin. - -We use the cross condition variance estimations for the candidate genes and the bin-wise variance estimations of the host bin for the non-candidate genes.} - -\item{ApproxVal}{The variances of the transcripts with mean < var will be approximated as mean/(1-ApproxVal).} - -\item{Print}{Whether print the elapsed-time while running the test.} - -\item{Qtrm, QtrmCut}{ -Transcripts with Qtrm th quantile < = QtrmCut will be removed before testing. The default value is Qtrm = 0.75 and QtrmCut=10. -By default setting, transcripts that have >75\% of the samples with expression less than 10 -won't be tested. -} -} - - -\value{ -\item{Alpha}{Fitted parameter alpha of the prior beta distribution. Rows are the values for each iteration.} -\item{Beta}{Fitted parameter beta of the prior beta distribution. Rows are the values for each iteration.} -\item{P, PFromZ}{The bayes estimator of following each pattern of interest. Rows are the values for each iteration.} -\item{Z, PoissonZ}{The Posterior Probability of following each pattern of interest for each transcript. (Maybe not in the same order of input).} -\item{RList}{The fitted values of r for each transcript.} -\item{MeanList}{The mean of each transcript. 
(across conditions).} -\item{VarList}{The variance of each transcript. (across conditions).} -\item{QList}{The fitted q values of each transcript within each condition.} -\item{SPMean}{The mean of each transcript within each condition (adjusted by the normalization factors).} -\item{SPEstVar}{The estimated variance of each transcript within each condition (adjusted by the normalization factors).} -\item{PoolVar}{The variance of each transcript (The pooled value of within condition EstVar).} -\item{DataList}{A List of data that grouped with Ng and bias.} -\item{PPpattern}{The Posterior Probability of following each pattern (columns) for each transcript (rows). Transcripts with expression 0 for all samples are not shown in this matrix.} -\item{f}{The likelihood of likelihood of prior predictive distribution of being each pattern for each transcript. } -\item{AllParti}{The matrix describe the patterns.} -\item{PPpatternWith0}{The Posterior Probability of following each pattern (columns) for each transcript (rows). Transcripts with expression 0 for all samples are shown in this matrix with PP(any_pattrn)=NA.} -\item{ConditionOrder}{The condition assignment for C1Mean, C2Mean, etc.} -} -\references{ -Ning Leng, John A. Dawson, James A. Thomson, Victor Ruotti, Anna I. Rissman, Bart M.G. Smits, Jill D. Haag, Michael N. Gould, Ron M. Stewart, and Christina Kendziorski. EBSeq: An empirical Bayes hierarchical model for inference in RNA-seq experiments. 
Bioinformatics (2013) -} -\author{ -Ning Leng -} - -\seealso{ -EBTest, GetMultiPP, GetMultiFC -} -\examples{ -data(MultiGeneMat) -MultiGeneMat.small = MultiGeneMat[201:210,] -Conditions = c("C1","C1","C2","C2","C3","C3") -PosParti = GetPatterns(Conditions) -Parti = PosParti[-3,] -MultiSize = MedianNorm(MultiGeneMat.small) -MultiOut = EBMultiTest(MultiGeneMat.small, NgVector = NULL, - Conditions = Conditions, AllParti = Parti, - sizeFactors = MultiSize, maxround = 5) -MultiPP = GetMultiPP(MultiOut) -} -\keyword{ DE } -\keyword{ Multiple Condition }% __ONLY ONE__ keyword per line diff --git a/.svn/pristine/f6/f6287ae7469c2aabfe675c53eba854d91ce2950c.svn-base b/.svn/pristine/f6/f6287ae7469c2aabfe675c53eba854d91ce2950c.svn-base deleted file mode 100644 index 5aaf254..0000000 --- a/.svn/pristine/f6/f6287ae7469c2aabfe675c53eba854d91ce2950c.svn-base +++ /dev/null @@ -1,42 +0,0 @@ -\name{f1} -\alias{f1} -\title{ -The Prior Predictive Distribution of being DE -} -\description{ -'f1' gives the Prior Predictive Distribution of DE. -} -\usage{ -f1(Input1, Input2, AlphaIn, BetaIn, EmpiricalRSP1, - EmpiricalRSP2, NumOfGroup, log) -} -\arguments{ - - \item{Input1}{Expressions from Condition1.} - \item{Input2}{Expressions from Condition2.} - \item{AlphaIn, BetaIn, EmpiricalRSP1, EmpiricalRSP2}{The parameters estimated from last iteration of EM.} - \item{NumOfGroup}{ How many transcripts within each Ng group.} - \item{log}{If true, will give the log of the output.} -} - -\value{ -The function will return the prior predictive distribution values of being DE. -} - -\references{ -Ning Leng, John A. Dawson, James A. Thomson, Victor Ruotti, Anna I. Rissman, Bart M.G. Smits, Jill D. Haag, Michael N. Gould, Ron M. Stewart, and Christina Kendziorski. EBSeq: An empirical Bayes hierarchical model for inference in RNA-seq experiments. 
Bioinformatics (2013) -} -\author{ -Ning Leng -} - -\seealso{ -f0 -} -\examples{ -#f1(matrix(rnorm(100,100,1),ncol=10), -# matrix(rnorm(100,100,1),ncol=10), .5, .6, -# matrix(rnorm(100,200,1),ncol=10), -# matrix(rnorm(100,200,1),ncol=10), 100, TRUE) - -} diff --git a/.svn/pristine/f6/f66ea0f0de798dc3481cd3a4b66254eba9b5d232.svn-base b/.svn/pristine/f6/f66ea0f0de798dc3481cd3a4b66254eba9b5d232.svn-base deleted file mode 100644 index 944abf2..0000000 Binary files a/.svn/pristine/f6/f66ea0f0de798dc3481cd3a4b66254eba9b5d232.svn-base and /dev/null differ diff --git a/.svn/pristine/f7/f77a15fede31c8afc3559ef831794872c7932832.svn-base b/.svn/pristine/f7/f77a15fede31c8afc3559ef831794872c7932832.svn-base deleted file mode 100644 index 71a697a..0000000 Binary files a/.svn/pristine/f7/f77a15fede31c8afc3559ef831794872c7932832.svn-base and /dev/null differ diff --git a/.svn/pristine/f7/f7d33c89494022da45fb972761ec41eab390c6dd.svn-base b/.svn/pristine/f7/f7d33c89494022da45fb972761ec41eab390c6dd.svn-base deleted file mode 100644 index 557086e..0000000 --- a/.svn/pristine/f7/f7d33c89494022da45fb972761ec41eab390c6dd.svn-base +++ /dev/null @@ -1,38 +0,0 @@ -PostFC=function(EBoutput, SmallNum=.01) { - if(!"C1Mean"%in%names(EBoutput)) - stop("The input doesn't seem like an output from EBTest") - GeneRealMeanC1=unlist(EBoutput$C1Mean) - GeneRealMeanC2=unlist(EBoutput$C2Mean) - GeneRealMeanC1Plus=GeneRealMeanC1+SmallNum - GeneRealMeanC2Plus=GeneRealMeanC2+SmallNum - GeneRealMean=(GeneRealMeanC1+GeneRealMeanC2)/2 - - GeneRealFC=GeneRealMeanC1Plus/GeneRealMeanC2Plus - - GeneR=unlist(EBoutput$RList) - GeneR[GeneR<=0 | is.na(GeneR)]=GeneRealMean[GeneR<=0 | is.na(GeneR)]*.99/.01 - - GeneAlpha=EBoutput[[1]][nrow(EBoutput[[1]]),] - GeneBeta=unlist(sapply(1:length(EBoutput$C1Mean),function(i)rep(EBoutput[[2]][nrow(EBoutput[[1]]),i],length(EBoutput$C1Mean[[i]])))) - GeneBeta=as.vector(GeneBeta) - - # Post alpha P_a_C1= alpha + r_C1 * n_C1 - # Post beta P_b_C1= beta + Mean_C1 * n_C1 - # P_q_C1= 
P_a_C1/ (P_a_C1 + P_b_C1) - # Post FC = ((1-P_q_C1)/P_q_c1) /( (1-P_q_c2)/P_q_c2) - - nC1=sum(EBoutput$Conditions==levels(EBoutput$Conditions)[1]) - nC2=sum(EBoutput$Conditions==levels(EBoutput$Conditions)[2]) - GenePostAlphaC1=GeneAlpha+nC1*GeneR - GenePostAlphaC2=GeneAlpha+nC2*GeneR - GenePostBetaC1=GeneBeta+nC1*GeneRealMeanC1 - GenePostBetaC2=GeneBeta+nC2*GeneRealMeanC2 - GenePostQC1=GenePostAlphaC1/(GenePostAlphaC1+GenePostBetaC1) - GenePostQC2=GenePostAlphaC2/(GenePostAlphaC2+GenePostBetaC2) - - GenePostFC=((1-GenePostQC1)/(1-GenePostQC2))*(GenePostQC2/GenePostQC1) - Out=list(PostFC=GenePostFC[rownames(EBoutput$PPMat)], RealFC=GeneRealFC[rownames(EBoutput$PPMat)], - Direction=paste(EBoutput$ConditionOrder[[1]],"Over", EBoutput$ConditionOrder[[2]]) - ) - -} diff --git a/.svn/pristine/f7/f7ee80a03560677fad7bc968ad8274b5a1714d84.svn-base b/.svn/pristine/f7/f7ee80a03560677fad7bc968ad8274b5a1714d84.svn-base deleted file mode 100644 index 30ba666..0000000 --- a/.svn/pristine/f7/f7ee80a03560677fad7bc968ad8274b5a1714d84.svn-base +++ /dev/null @@ -1,39 +0,0 @@ -\name{LogN} -\alias{LogN} -\title{ -The function to run EM (one round) algorithm for the NB-beta model. -} -\description{ -'LogN' specifies the function to run (one round of) the EM algorithm for the NB-beta model. -} -\usage{ -LogN(Input, InputSP, EmpiricalR, EmpiricalRSP, NumOfEachGroup, - AlphaIn, BetaIn, PIn, NoneZeroLength) -} -\arguments{ - \item{Input, InputSP}{The expressions among all the samples.} - \item{NumOfEachGroup}{Number of genes in each Ng group.} - \item{AlphaIn, PIn, BetaIn, EmpiricalR, EmpiricalRSP}{The parameters from the last EM step.} - \item{NoneZeroLength}{Number of Ng groups.} -} - -\references{ -Ning Leng, John A. Dawson, James A. Thomson, Victor Ruotti, Anna I. Rissman, Bart M.G. Smits, Jill D. Haag, Michael N. Gould, Ron M. Stewart, and Christina Kendziorski. EBSeq: An empirical Bayes hierarchical model for inference in RNA-seq experiments. 
Bioinformatics (2013) - -} -\author{ -Ning Leng -} - - -\examples{ - -#Input = matrix(rnorm(100,100,1), ncol=10) -#rownames(Input) = paste("g",1:10) -#RIn = matrix(rnorm(100,200,1), ncol=10) -#res = LogN(Input, list(Input[,1:5], Input[,6:10]), -# RIn, list(RIn[,1:5], RIn[,6:10]), -# 10, .6, .7, .3, 1) - -} - diff --git a/.svn/pristine/f9/f90aac7359b67d4627d13f1ca53b7b0703381a00.svn-base b/.svn/pristine/f9/f90aac7359b67d4627d13f1ca53b7b0703381a00.svn-base deleted file mode 100644 index f9aa5d9..0000000 Binary files a/.svn/pristine/f9/f90aac7359b67d4627d13f1ca53b7b0703381a00.svn-base and /dev/null differ diff --git a/.svn/pristine/fa/fa0c75653e870abf26b532668b7c2953e91d3a1a.svn-base b/.svn/pristine/fa/fa0c75653e870abf26b532668b7c2953e91d3a1a.svn-base deleted file mode 100644 index 026dac6..0000000 --- a/.svn/pristine/fa/fa0c75653e870abf26b532668b7c2953e91d3a1a.svn-base +++ /dev/null @@ -1,64 +0,0 @@ -LogN <- -function(Input, InputSP, EmpiricalR, EmpiricalRSP, NumOfEachGroup, AlphaIn, BetaIn, PIn, NoneZeroLength) -{ - #2 condition case (skip the loop then maybe run faster? Code multi condition cases later) - - #For each gene (m rows of Input---m genes) - #Save each gene's F0, F1 for further likelihood calculation. 
- - #Get F0 for EE - F0Log=f0(Input, AlphaIn, BetaIn, EmpiricalR, NumOfEachGroup, log=T) - #Get F1 for DE - F1Log=f1(InputSP[[1]], InputSP[[2]], AlphaIn, BetaIn, EmpiricalRSP[[1]],EmpiricalRSP[[2]], NumOfEachGroup, log=T) - - #Get z - #Use data.list in logfunction - F0LogMdf=F0Log+600 - F1LogMdf=F1Log+600 - F0Mdf=exp(F0LogMdf) - F1Mdf=exp(F1LogMdf) - - z.list=PIn*F1Mdf/(PIn*F1Mdf+(1-PIn)*F0Mdf) - zNaNName=names(z.list)[is.na(z.list)] - zGood=which(!is.na(z.list)) - if(length(zGood)==0){ - #Min=min(min(F0Log[which(F0Log!=-Inf)]), - # min(F1Log[which(F1Log!=-Inf)])) - tmpMat=cbind(F0Log,F1Log) - tmpMean=apply(tmpMat,1,mean) - F0LogMdf=F0Log-tmpMean - F1LogMdf=F1Log-tmpMean - F0Mdf=exp(F0LogMdf) - F1Mdf=exp(F1LogMdf) - - z.list=PIn*F1Mdf/(PIn*F1Mdf+(1-PIn)*F0Mdf) - zNaNName=names(z.list)[is.na(z.list)] - zGood=which(!is.na(z.list)) - - } - ###Update P - #PFromZ=sapply(1:NoneZeroLength,function(i) sum(z.list[[i]])/length(z.list[[i]])) - PFromZ=sum(z.list[zGood])/length(z.list[zGood]) - F0Good=F0Log[zGood] - F1Good=F1Log[zGood] - ### MLE Part #### - # Since we dont wanna update p and Z in this step - # Each Ng for one row - - NumGroupVector=rep(c(1:NoneZeroLength),NumOfEachGroup) - - NumGroupVector.zGood=NumGroupVector[zGood] - NumOfEachGroup.zGood=tapply(NumGroupVector.zGood,NumGroupVector.zGood,length) - - StartValue=c(AlphaIn, BetaIn,PIn) - - Result<-optim(StartValue,Likefun,InputPool=list(InputSP[[1]][zGood,],InputSP[[2]][zGood,],Input[zGood,],z.list[zGood], NoneZeroLength,EmpiricalR[zGood, ],EmpiricalRSP[[1]][zGood,], EmpiricalRSP[[2]][zGood,], NumOfEachGroup.zGood)) - #LikeOutput=Likelihood( StartValue, Input , InputSP , PNEW.list, z.list) - AlphaNew= Result$par[1] - BetaNew=Result$par[2:(1+NoneZeroLength)] - PNew=Result$par[2+NoneZeroLength] - ## - Output=list(AlphaNew=AlphaNew,BetaNew=BetaNew,PNew=PNew,ZNew.list=z.list,PFromZ=PFromZ, zGood=zGood, zNaNName=zNaNName,F0Out=F0Good, F1Out=F1Good) - Output - } - diff --git 
a/.svn/pristine/fb/fb415deabe750837bb976ee256f2b9d65ace77e1.svn-base b/.svn/pristine/fb/fb415deabe750837bb976ee256f2b9d65ace77e1.svn-base deleted file mode 100644 index 8f22575..0000000 Binary files a/.svn/pristine/fb/fb415deabe750837bb976ee256f2b9d65ace77e1.svn-base and /dev/null differ diff --git a/.svn/pristine/fd/fd6bed16dc35e63727aaae334aa6850f439dccdd.svn-base b/.svn/pristine/fd/fd6bed16dc35e63727aaae334aa6850f439dccdd.svn-base deleted file mode 100644 index 8a264b4..0000000 --- a/.svn/pristine/fd/fd6bed16dc35e63727aaae334aa6850f439dccdd.svn-base +++ /dev/null @@ -1,236 +0,0 @@ -library(EBSeq) -# 3.1 -data(GeneMat) -str(GeneMat) -Sizes=MedianNorm(GeneMat) -EBOut=EBTest(Data=GeneMat, - Conditions=as.factor(rep(c("C1","C2"),each=5)),sizeFactors=Sizes, maxround=5) -DEOut=GetDEResults(EBOut) -str(DEOut) -#3.2 -data(IsoList) -str(IsoList) -IsoMat=IsoList$IsoMat -str(IsoMat) -IsoNames=IsoList$IsoNames -IsosGeneNames=IsoList$IsosGeneNames -IsoSizes=MedianNorm(IsoMat) -NgList=GetNg(IsoNames, IsosGeneNames) -IsoNgTrun=NgList$IsoformNgTrun -IsoNgTrun[c(1:3,201:203,601:603)] -IsoEBOut=EBTest(Data=IsoMat, NgVector=IsoNgTrun, - Conditions=as.factor(rep(c("C1","C2"),each=5)),sizeFactors=IsoSizes, maxround=5) -IsoDE=GetDEResults(IsoEBOut) -str(IsoDE) -#3.3 -data(MultiGeneMat) -str(MultiGeneMat) -Conditions=c("C1","C1","C2","C2","C3","C3") -PosParti=GetPatterns(Conditions) -PosParti -Parti=PosParti[-3,] -Parti -MultiSize=MedianNorm(MultiGeneMat) -MultiOut=EBMultiTest(MultiGeneMat,NgVector=NULL,Conditions=Conditions, - AllParti=Parti, sizeFactors=MultiSize, maxround=5) -MultiPP=GetMultiPP(MultiOut) -names(MultiPP) -MultiPP$PP[1:10,] -MultiPP$MAP[1:10] -MultiPP$Patterns - -#3.4 -data(IsoMultiList) -IsoMultiMat=IsoMultiList[[1]] -IsoNames.Multi=IsoMultiList$IsoNames -IsosGeneNames.Multi=IsoMultiList$IsosGeneNames -IsoMultiSize=MedianNorm(IsoMultiMat) -NgList.Multi=GetNg(IsoNames.Multi, IsosGeneNames.Multi) -IsoNgTrun.Multi=NgList.Multi$IsoformNgTrun 
-Conditions=c("C1","C1","C2","C2","C3","C3","C4","C4") -PosParti.4Cond=GetPatterns(Conditions) -PosParti.4Cond -Parti.4Cond=PosParti.4Cond[c(1,2,3,8,15),] -Parti.4Cond -IsoMultiOut=EBMultiTest(IsoMultiMat,NgVector=IsoNgTrun.Multi,Conditions=Conditions, - AllParti=Parti.4Cond, sizeFactors=IsoMultiSize, maxround=5) -IsoMultiPP=GetMultiPP(IsoMultiOut) -names(MultiPP) -IsoMultiPP$PP[1:10,] -IsoMultiPP$MAP[1:10] -IsoMultiPP$Patterns - - -#4.1 -data(GeneMat) -str(GeneMat) -Sizes=MedianNorm(GeneMat) -EBOut=EBTest(Data=GeneMat, - Conditions=as.factor(rep(c("C1","C2"),each=5)),sizeFactors=Sizes, maxround=5) -DEOut=GetDEResults(EBOut) -EBOut$Alpha -EBOut$Beta -EBOut$P -GeneFC=PostFC(EBOut) -str(GeneFC) -par(mfrow=c(2,2)) -QQP(EBOut) -par(mfrow=c(2,2)) -DenNHist(EBOut) -PlotPostVsRawFC(EBOut,GeneFC) - -#4.2 -data(IsoList) -str(IsoList) -IsoMat=IsoList$IsoMat -str(IsoMat) -IsoNames=IsoList$IsoNames -IsosGeneNames=IsoList$IsosGeneNames -IsoSizes=MedianNorm(IsoMat) -NgList=GetNg(IsoNames, IsosGeneNames) -IsoNgTrun=NgList$IsoformNgTrun -IsoNgTrun[c(1:3,201:203,601:603)] -IsoEBOut=EBTest(Data=IsoMat, NgVector=IsoNgTrun, - Conditions=as.factor(rep(c("C1","C2"),each=5)),sizeFactors=IsoSizes, maxround=5) -IsoDE=GetDEResults(IsoEBOut) -str(IsoDE) -IsoEBOut$Alpha -IsoEBOut$Beta -IsoEBOut$P -IsoFC=PostFC(IsoEBOut) -str(IsoFC) -PlotPostVsRawFC(IsoEBOut,IsoFC) - -par(mfrow=c(2,2)) -PolyFitValue=vector("list",3) -for(i in 1:3) - PolyFitValue[[i]]=PolyFitPlot(IsoEBOut$C1Mean[[i]], - IsoEBOut$C1EstVar[[i]],5) -PolyAll=PolyFitPlot(unlist(IsoEBOut$C1Mean), unlist(IsoEBOut$C1EstVar),5) -lines(log10(IsoEBOut$C1Mean[[1]][PolyFitValue[[1]]$sort]), - PolyFitValue[[1]]$fit[PolyFitValue[[1]]$sort],col="yellow",lwd=2) -lines(log10(IsoEBOut$C1Mean[[2]][PolyFitValue[[2]]$sort]), - PolyFitValue[[2]]$fit[PolyFitValue[[2]]$sort],col="pink",lwd=2) -lines(log10(IsoEBOut$C1Mean[[3]][PolyFitValue[[3]]$sort]), - PolyFitValue[[3]]$fit[PolyFitValue[[3]]$sort],col="green",lwd=2) -legend("topleft",c("All 
Isoforms","Ig = 1","Ig = 2","Ig = 3"), - col=c("red","yellow","pink","green"),lty=1,lwd=3,box.lwd=2) -par(mfrow=c(2,3)) -QQP(IsoEBOut) -par(mfrow=c(2,3)) -DenNHist(IsoEBOut) - - -#4.3 -data(MultiGeneMat) -str(MultiGeneMat) -Conditions=c("C1","C1","C2","C2","C3","C3") -PosParti=GetPatterns(Conditions) -PosParti -PlotPattern(PosParti) -Parti=PosParti[-3,] -Parti -MultiSize=MedianNorm(MultiGeneMat) -MultiOut=EBMultiTest(MultiGeneMat,NgVector=NULL,Conditions=Conditions, - AllParti=Parti, sizeFactors=MultiSize, maxround=5) -MultiPP=GetMultiPP(MultiOut) -names(MultiPP) -MultiPP$PP[1:10,] -MultiPP$MAP[1:10] -MultiPP$Patterns -MultiFC=GetMultiFC(MultiOut) -str(MultiFC) -par(mfrow=c(2,2)) -DenNHist(MultiOut) -par(mfrow=c(2,2)) -QQP(MultiOut) - -#4.4 -data(IsoMultiList) -IsoMultiMat=IsoMultiList[[1]] -IsoNames.Multi=IsoMultiList$IsoNames -IsosGeneNames.Multi=IsoMultiList$IsosGeneNames -IsoMultiSize=MedianNorm(IsoMultiMat) -NgList.Multi=GetNg(IsoNames.Multi, IsosGeneNames.Multi) -IsoNgTrun.Multi=NgList.Multi$IsoformNgTrun -Conditions=c("C1","C1","C2","C2","C3","C3","C4","C4") -PosParti.4Cond=GetPatterns(Conditions) -PosParti.4Cond -PlotPattern(PosParti.4Cond) -Parti.4Cond=PosParti.4Cond[c(1,2,3,8,15),] -Parti.4Cond -IsoMultiOut=EBMultiTest(IsoMultiMat,NgVector=IsoNgTrun.Multi,Conditions=Conditions, - AllParti=Parti.4Cond, sizeFactors=IsoMultiSize, maxround=5) -IsoMultiPP=GetMultiPP(IsoMultiOut) -names(MultiPP) -IsoMultiPP$PP[1:10,] -IsoMultiPP$MAP[1:10] -IsoMultiPP$Patterns -IsoMultiFC=GetMultiFC(IsoMultiOut) -str(IsoMultiFC) -par(mfrow=c(3,4)) -DenNHist(IsoMultiOut) -par(mfrow=c(3,4)) -QQP(IsoMultiOut) -IsoMultiFC=GetMultiFC(IsoMultiOut) - - - -#4.5 -data(GeneMat) -GeneMat.norep=GeneMat[,c(1,6)] -Sizes.norep=MedianNorm(GeneMat.norep) -EBOut.norep=EBTest(Data=GeneMat.norep, - Conditions=as.factor(rep(c("C1","C2"))),sizeFactors=Sizes.norep, maxround=5) -DE.norep=GetDEResults(EBOut.norep) -GeneFC.norep=PostFC(EBOut.norep) - - -#4.6 -data(IsoList) -IsoMat=IsoList$IsoMat 
-IsoNames=IsoList$IsoNames -IsosGeneNames=IsoList$IsosGeneNames -NgList=GetNg(IsoNames, IsosGeneNames) -IsoNgTrun=NgList$IsoformNgTrun -IsoMat.norep=IsoMat[,c(1,6)] -IsoSizes.norep=MedianNorm(IsoMat.norep) -IsoEBOut.norep=EBTest(Data=IsoMat.norep, NgVector=IsoNgTrun, - Conditions=as.factor(c("C1","C2")),sizeFactors=IsoSizes.norep, maxround=5) -IsoDE.norep=GetDEResults(IsoEBOut.norep) -IsoFC.norep=PostFC(IsoEBOut.norep) - - -#4.7 -data(MultiGeneMat) -MultiGeneMat.norep=MultiGeneMat[,c(1,3,5)] -Conditions=c("C1","C2","C3") -PosParti=GetPatterns(Conditions) -Parti=PosParti[-3,] -MultiSize.norep=MedianNorm(MultiGeneMat.norep) -MultiOut.norep=EBMultiTest(MultiGeneMat.norep,NgVector=NULL,Conditions=Conditions, - AllParti=Parti, sizeFactors=MultiSize.norep, maxround=5) -MultiPP.norep=GetMultiPP(MultiOut.norep) -MultiFC.norep=GetMultiFC(MultiOut.norep) - -#4.8 -data(IsoMultiList) -IsoMultiMat=IsoMultiList[[1]] -IsoNames.Multi=IsoMultiList$IsoNames -IsosGeneNames.Multi=IsoMultiList$IsosGeneNames -IsoMultiMat.norep=IsoMultiMat[,c(1,3,5,7)] -IsoMultiSize.norep=MedianNorm(IsoMultiMat.norep) -NgList.Multi=GetNg(IsoNames.Multi, IsosGeneNames.Multi) -IsoNgTrun.Multi=NgList.Multi$IsoformNgTrun -Conditions=c("C1","C2","C3","C4") -PosParti.4Cond=GetPatterns(Conditions) -PosParti.4Cond -Parti.4Cond=PosParti.4Cond[c(1,2,3,8,15),] -Parti.4Cond -IsoMultiOut.norep=EBMultiTest(IsoMultiMat.norep,NgVector=IsoNgTrun.Multi,Conditions=Conditions, - AllParti=Parti.4Cond, sizeFactors=IsoMultiSize.norep, maxround=5) -IsoMultiPP.norep=GetMultiPP(IsoMultiOut.norep) -IsoMultiFC.norep=GetMultiFC(IsoMultiOut.norep) - - -# EOF diff --git a/.svn/wc.db b/.svn/wc.db deleted file mode 100644 index b7f31d3..0000000 Binary files a/.svn/wc.db and /dev/null differ