### 2 Les données ## 2.2 Importation sous R lipides <- read.table("lipides.csv",sep=",") Exprs <- read.table("genes.csv",sep=",") dim(Exprs) ;dim(lipides) summary(Exprs) summary(lipides) genotype=as.factor(lipides[,23]) regime=as.factor(lipides[,22]) lipides=lipides[,1 :21] ### Statistique élémentaire ## 3.1 Description univariée class(Exprs) class(t(Exprs)) boxplot(Exprs) boxplot(data.frame(t(Exprs)),horizontal=TRUE) boxplot(data.frame(scale(Exprs,scale=FALSE))) boxplot(data.frame(scale(Exprs))) boxplot(data.frame(scale(t(Exprs),scale=FALSE)),horizontal=TRUE) genes=dimnames(Exprs)[[2]] namsour=dimnames(Exprs)[[1]] ## 3.2 Description bivariée cov(Exprs[,1],Exprs[,2]) cov(lipides$C22.6n.3,lipides$C18.1n.9) cor(Exprs[,1],Exprs[,2]) cor(Exprs[,1],Exprs[,2],method="spearman") cor(lipides$C22.6n.3,lipides$C18.1n.9) plot(Exprs[,1],Exprs[,2]) plot(Exprs[,1],Exprs[,2],xlab=colnames(Exprs)[1],ylab=colnames(Exprs)[2]) plot(lipides$C22.6n.3,lipides$C18.1n.9) correl.lip = cor(lipides) image(correl.lip) image(t(correl.lip[dim(lipides)[2]:1,]),axes=F) mtext(colnames(lipides),side=3,adj=0,at=seq(0,1,l=21),cex=0.5,las=3) mtext(colnames(lipides)[dim(lipides)[2]:1],side=2,adj=0,at=seq(0,1,l=21),cex=0.5,las=1,line=2) ### 4 Analyse en Composantes Principales souracp=princomp(t(Exprs)) plot(souracp) biplot(souracp) souracp=princomp(t(scale(Exprs, scale=FALSE))) plot(souracp) biplot(souracp) souracp=prcomp(Exprs) boxplot((data.frame(souracp$x))) plot(souracp) biplot(souracp) souracp=prcomp(Exprs,scale=TRUE) boxplot(data.frame(souracp$x)) biplot(souracp) pt=23+as.integer(genotype) plot(souracp$x,type="p",pch=pt,cex=2) text(30*souracp$rotation,genes,col="blue") plot(souracp$x,type="p",pch=pt,cex=2,col=as.integer(regime)) text(30*souracp$rotation,genes,col="blue") souracp=prcomp(t(scale(Exprs,scale=FALSE))) plot(souracp) biplot(souracp) coord=souracp$x[,1 :2] coord2=coord^2 contrib=apply(coord2,1,sum)/sum(souracp[[1]][1:2]^2) hist(contrib) selec=contrib>0.5 sum(selec) genes[selec] plot(souracp$rotation,type="p",pch=pt,cex=2,col=as.integer(regime)) text(0.2*souracp$x[selec,],genes[selec],col="blue") biplot(souracp$x[selec,],souracp$rotation) boxplot(CYP4A14~genotype,data=Exprs) boxplot(CYP3A11~genotype,data=Exprs) boxplot(THIOL~genotype,data=Exprs) boxplot(PMDCI~genotype,data=Exprs) ### 5 Positionnement multidimensionnel ## 5.1 Calcul des distances rS = cor(Exprs) dS=1-rS dS2=sqrt(1-rS**2) dN=dimnames(Exprs)[[2]] ## 5.2 MDS avec la corrélation linéaire mdsour= cmdscale(dS, k=2) plot(mdsour, type="n", xlab="", ylab="",main="1-correlation des genes") text(mdsour,dN) abline(v=0,h=0) ## 5.3 MDS avec le carré de la corrélation mdsour= cmdscale(dS2, k=2) plot(mdsour, type="n", xlab="", ylab="",main="MDS : 1- correlation carree des genes") text(mdsour,dN) abline(v=0,h=0) ### 6 Classification ## 6.1 Classification ascendante hiérarchique tExprs <- t(Exprs) dist.tExprs <- dist(tExprs) length(dist.tExprs) hc.tExprs <- hclust(dist.tExprs,method="ward") plot(hc.tExprs) plot(hclust(dist(t(Exprs)),method="ward")) plot(hc.tExprs$height[118 :100],type="b") classif.4G <- cutree(hc.tExprs,k=4) names(classif.4G) sort(classif.4G) names(classif.4G[classif.4G==1]) names(classif.4G[classif.4G==2]) names(classif.4G[classif.4G==3]) names(classif.4G[classif.4G==4]) sink('groupes.txt') for (i in 1 :4) { eff <- length(classif.4G[classif.4G==i]) print(paste("*** Groupe ",i," : ", eff," gènes ***")) print(names(classif.4G[classif.4G==i]))} sink() ## 6.2 Double classification lf = function(d) hclust(d, method="ward") heatmap(as.matrix(Exprs),hclustfun=lf) heatmap(scale(as.matrix(Exprs),scale=FALSE),hclustfun=lf) ## 6.3 Agrégation autour de centres mobiles km.genes=kmeans(tExprs,centers=4) table(classif.4G,km.genes$cluster,dnn=c("tree","kmeans")) mat.init.km.genes=matrix(nrow=4,ncol=40) for (i in 1 :4) mat.init.km.genes[i,]=apply(tExprs[classif.4G==i,],2,mean) km.genes.init=kmeans(tExprs,centers=mat.init.km.genes) table(classif.4G,km.genes.init$cluster,dnn=c("tree","kmeans")) ## 6.4 Représentation d’une classification library(cluster) pamv=pam(dS2,6) mds= cmdscale(dS2, k=2) col=pamv$clustering plot(mds, type="n", xlab="cp1", ylab="cp2") text(mds,dN,col=col) clusplot(dS2,pamv$clustering,diss=TRUE,labels=2,color=TRUE,col.txt=pamv$clustering) ### 7 Modèle linéaire ## 7.1 Lecture du tableau de données sur les 40 souris (en lignes) et les 120 gènes (en colonnes) Exprs<-read.table("genes.csv",header=T,sep=",") names(Exprs) dim(Exprs) summary(Exprs) nomgene = names(Exprs) lipides<-read.table("lipides.csv",header=T,sep=",") summary(lipides) dim(lipides) names(lipides) genotype=as.factor(lipides[,"Genotype"]) regime=as.factor(lipides[,"Regime"]) ## 7.2 Le gène AOX est-il régulé par le génotype de la souris ? t.test(Exprs[,"AOX"] ~ genotype,var.equal=T) anov.AOX = lm(Exprs[,"AOX"] ~ genotype) summary(anov.AOX) names(summary(anov.AOX)) summary(anov.AOX)$coefficients summary(anov.AOX)$coefficients["genotypewt","Pr(>|t|)"] anov.AOX.1 = lm(Exprs[,"AOX"] ~ genotype-1) summary(anov.AOX.1) anov1=lm(Exprs[,"AOX"] ~ 1) summary(anov1) anova(anov.AOX,anov1) plot(anov.AOX) ## 7.3 Quels sont les gènes régulés par le génotype de la souris ? pval = apply(Exprs,2,function(x) summary(lm(x ~ genotype))$coefficients["genotypewt","Pr(>|t|)"]) hist(pval) library(multtest) procs = c("Bonferroni", "BH") tmp = mt.rawp2adjp(pval, procs) #Calcul des p-values ajustées à partir des p-values brutes (pval) help(mt.rawp2adjp) adjp = tmp$adjp[order(tmp$index), ] res.adjp = round(adjp, 2) dimnames(res.adjp)[[1]] = names(Exprs) mt.plot(cbind(pval,adjp[, procs]), statt,plottype = "pvsr", proc = c("pval",procs),lty = c(1,2,3), col = c("black", "blue", "red"), leg=c(50,0.8),lwd=2) abline(h=c(0.01,0.05,0.1),lty=c(1,2,3)) listgenes= names(Exprs)[res.adjp[,"BH"]<0.05] listgenes ## 7.4 Le gène AOX est-il régulé par le régime de la souris ? anov.AOX.R = lm(Exprs[,"AOX"] ~ regime) summary(anov.AOX.R) names(summary(anov.AOX.R)) names(summary(anov.AOX.R)) summary(anov.AOX.R)$fstatistic anov1=lm(Exprs[,"AOX"] ~ 1) summary(anov1) anova(anov.AOX.R,anov1) ## 7.5 Quels sont les gènes régulés par le regime de la souris ? pval = rep(NA,120) for (i in 1:120) { lm.complet = lm(Exprs[,i]~ regime) lm.reduit = lm(Exprs[,i] ~ 1) anova(lm.complet,lm.reduit) pval[i] = anova(lm.complet,lm.reduit)[2, "Pr(>F)"] } hist(pval) procs = c("Bonferroni", "BH") tmp = mt.rawp2adjp(pval, procs) help(mt.rawp2adjp) adjp = tmp$adjp[order(tmp$index), ] res.adjp = round(adjp, 2) dimnames(res.adjp)[[1]] = names(Exprs) mt.plot(cbind(pval,adjp[, procs]), statt,plottype = "pvsr", proc = c("pval",procs),lty = c(1,2,3), col = c("black", "blue", "red"), leg=c(50,0.8),lwd=2) abline(h=c(0.01,0.05,0.1),lty=c(1,2,3)) listgenes= names(Exprs)[res.adjp[,"BH"]<0.05] listgenes ## 7.6 Le gène AOX est-il différentiellement exprimé selon le génotype, le régime ou les 2 à la fois ? anov1.AOX =lm(Exprs[,"AOX"] ~ genotype+ regime+ genotype:regime) summary(anov1.AOX) anova(anov1.AOX,lm(Exprs[,"AOX"] ~ 1)) anov2.AOX =lm(Exprs[,"AOX"] ~ genotype+ regime) summary(anov2.AOX) anova(anov1.AOX,anov2.AOX) ## 7.7 Les gènes sont-ils différentiellement exprimés selon le génotype, le régime ou les 2 à la fois ? pval = rep(NA,120) for (i in 1:120) { lm.complet = lm(Exprs[,i]~ genotype+ regime+ genotype:regime) lm.reduit = lm(Exprs[,i] ~ 1) anova(lm.complet,lm.reduit) pval[i] = anova(lm.complet,lm.reduit)[2, "Pr(>F)"] } hist(pval) procs = c("Bonferroni", "BH") tmp = mt.rawp2adjp(pval, procs) help(mt.rawp2adjp) adjp = tmp$adjp[order(tmp$index), ] res.adjp = round(adjp, 2) dimnames(res.adjp)[[1]] = names(Exprs) mt.plot(cbind(pval,adjp[, procs]), statt,plottype = "pvsr", proc = c("pval",procs),lty = c(1,2,3), col = c("black", "blue", "red"), leg=c(50,0.8),lwd=2) abline(h=c(0.01,0.05,0.1),lty=c(1,2,3)) listgenes= names(Exprs)[res.adjp[,"BH"]<0.05] pval.interaction=rep(NA,120) pval.regime=rep(NA,120) pval.genotype=rep(NA,120) for (gene in listgenes) { i=match(gene,names(Exprs)) lm.complet = lm(Exprs[,i]~ genotype+ regime+ genotype:regime) lm.additif = lm(Exprs[,i] ~ genotype+ regime) lm.genotype=lm(Exprs[,i] ~ genotype) lm.regime =lm(Exprs[,i] ~ regime) pval.interaction[i]=anova(lm.complet,lm.additif)[2, "Pr(>F)"] pval.regime[i]=anova(lm.additif,lm.regime)[2, "Pr(>F)"] pval.genotype[i]=anova(lm.additif,lm.genotype)[2, "Pr(>F)"] } list1 = nomgene[pval.interaction < 0.05] list1 = list1[!is.na(list1)] list1 list2 = nomgene[(pval.interaction > 0.05) & (pval.regime <0.05) & (pval.genotype <0.05)] list2 = list2[!is.na(list2)] list2 list3 = nomgene[(pval.interaction > 0.05) & (pval.regime >0.05) & (pval.genotype <0.05)] list3 = list3[!is.na(list3)] list3 list4 = nomgene[(pval.interaction > 0.05) & (pval.regime <0.05) & (pval.genotype >0.05)] list4 = list4[!is.na(list4)] list4 pval.total = data.frame(nomgene=nomgene,fdr=res.adjp[,"BH"], interaction=round(pval.interaction,5), regime=round(pval.regime,5),genotype=round(pval.genotype,5)) pval.total ### 8. Compléments ## 8.1 Analyse factorielle discriminante dat=data.frame(Exprs,regime=regime) acp=prcomp(dat[21:40,1:119]) plot(acp$x,type="n") text(acp$x[,1:2],as.character(dat[21:40,120])) library(MASS) afd = lda(dat[21:40,1:119],dat$regime[21:40]) plot(afd) plot(afd,dimen=2) afd.select=lda(dat[21:40,c(14,26:32,46,47,58,76,89,95,108)],dat$regime[21:40]) plot(afd.select) plot(afd.select,dimen=2) abline(h=0,v=0,lty=2) plot(afd.select$scaling,type="n") text(afd.select$sca,rownames(afd.select$sca),cex=0.75) abline(h=0,v=0,lty=2) ## 8.2 Analyse canonique Exprs <- read.table("genes.csv",sep=",") lipides <- read.table("lipides.csv",sep=",") genotype=as.factor(lipides[,23]) regime=as.factor(lipides[,22]) lipides=lipides[,1 :21] Exprs.ac <- c("CAR1","BIEN","CYP3A11","CYP4A10","CYP4A14","AOX","THIOL","CYP2c29","S14","GSTpi2") X = Exprs[,Exprs.ac] Y = lipides library(CCA) res.cc=cc(X,Y) plt.cc(res.cc) plt.cc(res.cc,type="v",var.label=TRUE) plt.cc(res.cc,var.label=T,ind.names=paste(genotype,regime,sep="-"))