# Demo showing some of R's functionality in comparison with Matlab's
# statistics toolbox; see statistics-demo.m
#
# file: statistics-demo.R  (Peter Harmand, 13.12.08)
# -------------------------------------------------------------------------

# -------------------------------------------------------------------------
# Program and documentation
# -------------------------------------------------------------------------
# The official homepage is
#   http://www.r-project.org/
#
# Official list of documentation
#   http://cran.r-project.org/other-docs.html
#
# My recommendations:
#
# "The R Guide" by Jason Owen
#   My main recommendation for my second-semester students;
#   basic R and basic statistics on less than 60 pages
#   http://cran.r-project.org/doc/contrib/Owen-TheRGuide.pdf
# "Simple R" by John Verzani
#   More on R and statistics.
#   There is a 400-page book which grew out of this public domain text:
#   "Using R for introductory statistics"
#   Bibliothek Standort Wechloy: inf 640 CO 2231
#   http://cran.r-project.org/doc/contrib/Verzani-SimpleR.pdf
# The official introduction
#   [R directory]\doc\manual\R-intro.pdf
#   If you know something about programming and statistics, you should
#   start here.
# "An Introduction to R" by Petra Kuhnert and Bill Venables
#   More than 350 pages; also advanced material; problem oriented
#   http://cran.r-project.org/doc/contrib/Kuhnert+Venables-R_Course_Notes.zip
# A 4-page summary with the most important commands:
#   "R reference card" by Tom Short
#   http://cran.r-project.org/doc/contrib/Short-refcard.pdf
#
#
# Books
#
# There is a large number of books on R. I mention only:
#
# "Introductory statistics with R" by Peter Dalgaard
#   Very good introduction.
#   Bibliothek Standort Wechloy: 4 mat 710 CP 0743
# "Modern applied statistics with S" by W. N. Venables and B. D. Ripley
#   After half a year of R and one year of statistics you will enjoy it.
#   Bibliothek Standort Wechloy: 4 inf 630 CJ 1178,4
# "Programmieren mit R" by Uwe Ligges
#   Good for all programming aspects; in German.
#   Bibliothek Standort Wechloy: 4 inf 340 r CP 9494
#
# Archive of the R-help user group
#   http://tolstoy.newcastle.edu.au/R/
#   !!! Don't post questions if you haven't studied *all* the manuals for
#   several days, or if you don't understand the mathematics/statistics.
# A German R user group
#   http://r-statistik.foren-city.de/
#
# Link lists
#   http://www.uni-koeln.de/themen/statistik/software/s/

# -------------------------------------------------------------------------
# Interface, invoking and using R
# -------------------------------------------------------------------------
# *** Explain basic usage (editor, command window, ...)
#     Working directory (setwd, getwd)
# Other interfaces:
#
# R-Commander (a taste of the look and feel of classical statistics
#   programs: selection of variables, tests and graphics by clicking
#   with the mouse)
#   http://socserv.mcmaster.ca/jfox/Misc/Rcmdr/
# Tinn-R (editor with syntax highlighting, additional menus and a
#   workspace browser)
#   http://www.sciviews.org/Tinn-R/

# -------------------------------------------------------------------------
# Basic operations
# -------------------------------------------------------------------------

# Define variables
a = 5    # new way
b <- 6   # old way
a; b
(c = 7)  # parentheses print the result

# vectors
x = c(1,2,3,4)
y = c(1,0,3,-2)
x+y
x*y
# !! Attention: R adds vectors of different lengths by recycling
#    the shorter one
y+1:5
y+1:8
length(x)   # length
sum(x)      # sum
cumsum(x)   # cumulative sum
mean(x)     # mean
x^2
2^x

# indexing
(x = -3:10)
x[1]
x[2:4]
x>2
which(x>2)
x[x>2]

# function arguments by order or by name
seq(0,1,0.1)
seq(to=1, by=0.1, from=0)
seq(0,1,length=11)

# matrices
A = matrix(c(1,2,-4, 3,9,-2, 4,12,0), nrow=3, ncol=3, byrow=TRUE)
A
A[1:9]
A[,1]    # first column
dim(A)
# add a column to the right of A
A = cbind(A, c(1,1,2)); A
# reshape A (column-wise) into a 4*3 matrix
dim(A) = c(4,3); A
# add rownames (the reshaped matrix has four rows, so four names are needed)
rownames(A) = c("Zeile 1","Zeile 2","Zeile 3","Zeile 4"); A
# addressing a row by name
A["Zeile 2",]

# -------------------------------------------------------------------------
# Data input
# -------------------------------------------------------------------------
# *** Explain: no native Excel support, but csv
#     data.frame
#     Several other data formats (SPSS, ...)
#     Also low-level file input

# set the working directory
setwd("d:/home/r.wrk/work")
setwd("g:/temp")

# read a csv file generated by German Excel
# (; instead of , as field separator and , instead of . as decimal separator)
Data = read.csv2("klausur0506.csv", header=TRUE)
# check
Data
# try also
#fix(Data)
# the variables are accessible in the following form
Data[,1]
Data$Punkte
# make the variables accessible by their names
attach(Data)
# notice the difference: Punkte is just a numeric vector,
# Studiengang is classified as a factor with 4 levels
Punkte
Studiengang
# a compact form with all attributes is always given by str(..)
str(Punkte)
str(Studiengang)
str(Data)
# compare with
attributes(Data)

# -------------------------------------------------------------------------
# Simple summary statistics
# -------------------------------------------------------------------------
# trivial: mean, variance, ...
mean(Punkte); var(Punkte)
# more interesting: summary (works with most R objects and gives the most
# important aspects, depending on the type of the object)
summary(Punkte)
summary(Studiengang)
# group the variable Punkte according to Studiengang
(Punkte.G = split(Punkte, Studiengang))
# mean for the four subgroups
sapply(Punkte.G, mean)

# -------------------------------------------------------------------------
# Simple statistical graphics
# -------------------------------------------------------------------------
# *** Explain: graphics system
#     R's oddity with the qq-plot axes (see below)
#     In comparison with Matlab: no GUI support, no animation, ...
#     but OpenGL support will come
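# (Added sketch, not part of the original demo.)  Plots go to the active
# graphics device; to save a figure to a file, open a file device such as
# png() first and close it with dev.off() when done.  The file name below
# is just an example.
png("punkte-histogramm.png", width=600, height=400)
hist(Punkte)
dev.off()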
### histogram
# the default
hist(Punkte)
# change the appearance
hist(Punkte, main="Klausurergebnis",   # set the title
     ylab="Anzahl",                    # label the y-axis
     col="LightBlue")                  # change the color of the bars
# Change the y scale to density and add the normal approximation
# (Not a good visual test for normal distribution; better use a qq-plot)
hist(Punkte, freq=FALSE, main="Klausurergebnis mit Normalapproximation",
     ylab="Dichte")
curve(dnorm(x, mean=mean(Punkte), sd=sd(Punkte)), col="red", add=TRUE)

### barplot
# plot the counts of Studiengang
barplot(table(Studiengang))

### boxplot
# four boxplots to compare the groups (also good for detecting outliers)
boxplot(Punkte ~ Studiengang)

### scatterplot
# A simulation to compute pi:
# n random numbers in the interval [0,1]
n = 200
x = runif(n)
y = runif(n)
# check which points are inside the unit circle; logical vector
inside = x^2+y^2 < 1
# approximation for pi
4*sum(inside)/n
# Plot: color and plot symbol according to the vector inside; it is
# automatically converted to 0/1
plot(x, y, col=inside+2, pch=inside+16, asp=1)
# add part of the circle
curve(sqrt(1-x^2), 0, 1, add=TRUE)

### qq-plot
qqnorm(Punkte)
qqline(Punkte)   # additional reference line
# there are reasons for this choice of axes, however
# almost all other statistics programs graph it the other way around
qqnorm(Punkte, datax=TRUE)
qqline(Punkte, datax=TRUE)

# -------------------------------------------------------------------------
# Simple statistical analysis
# -------------------------------------------------------------------------

### Linear regression
# An example from Quinn-Keough, chap. 5
Data = read.csv2("christ.csv", header=TRUE)
# no attach, declare the variables individually
CWD = Data[,7]
ripdens = Data[,4]
# Simplest form of linear regression
erg = lm(CWD ~ ripdens)
summary(erg)
# Plot of the data and the regression line
plot(ripdens, CWD, col="red", pch=16)
abline(erg, col="blue")
# Look at what R has computed
str(erg)
# For the most important things there are extractor functions, e.g.
coef(erg)
# Plot of the residuals against the fitted values
plot(fitted(erg), residuals(erg))

### two-sample t-test
# Another example from Quinn-Keough
Data = read.csv("ward.csv", header=TRUE)
attach(Data)
# Take a look at the data:
# similar distribution, almost the same variance, significant difference
# of the mean/median
boxplot(EGGS ~ ZONE)
eggs.m = EGGS[ZONE == "Mussel"]
eggs.l = EGGS[ZONE == "Littor"]
# two-sample t-test with equal variances
t.test(eggs.m, eggs.l, var.equal=T)   # highly significant
# Another, shorter form of input
t.test(EGGS ~ ZONE, var.equal=T)
# two-sample t-test with different variances (Welch test)
t.test(eggs.l, eggs.m)

### chi^2 test for independence
# For this test chisq.test requires the input as a matrix, data.frame or table
M <- matrix(c(25,47,12,42), ncol=2, nrow=2, byrow=T,
            dimnames=list(c("mit LAK","ohne LAK"), c("Rück","keine Rück")))
# function to print a pretty table with row and column sums
mit.rand <- function(tab){
  Tab = cbind(tab, rowSums(tab))
  Tab = rbind(Tab, colSums(Tab))
  colnames(Tab) = c(colnames(tab), "Gesamt")
  rownames(Tab) = c(rownames(tab), "Gesamt")
  names(dimnames(Tab)) = names(dimnames(tab))
  Tab
}
mit.rand(M)
chisq.test(M, correct=F)

### Test for normality
# The Kolmogorov-Smirnov test requires the reference probability distribution
# to be known.  Estimating the parameters from the sample or using the
# standardised sample loses power.
# The same holds for the Lilliefors correction of this bias ("there is no
# need to have Lilliefors test in R, except for archeological interest").
# A good modern test is Shapiro-Wilk.
shapiro.test(Punkte)

# -------------------------------------------------------------------------
# Some more advanced topics
# -------------------------------------------------------------------------

### exploratory analysis of multivariate data
# scatterplot matrix, smoother
# Example from Rencher
Data = read.csv2("citycrime.csv", header=TRUE, row.names=1)
# Matrix of scatter plots
# The variant with a smoother and with the correlation coefficients printed
# as numbers scaled by their value; from the help page for pairs
panel.cor <- function(x, y, digits=2, prefix="", cex.cor)
{
  usr <- par("usr"); on.exit(par(usr))
  par(usr = c(0, 1, 0, 1))
  r <- abs(cor(x, y))
  txt <- format(c(r, 0.123456789), digits=digits)[1]
  txt <- paste(prefix, txt, sep="")
  if(missing(cex.cor)) cex <- 0.8/strwidth(txt)
  text(0.5, 0.5, txt, cex=cex*r)
}
pairs(Data, lower.panel=panel.cor, upper.panel=panel.smooth)

### principal component analysis
erg = prcomp(Data)
biplot(erg)
# scale the observations by the singular values
biplot(erg, scale=0, main="scale=0")

### logistic regression as a generalised linear model
# First example from the book of Hosmer-Lemeshow
Data <- read.csv2("chdage.csv", na.strings=".")
X <- Data$AGE
Y <- Data$CHD
lab.X <- "Alter"
lab.Y <- "Wahrscheinlichkeit für koronare Herzkrankheit"
#anz.g <- 10
#grenzen <- quantile(X, probs=0:anz.g/anz.g, na.rm=T)
grenzen <- c(20,29,34,39,44,49,54,59,69)
anz.g <- length(grenzen) - 1
X.gruppiert <- cut(X, grenzen, include.lowest=T)
table(X.gruppiert, Y)
Y.g <- split(Y, X.gruppiert)
mittel.Y.g <- sapply(Y.g, mean)
mittel.X.g <- grenzen[-(anz.g+1)] + diff(grenzen)/2
plot(X, Y, xlab=lab.X, ylab=lab.Y)
points(mittel.X.g, mittel.Y.g, col="red", pch=15)
erg <- glm(Y ~ X, family=binomial)
summary(erg)
curve(1/(1 + exp(-coef(erg)[1] - coef(erg)[2]*x)), min(X), max(X),
      add=T, col="blue")

# -------------------------------------------------------------------------
# Basic programming
# -------------------------------------------------------------------------
x = numeric(41)   # empty vector of length 41 for numeric entries
# Attention: the initial value x_0 has index 1, x_40 is in x[41]
x[1] = 0.1        # set the initial value
r = 3.5           # set the parameter
for (k in 2:41) x[k] = r*x[k-1]*(1 - x[k-1])
# Plot of the sequence as lines with points
plot(0:40, x, type="l", col="red")
points(0:40, x, col="green3", pch=15)

### cobweb
X = c(rep(x[1:40], each=2), x[41])
Y = c(0, rep(x[2:41], each=2))
# Plot of the polygonal path, the graph of the function and the diagonal;
# equal scaling of the axes with asp=1.
# The line g(x) = x cannot be plotted with curve using just x - one needs 1*x
plot(X, Y, type="l", col="red", xlim=c(0,1), ylim=c(0,1), asp=1)
curve(r*x*(1-x), 0, 1, col="blue", add=T)
curve(1*x, 0, 1, add=T)

# Attention: no 'end' in for and if-else; use { } for multiple commands
# Attention: a statement is executed as soon as it is syntactically complete
p = 0.03
if(p<0.05) print("Hooray!") else print("Bah.")   # works on one line
if(p<0.05) print("Hooray!")
else print("Bah.")        # error: the if statement above is already complete
if (p<0.05) {
  print("Hooray!")
} else {
  print("Bah.")
}
# this use is ok
x = seq(0, 3, length=91)
y = numeric(length(x))
for (k in 1:length(x)) {
  if (x[k] <= 1) y[k] = 2*x[k]^2
  else if (x[k] <= 2) y[k] = -x[k] + 3
  else y[k] = 2*(x[k]-2)^2 + 1
}
plot(x, y, type="l")
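# (Added sketch, not part of the original demo.)  The same piecewise function
# can be computed without an explicit loop using the vectorized ifelse():
y2 = ifelse(x <= 1, 2*x^2,
     ifelse(x <= 2, -x + 3, 2*(x-2)^2 + 1))
all.equal(y, y2)   # should be TRUE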
# -------------------------------------------------------------------------
# Some mathematics (speed and precision)
# -------------------------------------------------------------------------
# Compute 100 times the median of 10000 numbers uniformly distributed in [0,1]
# This mainly tests the speed of sorting
system.time(for(i in 1:100) median(runif(10000)))
# Compute the eigenvalues and eigenvectors of a 200 by 200 matrix with
# random integer entries from -10..10
M = matrix(sample(-10:10, 40000, replace=TRUE), nrow=200, ncol=200)
system.time(eigen(M))
# A not so simple 3*3 matrix
A = matrix(21:29, 3, 3); A
# The determinant is not zero for R
det(A)
# R refuses to compute the inverse;
# it gives the same condition number as Matlab.
# By setting the option tol one can force R to compute a result
solve(A)
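# (Added sketch, not part of the original demo.)  Lowering the tolerance for
# detecting linear dependence forces solve() to return a (numerically
# meaningless) result; the value 1e-21 is just an example chosen below the
# reciprocal condition number reported by the error message above.
solve(A, tol=1e-21)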