# Demo showing some of R's functionality in comparison with Matlab's
# statistics toolbox; see statistics-demo.m
#
# file: statistics-demo.R (Peter Harmand, 13.12.08)
# -------------------------------------------------------------------------


# -------------------------------------------------------------------------
# Program and documentation
# -------------------------------------------------------------------------

# The official homepage is
#    http://www.r-project.org/
#
# Official list of documentation
#    http://cran.r-project.org/other-docs.html
#
# My recommendations:
#
# "The R Guide" by Jason Owen
#    My main recommendation for my second semester students;
#    Basic R and basic statistics on less than 60 pages
#    http://cran.r-project.org/doc/contrib/Owen-TheRGuide.pdf
# "Simple R" by John Verzani
#    More on R and statistics.
#    There is a 400-page book which grew out of this public domain text
#    "Using R for introductory statistics"
#    Library (Wechloy campus): inf 640 CO 2231
#    http://cran.r-project.org/doc/contrib/Verzani-SimpleR.pdf
# The official introduction
#    [R directory]\doc\manual\R-intro.pdf
#    If you know something about programming and statistics, you should start
#    here.
# "An Introduction to R" by Petra Kuhnert and Bill Venables
#    More than 350 pages; also advanced material; problem oriented
#    http://cran.r-project.org/doc/contrib/Kuhnert+Venables-R_Course_Notes.zip
# A 4-page summary with the most important commands
#    "R reference card"  by Tom Short
#    http://cran.r-project.org/doc/contrib/Short-refcard.pdf
#
#
# Books
#
# There is a large number of books on R. I mention only:
#
# "Introductory statistics with R" by Peter Dalgaard
#    Very good introduction.
#    Library (Wechloy campus): 4 mat 710 CP 0743
# "Modern applied statistics with S" by W. N. Venables and B. D. Ripley
#    After half a year of R and one year of statistics you will enjoy it.
#    Library (Wechloy campus): 4 inf 630 CJ 1178,4
# "Programmieren mit R" by Uwe Ligges
#    Good for all programming aspects; in German.
#    Library (Wechloy campus): 4 inf 340 r CP 9494
#
# Archive of the R-help user group
#    http://tolstoy.newcastle.edu.au/R/
#    !!! Don't post questions if you haven't spent several days reading *all*
#    the manuals, or if you don't understand the mathematics/statistics
# A German R-user group
#    http://r-statistik.foren-city.de/
#
# Link lists
#    http://www.uni-koeln.de/themen/statistik/software/s/


# -------------------------------------------------------------------------
# Interface, invoking and using R
# -------------------------------------------------------------------------

# *** Explain basic usage (editor, command window, ...)
# Working directory (setwd, getwd)
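
# The corresponding commands (the path below is only a placeholder):
getwd()                              # show the current working directory
# setwd("d:/path/to/your/project")   # change it; adjust the path first
help(mean)                           # help page for a function
?mean                                # short form
example(mean)                        # run the examples from the help page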

# Other interfaces:
#
# R-Commander (A taste of the look-and-feel of classical statistics
#    programs: selection of variables, tests and graphics by clicking
#    with the mouse)
#    http://socserv.mcmaster.ca/jfox/Misc/Rcmdr/
# Tinn-R (editor with syntax highlighting, additional menus and a
#    workspace browser)
#    http://www.sciviews.org/Tinn-R/


# -------------------------------------------------------------------------
# Basic operations
# -------------------------------------------------------------------------

# Define variables
a = 5		# new way
b <- 6	# old way
a; b
(c = 7)	# parentheses around the assignment print the result

# vectors
x = c(1,2,3,4)
y = c(1,0,3,-2)
x+y		
x*y

# !! Attention: R adds vectors of different lengths by recycling the
#               shorter one (with a warning if the lengths are not multiples)
y+1:5
y+1:8

length(x)	# length
sum(x)	# sum
cumsum(x)	# cumulative sum
mean(x)	# mean
x^2
2^x

# indexing
(x = -3:10)
x[1]
x[2:4]

x>2
which(x>2)
x[x>2]

# function arguments by order or by name
seq(0,1,0.1)
seq(to=1, by=0.1, from=0)
seq(0,1,length=11)

# matrices
A = matrix(c(1,2,-4,
             3,9,-2,
             4,12,0), nrow=3, ncol=3, byrow=TRUE)
A
A[1:9]
A[,1]	# first column
dim(A)
# add a column to the right of A
A = cbind(A, c(1,1,2)); A
## reshape A column-wise into a 4*3 matrix
dim(A) = c(4,3); A
# add rownames (A now has 4 rows)
rownames(A) = c("Zeile 1","Zeile 2","Zeile 3","Zeile 4"); A
# addressing row by name
A["Zeile 2",]


# -------------------------------------------------------------------------
# Data input
# -------------------------------------------------------------------------

# *** Explain: no native Excel support, but csv
# data.frame
# Several other data formats (SPSS,...)
# Also low level file input
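
# Sketch of these input routes (the file names are placeholders; the
# 'foreign' package must be installed for the SPSS example):
# dat <- read.csv("data.csv", header=TRUE)          # plain csv (US format)
# library(foreign)
# dat <- read.spss("data.sav", to.data.frame=TRUE)  # SPSS file
# lines <- readLines("data.txt")                    # low level: lines of text
# nums  <- scan("numbers.txt")                      # low level: numbers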

# set working directory
setwd("d:/home/r.wrk/work")
setwd("g:/temp")

# read csv-file generated by German Excel
# ';' instead of ',' as field separator and ',' instead of '.' as decimal separator
Data = read.csv2("klausur0506.csv", header=TRUE)

# check
Data
# try also
#fix(Data)

# variables are accessible in the following form
Data[,1]
Data$Punkte

# make the variables accessible by their names
attach(Data)

# notice the difference: Punkte is just a numeric vector,
# Studiengang is classified as a factor with 4 levels
Punkte
Studiengang

# a compact display of any object together with its attributes is given by str()
str(Punkte)
str(Studiengang)
str(Data)
# compare with
attributes(Data)


# -------------------------------------------------------------------------
# Simple summary statistics
# -------------------------------------------------------------------------

# trivial: mean, variance, ...
mean(Punkte); var(Punkte)

# more interesting: summary (works with most R objects and reports the most
# important aspects depending on the type of the object)
summary(Punkte)
summary(Studiengang)

# group the variable Punkte according to Studiengang
(Punkte.G = split(Punkte,Studiengang))

# mean for the four subgroups
sapply(Punkte.G, mean)
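
# the same grouped means in one step with tapply
tapply(Punkte, Studiengang, mean)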


# -------------------------------------------------------------------------
# Simple statistical graphics
# -------------------------------------------------------------------------

# *** Explain: graphics system
# R's oddity with the axes
# In comparison with Matlab: no GUI support, no animation, ...
# but OpenGL will come
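
# For full control over the axes: suppress the default axes and draw them
# yourself with axis()
plot(Punkte, axes=FALSE)
axis(1)            # x-axis at the bottom
axis(2, las=1)     # y-axis with horizontal tick labels
box()              # surrounding box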

### histogram
# the default
hist(Punkte)

# change the appearance
hist(Punkte, main="Klausurergebnis", # set the title
             ylab="Anzahl",          # label the y-axis
             col="LightBlue")        # change color of the bars

# Change y scale to density and add the normal approximation
# (Not a good visual test for normal distribution; better use qq-plot)
hist(Punkte, freq=FALSE, main="Klausurergebnis mit Normalapproximation",
   ylab="Dichte")
curve(dnorm(x, mean=mean(Punkte), sd=sd(Punkte)), col="red", add = TRUE)


### barplot
# plot the counts of Studiengang
barplot(table(Studiengang))


### boxplot
# four boxplots to compare the groups (also good for detecting outliers)
boxplot(Punkte ~ Studiengang)


### scatterplot
# A simulation to compute pi:
# n random points in the unit square [0,1] x [0,1]
n = 200
x = runif(n)
y = runif(n)

# check which points are inside the unit circle; logical vector
inside = x^2+y^2<1
# approximation for pi
4*sum(inside)/n
# Plot: colour and plotting symbol according to the logical vector 'inside';
# in the arithmetic it is automatically coerced to 0/1
plot(x,y, col=inside+2, pch=inside+16, asp=1)	
# add part of the circle
curve(sqrt(1-x^2), 0, 1, add = TRUE)


### qq-plot
qqnorm(Punkte)
qqline(Punkte) # add the reference line

# there are reasons for this choice of axes, however
# almost all other statistics programs graph it the other way
qqnorm(Punkte, datax = TRUE)
qqline(Punkte, datax = TRUE)


# -------------------------------------------------------------------------
# Simple statistical analysis
# -------------------------------------------------------------------------

### Linear regression

# One example from Quinn-Keough, chap. 5
Data = read.csv2("christ.csv", header=TRUE)

# no attach, declare variables individually
CWD = Data[,7]
ripdens = Data[,4]

# Simplest form of linear regression
erg = lm(CWD ~ ripdens)
summary(erg)

# Plot of the data and the regression line
plot(ripdens,CWD, col="red", pch=16)
abline(erg, col="blue")

# Look what R has computed
str(erg)

# For the most important things there are extractor functions, e.g.
coef(erg)
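
# a few more of the standard extractor functions for lm objects
confint(erg)                          # confidence intervals for the coefficients
anova(erg)                            # analysis-of-variance table
predict(erg, interval="confidence")   # fitted values with confidence limits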

# Plot of the residuals against the fitted values
plot(fitted(erg), residuals(erg))


### two sample t-test

# Another example from Quinn-Keough
Data = read.csv("ward.csv", header=TRUE)
attach(Data)

# Take a look at the data:
# similar distribution, almost the same variance, significant difference
# of the mean/median
boxplot(EGGS ~ ZONE)

eggs.m = EGGS[ZONE == "Mussel"]
eggs.l = EGGS[ZONE == "Littor"]

# two-sample t-test with equal variances
t.test(eggs.m, eggs.l, var.equal=T)
# highly significant

# A shorter form of the call using the formula interface
t.test(EGGS ~ ZONE, var.equal=T)

# two-sample t-test with unequal variances (Welch test)
t.test(eggs.l, eggs.m)


### chi^2-test for independence

# For this test chisq.test requires the input as a matrix, data.frame or table
M <- matrix(c(25,47,12,42), ncol=2, nrow=2, byrow=T,
       dimnames=list(c("mit LAK","ohne LAK"),c("Rück", "keine Rück")))
# function to print a pretty table with row and column sums
mit.rand <- function(tab){
  Tab = cbind(tab,rowSums(tab))
  Tab = rbind(Tab,colSums(Tab))
  colnames(Tab) = c(colnames(tab),"Gesamt")
  rownames(Tab) = c(rownames(tab),"Gesamt")
  names(dimnames(Tab)) = names(dimnames(tab))
  Tab
}
mit.rand(M)
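
# base R provides the same table with margins via addmargins()
addmargins(M)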
chisq.test(M, correct=F)


### Test for normality

# The Kolmogorov-Smirnov test requires the reference probability distribution
# to be known. Estimating the parameters from the sample or using the
# standardised sample loses power. The same holds for the Lilliefors
# correction of this bias ("there is no need to have Lilliefors test in R,
# except for archeological interest"). A good modern test is Shapiro-Wilk.
shapiro.test(Punkte)
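
# For comparison: the Kolmogorov-Smirnov test with parameters estimated from
# the sample -- exactly the questionable use described above
ks.test(Punkte, "pnorm", mean=mean(Punkte), sd=sd(Punkte))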


# -------------------------------------------------------------------------
# Some more advanced topics
# -------------------------------------------------------------------------

### exploratory analysis of multivariate data
#   scatterplot matrix, smoother

# Example from Rencher
Data = read.csv2("citycrime.csv", header=TRUE, row.names=1)

# Matrix of scatter plots
# The variant with a smoother in the upper panels and the correlation
# coefficients, scaled by their value, in the lower panels; from the help for pairs
panel.cor <- function(x, y, digits=2, prefix="", cex.cor)
{
    usr <- par("usr"); on.exit(par(usr))
    par(usr = c(0, 1, 0, 1))
    r <- abs(cor(x, y))
    txt <- format(c(r, 0.123456789), digits=digits)[1]
    txt <- paste(prefix, txt, sep="")
    if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
    text(0.5, 0.5, txt, cex=cex.cor*r)
}
pairs(Data, lower.panel=panel.cor, upper.panel=panel.smooth)


### principal component analysis

erg = prcomp(Data)
biplot(erg)

#	scale=0: the observations are scaled by the singular values
biplot(erg, scale=0, main="scale=0")
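
# further standard views of a prcomp object: variance proportions and scree plot
summary(erg)
screeplot(erg, type="lines")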


### logistic regression as a generalised linear model

# First example from the book of Hosmer-Lemeshow
Data <- read.csv2("chdage.csv", na.strings=".")

X <- Data$AGE
Y <- Data$CHD
lab.X <- "Alter"
lab.Y <- "Wahrscheinlichkeit für koronare Herzkrankheit"
#anz.g <- 10

#grenzen <- quantile(X, probs=0:anz.g/anz.g, na.rm = T)
grenzen <- c(20,29,34,39,44,49,54,59,69)
anz.g <- length(grenzen) - 1
X.gruppiert <- cut(X, grenzen, include.lowest=T)
table(X.gruppiert, Y)
Y.g <- split(Y, X.gruppiert)
mittel.Y.g <- sapply(Y.g, mean)
mittel.X.g <- grenzen[-(anz.g+1)] + diff(grenzen)/2
plot(X, Y, xlab=lab.X, ylab=lab.Y)
points(mittel.X.g, mittel.Y.g, col="red", pch=15)

erg <- glm(Y ~ X, family = binomial)
summary(erg)

curve(1/(1 + exp(-coef(erg)[1] - coef(erg)[2]*x)), min(X), max(X),
      add=T, col="blue")
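
# The same curve via predict() instead of writing the logistic function by hand
xs <- seq(min(X, na.rm=TRUE), max(X, na.rm=TRUE), length=100)
lines(xs, predict(erg, newdata=data.frame(X=xs), type="response"),
      col="green3", lty=2)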



# -------------------------------------------------------------------------
# Basic programming
# -------------------------------------------------------------------------

x = numeric(41)  # empty vector of length 41 for numeric entries
                 # Attention: initial value x_0 has index 1, x_40 is in x[41]
x[1] = 0.1	     # set initial value  	
r = 3.5          # set parameter

for (k in 2:41)
   x[k] = r*x[k-1]*(1 - x[k-1])

# Plot of the sequence as lines with points
plot(0:40,x, type="l", col="red")
points(0:40,x, col="green3", pch=15)

### cobweb


X = c(rep(x[1:40],each=2), x[41])
Y = c(0,rep(x[2:41],each=2))
# Plot of the cobweb path, the graph of the function and the line y = x
# Equal scaling of the axes with asp=1
# The line g(x) = x cannot be plotted with curve(x, ...) - one needs curve(1*x, ...)
plot(X,Y, type="l", col="red", xlim=c(0,1), ylim=c(0,1), asp=1)
curve(r*x*(1-x), 0, 1, col="blue", add=T)
curve(1*x, 0, 1, add=T)

# Attention: there is no 'end' for 'for' and 'if-else' (unlike Matlab);
#            use { } for multiple commands
# Attention: at the top level a statement is executed as soon as it is
#            syntactically complete, so the second form below, with 'else'
#            starting a new line, fails with a syntax error
p = 0.03

if(p<0.05) print("Hooray!") else print("Bah.")

if(p<0.05)
   print("Hooray!")
else
   print("Bah.")

if (p<0.05) {
   print("Hooray!")
} else {
   print("Bah.")
}

# inside a { } block this use is ok: the parser reads on until the block is closed
x = seq(0,3,length=91)
y = numeric(length(x))
for (k in 1:length(x)) {
   if (x[k] <= 1)
      y[k] = 2*x[k]^2
   else
      if (x[k] <= 2)
         y[k] = -x[k] + 3
      else
         y[k] = 2*(x[k]-2)^2  + 1
}
plot(x,y,type="l")
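
# the same piecewise function without a loop, using the vectorised ifelse()
y2 = ifelse(x <= 1, 2*x^2,
     ifelse(x <= 2, -x + 3, 2*(x-2)^2 + 1))
lines(x, y2, col="red", lty=2)   # lies on top of the curve just drawn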


# -------------------------------------------------------------------------
# Some mathematics (speed and precision)
# -------------------------------------------------------------------------

# Compute 100 times the median of 10000 numbers uniformly distributed in [0,1]
# This tests mainly the speed of sorting
system.time(for(i in 1:100) median(runif(10000)))

# Compute the eigenvalues and eigenvectors of a 200 by 200 matrix with
# random integer entries from -10..10
M = matrix(sample(-10:10, 40000, replace=TRUE), nr=200, nc=200)
system.time(eigen(M))

# A not so simple 3*3-matrix
A = matrix(21:29,3,3); A
# The matrix is singular, but R returns a tiny nonzero determinant
det(A)
# R refuses to compute the inverse:
#   the error message reports the same (reciprocal) condition number as Matlab;
#   lowering the argument 'tol' forces R to return a result anyway
solve(A)
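# as noted above, lowering 'tol' forces a (numerically meaningless) result
solve(A, tol=1e-20)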