R

= Resources = >> RStudio on nodep (JHU Intranet Access Only)
 * R Help
 * R Coding Conventions
 * R Basic
 * Programming in R
 * Statistics with R
 * A short course
 * Statistical Computing in R
 * R by example

= Packages = R CMD INSTALL packagename   # installs a package. R CMD check packagename        # runs the QA tools on the package. R CMD build packagename         # creates a package file. Listing Libraries > library() Listing R Data Sets > data() Install a new package to your computer: Method 1: Install from source $ R CMD INSTALL mypkg -l /my/own/R-packages/ Method 2: Install from CRAN directly > install.packages("mypkg", lib="/my/own/R-packages/") Test if a package is installed

is.installed <- function(mypkg) is.element(mypkg, installed.packages()[,1]) > is.installed('MASS') [1] TRUE > is.installed('base') [1] TRUE > is.installed('ROCR') [1] FALSE > is.installed <- function(mypkg) is.element(mypkg, installed.packages()[,1]) > if (!( is.installed('CompQuadForm') )) install.packages("CompQuadForm")
 * 1) defining a function

= Data Structures and Data Types =

Vectors
"c" stands for "concatenate". Several ways to define them

> c(1,2,3,4,5) > 1:5 > seq(1, 5, by=1) > seq(1, 5, length=5)

Factors
A factor is a vector coding for a qualitative (non-numeric) variable

> x <- factor( sample(c("Yes", "No", "Perhaps"), 5, replace=T) ) > l <- c("Yes", "No", "Perhaps") > x <- factor( sample(l, 5, replace=T), levels=l ) > levels(x)
 * 1) specify the list of acceptable values, or "levels" of this factor.

> table(x)
 * 1) One can summarize a factor with a contingency table

Create a factor that follows a certain pattern with the "gl" command. > gl(1,4) > gl(2,4) > gl(2,4, labels=c(T,F)) > gl(2,1,8) > gl(2,1,8, labels=c(T,F))

The "interaction" command builds a new factor by concatenating the levels of two factors.

> x <- gl(2,4) > y <- gl(2,1,8) > interaction(x,y) > data.frame(x,y, int=interaction(x,y))

The "expand.grid" computes a cartesian product (and yields a data.frame). > x <- c("A", "B", "C") > y <- 1:2 > z <- c("a", "b") > expand.grid(x,y,z)

When playing with factors, people sometimes want to turn them into numbers. This can be ambiguous and/or dangerous.

> x <- factor(c(3,4,5,1)) > as.numeric(x) # Do NOT do that Missing values > x <- c(1,5,9,NA,2)

The default behaviour of many functions is to reject data containing missing values > mean(x) > mean(x, na.rm=T) > na.omit(x) attr(,"na.action") attr(,"class") > d <- data.frame(x, y=rev(x)) > na.omit(d)
 * 1) Ask R to first remove the missing values.
 * 1) You can do that yourself with the "na.omit" function.
 * 1) This also works with data.frames

Do NOT use missing values in boolean tests > x [1] 1  5  9 NA  2 > x == 5 [1] FALSE TRUE FALSE    NA FALSE > is.na(x) > x <- c(-1, 0,1,2,NA) > cbind(X=x, LogX=log(x))

Data Frames
List of vectors, each with the same length. > n <- 10 > df <- data.frame( x=rnorm(n), y=sample(c(T,F),n,replace=T) ) > str(df) > summary(df) > df > df$x > df[,1] > df[["x"]] > dim(df) > names(df) > row.names(df) Change the column/row names. > names(df) <- c("a", "b") > row.names(df) <- LETTERS[1:10] > names(df) > row.names(df) > str(df)
 * 1) The "str" command prints out the structure of an object
 * 1) The "summary" command prints concise information about an object

Turn the columns of the data.frame into actual variables with the "attach" command. Do not forget to "detach" the data.frame after use. > data(faithful) > str(faithful) > attach(faithful) > str(eruptions) > detach(faithful)

The "merge" command joins two data frames -- it is the same JOIN as in Databases. merge(a,b) # SELECT * FROM a INNER JOIN b ON a.y = b.y AND a.z = b.z merge(x, y, all.x = TRUE)   # LEFT JOIN merge(x, y, all.y = TRUE)  # RIGHT JOIN merge(x, y, all  = TRUE)   # OUTER JOIN By default, the join is over the columns present in both data frames, but you can restrict it to a subset of them, with the "by" argument. merge(a, b, by=c("y", "z"))

Data frames are often used to store data to be analyzed. We shall detail those examples later -- do not be frightened if you have never heard of "regression", we shall shortly demystify this notion. data(cars)   # load the "cars" data frame lm( dist ~ speed, data=cars)
 * 1) Regression

lm( dist ~ poly(speed,3), data=cars)
 * 1) Polynomial regression

library(Design) lm( y ~ rcs(x) )
 * 1) Regression with splines

glm(y ~ x1 + x2, family=binomial, data=...) library(Design) lrm(death ~ blood.presure + age)
 * 1) Logistic regression

nls( y ~ a + b * exp(c * x), start = c(a=1, b=1, c=-1) ) data(USArrests) princomp( ~ Murder + Assault + UrbanPop, data=USArrests)
 * 1) Non linear regression
 * 1) Principal Component Analysis

xyplot( x ~ y | group )
 * 1) Trellis graphics

d <- data.frame(...) as.matrix(d) data.matrix(d)

Lists
Vectors only contain simple types (numbers, booleans or strings); Lists, on the contrary, may contain anything, for instance data frames or other lists.

> h <- list() > h[["foo"]] <- 1 > h[["bar"]] <- c("a", "b", "c") > str(h)

You can access one element with the "[[" operator, you can access several elements with the "[" operator.

> h[["bar"]] > h[[2]] > h[1:2] > h[2]    # Beware, the result is not the second element, # but a list containing this second element. > str(h[2])

For instance, the graphic parameters are stored in a list, used as a hash table. > str( par() )

The result of most statistical analyses is not a simple number or array, but a list containing all the relevant values. > n <- 100 > x <- rnorm(n) > y <- 1 - 2 * x + rnorm(n) > r <- lm(y~x) > str(r) > str( summary(r) )

To delete an element from a list:

> h[["bar"]] <- NULL > str(h)

Matrices
Matrices are 2-dimensional tables, but contrary to data frames (whose type may vary from one column to the next), their elements all have the same type.

A matrix:

> m <- matrix( c(1,2,3,4), nrow=2 ) > m

Caution: by default, the elements of a matrix are given vertically, column after column.

> matrix( 1:3, nrow=3, ncol=3 ) > matrix( 1:3, nrow=3, ncol=3, byrow=T ) > t(matrix( 1:3, nrow=3, ncol=3 ))

Matrix product (beware: A * B is an element-by-element product):

> x <- matrix( c(6,7), nrow=2 ) > m %*% x

Determinant: > det(m)

Transpose: > t(m)

A diagonal matrix: > diag(c(1,2))

Identity matrix (or, more generally, a scalar matrix, i.e., the matrix of a homothety): > diag(1,2) > diag(rep(1,2)) > diag(2)

The "cbind" and "rbind" functions put data frames side by side or on top of each other. They also work with matrices. > cbind( c(1,2), c(3,4) ) > rbind( c(1,3), c(2,4) )

The trace of a matrix: > sum(diag(m))

The inverse of a matrix: > solve(m)

Actually, one rarely needs the inverse of a matrix -- we usually just want to multiply a given vector by this inverse: this operation is simpler, faster and numerically more stable. > solve(m, x) > solve(m) %*% x

Eigenvalues: > eigen(m)$values Eigenvectors: > eigen(m)$vectors

Matrices and arrays
"array" type, generalizes matrices in higher dimensions.

> d <- array(rnorm(3*3*2), dim=c(3,3,2)) > str(d)

Contingency tables are arrays > data(HairEyeColor) > HairEyeColor > str(HairEyeColor) > is.array(HairEyeColor)

Attributes
Attach meta-data to an object: these are called "attributes". For instance, names of the elements of a list are in an attribute.

> l <- list(a=1, b=2, c=3) > str(l) > attributes(l) > a <- data.frame(a=1:2, b=3:4) > str(a) > attributes(a) > a <- matrix(1:4, nr=2) > rownames(a) <- letters[1:2] > colnames(a) <- LETTERS[1:2] > str(a) > attributes(a) > data(HairEyeColor) > str(HairEyeColor)

It is also used to hold the code of a function if you want to keep the comments.

> f <- function (x) { +  # Useless function +  x + 1 + } > f function (x) { # Useless function x + 1 } > str(f) function (x) - attr(*, "source")= chr [1:4] "function (x) {" ... > attr(f, "source") <- NULL > str(f) function (x) > f function (x) {    x + 1 }

Operations of vectors and arrays
The "apply" function applies a function (mean, quartile, etc.) to each column or row of a data.frame, matrix or array.

> options(digits=4) > df <- data.frame(x=rnorm(20),y=rnorm(20),z=rnorm(20)) > apply(df,2,mean) > apply(df,2,range)

It also works in higher dimensions. The second argument indicates the indices along which the program should loop, i.e., the dimensions used to slice the data, i.e., the dimensions that will remain after the computation.

> options(digits=2) > m <- array(rnorm(10^3), dim=c(10,10,10)) > a <- apply(m, 1, mean) > b <- apply(m, c(1,2), mean) > apply(b, 1, mean)

The "tapply" function groups the observations along the value of one (or several) factors and applies a function (mean, etc.) to the resulting groups. The "by" command is similar.

> tapply(1:20, gl(2,10,20), sum) > by(1:20, gl(2,10,20), sum)

The "sapply" function applies a function to each element of a list (or vector, etc.) and returns, if possible, a vector. The "lapply" function is similar but returns a list.

> x <- list(a=rnorm(10), b=runif(100), c=rgamma(50,1)) > lapply(x,sd) > sapply(x,sd)

In particular, the "sapply" function can apply a function to each column of a data.frame without specifying the dimension numbers required by the "apply" command (at the beginning, you never know if it should be 1 or 2 and you end up trying both to retain the one whose result has the expected dimension).

The "split" command cuts the data, as the "tapply" function, but does not apply any function afterwards.

> str(InsectSprays) > str( split(InsectSprays$count, InsectSprays$spray)  ) > sapply( split(InsectSprays$count, InsectSprays$spray),  mean  ) > tapply( InsectSprays$count, InsectSprays$spray, mean )

Strings
R is not the best tool to process strings, but you sometimes have to do it. Strings are delimited by double or single quotes.

> "Hello" == 'Hello'

You do not print a string with the "print" function but with the "cat" function. The "print" function only gives you the representation of the string.

> print("Hello\n") [1] "Hello\n" > cat("Hello\n") Hello > s <- "C:\\Program Files\\" > print(s) [1] "C:\\Program Files\\" > cat(s, "\n") C:\Program Files\

You can concatenate strings with the "paste" function. To get the desired result, you may have to play with the "sep" argument.

> paste("Hello", "World", "!") > paste("Hello", "World", "!", sep="") > paste("Hello", " World", "!", sep="") > x <- 5 > paste("x=", x) > paste("x=", x, sep="")

The "cat" function also accepts a "sep" argument.

> cat("x=", x, "\n") > cat("x=", x, "\n", sep="")

Sometimes, you do not want to concatenate strings stored in different variables, but the elements of a vector of strings. If you want the result to be a single string, and not a vector of strings, you must add a "collapse" argument.

> s <- c("Hello", " ", "World", "!") > paste(s) > paste(s, sep="") > paste(s, collapse="")

In some circumstances, you can even need both (the "cat" function does not accept this "collapse" argument).

> s <- c("Hello", "World!") > paste(1:3, "Hello World!") > paste(1:3, "Hello World!", sep=":") > paste(1:3, "Hello World!", sep=":", collapse="\n") > cat(paste(1:3, "Hello World!", sep=":", collapse="\n"), "\n")

The "nchar" function gives the length of a string (I am often looking for a "strlen" function: there it is).

> nchar("Hello World!")

The "substring" function extracts part of a string (the second argument is the starting position, the third argument is the end position, inclusive).

> s <- "Hello World" > substring(s, 4, 6)

The "strsplit" function splits a string into chunks, at each occurrence of a given "string".

> s <- "foo, bar, baz" > strsplit(s, ", ") > s <- "foo-->bar-->baz" > strsplit(s, "-->")

Actually, it is not a string, but a regular expression.

> s <- "foo, bar, baz" > strsplit(s, ", *")

You can also use it to get the individual characters of a string.

> strsplit(s, "") > str(strsplit(s, ""))

The grep function looks for a "string" in a vector of strings.

> s <- apply(matrix(LETTERS[1:24], nr=4), 2, paste, collapse="") > s > grep("O", s) > grep("O", s, value=T) Actually, it does not look for a string, but for a regular expression.

The "regexpr" performs the same task as the "grep" function, but gives a different result: the position and length of the first match (or -1 if there is none)

> regexpr("o", "Hello") > regexpr("o", c("Hello", "World!")) > s <- c("Hello", "World!") > i <- regexpr("o", s) > attr(i, "match.length")

Sometimes, you want "approximate" matches, not exact matches, accounting for potential spelling or typing mistakes: the "agrep" function provides such a "fuzzy" matching. It is used by the "help.search" function.

> grep ("abc", c("abbc", "jdfja", "cba")) > agrep ("abc", c("abbc", "jdfja", "cba"))

The "gsub" function replaces each occurrence of a string (a regular expression, actually) by a string.

> s <- "foo bar baz" > gsub(" ", "", s)  # Remove all the spaces > s <- "foo bar   baz" > gsub(" ", "", s) > gsub(" ", " ", s) > gsub(" +", "", s) > gsub(" +", " ", s)  # Remove multiple spaces and replace them by single spaces

The "sub" is similar to "gsub" but only replaces the first occurrence.

> s <- "foo bar baz" > sub(" ", "", s)

Date
To convert a string into a Date object:

> as.Date("2005-05-15")

If you convert from an ambiguous format, you must specify the format:

> as.Date("15/05/2005", format="%d/%m/%Y") > as.Date("15/05/05", format="%d/%m/%y") > as.Date("01/02/03", format="%y/%m/%d") > as.Date("01/02/03", format="%y/%d/%m")

You can compute the difference between two dates -- it is a number of days.

> a <- as.Date("01/02/03", format="%y/%m/%d") > b <- as.Date("01/02/03", format="%y/%d/%m") > a - b

Today's date:

> Sys.Date()

You can add a Date and a number (a number of days).

> Sys.Date() + 21

You can format the date to produce one of those ambiguous formats your clients like.

> format(Sys.Date(), format="%d%m%y") > format(Sys.Date(), format="%A, %d %B %Y")

Miscellanies: match
?match x[as.logical( match(x, c(2,4), nomatch=0) )]
 * 1) Get the 2's and 4's

There are a few functions written with "match":

> setdiff function (x, y) unique(if (length(x) || length(y)) x[match(x, y, 0) == 0] else x) 

> match.fun("%in%") function (x, table) match(x, table, nomatch = 0) > 0 

> intersect function (x, y) unique(y[match(x, y, 0)]) 

> is.element function (el, set) match(el, set, 0) > 0 

> setequal function (x, y) all(c(match(x, y, 0) > 0, match(y, x, 0) > 0)) 

Several ways to select a part of a vector.

> x <- seq(-1, 1, by=.1) # create a vector > x > x[5:10] > x[c(5,7:10)] > x[-(5:10)]  # We remove the elements whose index lies between 5 and 10 > x>0 > x[ x>0 ]

Name the coordinates of a vector > names(x) NULL > names(x) <- letters[1:length(x)]   # "letters" is a vector of strings, containing 26 lower case letters. > x > x["r"]

One can also define those names while creating the vector. > c(a=1, b=5, c=10, d=7)

A few operations on vectors: > x <- rnorm(10) > sort(x) > rev(sort(x)) > o <- order(x) > x[ o[1:3] ] > x <- sample(1:5, 10, replace=T) > sort(x) > unique(x)   # No need to sort the data before

Other ways of creating vectors. The "seq" command generates arithmetic sequences. > seq(0,10, length=11) > seq(0,10, by=1)

The "rep" command repeats a number or a vector. > rep(1,10) > rep(1:5,3) > rep(1:5,each=3) > rep(1:5,2,each=3)

Text files
Read the data from the file into the data frame mytable mytable <- read.table("tablefile.txt", header=TRUE)

mycsv <- read.table("file1.csv", header=TRUE, sep=",") or mycsv <- read.csv("file1.csv") NA is the code for missing data. You cannot test temp==NA, so there is a function is.na.

If variable names aren’t included myfile <- read.table("file1.txt", col.names=c("id","name", "grade","age", "address")) or myfile <- read.table("file1.txt") names(myfile) <- c("id","name", "grade","age", "address")

c is a function that makes a single vector from its arguments. names is a function that accesses the variable names of a data frame Some functions (such as names) can be used on the LHS of an assignment.

Other datasets
Read and convert other formats: library(foreign) stata <- read.dta("salary.dta") spss <- read.spss("salary.sav", to.data.frame=TRUE) sasxport <- read.xport("salary.xpt")

Web pages
read.table can read from the web myfile<-read.table("http://my.jhu.edu/mp/data/file1.dat", header=TRUE)

Writing data
write(x, file = "data", ncolumns = if(is.character(x)) 1 else 5, append = FALSE, sep = " ") Write Table write.table(out$results, file = "SKAT1.out", col.names = TRUE, sep = "\t")

From Data to Graphics
= Programming with R =

Error Recovery
= Graphics With R =

Scatter Plot
ggplot2: Split Data Range into Multiple Chart Series  > rdata <- read.table(textConnection(" 1 City X Y 2 Atlanta 4 15 3 Atlanta 5 18 4 Boston 6 16 5 Boston 6 16 6 Boston 7 12 7 Boston 11 11 8 Chicago 10 13 9 Chicago 13 10 10 Chicago 15 8 11 Detroit 10 9 12 Detroit 15 5 13 Detroit 13 3 14 Detroit 14 6 "), header=TRUE) > closeAllConnections()

> library(ggplot2) > p <- ggplot(rdata, aes(x = X, y = Y, colour = City, shape = City, label = City))

Draw Default scatterplot:

> p1 <- p + geom_point() + xlab(NULL) + ylab(NULL)

Add labels & remove legend:

> p2 <- p1 + geom_text(aes(hjust = -0.1, vjust = 0.5)) + opts(legend.position = "none")

Define max axis limits:

> maxl <- max(rdata$X, rdata$Y) Set x-axis and y-axis min&max limits:

> p3 <- p2 + scale_x_continuous(limits = c(0, maxl)) + scale_y_continuous(limits = c(0, maxl))

a few more formatting adjustments, and we will have a chart ready to be used.

> formatted <- p3 + scale_colour_brewer(palette = "Set1") + opts(panel.background = theme_rect(colour = "grey")) + opts(panel.grid.minor = theme_line(colour = NA)) + opts(panel.grid.major = theme_line(colour = NA))

jpeg('myplot.jpg') formatted dev.off() library(ggplot2) day <- 1:100 dat <- data.frame( day=day,  x = day+(1+rnorm(100, 0, 10)),  y = 5 + day+(1+rnorm(100, 0, 10)),  when = cut(day, 5) ) ggplot(dat, aes(x=x, y=y, colour=when)) + geom_point() ggplot(dat, aes(x=x, y=y, colour=day)) + geom_point() + scale_colour_gradient(low="pink", high="red")

Save Plot
 jpeg('myplot.jpg')
 * 1) call a driver

x <- c(1,3,6,9,12) y <- c(1.5,2,7,8,15) plot(x,y, xlab="x axis", ylab="y axis", main="my plot", ylim=c(0,20), xlim=c(0,20), pch=15, col="blue") x2 <- c(0.5, 3, 5, 8, 12) y2 <- c(0.8, 1, 2, 4, 6) points(x2, y2, pch=16, col="green") myline.fit <- lm(y ~ x) abline(myline.fit)
 * 1) plot

dev.off() Graphic Drivers : Format		Driver		Notes JPG		jpeg		Can be used anywhere, but doesn't resize PNG		png		Can be used anywhere, but doesn't resize PDF		pdf		Best choice with pdflatex; easily resizable Postscript	postscript	Best choice with latex and Open Office; easily resizable From Shell: 
 * 1) turn off the driver
#!/bin/sh


#$ -cwd
#$ -j y

echo -e "

jpeg('myplot.jpg')
 * 1) call a driver

x <- c(1,3,6,9,12) y <- c(1.5,2,7,8,15) plot(x,y, xlab=\"x axis\", ylab=\"y axis\", main=\"my plot\", ylim=c(0,20), xlim=c(0,20), pch=15, col=\"blue\") x2 <- c(0.5, 3, 5, 8, 12) y2 <- c(0.8, 1, 2, 4, 6) points(x2, y2, pch=16, col=\"green\") myline.fit <- lm(y ~ x) abline(myline.fit)
 * 1) plot

dev.off()
 * 1) turn off the driver

" > run.plot.R

R --no-save < run.plot.R > run.plot.tmp

Probability Distributions
= Basic Statistics with R =

Testing Independence
= More Statistics with R =

Time series
= Sweave and Survival Analysis = = Longitudinal Models = = Numerical Methods = = Dataframes = = Miscellaneous =

wget http://cran.r-project.org/src/contrib/bitops_1.0-4.1.tar.gz R CMD INSTALL bitops_1.0-4.1.tar.gz
 * 1) name space required

cat > /usr/lib64/R/library/bitops/NAMESPACE
 * 1) Default NAMESPACE created by R
 * 2) Remove the previous line if you edit this file

exportPattern(".")
 * 1) Export all names

ctrl D

wget http://cran.r-project.org/src/contrib/gtools_2.7.0.tar.gz R CMD INSTALL gtools_2.7.0.tar.gz

wget http://cran.r-project.org/src/contrib/Archive/gdata/gdata_2.8.2.tar.gz R CMD INSTALL gdata_2.8.2.tar.gz

install.packages("caTools", dependencies = TRUE)

install.packages("gplots", dependencies = TRUE) install.packages("ROCR", dependencies = TRUE) install.packages("pROC", dependencies = TRUE) library(ROCR) library(pROC)

= Applications 	 =