Wednesday, October 27, 2010

R: Biplot (generic)

# I can't get the variables into a matrix directly - no idea why; if I knew, I'd fix it. So bear with the two-step detour below. (In hindsight, cbind(infile$dep.var1, infile$dep.var2) should also get you a matrix in one step.)

# put variables of interest into a dataframe:
neo <- data.frame(infile$dep.var1,infile$dep.var2)
agt.smith <- data.frame(infile$indep.var1,infile$indep.var2)

# now put it in a matrix:
x <- as.matrix(neo)
y <- as.matrix(agt.smith)

# and here's the plot:
biplot(x,y) # rows of x are drawn as points, rows of y as arrows on a secondary set of axes

# for more details: ?biplot
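
# if you just want to see a biplot on real data straight away, the PCA route works out of the box on built-in data (this is standard prcomp usage, not specific to the two-matrix call above):
pca <- prcomp(USArrests, scale.=TRUE)
biplot(pca) # observations as points, variable loadings as arrows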

# /* END OF BIPLOTS */

Tuesday, October 26, 2010

R: Logistic Regression

# practically reposted from Quick R

# I haven't had the chance to test this, but I really don't see why it shouldn't work... if you try it and it refuses to cooperate, send me an email.

# Data input: generic

fit <- glm(binary.outcome.var~indep.var1+indep.var2+indep.var3+...+indep.var,data=infile,family=binomial()) # the indep.var(s) can be continuous or categorical (factor) variables; this assumes the data frame your data lives in is called infile.
summary(fit) # display results
confint(fit) # 95% CI for the coefficients
exp(coef(fit)) # exponentiated coefficients
exp(confint(fit)) # 95% CI for exponentiated coefficients
predict(fit, type="response") # predicted values
residuals(fit, type="deviance") # residuals
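
# a runnable sketch on the built-in mtcars data (am is already coded 0/1; my choice of predictors is arbitrary):
fit <- glm(am~mpg+wt,data=mtcars,family=binomial())
summary(fit)
exp(coef(fit)) # odds ratios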

# /* END OF LOGISTIC REGRESSION*/

Monday, October 25, 2010

R: Principal coordinates metric and non-metric scaling

# lib(s): labdsv

# Data input: generic

# - Principal coordinates metric scaling - (aka Classical (Metric) Multidimensional Scaling )

# This produces the scaled coordinates (two per observation by default):

the.dist <- dist(infile)
the.cmd <- cmdscale(the.dist)

# How about a plot?
x<- the.cmd[,1]
y<- the.cmd[,2]
plot(x,y) # the plot places each row of infile as a point in 2-D, positioned so that the inter-point distances approximate the original distances in the.dist as closely as possible
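
# a self-contained run on the built-in eurodist road distances makes this concrete (eurodist is already a dist object, so no dist() call is needed):
ed.cmd <- cmdscale(eurodist)
plot(ed.cmd[,1],ed.cmd[,2],type="n")
text(ed.cmd[,1],ed.cmd[,2],rownames(ed.cmd)) # the cities land in a map-like arrangement (possibly rotated or reflected)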

# - End of Principal coordinates metric scaling - (aka Classical (Metric) Multidimensional Scaling )

# - Principal coordinates non-metric scaling - (aka Nonmetric Multidimensional Scaling)

library(labdsv)
non.met <- nmds(the.dist) # note: "non-met" is not a legal R name (the hyphen parses as subtraction), hence the dot

# how about another plot?
plot(non.met)
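
# and a runnable sketch on the built-in eurodist distances, assuming labdsv is installed (labdsv's nmds is a wrapper around MASS::isoMDS):
library(labdsv)
ed.nmds <- nmds(eurodist,k=2)
plot(ed.nmds)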

# - End of Principal coordinates non-metric scaling - (aka Nonmetric Multidimensional Scaling)

# what actually happens in either of the above: metric scaling eigendecomposes the (doubly centred) distance matrix - essentially PCA carried out on distances - while non-metric scaling iteratively shuffles the points to minimize a "stress" statistic, preserving only the rank order of the original distances.

# /* END OF PRINCIPAL COORDINATES METRIC/NON-METRIC SCALING */

R: SMA (standard major axis) Regression

# !!! One should first test the significance of the correlation coefficient (r)
# to determine whether the hypothesis of a relationship is supported. No SMA
# regression equation should be computed when this condition is not met. This
# remains a less-than-ideal solution, since SMA slope estimates cannot be
# tested for significance. Confidence intervals should also be used with
# caution: simulations have shown that, as the slope departs from +/-1, the
# SMA slope estimate is increasingly biased and the confidence interval
# includes the true value less and less often. Even when the slope is near
# +/-1, the confidence interval is too narrow if n is very small or if the
# correlation is weak. !!!

# Data input: generic

# To test the significance of the correlation coefficient (r):
cor.test(infile$var1,infile$var2,method="spearman") # this tests the rank (Spearman) correlation; drop the method argument for Pearson's r

# If the result of the above is significant (i.e., your variables are significantly correlated), then the following function may be applied.
# ref for regression method: http://www.palass.org/modules.php?name=palaeo_math&page=7

sma.reg <- function(X,Y) # where X is the independent variable and Y is the dependent variable.
{
# calculate the magnitude of the regression slope
# (note: the SMA slope is sd(Y)/sd(X), not the other way around):
std.devX <- sd(X)
std.devY <- sd(Y)
reg.slope <- std.devY/std.devX

# the slope takes the sign of the covariance between X and Y:
Xmc <- X-mean(X)
Ymc <- Y-mean(Y)
if(sum(Xmc*Ymc)<0) reg.slope <- -reg.slope

# calculate the y-intercept - the SMA line passes through the centroid
# (mean(X),mean(Y)), so the signed slope has to be used here:
y.intercept <- mean(Y)-(reg.slope*mean(X))

# plot the data and overlay the fitted line:
plot(X,Y)
abline(y.intercept,reg.slope)
paste("Y","=",reg.slope,"*","X","+",y.intercept)
}
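
# a quick sanity check on built-in data (my choice of variables is arbitrary):
sma.reg(trees$Girth,trees$Volume)
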
# /* END OF SMA REGRESSION */

Thursday, October 21, 2010

R: Quadratic Discriminant Analysis

# Data input: generic

# lib(s): MASS, klaR

# Quadratic Discriminant Analysis with resubstitution prediction and equal prior probabilities.

library(MASS)
fit <- qda(dep.cat.var ~ indep.var1 + indep.var2 + indep.var3 + ... + indep.var, data=na.omit(infile),prior=c(1,1,1,...,1)/X) # where the number of 1's in prior=c(1,1,1,...,1) equals the number of groups in dep.cat.var, and X is that same number of groups (an integer).
# I would imagine all the independent variables here are continuous, but I could be wrong. :\

# an exploratory graph
library(klaR)
partimat(dep.cat.var ~ indep.var1 + indep.var2 + indep.var3 + ... + indep.var,data=infile,method="qda")
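
# here's the whole thing run end-to-end on the built-in iris data (three species, hence three 1's in the prior; the dataset and variables are my choice):
library(MASS)
fit <- qda(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data=na.omit(iris), prior=c(1,1,1)/3)
fit
library(klaR)
partimat(Species ~ Sepal.Length + Sepal.Width, data=iris, method="qda") # two predictors keeps the panel count manageable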

# /* END OF QUADRATIC DISCRIMINANT ANALYSIS */

Monday, October 18, 2010

R: Linear Discriminant Analysis

# lib(s): MASS, klaR

# Data input: generic

# Before using this script:
library(MASS)
library(klaR)

# - Using Jackknifed Prediction - jackknifed (leave-one-out) prediction classifies each observation using a rule fitted to all of the OTHER observations, which gives a more honest error rate.

fit <- lda(dep.cat.var ~ indep.var1 + indep.var2 + indep.var3 + ... + indep.var, data=infile, na.action="na.omit") # this leaves out NA values. Adding CV=TRUE "doesn't work" with the rest of the script because lda() then returns a plain list of jackknifed class assignments and posteriors instead of an lda object, so plot() has no method for it. It IS what we want for the accuracy check below, though.
fit # show results

# visualize the results with a graph.
plot(fit)

# another visual - not beta tested... if it complains that figure margins are too large, the plotting device is simply too small for the requested layout; enlarge the window and rerun.
plot(fit, dimen=1, type="both") # fit from lda

# Assess the accuracy of the prediction - this is where CV=TRUE earns its keep:
# the jackknifed fit carries a $class component holding the cross-validated assignments.
fit.cv <- lda(dep.cat.var ~ indep.var1 + indep.var2 + indep.var3 + ... + indep.var, data=infile, na.action="na.omit", CV=TRUE)
# percent correct for each group in your dependent categorical variable
ct <- table(infile$dep.cat.var, fit.cv$class)
diag(prop.table(ct, 1))
# total percent correct
sum(diag(prop.table(ct)))

# an exploratory graph
partimat(dep.cat.var ~ indep.var1 + indep.var2 + indep.var3 + ... + indep.var, data=infile, method="lda")
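
# a concrete runnable version on the built-in iris data (my choice of dataset):
library(MASS)
fit <- lda(Species ~ ., data=iris)
plot(fit)
fit.cv <- lda(Species ~ ., data=iris, CV=TRUE)
ct <- table(iris$Species, fit.cv$class)
diag(prop.table(ct, 1)) # per-species accuracy
sum(diag(prop.table(ct))) # overall accuracy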

# /* END OF LINEAR DISCRIMINANT ANALYSIS */

Tuesday, October 12, 2010

R: Multiple Correspondence Analysis

# lib(s): ca, rgl

# Edit the generic placeholder names (infile, cat.var1, etc.) to match your own data.

# Data input: generic

cat.infile <- data.frame(infile$cat.var1,infile$cat.var2,infile$cat.var3,...,infile$cat.var)

library(ca) # rgl will load automatically

# syntax is: mjca(obj, nd = 2, lambda = "adjusted", supcol = NA, subsetcol = NA, ps = "", maxit = 50, epsilon = 0.0001)
# the approach to MCA is selected via lambda, where:
# lambda="indicator" - analysis based on simple correspondence analysis of the indicator matrix
# lambda="Burt" - analysis based on an eigen-decomposition of the Burt matrix
# lambda="adjusted" - analysis based on the Burt matrix with an adjustment of inerplot(Mtias
# lambda="JCA" - Joint Correspondence Analysis

MCA <- mjca(cat.infile) # where cat.infile is a data frame of categorical variables stored as factors - mjca() builds the indicator/Burt matrix itself, so no dummy coding is needed on your end
summary(MCA)
# the results are visualized with the plot method:
plot(MCA)
plot(MCA, mass = TRUE, contrib = "absolute", map ="rowgreen", arrows = c(FALSE, TRUE)) # asymmetric map
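
# for a runnable end-to-end check: the ca package ships with the wg93 survey data, whose first four columns are categorical items (at least in the versions I've seen):
library(ca)
data("wg93")
MCA <- mjca(wg93[,1:4])
summary(MCA)
plot(MCA)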

# /* END OF MULTIPLE CORRESPONDENCE ANALYSIS */

R: Canonical Discriminant Analysis

# Data input: generic

# lib(s): candisc, heplots # !!! heplots is a necessary dependency, and wasn't available on the SG host, so I used USA-CA(1) - think it's UC Berkeley !!!

# Edit the generic placeholder names (infile, dep.var1, indep.var1, etc.) to match your own data.

# this requires the input of some kind of precomputed multivariate model - usually a MANOVA.
# Generalized canonical discriminant analysis may also be performed for one term in a multivariate linear model (i.e., an object of class mlm) - that is, the output of lm() fitted with a multivariate (cbind-ed) response.

library(candisc)

# MANOVA:
Y <- cbind(infile$dep.var1,infile$dep.var2,...,infile$dep.var) # dependent variables have to be cbind-ed together.
output <- manova(Y ~ indep.var1*indep.var2*indep.var3*...*indep.var, data=infile) # data= is needed so the independent variables can be found

# Canonical Discriminant Analysis (in its simplest form)
candisc(output)

# Here's the mlm:
Y <- cbind(infile$dep.var1,infile$dep.var2,...,infile$dep.var) # dependent variables have to be cbind-ed together.
output <- lm(Y ~ indep.var1*indep.var2*indep.var3*...*indep.var, data=infile)

# and here's the generalized canonical discriminant analysis (in its simplest form)
candisc(output,term="indep.var") # term must be the quoted name of one term appearing in the model

# a plot for thought:
candisc.result <- candisc(output)
plot(candisc.result)
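
# a runnable sketch on the built-in iris data, assuming candisc (and its heplots dependency) is installed; the cbind goes straight into the formula here, which is a little more robust:
library(candisc)
mod <- lm(cbind(Sepal.Length,Sepal.Width,Petal.Length,Petal.Width) ~ Species, data=iris)
cd <- candisc(mod, term="Species")
plot(cd)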

# /* END OF CANONICAL DISCRIMINANT ANALYSIS */