Thursday, September 23, 2010

R: Cluster Analysis

# lib(s): pvclust, mclust, cluster, fpc, pls

# Red indicates stuff you have to edit.

# Essentially reposted from: http://www.statmethods.net/advstats/cluster.html

# Data input: generic

# infile (as used in the scripts below) should contain all the variables you want to analyze, and nothing else.

# remove/estimate missing data and/or rescale variables for compatibility
infile <- na.omit(infile) # listwise deletion of missing
infile <- scale(infile) # standardize variables

# - K-MEANS - alpha tested ok on 23Sept2010

# Determine number of clusters
wss <- (nrow(infile)-1)*sum(apply(infile,2,var))
for (i in 2:15) wss[i] <- sum(kmeans(infile,
centers=i)$withinss)
plot(1:15, wss, type="b", xlab="Number of Clusters",
ylab="Within groups sum of squares")

# K-Means Cluster Analysis
fit <- kmeans(infile, 5) # 5 cluster solution
# get cluster means
aggregate(infile,by=list(fit$cluster),FUN=mean)
# append cluster assignment
infile <- data.frame(infile, fit$cluster)

# - END OF K-MEANS -

# - Hierarchical Agglomerative - alpha tested ok on 23Sept2010

# Ward Hierarchical Clustering

d <- dist(infile, method = "euclidean") # distance matrix
fit <- hclust(d, method="ward") # newer versions of R call this method "ward.D"
plot(fit) # display dendrogram
groups <- cutree(fit, k=5) # cut tree into 5 clusters
# draw dendrogram with red borders around the 5 clusters
rect.hclust(fit, k=5, border="red") 

#  - Hierarchical Agglomerative - FULL DEMO

# If you can't do the data handling in R, do it in Excel or something similar.
# Your data should look like this (I ran the demo on this data):

[Image: the input table from AseanDat.csv - a Country column followed by numeric indicator variables.]

# If your data's set up right, here's the minimal script you need:
infile <- read.csv("E:/AseanDat.csv",header=T)
library(pls)
neo <- as.matrix(infile[,-1])
neo.std <- stdize(neo)
neo.dist <- dist(neo.std, method = "euclidean") # distance matrix
fit <- hclust(neo.dist, method="complete")
plot(fit,labels=infile$Country,main="ASEAN Economies") # display dendrogram


The dendrogram will look like this:

[Image: dendrogram of the ASEAN economies, labelled by country.]

Note how it relates back to the input data:

[Image: the dendrogram shown alongside the input table.]

# You can draw boxes around the clusters with R - if you want R to show x number of groups:

# draw dendrogram with red borders around the major clusters

rect.hclust(fit, k=4, border="red")  # in this case, the 4 main groups are highlighted.

[Image: the same dendrogram with red rectangles drawn around the 4 main groups.]

# ... sometimes R doesn't do a very good job of placing the boxes. If that's the case (like in this example), just copy the output and put the boxes in yourself using whatever graphics editing software you want.

# Ward Hierarchical Clustering with Bootstrapped p values

library(pvclust)
# NB: pvclust clusters the columns of the data; use t(infile) if you want to cluster rows.
fit <- pvclust(infile, method.hclust="ward",
method.dist="euclidean")
plot(fit) # dendrogram with p values
# add rectangles around groups highly supported by the data
pvrect(fit, alpha=.95)

# - END OF HIERARCHICAL AGGLOMERATIVE -

# - MODEL-BASED - alpha tested ok on 23Sept2010
# Model Based Clustering

library(mclust)
fit <- Mclust(infile)
plot(fit, infile) # plot results (in newer versions of mclust, plot(fit) alone)
print(fit) # display the best model

# - END OF MODEL-BASED -

# - Possible Means of Visualising Results - alpha tested ok on 23Sept2010

# K-Means Clustering with 5 clusters
fit <- kmeans(infile, 5)

# Cluster Plot against 1st 2 principal components
# vary parameters for most readable graph
library(cluster)
clusplot(infile, fit$cluster, color=TRUE, shade=TRUE,
labels=2, lines=0)

# Centroid Plot against 1st 2 discriminant functions
library(fpc)
plotcluster(infile, fit$cluster)

# - END OF RESULT VISUALISATION -

# - SOLUTION VALIDATION -

# comparing 2 cluster solutions - alpha tested ok on 23Sept2010
library(fpc)
cluster.stats(d, fit1$cluster, fit2$cluster)
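# A minimal sketch of where fit1 and fit2 might come from (comparing 4- and 5-cluster k-means solutions on infile):
d <- dist(infile) # euclidean distance matrix
fit1 <- kmeans(infile, 4)
fit2 <- kmeans(infile, 5)
cluster.stats(d, fit1$cluster, fit2$cluster) # includes a corrected Rand index for the agreement between the two solutions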

# - END OF SOLUTION VALIDATION -

# /* END OF CLUSTER ANALYSIS */

Tuesday, September 21, 2010

R: Generic Data Input / Installing Libraries

# Usually, the easiest way to input data (from MS Excel or most other GUI spreadsheets) is to save your data as a text file (usually tab-delimited or comma-separated values), then have R read it in.

# Red indicates stuff you have to change.

# For data in a TAB-DELIMITED form:
infile <- read.delim("C:/...",header=TRUE) # where "C:/..." is the path of the file you want to load.

# For data in a comma-separated form:
infile <- read.csv("C:/...",header=TRUE) # where "C:/..." is the path of the file you want to load.


# To pick out your variables of interest and put them in a data frame (not always necessary):
VarsOfInt <- data.frame(infile$Var1,infile$Var2,infile$Var3,...,infile$Var)


# NOTE: Input files do not have to be called infile; call them whatever you want. If you do, just change every subsequent infile to match the name you used.
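# For example, using a name of your own choosing:
mydat <- read.csv("C:/...",header=TRUE)
summary(mydat) # from here on, use mydat wherever these scripts say infile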

# To install libraries:

install.packages("name_of_library") # OR

install.packages("name_of_library",dependencies=TRUE) # if you want all the dependencies installed as well - this might prevent future problems but might also take all day.

Monday, September 20, 2010

R: Correspondence Analysis

# In general, I didn't code this myself.
# Compiled from various sources.

# lib(s): ca, rgl

# Red indicates stuff you have to edit.

# Before using this script:
library(ca) # rgl will load automatically

# The easiest way to do this (infile must already be a table or matrix of counts):
ca(infile)

mytable <- with(infile, table(cat.var1,cat.var2)) # create a 2-way table
prop.table(mytable, 1) # row percentages
prop.table(mytable, 2) # column percentages
fit <- ca(mytable)
# print(fit) # basic results. Unnecessary in view of the next line.
summary(fit) # extended results. Quantities in the output are multiplied by 1000 (i.e. expressed per thousand)
plot(fit) # symmetric map
plot(fit, mass = TRUE, contrib = "absolute", map ="rowgreen", arrows = c(FALSE, TRUE)) # asymmetric map
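# A runnable sketch using the smoke dataset bundled with the ca package:
data(smoke)
fit <- ca(smoke)
summary(fit)
plot(fit) # symmetric map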

# /* END OF CORRESPONDENCE ANALYSIS */

R: Log-Linear Modelling

# In general, I didn't code this myself.
# Compiled from various sources.

# Data input: generic

# Red indicates stuff you have to edit.

# - Start of Log-linear Modelling Script 1 -

# lib(s): stats

# before using this script:
library(stats)

mytab <- table(Cat.Var1,Cat.Var2)# where the categories of Cat.Var1 will be the rows and Cat.Var2 will be the columns.
loglin(mytab,margin=list(1,2)) # fits the model of independence between the two variables. The margin argument is explained below, with a sketch after the description.

# description of margin: a list of vectors with the marginal totals to be fit. (Hierarchical) log-linear models can be specified in terms of these marginal totals which give the ‘maximal’ factor subsets contained in the model. For example, in a three-factor model, list(c(1, 2), c(1, 3)) specifies a model which contains parameters for the grand mean, each factor, and the 1-2 and 1-3 interactions, respectively (but no 2-3 or 1-2-3 interaction), i.e., a model where factors 2 and 3 are independent conditional on factor 1 (sometimes represented as ‘[12][13]’).
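# A short runnable sketch of that three-factor example, using the built-in HairEyeColor table (factor 1 = Hair, 2 = Eye, 3 = Sex):
fit.cond <- loglin(HairEyeColor, margin = list(c(1, 2), c(1, 3))) # model [12][13]: Eye and Sex independent given Hair
fit.cond$lrt # likelihood-ratio statistic
fit.cond$df # residual degrees of freedom
1 - pchisq(fit.cond$lrt, fit.cond$df) # p-value for the model's fit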

# - End of Log-linear Modelling Script 1 -

# - Start of Log-linear Modelling Script 2 - alpha tested ok on 14 Sept 2010

# lib(s): MASS

# before using this script:
library(MASS)

loglm(~ Cat.Var1 + Cat.Var2, xtabs(~ Cat.Var1 + Cat.Var2, infile))

# - End of Log-linear Modelling Script 2 -

# - Start of Log-linear Modelling Script 3 - alpha tested ok on 14 Sept 2010.
# (This is a somewhat DIY log-linear modelling script. All it requires as input is a table like the one mentioned at the start of the log-linear modelling section.)

# Log-Linear analysis using a Generalized Linear Model (http://users.ox.ac.uk/~polf0050/ISS%20Lecture%206.pdf(?))

mytab <- table(infile$CatVar1,infile$CatVar2)

lg.lnr <- function(mytab)
{
r.str <- rep(rownames(mytab),times=length(colnames(mytab)))
c.str <- rep(colnames(mytab),each=length(rownames(mytab)))
freq <- as.vector(mytab) # flatten the table of counts, column by column
dat.frm <- data.frame(r.str,c.str,freq)
result <- glm(freq~r.str+c.str,family="poisson",data=dat.frm) # independence model as a Poisson GLM
r1 <- anova(result,test="Chisq") # analysis-of-deviance table (this dispatches to anova.glm; no need to call it directly)
plot(result)
r2 <- summary(result)
to.print <- list(r1,r2)
to.print
}

# - End of Log-linear Modelling Script 3 -

# /* END OF LOG-LINEAR MODELLING */

R: MANOVA

# (mostly) taken from Quick R website.

# Data input: generic

# Red indicates stuff you have to edit.

Y <- cbind(infile$dep.var1,infile$dep.var2,...,infile$dep.var) # dependent variables have to be cbind-ed together.
fit <- manova(Y ~ indep.var1*indep.var2*indep.var3*...*indep.var)
summary(fit, test="Pillai")
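# A runnable sketch using the built-in iris data (two dependent variables, one grouping factor):
fit <- manova(cbind(Sepal.Length, Petal.Length) ~ Species, data = iris)
summary(fit, test="Pillai")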

R: Canonical Correspondence Analysis

# Classical Canonical Correspondence Analysis - alpha tested ok on 23 Aug 2010 -

# In general, I didn't code this myself.
# Compiled from various sources.

# lib(s): CCA, mixOmics(?)

# Before using this script:
library(CCA)

# Data input: generic

# Red indicates stuff you have to change.

# - CANONICAL CORRESPONDENCE ANALYSIS 1 - (note: the cc()/rcc() functions in the CCA package actually perform canonical *correlation* analysis)

X <- as.matrix(dat.frm1) # declare the dataframe holding the first set of variables as a matrix
Y <- as.matrix(dat.frm2) # declare the dataframe holding the second set of variables as a matrix

# Tested on nutrimouse data (from CCA library).

# Canonical correlation analysis assumes that, for both datasets, the number of cases exceeds the number of variables. If that is not the case, consider sampling a specific number of variables; see the quick check below.
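# A quick guard for that assumption (stops with an error if either matrix is wider than it is tall):
stopifnot(nrow(X) > ncol(X), nrow(Y) > ncol(Y))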
correl <- matcor(X, Y)
img.matcor(correl,type=2)
res.cc<-cc(X, Y)
barplot(res.cc$cor,xlab = "Dimension",ylab = "Canonical correlations", names.arg=1:10,ylim=c(0,1)) # names.arg must match the number of canonical correlations
plt.cc(res.cc)

# This assumes that the number of cases exceeds the number of variables for both datasets in question.
X <- cbind(infile$var1,infile$var2,infile$var3,...,infile$var) # use cbind (not as.matrix) to build a matrix from several columns
Y <- cbind(infile$var1,infile$var2,infile$var3,...,infile$var)

# Regularized CCA - "regularized" means ridge penalties (lambda1, lambda2) are added to the X and Y covariance matrices, so the method still works when variables outnumber cases - alpha tested ok on 23 Aug 2010 -

# lib(s): mixOmics, CCA

# You'll probably need the mixOmics package as well.
# mixOmics requires the GTK runtime environment.
# Windows users: search download.com or get it from here: http://download.cnet.com/3001-2192_4-10280276.html?spi=cd9c2c926c6f63eceaf7d4e103ceb2a9
# Ubuntu (and probably other Debian variants): open the terminal and type sudo apt-get install libX11-dev freeglut3 freeglut3-dev, then install as usual.

res.regul <- estim.regul(X, Y, plt = TRUE, grid1 = seq(0.0001, 0.2, l=51), grid2 = seq(0, 0.2, l=51)) # !!! this command will take the better part of 90 mins to run !!!
contour(res.regul$grid1, res.regul$grid2, res.regul$mat, add = TRUE, levels = c(0,0.5,0.7), col = "blue")
contour(res.regul$grid1, res.regul$grid2, res.regul$mat, add = TRUE, levels = c(0.8,0.85,0.88), col = "darkgreen")

res.rcc <- rcc(X, Y, 0.008096, 0.064) # rcc(X,Y,lambda1,lambda2)
barplot(res.rcc$cor, xlab = "Dimension", ylab = "Canonical correlations", names.arg = 1:21, ylim = c(0,1))

# variable and individual plots of the regularized solution (the labels below assume the nutrimouse data)
plt.cc(res.rcc, var.label = TRUE,ind.names = paste(nutrimouse$genotype, nutrimouse$diet, sep = " "))

# - END OF CANONICAL CORRESPONDENCE ANALYSIS 1 -


# - CANONICAL CORRESPONDENCE ANALYSIS 2 - using the cca() function from package ade4

VarSet1 <- data.frame(infile$Var1,infile$Var2,infile$Var3,...,infile$Var)
VarSet2 <- data.frame(infile$Var1,infile$Var2,infile$Var3,...,infile$Var)

library(ade4)
cca(VarSet1,VarSet2) # no negative values allowed in VarSet1 and VarSet2

# - END OF CANONICAL CORRESPONDENCE ANALYSIS 2 -

# /* END OF CANONICAL CORRESPONDENCE ANALYSIS */

R: Canonical Correlation

I'm working on this.

R: Multiple Regression

# In general, I didn't code this myself.

# lib(s): MASS
# Note to self: Multiple regression is covered in Linear Models I of LSE's Computational Statistics course by Penzer.

# Red indicates stuff you have to edit.

# Before using this script:
library(MASS)

# Data input: generic

# alpha tested ok on 3 Aug 2010
# For data frame 'infile':

MultiReg1 <- lm(dependent.var~independent.var1+independent.var2+independent.var3+...+independent.var,data=infile) # beta tested ok against result from STATA 10.
summary(MultiReg1)
anova(MultiReg1) # (sequential) anova table, in the order in which the variables were specified. Changing the order of the independent variables will change the anova results; see the sketch after this block.
# par(mfrow = c(2,2))# Uncomment if you want all 4 plots in one window
plot(MultiReg1)
# par(mfrow = c(1,1)) # returns plot window to single plot mode.
boxcox(MultiReg1)
# To add variables into the model:
addterm(MultiReg1, ~.+Var1+Var2+Var3+...+Var, test="F") # adds a single term to those listed and displays the corresponding F-stat. ~. denotes the existing model.
# To remove variables from the model:
MultiReg2 <- update(MultiReg1,~(.-Var1),data=infile)
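# To see that order-dependence concretely (a quick sketch using the built-in mtcars data):
anova(lm(mpg ~ wt + hp, data=mtcars)) # wt entered first
anova(lm(mpg ~ hp + wt, data=mtcars)) # same model, same fit, different sequential sums of squares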

# Other useful functions # - alpha tested ok on 3 Aug 2010 -
coefficients(MultiReg1) # model coefficients
confint(MultiReg1, level=0.95) # CIs for model parameters
fitted(MultiReg1) # predicted values
residuals(MultiReg1) # residuals
vcov(MultiReg1) # covariance matrix for model parameters
influence(MultiReg1) # regression diagnostics
# Backwards Elimination # alpha tested ok on 2 Aug 2010
MultiReg <- lm(DepVar~indep.var1+indep.var2+indep.var3+...+IndepVar,data=infile)
dropterm(MultiReg, test="F")
MultiReg2 <- update(MultiReg, ~.-IndepVar)
dropterm(MultiReg2,test="F") # to drop more variables, repeat dropterm(...) and NewReg <- update(OldReg, ~.-VarToDrop) as necessary. A worked sketch follows.
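# A worked sketch of the elimination loop, using the built-in mtcars data (dropterm comes from MASS, loaded above):
fit <- lm(mpg ~ wt + hp + drat, data = mtcars)
dropterm(fit, test="F") # inspect the F-statistics; drop the weakest term
fit2 <- update(fit, ~.-drat) # assuming drat contributes least
dropterm(fit2, test="F") # repeat until every remaining term is significant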

R: Principal Component Analysis

# In general, I didn't code this myself. Compiled from various sources.

# lib(s): ade4, xtable


# Red indicates stuff you have to edit.

# - START OF PCA SCRIPT 1 - [alpha tested ok on 26 Jul 2010. Not sure if it really does what it's supposed to do.]

# Data input: generic

RmodePCA1 <- function(VarsOfInt) # no categorical variables allowed in VarsOfInt
{
infile.pca <- prcomp(VarsOfInt,retx=TRUE,center=TRUE,scale.=TRUE)
#sd <- infile.pca$sdev
loadings <- infile.pca$rotation
rownames(loadings) <- colnames(VarsOfInt)
scores <- infile.pca$x
plot(scores[,1],scores[,2],ylab="PCA 2", xlab="PCA 1",type="n",xlim=c(min(scores[,1:2]),max(scores[,1:2])),ylim=c(min(scores[,1:2]),max(scores[,1:2])))
arrows(0,0,loadings[,1]*10,loadings[,2]*10,length=0.1,angle=20,col="red")
text(loadings[,1]*10*1.2,loadings[,2]*10*1.2,rownames(loadings),col="red",cex=0.7)
summary(infile.pca)
}
# - END OF PCA SCRIPT 1 -

# - START OF PCA SCRIPT 2 - [alpha tested ok on 27Jul2010.]

# Data input: generic

RmodePCA2 <- function(VarOfInt) # no categorical variables allowed in VarOfInt
{
R <- cor(VarOfInt)
eig.vec <- eigen(R)
loadings2 <- eig.vec$vectors
#stddev <- sqrt(eig.vec$values)
rownames(loadings2) <- colnames(VarOfInt)
stdize <- function(x) {(x-mean(x))/sd(x)}
X <- apply(VarOfInt,MARGIN=2,FUN=stdize)
scores <- X%*%loadings2
plot(scores[,1],scores[,2],ylab="PCA 2", xlab="PCA 1",type="n",xlim=c(min(scores[,1:2]),max(scores[,1:2])),ylim=c(min(scores[,1:2]),max(scores[,1:2])))
arrows(0,0,loadings2[,1]*10,loadings2[,2]*10,length=0.1,angle=20,col="red")
text(loadings2[,1]*10*1.2,loadings2[,2]*10*1.2,rownames(loadings2),col="red",cex=0.7)
ldgs <- summary(loadings2)
scrs <- summary(scores)
display <- list(ldgs,scrs)
display
}
# - END OF PCA SCRIPT 2 -

# - START OF PCA SCRIPT 3 - !!! Major Script Editing Required !!!

# Data input: generic
# Red indicates stuff you _absolutely have to edit_.

# Preliminary data manipulation for use with PCA script 3 [alpha tested ok 29 Jul 2010]

names(infile)
id.code <- paste(infile$Var1,infile$Var2,...,infile$Var) # create unique identifier for both sets of data.
infile$code <- id.code
row.names(infile) <- infile$code
infile$code <- NULL
# Var <- as.factor(infile$Var) # all categorical variables should be recognized as a factor. To check: class(Var).
fctrs <- data.frame(infile$Monsoon,infile$Location,year,id.code) # put all the factors into one dataframe, along with the unique identifier.
row.names(fctrs) <- fctrs$id.code
# Now, get rid of all the factors (i.e. categorical variables) in the data for PCA. Both dataframes already have the same identifiers.
infile$Var1 <- NULL
infile$Var2 <- NULL
infile$Var3 <- NULL # repeat as necessary.

# PCA script 3 assumes the following:
# 1) There are 2 dataframes.
# 2) Categorical data (factors) and non-categorical data are in separate dataframes.
# 3) Identifiers for both dataframes are identical.
# 4) All factors have the same number of cases.
# 5) Libraries needed for the following scripts are ade4 and xtable. Should probably install any associated dependencies at the same time.

# Before using these scripts:
library(ade4)
library(xtable)

# - START OF PCA SCRIPT 3.1 (Normed PCA with factors) - [alpha tested ok on 29 Jul 2010]
pca1 <- dudi.pca(infile, scannf=F, nf=3) # Assuming 3 retained axes (those whose eigenvalues exceed 1). Edit as necessary.
inertie <- cumsum(pca1$eig)/sum(pca1$eig)
barplot(pca1$eig)
# For the plots: various permutations of axes and factors.
par(mfrow = c(2, 2))
s.corcircle(pca1$co, xax = 1, yax = 2)
s.corcircle(pca1$co, xax = 1, yax = 3)
s.corcircle(pca1$co, xax = 2, yax = 3)
par(mfrow = c(3, 2))
s.class(pca1$li, fctrs$infile.fctr1, xax = 1, yax = 2, cellipse = 0)
s.class(pca1$li, fctrs$infile.fctr2, xax = 1, yax = 2, cellipse = 0)
s.class(pca1$li, fctrs$infile.fctr1, xax = 1, yax = 3, cellipse = 0)
s.class(pca1$li, fctrs$infile.fctr2, xax = 1, yax = 3, cellipse = 0)
s.class(pca1$li, fctrs$infile.fctr1, xax = 2, yax = 3, cellipse = 0)
s.class(pca1$li, fctrs$infile.fctr2, xax = 2, yax = 3, cellipse = 0) # repeat as necessary
# - END OF PCA SCRIPT 3.1 -

# - START OF PCA SCRIPT 3.2 -[alpha tested ok on 29 Jul 2010]
# - 1-way ANOVA for studying spatial/temporal effects -
options(show.signif.stars = F)
ressta <- anova(lm(infile[, 1] ~ fctrs$fctr1))
xtable(ressta)
graphnf <- function(x, gpe)
{
stripchart(x ~ gpe)
points(tapply(x, gpe, mean), 1:length(levels(gpe)), col = "red",
pch = 19, cex = 1.5)
abline(v = mean(x), lty = 2)
moyennes <- tapply(x, gpe, mean)
traitnf <- function(n) segments(moyennes[n], n, mean(x), n,
col = "blue", lwd = 2)
sapply(1:length(levels(gpe)), traitnf)
}
graphnf(infile[, 1], fctrs$fctr1)
probasta <- rep(0, N) # where N is the number of variables you have in infile.
for (i in 1:N)
{
ressta <- anova(lm(infile[, i] ~ fctrs$fctr1))
probasta[i] <- ressta[1, 5] # column 5 of the anova table holds the p-value, Pr(>F)
}
xtable(rbind(colnames(infile), round(probasta, dig = 4))) # xtable prints the table as LaTeX code by default; use print(..., type = "html") for HTML output.
intdat <- gl(x, y, ordered = TRUE) # gl(x,y,...) - _x_ levels with _y_ cases in each level.
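# e.g. gl(2, 3, ordered = TRUE) produces the ordered factor 1 1 1 2 2 2 (2 levels, 3 cases each).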
intplan <- cbind(as.numeric(fctrs$fctr1), intdat)
s.value(intplan, pca1$tab[, 1], sub = colnames(infile)[1],
possub = "topleft", csub = 2)
par(mfrow = c(4, 8))
for (i in 2:N) # where N is the number of variables in infile.
{
s.value(intplan, pca1$tab[, i], sub = colnames(infile)[i],
possub = "topleft", csub = 2)
}
# - END OF PCA SCRIPT 3.2 -

# - START OF PCA SCRIPT 3.3 - [alpha tested ok on 29 Jul 2010]
# - The Within PCA - [alpha tested ok on 29 Jul 2010]
sepmil <- split(pca1$tab, fctrs$fctr1)
names(sepmil)
meanspring <- apply(sepmil$spring, 2, mean)
round(pca1$tab[1, ] - meanspring, digits = 4)
wit1 <- within(pca1, fctrs$fctr1, scan = FALSE)
# names(wit1)
# [1] "tab" "cw" "lw" "eig" "rank" "nf" "c1" "li" "co" "l1"
# [11] "call" "ratio" "ls" "as" "tabw" "fac"
plot(wit1)
# - END OF PCA SCRIPT 3.3 -

# - START OF PCA SCRIPT 3.4 - [alpha tested ok on 29 Jul 2010]
# - The Between PCA -
bet1 <- between(pca1, fctrs$fctr1, scan = FALSE, nf = a) # where a is the number of axes to keep (e.g. those with eigenvalues > 1)
# SAMPLE OUTPUT
# names(bet1)
# [1] "tab" "cw" "lw" "eig" "rank" "nf" "l1" "co" "li" "c1"
# [11] "call" "ratio" "ls" "as"
bet1$tab # shows the means of all the variables by group
bet1$ratio # the "between-group inertia": the proportion of the total variance that lies between the groups
bet1$eig # number of eigenvalues = number of groups - 1
plot(bet1)
# - END OF PCA SCRIPT 3.4 -

# In conclusion:
# If y is the total inertia of the data X,
# a is the within-group inertia, and
# b is the between-group inertia,
# then y = a + b, so the two ratios sum to 1:
# wit1$ratio
# [1] 0.6277314
# bet1$ratio
# [1] 0.3722686
# wit1$ratio + bet1$ratio
# [1] 1
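# A quick check of that identity (assuming wit1 and bet1 from scripts 3.3 and 3.4):
all.equal(wit1$ratio + bet1$ratio, 1) # should return TRUE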

# - END OF PCA SCRIPT 3 -

# - BETA TESTED SCRIPT -

Rpca1 <- prcomp(infile,retx=TRUE,center=TRUE,scale.=TRUE) # !!! do NOT scale if you have variables that have the same value for all cases !!!
loadings <- Rpca1$rotation
scores <- Rpca1$x
rownames(loadings) <- colnames(infile)
bmp("H:/output/Plot.bmp") # outputs the following graph to a bitmap. Comment out if unnecessary.
plot(scores[,1],scores[,2],ylab="PCA 2", xlab="PCA 1",type="n",xlim=c((min(scores[,1:2])-1),(max(scores[,1:2])+1)),ylim=c((min(scores[,1:2])-1),(max(scores[,1:2])+1))) # defines the main parameters of the plot. With this line alone, no points, lines or eigenvectors are drawn - just the axes.
segments(0,0,loadings[,1]*10,loadings[,2]*10, col="red") # this will produce the line plots of the loadings.
text(loadings[,1]*10*1.2,loadings[,2]*10*1.2,rownames(loadings),col="red",cex=0.7) # label the radial lines produced.
points(scores[R1:R2,C1],scores[R1:R2,C2],pch=1,col="blue") # this adds point plots of the specified co-ordinates. R1 is a number specifying the first row where your data starts. R2 specifies the last row of data. C1 and C2 are both columns (usually corresponding to score column 1 and score column 2).
points(scores[R1:R2,C1],scores[R1:R2,C2],pch=2,col="green") # repeat these lines as necessary. You can change the plotting character (pch) and colour if you want.
# .
# .
# .
points(scores[R1:R2,C1],scores[R1:R2,C2],pch=10,col="black")
leg.txt <- c("Cat1","Cat2","Cat3","Cat4","Cat5","Cat6","Cat7","Cat8","Cat9","Cat10") # Comment out if unnecessary.
legend("topright",legend=leg.txt,col=c("blue","green","firebrick","gold","black"),pch=c(1,2,3,4,5,25,13,12,11,10),cex=0.75,title="Title") # This makes a box with the text in leg.txt appear in the top right. Comment out if unnecessary.
Photobucket
write.table(loadings,"H:/output/Output-ldgs.txt",sep="\t") # To output the resulting loadings. Comment out if unnecessary.
scrs <- as.data.frame(scores) # Scores are originally in a matrix. Change to a dataframe if you want to save them. Comment out if unnecessary.
write.table(scrs,"H:/output/Output-scrs.txt",sep="\t") # To output the resulting scores. Comment out if unnecessary.

# - END OF BETA TESTED SCRIPT -

# /* END OF PRINCIPAL COMPONENT ANALYSIS */

gnuplot: Histograms

Horizontal (sort of) histograms for gnuplot
I think the original script was coded by Tim Hoffman: http://groups.google.com/group/comp.graphics.apps.gnuplot/browse_thread/thread/dc0fa723f21ff268?pli=1

# dataset is:
#1 5 6
#2 7 8
#3 9 10


#Stacked Histogram
reset
set boxwidth 0.3 relative
set style fill solid 1.0 border -1
set style data histograms
set style histogram rowstacked # data grouped in rows
#set datafile missing '-' #sets missing values
set key outside right nobox
set title "Stroke Subtype"
#unset ytics # gets rid of y-axis
unset border # gets rid of graph border
set xtics ("Analysed" 0.00000, "Not Analysed" 1.00000) #sets labels for x-axis, 0 = Data_set 1, 1 = Data_set 2
set xtics border in scale 0,0 nomirror rotate by 90
set ytics rotate by 90
set ylabel "Percentage (%)" rotate by 90
set yrange [0:100]
plot '1.txt' using 1 ti "TIA", '' using 2 ti "POCI/LACI", '' using 3 ti "TACI/PACI"
#Cluster Histogram
reset
set boxwidth 0.5 relative
set style fill solid 1.0 border -1
set style data histograms
#set datafile missing '-' #sets missing values
set key outside right nobox
set title ""
#unset ytics # gets rid of y-axis
unset border # gets rid of graph border
set xtics ("Analysed" 0.00000, "Not Analysed" 1.00000) #sets labels for x-axis, 0 = Data_set 1, 1 = Data_set 2
set xtics border in scale 0,0 nomirror rotate by 90
set ytics border in scale 0,0 rotate by 90
set yrange [0:100]
plot '1.txt' using 1 ti "TIA", '' using 2 ti "POCI/LACI", '' using 3 ti "TACI/PACI" # plot command (same data and series titles as the stacked example above)

gnuplot: Range Plot

# Range plot script written on 31 Aug 2009
# Punctuation of any kind is likely a crucial part of the script, and should be left untouched.
# Sections that are optional are enclosed between comment (#) markers.

# Datafile looks like this:
# 1 54.96876 1425.523 757.3601 5 79.70358 2722.085 956.1428
# 2 11.31661 2247.381 981.4445 6 39.31570 1119.610 638.5529
# 3 142.9088 2125.066 1169.924 7 134.9087 2153.851 678.3504
# 4 171.1888 2908.621 1223.032
set terminal png # needed so that 'set output' actually writes a PNG file
set output 'test1.png'
set title "RANGE PLOT BETA"
#
# set xrange [X1:X2] # where X2>X1
# set yrange [0:3500] # where Y2>Y1
# set key outside right
#
set xtics ("P01" 1.00000, "P02" 2.00000, "P06" 3.00000, "P07" 4.00000, "P03" 5.00000, "P04" 6.00000, "P05" 7.00000)
set xlabel "STATIONS"
set ylabel "CONCENTRATION (MG/KG DRY)"
# plot "FILENAME.EXT" using X:Y1:Y2 with filled curves, "" using X:Y1, "" using X:Y2...
plot "1.txt" using 1:2:3 with filledcurves ti "Nearfield Baseline Range", "" using 1:2 with lines notitle, "" using 1:3 with lines notitle, "" using 1:4 with linespoints ls 17 ti "Mean if Nearfield Range", "" using 5:6:7 with filledcurves ti "Farfield Baseline Range", "" using 5:6 with lines notitle, "" using 5:7 with lines notitle, "" using 5:8 with linespoints ls 7 ti "Mean of Farfield Range"