code(n00b): R: Data Handling 2

Calculating the weekly average temperature (across the domain):

In demo.dat, there is about 10 weeks’ worth of weekly temperature data recorded at 2 locations. As a general rule, use only 1 index variable. If the desired index is a combination of 2 or more variables, paste them together with paste(…).

What you ultimately decide to use will usually depend on what you want your output to look like or what kind of post-processing you prefer to deal with. "Bears" is an imaginary variable. No bears of any kind were observed at the site.

The original data:

> demo.dat
          Date Lat.degN Lon.degE Actual.SST.degC Bears Lctn
1 2009-02-27       1.5    103.5           28.01     1    A
2 2009-02-27       0.5    103.5           28.00     2    B
3 2009-03-06       1.5    103.5           28.44     3    A
4 2009-03-06       0.5    103.5           28.38     4    B
5 2009-03-13       1.5    103.5           28.34     5    A
.      .             .      .                .       .    .
.      .             .      .                .       .    .
.      .             .      .                .       .    .
19 2009-05-01       1.5    103.5           29.75    19    A
20 2009-05-01       0.5    103.5           29.84    20    B

tapply(…)

> av.wk.SST <- tapply(demo.dat$Actual.SST.degC,demo.dat$Date,mean)
> av.wk.SST# this is an _array_.
2009-02-27 2009-03-06 2009-03-13 2009-03-20 2009-03-27 2009-04-03
     28.005      28.410      28.345      28.860      29.225      29.345
2009-04-10 2009-04-17 2009-04-24 2009-05-01
     29.635      29.840      30.075      29.795

> av.wk.SST <- as.data.frame(av.wk.SST)# Post processing.
> av.wk.SST
            av.wk.SST
2009-02-27     28.005
2009-03-06     28.410
2009-03-13     28.345
2009-03-20     28.860
2009-03-27     29.225
2009-04-03     29.345
2009-04-10     29.635
2009-04-17     29.840
2009-04-24     30.075
2009-05-01     29.795
>

tapply(…) is usually used on single variables.

by(…)

The by(…) function can handle more than one variable at a time, but requires a little more post-processing.

> demo.dat <- demo.dat[c(1:10),]
> df1 <- by(demo.dat[,c(4,5)],demo.dat$Date,colMeans)
> df1
demo.dat$Date: 2009-02-27
Actual.SST.degC           Bears
         28.005           1.500
------------------------------------------------------------
demo.dat$Date: 2009-03-06
Actual.SST.degC           Bears
          28.41            3.50
------------------------------------------------------------
demo.dat$Date: 2009-03-13
Actual.SST.degC           Bears
         28.345           5.500
------------------------------------------------------------
    .    .
    .    .
    .    .
------------------------------------------------------------
demo.dat$Date: 2009-04-24
Actual.SST.degC           Bears
         30.075          17.500
------------------------------------------------------------
demo.dat$Date: 2009-05-01
Actual.SST.degC           Bears
         29.795          19.500

> length(df1)
[1] 10

> sst.bears <- data.frame((matrix(unlist(df1),nrow=10,byrow=T)),row.names=names(df1))
> sst.bears
                X1   X2
2009-02-27 28.005 1.5
2009-03-06 28.410 3.5
2009-03-13 28.345 5.5
      .       .      .
      .       .      .
2009-04-24 30.075 17.5
2009-05-01 29.795 19.5

> names(sst.bears) <- c("avSST","avBears")
> sst.bears
             avSST avBears
2009-02-27 28.005     1.5
2009-03-06 28.410     3.5
2009-03-13 28.345     5.5
2009-03-20 28.860     7.5
2009-03-27 29.225     9.5
2009-04-03 29.345    11.5
2009-04-10 29.635    13.5
2009-04-17 29.840    15.5
2009-04-24 30.075    17.5
2009-05-01 29.795    19.5
>

aggregate(…)

> df1 <- demo.dat[,c(1,4,5)]
> df1
          Date Actual.SST.degC Bears
1 2009-02-27            28.01     1
2 2009-02-27            28.00     2
3 2009-03-06            28.44     3
.       .                  .       .
.       .                  .       .
.       .                  .       .
20 2009-05-01            29.84    20

> df2 <- aggregate(df1[,c(2,3)],by=list(df1$Date),FUN=mean)
> df2
       Group.1 Actual.SST.degC Bears
1 2009-02-27           28.005   1.5
2 2009-03-06           28.410   3.5
3 2009-03-13           28.345   5.5
4 2009-03-20           28.860   7.5
5 2009-03-27           29.225   9.5
6 2009-04-03           29.345 11.5
7 2009-04-10           29.635 13.5
8 2009-04-17           29.840 15.5
9 2009-04-24           30.075 17.5
10 2009-05-01           29.795 19.5
>
reshape(…)
reshape(…) merely rearranges the data; no calculations are involved. If there’s more than one possible value for each combination you define, it will take the first available value.

> demo.dat1 <- demo.dat[,c(1,4,5,6)]
> names(demo.dat1)
[1] "Date"            "Actual.SST.degC" "Bears"           "Lctn"
> df1 <- reshape(demo.dat1,idvar="Date",timevar="Lctn",direction="wide")
> df1
          Date Actual.SST.degC.A Bears.A Actual.SST.degC.B Bears.B
1 2009-02-27              28.01       1             28.00       2
3 2009-03-06              28.44       3             28.38       4
5 2009-03-13              28.34       5             28.35       6
7 2009-03-20              28.87       7             28.85       8
9 2009-03-27              29.20       9             29.25      10
11 2009-04-03              29.30      11             29.39      12
13 2009-04-10              29.63      13             29.64      14
15 2009-04-17              29.79      15             29.89      16
17 2009-04-24              30.00      17             30.15      18
19 2009-05-01              29.75      19             29.84      20
>

t(…)
The transpose function.

> df2 <- t(df1)
> df2
                  1             3             5             7
Date              "2009-02-27 " "2009-03-06 " "2009-03-13 " "2009-03-20 "
Actual.SST.degC.A "28.01"       "28.44"       "28.34"       "28.87"
Bears.A           " 1"          " 3"          " 5"          " 7"
Actual.SST.degC.B "28.00"       "28.38"       "28.35"       "28.85"
Bears.B           " 2"          " 4"          " 6"          " 8"
                  9             11            13            15
Date              "2009-03-27 " "2009-04-03 " "2009-04-10 " "2009-04-17 "
Actual.SST.degC.A "29.20"       "29.30"       "29.63"       "29.79"
Bears.A           " 9"          "11"          "13"          "15"
Actual.SST.degC.B "29.25"       "29.39"       "29.64"       "29.89"
Bears.B           "10"          "12"          "14"          "16"
                  17            19
Date              "2009-04-24 " "2009-05-01 "
Actual.SST.degC.A "30.00"       "29.75"
Bears.A           "17"          "19"
Actual.SST.degC.B "30.15"       "29.84"
Bears.B           "18"          "20"
>

code(n00b)

Thursday, March 7, 2013

R: Data Handling 2

No comments:

Post a Comment

Potentially Helpful Links