Wednesday, September 25, 2013

R: Working with character strings


If the variable in question is not recognized as a character vector, declare it as such.
Here, we will work on a variable called PicName, taken from the raw data. It is stored as a factor, so we take its levels to get a character vector.
> PicNames <- levels(AIraw$PicName)
> PicNames
 [1] "SIN_220512_01_1B" "SIN_220512_01_1F" "SIN_220512_01_2B" "SIN_220512_01_2F"
 [5] "SIN_220512_01_3B" "SIN_220512_01_3F" "SIN_220512_03_1B" "SIN_220512_03_1F"
 [9] "SIN_220512_03_2B" "SIN_220512_03_2F" "SIN_220512_03_3B" "SIN_220512_03_3F"
[13] "SIN_220512_12_1B" "SIN_220512_12_1F" "SIN_220512_12_2B" "SIN_220512_12_2F"
[17] "SIN_220512_12_3B" "SIN_220512_12_3F"
> class(PicNames)
[1] "character"
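strsplit(…): the string split function. It returns a list with one element per input string, so we use unlist(…) to flatten the result into a single vector.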
> PN.splt <- unlist(strsplit(PicNames,"_"))
> PN.splt
 [1] "SIN"    "220512" "01"     "1B"     "SIN"    "220512" "01"     "1F"   
 [9] "SIN"    "220512" "01"     "2B"     "SIN"    "220512" "01"     "2F"   
[17] "SIN"    "220512" "01"     "3B"     "SIN"    "220512" "01"     "3F"   
[25] "SIN"    "220512" "03"     "1B"     "SIN"    "220512" "03"     "1F"   
[33] "SIN"    "220512" "03"     "2B"     "SIN"    "220512" "03"     "2F"   
[41] "SIN"    "220512" "03"     "3B"     "SIN"    "220512" "03"     "3F"   
[49] "SIN"    "220512" "12"     "1B"     "SIN"    "220512" "12"     "1F"   
[57] "SIN"    "220512" "12"     "2B"     "SIN"    "220512" "12"     "2F"   
[65] "SIN"    "220512" "12"     "3B"     "SIN"    "220512" "12"     "3F"
Each element of PicNames carries 4 pieces of information, separated by underscores for easy reading. Because we split on "_" and then unlisted the result, all the pieces are in a single vector, four consecutive elements per name; taking every 4th element therefore recovers one field at a time.
> Country <- PN.splt[seq(1,72,4)]
> Country
 [1] "SIN" "SIN" "SIN" "SIN" "SIN" "SIN" "SIN" "SIN" "SIN" "SIN" "SIN" "SIN"
[13] "SIN" "SIN" "SIN" "SIN" "SIN" "SIN"
> Date <- PN.splt[seq(2,72,4)]
> Date
 [1] "220512" "220512" "220512" "220512" "220512" "220512" "220512" "220512"
 [9] "220512" "220512" "220512" "220512" "220512" "220512" "220512" "220512"
[17] "220512" "220512"
> Var1 <- PN.splt[seq(3,72,4)]
> Var1
 [1] "01" "01" "01" "01" "01" "01" "03" "03" "03" "03" "03" "03" "12" "12" "12"
[16] "12" "12" "12"
> Var2 <- PN.splt[seq(4,72,4)]
> Var2
 [1] "1B" "1F" "2B" "2F" "3B" "3F" "1B" "1F" "2B" "2F" "3B" "3F" "1B" "1F" "2B"
[16] "2F" "3B" "3F"
>
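substr(…): the substring function. Because every field has a fixed width, each piece can also be extracted by its start and stop character positions.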
> CountryCode <- substr(PicNames,1,3)
> CountryCode
 [1] "SIN" "SIN" "SIN" "SIN" "SIN" "SIN" "SIN" "SIN" "SIN" "SIN" "SIN" "SIN"
[13] "SIN" "SIN" "SIN" "SIN" "SIN" "SIN"
> SamplingTime <- substr(PicNames,5,10)
> SamplingTime
 [1] "220512" "220512" "220512" "220512" "220512" "220512" "220512" "220512"
 [9] "220512" "220512" "220512" "220512" "220512" "220512" "220512" "220512"
[17] "220512" "220512"
> Var1a <- substr(PicNames,12,13)
> Var1a
 [1] "01" "01" "01" "01" "01" "01" "03" "03" "03" "03" "03" "03" "12" "12" "12"
[16] "12" "12" "12"
> Var1b <- substr(PicNames,15,16)
> Var1b
 [1] "1B" "1F" "2B" "2F" "3B" "3F" "1B" "1F" "2B" "2F" "3B" "3F" "1B" "1F" "2B"
[16] "2F" "3B" "3F"


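textConnection(…): treats the character vector as if it were a file, so it can be read with read.delim using "_" as the separator. Note that V3 is read as a number, so "01" becomes 1; pass colClasses="character" to keep the leading zeros.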
> txtCnct <- textConnection(PicNames)
> PN <- read.delim(txtCnct,sep="_",header=F)
> close(txtCnct)
> PN
    V1     V2 V3 V4
1  SIN 220512  1 1B
2  SIN 220512  1 1F
3  SIN 220512  1 2B
4  SIN 220512  1 2F
5  SIN 220512  1 3B
6  SIN 220512  1 3F
7  SIN 220512  3 1B
8  SIN 220512  3 1F
9  SIN 220512  3 2B
10 SIN 220512  3 2F
11 SIN 220512  3 3B
12 SIN 220512  3 3F
13 SIN 220512 12 1B
14 SIN 220512 12 1F
15 SIN 220512 12 2B
16 SIN 220512 12 2F
17 SIN 220512 12 3B
18 SIN 220512 12 3F
>