R语言实现数据操作-CDA数据分析师官网

R语言实现数据操作

2017-12-17

R语言实现数据操作

1.选择与查看数据
#选定数据
>data(iris)
#查看数据,按列展开，观测数据类型
>str(iris)
'data.frame':   150 obs. of 5 variables:
$ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
$ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
$ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
$ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
$ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
#按列展开，进行数据统计观测
>summary(iris)
Sepal.Length    Sepal.Width
Min.   :4.300   Min.   :2.000
1st Qu.:5.100   1st Qu.:2.800
Median :5.800   Median :3.000
Mean   :5.843   Mean   :3.057
3rd Qu.:6.400   3rd Qu.:3.300
Max.   :7.900   Max.   :4.400
Petal.Length    Petal.Width
Min.   :1.000   Min.   :0.100
1st Qu.:1.600   1st Qu.:0.300
Median :4.350   Median :1.300
Mean   :3.758   Mean   :1.199
3rd Qu.:5.100   3rd Qu.:1.800
Max.   :6.900   Max.   :2.500
       Species
setosa    :50
versicolor:50
virginica :50

#按行展开，查看前10行
>head(iris,10)
   Sepal.Length Sepal.Width Petal.Length
1           5.1         3.5          1.4
2           4.9         3.0          1.4
3           4.7         3.2          1.3
4           4.6         3.1          1.5
5           5.0         3.6          1.4
6           5.4         3.9          1.7
7           4.6         3.4          1.4
8           5.0         3.4          1.5
9           4.4         2.9          1.4
10          4.9         3.1          1.5
   Petal.Width Species
1          0.2 setosa
2          0.2 setosa
3          0.2 setosa
4          0.2 setosa
5          0.2 setosa
6          0.4 setosa
7          0.3 setosa
8          0.2 setosa
9          0.2 setosa
10         0.1 setosa
#按行展开，观测后10行
>tail(iris,10)
    Sepal.Length Sepal.Width Petal.Length
141          6.7         3.1          5.6
142          6.9         3.1          5.1
143          5.8         2.7          5.1
144          6.8         3.2          5.9
145          6.7         3.3          5.7
146          6.7         3.0          5.2
147          6.3         2.5          5.0
148          6.5         3.0          5.2
149          6.2         3.4          5.4
150          5.9         3.0          5.1
    Petal.Width   Species
141         2.4 virginica
142         2.3 virginica
143         1.9 virginica
144         2.3 virginica
145         2.5 virginica
146         2.3 virginica
147         1.9 virginica
148         2.0 virginica
149         2.3 virginica
150         1.8 virginica
#观测数据内的某一列（统计该列各取值出现的频数）
>table(iris$Sepal.Length)
4.3 4.4 4.5 4.6 4.7 4.8 4.9   5 5.1 5.2
1   3   1   4   2   5   6 10   9   4
5.3 5.4 5.5 5.6 5.7 5.8 5.9   6 6.1 6.2
1   6   7   6   8   7   3   6   6   4
6.3 6.4 6.5 6.6 6.7 6.8 6.9   7 7.1 7.2
9   7   5   2   8   3   4   1   1   3
7.3 7.4 7.6 7.7 7.9
1   1   1   4   1
#观测数据的容量
> object.size(iris)
7088 bytes

深入观测方法

#选择某一行某一列的数据，例如第1行第1列
>iris[1,1]
[1] 5.1
#使用c()选择多列
> sepal.iris = iris[,c("Sepal.Length","Sepal.Width")]
> str(sepal.iris)
'data.frame':   150 obs. of 2 variables:
$ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
$ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
#指定观测哪几行的哪几列
> FIVE.sepal.iris = iris[1:5,c("Sepal.Length","Sepal.Width")]
> str(FIVE.sepal.iris)
'data.frame':   5 obs. of 2 variables:
$ Sepal.Length: num 5.1 4.9 4.7 4.6 5
$ Sepal.Width : num 3.5 3 3.2 3.1 3.6
#设置筛选条件，例如仅选取iris中Species为setosa的数据，逗号后面指定要保留的列（第1至5列）
> setosa.data = iris[iris$Species=="setosa",1:5]
> str(setosa.data)
'data.frame':   50 obs. of 5 variables:
$ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
$ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
$ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
$ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
$ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
#使用subset函数来获取数据集的子集
> sepal.data = subset(iris,select = c("Sepal.Length","Sepal.Width"))
> str(sepal.data)
'data.frame':   150 obs. of 2 variables:
$ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
$ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
#subset获取仅包含setosa的数据
> setosa.data = subset(iris,Species=="setosa")
> str(setosa.data)
'data.frame':   50 obs. of 5 variables:
$ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
$ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
$ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
$ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
$ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
#subset运用条件来筛选数据
> example.data = subset(iris,Petal.Length<=1.4 & Petal.Width>=0.2,select = Species )
> str(example.data)
'data.frame':   21 obs. of 1 variable:
$ Species: Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
#使用merge函数按共同的列（此处为Species）将两个数据框合并
> flower.type = data.frame(Species = "setosa",Flower = "iris")
> merge(flower.type,iris[1:3,],by = "Species")
Species Flower Sepal.Length Sepal.Width Petal.Length Petal.Width
1 setosa   iris          5.1         3.5          1.4         0.2
2 setosa   iris          4.9         3.0          1.4         0.2
3 setosa   iris          4.7         3.2          1.3         0.2
#函数order返回按指定列排序后的行索引，用该索引对数据框取行即可得到排序后的数据框；下面是按花萼长度从大到小排序
> head(iris[order(iris$Sepal.Length,decreasing = TRUE),])
    Sepal.Length Sepal.Width Petal.Length Petal.Width   Species
132          7.9         3.8          6.4         2.0 virginica
118          7.7         3.8          6.7         2.2 virginica
119          7.7         2.6          6.9         2.3 virginica
123          7.7         2.8          6.7         2.0 virginica
136          7.7         3.0          6.1         2.3 virginica
106          7.6         3.0          6.6         2.1 virginica
扩展
#函数sub与gsub支持使用正则表达式对字符串进行处理，分别替换每个字符串中的第一个匹配项与所有匹配项
> iris10 = iris
> sub("e","z",names(iris10))
[1] "Szpal.Length" "Szpal.Width" "Pztal.Length" "Pztal.Width" "Spzcies"
> gsub("e","z",names(iris10))
[1] "Szpal.Lzngth" "Szpal.Width" "Pztal.Lzngth" "Pztal.Width" "Spzcizs"

R语言

数据分析咨询请扫描二维码

上一篇机器学习和 AI 领域必须了解的工具

下一篇应用大数据技术加强智库建设

R语言实现数据操作

考试指南

报考指南

热门栏目