R语言实战之描述性统计分析

R语言实战之描述性统计分析

下面展示一些用于描述性统计分析的 R 语言代码。

# Columns of the built-in mtcars data set used throughout this script.
vars <- c("mpg", "hp", "wt")
# Preview the first six rows of those columns.
head(mtcars[vars], n = 6L)

# Descriptive statistics for a single vector, intended to be applied to each
# column with sapply(); the result includes skewness and kurtosis.
#
# Arguments:
#   x        vector of values to summarise.
#   na.omit  if TRUE, missing values are dropped from x before anything is
#            computed; if FALSE (the default) x is used as given.
#
# Returns a named vector with elements n, mean, stdev, skew and kurtosis.
mystats <- function(x, na.omit = FALSE) {
  if (na.omit) {
    x <- x[!is.na(x)]
  }
  size <- length(x)
  centre <- mean(x)
  spread <- sd(x)
  dev <- x - centre
  # Third and fourth powers of the deviations are scaled by the matching power
  # of sd(x) and averaged over n; 3 is subtracted from the fourth-power term.
  c(
    n = size,
    mean = centre,
    stdev = spread,
    skew = sum(dev^3 / spread^3) / size,
    kurtosis = sum(dev^4 / spread^4) / size - 3
  )
}
# Apply mystats() to each selected column; the result has one column per
# variable and one row per statistic.
sapply(mtcars[vars], mystats)
# To drop missing values first: sapply(mtcars[vars], mystats, na.omit = TRUE)

# Descriptive statistics via describe() from the Hmisc package.
# The call is namespace-qualified because psych (loaded below) also exports a
# function named describe(); an unqualified call silently uses whichever
# package sits first on the search path, which depends on attach order.
library(Hmisc)
Hmisc::describe(mtcars[vars])

# Descriptive statistics via stat.desc() from the pastecs package.
library(pastecs)
stat.desc(mtcars[vars])

# Descriptive statistics via describe() from the psych package, qualified for
# the same reason as the Hmisc call above.
library(psych)
psych::describe(mtcars[vars])

# Descriptive statistics by group: mean of each selected column per level of am.
aggregate(x = mtcars[vars], by = list(am = mtcars[["am"]]), FUN = mean)
# aggregate() accepts single-value functions such as the mean or the standard
# deviation in each call.


# Grouped descriptive statistics with by().
# dstats() applies mystats() to every column of the data frame it receives.
dstats <- function(x) {
  sapply(x, mystats)
}
vars <- c("mpg", "hp", "wt")
# One dstats() table is produced for each value of mtcars$am.
by(mtcars[vars], INDICES = mtcars$am, FUN = dstats)

# Grouped summary statistics with summaryBy() from the doBy package:
# mystats() is applied to mpg, hp and wt within each level of am.
library(doBy)
summaryBy(formula = mpg + hp + wt ~ am, data = mtcars, FUN = mystats)

# Grouped summary statistics with the psych package.
library(psych)
describeBy(mtcars[vars], group = mtcars$am)
# describeBy() does not accept an arbitrary function, so it is less general.
# It computes the same descriptive statistics as describe(), stratified by one
# or more grouping variables.

# Comparison: the same grouping by am, first with doBy::summaryBy() and the
# custom mystats(), then with psych::describeBy() and a named grouping list.
library(doBy)
summaryBy(formula = mpg + hp + wt ~ am, data = mtcars, FUN = mystats)
library(psych)
describeBy(mtcars[vars], group = list(am = mtcars$am))


# Grouped summary statistics with the reshape package.
library(reshape)
# dstats1() returns the count, mean and standard deviation of a vector.
dstats1 <- function(x) {
  c(n = length(x), mean = mean(x), sd = sd(x))
}
# Melt to long form: am and cyl identify each row, the columns named in vars
# become the measured values.
dfm <- melt(mtcars, id.vars = c("am", "cyl"), measure.vars = vars)
# Aggregate each am / cyl / variable combination with dstats1().
cast(dfm, am + cyl + variable ~ ., fun.aggregate = dstats1)
> vars <- c("mpg","hp","wt")
> head(mtcars[vars])
                   mpg  hp   wt
Mazda RX4         21.0 110 2.62
Mazda RX4 Wag     21.0 110 2.88
Datsun 710        22.8  93 2.32
Hornet 4 Drive    21.4 110 3.21
Hornet Sportabout 18.7 175 3.44
Valiant           18.1 105 3.46
> 
> #创造一个统计的函数列表
> #通过sapply()计算描述性统计变量(包括偏度和峰度)
> mystats <- function(x,na.omit = FALSE){
+   if(na.omit)
+     x <- x[!is.na(x)]
+   m <- mean(x)
+   n <- length(x)
+   s <- sd(x)
+   skew <- sum((x-m)^3/s^3)/n
+   kurt <- sum((x-m)^4/s^4)/n - 3
+   return(c(n=n, mean = m, stdev = s, skew = skew, kurtosis = kurt))
+ }
> sapply(mtcars[vars],mystats)
            mpg      hp      wt
n        32.000  32.000 32.0000
mean     20.091 146.688  3.2172
stdev     6.027  68.563  0.9785
skew      0.611   0.726  0.4231
kurtosis -0.373  -0.136 -0.0227
> #sapply(mtcars[myvars],mystats,na.omit=TRUE)单纯忽略缺失值
> 
> library(Hmisc)
> describe(mtcars[vars])
    vars  n   mean    sd median trimmed   mad   min    max  range skew kurtosis    se
mpg    1 32  20.09  6.03  19.20   19.70  5.41 10.40  33.90  23.50 0.61    -0.37  1.07
hp     2 32 146.69 68.56 123.00  141.19 77.10 52.00 335.00 283.00 0.73    -0.14 12.12
wt     3 32   3.22  0.98   3.33    3.15  0.77  1.51   5.42   3.91 0.42    -0.02  0.17
> #通过pastecs包中的stat.desc()函数计算描述性统计量
> library(pastecs)
> stat.desc(mtcars[vars])
                mpg       hp      wt
nbr.val       32.00   32.000  32.000
nbr.null       0.00    0.000   0.000
nbr.na         0.00    0.000   0.000
min           10.40   52.000   1.513
max           33.90  335.000   5.424
range         23.50  283.000   3.911
sum          642.90 4694.000 102.952
median        19.20  123.000   3.325
mean          20.09  146.688   3.217
SE.mean        1.07   12.120   0.173
CI.mean.0.95   2.17   24.720   0.353
var           36.32 4700.867   0.957
std.dev        6.03   68.563   0.978
coef.var       0.30    0.467   0.304
> 
> library(psych)
> describe(mtcars[vars])
    vars  n   mean    sd median trimmed   mad   min    max  range skew kurtosis    se
mpg    1 32  20.09  6.03  19.20   19.70  5.41 10.40  33.90  23.50 0.61    -0.37  1.07
hp     2 32 146.69 68.56 123.00  141.19 77.10 52.00 335.00 283.00 0.73    -0.14 12.12
wt     3 32   3.22  0.98   3.33    3.15  0.77  1.51   5.42   3.91 0.42    -0.02  0.17
> 
> #分组计算描述性统计量
> aggregate(mtcars[vars], by = list(am = mtcars$am), mean)
  am  mpg  hp   wt
1  0 17.1 160 3.77
2  1 24.4 127 2.41
> #aggregate()允许在每次调用中使用平均数、标准差这样的单返回值函数。
> 
> 
> #使用by()分组计算描述性统计量
> dstats <- function(x)sapply(x,mystats)
> vars <- c("mpg","hp","wt")
> by(mtcars[vars],mtcars$am,dstats)
mtcars$am: 0
            mpg       hp     wt
n        19.000  19.0000 19.000
mean     17.147 160.2632  3.769
stdev     3.834  53.9082  0.777
skew      0.014  -0.0142  0.976
kurtosis -0.803  -1.2097  0.142
----------------------------------------------------------------------------- 
mtcars$am: 1
             mpg      hp     wt
n        13.0000  13.000 13.000
mean     24.3923 126.846  2.411
stdev     6.1665  84.062  0.617
skew      0.0526   1.360  0.210
kurtosis -1.4554   0.563 -1.174
> 
> #使用doBy包中的summaryBy()分组计算概述统计量
> library(doBy)
> summaryBy(mpg + hp + wt ~ am, data = mtcars, FUN = mystats)
  am mpg.n mpg.mean mpg.stdev mpg.skew mpg.kurtosis hp.n hp.mean hp.stdev hp.skew hp.kurtosis wt.n
1  0    19     17.1      3.83   0.0140       -0.803   19     160     53.9 -0.0142      -1.210   19
2  1    13     24.4      6.17   0.0526       -1.455   13     127     84.1  1.3599       0.563   13
  wt.mean wt.stdev wt.skew wt.kurtosis
1    3.77    0.777   0.976       0.142
2    2.41    0.617   0.210      -1.174
> 
> #使用psych包分组计算概述统计量
> library(psych)
> describeBy(mtcars[vars],mtcars$am)

 Descriptive statistics by group 
group: 0
    vars  n   mean    sd median trimmed   mad   min    max  range  skew kurtosis    se
mpg    1 19  17.15  3.83  17.30   17.12  3.11 10.40  24.40  14.00  0.01    -0.80  0.88
hp     2 19 160.26 53.91 175.00  161.06 77.10 62.00 245.00 183.00 -0.01    -1.21 12.37
wt     3 19   3.77  0.78   3.52    3.75  0.45  2.46   5.42   2.96  0.98     0.14  0.18
----------------------------------------------------------------------------- 
group: 1
    vars  n   mean    sd median trimmed   mad   min    max  range skew kurtosis    se
mpg    1 13  24.39  6.17  22.80   24.38  6.67 15.00  33.90  18.90 0.05    -1.46  1.71
hp     2 13 126.85 84.06 109.00  114.73 63.75 52.00 335.00 283.00 1.36     0.56 23.31
wt     3 13   2.41  0.62   2.32    2.39  0.68  1.51   3.57   2.06 0.21    -1.17  0.17
> #不允许指定任意函数,所以普适性较低
> #psych包中的describeBy()函数可计算和describe()相同的描述性统计量,只按照一个或多个分组的变量分层。
> 
> #对比分析
> library(doBy)
> summaryBy(mpg+hp+wt~am,data=mtcars,FUN=mystats)
  am mpg.n mpg.mean mpg.stdev mpg.skew mpg.kurtosis hp.n hp.mean hp.stdev hp.skew hp.kurtosis wt.n
1  0    19     17.1      3.83   0.0140       -0.803   19     160     53.9 -0.0142      -1.210   19
2  1    13     24.4      6.17   0.0526       -1.455   13     127     84.1  1.3599       0.563   13
  wt.mean wt.stdev wt.skew wt.kurtosis
1    3.77    0.777   0.976       0.142
2    2.41    0.617   0.210      -1.174
> library(psych)
> describeBy(mtcars[vars],list(am=mtcars$am))

 Descriptive statistics by group 
am: 0
    vars  n   mean    sd median trimmed   mad   min    max  range  skew kurtosis    se
mpg    1 19  17.15  3.83  17.30   17.12  3.11 10.40  24.40  14.00  0.01    -0.80  0.88
hp     2 19 160.26 53.91 175.00  161.06 77.10 62.00 245.00 183.00 -0.01    -1.21 12.37
wt     3 19   3.77  0.78   3.52    3.75  0.45  2.46   5.42   2.96  0.98     0.14  0.18
----------------------------------------------------------------------------- 
am: 1
    vars  n   mean    sd median trimmed   mad   min    max  range skew kurtosis    se
mpg    1 13  24.39  6.17  22.80   24.38  6.67 15.00  33.90  18.90 0.05    -1.46  1.71
hp     2 13 126.85 84.06 109.00  114.73 63.75 52.00 335.00 283.00 1.36     0.56 23.31
wt     3 13   2.41  0.62   2.32    2.39  0.68  1.51   3.57   2.06 0.21    -1.17  0.17
> 
> 
> #使用reshape包分组计算概述统计量
> library(reshape)
> dstats1 <- function(x)(c(n = length(x), mean = mean(x), sd = sd(x)))
> dfm <- melt(mtcars, measure.vars = vars, id.vars = c("am","cyl"))
> cast(dfm, am + cyl + variable ~ . , dstats1)
   am cyl variable  n   mean     sd
1   0   4      mpg  3  22.90  1.453
2   0   4       hp  3  84.67 19.655
3   0   4       wt  3   2.94  0.408
4   0   6      mpg  4  19.12  1.632
5   0   6       hp  4 115.25  9.179
6   0   6       wt  4   3.39  0.116
7   0   8      mpg 12  15.05  2.774
8   0   8       hp 12 194.17 33.360
9   0   8       wt 12   4.10  0.768
10  1   4      mpg  8  28.07  4.484
11  1   4       hp  8  81.88 22.655
12  1   4       wt  8   2.04  0.409
13  1   6      mpg  3  20.57  0.751
14  1   6       hp  3 131.67 37.528
15  1   6       wt  3   2.75  0.128
16  1   8      mpg  2  15.40  0.566
17  1   8       hp  2 299.50 50.205
18  1   8       wt  2   3.37  0.283

以上,共勉,有问题欢迎评论~

你可能感兴趣的:(R语言,算法,深度学习,数据分析,数据结构)