下面展示一些用于描述性统计分析的R语言代码。
# Columns of the built-in mtcars data set used throughout this script.
vars <- c("mpg", "hp", "wt")
# Preview the first rows of the selected columns.
head(mtcars[, vars])
# Custom summary function, applied column-wise with sapply() to obtain
# descriptive statistics (including skewness and kurtosis).
#
# Arguments:
#   x        numeric vector to summarise.
#   na.omit  if TRUE, NA entries are removed from x before any computation.
#
# Returns a named numeric vector with elements n, mean, stdev, skew and
# kurtosis. Kurtosis is reported as the fourth standardised moment minus 3.
mystats <- function(x, na.omit = FALSE) {
  if (na.omit) {
    x <- x[!is.na(x)]
  }
  n <- length(x)
  m <- mean(x)
  s <- sd(x)
  # Deviations from the mean, shared by the third and fourth moments.
  dev <- x - m
  skew <- sum(dev^3 / s^3) / n
  kurt <- sum(dev^4 / s^4) / n - 3
  c(n = n, mean = m, stdev = s, skew = skew, kurtosis = kurt)
}
# Apply mystats() to each selected column of mtcars.
sapply(mtcars[vars], mystats)
# To simply drop missing values instead:
# sapply(mtcars[vars], mystats, na.omit = TRUE)

# Descriptive statistics via describe() from the Hmisc package.
# Hmisc and psych both export describe(); the call is namespace-qualified so
# it reaches the Hmisc version even when psych is already attached.
library(Hmisc)
Hmisc::describe(mtcars[vars])

# Descriptive statistics via stat.desc() from the pastecs package.
library(pastecs)
stat.desc(mtcars[vars])

# Descriptive statistics via describe() from the psych package, qualified for
# the same reason as the Hmisc call.
library(psych)
psych::describe(mtcars[vars])
# Descriptive statistics by group: mean of each variable per level of am.
aggregate(mtcars[vars], by = list(am = mtcars$am), FUN = mean)
# aggregate() lets each call use a function that returns a single value,
# such as the mean or the standard deviation.
# Grouped descriptive statistics with by().
dstats <- function(x) {
  # Apply mystats() to every element (column) of x.
  sapply(x, mystats)
}
vars <- c("mpg", "hp", "wt")
by(mtcars[vars], INDICES = mtcars$am, FUN = dstats)
# Grouped summary statistics with summaryBy() from the doBy package.
library(doBy)
summaryBy(mpg + hp + wt ~ am, data = mtcars, FUN = mystats)

# Grouped summary statistics with the psych package.
library(psych)
describeBy(mtcars[vars], group = mtcars$am)
# describeBy() does not accept an arbitrary function, so it is less general.
# It computes the same descriptive statistics as describe(), stratified by
# one or more grouping variables.

# Side-by-side comparison of the two approaches.
library(doBy)
summaryBy(mpg + hp + wt ~ am, data = mtcars, FUN = mystats)
library(psych)
describeBy(mtcars[vars], group = list(am = mtcars$am))
#使用reshape包分组计算概述统计量
library(reshape)
# Return the length, mean and standard deviation of x as a named vector
# with elements n, mean and sd.
dstats1 <- function(x) {
  c(n = length(x), mean = mean(x), sd = sd(x))
}
# Melt mtcars with am and cyl as identifier variables and the selected
# columns as measured variables.
dfm <- melt(mtcars, id.vars = c("am", "cyl"), measure.vars = vars)
# Aggregate every am / cyl / variable combination with dstats1().
cast(dfm, am + cyl + variable ~ ., fun.aggregate = dstats1)
> vars <- c("mpg","hp","wt")
> head(mtcars[vars])
mpg hp wt
Mazda RX4 21.0 110 2.62
Mazda RX4 Wag 21.0 110 2.88
Datsun 710 22.8 93 2.32
Hornet 4 Drive 21.4 110 3.21
Hornet Sportabout 18.7 175 3.44
Valiant 18.1 105 3.46
>
> #创造一个统计的函数列表
> #通过sapply()计算描述性统计变量(包括偏度和峰度)
> mystats <- function(x,na.omit = FALSE){
+ if(na.omit)
+ x <- x[!is.na(x)]
+ m <- mean(x)
+ n <- length(x)
+ s <- sd(x)
+ skew <- sum((x-m)^3/s^3)/n
+ kurt <- sum((x-m)^4/s^4)/n - 3
+ return(c(n=n, mean = m, stdev = s, skew = skew, kurtosis = kurt))
+ }
> sapply(mtcars[vars],mystats)
mpg hp wt
n 32.000 32.000 32.0000
mean 20.091 146.688 3.2172
stdev 6.027 68.563 0.9785
skew 0.611 0.726 0.4231
kurtosis -0.373 -0.136 -0.0227
> #sapply(mtcars[vars],mystats,na.omit=TRUE)单纯忽略缺失值
>
> library(Hmisc)
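> describe(mtcars[vars]) #注意:下方输出实为psych::describe()的格式(会话中已加载psych包时describe()被其遮蔽);如需Hmisc版本,请使用Hmisc::describe(mtcars[vars])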
vars n mean sd median trimmed mad min max range skew kurtosis se
mpg 1 32 20.09 6.03 19.20 19.70 5.41 10.40 33.90 23.50 0.61 -0.37 1.07
hp 2 32 146.69 68.56 123.00 141.19 77.10 52.00 335.00 283.00 0.73 -0.14 12.12
wt 3 32 3.22 0.98 3.33 3.15 0.77 1.51 5.42 3.91 0.42 -0.02 0.17
> #通过pastecs包中的stat.desc()函数计算描述性统计量
> library(pastecs)
> stat.desc(mtcars[vars])
mpg hp wt
nbr.val 32.00 32.000 32.000
nbr.null 0.00 0.000 0.000
nbr.na 0.00 0.000 0.000
min 10.40 52.000 1.513
max 33.90 335.000 5.424
range 23.50 283.000 3.911
sum 642.90 4694.000 102.952
median 19.20 123.000 3.325
mean 20.09 146.688 3.217
SE.mean 1.07 12.120 0.173
CI.mean.0.95 2.17 24.720 0.353
var 36.32 4700.867 0.957
std.dev 6.03 68.563 0.978
coef.var 0.30 0.467 0.304
>
> library(psych)
> describe(mtcars[vars])
vars n mean sd median trimmed mad min max range skew kurtosis se
mpg 1 32 20.09 6.03 19.20 19.70 5.41 10.40 33.90 23.50 0.61 -0.37 1.07
hp 2 32 146.69 68.56 123.00 141.19 77.10 52.00 335.00 283.00 0.73 -0.14 12.12
wt 3 32 3.22 0.98 3.33 3.15 0.77 1.51 5.42 3.91 0.42 -0.02 0.17
>
> #分组计算描述性统计量
> aggregate(mtcars[vars], by = list(am = mtcars$am), mean)
am mpg hp wt
1 0 17.1 160 3.77
2 1 24.4 127 2.41
> #aggregate()允许在每次调用中使用平均数、标准差这样的单返回值函数。
>
>
> #使用by()分组计算描述性统计量
> dstats <- function(x)sapply(x,mystats)
> vars <- c("mpg","hp","wt")
> by(mtcars[vars],mtcars$am,dstats)
mtcars$am: 0
mpg hp wt
n 19.000 19.0000 19.000
mean 17.147 160.2632 3.769
stdev 3.834 53.9082 0.777
skew 0.014 -0.0142 0.976
kurtosis -0.803 -1.2097 0.142
-----------------------------------------------------------------------------
mtcars$am: 1
mpg hp wt
n 13.0000 13.000 13.000
mean 24.3923 126.846 2.411
stdev 6.1665 84.062 0.617
skew 0.0526 1.360 0.210
kurtosis -1.4554 0.563 -1.174
>
> #使用doBy包中的summaryBy()分组计算概述统计量
> library(doBy)
> summaryBy(mpg + hp + wt ~ am, data = mtcars, FUN = mystats)
am mpg.n mpg.mean mpg.stdev mpg.skew mpg.kurtosis hp.n hp.mean hp.stdev hp.skew hp.kurtosis wt.n
1 0 19 17.1 3.83 0.0140 -0.803 19 160 53.9 -0.0142 -1.210 19
2 1 13 24.4 6.17 0.0526 -1.455 13 127 84.1 1.3599 0.563 13
wt.mean wt.stdev wt.skew wt.kurtosis
1 3.77 0.777 0.976 0.142
2 2.41 0.617 0.210 -1.174
>
> #使用psych包分组计算概述统计量
> library(psych)
> describeBy(mtcars[vars],mtcars$am)
Descriptive statistics by group
group: 0
vars n mean sd median trimmed mad min max range skew kurtosis se
mpg 1 19 17.15 3.83 17.30 17.12 3.11 10.40 24.40 14.00 0.01 -0.80 0.88
hp 2 19 160.26 53.91 175.00 161.06 77.10 62.00 245.00 183.00 -0.01 -1.21 12.37
wt 3 19 3.77 0.78 3.52 3.75 0.45 2.46 5.42 2.96 0.98 0.14 0.18
-----------------------------------------------------------------------------
group: 1
vars n mean sd median trimmed mad min max range skew kurtosis se
mpg 1 13 24.39 6.17 22.80 24.38 6.67 15.00 33.90 18.90 0.05 -1.46 1.71
hp 2 13 126.85 84.06 109.00 114.73 63.75 52.00 335.00 283.00 1.36 0.56 23.31
wt 3 13 2.41 0.62 2.32 2.39 0.68 1.51 3.57 2.06 0.21 -1.17 0.17
> #不允许指定任意函数,所以普适性较低
> #psych包中的describeBy()函数可计算和describe()相同的描述性统计量,只按照一个或多个分组的变量分层。
>
> #对比分析
> library(doBy)
> summaryBy(mpg+hp+wt~am,data=mtcars,FUN=mystats)
am mpg.n mpg.mean mpg.stdev mpg.skew mpg.kurtosis hp.n hp.mean hp.stdev hp.skew hp.kurtosis wt.n
1 0 19 17.1 3.83 0.0140 -0.803 19 160 53.9 -0.0142 -1.210 19
2 1 13 24.4 6.17 0.0526 -1.455 13 127 84.1 1.3599 0.563 13
wt.mean wt.stdev wt.skew wt.kurtosis
1 3.77 0.777 0.976 0.142
2 2.41 0.617 0.210 -1.174
> library(psych)
> describeBy(mtcars[vars],list(am=mtcars$am))
Descriptive statistics by group
am: 0
vars n mean sd median trimmed mad min max range skew kurtosis se
mpg 1 19 17.15 3.83 17.30 17.12 3.11 10.40 24.40 14.00 0.01 -0.80 0.88
hp 2 19 160.26 53.91 175.00 161.06 77.10 62.00 245.00 183.00 -0.01 -1.21 12.37
wt 3 19 3.77 0.78 3.52 3.75 0.45 2.46 5.42 2.96 0.98 0.14 0.18
-----------------------------------------------------------------------------
am: 1
vars n mean sd median trimmed mad min max range skew kurtosis se
mpg 1 13 24.39 6.17 22.80 24.38 6.67 15.00 33.90 18.90 0.05 -1.46 1.71
hp 2 13 126.85 84.06 109.00 114.73 63.75 52.00 335.00 283.00 1.36 0.56 23.31
wt 3 13 2.41 0.62 2.32 2.39 0.68 1.51 3.57 2.06 0.21 -1.17 0.17
>
>
> #使用reshape包分组计算概述统计量
> library(reshape)
> dstats1 <- function(x)(c(n = length(x), mean = mean(x), sd = sd(x)))
> dfm <- melt(mtcars, measure.vars = vars, id.vars = c("am","cyl"))
> cast(dfm, am + cyl + variable ~ . , dstats1)
am cyl variable n mean sd
1 0 4 mpg 3 22.90 1.453
2 0 4 hp 3 84.67 19.655
3 0 4 wt 3 2.94 0.408
4 0 6 mpg 4 19.12 1.632
5 0 6 hp 4 115.25 9.179
6 0 6 wt 4 3.39 0.116
7 0 8 mpg 12 15.05 2.774
8 0 8 hp 12 194.17 33.360
9 0 8 wt 12 4.10 0.768
10 1 4 mpg 8 28.07 4.484
11 1 4 hp 8 81.88 22.655
12 1 4 wt 8 2.04 0.409
13 1 6 mpg 3 20.57 0.751
14 1 6 hp 3 131.67 37.528
15 1 6 wt 3 2.75 0.128
16 1 8 mpg 2 15.40 0.566
17 1 8 hp 2 299.50 50.205
18 1 8 wt 2 3.37 0.283
以上,共勉,有问题欢迎评论~