R可在多种操作系统下运行,如Windows、MacOS、多种Linux和UNIX等;
命令行驱动
R即时解释,输入命令,即可获得相应的结果;
In [3]:
x <- 4 #也可用=赋值
print(x)
In [4]:
typeof(x)
In [5]:
is.vector(x)
'double'
In [6]:
y <- c( 88 , 5 , 12 , 13 )
print(y)
print(typeof(y))
print(is.vector(y))
In [7]:
x1 <- c( 1 , 2 , 3 , 4 , 5 )
print(x1)
x2 <- 1 : 5
print(x2)
In [8]:
seq(from = 12 , to = 30 , by = 3 )
In [9]:
seq(from=1.1, to= 2 , length= 10 )
[1] "double"
[1] TRUE
In [10]:
rep( 8 , 4 )
c( 8 , 8 , 8 , 8 )
In [11]:
y <- c(y[ 1 : 3 ], 168 , y[ 4 ])
print(y)
In [12]:
y <- c(y[ 1 : 3 ], c( 56 , 24 , 35 , 10 , 5 , 7 ), y[ 4 ])
print(y)
In [13]:
length(y)
In [14]:
c( 1 , 2 , 4 ) + c( 5 , 0 ,-1)
c( 1 , 2 , 4 ) - c( 5 , 0 ,-1)
c( 1 , 2 , 4 ) * c( 5 , 0 ,-1)
c( 1 , 2 , 4 ) / c( 5 , 0 ,-1)
In [15]:
y[ 2 ]
In [16]:
y[ 2 : 4 ]
In [17]:
print(y)
y[ 2 : 4 ] = c( 8 , 14 , 67 )
print(y)
0.2 Inf -
In [18]:
print(y)
print(y[-c( 1 : 3 )]) # 或者b=(1:3) y[-b]
X = c(1,1,1)
Y = c(2,2,2)
temp = c(14.7,18.5,25.9)
RH = c(66,73,41)
wind = c(2.7,8.5,3.6)
rain = c(0,0,0)
area = c(0,0,0)
rank = c(1,2,3)
In [19]:
X = c( 1 , 1 , 1 )
Y = c( 2 , 2 , 2 )
temp = c(14.7,18.5,25.9)
RH = c( 66 , 73 , 41 )
wind = c(2.7,8.5,3.6)
rain = c( 0 , 0 , 0 )
area = c( 0 , 0 , 0 )
rank = c( 1 , 2 , 3 )
ForeData = cbind(X,Y,temp,RH,wind,rain,area,rank)
print(ForeData)
print(is.matrix(ForeData)) # 判断是否为矩阵
X Y temp RH wind rain area rank
[1,] 1 2 14.7 66 2.7 0 0 1
[2,] 1 2 18.5 73 8.5 0 0 2
[3,] 1 2 25.9 41 3.6 0 0 3
[1] TRUE
In [20]:
mdat <- matrix(c( 1 , 2 , 3 , 11 , 12 , 13 ), nrow = 2 , ncol = 3 , byrow = TRUE, dimnames =
print(mdat)
In [21]:
x = matrix(nrow = 2 , ncol = 2 ) # 注意:不能写成matrix(2,3)
x[ 1 , 1 ] = 1
x[ 2 , 1 ] = 2
x[ 1 , 2 ] = 3
x[ 2 , 2 ] = 4
print(x)
In [22]:
colnames(x) = c(‘a’,‘b’)
rownames(x) = c(‘1’,‘2’)
print(x)
In [23]:
print(ForeData[ 2 , 3 ])
row1 1 2 3
row2 11 12 13
a b
1 1 3
2 2 4
temp
18.
In [24]:
print(ForeData[ 1 : 2 , 1 : 3 ])
In [25]:
print(ForeData[ 1 : 2 , c( 1 , 3 )])
X Y temp
[1,] 1 2 14.
[2,] 1 2 18.
X temp
[1,] 1 14.
[2,] 1 18.
In [26]:
a = c( 1 : 60 )
dim1 = c(‘R1’,‘R2’,‘R3’,‘R4’)
dim2 = c(‘C1’,‘C2’,‘C3’,‘C4’,‘C5’)
dim3 = c(‘T1’,‘T2’,‘T3’)
f = array(a,c( 4 , 5 , 3 ),dimnames = list(dim1,dim2,dim3))
print(f)
X = c(1,1,1)
Y = c(2,2,2)
temp = c(14.7,18.5,25.9)
RH = c(66,73,41)
wind = c(2.7,8.5,3.6)
rain = c(0,0,0)
area = c(0,0,0)
month = c(‘aug’,‘aug’,‘aug’)
day = c(‘fri’,‘fri’,‘fri’)
In [27]:
X = c( 1 , 1 , 1 )
Y = c( 2 , 2 , 2 )
temp = c(14.7,18.5,25.9)
RH = c( 66 , 73 , 41 )
wind = c(2.7,8.5,3.6)
rain = c( 0 , 0 , 0 )
area = c( 0 , 0 , 0 )
month = c(‘aug’,‘aug’,‘aug’)
day = c(‘fri’,‘fri’,‘fri’)
ForeDataFrm = data.frame(FX = X,FY = Y, Fmonth = month,Fday = day, Ftemp = temp
print(ForeDataFrm)
In [28]:
names(ForeDataFrm)
In [29]:
is.data.frame(ForeDataFrm)
FX FY Fmonth Fday Ftemp FRH Fwind Frain Farea
1 1 2 aug fri 14.7 66 2.7 0 0
2 1 2 aug fri 18.5 73 8.5 0 0
3 1 2 aug fri 25.9 41 3.6 0 0
'FX' 'FY' 'Fmonth' 'Fday' 'Ftemp' 'FRH' 'Fwind' 'Frain' 'Farea'
In [30]:
print(ForeDataFrm[,c( 1 , 3 )])
print(ForeDataFrm[,c(‘FX’,‘Fmonth’)])
In [31]:
ForeDataFrm$Fwind
ForeDataFrm[[‘Fwind’]]
ForeDataFrm[[ 7 ]]
FX Fmonth
1 1 aug
2 1 aug
3 1 aug
FX Fmonth
1 1 aug
2 1 aug
3 1 aug
In [32]:
a <- 123.
is.numeric(a)
is.integer(a)
is.character(a)
is.logical(a)
In [33]:
b <- “123.4”
is.numeric(b)
is.integer(b)
is.character(b)
is.logical(b)
In [34]:
typeof(a)
typeof(b)
'double'
'character'
In [35]:
a <- as.character(a)
b <- as.double(b)
typeof(a)
typeof(b)
In [36]:
e <- c( 1 : 10 )
f <- as.matrix(e)
print(f)
二章 数据的导入 ;这里只是以txt的导入为例,如果想看更多的文件导入方式,可以看下配套知识点
的第二章,里面有更多格式的文件导入方式;
'character'
'double'
In [37]:
ReportCard1 = read.table(file=‘/home/mw/input/wlong6309/ReportCard1.txt’, heade
ReportCard2 = read.table(file=‘/home/mw/input/wlong6309/ReportCard2.txt’, heade
names(ReportCard1)
names(ReportCard2)
In [38]:
ReportCard = merge(ReportCard1, ReportCard2, by = ‘xh’)
print(head(ReportCard))
In [39]:
Ord = order(ReportCard$math, na.last = TRUE, decreasing = TRUE)
print(Ord) # Ord为位置向量, 1 号学生的数学成绩最高, 3 号学生的数学成绩最低或者为缺失值
'xh' 'sex' 'poli' 'chi' 'math'
'xh' 'fore' 'phy' 'che' 'geo' 'his'
xh sex poli chi math fore phy che geo his
1 92101 2 96 96 87.5 72 93 65 76.0 92
2 92102 1 94 97 86.5 61 93 64 79.5 95
3 92103 2 NA NA NA 66 98 79 89.0 81
4 92104 2 89 97 69.5 86 83 62 83.0 94
5 92105 1 82 85 79.5 60 88 66 72.5 98
6 92106 2 88 88 78.0 60 90 70 81.5 77
In [40]:
a = ReportCard[Ord,]
print(head(a))
In [41]:
a = is.na(ReportCard$math)
print(ReportCard[a,])
In [42]:
a = complete.cases(ReportCard)
print(ReportCard[!a,])
xh sex poli chi math fore phy che geo his
1 92101 2 96 96 87.5 72 93 65 76.0 92
33 92204 2 88 81 87.5 60 84 63 79.0 92
2 92102 1 94 97 86.5 61 93 64 79.5 95
32 92203 2 74 93 84.5 50 89 72 82.5 92
34 92205 2 81 79 84.0 60 91 64 81.0 92
31 92202 1 78 89 83.5 81 91 77 81.0 93
xh sex poli chi math fore phy che geo his
3 92103 2 NA NA NA 66 98 79 89 81
xh sex poli chi math fore phy che geo his
3 92103 2 NA NA NA 66 98 79 89 81
27 92142 2 NaN 70 59 22 68 26 26 63
In [43]:
install.packages(“mice”)
library(mice)
Updating HTML index of packages in ‘.Library’
Making ‘packages.html’ … done
Warning message:
“As of rlang 0.4.0, dplyr must be at least version 0.8.0.
install.packages("dplyr")
and restart R.”In [44]:
print(md.pattern(ReportCard))
xh sex fore phy che geo his chi math poli
58 1 1 1 1 1 1 1 1 1 1 0
1 1 1 1 1 1 1 1 1 1 0 1
1 1 1 1 1 1 1 1 0 0 0 3
0 0 0 0 0 0 0 1 1 2 4
In [45]:
round(sqrt(log( 10 , 2 )),digits= 3 )
In [46]:
mean(y) # 中位数
median(y)
sd(y)
var(y)
max(y)
min(y)
In [47]:
attach(ReportCard)#访问数据框中域访问
SumScore = poli + chi + math + fore + phy + che + geo + his
detach(ReportCard)
AvScore = SumScore/ 8 #计算平均值
ReportCardsumScore=SumScoreReportCardsumScore = SumScore ReportCardsumScore=SumScoreReportCardavScore = AvScore
sum(is.na(ReportCard$sumScore))#计算总分为缺失值的观测值的观测样本数
mean(complete.cases(ReportCard))#计算完整观测样本的比率
In [48]:
sum(y)
cumsum(y)
prod(y)
In [49]:
a = is.na(ReportCardmath)math=ReportCardmath) math = ReportCardmath)math=ReportCardmath
math = math[!a]
dnorm(math,mean(math),sd(math))
In [50]:
str = “You like R. So do I”
str_1 = strsplit(str,‘S’)[[ 1 ]] # 注:列表名$域名 或者 列表名[ [‘域名’] ] 或者 列表框[
str_2 = sub(’ ', ‘‘, sub(’ ', '’, str_1[ 1 ])) # 为什么嵌套:sub好像只能替换第一个
str_3 = toupper(str_2)
print(str_3)
In [51]:
print(diag( 4 ))
In [52]:
m = matrix( 1 , nrow= 2 , ncol= 2 )
n = matrix( 2 , nrow= 2 , ncol= 2 )
print(m)
print(n)
In [53]:
mn = m %*% n
print(mn)
In [54]:
print(diag(mn)) # 输出正对角元素值
In [55]:
mm = matrix( 1 : 9 , nrow= 3 , ncol= 3 , byrow=TRUE)
print(mm)
print(‘转置后的矩阵:’)
print(t(mm))
In [56]:
eigen(mm)
eigen() decomposition
$values
[1] 1.611684e+01 -1.116844e+00 -1.303678e-15
$vectors
[,1] [,2] [,3]
[1,] -0.2319707 -0.78583024 0.4082483
[2,] -0.5253221 -0.08675134 -0.8164966
[3,] -0.8186735 0.61232756 0.4082483
In [57]:
attach(ReportCard)#访问数据框中域访问
SumScore = poli + chi + math + fore + phy + che + geo + his
detach(ReportCard)
AvScore = SumScore/ 8 #计算平均值
ReportCardsumScore=SumScoreReportCardsumScore = SumScore ReportCardsumScore=SumScoreReportCardavScore = AvScore
ReportCard = within(ReportCard,{
avScore[avScore>= 90 ] = ‘A’
avScore[avScore>= 80 & avScore < 90 ] = ‘B’
avScore[avScore>= 70 & avScore < 80 ] = ‘C’
avScore[avScore>= 60 & avScore < 70 ] = ‘D’
avScore[avScore < 60 ] = ‘E’
})
flag = ReportCard$avScore %in% c(‘A’,“B”,“C”,“D”,“E”)
ReportCard$avScore[!flag] = NA
print(ReportCard$avScore)
The following object is masked _by_ .GlobalEnv:
math
Warning message in poli + chi + math:
“longer object length is not a multiple of shorter object length”
[1] "B" "B" NA "B" "C" "C" "C" "C" "C" "C" "C" "C" "C" "D" "C" "D" "D" "C" "D"
[20] "D" "D" "D" "D" "D" "E" "E" NA "E" "E" "E" "B" "B" "C" "C" "C" "C" "C" "C"
[39] "D" "C" "C" "C" "C" "D" "D" "D" "D" "C" "D" "D" "D" "D" "D" "D" "D" "D" "D"
[58] "E" "E" "E"
In [58]:
ReportCardsex=factor(ReportCardsex = factor(ReportCardsex=factor(ReportCardsex, levels = c( 1 , 2 ), labels = c(“M”,“F”))
str(ReportCard$sex)
In [59]:
print(head(ReportCard))
In [60]:
MaleScore = subset(ReportCard, ReportCardKaTeX parse error: Expected 'EOF', got '&' at position 12: sex == 'M' &̲ ReportCardavScore == 'E
print(MaleScore)
Factor w/ 2 levels "M","F": 2 1 2 2 1 2 2 1 1 2 ...
xh sex poli chi math fore phy che geo his sumScore avScore
1 92101 F 96 96 87.5 72 93 65 76.0 92 677.5 B
2 92102 M 94 97 86.5 61 93 64 79.5 95 670.0 B
3 92103 F NA NA NA 66 98 79 89.0 81 NA
4 92104 F 89 97 69.5 86 83 62 83.0 94 673.5 B
5 92105 M 82 85 79.5 60 88 66 72.5 98 629.5 C
6 92106 F 88 88 78.0 60 90 70 81.5 77 624.0 C
xh sex poli chi math fore phy che geo his sumScore avScore
28 92144 M 59 79.0 34.0 34 57 37 37 76 409.5 E
29 92145 M 74 84.5 30.5 33 64 34 34 71 439.5 E
30 92146 M 61 69.0 45.0 20 49 32 32 51 397.5 E
58 92234 M 66 79.0 55.5 57 52 57 41 65 451.0 E
59 92236 M 79 76.0 34.0 28 63 36 36 52 414.0 E
In [61]:
xh = sample(ReportCardxh,size=10,replace=FALSE)samples=ReportCard[ReportCardxh, size = 10 , replace = FALSE) sample_s = ReportCard[ReportCardxh,size=10,replace=FALSE)samples=ReportCard[ReportCardxh %in% xh,]
print(sample_s)
In [62]:
i = 6
repeat{ if(i > 50 ) break else {print(i); i = i + 6 }}
In [63]:
for(i in seq(from = 6 , to = 50 , by = 6 ))
print(i)
xh sex poli chi math fore phy che geo his sumScore avScore
1 92101 F 96 96 87.5 72 93 65 76.0 92 677.5 B
5 92105 M 82 85 79.5 60 88 66 72.5 98 629.5 C
7 92108 F 84 90 69.5 50 80 60 86.5 94 615.5 C
27 92142 F NaN 70 59.0 22 68 26 26.0 63 NaN
30 92146 M 61 69 45.0 20 49 32 32.0 51 397.5 E
39 92211 F 71 73 69.0 42 95 61 76.5 76 556.0 D
41 92213 M 82 76 65.0 60 75 60 78.0 76 569.0 C
46 92218 M 87 72 70.0 65 72 49 62.0 68 534.5 D
56 92231 F 83 84 38.5 60 76 46 65.5 49 515.0 D
58 92234 M 66 79 55.5 57 52 57 41.0 65 451.0 E
In [64]:
summary(ReportCard)
In [65]:
Av.Course = sapply(ReportCard[, 3 : 10 ], FUN = mean, na.rm = TRUE) # 均值
Sd.Course = sapply(ReportCard[, 3 : 10 ],FUN = sd, na.rm = TRUE) # 方差
print(Av.Course)
print(Sd.Course)
xh sex poli chi math
Min. :92101 M:30 Min. :40.00 Min. :63.00 Min. :30.50
1st Qu.:92122 F:30 1st Qu.:74.50 1st Qu.:77.00 1st Qu.:47.25
Median :92174 Median :82.50 Median :84.00 Median :62.50
Mean :92170 Mean :79.64 Mean :83.28 Mean :61.17
3rd Qu.:92217 3rd Qu.:87.00 3rd Qu.:90.00 3rd Qu.:70.75
Max. :92239 Max. :96.00 Max. :97.00 Max. :87.50
NA's :2 NA's :1 NA's :1
fore phy che geo
Min. :20.00 Min. :49.00 Min. :26.00 Min. :26.00
1st Qu.:40.75 1st Qu.:67.75 1st Qu.:45.50 1st Qu.:57.75
Median :50.00 Median :76.50 Median :55.00 Median :66.00
Mean :49.92 Mean :75.20 Mean :54.08 Mean :65.24
3rd Qu.:60.00 3rd Qu.:83.25 3rd Qu.:62.25 3rd Qu.:78.00
Max. :86.00 Max. :98.00 Max. :83.00 Max. :89.00
his sumScore avScore
Min. :49.00 Min. :372.5 Length:60
1st Qu.:71.75 1st Qu.:510.0 Class :character
Median :79.50 Median :554.0 Mode :character
Mean :78.68 Mean :548.7
3rd Qu.:91.00 3rd Qu.:589.2
Max. :98.00 Max. :677.5
NA's :2
poli chi math fore phy che geo his
79.63793 83.27966 61.16949 49.91667 75.20000 54.08333 65.24167 78.68333
poli chi math fore phy che geo his
10.575872 8.127365 15.076417 14.018501 12.351902 12.315474 15.394389 12.735233
In [66]:
Av.Course = colMeans(ReportCard[, 3 : 10 ],na.rm = TRUE) # 各科平均分
Sums.Course = colSums(ReportCard[, 3 : 10 ],na.rm = TRUE) # 各科总分
print(Av.Course)
print(Sums.Course)
In [67]:
Av.Person = rowMeans(ReportCard[, 3 : 10 ],na.rm = TRUE)
Sum.Person = rowSums(ReportCard[, 3 : 10 ],na.rm = TRUE)
print(Av.Person)
print(Sum.Person)
In [68]:
#抽取女生的数据
FeMaleCard = subset(ReportCard,ReportCard$sex == “F”)
#求女生各科成绩的平均值
Des.FeMale = sapply(FeMaleCard[ 3 : 10 ],FUN = mean,na.rm = TRUE)
print(Des.FeMale)
poli chi math fore phy che geo his
79.63793 83.27966 61.16949 49.91667 75.20000 54.08333 65.24167 78.68333
poli chi math fore phy che geo his
4619.0 4913.5 3609.0 2995.0 4512.0 3245.0 3914.5 4721.0
poli chi math fore phy che geo his
80.46429 83.05172 62.34483 48.63333 77.66667 55.80000 67.95000 78.43333
In [69]:
Des.Gender = tapply(ReportCardpoli,INDEX=ReportCardpoli,INDEX = ReportCardpoli,INDEX=ReportCardsex,FUN = summary,na.rm
print(Des.Gender)
In [70]:
Tmp = ReportCard[complete.cases(ReportCard),]
CorMatrix = cor(Tmp[,c( 5 , 7 , 8 )],use = “everything”,method = “pearson”)
print(CorMatrix)
In [71]:
Tmp = ReportCard[complete.cases(ReportCard),]
cor.test(Tmp[, 5 ],Tmp[, 7 ],alternative = “two.side”,method = “pearson”)
Min. 1st Qu. Median Mean 3rd Qu. Max.
56.00 73.25 82.00 78.87 86.75 94.00
$F
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
40.00 76.00 83.00 80.46 88.00 96.00 2
math phy che
math 1.0000000 0.7535317 0.7171637
phy 0.7535317 1.0000000 0.6207730
che 0.7171637 0.6207730 1.0000000
Pearson's product-moment correlation
data: Tmp[, 5] and Tmp[, 7]
t = 8.5775, df = 56, p-value = 8.753e-12
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.6149204 0.8469769
sample estimates:
cor
0.7535317
In [72]:
CrossTable = table(ReportCard[,c( 2 , 12 )])
chisq.test(CrossTable)
In [4]:
Forest = read.table(file=‘/home/mw/input/wlong6309/ForestData.txt’, header = TR
print(head(Forest))
Warning message in chisq.test(CrossTable):
“Chi-squared approximation may be incorrect”
Pearson's Chi-squared test
data: CrossTable
X-squared = 0.67532, df = 3, p-value = 0.879
X Y month day temp RH wind rain area
1 1 2 aug fri 14.7 66 2.7 0 0
2 1 2 aug fri 18.5 73 8.5 0 0
3 1 2 aug fri 25.9 41 3.6 0 0
4 1 2 aug sat 25.9 32 3.1 0 0
5 1 2 aug sun 19.5 39 6.3 0 0
6 1 2 aug sun 17.9 44 2.2 0 0
In [74]:
stem(Forest$temp)
The decimal point is at the |
2 | 2
4 | 26666668111112333588
6 | 755
8 | 022337889038
10 | 1112334566690002223345556667888
12 | 223444677899123344777888899
14 | 012222334456677778911222222444444455667788999999
16 | 011222234446666677888888900001111222333444444446666677777888888999
18 | 00001222222334444556666777888999999000111111222233333344444556666666
20 | 11111122233334444445566666667777778888889900011112222333344445555666+3
22 | 11112223344566778888899990001223333344444455677778889999
24 | 0111112222233333566668889901333445679999
26 | 122344444788899234556788899
28 | 002336779236
30 | 2226880
32 | 344613
In [75]:
Forestmonth=factor(Forestmonth = factor(Forestmonth=factor(Forestmonth,levels = c(“jan”,“feb”,“mar”,“apr”,“may”,"ju
boxplot(temp~month,data = Forest,main = “森林地区各月温度箱线图”)
In [76]:
hist(Forest$temp,xlab = “森林地区温度”,ylab = “频率”,main = “森林地区温度直方图”,cex.
In [77]:
NumGrade = tapply(ReportCardavScore,INDEX=ReportCardavScore,INDEX = ReportCardavScore,INDEX=ReportCardavScore,FUN = length)
barplot(NumGrade,xlab = “平均分等级”,ylab = “人数”,ylim = c( 0 , 25 ))
In [78]:
Pct = round(NumGrade/length(ReportCard$avScore)* 100 , 2 )
GLabs = paste(c(“B”,“C”,“D”,“E”),Pct,“%”,sep = “”)
pie(NumGrade,labels = GLabs,cex = 0.8,main = “平均分等级饼图”,cex.main = 0.8)
In [79]:
plot(Foresttemp,Foresttemp,Foresttemp,ForestRH,main = “森林地区温度和相对湿度的散点图”,xlab = “温度”,ylab
In [80]:
plot(Foresttemp,Foresttemp,Foresttemp,ForestRH,main = “森林地区温度和相对湿度的散点图”,xlab = “温度”,ylab
M0 = lm(RH~temp,data = Forest)
abline(M0coefficients)M.Loess=loess(RH temp,data=Forest)Ord=order(Forestcoefficients) M.Loess = loess(RH~temp,data = Forest) Ord = order(Forestcoefficients)M.Loess=loess(RH temp,data=Forest)Ord=order(Foresttemp)
lines(Foresttemp[Ord],M.Loesstemp[Ord],M.Loesstemp[Ord],M.Loessfitted[Ord],lwd = 1 ,lty = 1 ,col = 2 )
In [81]:
install.packages(“scatterplot3d”)
library(“scatterplot3d”)
with(Forest,scatterplot3d(temp,RH,wind,main=“森林地区温度、相对湿度和风力的三维散点图”
Updating HTML index of packages in '.Library'
Making 'packages.html' ... done
In [82]:
install.packages(“corrgram”)
library(“corrgram”)
corrgram(ReportCard[, 3 : 10 ],lower.panel=panel.shade,upper.panel=panel.pie,text.p
Updating HTML index of packages in '.Library'
Making 'packages.html' ... done
In [83]:
x <- c( 95 , 89 , 68 , 90 , 88 , 60 , 81 , 67 , 60 , 60 , 60 , 63 , 60 , 92 , 60 , 88 , 88 , 87 , 60 , 73 , 60 , 97 , 91 , 60
binom.test(min(sum(x> 80 ),sum(x< 80 )),sum(x!= 80 ), 0.75)
In [84]:
spamail <- c( 310 , 350 , 370 , 377 , 380 , 400 , 415 , 425 , 440 , 295 , 325 , 296 , 250 , 340 , 298 , 365 , 37
wilcox.test(spamail, 320 ,alt=‘great’,conf.int=TRUE)
Exact binomial test
data: min(sum(x > 80), sum(x < 80)) and sum(x != 80)
number of successes = 13, number of trials = 28, p-value = 0.001436
alternative hypothesis: true probability of success is not equal to 0.75
95 percent confidence interval:
0.2751086 0.6613009
sample estimates:
probability of success
0.4642857
Wilcoxon rank sum test
data: spamail and 320
W = 14, p-value = 0.3
alternative hypothesis: true location shift is greater than 0
95 percent confidence interval:
-70 Inf
sample estimates:
difference in location
45
In [85]:
x <- c( 24 , 26 , 29 , 34 , 43 , 58 , 63 , 72 , 87 , 101 )
y <- c( 82 , 87 , 97 , 121 , 164 , 208 , 213 )
wilcox.test(x,y,alternative=“less”,exact=FALSE,correct=FALSE)
Wilcoxon rank sum test
data: x and y
W = 4.5, p-value = 0.001449
alternative hypothesis: true location shift is less than 0
In [86]:
x <- c( 98 , 67 , 13 , 18 , 38 , 41 , 8 , 12 , 289 , 262 , 57 , 30 )
dim(x)<- c( 4 , 3 )
chisq.test(x)
In [87]:
medicine<-matrix(c( 8 , 7 , 2 , 23 ), 2 , 2 )
fisher.test(medicine)
Pearson's Chi-squared test
data: x
X-squared = 15.073, df = 6, p-value = 0.01969
Fisher's Exact Test for Count Data
data: medicine
p-value = 0.002429
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
1.856547 143.340082
sample estimates:
odds ratio
12.12648
In [88]:
drug <- c( 80 , 203 , 236 , 252 , 284 , 368 , 457 , 393 , 133 , 180 , 100 , 160 , 156 , 295 , 320 , 448 , 465 , 48
gr.drug<-c( 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 2 , 2 , 2 , 2 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 4 , 4 , 4 , 4 , 4 , 4 )
kruskal.test(drug,gr.drug)
Kruskal-Wallis rank sum test
data: drug and gr.drug
Kruskal-Wallis chi-squared = 8.0721, df = 3, p-value = 0.04455
In [89]:
beijingfish <- c( 85 , 82 , 82 , 79 , 87 , 75 , 86 , 82 , 90 , 81 , 80 , 76 , 80 , 75 , 81 , 75 )
treat.BF <- c( 1 , 2 , 3 , 4 , 1 , 2 , 3 , 4 , 1 , 2 , 3 , 4 , 1 , 2 , 3 , 4 )
block.BF <- c( 1 , 1 , 1 , 1 , 2 , 2 , 2 , 2 , 3 , 3 , 3 , 3 , 4 , 4 , 4 , 4 )
friedman.test(beijingfish,treat.BF,block.BF)
Friedman rank sum test
data: beijingfish, treat.BF and block.BF
Friedman chi-squared = 8.1316, df = 3, p-value = 0.04337
In [90]:
x <- c( 65 , 79 , 67 , 66 , 89 , 85 , 84 , 73 , 88 , 80 , 86 , 75 )
y <- c( 62 , 66 , 50 , 68 , 88 , 86 , 64 , 62 , 92 , 64 , 81 , 80 )
cor.test(x,y) #pearson相关性检验
cor.test(x,y,meth=‘spearman’) # spearman相关系数
cor.test(x,y,meth=‘kendall’) # kendall相关系数
x:318,910,200,409,425,502,314,1210,1022,1225
y:524,1019,638,815,913,928,605,1516,1219,1624
Pearson's product-moment correlation
data: x and y
t = 3.4403, df = 10, p-value = 0.006328
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.2811026 0.9209916
sample estimates:
cor
0.7362315
Warning message in cor.test.default(x, y, meth = "spearman"):
“Cannot compute exact p-value with ties”
Spearman's rank correlation rho
data: x and y
S = 65.227, p-value = 0.003265
alternative hypothesis: true rho is not equal to 0
sample estimates:
rho
0.7719346
Warning message in cor.test.default(x, y, meth = "kendall"):
“Cannot compute exact p-value with ties”
Kendall's rank correlation tau
data: x and y
z = 2.6181, p-value = 0.008842
alternative hypothesis: true tau is not equal to 0
sample estimates:
tau
0.5846846
In [91]:
x<-c( 318 , 910 , 200 , 409 , 425 , 502 , 314 , 1210 , 1022 , 1225 )
y<-c( 524 , 1019 , 638 , 815 , 913 , 928 , 605 , 1516 , 1219 , 1624 )
plot(x,y)
lm.reg<-lm(y~ 1 +x)
summary(lm.reg)
op=par(mfrow=c( 2 , 2 ))
plot(lm.reg)#产生四个图,分别是:1 residual vs fitted;2 Normal QQ-plot;3 scale-loc
par(op)
Call:
lm(formula = y ~ 1 + x)
Residuals:
Min 1Q Median 3Q Max
-191.52 -86.63 45.26 79.32 138.17
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 393.0431 79.6510 4.935 0.00114 **
x 0.8983 0.1057 8.498 2.82e-05 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 125.4 on 8 degrees of freedom
Multiple R-squared: 0.9003, Adjusted R-squared: 0.8878
F-statistic: 72.21 on 1 and 8 DF, p-value: 2.821e-05
In [92]:
point <- data.frame(x= 425 )
lm.pred <- predict(lm.reg,point,interval=‘prediction’,level=0.95)
print(lm.pred)
fit lwr upr
1 774.8322 466.5557 1083.109
In [93]:
#从 1 加到 100
#方法1:for循环
sum1= 0
for(i in seq (from= 1 , to= 100 ,by= 1 ) ) sum1=sum1+i
print(sum1)
#方法2:repeat循环
i= 0
sum2= 0
repeat{if(i> 100 ) break else {sum2=sum2+i; i=i+1}}
print(sum2)
#方法3:while循环
sum3= 0
i= 0
while(i<= 100 ){ sum3=sum3+i; i=i+1}
print(sum3)
#方法4 : sum函数
print(sum(c( 1 : 100 )))
In [94]:
#从 1 的平方加到 100 的平方#方法1:for函数
sum4= 0
for(i in seq (from= 1 , to= 100 , by= 1 ) ) sum4=sum4+i^ 2
print(sum4)
#方法2:repeat循环
i= 0
sum5= 0
repeat{if(i> 100 )break else { sum5=sum5+i^ 2 ; i=i+1}}
print(sum5)
#方法3:while循环
sum6= 0
i= 0
while (i<= 100 ){ sum6=sum6+i^ 2 ; i=i+1}
print(sum6)
#方法4: sum函数
print(sum(c(( 1 : 100 )^ 2 )))
In [95]:
t = seq(from= 1 , to= 100 , by= 2 ) #从 1 到100,间隔为 2 ,输出数
print(t)
In [96]:
t = c( 1 : 200 )
t = t[-5] #删除第 5 个元素
t = c(t[ 1 : 4 ], 11 , 21 , t[ 5 : 199 ]) #在第五个元素的位置上,添加11,21两个数
print(t)
In [97]:
y = c( 1 : 24 )
t = array(y, c( 3 , 4 , 2 )) #访问第二组数据
print(t)
X = c(1,1,1)
Y = c(2,2,2)
temp = c(14.7,18.5,25.9)
RH = c(66,73,41)
In [98]:
X = c( 1 , 1 , 1 )
Y = c( 2 , 2 , 2 )
temp = c(14.7,18.5,25.9)
RH = c( 66 , 73 , 41 )
data = data.frame(X,Y,temp,RH) #定义数据框
print(data)
In [99]:
print(data[,‘temp’] ) #访问temp列,或者也可以写成data[,3]
X Y temp RH
1 1 2 14.7 66
2 1 2 18.5 73
3 1 2 25.9 41
In [100]:
set.seed( 100 )
y = rnorm( 100 , 0 , 1 )#生成 100 个标准正态分布的 100 个数
print(y)
In [101]:
y = sort(y)#将y中的数值进行排序
print(y)
In [102]:
plot(y, dnorm(y, 0 , 1 ), type=“l”, main=“正态分布密度图”) #生成正态分布密度图
In [103]:
#首先,定义一个f函数
f = function(n){
sum = 0 #定义一个sum,存储总和
for(i in 1 :n) sum = sum + i^ 3 #定义一个for循环,依次将n个数的立方求和
return(sum) #返回求和后的数
}
f( 5 ) #当n为 5 时,调用f函数,结果为 225
In [104]:
round(abs(exp( 1 )-exp( 2 ))^( 1 / 3 ), 2 )
In [105]:
x=c( 3 : 95 )
mean(x)
median(x)
sd(x)
var(x)
max(x)
min(x)
length(x)
sum(x)
In [106]:
Reportcard1 = read.table(“/home/mw/input/wlong6309/ReportCard1.txt”,header=T)
Reportcard2 = read.table(“/home/mw/input/wlong6309/ReportCard2.txt”,header=T)
Reportcard = merge(Reportcard1,Reportcard2,by=‘xh’)
print(head(Reportcard))
xh sex poli chi math fore phy che geo his
1 92101 2 96 96 87.5 72 93 65 76.0 92
2 92102 1 94 97 86.5 61 93 64 79.5 95
3 92103 2 NA NA NA 66 98 79 89.0 81
4 92104 2 89 97 69.5 86 83 62 83.0 94
5 92105 1 82 85 79.5 60 88 66 72.5 98
6 92106 2 88 88 78.0 60 90 70 81.5 77
In [107]:
Reportcard = na.omit(Reportcard)
print(head(Reportcard))
In [108]:
Reportcardsex=factor(Reportcardsex = factor(Reportcardsex=factor(Reportcardsex, levels=c( 1 , 2 ),labels=c(“M”,“F”))
Reportcard$sex
In [109]:
SumScore = rowSums(Reportcard[, 3 : 10 ], na.rm=TRUE)
ReportcardSumScore=SumScoreAvScore=rowMeans(Reportcard[,3:10],na.rm=TRUE)ReportcardSumScore = SumScore AvScore = rowMeans(Reportcard[, 3 : 10 ], na.rm=TRUE) ReportcardSumScore=SumScoreAvScore=rowMeans(Reportcard[,3:10],na.rm=TRUE)ReportcardAvScore = AvScore
print(head(Reportcard))
xh sex poli chi math fore phy che geo his
1 92101 2 96 96 87.5 72 93 65 76.0 92
2 92102 1 94 97 86.5 61 93 64 79.5 95
4 92104 2 89 97 69.5 86 83 62 83.0 94
5 92105 1 82 85 79.5 60 88 66 72.5 98
6 92106 2 88 88 78.0 60 90 70 81.5 77
7 92108 2 84 90 69.5 50 80 60 86.5 94
Levels :
xh sex poli chi math fore phy che geo his SumScore AvScore
1 92101 F 96 96 87.5 72 93 65 76.0 92 677.5 84.6875
2 92102 M 94 97 86.5 61 93 64 79.5 95 670.0 83.7500
4 92104 F 89 97 69.5 86 83 62 83.0 94 663.5 82.9375
5 92105 M 82 85 79.5 60 88 66 72.5 98 631.0 78.8750
6 92106 F 88 88 78.0 60 90 70 81.5 77 632.5 79.0625
7 92108 F 84 90 69.5 50 80 60 86.5 94 614.0 76.7500
In [110]:
Reportcard = within(Reportcard,{
AvScore[AvScore>= 90 ] = ‘A’
AvScore[AvScore>= 80 & AvScore< 90 ] = ‘B’
AvScore[AvScore>= 70 & AvScore< 80 ] = ‘C’
AvScore[AvScore>= 60 & AvScore< 70 ] = ‘D’
AvScore[AvScore< 60 ] = ‘E’
})
avScore = Reportcard[, 12 ]#将重新编码的数据保存到avScore中
print(avScore)
[1] “B” “B” “B” “C” “C” “C” “C” “C” “C” “C” “C” “C” “C” “C” “D” “D” “C” “D” “C”
[20] “D” “D” “D” “D” “E” “E” “E” “E” “E” “B” “C” “C” “C” “C” “C” “C” “C” “C” “C”
[39] “C” “C” “D” “D” “D” “D” “D” “D” “D” “D” “D” “D” “D” “D” “D” “D” “D” “E” “E”
[58] “E”
In [111]:
n=table(Reportcard$AvScore)
barplot(n,ylim=c( 0 , 25 )) #生成柱状图
In [112]:
data = matrix(c( 1 , 2 , 3 , 4 , 5 , 6 ), nrow= 2 )
row_max = c()
row_min = c()
col_max = c()
col_min = c()
for(i in 1 :nrow(data))
{
row_max = c(row_max, max(data[i,]))
row_min = c(row_min, min(data[i,]))
}
data = cbind(data, row_max, row_min)
for(j in 1 :ncol(data))
{
col_max = c(col_max, max(data[,j]))
col_min = c(col_min, min(data[,j]))
}
data = rbind(data, col_max, col_min)
print(data)
row_max row_min
1 3 5 5 1
2 4 6 6 2
col_max 2 4 6 6 2
col_min 1 3 5 5 1
In [3]:
install.packages(“rvest”)
library(rvest)#包含爬虫函数的包
page_text <- read_html(“https://sjz.58.com/xinfang/”)#加载第一页的数据
#获取小区名称
estate_name <- page_text %>% html_nodes(“span.items-name”) %>% html_text()
#获取小区所在位置
estate_detail_address <- page_text %>% html_nodes(“span.list-map”) %>% html_tex
estate_brief_address <- substr(estate_detail_address, 3 , 4 )#所在县区
#均价
estate_price <- page_text %>% html_nodes(“p.price”) %>% html_nodes(“span”)%>% h
#处理数据:翰林观天下售价显示的是周边均价(保留)
estate_price <- c(estate_price[ 1 : 16 ], “15990”, estate_price[ 17 : 59 ])
#将爬取到的数据存入数据框中
estate <- data.frame(name=estate_name,address=estate_brief_address,price=estate
print(head(estate))
至于用R爬取 58 同城新房代码见后续完整的项目哈,马上安排更新,欢迎 点赞、Fork 哈!!
【 R语言配套知识点详细总结】
In [ ]:
Updating HTML index of packages in '.Library'
Making 'packages.html' ... done
name address price
1 紫晶悦和中心 长安 14800
2 天润福庭 藁城 10500
3 美好时光 裕华 12500
4 玖筑翰府 开发 11000
5 绿城诚园 新华 12800
6 东华国樾府 裕华 15500