-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathoutlierDetectionMethods.R
More file actions
157 lines (143 loc) · 4.31 KB
/
outlierDetectionMethods.R
File metadata and controls
157 lines (143 loc) · 4.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#Different Outlier Detection Methods
# Method M1 -- Local Outlier Factor (multivariate).
#
# Scores every row of `df` with `lofactor(df, k)` and reports the rows with
# the highest scores.
# NOTE(review): `lofactor()` is not defined in this file; presumably it is
# DMwR::lofactor -- confirm which package is expected to be attached.
#
# Args:
#   df: data handed unchanged to `lofactor()`.
#   n:  number of top-scoring rows to report. Values above the number of
#       scores are clamped; 0 yields an empty result.
#   k:  second argument forwarded to `lofactor()` (presumably the
#       neighbourhood size -- confirm).
#
# Returns:
#   A one-column integer matrix with the selected row indices in ascending
#   order.
lof <- function(df, n, k) {
  scores <- lofactor(df, k)
  # Clamp n: `[1:n]` would pick a row for n = 0 and pad with NA when n is
  # larger than the number of scores.
  n_top <- max(0, min(n, length(scores)))
  top <- order(scores, decreasing = TRUE)[seq_len(n_top)]
  # Callers receive the indices sorted ascending as an n x 1 matrix.
  matrix(sort(top), ncol = 1L)
}
#mean <- function(df){
#Method M2 -- largest difference from the sample mean (monovariate)
# output_m2 <- outlier(df, logical=TRUE)
# find_outlier_m2 <- which(output_m2==TRUE, arr.ind=TRUE)
# m2 <-find_outlier_m2[,1]
# o2 <-t(t(m2))
# #M2 Output outlier detection
# m2 <-t(t(o2[ order(-o2[,1], o2[,1],decreasing=TRUE), ]))
# #df_outlier$meandiff <- replace(df_outlier$meandiff,m2,1)
# m2
#}
# Method M3 -- Mahalanobis distance (multivariate).
#
# Computes the squared Mahalanobis distance of every row of `df` from the
# column means, using the sample covariance of `df`, and reports the `n`
# most distant rows.
#
# Args:
#   df: numeric data frame or matrix (rows = observations).
#   n:  number of most extreme rows to report. Values above nrow(df) are
#       clamped; 0 yields an empty result.
#
# Returns:
#   A one-column integer matrix with the selected row indices in ascending
#   order.
mahal <- function(df, n) {
  # `tol` travels through mahalanobis()'s `...` to solve(); the tiny value
  # lets nearly singular covariance matrices be inverted.
  dist_sq <- mahalanobis(df, colMeans(df), cov(df), tol = 1e-100)
  # Clamp n: `[1:n]` would still flag one row when n = 0.
  n_top <- max(0, min(n, length(dist_sq)))
  top <- order(dist_sq, decreasing = TRUE)[seq_len(n_top)]
  matrix(sort(top), ncol = 1L)
}
# Method M4 -- labelled "Chi square" (monovariate).
#
# For each column, asks `outlier()` for that column's most extreme value and
# records every row holding that value. Rows found in several columns are
# reported once.
# NOTE(review): `outlier()` is not defined in this file; presumably it is
# outliers::outlier (value with the largest difference from the mean) --
# confirm. No chi-square test is performed in this function itself.
#
# Args:
#   df: data frame or matrix whose columns can be coerced with as.numeric().
#
# Returns:
#   A one-column integer matrix of the flagged row indices in ascending
#   order (zero rows when nothing is flagged or `df` has no columns).
chisq <- function(df) {
  flagged <- integer(0)
  for (i in seq_len(ncol(df))) {
    extreme <- outlier(as.numeric(df[, i]))
    # which() may return nothing; union() copes with that and also removes
    # rows already collected from earlier columns.
    flagged <- union(flagged, which(df[, i] == extreme))
  }
  matrix(sort(flagged), ncol = 1L)
}
# Outlier detection using k-means.
#
# Clusters the rows of `df` with kmeans() and ranks every row by its
# Euclidean distance to the centre of the cluster it was assigned to.
# kmeans() picks random starting centres when `centers` is a count, so
# results can differ between runs unless the caller fixes the RNG seed.
#
# Args:
#   df: numeric data frame or matrix (rows = observations).
#   n:  number of most distant rows to report. Values above nrow(df) are
#       clamped; 0 yields an empty result.
#   k:  passed to kmeans() as `centers`.
#
# Returns:
#   Integer vector of row indices, most distant first.
km <- function(df, n, k) {
  fit <- kmeans(df, centers = k)
  # One centre row per observation; drop = FALSE keeps a matrix even when
  # the data has a single column.
  assigned <- fit$centers[fit$cluster, , drop = FALSE]
  distances <- sqrt(rowSums((as.matrix(df) - assigned)^2))
  # Clamp n: `[1:n]` would return a row for n = 0 and NA for n > nrow(df).
  n_top <- max(0, min(n, length(distances)))
  order(distances, decreasing = TRUE)[seq_len(n_top)]
}
# Outlier detection using the boxplot rule.
#
# For each column, takes the points that boxplot statistics report as lying
# beyond the whiskers and records the rows holding those values. The
# statistics come from boxplot.stats(), so no plot is drawn.
#
# Args:
#   df: data frame or matrix with numeric columns.
#
# Returns:
#   Integer vector of unique row indices, in order of discovery (column by
#   column, ascending row within a column), or NULL when nothing is flagged.
box <- function(df) {
  found <- c()
  for (i in seq_len(ncol(df))) {
    column <- df[, i]
    beyond_whiskers <- grDevices::boxplot.stats(column)$out
    # Keep every row holding a flagged value, including values that occur
    # in more than one row.
    found <- c(found, which(column %in% beyond_whiskers))
  }
  if (length(found) == 0L) {
    return(NULL)
  }
  unique(found)
}
# Outlier detection using the Median Absolute Deviation (MAD).
#
# For each column, flags rows whose value lies more than `cutoff` MADs away
# from the column median (strictly above the upper bound or strictly below
# the lower bound).
#
# Args:
#   df:     data frame or matrix with numeric columns.
#   cutoff: number of MADs that defines the bounds; defaults to 2.
#
# Returns:
#   Integer vector of unique row indices in order of discovery (per column:
#   rows above the upper bound first, then rows below the lower bound), or
#   NULL when nothing is flagged.
MADO <- function(df, cutoff = 2) {
  found <- c()
  for (i in seq_len(ncol(df))) {
    column <- df[, i]
    centre <- median(column)
    spread <- mad(column)
    upper_bound <- centre + cutoff * spread
    lower_bound <- centre - cutoff * spread
    found <- c(
      found,
      which(column > upper_bound),
      which(column < lower_bound)
    )
  }
  if (length(found) == 0L) {
    return(NULL)
  }
  # A row flagged in several columns is reported once.
  unique(found)
}
# Three-sigma outlier detection.
#
# For each column, flags rows whose value lies more than `sigma` standard
# deviations away from the column mean (strictly above the upper bound or
# strictly below the lower bound).
#
# Args:
#   df:    data frame or matrix with numeric columns.
#   sigma: number of standard deviations that defines the bounds;
#          defaults to 3.
#
# Returns:
#   Integer vector of unique row indices in order of discovery (per column:
#   rows above the upper bound first, then rows below the lower bound), or
#   NULL when nothing is flagged.
threeSig <- function(df, sigma = 3) {
  found <- c()
  for (i in seq_len(ncol(df))) {
    column <- df[, i]
    col_mean <- mean(column)
    col_sd <- sd(column)
    lower_bound <- col_mean - sigma * col_sd
    upper_bound <- col_mean + sigma * col_sd
    found <- c(
      found,
      which(column > upper_bound),
      which(column < lower_bound)
    )
  }
  if (length(found) == 0L) {
    return(NULL)
  }
  # A row flagged in several columns is reported once.
  unique(found)
}