activeLearning-R/outlierDetectionMethods.R at master · kns94/activeLearning-R · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#Different Outlier Detection Methods

#' Local Outlier Factor detection (method M1, multivariate).
#'
#' Scores every row of `df` with `lofactor()` and reports the rows with the
#' highest scores.
#'
#' @param df Data passed unchanged to `lofactor()`.
#' @param n Number of top-scoring rows to report. Capped at the number of
#'   scores returned; `n <= 0` yields an empty result.
#' @param k Second argument of `lofactor()` (presumably the neighbour count
#'   of the LOF implementation in use -- confirm against the providing
#'   package).
#' @return A one-column integer matrix holding the selected row indices in
#'   ascending order.
lof <- function(df, n, k) {
  outlier_scores <- lofactor(df, k)
  ranked <- order(outlier_scores, decreasing = TRUE)
  # seq_len() rather than 1:n: `1:0` is c(1, 0) and would still select one
  # row, and indexing past the end would pad the result with NA.
  top <- ranked[seq_len(min(max(n, 0), length(ranked)))]
  matrix(sort(top), ncol = 1)
}

#mean <- function(df){
#Method M2 -- largest difference from the sample mean (monovariate)
#  output_m2 <- outlier(df, logical=TRUE)
#  find_outlier_m2 <- which(output_m2==TRUE, arr.ind=TRUE)
#  m2 <-find_outlier_m2[,1]
#  o2 <-t(t(m2))
#  #M2 Output  outlier detection
#  m2 <-t(t(o2[ order(-o2[,1], o2[,1],decreasing=TRUE), ]))
#  #df_outlier$meandiff <- replace(df_outlier$meandiff,m2,1)
#  m2
#}

#' Mahalanobis-distance outlier detection (method M3, multivariate).
#'
#' Computes the squared Mahalanobis distance of every row of `df` from the
#' column means, using the sample covariance of `df`, and reports the rows
#' with the largest distances.
#'
#' @param df Numeric data frame or matrix, one observation per row.
#' @param n Number of most extreme rows to report. Capped at `nrow(df)`;
#'   `n <= 0` yields an empty result.
#' @return A one-column integer matrix holding the selected row indices in
#'   ascending order.
mahal <- function(df, n) {
  # The tiny tolerance is forwarded by mahalanobis() to solve(), so that a
  # nearly singular covariance matrix is still inverted instead of raising
  # an error.
  distances <- mahalanobis(df, colMeans(df), cov(df), tol = 1e-100)
  ranked <- order(distances, decreasing = TRUE)
  # seq_len() rather than 1:n: `1:0` is c(1, 0) and would still mark one row.
  top <- ranked[seq_len(min(max(n, 0), length(ranked)))]
  matrix(sort(top), ncol = 1)
}

#' Per-column extreme-value detection (method M4, monovariate).
#'
#' For every column of `df`, asks `outlier()` for the column's most extreme
#' value and collects the rows that hold exactly that value.
#'
#' @param df Data frame (or matrix) whose columns can be converted with
#'   `as.numeric()`.
#' @return A one-column integer matrix holding the distinct flagged row
#'   indices in ascending order (zero rows when nothing matches).
chisq <- function(df) {
  flagged <- integer(0)
  # seq_len() keeps the loop from running on a zero-column input.
  for (i in seq_len(ncol(df))) {
    extreme <- outlier(as.numeric(df[, i]))
    # Appending the whole match vector is safe when it is empty; the former
    # element-wise `1:length(colout)` loop stored NA and then failed on a
    # zero-length `if` condition in that case.
    flagged <- c(flagged, which(df[, i] == extreme))
  }
  matrix(sort(unique(flagged)), ncol = 1)
}

#' k-means based outlier detection.
#'
#' Clusters `df` with `kmeans()` and reports the rows lying farthest
#' (Euclidean distance) from the centre of the cluster they were assigned to.
#' `kmeans()` picks random starting centres, so results can vary between
#' calls unless the caller fixes the seed.
#'
#' @param df Numeric data frame or matrix, one observation per row.
#' @param n Number of farthest rows to report. Capped at `nrow(df)`;
#'   `n <= 0` yields an empty result.
#' @param k Passed to `kmeans()` as `centers`.
#' @return Integer vector of row indices, farthest first.
km <- function(df, n, k) {
  fit <- kmeans(df, centers = k)
  # One centre row per observation, aligned with the rows of df.
  assigned <- fit$centers[fit$cluster, , drop = FALSE]
  distances <- sqrt(rowSums((df - assigned)^2))
  ranked <- order(distances, decreasing = TRUE)
  # seq_len() rather than 1:n: `1:0` is c(1, 0) and would still return one
  # row, and indexing past the end would add NA to the result.
  ranked[seq_len(min(max(n, 0), length(ranked)))]
}

#' Boxplot-rule outlier detection (monovariate).
#'
#' For every column of `df`, takes the points that `boxplot()` reports as
#' outliers (its `out` component) and collects the rows holding those values.
#'
#' @param df Numeric data frame, one observation per row.
#' @return Integer vector of distinct row indices in order of first
#'   detection (column by column), or `NULL` when nothing is flagged.
box <- function(df) {
  hits <- lapply(seq_len(ncol(df)), function(i) {
    column <- df[, i]
    # plot = FALSE: only the statistics are needed, so no figure is drawn.
    flagged <- boxplot(column, plot = FALSE)$out
    # %in% keeps every row holding a flagged value; rows sharing the same
    # outlying value used to be dropped by a `length(a) == 1` check.
    which(column %in% flagged)
  })
  found <- unique(unlist(hits, use.names = FALSE))
  if (length(found) == 0L) NULL else found
}

#' Median Absolute Deviation (MAD) outlier detection (monovariate).
#'
#' For every column of `df`, flags the rows whose value lies more than
#' `cutoff` MADs above or below the column median.
#'
#' @param df Numeric data frame, one observation per row.
#' @param cutoff Number of MADs away from the median at which a value is
#'   considered an outlier. Defaults to 2, the previously hard-coded value.
#' @return Integer vector of distinct row indices in order of first
#'   detection (per column: values above the upper bound first, then values
#'   below the lower bound), or `NULL` when nothing is flagged.
MADO <- function(df, cutoff = 2) {
  # seq_len() makes a zero-column input yield no hits instead of an error.
  hits <- lapply(seq_len(ncol(df)), function(i) {
    column <- df[, i]
    center <- median(column)
    spread <- mad(column)
    c(
      which(column > center + cutoff * spread),
      which(column < center - cutoff * spread)
    )
  })
  found <- unique(unlist(hits, use.names = FALSE))
  if (length(found) == 0L) NULL else found
}

#' Three-sigma outlier detection (monovariate).
#'
#' For every column of `df`, flags the rows whose value lies more than
#' `sigma` standard deviations above or below the column mean.
#'
#' @param df Numeric data frame, one observation per row.
#' @param sigma Number of standard deviations away from the mean at which a
#'   value is considered an outlier. Defaults to 3, the previously
#'   hard-coded value.
#' @return Integer vector of distinct row indices in order of first
#'   detection (per column: values above the upper bound first, then values
#'   below the lower bound), or `NULL` when nothing is flagged.
threeSig <- function(df, sigma = 3) {
  # seq_len() makes a zero-column input yield no hits instead of an error.
  hits <- lapply(seq_len(ncol(df)), function(i) {
    column <- df[, i]
    col_mean <- mean(column)
    col_sd <- sd(column)
    c(
      which(column > col_mean + sigma * col_sd),
      which(column < col_mean - sigma * col_sd)
    )
  })
  found <- unique(unlist(hits, use.names = FALSE))
  if (length(found) == 0L) NULL else found
}