###
#
# CLUSTERING
#
###

# Load the dataset
cities <- read.table("worldcities.csv", sep=",", header = T)
summary(cities)
x <- cities$lng
y <- cities$lat
x <- x / max(x)
y <- y / max(y)
plot(x,y)
df = data.frame(x, y)

# KMeans clustering
results <- kmeans(df, centers=6)
results
plot(x, y, col=results$cluster)

# Are clusters aligned with classes?
library(ggplot2)
data(iris)
subset <- iris[,1:2]

# classes
p1<-qplot(subset$Sepal.Length,subset$Sepal.Width,color = iris$Species, main = "Ground truth") + theme_bw()

resultsIris <-kmeans(subset, centers = 3)
assignments <- as.factor(resultsIris$cluster)
p2<-qplot(subset$Sepal.Length,subset$Sepal.Width,color = assignments, main = "Clustering") + theme_bw()
gridExtra::grid.arrange(p1, p2, nrow = 2)

# Exercise -> try to cluster the iris data according to Length and Width!

# Try to fit models and see what can be learnt best!
library(mclust)
fit <- Mclust(df)
# summarize the fits
plot(fit)
summary(fit)

#Hierarchical clustering
distMat = dist(df)
resultsH <- hclust(distMat)

plot(resultsH)
clusters <- cutree(resultsH, 6)
plot(x, y, col=clusters)


###
#
# ATTRIBUTE EVALUATION
#
###


# Attribute evaluation functions are implemented in the package "CORElearn"

# Load the library
#install.packages("CORElearn")
library(CORElearn)


#
# Attribute estimation in classification
#


#
# Example 1
#

mr <- read.table("mushroom.txt", sep=",", header=T)
mr$edibility <- as.factor(mr$edibility)
summary(mr)

qplot(mr$edibility, ylab="Number of species", main="Edibility of mushrooms", geom = c("bar"))


# the attrEval function evaluates the quality of the attributes/dependent variables 
# specified by the formula. Parameter formula is used as a mechanism to select
# attributes and prediction variable(class). The simplest way is to specify just 
# response variable: "Class ~ .". In this case all other attributes in the data set 
# are evaluated with the selected heuristic method.

# attribute evaluation using information gain
sort(attrEval(edibility ~ ., mr, "InfGain"), decreasing = TRUE)

# build a decision tree using information gain as a splitting criterion
dt <- CoreModel(edibility ~ ., mr, model="tree", selectionEstimator="InfGain")
plot(dt, mr)


#
# Example 2
#

quadrant <- read.table("quadrant.txt", sep=",", header=T)
summary(quadrant)

quadrant$Class <- as.factor(quadrant$Class)

plot(quadrant, col=quadrant$Class)
plot(quadrant$a1, quadrant$a2, col=quadrant$Class)

# attribute evaluation using information gain
sort(attrEval(Class ~ ., quadrant, "InfGain"), decreasing = TRUE)

# information gain is a myopic measure 
# (it assumes that attributes are conditionally independent given the class)

# using information gain to construct a decision tree will produce a poor result
dt2 <- CoreModel(Class ~ ., quadrant, model="tree", selectionEstimator="InfGain")
plot(dt2, quadrant)

# non-myopic measures (Relief and ReleifF)
sort(attrEval(Class ~ ., quadrant, "Relief"), decreasing = TRUE)
sort(attrEval(Class ~ ., quadrant, "ReliefFequalK"), decreasing = TRUE)
sort(attrEval(Class ~ ., quadrant, "ReliefFexpRank"), decreasing = TRUE)

# use ?attrEval to find out different variations of ReliefF measure...

# build a decision tree using ReliefFequalK as a splitting criterion
dt3 <- CoreModel(Class ~ ., quadrant, model="tree", selectionEstimator = "ReliefFequalK")
plot(dt3, quadrant)


#
# Example 3
#

players <- read.table("players.txt", sep=",", header=TRUE)
summary(players)
players$position <- as.factor(players$position)

sort(attrEval(position ~ ., players, "InfGain"), decreasing = TRUE)
sort(attrEval(position ~ ., players, "Gini"), decreasing = TRUE)

# the "id" attribute is overestimated, 
# although it does not carry any useful information

# GainRatio moderates the overestimation of the "id" attribute
sort(attrEval(position ~ ., players, "GainRatio"), decreasing = TRUE)

# Both ReliefF and MDL measures identify the "id" attribute as irrelevant
sort(attrEval(position ~ ., players, "ReliefFequalK"), decreasing = TRUE)
sort(attrEval(position ~ ., players, "MDL"), decreasing = TRUE)

#
#
# Attribute selection
#
#

student <- read.table("student.txt", sep=",", header=T)
student$G1 <- cut(student$G1, c(-Inf, 9, 20), labels=c("fail", "pass"))
student$G2 <- cut(student$G2, c(-Inf, 9, 20), labels=c("fail", "pass"))
student$G3 <- cut(student$G3, c(-Inf, 9, 20), labels=c("fail", "pass"))
student$studytime <- cut(student$studytime, c(-Inf, 1, 2, 3, 4), labels=c("none", "few", "hefty", "endless"))

train = student[1:(nrow(student) * 0.7),]
test = student[(nrow(student) * 0.7):nrow(student),]

#
# Feature selection using a filter method
#

# evaluate the quality of attributes
sort(attrEval(studytime ~ ., student, "MDL"), decreasing = TRUE)

# train a model using everything
model <- CoreModel(studytime ~ ., data=train, model='tree')
predictions <- predict(model, test, type='class')
sum(predictions == test$studytime) / length(predictions)

# train a model using the best evaluated attributes
model <- CoreModel(studytime ~ sex + Walc + Dalc + freetime + higher + paid, data=train, model='tree')
predictions <- predict(model, test, type='class')
sum(predictions == test$studytime) / length(predictions)


# using the appropriate combination of attributes...
model <- CoreModel(studytime ~ Dalc + paid + G2, data=train, model='tree')
predictions <- predict(model, test, type='class')
sum(predictions == test$studytime) / length(predictions)

# What about some model-based feature importances?
library(caret)
set.seed(123780)
#Number randomely variable selected is mtry
control <- trainControl(method='repeatedcv',number=10, repeats=1)

rf.model <- train(studytime ~ ., 
                  data=train, 
                  method='rf', 
                  metric='Accuracy', 
                  trControl=control)

## varImp extracts the feature relevances
rf.importances<- varImp(rf.model, scale = FALSE)
plot(rf.importances, top = 20)
importance.df.values <- rf.importances$importance
importance.df.names <- rownames(rf.importances$importance)
importance.rf.whole <- data.frame(score = importance.df.values,cnames = importance.df.names)
importance.rf.whole <- importance.rf.whole[order(importance.rf.whole$Overall, decreasing = T),]
feature.names.rf <- importance.rf.whole$cnames[1:20]

mdl.importances <- sort(attrEval(studytime ~ ., student, "MDL"), decreasing = TRUE)[1:20]
feature.names.mdl <- names(mdl.importances)
paste0(length(intersect(feature.names.rf, feature.names.mdl))*100/20,"% overlap between top ranked features!")

scaleVector <- function(x) {(x - min(x))/(max(x) - min(x))}

# Let's visualize how similar the rankings' distributions are
rf.all <- scaleVector(importance.rf.whole$Overall)
mdl.all <- scaleVector(as.vector(mdl.importances))

rank.frame <- data.frame(rf.all, mdl.all)

library(dplyr)
library(tidyr)

# How do the two distributions differ?
summary(rank.frame)
rank.frame %>% gather() %>% ggplot(aes(x=value,fill=key))+geom_density(alpha=0.4)+theme_bw()