# Please download the datasets "spamlearn.txt", "spamtest.txt" and "cars.txt"
# into a local directory. Set that directory as the current working directory of R.
# You can achieve this using the "setwd" command or by selecting "File -> Change dir..."
# We are going to use the following packages: ggplot2, rpart, rpart.plot, CORElearn,
# e1071, randomForest, kernlab, caret, and doParallel. Make sure that the packages
# are installed. You install a package in R with the function install.packages():
#
# install.packages(c("ggplot2", "rpart", "rpart.plot", "CORElearn", "e1071", "randomForest", "kernlab", "caret", "doParallel"))
#
# To install packages without root access:
#
# install.packages(c("rpart", "rpart.plot", "CORElearn", "e1071", "randomForest", "kernlab"), lib="path to my folder")
# library(CORElearn, lib.loc="path to my folder")

#
#
# Linear regression
#
#

library(ggplot2)

dataCars <- read.table("cars.txt", sep=',', header=T, row.names=NULL)

plot(dataCars$price, dataCars$horsepower)
qplot(horsepower, price, data=dataCars, geom=c("point", "smooth"))

linearReg <- lm(horsepower ~ price, data=dataCars)
linearReg

plot(horsepower ~ price, data=dataCars)
abline(linearReg)

#
#
# Nonlinear regression
#
#

nonlinearReg <- nls(horsepower ~ x*price^(1/2), data=dataCars, start=c(x=1))
plot(dataCars$price, dataCars$horsepower)
o <- order(dataCars$price)
lines(dataCars$price[o], predict(nonlinearReg)[o])

nonlinearReg2 <- nls(horsepower ~ x*price^y, data=dataCars, start=c(x=1, y=1))
lines(dataCars[,'price'][o], predict(nonlinearReg2)[o])

#
#
# Classification
#
#

learn <- read.table("spamlearn.txt", header=T)
test <- read.table("spamtest.txt", header=T)

learn$Class <- as.factor(learn$Class)
test$Class <- as.factor(test$Class)

# the target variable is the "Class" attribute
observed <- test$Class

#
#
# KNN
#
#

# fit a model using the "CORElearn" library
library(CORElearn)

cm.knn <- CoreModel(Class ~ ., data=learn, model="knn", kInNN=10)
predicted <- predict(cm.knn, test, type="class")

# The classification accuracy
CA <- function(observed, predicted)
{
    t <- table(observed, predicted)
    sum(diag(t)) / sum(t)
}

CA(observed, predicted)

# How does this compare to the majority classifier?
# (its accuracy is the relative frequency of the most common class)
table(observed)
max(table(observed)) / length(observed)

## Using cross-validation to determine the best k on the learn data.
## Key idea: iteratively hide parts of the train set to get a more robust
## estimate of performance for a given set of hyperparameters.
folds <- 3
foldIdx <- cvGen(nrow(learn), k=folds)

evalCore <- list()
overallScores <- c()

## we will check k in the range from 1 to 10
for (neigh in 1:10){

    ## for each k, we perform a cross-validation on the train (learn) set
    for (j in 1:folds) {

        ## some logging
        print(paste0("Evaluation of k = ", neigh, " and fold: ", j))

        ## select train and test data (within the learn data set!)
        dTrain <- learn[foldIdx != j,]
        dTest <- learn[foldIdx == j,]

        ## train a knn with the given parameter k
        modelCore <- CoreModel(Class ~ ., dTrain, model="knn", kInNN=neigh)

        ## predict on the test set (within the learn set)
        predCore <- predict(modelCore, dTest)

        ## compute the metrics
        evalCore[[j]] <- modelEval(modelCore, correctClass=dTest$Class,
                                   predictedClass=predCore$class, predictedProb=predCore$prob)

        ## cleanup
        destroyModels(modelCore)
    }

    ## aggregate the results
    results <- gatherFromList(evalCore)

    ## get the mean performance across all folds
    meanPerformances <- sapply(results, mean)

    ## append the accuracy to overallScores. Note that the index corresponds to k!
    overallScores <- c(overallScores, meanPerformances['accuracy'])
}

## the best performer in CV shall be used as the final model
bestK <- which.max(overallScores)

## train the knn on the whole learn data set
cm.knn <- CoreModel(Class ~ ., data=learn, model="knn", kInNN=bestK)

## predict on the test set
predicted <- predict(cm.knn, test, type="class")

## get the performance
CA(observed, predicted)

qplot(1:10, overallScores, xlab="k", ylab="Performance (acc)", geom=c("point", "line"))
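## Beyond accuracy, the same confusion matrix produced by table() also yields
## per-class metrics. Below is a minimal sketch (not part of the original
## script) computing precision, recall and F1; it assumes the positive label
## is "spam", as in this data set.
classMetrics <- function(observed, predicted, positive="spam")
{
    t <- table(observed, predicted)
    precision <- t[positive, positive] / sum(t[, positive])  # TP / (TP + FP)
    recall <- t[positive, positive] / sum(t[positive, ])     # TP / (TP + FN)
    f1 <- 2 * precision * recall / (precision + recall)
    c(precision=precision, recall=recall, F1=f1)
}

classMetrics(observed, predicted)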
## Many times, libraries have this cross-validation functionality built in!
performances <- c()
for (kParam in 1:10){
    cm.knn <- cvCoreModel(Class ~ ., data=learn, model="knn", kInNN=kParam, folds=3)
    performances <- c(performances, cm.knn$avgs['accuracy'])
}

optK <- which.max(performances)
cm.knn <- CoreModel(Class ~ ., data=learn, model="knn", kInNN=optK)
predicted <- predict(cm.knn, test, type="class")
CA(observed, predicted)

#
#
# KNN for regression
#
#

# NOTE: this block assumes learn/test hold a data set with a numeric target
# named "make" (e.g. car data); the spam data loaded above has no such
# attribute, so load appropriate learn/test data first.
# A regression tree constrained to a single leaf (minNodeWeightTree=Inf)
# reduces to the model fitted in that leaf (modelTypeReg=7).
cm.knnReg <- CoreModel(make ~ ., data=learn, model="regTree", modelTypeReg=7, minNodeWeightTree=Inf)
predicted <- predict(cm.knnReg, test, type="class")

# The mean squared error
MSE <- function(observed, predicted)
{
    mean((observed - predicted)^2)
}

MSE(test$make, predicted)

# compared to the trivial predictor (the mean value)
MSE(test$make, mean(test$make))

#
#
# DECISION TREES
#
#

# fit a model using the "rpart" library
library(rpart)

dt <- rpart(Class ~ ., data=learn)
plot(dt); text(dt)

predicted <- predict(dt, test, type="class")
CA(observed, predicted)

# obtaining decision rules from trees
library(rpart.plot)
rpart.plot(dt)
rpart.rules(dt)
rpart.rules(dt, extra=4)

#
#
# RANDOM FOREST
#
#

# fit a model using the "CORElearn" library
library(CORElearn)

cm.rf <- CoreModel(Class ~ ., data=learn, model="rf")
predicted <- predict(cm.rf, test, type="class")
CA(observed, predicted)

# fit a model using the "randomForest" library
library(randomForest)

rf <- randomForest(Class ~ ., data=learn)
predicted <- predict(rf, test, type="class")
CA(observed, predicted)

# Some additional models

#
#
# NAIVE BAYES CLASSIFIER
#
#

# fit a model using the "CORElearn" library
library(CORElearn)

cm.nb <- CoreModel(Class ~ ., data=learn, model="bayes")
predicted <- predict(cm.nb, test, type="class")
CA(observed, predicted)

#
#
# SVM
#
#

# fit a model using the "e1071" library
library(e1071)

sm <- svm(Class ~ ., data=learn, cost=100)
predicted <- predict(sm, test, type="class")
CA(observed, predicted)

# Hyperparameter tuning with the "caret" library.
library(caret)

# Evaluation scheme: repeated cross-validation.
control <- trainControl(method="repeatedcv", number=3, repeats=3)

# Parameter grid.
grid <- expand.grid(C=1:10)

# Training.
modelSVM <- train(Class ~ ., data=learn, method="svmLinear", trControl=control, tuneGrid=grid)
predicted <- predict(modelSVM, test, type="raw")
CA(observed, predicted)
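## As a side note, e1071 also ships its own tuning helper, tune(), which runs
## an internal cross-validation over a parameter grid. A minimal sketch (the
## cost values below are an arbitrary choice, not from the original script):
tuned <- tune(svm, Class ~ ., data=learn, ranges=list(cost=c(0.1, 1, 10, 100)))
summary(tuned)

# tune() refits the best configuration on the whole learn set
predicted <- predict(tuned$best.model, test)
CA(observed, predicted)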
# Let's revisit the KNNs.
grid <- expand.grid(k=1:10)

# Training.
control <- trainControl(method="repeatedcv", number=3, repeats=3)
modelKNN <- train(Class ~ ., data=learn, method="knn", trControl=control, tuneGrid=grid)
predicted <- predict(modelKNN, test, type="raw")
CA(observed, predicted)

# Random search over the hyperparameter space.
control <- trainControl(method="repeatedcv", number=3, repeats=1, search="random", allowParallel=TRUE)
set.seed(1523)  # repeatability
rf_random <- train(Class ~ ., data=learn, method="rf", metric="Accuracy",
                   tuneLength=5, trControl=control, verbose=TRUE)
qplot(rf_random$results$mtry, rf_random$results$Accuracy, geom=c("line", "point"))

# Parallelism options.
set.seed(112233)
library(parallel)

# Calculate the number of cores.
no_cores <- detectCores() - 1

library(doParallel)

# Create the cluster for caret to use.
cl <- makePSOCKcluster(no_cores)
registerDoParallel(cl)

# The same random search, this time distributed across the cluster workers.
control <- trainControl(method="repeatedcv", number=3, repeats=1, search="random", allowParallel=TRUE)
set.seed(1523)  # repeatability
rf_random <- train(Class ~ ., data=learn, method="rf", metric="Accuracy",
                   tuneLength=5, trControl=control, verbose=TRUE)
qplot(rf_random$results$mtry, rf_random$results$Accuracy, geom=c("line", "point"))

stopCluster(cl)
registerDoSEQ()
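## To check whether the parallel backend actually pays off, the two train()
## calls can be wrapped in system.time(). A minimal sketch (timings naturally
## vary between machines; the setup mirrors the random search above and
## retrains the forest twice, so it takes a while):
library(caret)
library(doParallel)

control <- trainControl(method="repeatedcv", number=3, repeats=1, search="random", allowParallel=TRUE)

cl <- makePSOCKcluster(no_cores)
registerDoParallel(cl)
tParallel <- system.time(
    train(Class ~ ., data=learn, method="rf", metric="Accuracy", tuneLength=5, trControl=control)
)
stopCluster(cl)
registerDoSEQ()

# with no registered backend, caret falls back to sequential execution
tSequential <- system.time(
    train(Class ~ ., data=learn, method="rf", metric="Accuracy", tuneLength=5, trControl=control)
)

rbind(parallel=tParallel, sequential=tSequential)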