# 4lab.R
# ************************************************
# This work is licensed under a Creative Commons
# Attribution-NonCommercial 4.0 International License.
# ************************************************
#
#
# Department of Computer Science
# University of Surrey
# GUILDFORD
# Surrey GU2 7XH
#
# 24th October 2019
#
# UPDATE
# 1.00 1/2/2017 Initial Version
# 1.01 19/2/2018 Updated function calls
# 1.02 6/3/2018 Corrected bug removed "inputs_inputs"
# 1.03 24/2/2019 Lab 4 2019 updates
# 1.04 1/3/2019 Added "PerformanceAnalytics" as required library
# 1.05 22/10/2019 Updated for PBA (Basic solution)
# 1.08 29/05/2020 main() initialise allResults
# 1.09 2/6/2020 Use lab4DataPrepNew.R
# 1.10 26/11/2020 Changed "discreet" to "discrete" [woops!]
# ************************************************
# For lab 4
# ************************************************
# clears all objects in "global environment"
rm(list=ls())
# ************************************************
# Global Environment variables
# - i.e. available to all functions
# ************************************************
# Good practice to place constants in variables
# I use UPPERCASE to identify these in my code
DATASET_FILENAME <- "UCI-G.csv" #Name of input dataset file
OUTPUT_FIELD <- "Status" # Field name of the output class to predict
# These are the data preparation values
HOLDOUT <- 70 # % split to create TRAIN dataset
# Cutoff values - you can experiment with these
CUTOFF_OUTLIER <- 0.99 # Confidence p-value for outlier detection
# negative = analyse but do not replace outliers
CUTOFF_DISCRETE <- 6 # Number of empty bins to determine discrete
CUTOFF_REDUNDANT <- 0.95 # Linear correlation coefficient cut-off
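# ************************************************
# Illustrative sketches only (not part of the provided lab code):
# they show how HOLDOUT and CUTOFF_REDUNDANT could typically be
# used. The lab provides its own functions for this in
# lab4DataPrepNew.R; the helper names below are assumptions.

# Split a data frame into TRAIN/TEST subsets using the HOLDOUT %
exampleHoldoutSplit<-function(dataset){
  trainRows<-sample(nrow(dataset),size=round(nrow(dataset)*HOLDOUT/100))
  return(list(train=dataset[trainRows,],test=dataset[-trainRows,]))
} #endof exampleHoldoutSplit()

# List pairs of numeric fields whose absolute linear correlation
# exceeds CUTOFF_REDUNDANT (candidates for removal as redundant)
exampleFindRedundantPairs<-function(numericData){
  correlations<-abs(cor(numericData,use="pairwise.complete.obs"))
  correlations[lower.tri(correlations,diag=TRUE)]<-NA
  return(which(correlations>CUTOFF_REDUNDANT,arr.ind=TRUE))
} #endof exampleFindRedundantPairs()
# ************************************************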
# Indicates the type of each field
TYPE_DISCRETE <- "DISCRETE" # field is discrete (numeric)
TYPE_ORDINAL <- "ORDINAL" # field is continuous numeric
TYPE_SYMBOLIC <- "SYMBOLIC" # field is a string
TYPE_NUMERIC <- "NUMERIC" # field is initially a numeric
TYPE_IGNORE <- "IGNORE" # field is not encoded
MAX_LITERALS <- 55 # Maximum number of 1-hot-encoding fields
# These are the supervised model constants
PDF_FILENAME <- "tree.pdf" # Name of PDF with graphical tree diagram
RULES_FILENAME <- "rules.txt" # Name of text file with rules saved
RESULTS_FILENAME <- "results.csv" # Name of the CSV results file
NODE_LEVEL <- 1 # The number is the node level of the tree to print
BOOST <- 20 # Number of boosting iterations. 1=single model
FOREST_SIZE <- 1000 # Number of trees in the forest
SCALE_DATASET <- TRUE # Set to true to scale dataset before ML stage
BASICNN_HIDDEN <- 5 # Number of hidden layer neurons
BASICNN_EPOCHS <- 100 # Maximum number of training epochs
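# ************************************************
# Illustrative sketch only (not part of the provided lab code):
# it shows where BOOST and FOREST_SIZE would typically be used,
# i.e. as the "trials" argument of C50::C5.0() and the "ntree"
# argument of randomForest::randomForest(). The helper name is
# an assumption.
exampleEnsembles<-function(train){
  positionClassOutput<-which(names(train)==OUTPUT_FIELD)

  # Boosted C5.0 decision tree (BOOST boosting iterations)
  boostedTree<-C50::C5.0(x=train[,-positionClassOutput],
                         y=factor(train[,positionClassOutput]),
                         trials=BOOST)

  # Random forest with FOREST_SIZE trees
  forest<-randomForest::randomForest(x=train[,-positionClassOutput],
                                     y=factor(train[,positionClassOutput]),
                                     ntree=FOREST_SIZE)

  return(list(boost=boostedTree,forest=forest))
} #endof exampleEnsembles()
# ************************************************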
# See https://cran.r-project.org/web/packages/h2o/h2o.pdf
DEEP_HIDDEN <- c(5,5) # Number of neurons in each layer
DEEP_STOPPING <- 2 # Number of times no improvement before stop
DEEP_TOLERANCE <- 0.01 # Error threshold
DEEP_ACTIVATION <- "TanhWithDropout" # Non-linear activation function
DEEP_REPRODUCABLE <- TRUE # Set to TRUE so that training gives the same result on each run
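# ************************************************
# Illustrative sketch only (not part of the provided lab code):
# it shows how the DEEP_* constants above could be passed to
# h2o::h2o.deeplearning() (see the h2o manual linked above).
# The helper name is an assumption; trainFrame is assumed to be
# an H2OFrame that already contains the OUTPUT_FIELD column.
exampleDeepLearn<-function(trainFrame){
  return(h2o::h2o.deeplearning(y=OUTPUT_FIELD,
                               training_frame=trainFrame,
                               hidden=DEEP_HIDDEN,
                               activation=DEEP_ACTIVATION,
                               stopping_rounds=DEEP_STOPPING,
                               stopping_tolerance=DEEP_TOLERANCE,
                               reproducible=DEEP_REPRODUCABLE,
                               seed=123))
} #endof exampleDeepLearn()
# ************************************************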
# Define and then load the libraries used in this project
# Library from CRAN       Version
# pacman                  0.5.1
# outliers                0.14
# corrplot                0.84
# MASS                    7.3.53
# formattable             0.2.0.1
# stats                   4.0.3
# caret                   (version not recorded)
# PerformanceAnalytics    2.0.4
# stringr                 1.4.0
# partykit                1.2.8
# C50                     0.1.3.1
# randomForest            4.6.14
# h2o                     3.32.0.1
# keras                   2.3.0.0
MYLIBRARIES<-c("outliers",
               "corrplot",
               "MASS",
               "formattable",
               "stats",
               "caret",
               "PerformanceAnalytics",
               "stringr",
               "partykit",
               "C50",
               "randomForest",
               "keras",
               "h2o")
# User defined functions are next
# ************************************************
# simpleDT() :
#
# Create C5 Decision Tree on the raw dataset
# A decision tree may not need the dataset to be pre-processed
#
# INPUT :
# Data Frame - train - original train dataset
# Data Frame - test - original test dataset (not used in this basic version)
# boolean - plot - TRUE = plot charts (not used in this basic version)
#
# OUTPUT :
# Object - tree - trained C5.0 decision tree (with rules)
# ************************************************
simpleDT<-function(train,test,plot=TRUE){

  # Position of the output field to predict within the data frame
  positionClassOutput<-which(names(train)==OUTPUT_FIELD)

  # Train a single (trials=1) C5.0 tree on the raw dataset,
  # generating rules as well as the tree
  tree<-C50::C5.0(x=train[,-positionClassOutput],
                  y=factor(train[,positionClassOutput]),
                  rules=TRUE,
                  trials=1)

  return(tree)
} #endof simpleDT()
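# ************************************************
# Illustrative sketch only (not part of the provided lab code):
# it shows how the C5.0 tree returned by simpleDT() could be used
# to classify the hold-out TEST records and report a simple
# accuracy percentage. The helper name is an assumption.
exampleEvaluateTree<-function(tree,test){
  positionClassOutput<-which(names(test)==OUTPUT_FIELD)

  # Classify each test record with the trained tree
  predicted<-predict(tree,newdata=test[,-positionClassOutput],type="class")
  expected<-test[,positionClassOutput]

  # Percentage of test records classified correctly
  accuracy<-round(100*mean(as.character(predicted)==as.character(expected)),digits=2)
  return(accuracy)
} #endof exampleEvaluateTree()
# ************************************************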
# ************************************************
# main() :
#
# Entry point to execute your data analytics
#
# INPUT: None
#
# OUTPUT :None
# ************************************************
main<-function(){

  # Keeps a record of all model results (empty to start with)
  allResults<-NULL

  # Read the dataset named in DATASET_FILENAME, convert the output
  # class and split it into TRAIN/TEST datasets
  loans<-NreadDataset(DATASET_FILENAME)
  original<-NConvertClass(loans)
  original<-NPREPROCESSING_splitdataset(original)

  # Train a C5.0 decision tree on the raw (unprocessed) dataset;
  # simpleDT() returns the trained tree object
  measures<-simpleDT(original$train,original$test)
  print(summary(measures))

  # Write the code to answer the questions in 03 Lab4.docx
  # Write the code to read in a CSV file with the name given in
  # DATASET_FILENAME (the German Credit Score dataset)

} #end of main()
print("Tree plotted in PDF file")
# ************************************************
# This is where R starts execution
gc() # garbage collection to automatically release memory
# clear plots and other graphics
if(!is.null(dev.list())) dev.off()
graphics.off()
# clears the console area
cat("\014")
print("START Supervised Machine Learning")
library(pacman)
pacman::p_load(char=MYLIBRARIES,install=TRUE,character.only=TRUE)
# Load the additional R script files provided for this lab
source("lab4DataPrepNew.R")
source("4labFunctions.R")
set.seed(123)
# ************************************************
main()
print("end")