# 4lab.R
# ************************************************
# This work is licensed under a Creative Commons
# Attribution-NonCommercial 4.0 International License.
# ************************************************
#
#
# Department of Computer Science
# University of Surrey
# GUILDFORD
# Surrey GU2 7XH
#
# 24th October 2019
#
# UPDATE
# 1.00 1/2/2017 Initial Version
# 1.01 19/2/2018 Updated function calls
# 1.02 6/3/2018 Corrected bug removed "inputs_inputs"
# 1.03 24/2/2019 Lab 4 2019 updates
# 1.04 1/3/2019 Added "PerformanceAnalytics" as required library
# 1.05 22/10/2019 Updated for PBA (Basic solution)
# 1.08 29/05/2020 main() initialise allResults
# 1.09 2/6/2020 Use lab4DataPrepNew.R
# 1.10 26/11/2020 Changed "discreet" to "discrete" [woops!]
# ************************************************
# For lab 4
# ************************************************
# clears all objects in "global environment"
rm(list=ls())
# ************************************************
# Global Environment variables
# - i.e. available to all functions
# ************************************************
# Good practice to place constants in variables
# I use UPPERCASE to identify these in my code
DATASET_FILENAME <- "UCI-G.csv" #Name of input dataset file
OUTPUT_FIELD <- "Status" # Field name of the output class to predict
# These are the data preparation values
HOLDOUT <- 70 # % split to create TRAIN dataset
# Cutoff values - you can experiment with these
CUTOFF_OUTLIER <- 0.99 # Confidence p-value for outlier detection
# negative = analyse but do not replace outliers
CUTOFF_DISCRETE <- 6 # Number of empty bins to determine discrete
CUTOFF_REDUNDANT <- 0.95 # Linear correlation coefficient cut-off
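# ************************************************
# Illustrative sketches only (not part of the provided lab code):
# they show how HOLDOUT and CUTOFF_REDUNDANT could typically be
# used. The lab provides its own functions for this in
# lab4DataPrepNew.R; the helper names below are assumptions.

# Split a data frame into TRAIN/TEST subsets using the HOLDOUT %
exampleHoldoutSplit<-function(dataset){
  trainRows<-sample(nrow(dataset),size=round(nrow(dataset)*HOLDOUT/100))
  return(list(train=dataset[trainRows,],test=dataset[-trainRows,]))
} #endof exampleHoldoutSplit()

# List pairs of numeric fields whose absolute linear correlation
# exceeds CUTOFF_REDUNDANT (candidates for removal as redundant)
exampleFindRedundantPairs<-function(numericData){
  correlations<-abs(cor(numericData,use="pairwise.complete.obs"))
  correlations[lower.tri(correlations,diag=TRUE)]<-NA
  return(which(correlations>CUTOFF_REDUNDANT,arr.ind=TRUE))
} #endof exampleFindRedundantPairs()
# ************************************************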
# Indicates the type of each field
TYPE_DISCRETE <- "DISCRETE" # field is discrete (numeric)
TYPE_ORDINAL <- "ORDINAL" # field is continuous numeric
TYPE_SYMBOLIC <- "SYMBOLIC" # field is a string
TYPE_NUMERIC <- "NUMERIC" # field is initially a numeric
TYPE_IGNORE <- "IGNORE" # field is not encoded
MAX_LITERALS <- 55 # Maximum number of 1-hot-encoding fields
# These are the supervised model constants
PDF_FILENAME <- "tree.pdf" # Name of PDF with graphical tree diagram
RULES_FILENAME <- "rules.txt" # Name of text file with rules saved
RESULTS_FILENAME <- "results.csv" # Name of the CSV results file
NODE_LEVEL <- 1 # The number is the node level of the tree to print
BOOST <- 20 # Number of boosting iterations. 1=single model
FOREST_SIZE <- 1000 # Number of trees in the forest
SCALE_DATASET <- TRUE # Set to true to scale dataset before ML stage
BASICNN_HIDDEN <- 5 # Number of hidden layer neurons
BASICNN_EPOCHS <- 100 # Maximum number of training epochs
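# ************************************************
# Illustrative sketch only (not part of the provided lab code):
# it shows where BOOST and FOREST_SIZE would typically be used,
# i.e. as the "trials" argument of C50::C5.0() and the "ntree"
# argument of randomForest::randomForest(). The helper name is
# an assumption.
exampleEnsembles<-function(train){
  positionClassOutput<-which(names(train)==OUTPUT_FIELD)

  # Boosted C5.0 decision tree (BOOST boosting iterations)
  boostedTree<-C50::C5.0(x=train[,-positionClassOutput],
                         y=factor(train[,positionClassOutput]),
                         trials=BOOST)

  # Random forest with FOREST_SIZE trees
  forest<-randomForest::randomForest(x=train[,-positionClassOutput],
                                     y=factor(train[,positionClassOutput]),
                                     ntree=FOREST_SIZE)

  return(list(boost=boostedTree,forest=forest))
} #endof exampleEnsembles()
# ************************************************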
# See https://cran.r-project.org/web/packages/h2o/h2o.pdf
DEEP_HIDDEN <- c(5,5) # Number of neurons in each layer
DEEP_STOPPING <- 2 # Number of times no improvement before stop
DEEP_TOLERANCE <- 0.01 # Error threshold
DEEP_ACTIVATION <- "TanhWithDropout" # Non-linear activation function
DEEP_REPRODUCABLE <- TRUE # Set to TRUE so that training gives the same result on each run
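# ************************************************
# Illustrative sketch only (not part of the provided lab code):
# it shows how the DEEP_* constants above could be passed to
# h2o::h2o.deeplearning() (see the h2o manual linked above).
# The helper name is an assumption; trainFrame is assumed to be
# an H2OFrame that already contains the OUTPUT_FIELD column.
exampleDeepLearn<-function(trainFrame){
  return(h2o::h2o.deeplearning(y=OUTPUT_FIELD,
                               training_frame=trainFrame,
                               hidden=DEEP_HIDDEN,
                               activation=DEEP_ACTIVATION,
                               stopping_rounds=DEEP_STOPPING,
                               stopping_tolerance=DEEP_TOLERANCE,
                               reproducible=DEEP_REPRODUCABLE,
                               seed=123))
} #endof exampleDeepLearn()
# ************************************************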
# Define and then load the libraries used in this project
# Library from CRAN       Version
# pacman                  0.5.1
# outliers                0.14
# corrplot                0.84
# MASS                    7.3.53
# formattable             0.2.0.1
# stats                   4.0.3
# caret                   (version not recorded)
# PerformanceAnalytics    2.0.4
# stringr                 1.4.0
# partykit                1.2.8
# C50                     0.1.3.1
# randomForest            4.6.14
# h2o                     3.32.0.1
# keras                   2.3.0.0
MYLIBRARIES<-c("outliers",
               "corrplot",
               "MASS",
               "formattable",
               "stats",
               "caret",
               "PerformanceAnalytics",
               "stringr",
               "partykit",
               "C50",
               "randomForest",
               "keras",
               "h2o")
# User defined functions are next
# ************************************************
# simpleDT() :
#
# Create C5 Decision Tree on the raw dataset
# A decision tree may not need the dataset to be pre-processed
#
# INPUT :
# Data Frame - train - original train dataset
# Data Frame - test - original test dataset (not used in this basic version)
# boolean - plot - TRUE = plot charts (not used in this basic version)
#
# OUTPUT :
# Object - tree - trained C5.0 decision tree (with rules)
# ************************************************
simpleDT<-function(train,test,plot=TRUE){

  # Position of the output field to predict within the data frame
  positionClassOutput<-which(names(train)==OUTPUT_FIELD)

  # Train a single (trials=1) C5.0 tree on the raw dataset,
  # generating rules as well as the tree
  tree<-C50::C5.0(x=train[,-positionClassOutput],
                  y=factor(train[,positionClassOutput]),
                  rules=TRUE,
                  trials=1)

  return(tree)
} #endof simpleDT()
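# ************************************************
# Illustrative sketch only (not part of the provided lab code):
# it shows how the C5.0 tree returned by simpleDT() could be used
# to classify the hold-out TEST records and report a simple
# accuracy percentage. The helper name is an assumption.
exampleEvaluateTree<-function(tree,test){
  positionClassOutput<-which(names(test)==OUTPUT_FIELD)

  # Classify each test record with the trained tree
  predicted<-predict(tree,newdata=test[,-positionClassOutput],type="class")
  expected<-test[,positionClassOutput]

  # Percentage of test records classified correctly
  accuracy<-round(100*mean(as.character(predicted)==as.character(expected)),digits=2)
  return(accuracy)
} #endof exampleEvaluateTree()
# ************************************************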
# ************************************************
# main() :
#
# Entry point to execute your data analytics
#
# INPUT: None
#
# OUTPUT :None
# ************************************************
main<-function(){

  # Keeps a record of all model results (empty to start with)
  allResults<-NULL

  # Read the dataset named in DATASET_FILENAME, convert the output
  # class and split it into TRAIN/TEST datasets
  loans<-NreadDataset(DATASET_FILENAME)
  original<-NConvertClass(loans)
  original<-NPREPROCESSING_splitdataset(original)

  # Train a C5.0 decision tree on the raw (unprocessed) dataset;
  # simpleDT() returns the trained tree object
  measures<-simpleDT(original$train,original$test)
  print(summary(measures))

  # Write the code to answer the questions in 03 Lab4.docx
  # Write the code to read in a CSV file with the name given in
  # DATASET_FILENAME (the German Credit Score dataset)

} #end of main()
print("Tree plotted in PDF file")
# ************************************************
# This is where R starts execution
gc() # garbage collection to automatically release memory
# clear plots and other graphics
if(!is.null(dev.list())) dev.off()
graphics.off()
# clears the console area
cat("\014")
print("START Supervised Machine Learning")
library(pacman)
pacman::p_load(char=MYLIBRARIES,install=TRUE,character.only=TRUE)
# Load the additional R script files provided for this lab
source("lab4DataPrepNew.R")
source("4labFunctions.R")
set.seed(123)
# ************************************************
main()
print("end")