diff --git a/4lab.R b/4lab.R new file mode 100644 index 0000000000000000000000000000000000000000..1acb2a90f3342bad6533eb5a947f83ac88af978a --- /dev/null +++ b/4lab.R @@ -0,0 +1,198 @@ +# ************************************************ +# This work is licensed under a Creative Commons +# Attribution-NonCommercial 4.0 International License. +# ************************************************ +# +# +# Department of Computer Science +# University of Surrey +# GUILDFORD +# Surrey GU2 7XH +# +# 24th October 2019 +# +# UPDATE +# 1.00 1/2/2017 Initial Version +# 1.01 19/2/2018 Updated function calls +# 1.02 6/3/2018 Corrected bug removed "inputs_inputs" +# 1.03 24/2/2019 Lab 4 2019 updates +# 1.04 1/3/2019 Added "PerformanceAnalytics" as required library +# 1.05 22/10/2019 Updated for PBA (Basic solution) +# 1.08 29/05/2020 main() initialise allResults +# 1.09 2/6/2020 Use lab4DataPrepNew.R +# 1.10 26/11/2020 Changed "discreet" to "discrete" [woops!] +# ************************************************ +# For lab 4 +# ************************************************ + +# clears all objects in "global environment" +rm(list=ls()) + +# ************************************************ +# Global Environment variables +# - i.e. 
# (Global Environment variables are) available to all functions

# ************************************************
# Good practice to place constants in variables
# I use UPPERCASE to identify these in my code

DATASET_FILENAME  <- "UCI-G.csv"   # Name of input dataset file
OUTPUT_FIELD      <- "Status"      # Field name of the output class to predict

# These are the data preparation values

HOLDOUT           <- 70            # % split to create TRAIN dataset

# Cutoff values - you can experiment with these

CUTOFF_OUTLIER    <- 0.99          # Confidence p-value for outlier detection
                                   # negative = analyse but do not replace outliers
CUTOFF_DISCRETE   <- 6             # Number of empty bins to determine discrete
CUTOFF_REDUNDANT  <- 0.95          # Linear correlation coefficient cut-off

# Indicates the type of each field

TYPE_DISCRETE     <- "DISCRETE"    # field is discrete (numeric)
TYPE_ORDINAL      <- "ORDINAL"     # field is continuous numeric
TYPE_SYMBOLIC     <- "SYMBOLIC"    # field is a string
TYPE_NUMERIC      <- "NUMERIC"     # field is initially a numeric
TYPE_IGNORE       <- "IGNORE"      # field is not encoded

MAX_LITERALS      <- 55            # Maximum number of 1-hot-encoding fields

# These are the supervised model constants

PDF_FILENAME      <- "tree.pdf"    # Name of PDF with graphical tree diagram
RULES_FILENAME    <- "rules.txt"   # Name of text file with rules saved
RESULTS_FILENAME  <- "results.csv" # Name of the CSV results file
NODE_LEVEL        <- 1             # The number is the node level of the tree to print
BOOST             <- 20            # Number of boosting iterations.
# BOOST: 1=single model
FOREST_SIZE       <- 1000          # Number of trees in the forest
SCALE_DATASET     <- TRUE          # Set to true to scale dataset before ML stage

BASICNN_HIDDEN    <- 5             # Number of hidden layer neurons
BASICNN_EPOCHS    <- 100           # Maximum number of training epochs

# See https://cran.r-project.org/web/packages/h2o/h2o.pdf

DEEP_HIDDEN       <- c(5,5)             # Number of neurons in each layer
DEEP_STOPPING     <- 2                  # Number of times no improvement before stop
DEEP_TOLERANCE    <- 0.01               # Error threshold
DEEP_ACTIVATION   <- "TanhWithDropout"  # Non-linear activation function
DEEP_REPRODUCABLE <- TRUE               # Set to TRUE to test training is same for each run

# Define and then load the libraries used in this project

# Library from CRAN     Version
# pacman                0.5.1
# outliers              0.14
# corrplot              0.84
# MASS                  7.3.53
# formattable           0.2.0.1
# stats                 4.0.3
# PerformanceAnalytics  2.0.4
# stringr               1.4.0
# partykit              1.2.8
# C50                   0.1.3.1
# randomForest          4.6.14
# h2o                   3.32.0.1
# keras                 2.3.0.0
#
# NOTE: "caret" is loaded below but has no version recorded in this table.

MYLIBRARIES <- c("outliers",
                 "corrplot",
                 "MASS",
                 "formattable",
                 "stats",
                 "caret",
                 "PerformanceAnalytics",
                 "stringr",
                 "partykit",
                 "C50",
                 "randomForest",
                 "keras",
                 "h2o")

# User defined functions are next
# ************************************************
# simpleDT() :
#
# Create C5 Decision Tree on the raw dataset
# A decision tree may not need the dataset to be pre-processed
#
# INPUT   :
#         :   Data Frame     - train  - original train dataset; must contain
#                                       exactly one column named OUTPUT_FIELD
#         :   Data Frame     - test   - original test dataset
#                                       (accepted but not used yet)
#         :   boolean        - plot   - TRUE = plot charts
#                                       (accepted but not used yet)
#
# OUTPUT  :
#         :   Model          - tree   - the object returned by C50::C5.0()
#                                       (no performance metrics are calculated here)
#
# Stops with an error if OUTPUT_FIELD does not match exactly one column of train
# ************************************************
simpleDT <- function(train, test, plot = TRUE) {

  positionClassOutput <- which(names(train) == OUTPUT_FIELD)

  # Without this guard a missing field gives integer(0), and negative indexing
  # with integer(0) silently selects zero predictor columns
  if (length(positionClassOutput) != 1) {
    stop("simpleDT(): OUTPUT_FIELD '", OUTPUT_FIELD,
         "' must match exactly one column of 'train' (found ",
         length(positionClassOutput), ")",
         call. = FALSE)
  }

  # drop=FALSE keeps the predictors as a data frame even if only one remains
  predictors <- train[, -positionClassOutput, drop = FALSE]

  # [[ ]] always extracts the class column as a plain vector
  expected <- factor(train[[positionClassOutput]])

  # rules=TRUE asks C5.0 for a rule-based model; trials=1 means no boosting
  tree <- C50::C5.0(x = predictors,
                    y = expected,
                    rules = TRUE,
                    trials = 1)
  return(tree)
} #endof simpleDT()




# ************************************************
# main() :
#
# Entry point to execute your data analytics
#
# INPUT: None
#
# OUTPUT :None
#
# ************************************************
main <- function() {

  # Read the dataset named in DATASET_FILENAME, convert its class field and
  # split it. NreadDataset(), NConvertClass() and NPREPROCESSING_splitdataset()
  # are not defined in this file - presumably they come from the lab scripts
  # sourced below; confirm there.
  loans <- NreadDataset(DATASET_FILENAME)
  original <- NConvertClass(loans)
  original <- NPREPROCESSING_splitdataset(original)

  # simpleDT() returns the fitted model (not a table of measures)
  tree <- simpleDT(original$train, original$test)
  print(summary(tree))

  allResults <- NULL
  # write the code to answer the questions in 03 Lab4.docx

  # Write the code to read in a CSV file with the name given in DATASET_FILENAME
  # German Credit Score dataset

  # Nothing is collected yet, so main() returns NULL invisibly (as before)
  invisible(allResults)
} #end of main()


# NOTE(review): this runs when the file is sourced, i.e. before main() has been
# called and before any tree has been built, and the console is cleared straight
# afterwards. Consider moving it next to the code that actually writes the PDF.
print("Tree plotted in PDF file")

# ************************************************
# This is where R starts execution

gc() # garbage collection to automatically release memory

# clear plots and other graphics
# (graphics.off() closes every open device, so no separate dev.off() is needed)
graphics.off()

# clears the console area
cat("\014")

print("START Supervised Machine Learning")

# pacman is not part of MYLIBRARIES, so install it first if it is missing;
# otherwise library(pacman) stops the script on a fresh machine
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
library(pacman)
pacman::p_load(char=MYLIBRARIES,install=TRUE,character.only=TRUE)

#Load additional R script files provided for this lab
source("lab4DataPrepNew.R")
source("4labFunctions.R")

set.seed(123)

# ************************************************
main()

print("end")
diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..fdddb29aa445bf3d6a5d843d6dd77e10a9f99657 --- /dev/null +++ b/LICENSE @@ -0,0 +1,24 @@ +This is free and unencumbered software released into the public domain. + +Anyone is free to copy, modify, publish, use, compile, sell, or +distribute this software, either in source code form or as a compiled +binary, for any purpose, commercial or non-commercial, and by any +means. + +In jurisdictions that recognize copyright laws, the author or authors +of this software dedicate any and all copyright interest in the +software to the public domain. We make this dedication for the benefit +of the public at large and to the detriment of our heirs and +successors. 
We intend this dedication to be an overt act of +relinquishment in perpetuity of all present and future rights to this +software under copyright law. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. + +For more information, please refer to <https://unlicense.org> diff --git a/My_code/.gitkeep b/My_code/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/New_file b/New_file new file mode 100644 index 0000000000000000000000000000000000000000..8f400a07355720395dfa73ebaafdd7d53b9593ba --- /dev/null +++ b/New_file @@ -0,0 +1,4 @@ +welcome to +advance web technology lab +@university of surrey +hi there is an updation