Commit 95babfe9, authored 1 year ago by Nishi, Nishi (PG/T - Comp Sci & Elec Eng)
Parent commit: e2baee31
1 changed file: 4lab.R (new file, 198 additions, 0 deletions)
# ************************************************
# This work is licensed under a Creative Commons
# Attribution-NonCommercial 4.0 International License.
# ************************************************
#
#
# Department of Computer Science
# University of Surrey
# GUILDFORD
# Surrey GU2 7XH
#
# 24th October 2019
#
# UPDATE
# 1.00 1/2/2017 Initial Version
# 1.01 19/2/2018 Updated function calls
# 1.02 6/3/2018 Corrected bug removed "inputs_inputs"
# 1.03 24/2/2019 Lab 4 2019 updates
# 1.04 1/3/2019 Added "PerformanceAnalytics" as required library
# 1.05 22/10/2019 Updated for PBA (Basic solution)
# 1.08 29/05/2020 main() initialise allResults
# 1.09 2/6/2020 Use lab4DataPrepNew.R
# 1.10 26/11/2020 Changed "discreet" to "discrete" [woops!]
# ************************************************
# For lab 4
# ************************************************
# clears all objects in "global environment"
rm(list=ls())
# ************************************************
# Global Environment variables
# - i.e. available to all functions
# ************************************************
# Good practice to place constants in variables
# I use UPPERCASE to identify these in my code
DATASET_FILENAME  <- "UCI-G.csv"          # Name of input dataset file
OUTPUT_FIELD      <- "Status"             # Field name of the output class to predict

# These are the data preparation values
HOLDOUT           <- 70                   # % split to create TRAIN dataset

# Cutoff values - you can experiment with these
CUTOFF_OUTLIER    <- 0.99                 # Confidence p-value for outlier detection
                                          # negative = analyse but do not replace outliers
CUTOFF_DISCRETE   <- 6                    # Number of empty bins to determine discrete
CUTOFF_REDUNDANT  <- 0.95                 # Linear correlation coefficient cut-off

# Indicates the type of each field
TYPE_DISCRETE     <- "DISCRETE"           # field is discrete (numeric)
TYPE_ORDINAL      <- "ORDINAL"            # field is continuous numeric
TYPE_SYMBOLIC     <- "SYMBOLIC"           # field is a string
TYPE_NUMERIC      <- "NUMERIC"            # field is initially a numeric
TYPE_IGNORE       <- "IGNORE"             # field is not encoded

MAX_LITERALS      <- 55                   # Maximum number of 1-hot-encoding fields
# These are the supervised model constants
PDF_FILENAME      <- "tree.pdf"           # Name of PDF with graphical tree diagram
RULES_FILENAME    <- "rules.txt"          # Name of text file with rules saved
RESULTS_FILENAME  <- "results.csv"        # Name of the CSV results file

NODE_LEVEL        <- 1                    # The number is the node level of the tree to print

BOOST             <- 20                   # Number of boosting iterations. 1=single model
FOREST_SIZE       <- 1000                 # Number of trees in the forest

SCALE_DATASET     <- TRUE                 # Set to true to scale dataset before ML stage

BASICNN_HIDDEN    <- 5                    # Number of hidden layer neurons
BASICNN_EPOCHS    <- 100                  # Maximum number of training epochs
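# ************************************************
# Illustrative sketch only (not part of the lab handout):
# one way BOOST and FOREST_SIZE would typically feed the
# modelling calls. The helper name "boostAndForestSketch"
# is hypothetical and it is never called in this script.
boostAndForestSketch<-function(train){

  positionClassOutput<-which(names(train)==OUTPUT_FIELD)

  # C5.0 with BOOST boosting trials (trials=1 gives a single tree)
  boosted<-C50::C5.0(x=train[,-positionClassOutput],
                     y=factor(train[,positionClassOutput]),
                     trials=BOOST)

  # Random forest with FOREST_SIZE trees
  forest<-randomForest::randomForest(x=train[,-positionClassOutput],
                                     y=factor(train[,positionClassOutput]),
                                     ntree=FOREST_SIZE)

  return(list(boost=boosted,forest=forest))
}
# ************************************************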
# See https://cran.r-project.org/web/packages/h2o/h2o.pdf
DEEP_HIDDEN       <- c(5,5)               # Number of neurons in each layer
DEEP_STOPPING     <- 2                    # Number of times no improvement before stop
DEEP_TOLERANCE    <- 0.01                 # Error threshold
DEEP_ACTIVATION   <- "TanhWithDropout"    # Non-linear activation function
DEEP_REPRODUCABLE <- TRUE                 # Set to TRUE to test training is same for each run
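# ************************************************
# Illustrative sketch only (not part of the lab handout):
# how the DEEP_* constants above would typically be passed to
# h2o.deeplearning(). The helper name "deepLearnSketch" is hypothetical,
# it is never called in this script, and it assumes "trainH2O" is an
# H2OFrame whose class column (fieldNameOutput) is already a factor;
# re-using BASICNN_EPOCHS as the epoch limit is also an assumption.
deepLearnSketch<-function(trainH2O,fieldNameOutput){

  # Predictor fields are everything except the class field
  predictors<-setdiff(names(trainH2O),fieldNameOutput)

  model<-h2o::h2o.deeplearning(x=predictors,
                               y=fieldNameOutput,
                               training_frame=trainH2O,
                               hidden=DEEP_HIDDEN,                # neurons per layer
                               epochs=BASICNN_EPOCHS,             # maximum training epochs
                               activation=DEEP_ACTIVATION,        # non-linear activation
                               stopping_rounds=DEEP_STOPPING,     # early stopping patience
                               stopping_tolerance=DEEP_TOLERANCE, # early stopping threshold
                               reproducible=DEEP_REPRODUCABLE,    # repeatable runs (slower)
                               seed=123)
  return(model)
}
# ************************************************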
# Define and then load the libraries used in this project
# Library from CRAN Version
# pacman 0.5.1
# outliers 0.14
# corrplot 0.84
# MASS 7.3.53
# formattable 0.2.0.1
# stats 4.0.3
# PerformanceAnalytics 2.0.4
# stringr 1.4.0
# partykit 1.2.8
# C50 0.1.3.1
# randomForest 4.6.14
# h2o 3.32.0.1
# keras 2.3.0.0
MYLIBRARIES<-c("outliers",
               "corrplot",
               "MASS",
               "formattable",
               "stats",
               "caret",
               "PerformanceAnalytics",
               "stringr",
               "partykit",
               "C50",
               "randomForest",
               "keras",
               "h2o")
# User defined functions are next
# ************************************************
# simpleDT() :
#
# Create C5 Decision Tree on the raw dataset
# A decision tree may not need the dataset to be pre-processed
#
# INPUT :
# Data Frame - train - original train dataset
# Data Frame - test - original test dataset
# boolean - plot - TRUE = plot charts
#
# OUTPUT  :
#         : Object - tree - trained C5.0 model (rule-based)
# ************************************************
simpleDT<-function(train,test,plot=TRUE){

  positionClassOutput<-which(names(train)==OUTPUT_FIELD)

  tree<-C50::C5.0(x=train[,-positionClassOutput],
                  y=factor(train[,positionClassOutput]),
                  rules=TRUE,
                  trials=1)

  return(tree)
}
#endof simpleDT()
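# ************************************************
# Illustrative sketch only (not part of the lab handout):
# one way the C5.0 tree returned by simpleDT() could be evaluated on the
# hold-out test set. The helper name "evaluateDTSketch" is hypothetical
# and it is never called in this script.
evaluateDTSketch<-function(tree,test){

  positionClassOutput<-which(names(test)==OUTPUT_FIELD)

  # Predict the class of each test record from the trained rules
  predicted<-predict(tree,newdata=test[,-positionClassOutput],type="class")

  expected<-test[,positionClassOutput]

  # Simple accuracy: proportion of test records classified correctly
  accuracy<-sum(as.character(predicted)==as.character(expected))/nrow(test)

  return(accuracy)
}
#endof evaluateDTSketch()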
# ************************************************
# main() :
#
# Entry point to execute your data analytics
#
# INPUT: None
#
# OUTPUT :None
# ************************************************
main<-function(){

  loans<-NreadDataset(DATASET_FILENAME)

  original<-NConvertClass(loans)

  original<-NPREPROCESSING_splitdataset(original)

  measures<-simpleDT(original$train,original$test)

  print(summary(measures))

  allResults<-NULL

  # write the code to answer the questions in 03 Lab4.docx

  # Write the code to read in a CSV file with the name given in DATASET_FILENAME
  # German Credit Score dataset

}
#end of main()
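# ************************************************
# Illustrative sketch only (not the lab solution): one way the TODO inside
# main() could read the German Credit Score CSV named in DATASET_FILENAME,
# assuming the file sits in the current working directory. The helper name
# "readCSVSketch" is hypothetical and it is never called in this script.
readCSVSketch<-function(csvFilename){

  dataset<-read.csv(csvFilename,encoding="UTF-8",stringsAsFactors=FALSE)

  print(paste("Read",nrow(dataset),"records with",ncol(dataset),"fields from",csvFilename))

  return(dataset)
}
#endof readCSVSketch()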
print("Tree plotted in PDF file")
# ************************************************
# This is where R starts execution
gc() # garbage collection to automatically release memory

# clear plots and other graphics
if(!is.null(dev.list())) dev.off()
graphics.off()

# clears the console area
cat("\014")

print("START Supervised Machine Learning")

library(pacman)
pacman::p_load(char=MYLIBRARIES,
               install=TRUE,
               character.only=TRUE)

# Load additional R script files provided for this lab
source("lab4DataPrepNew.R")
source("4labFunctions.R")

set.seed(123)

# ************************************************
main()

print("end")