Commit 43133135, authored 5 years ago by Kilinchan, Kaan (UG - Computer Science)
Revert "Upload New File"

This reverts commit 84dd95dc.

parent 84dd95dc
Changes: 1 changed file

climatehyptest2.py deleted, 100644 → 0 (0 additions, 231 deletions)
# -*- coding: utf-8 -*-
"""climateHypTest2.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/16nUPjaYYnWurxCPeCCBJVWPcAMj6sIUa

Code for hypothesis testing of the datasets, in order to determine whether each
of the previously tested null hypotheses may be proven or disproven.
The statistical tests carried out are: p-values, histograms, kernel density
estimators, t-tests and linear regressions.

#***************************************************************************************
# * Title: lab7_hypothesis-checkpoint
# * Author: Dr Manal Helal
# * Date: November 2019
# * Code version: 1.02
#***************************************************************************************
"""
import pandas as pd
from scipy import stats
import numpy as np
import scipy.stats as ss
import csv
# Import the dataset.
df = pd.read_csv('climateUKRand.csv', engine='python', encoding='utf-8',
                 error_bad_lines=False)
print(len(df))

df.mean()
df.fillna(0)  # note: the result is discarded; `df = df.fillna(0)` would be needed for the fill to take effect
df.head()
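# Editor's note: `error_bad_lines` was deprecated in pandas 1.3 and removed in
# pandas 2.0. On a current pandas, an equivalent read would look roughly like
# this (a sketch, assuming the same file and encoding):
# df = pd.read_csv('climateUKRand.csv', engine='python', encoding='utf-8',
#                  on_bad_lines='skip')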
# Sort the dataset by the 'temp' field using quicksort. Smallest values will be first.
df = df.sort_values('temp', ascending=True, inplace=False, kind='quicksort',
                    na_position='last', ignore_index=False)
print(round(len(df) / 2))
# Split the dataset into two halves of equal length.
df1 = df.iloc[:round(len(df) / 2)]
df2 = df.iloc[round(len(df) / 2):]

# Get the t-test statistic value and the p-value of the datasets.
print(stats.ttest_ind(df1.dropna()['TotalSamples'], df2.dropna()['TotalSamples']))
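# Editor's sketch: ttest_ind returns a (statistic, pvalue) pair, so the result
# can be unpacked to make the hypothesis decision explicit (names below are
# illustrative, and 0.05 is only the conventional significance level):
# t_stat, p_val = stats.ttest_ind(df1.dropna()['TotalSamples'],
#                                 df2.dropna()['TotalSamples'])
# if p_val < 0.05:
#     print("reject the null hypothesis of equal means")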
# Commented out IPython magic to ensure Python compatibility.
# %matplotlib inline
import sys
import matplotlib.pyplot as plt
# Trim the sets to the required fields only.
valsColumn = [currColumn for currColumn in df.columns
              if "temp" in currColumn or "TotalSamples" in currColumn]
df = df[df.TotalSamples != 0]
df = df[df.temp != 0]
#df = df[valsColumn]
#valsColumn = [v for v in valsColumn if "Value" in v]
print("\n(2) TOTAL SAMPLES IN FILE:")
print("\t\t\t\t%d" % (len(df.index)))
print(valsColumn)

ValueHigherThanzero = [idx for idx, isRanked in enumerate(df['temp']) if isRanked > 0]
print("\n(3) TOTAL with a value greater than zero:")
print("\t\t\t\t%d" % (len(ValueHigherThanzero)))

#if df['Value'].dtype != 'int':
#    df['Value'] = df['Value'].astype('int')
# Generate histograms for both of the fields that are being tested:
# one with the original values and one with the log values.
png_file = 'histTemp.png'
hist = df['temp'].hist(bins=60)
hist.get_figure().savefig(png_file)
plt.show()

png_file = 'histSamples.png'
hist = df['TotalSamples'].hist(bins=60)
hist.get_figure().savefig(png_file)
plt.show()
png_file = 'histlogTemp.png'
df['log_Temp'] = np.log(df['temp'])
df.replace([np.inf, -np.inf], np.nan).dropna(axis=1)  # note: the result is discarded; assign it for the cleaning to take effect
lhist = df['log_Temp'].hist(bins=30)
lhist.get_figure().savefig(png_file)
plt.show()
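# Editor's note: np.log is only defined for positive inputs; exact zeros give
# -inf and negative temperatures give NaN. A hypothetical explicit guard,
# applied before taking logs, would be:
# df = df[df['temp'] > 0]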
df1 = df.iloc[:round(len(df) / 2)]
df2 = df.iloc[round(len(df) / 2):]
# Generate a KDE for both tested fields.
png_file = 'KDE.png'
df1['log_Temp'].plot(kind='kde')
kde = df2['log_Temp'].plot(kind='kde')
kde.get_figure().savefig(png_file)
plt.show()
png_file = 'histlogSamples.png'
df['log_Sample'] = np.log(df['TotalSamples'])
lhist = df['log_Sample'].hist(bins=30)
lhist.get_figure().savefig(png_file)
plt.show()
df1 = df.iloc[:round(len(df) / 2)]
df2 = df.iloc[round(len(df) / 2):]
png_file = 'KDE2.png'
df1['log_Sample'].plot(kind='kde')
kde = df2['log_Sample'].plot(kind='kde')
kde.get_figure().savefig(png_file)
plt.show()
def probGivenPop(RPOP_data, NRPOP_data, log_pop):
    """
    Model each variable as a normal CRV.
    This function finds the conditional probability of the two variables: P(value | sample).
    P(RPOP = sample) = 0 and P(NRPOP = sample) = 0, since they are CRVs; an interval can be
    added in order to get non-zero values.
    Look at the ratio of P(sample - .5 < RPOP < sample + .5) to P(sample - .5 < NRPOP < sample + .5)
    and normalize by the total count of each.
    """
    # Compute the probabilities under each population.
    bw = .5  # bandwidth for estimates
    P_RPOP = ss.norm.cdf(log_pop + bw, RPOP_data.mean(), RPOP_data.std()) - \
             ss.norm.cdf(log_pop - bw, RPOP_data.mean(), RPOP_data.std())    # probability among ranked
    P_NRPOP = ss.norm.cdf(log_pop + bw, NRPOP_data.mean(), NRPOP_data.std()) - \
              ss.norm.cdf(log_pop - bw, NRPOP_data.mean(), NRPOP_data.std())  # probability among not ranked
    # Normalize by the population of each to get an estimated number of samples with each value:
    Est_Counties_Ranked_at_pop = P_RPOP * len(RPOP_data)
    Est_Counties_NotRanked_at_pop = P_NRPOP * len(NRPOP_data)
    # Compute the probability: value / all values (in the given population).
    return Est_Counties_Ranked_at_pop / (Est_Counties_Ranked_at_pop + Est_Counties_NotRanked_at_pop)
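# Editor's note: the return value is a Bayes-style ratio. Writing
# N_R = len(RPOP_data) and N_NR = len(NRPOP_data), the function returns
#     P_RPOP * N_R / (P_RPOP * N_R + P_NRPOP * N_NR)
# i.e. the estimated share of observations near log_pop that belong to the
# first population, with each interval probability weighted by group size.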
print("\t55000: %.4f" % probGivenPop(df1['log_Temp'], df2['log_Temp'], np.log(55000)))
print("\t55000: %.4f" % probGivenPop(df1['log_Sample'], df2['log_Sample'], np.log(55000)))
print("\nLIST MEAN AND STD_DEV PER COLUMN:")
mean_sd = df[valsColumn].describe()[1:3]  ## returns mean and std only; the other describe output is ignored
mean_sd_dict = dict([(c, (round(mean_sd[c]['mean'], 4), round(mean_sd[c]['std'], 4),))
                     for c in mean_sd.columns])
from pprint import pprint
pprint(mean_sd_dict)
# Sort values by sample.
df = df.sort_values(by=['TotalSamples'])  # assign the result; the original call discarded it
print(len(df))
index = int(0.5 * len(df.index))
lowHalf, highHalf = df[:index], df[index:]  # split the dataset into two
lowHalfSAMPLES = df1['TotalSamples'][~df1['TotalSamples'].isnull()][:100]  # or .dropna()
highHalfSAMPLES = df2['TotalSamples'][~df2['TotalSamples'].isnull()][:100]
dataSAMPLES = pd.concat([lowHalfSAMPLES, highHalfSAMPLES])
# Generate a KDE:
dataSAMPLES.plot(kind='kde')
#highHalfSAMPLES.plot(kind='kde')
plt.axis([-500, 1500, 0, 0.0035])  # zoom in to these dimensions
plt.show()
# Calculate the means, standard deviations and length of each subset.
mean1, mean2 = lowHalfSAMPLES.mean(), highHalfSAMPLES.mean()
sd1, sd2 = lowHalfSAMPLES.std(), highHalfSAMPLES.std()  # standard deviation of each half
n1, n2 = len(lowHalfSAMPLES), len(highHalfSAMPLES)
df1, df2 = (n1 - 1), (n2 - 1)  # note: df1/df2 now hold degrees of freedom, shadowing the earlier DataFrames
print("Mean of lower 50%%: %.1f (%d)\nMean of upper 50%%: %.1f (%d)\n" % (mean1, n1, mean2, n2))
## Two-sample t-test, assuming equal variance:
pooled_var = (df1 * sd1 ** 2 + df2 * sd2 ** 2) / (df1 + df2)  # pooled variance
t = (mean1 - mean2) / np.sqrt(pooled_var * (1.0 / n1 + 1.0 / n2))
print(t)
p = 1 - ss.t.cdf(np.abs(t), df1 + df2)
print("t: %.4f, df: %.1f, p: %.5f" % (t, df1 + df2, p))  # one tail (if you hypothesize)
print('t-statistic = %6.3f pvalue = %6.4f' % ss.ttest_ind(lowHalfSAMPLES, highHalfSAMPLES))  # two tails
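# Editor's sketch: scipy's equal-variance t-test uses the same pooled variance
# and df1 + df2 degrees of freedom as the manual calculation above, so its
# two-tailed p-value should be about twice the one-tailed p printed earlier:
# t2, p2 = ss.ttest_ind(lowHalfSAMPLES, highHalfSAMPLES)
# assert abs(p2 - 2 * p) < 1e-9  # hypothetical consistency check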
# Use the original data.
data = df[~df['TotalSamples'].isnull()]  # drop NAs
data = data[data['TotalSamples'] != 0]   # drop zeros
data['samples'] = data['TotalSamples']
# Create a scatter plot of the data.
data.dropna(subset=['temp'], inplace=True)
data.dropna(subset=['log_Temp'], inplace=True)
data.plot(kind='scatter', x='log_Temp', y='samples', alpha=0.3)

# Generate a non-logged scatter.
data.plot(kind='scatter', x='temp', y='samples', alpha=0.3)
## Have scipy figure out the regression coefficients:
beta1, beta0, r, p, stderr = ss.linregress(data['log_Temp'], data['samples'])
# (beta1 and beta0 are estimates, i.e. beta1hat and beta0hat; we will almost never know the non-estimated betas)
print("yhat = beta0 + beta1*x = %.2f + %.2fx" % (beta0, beta1))
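# Editor's sketch: the fitted coefficients give point predictions directly.
# A hypothetical prediction at the median log-temperature would be:
# x0 = data['log_Temp'].median()
# print("predicted samples at median log_Temp: %.2f" % (beta0 + beta1 * x0))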
# Now plot the line:
xpoints = np.linspace(data['log_Temp'].min() * .90, data['log_Temp'].max() * 1.1, 100)
plt.plot(xpoints, beta0 + beta1 * xpoints, color='red', linewidth=2)
# "beta0 + beta1*xpoints" is using vector algebra
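# Editor's note: the "vector algebra" above is numpy broadcasting; the scalar
# coefficients beta0 and beta1 are applied elementwise across the 100-point
# xpoints array, giving the fitted y value at each x.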
# Generate KDEs and the final linear regression.
data['samples'].plot(kind='kde')
#print(data['samples'])
data.dropna(subset=['samples'], inplace=True)
data.dropna(subset=['log_Temp'], inplace=True)
#print(data['samples'])
print("before logging SAMPLES; r = %.3f, p = %.5f" % ss.pearsonr(data['log_Temp'], data['samples']))
plt.show()
data['log_samples'] = np.log(data['samples'] + 1)  # the +1 shift avoids log(0) for zero counts
data['log_samples'].plot(kind='kde')  # after transform
print("after logging SAMPLES; r = %.3f, p = %.5f" % ss.pearsonr(data['log_Temp'], data['log_samples']))
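# Editor's note: comparing the two Pearson r values above shows the effect of
# the transform; with heavy-tailed counts, log(x + 1) compresses extreme
# values and typically makes the relationship closer to linear.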
data.plot(kind='scatter', x='log_Temp', y='log_samples', alpha=0.3)
beta1, beta0, r, p, stderr = ss.linregress(data['log_Temp'], data['log_samples'])
# we will talk about r, p, and stderr next
xpoints = np.linspace(data['log_Temp'].min() * .90, data['log_Temp'].max() * 1.1, 100)
plt.plot(xpoints, beta0 + beta1 * xpoints, color='red', linewidth=2)
# note: "beta0 + beta1*xpoints" is using vector algebra
plt.show()
print("from linregress: r = %.3f, p = %.5f" % (r, p))
# The red line has a steeper slope now => greater r.
data.plot(kind='scatter', x='temp', y='samples', alpha=0.3)
\ No newline at end of file