Commit 43133135 authored by Kilinchan, Kaan (UG - Computer Science)

Revert "Upload New File"

This reverts commit 84dd95dc
parent 84dd95dc
# -*- coding: utf-8 -*-
"""climateHypTest2.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/16nUPjaYYnWurxCPeCCBJVWPcAMj6sIUa
Code for hypothesis testing of the datasets, in order to determine whether each of the previously stated null hypotheses can be rejected or not.
Statistical tests carried out: p-values, histograms, kernel density estimators, t-tests and linear regressions.
#***************************************************************************************
# * Title: lab7_hypothesis-checkpoint
# * Author: Dr Manal Helal
# * Date: November 2019
# * Code version: 1.02
#***************************************************************************************
"""
import pandas as pd
from scipy import stats
import numpy as np
import scipy.stats as ss
import csv
#Import the dataset
# on_bad_lines='skip' is the replacement for the deprecated error_bad_lines=False (pandas >= 1.3)
df = pd.read_csv('climateUKRand.csv', engine='python', encoding='utf-8', on_bad_lines='skip')
print(len(df))
print(df.mean(numeric_only=True))
df = df.fillna(0)  # fillna returns a new frame, so the result must be assigned back
print(df.head())
#Sort the dataset by their value field using quicksort. Smallest values will be first.
df = df.sort_values('temp',ascending=True, inplace=False, kind='quicksort', na_position='last', ignore_index=False)
print(round(len(df)/2))
#Split the dataset into two halves of (as near as possible) equal length.
df1 = df.iloc[:round(len(df)/2)]
df2 = df.iloc[round(len(df)/2):]
#Get the t-test statistic and the p-value for the two halves of the dataset.
print(stats.ttest_ind(df1.dropna()['TotalSamples'], df2.dropna()['TotalSamples']))
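#A minimal interpretation sketch (alpha = 0.05 is an assumed significance level,
#not taken from the original analysis):
t_stat, p_val = stats.ttest_ind(df1.dropna()['TotalSamples'], df2.dropna()['TotalSamples'])
if p_val < 0.05:
    print("Reject the null hypothesis at the 5% level")
else:
    print("Fail to reject the null hypothesis at the 5% level")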
# Commented out IPython magic to ensure Python compatibility.
# %matplotlib inline
import sys
import matplotlib.pyplot as plt
#Trim the sets to the required fields only.
valsColumn = [currColumn for currColumn in df.columns
              if "temp" in currColumn or "TotalSamples" in currColumn]
df = df[df.TotalSamples != 0]
df = df[df.temp != 0]
#df = df[valsColumn]
#valsColumn = [v for v in valsColumn if "Value" in v]
print ("\n(2) TOTAL SAMPLES IN FILE:")
print("\t\t\t\t%d "%(len(df.index)))
print (valsColumn)
ValueHigherThanzero = [idx for idx, temp in enumerate(df['temp']) if temp > 0]
print("\n(3) TOTAL with a value greater than zero:")
print("\t\t\t\t%d "%(len(ValueHigherThanzero)))
#if df['Value'].dtype != 'int':
# df['Value'] = df['Value'].astype('int')
#Generate histograms for both of the fields that are being tested.
#Generate one with the original values and one with the log values.
png_file = 'histTemp.png'
hist = df['temp'].hist(bins=60)
hist.get_figure().savefig(png_file)
plt.show()
png_file = 'histSamples.png'
hist = df['TotalSamples'].hist(bins=60)
hist.get_figure().savefig(png_file)
plt.show()
png_file = 'histlogTemp.png'
df['log_Temp'] = np.log(df['temp'])
#Replace the ±inf values produced by taking the log of non-positive temperatures
#with NaN so that the histogram simply ignores those rows.
df['log_Temp'] = df['log_Temp'].replace([np.inf, -np.inf], np.nan)
lhist = df['log_Temp'].hist(bins=30)
lhist.get_figure().savefig(png_file)
plt.show()
df1 = df.iloc[:round(len(df)/2)]
df2 = df.iloc[round(len(df)/2):]
#Generate a KDE for both tested fields.
png_file = 'KDE.png'
df1['log_Temp'].plot(kind='kde')
kde = df2['log_Temp'].plot(kind='kde')
kde.get_figure().savefig(png_file)
plt.show()
png_file = 'histlogSamples.png'
df['log_Sample'] = np.log(df['TotalSamples'])
lhist = df['log_Sample'].hist(bins=30)
lhist.get_figure().savefig(png_file)
plt.show()
df1 = df.iloc[:round(len(df)/2)]
df2 = df.iloc[round(len(df)/2):]
png_file = 'KDE2.png'
df1['log_Sample'].plot(kind='kde')
kde = df2['log_Sample'].plot(kind='kde')
kde.get_figure().savefig(png_file)
plt.show()
def probGivenPop(RPOP_data, NRPOP_data, log_pop):
    """Model each variable as a normal CRV.
    This function finds the conditional probability of the two variables: P(value | sample).
    P(RPOP = sample) = 0 and P(NRPOP = sample) = 0, since they are CRVs, so an interval
    is used in order to get non-zero values.
    Look at the ratio of P(sample - .5 < RPOP < sample + .5) to
    P(sample - .5 < NRPOP < sample + .5) and normalise by the total count of each."""
    #Compute the probabilities under each population
    bw = .5 #bandwidth for the interval estimates
    P_RPOP = ss.norm.cdf(log_pop+bw, RPOP_data.mean(), RPOP_data.std()) - \
        ss.norm.cdf(log_pop-bw, RPOP_data.mean(), RPOP_data.std()) #probability among ranked
    P_NRPOP = ss.norm.cdf(log_pop+bw, NRPOP_data.mean(), NRPOP_data.std()) - \
        ss.norm.cdf(log_pop-bw, NRPOP_data.mean(), NRPOP_data.std()) #probability among not ranked
    #Normalize by the size of each population to get an estimated number of samples with each value:
    Est_Counties_Ranked_at_pop = P_RPOP * len(RPOP_data)
    Est_Counties_NotRanked_at_pop = P_NRPOP * len(NRPOP_data)
    #Compute the probability: value / all values (in the given population)
    return Est_Counties_Ranked_at_pop / (Est_Counties_Ranked_at_pop + Est_Counties_NotRanked_at_pop)
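#Note (a clarifying sketch, not from the original lab code): the return value above is
#Bayes' rule with the group sizes acting as priors,
#  P(group1 | x) = P(x | group1)*n1 / (P(x | group1)*n1 + P(x | group2)*n2),
#where the interval probabilities P_RPOP and P_NRPOP stand in for the likelihoods.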
print ("\t 55000: %.4f" % probGivenPop(df1['log_Temp'], df2['log_Temp'], np.log(55000)))
print ("\t 55000: %.4f" % probGivenPop(df1['log_Sample'], df2['log_Sample'], np.log(55000)))
print ("\n LIST MEAN AND STD_DEC PER COLUMN:")
mean_sd = df[valsColumn].describe()[1:3] ## returns mean and std only, other describe output is ignored
mean_sd_dict = dict([(c, (round(mean_sd[c]['mean'], 4), round(mean_sd[c]['std'], 4), )) for c in mean_sd.columns])
from pprint import pprint
pprint(mean_sd_dict)
#Sort the values by sample count (sort_values returns a new frame, so assign it back).
df = df.sort_values(by=['TotalSamples'])
print(len(df))
#Split the sample-sorted dataset into two halves.
index = int(0.5 * len(df.index))
lowHalf, highHalf = df[:index], df[index:]
lowHalfSAMPLES = lowHalf['TotalSamples'][~lowHalf['TotalSamples'].isnull()][:100] #or .dropna()
highHalfSAMPLES = highHalf['TotalSamples'][~highHalf['TotalSamples'].isnull()][:100]
dataSAMPLES = pd.concat([lowHalfSAMPLES, highHalfSAMPLES])
#Generate a kde:
dataSAMPLES.plot(kind='kde')
#highHalfSAMPLES.plot(kind='kde')
plt.axis([-500,1500,0,0.0035]) #zoom in to these dimensions
plt.show()
#Calculate the means, standard deviations, length of each subset.
mean1, mean2 = lowHalfSAMPLES.mean(), highHalfSAMPLES.mean()
sd1, sd2 = lowHalfSAMPLES.std(), highHalfSAMPLES.std() #standard deviation of each half
n1, n2 = len(lowHalfSAMPLES), len(highHalfSAMPLES)
dof1, dof2 = (n1 - 1), (n2 - 1) #degrees of freedom (distinct names so the DataFrames df1/df2 are not shadowed)
print ("Mean of lower 50%%: %.1f (%d) \nMean of upper 50%%: %.1f (%d) \n " % (mean1, n1, mean2, n2))
##two sample t-test, assuming equal variance:
pooled_var = (dof1*sd1**2 + dof2*sd2**2) / (dof1 + dof2) #pooled variance
t = (mean1 - mean2) / np.sqrt(pooled_var * (1.0/n1 + 1.0/n2))
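#For reference, the statistic computed above is the standard pooled two-sample t:
#  t = (mean1 - mean2) / sqrt(pooled_var * (1/n1 + 1/n2)),
#  pooled_var = (dof1*sd1^2 + dof2*sd2^2) / (dof1 + dof2)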
print (t)
p = 1 - ss.t.cdf(np.abs(t), dof1+dof2)
print ("t: %.4f, df: %.1f, p: %.5f" % (t, dof1+dof2, p)) #one tail (if you hypothesize a direction)
print ('t-statistic = %6.3f pvalue = %6.4f' % ss.ttest_ind(lowHalfSAMPLES, highHalfSAMPLES)) #two tails
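#For comparison, an unequal-variance (Welch) t-test via scipy's equal_var=False option;
#an optional robustness check, not part of the original lab analysis:
t_w, p_w = ss.ttest_ind(lowHalfSAMPLES, highHalfSAMPLES, equal_var=False)
print ('Welch t-statistic = %6.3f pvalue = %6.4f' % (t_w, p_w))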
#Use original data
data = df[~df['TotalSamples'].isnull()]#drop nas
data = data[data['TotalSamples']!=0]#drop zeros
data['samples'] = data['TotalSamples']
#Create a scatter plot of the data.
data.dropna(subset = ['temp'], inplace=True)
data.dropna(subset = ['log_Temp'], inplace=True)
data.plot(kind='scatter', x = 'log_Temp', y='samples', alpha=0.3)
#Generate a nonlogged scatter
data.plot(kind='scatter', x = 'temp', y='samples', alpha=0.3)
##Have scipy figure out the regression coefficients:
beta1, beta0, r, p, stderr = ss.linregress(data['log_Temp'], data['samples'])
#(assume beta1 and beta0 are estimates, i.e. beta1hat, beta0hat; we will almost never know the non-estimated betas)
print ("yhat = beta0 + beta1*x = %.2f + %.2fx" % (beta0, beta1))
#now plot the line:
xpoints = np.linspace(data['log_Temp'].min()*.90, data['log_Temp'].max()*1.1, 100)
plt.plot(xpoints, beta0 + beta1*xpoints, color='red', linewidth=2) #"beta0 + beta1*xpoints" is using vector algebra
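#As a usage sketch: predicting the sample count at an assumed temperature of 10 degrees
#(the value 10 is illustrative only; the model takes the log of temperature as input):
print ("predicted samples at temp=10: %.2f" % (beta0 + beta1 * np.log(10)))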
#Generate KDEs and final linear regression.
data['samples'].plot(kind='kde')
#print(data['samples'])
data.dropna(subset = ['samples'], inplace=True)
data.dropna(subset = ['log_Temp'], inplace=True)
#print(data['samples'])
print ("before logging SAMPLES; r = %.3f, p = %.5f" % ss.pearsonr(data['log_Temp'], data['samples']))
plt.show()
data['log_samples'] = np.log(data['samples']+1)
data['log_samples'].plot(kind='kde')#after transform
print ("after logging SAMPLES; r = %.3f, p = %.5f" % ss.pearsonr(data['log_Temp'], data['log_samples']))
data.plot(kind='scatter', x = 'log_Temp', y='log_samples', alpha=0.3)
beta1, beta0, r, p, stderr = ss.linregress(data['log_Temp'], data['log_samples'])# we will talk about r, p, and stderr next
xpoints = np.linspace(data['log_Temp'].min()*.90, data['log_Temp'].max()*1.1, 100)
plt.plot(xpoints, beta0 + beta1*xpoints, color='red', linewidth=2) #note: "beta0 + beta1*xpoints" is using vector algebra
plt.show()
print ("from linregress: r = %.3f, p = %.5f" % (r, p))
#After the log transform the points track the fitted line more closely, giving a larger r.
data.plot(kind='scatter', x = 'temp', y='samples', alpha=0.3)