diff --git a/climatehyptest2.py b/climatehyptest2.py
new file mode 100644
index 0000000000000000000000000000000000000000..adeb3bf3c7bc274cfdf2963240bc49fa7ef2e4b0
--- /dev/null
+++ b/climatehyptest2.py
@@ -0,0 +1,231 @@
+# -*- coding: utf-8 -*-
+"""climateHypTest2.ipynb
+
+Automatically generated by Colaboratory.
+
+Original file is located at
+    https://colab.research.google.com/drive/16nUPjaYYnWurxCPeCCBJVWPcAMj6sIUa
+
+Code for hypothesis testing of the datasets, to determine whether each of the
+previously stated null hypotheses can be rejected.
+Statistical tests carried out: p-values, histograms, kernel density estimates,
+t-tests and linear regressions.
+
+#***************************************************************************************
+# * Title: lab7_hypothesis-checkpoint
+# * Author: Dr Manal Helal
+# * Date: November 2019
+# * Code version: 1.02
+#***************************************************************************************
+"""
+
+import pandas as pd
+from scipy import stats
+import numpy as np
+import scipy.stats as ss
+from pprint import pprint
+
+# Import the dataset. `error_bad_lines` was deprecated in pandas 1.3 and
+# removed in 2.0; `on_bad_lines='skip'` is its replacement.
+df = pd.read_csv('climateUKRand.csv', engine='python', encoding='utf-8', on_bad_lines='skip')
+
+print(len(df))
+df = df.fillna(0)  # fillna returns a copy, so assign the result back
+print(df.mean(numeric_only=True))
+print(df.head())
+
+# Sort the dataset by temperature using quicksort, smallest values first.
+df = df.sort_values('temp', ascending=True, kind='quicksort', na_position='last')
+
+print(round(len(df)/2))
+
+# Split the dataset into two halves of (near) equal length.
+half = round(len(df)/2)
+df1 = df.iloc[:half]
+df2 = df.iloc[half:]
+
+# Get the t-test statistic and the p-value for the two halves.
+print(stats.ttest_ind(df1['TotalSamples'].dropna(), df2['TotalSamples'].dropna()))
+
+# Commented out IPython magic to ensure Python compatibility.
+# %matplotlib inline
+
+import matplotlib.pyplot as plt
+
+# Trim the set to the required fields only and drop zero readings.
+valsColumn = [currColumn for currColumn in df.columns
+              if "temp" in currColumn or "TotalSamples" in currColumn]
+df = df[df.TotalSamples != 0]
+df = df[df.temp != 0]
+
+print("\n(2) TOTAL SAMPLES IN FILE:")
+print("\t\t\t\t%d " % (len(df.index)))
+
+print(valsColumn)
+
+ValueHigherThanzero = [idx for idx, t in enumerate(df['temp']) if t > 0]
+print("\n(3) TOTAL with a value greater than zero:")
+print("\t\t\t\t%d " % (len(ValueHigherThanzero)))
+
+# Generate histograms for both of the fields that are being tested:
+# one set with the original values and one with the log values.
+png_file = 'histTemp.png'
+hist = df['temp'].hist(bins=60)
+hist.get_figure().savefig(png_file)
+plt.show()
+
+png_file = 'histSamples.png'
+hist = df['TotalSamples'].hist(bins=60)
+hist.get_figure().savefig(png_file)
+plt.show()
+
+# Non-positive temperatures give NaN/-inf under the log, so replace the
+# infinities and drop those rows (replace/dropna return copies; assign back).
+png_file = 'histlogTemp.png'
+df['log_Temp'] = np.log(df['temp'])
+df = df.replace([np.inf, -np.inf], np.nan).dropna(subset=['log_Temp'])
+lhist = df['log_Temp'].hist(bins=30)
+lhist.get_figure().savefig(png_file)
+plt.show()
+
+df1 = df.iloc[:round(len(df)/2)]
+df2 = df.iloc[round(len(df)/2):]
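+# Aside (not part of the original lab code): a minimal sketch checking how much
+# the log transform improves normality, assuming the `temp` and `log_Temp`
+# columns built above. scipy's D'Agostino-Pearson test (ss.normaltest) returns
+# a statistic and a p-value; a larger p-value gives less reason to reject
+# normality, which the t-tests below implicitly assume.
+for col in ('temp', 'log_Temp'):
+    norm_stat, norm_p = ss.normaltest(df[col].dropna())
+    print("normaltest %-9s statistic=%8.2f p=%.5f" % (col, norm_stat, norm_p))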
+# Generate a KDE for both tested fields, overlaying the two halves.
+png_file = 'KDE.png'
+df1['log_Temp'].plot(kind='kde')
+kde = df2['log_Temp'].plot(kind='kde')
+kde.get_figure().savefig(png_file)
+plt.show()
+
+png_file = 'histlogSamples.png'
+df['log_Sample'] = np.log(df['TotalSamples'])
+lhist = df['log_Sample'].hist(bins=30)
+lhist.get_figure().savefig(png_file)
+plt.show()
+
+df1 = df.iloc[:round(len(df)/2)]
+df2 = df.iloc[round(len(df)/2):]
+
+png_file = 'KDE2.png'
+df1['log_Sample'].plot(kind='kde')
+kde = df2['log_Sample'].plot(kind='kde')
+kde.get_figure().savefig(png_file)
+plt.show()
+
+def probGivenPop(RPOP_data, NRPOP_data, log_pop):
+    """Model each variable as a normal continuous random variable and find
+    the conditional probability of the two variables: P(value | sample).
+    P(RPOP = sample) = 0 and P(NRPOP = sample) = 0, since they are CRVs, so
+    an interval is needed to get non-zero values: compare
+    P(sample - .5 < RPOP < sample + .5) to P(sample - .5 < NRPOP < sample + .5)
+    and normalise by the total count of each."""
+
+    # Compute the probability of the interval under each population.
+    bw = .5  # bandwidth for the estimates
+    P_RPOP = ss.norm.cdf(log_pop + bw, RPOP_data.mean(), RPOP_data.std()) - \
+             ss.norm.cdf(log_pop - bw, RPOP_data.mean(), RPOP_data.std())
+    P_NRPOP = ss.norm.cdf(log_pop + bw, NRPOP_data.mean(), NRPOP_data.std()) - \
+              ss.norm.cdf(log_pop - bw, NRPOP_data.mean(), NRPOP_data.std())
+
+    # Normalise by the size of each group to estimate the number of samples
+    # with this value in each.
+    est_ranked_at_pop = P_RPOP * len(RPOP_data)
+    est_not_ranked_at_pop = P_NRPOP * len(NRPOP_data)
+
+    # Probability: this value / all values (in the given population).
+    return est_ranked_at_pop / (est_ranked_at_pop + est_not_ranked_at_pop)
+
+print("\t 55000: %.4f" % probGivenPop(df1['log_Temp'], df2['log_Temp'], np.log(55000)))
+print("\t 55000: %.4f" % probGivenPop(df1['log_Sample'], df2['log_Sample'], np.log(55000)))
+
+print("\n LIST MEAN AND STD_DEV PER COLUMN:")
+mean_sd = df[valsColumn].describe()[1:3]  # keep only the mean and std rows of describe()
+mean_sd_dict = dict([(c, (round(mean_sd[c]['mean'], 4), round(mean_sd[c]['std'], 4)))
+                     for c in mean_sd.columns])
+pprint(mean_sd_dict)
+
+# Sort values by sample count; sort_values returns a copy, so assign it back.
+df = df.sort_values(by=['TotalSamples'])
+
+print(len(df))
+index = int(0.5 * len(df.index))
+lowHalf, highHalf = df[:index], df[index:]
+
+# Split the dataset in two: the first 100 non-null sample counts of each half.
+lowHalfSAMPLES = lowHalf['TotalSamples'].dropna()[:100]
+highHalfSAMPLES = highHalf['TotalSamples'].dropna()[:100]
+
+dataSAMPLES = pd.concat([lowHalfSAMPLES, highHalfSAMPLES])
+
+# Generate a KDE of the combined sample counts.
+dataSAMPLES.plot(kind='kde')
+
+plt.axis([-500, 1500, 0, 0.0035])  # zoom in to these dimensions
+plt.show()
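+# Aside (not part of the original lab code): the pooled-variance t-test below
+# assumes roughly normal data with equal variances, and sample counts are
+# heavily right-skewed. As a sketch of a distribution-free alternative, the
+# Mann-Whitney U test from scipy is run on the same two subsets.
+u_stat, u_p = ss.mannwhitneyu(lowHalfSAMPLES, highHalfSAMPLES, alternative='two-sided')
+print("Mann-Whitney U = %.1f, p = %.5f" % (u_stat, u_p))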
+# Calculate the mean, standard deviation and length of each subset.
+mean1, mean2 = lowHalfSAMPLES.mean(), highHalfSAMPLES.mean()
+sd1, sd2 = lowHalfSAMPLES.std(), highHalfSAMPLES.std()  # standard deviation of each subset
+n1, n2 = len(lowHalfSAMPLES), len(highHalfSAMPLES)
+dof1, dof2 = (n1 - 1), (n2 - 1)  # degrees of freedom (renamed to avoid shadowing df1/df2)
+print("Mean of lower 50%%: %.1f (%d) \nMean of upper 50%%: %.1f (%d) \n " % (mean1, n1, mean2, n2))
+
+# Two-sample t-test, assuming equal variance.
+pooled_var = (dof1*sd1**2 + dof2*sd2**2) / (dof1 + dof2)  # pooled variance
+t = (mean1 - mean2) / np.sqrt(pooled_var * (1.0/n1 + 1.0/n2))
+print(t)
+p = 1 - ss.t.cdf(np.abs(t), dof1 + dof2)
+print("t: %.4f, df: %.1f, p: %.5f" % (t, dof1 + dof2, p))  # one tail (if you hypothesise a direction)
+print('t-statistic = %6.3f pvalue = %6.4f' % ss.ttest_ind(lowHalfSAMPLES, highHalfSAMPLES))  # two tails
+
+# Use the original data: drop NaNs and zeros from the sample counts.
+data = df[~df['TotalSamples'].isnull()]
+data = data[data['TotalSamples'] != 0].copy()  # .copy() avoids SettingWithCopyWarning below
+data['samples'] = data['TotalSamples']
+
+# Create a scatter plot of the data.
+data.dropna(subset=['temp'], inplace=True)
+data.dropna(subset=['log_Temp'], inplace=True)
+data.plot(kind='scatter', x='log_Temp', y='samples', alpha=0.3)
+
+# Generate a non-logged scatter as well.
+data.plot(kind='scatter', x='temp', y='samples', alpha=0.3)
+
+# Have scipy estimate the regression coefficients.
+beta1, beta0, r, p, stderr = ss.linregress(data['log_Temp'], data['samples'])
+# beta1 and beta0 are estimates (beta1hat, beta0hat); the non-estimated betas are almost never known.
+print("yhat = beta0 + beta1*x = %.2f + %.2fx" % (beta0, beta1))
+# Now plot the fitted line.
+xpoints = np.linspace(data['log_Temp'].min()*.90, data['log_Temp'].max()*1.1, 100)
+plt.plot(xpoints, beta0 + beta1*xpoints, color='red', linewidth=2)  # "beta0 + beta1*xpoints" uses vector algebra
+
+# Generate KDEs and the final linear regression.
+data['samples'].plot(kind='kde')
+data.dropna(subset=['samples'], inplace=True)
+data.dropna(subset=['log_Temp'], inplace=True)
+print("before logging SAMPLES; r = %.3f, p = %.5f" % ss.pearsonr(data['log_Temp'], data['samples']))
+plt.show()
+data['log_samples'] = np.log(data['samples'] + 1)  # +1 keeps zero counts finite under the log
+data['log_samples'].plot(kind='kde')  # after the transform
+print("after logging SAMPLES; r = %.3f, p = %.5f" % ss.pearsonr(data['log_Temp'], data['log_samples']))
+
+data.plot(kind='scatter', x='log_Temp', y='log_samples', alpha=0.3)
+beta1, beta0, r, p, stderr = ss.linregress(data['log_Temp'], data['log_samples'])
+xpoints = np.linspace(data['log_Temp'].min()*.90, data['log_Temp'].max()*1.1, 100)
+plt.plot(xpoints, beta0 + beta1*xpoints, color='red', linewidth=2)
+plt.show()
+print("from linregress: r = %.3f, p = %.5f" % (r, p))
+
+# The points now cluster more tightly around the fitted line, which is what
+# the larger r after the transform reflects.
+
+data.plot(kind='scatter', x='temp', y='samples', alpha=0.3)
+plt.show()
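+# Follow-up (not part of the original lab code): a sketch that turns the final
+# regression into an interval estimate, assuming the `data` frame built above.
+# It reuses scipy's linregress result attributes to report r-squared and a 95%
+# confidence interval for the slope from the slope's standard error.
+res = ss.linregress(data['log_Temp'], data['log_samples'])
+dof = len(data) - 2                     # two estimated parameters: slope and intercept
+t_crit = ss.t.ppf(0.975, dof)           # two-sided 95% critical value
+ci_lo = res.slope - t_crit * res.stderr
+ci_hi = res.slope + t_crit * res.stderr
+print("r^2 = %.3f, slope 95%% CI = [%.3f, %.3f]" % (res.rvalue**2, ci_lo, ci_hi))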