diff --git a/climatehyptest2.py b/climatehyptest2.py
deleted file mode 100644
index adeb3bf3c7bc274cfdf2963240bc49fa7ef2e4b0..0000000000000000000000000000000000000000
--- a/climatehyptest2.py
+++ /dev/null
@@ -1,231 +0,0 @@
-# -*- coding: utf-8 -*-
-"""climateHypTest2.ipynb
-
-Automatically generated by Colaboratory.
-
-Original file is located at
-    https://colab.research.google.com/drive/16nUPjaYYnWurxCPeCCBJVWPcAMj6sIUa
-
-Code for hypothesis testing of the datasets, to determine whether each of the
-previously stated null hypotheses can be supported or rejected.
-Statistical techniques used: p-values, histograms, kernel density estimators,
-t-tests and linear regression.
-
-#***************************************************************************************
-# * Title: lab7_hypothesis-checkpoint
-# * Author: Dr Manal Helal
-# * Date: November 2019
-# * Code version: 1.02
-#***************************************************************************************
-"""
-
-import pandas as pd
-import numpy as np
-import scipy.stats as ss
-import matplotlib.pyplot as plt
-from scipy import stats
-from pprint import pprint
-
-#Import the dataset, skipping malformed rows.
-#(error_bad_lines was removed in pandas 2.0; on_bad_lines replaces it.)
-df = pd.read_csv('climateUKRand.csv', engine='python', encoding='utf-8',
-                 on_bad_lines='skip')
-
-print(len(df))
-print(df.mean(numeric_only=True))
-df = df.fillna(0)  #fillna returns a copy, so the result must be assigned
-print(df.head())
-
-#Sort the dataset by temperature using quicksort; smallest values come first.
-df = df.sort_values('temp', ascending=True, kind='quicksort', na_position='last')
-
-print(round(len(df)/2))
-
-#Split the dataset into two halves of (near) equal length.
-df1 = df.iloc[:round(len(df)/2)]
-df2 = df.iloc[round(len(df)/2):]
-
-#Get the t-test statistic and p-value for the two halves.
-print(stats.ttest_ind(df1.dropna()['TotalSamples'], df2.dropna()['TotalSamples']))
-
-# Commented out IPython magic to ensure Python compatibility.
-# %matplotlib inline
-
-#Trim the set to the required fields only.
-valsColumn = [currColumn for currColumn in df.columns if "temp" in currColumn
-              or "TotalSamples" in currColumn]
-df = df[df.TotalSamples != 0]
-df = df[df.temp != 0]
-
-print("\n(2) TOTAL SAMPLES IN FILE:")
-print("\t\t\t\t%d " % (len(df.index)))
-
-print(valsColumn)
-
-valueHigherThanZero = [idx for idx, val in enumerate(df['temp']) if val > 0]
-print("\n(3) TOTAL with a value greater than zero:")
-print("\t\t\t\t%d " % (len(valueHigherThanZero)))
-
-#Generate histograms for both of the fields being tested:
-#one with the original values and one with the log values.
-png_file = 'histTemp.png'
-hist = df['temp'].hist(bins=60)
-hist.get_figure().savefig(png_file)
-plt.show()
-
-png_file = 'histSamples.png'
-hist = df['TotalSamples'].hist(bins=60)
-hist.get_figure().savefig(png_file)
-plt.show()
-
-png_file = 'histlogTemp.png'
-df['log_Temp'] = np.log(df['temp'])
-#replace/dropna return copies; drop the rows whose log is undefined
-df = df.replace([np.inf, -np.inf], np.nan).dropna(subset=['log_Temp'])
-lhist = df['log_Temp'].hist(bins=30)
-lhist.get_figure().savefig(png_file)
-plt.show()
-
-#Re-split so the halves pick up the new log_Temp column.
-df1 = df.iloc[:round(len(df)/2)]
-df2 = df.iloc[round(len(df)/2):]
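-
-#A quick robustness sketch (an addition, not part of the original lab code):
-#the t-test above uses scipy's default, which assumes equal variances in the
-#two halves. Passing equal_var=False runs Welch's t-test instead, dropping
-#that assumption, and serves as a cheap sanity check on the same split.
-welch_t, welch_p = stats.ttest_ind(df1['TotalSamples'].dropna(),
-                                   df2['TotalSamples'].dropna(),
-                                   equal_var=False)
-print("Welch's t-test (unequal variances): t = %.4f, p = %.5f" % (welch_t, welch_p))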
-
-#Generate a KDE for both tested fields.
-png_file = 'KDE.png'
-df1['log_Temp'].plot(kind='kde')
-kde = df2['log_Temp'].plot(kind='kde')
-kde.get_figure().savefig(png_file)
-plt.show()
-
-png_file = 'histlogSamples.png'
-df['log_Sample'] = np.log(df['TotalSamples'])
-lhist = df['log_Sample'].hist(bins=30)
-lhist.get_figure().savefig(png_file)
-plt.show()
-
-#Re-split so the halves pick up the new log_Sample column.
-df1 = df.iloc[:round(len(df)/2)]
-df2 = df.iloc[round(len(df)/2):]
-
-png_file = 'KDE2.png'
-df1['log_Sample'].plot(kind='kde')
-kde = df2['log_Sample'].plot(kind='kde')
-kde.get_figure().savefig(png_file)
-plt.show()
-
-def probGivenPop(RPOP_data, NRPOP_data, log_pop):
-    """Model each variable as a normal CRV and find the conditional
-    probability P(value | sample).
-    P(RPOP = sample) = 0 and P(NRPOP = sample) = 0, since they are CRVs, so an
-    interval is used to get non-zero values: compare
-    P(sample - .5 < RPOP < sample + .5) to P(sample - .5 < NRPOP < sample + .5),
-    normalised by the total count of each group."""
-
-    #Compute the probabilities under each population
-    bw = .5  #bandwidth for the estimates
-    P_RPOP = ss.norm.cdf(log_pop+bw, RPOP_data.mean(), RPOP_data.std()) - \
-             ss.norm.cdf(log_pop-bw, RPOP_data.mean(), RPOP_data.std())  #probability among ranked
-    P_NRPOP = ss.norm.cdf(log_pop+bw, NRPOP_data.mean(), NRPOP_data.std()) - \
-              ss.norm.cdf(log_pop-bw, NRPOP_data.mean(), NRPOP_data.std())  #probability among not ranked
-
-    #Normalize by the size of each group to get an estimated count at each value:
-    Est_Counties_Ranked_at_pop = P_RPOP * len(RPOP_data)
-    Est_Counties_NotRanked_at_pop = P_NRPOP * len(NRPOP_data)
-
-    #Compute the probability: value / all values (in the given population)
-    return Est_Counties_Ranked_at_pop / (Est_Counties_Ranked_at_pop + Est_Counties_NotRanked_at_pop)
-
-print("\t 55000: %.4f" % probGivenPop(df1['log_Temp'], df2['log_Temp'], np.log(55000)))
-print("\t 55000: %.4f" % probGivenPop(df1['log_Sample'], df2['log_Sample'], np.log(55000)))
-
-print("\n LIST MEAN AND STD_DEV PER COLUMN:")
-mean_sd = df[valsColumn].describe()[1:3]  ## returns mean and std only; other describe output is ignored
-mean_sd_dict = dict([(c, (round(mean_sd[c]['mean'], 4), round(mean_sd[c]['std'], 4))) for c in mean_sd.columns])
-pprint(mean_sd_dict)
-
-#Sort values by sample count (sort_values returns a copy, so assign it).
-df = df.sort_values(by=['TotalSamples'])
-
-print(len(df))
-index = int(0.5 * len(df.index))
-lowHalf, highHalf = df[:index], df[index:]
-
-#Take the first 100 non-null sample counts from each sample-sorted half.
-lowHalfSAMPLES = lowHalf['TotalSamples'].dropna()[:100]
-highHalfSAMPLES = highHalf['TotalSamples'].dropna()[:100]
-
-dataSAMPLES = pd.concat([lowHalfSAMPLES, highHalfSAMPLES])
-
-#Generate a KDE:
-dataSAMPLES.plot(kind='kde')
-
-plt.axis([-500, 1500, 0, 0.0035])  #zoom in to these dimensions
-plt.show()
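-
-#A nonparametric variant of probGivenPop (a sketch; probGivenPopKDE is a new
-#name, not from the original lab): scipy's gaussian_kde estimates each group's
-#density directly, so the comparison no longer assumes the log values are normal.
-def probGivenPopKDE(RPOP_data, NRPOP_data, log_pop):
-    kde_r = ss.gaussian_kde(RPOP_data.dropna())
-    kde_n = ss.gaussian_kde(NRPOP_data.dropna())
-    est_r = kde_r(log_pop)[0] * len(RPOP_data)   #estimated count among ranked
-    est_n = kde_n(log_pop)[0] * len(NRPOP_data)  #estimated count among not ranked
-    return est_r / (est_r + est_n)
-
-print("\t KDE 55000: %.4f" % probGivenPopKDE(df1['log_Temp'], df2['log_Temp'], np.log(55000)))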
-
-#Calculate the means, standard deviations and sizes of each subset.
-mean1, mean2 = lowHalfSAMPLES.mean(), highHalfSAMPLES.mean()
-sd1, sd2 = lowHalfSAMPLES.std(), highHalfSAMPLES.std()  #standard deviation of each half
-n1, n2 = len(lowHalfSAMPLES), len(highHalfSAMPLES)
-dof1, dof2 = (n1 - 1), (n2 - 1)  #degrees of freedom (renamed so the df1/df2 DataFrames are not shadowed)
-print("Mean of lower 50%%: %.1f (%d) \nMean of upper 50%%: %.1f (%d) \n " % (mean1, n1, mean2, n2))
-
-##Two-sample t-test, assuming equal variance:
-pooled_var = (dof1*sd1**2 + dof2*sd2**2) / (dof1 + dof2)  #pooled variance
-t = (mean1 - mean2) / np.sqrt(pooled_var * (1.0/n1 + 1.0/n2))
-print(t)
-p = 1 - ss.t.cdf(np.abs(t), dof1+dof2)
-print("t: %.4f, df: %.1f, p: %.5f" % (t, dof1+dof2, p))  #one tail (if you hypothesise a direction)
-t2, p2 = ss.ttest_ind(lowHalfSAMPLES, highHalfSAMPLES)  #two tails
-print('t-statistic = %6.3f pvalue = %6.4f' % (t2, p2))
-
-#Use the original data
-data = df[~df['TotalSamples'].isnull()].copy()  #drop NAs; .copy() avoids SettingWithCopy warnings
-data = data[data['TotalSamples'] != 0]  #drop zeros
-data['samples'] = data['TotalSamples']
-
-#Create a scatter plot of the data.
-data.dropna(subset=['temp'], inplace=True)
-data.dropna(subset=['log_Temp'], inplace=True)
-data.plot(kind='scatter', x='log_Temp', y='samples', alpha=0.3)
-
-#Generate a non-logged scatter
-data.plot(kind='scatter', x='temp', y='samples', alpha=0.3)
-
-##Have scipy estimate the regression coefficients:
-beta1, beta0, r, p, stderr = ss.linregress(data['log_Temp'], data['samples'])
-#(beta1 and beta0 are estimates, i.e. beta1hat and beta0hat; the true betas are almost never known)
-print("yhat = beta0 + beta1*x = %.2f + %.2fx" % (beta0, beta1))
-#Now plot the line:
-xpoints = np.linspace(data['log_Temp'].min()*.90, data['log_Temp'].max()*1.1, 100)
-plt.plot(xpoints, beta0 + beta1*xpoints, color='red', linewidth=2)  #"beta0 + beta1*xpoints" is vectorised
-
-#Generate KDEs and the final linear regression.
-data['samples'].plot(kind='kde')
-data.dropna(subset=['samples'], inplace=True)
-data.dropna(subset=['log_Temp'], inplace=True)
-r_val, p_val = ss.pearsonr(data['log_Temp'], data['samples'])
-print("before logging SAMPLES; r = %.3f, p = %.5f" % (r_val, p_val))
-plt.show()
-data['log_samples'] = np.log(data['samples']+1)  #+1 keeps zero counts finite
-data['log_samples'].plot(kind='kde')  #after transform
-r_val, p_val = ss.pearsonr(data['log_Temp'], data['log_samples'])
-print("after logging SAMPLES; r = %.3f, p = %.5f" % (r_val, p_val))
-
-data.plot(kind='scatter', x='log_Temp', y='log_samples', alpha=0.3)
-beta1, beta0, r, p, stderr = ss.linregress(data['log_Temp'], data['log_samples'])  #r, p and stderr are discussed below
-xpoints = np.linspace(data['log_Temp'].min()*.90, data['log_Temp'].max()*1.1, 100)
-plt.plot(xpoints, beta0 + beta1*xpoints, color='red', linewidth=2)
-plt.show()
-print("from linregress: r = %.3f, p = %.5f" % (r, p))
-
-#The points sit closer to the red line after the log transform => larger |r|.
-
-data.plot(kind='scatter', x='temp', y='samples', alpha=0.3)
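-
-#A distribution-free check (a sketch added alongside the lab code, not from the
-#original): a simple permutation test for the difference in means between the
-#two sample halves, requiring neither normality nor equal variances.
-rng = np.random.default_rng(0)  #fixed seed so the run is reproducible
-observed = abs(lowHalfSAMPLES.mean() - highHalfSAMPLES.mean())
-pooled = np.concatenate([lowHalfSAMPLES.to_numpy(), highHalfSAMPLES.to_numpy()])
-extreme = 0
-for _ in range(10000):
-    rng.shuffle(pooled)  #relabel the pooled samples at random
-    if abs(pooled[:n1].mean() - pooled[n1:].mean()) >= observed:
-        extreme += 1
-print("permutation test: p = %.5f" % (extreme / 10000))
\ No newline at end of file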