Commit 43133135 authored by Kilinchan, Kaan (UG - Computer Science)

Revert "Upload New File"

This reverts commit 84dd95dc
parent 84dd95dc
# -*- coding: utf-8 -*-
"""climateHypTest2.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/16nUPjaYYnWurxCPeCCBJVWPcAMj6sIUa
Code for hypothesis testing of the datasets, in order to determine whether each of the previously stated null hypotheses can be rejected or not.
Statistical tests carried out: p-values, histograms, kernel density estimators, t-tests and linear regressions.
#***************************************************************************************
# * Title: lab7_hypothesis-checkpoint
# * Author: Dr Manal Helal
# * Date: November 2019
# * Code version: 1.02
#***************************************************************************************
"""
import pandas as pd
from scipy import stats
import numpy as np
import scipy.stats as ss
import csv
#Import the dataset
# on_bad_lines='skip' is the replacement for the deprecated error_bad_lines=False (pandas >= 1.3)
df = pd.read_csv('climateUKRand.csv', engine='python', encoding='utf-8', on_bad_lines='skip')
print(len(df))
print(df.mean(numeric_only=True))
df = df.fillna(0)  # fillna returns a new frame, so the result must be assigned back
print(df.head())
#Sort the dataset by their value field using quicksort. Smallest values will be first.
df = df.sort_values('temp',ascending=True, inplace=False, kind='quicksort', na_position='last', ignore_index=False)
print(round(len(df)/2))
#Split the dataset into two halves of (as near as possible) equal length.
df1 = df.iloc[:round(len(df)/2)]
df2 = df.iloc[round(len(df)/2):]
#Get the t-test statistic and the p-value for the two halves of the dataset.
print(stats.ttest_ind(df1.dropna()['TotalSamples'], df2.dropna()['TotalSamples']))
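#A minimal interpretation sketch (alpha = 0.05 is an assumed significance level,
#not taken from the original analysis):
t_stat, p_val = stats.ttest_ind(df1.dropna()['TotalSamples'], df2.dropna()['TotalSamples'])
if p_val < 0.05:
    print("Reject the null hypothesis at the 5% level")
else:
    print("Fail to reject the null hypothesis at the 5% level")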
# Commented out IPython magic to ensure Python compatibility.
# %matplotlib inline
import sys
import matplotlib.pyplot as plt
#Trim the sets to the required fields only.
valsColumn = [currColumn for currColumn in df.columns
              if "temp" in currColumn or "TotalSamples" in currColumn]
df = df[df.TotalSamples != 0]
df = df[df.temp != 0]
#df = df[valsColumn]
#valsColumn = [v for v in valsColumn if "Value" in v]
print ("\n(2) TOTAL SAMPLES IN FILE:")
print("\t\t\t\t%d "%(len(df.index)))
print (valsColumn)
ValueHigherThanzero = [idx for idx, temp in enumerate(df['temp']) if temp > 0]
print("\n(3) TOTAL with a value greater than zero:")
print("\t\t\t\t%d "%(len(ValueHigherThanzero)))
#if df['Value'].dtype != 'int':
# df['Value'] = df['Value'].astype('int')
#Generate histograms for both of the fields that are being tested.
#Generate one with the original values and one with the log values.
png_file = 'histTemp.png'
hist = df['temp'].hist(bins=60)
hist.get_figure().savefig(png_file)
plt.show()
png_file = 'histSamples.png'
hist = df['TotalSamples'].hist(bins=60)
hist.get_figure().savefig(png_file)
plt.show()
png_file = 'histlogTemp.png'
df['log_Temp'] = np.log(df['temp'])
#Replace the ±inf values produced by taking the log of non-positive temperatures
#with NaN so that the histogram simply ignores those rows.
df['log_Temp'] = df['log_Temp'].replace([np.inf, -np.inf], np.nan)
lhist = df['log_Temp'].hist(bins=30)
lhist.get_figure().savefig(png_file)
plt.show()
df1 = df.iloc[:round(len(df)/2)]
df2 = df.iloc[round(len(df)/2):]
#Generate a KDE for both tested fields.
png_file = 'KDE.png'
df1['log_Temp'].plot(kind='kde')
kde = df2['log_Temp'].plot(kind='kde')
kde.get_figure().savefig(png_file)
plt.show()
png_file = 'histlogSamples.png'
df['log_Sample'] = np.log(df['TotalSamples'])
lhist = df['log_Sample'].hist(bins=30)
lhist.get_figure().savefig(png_file)
plt.show()
df1 = df.iloc[:round(len(df)/2)]
df2 = df.iloc[round(len(df)/2):]
png_file = 'KDE2.png'
df1['log_Sample'].plot(kind='kde')
kde = df2['log_Sample'].plot(kind='kde')
kde.get_figure().savefig(png_file)
plt.show()
def probGivenPop(RPOP_data, NRPOP_data, log_pop):
    """Model each variable as a normal CRV.
    This function finds the conditional probability of the two variables: P(value | sample).
    P(RPOP = sample) = 0 and P(NRPOP = sample) = 0, since they are CRVs, so an interval
    is used in order to get non-zero values.
    Look at the ratio of P(sample - .5 < RPOP < sample + .5) to
    P(sample - .5 < NRPOP < sample + .5) and normalise by the total count of each."""
    #Compute the probabilities under each population
    bw = .5 #bandwidth for the interval estimates
    P_RPOP = ss.norm.cdf(log_pop+bw, RPOP_data.mean(), RPOP_data.std()) - \
        ss.norm.cdf(log_pop-bw, RPOP_data.mean(), RPOP_data.std()) #probability among ranked
    P_NRPOP = ss.norm.cdf(log_pop+bw, NRPOP_data.mean(), NRPOP_data.std()) - \
        ss.norm.cdf(log_pop-bw, NRPOP_data.mean(), NRPOP_data.std()) #probability among not ranked
    #Normalize by the size of each population to get an estimated number of samples with each value:
    Est_Counties_Ranked_at_pop = P_RPOP * len(RPOP_data)
    Est_Counties_NotRanked_at_pop = P_NRPOP * len(NRPOP_data)
    #Compute the probability: value / all values (in the given population)
    return Est_Counties_Ranked_at_pop / (Est_Counties_Ranked_at_pop + Est_Counties_NotRanked_at_pop)
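#Note (a clarifying sketch, not from the original lab code): the return value above is
#Bayes' rule with the group sizes acting as priors,
#  P(group1 | x) = P(x | group1)*n1 / (P(x | group1)*n1 + P(x | group2)*n2),
#where the interval probabilities P_RPOP and P_NRPOP stand in for the likelihoods.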
print ("\t 55000: %.4f" % probGivenPop(df1['log_Temp'], df2['log_Temp'], np.log(55000)))
print ("\t 55000: %.4f" % probGivenPop(df1['log_Sample'], df2['log_Sample'], np.log(55000)))
print ("\n LIST MEAN AND STD_DEC PER COLUMN:")
mean_sd = df[valsColumn].describe()[1:3] ## returns mean and std only, other describe output is ignored
mean_sd_dict = dict([(c, (round(mean_sd[c]['mean'], 4), round(mean_sd[c]['std'], 4), )) for c in mean_sd.columns])
from pprint import pprint
pprint(mean_sd_dict)
#Sort the values by sample count (sort_values returns a new frame, so assign it back).
df = df.sort_values(by=['TotalSamples'])
print(len(df))
#Split the sample-sorted dataset into two halves.
index = int(0.5 * len(df.index))
lowHalf, highHalf = df[:index], df[index:]
lowHalfSAMPLES = lowHalf['TotalSamples'][~lowHalf['TotalSamples'].isnull()][:100] #or .dropna()
highHalfSAMPLES = highHalf['TotalSamples'][~highHalf['TotalSamples'].isnull()][:100]
dataSAMPLES = pd.concat([lowHalfSAMPLES, highHalfSAMPLES])
#Generate a kde:
dataSAMPLES.plot(kind='kde')
#highHalfSAMPLES.plot(kind='kde')
plt.axis([-500,1500,0,0.0035]) #zoom in to these dimensions
plt.show()
#Calculate the means, standard deviations, length of each subset.
mean1, mean2 = lowHalfSAMPLES.mean(), highHalfSAMPLES.mean()
sd1, sd2 = lowHalfSAMPLES.std(), highHalfSAMPLES.std() #standard deviation of each half
n1, n2 = len(lowHalfSAMPLES), len(highHalfSAMPLES)
dof1, dof2 = (n1 - 1), (n2 - 1) #degrees of freedom (distinct names so the DataFrames df1/df2 are not shadowed)
print ("Mean of lower 50%%: %.1f (%d) \nMean of upper 50%%: %.1f (%d) \n " % (mean1, n1, mean2, n2))
##two sample t-test, assuming equal variance:
pooled_var = (dof1*sd1**2 + dof2*sd2**2) / (dof1 + dof2) #pooled variance
t = (mean1 - mean2) / np.sqrt(pooled_var * (1.0/n1 + 1.0/n2))
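#For reference, the statistic computed above is the standard pooled two-sample t:
#  t = (mean1 - mean2) / sqrt(pooled_var * (1/n1 + 1/n2)),
#  pooled_var = (dof1*sd1^2 + dof2*sd2^2) / (dof1 + dof2)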
print (t)
p = 1 - ss.t.cdf(np.abs(t), dof1+dof2)
print ("t: %.4f, df: %.1f, p: %.5f" % (t, dof1+dof2, p)) #one tail (if you hypothesize a direction)
print ('t-statistic = %6.3f pvalue = %6.4f' % ss.ttest_ind(lowHalfSAMPLES, highHalfSAMPLES)) #two tails
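#For comparison, an unequal-variance (Welch) t-test via scipy's equal_var=False option;
#an optional robustness check, not part of the original lab analysis:
t_w, p_w = ss.ttest_ind(lowHalfSAMPLES, highHalfSAMPLES, equal_var=False)
print ('Welch t-statistic = %6.3f pvalue = %6.4f' % (t_w, p_w))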
#Use original data
data = df[~df['TotalSamples'].isnull()]#drop nas
data = data[data['TotalSamples']!=0]#drop zeros
data['samples'] = data['TotalSamples']
#Create a scatter plot of the data.
data.dropna(subset = ['temp'], inplace=True)
data.dropna(subset = ['log_Temp'], inplace=True)
data.plot(kind='scatter', x = 'log_Temp', y='samples', alpha=0.3)
#Generate a nonlogged scatter
data.plot(kind='scatter', x = 'temp', y='samples', alpha=0.3)
##Have scipy figure out the regression coefficients:
beta1, beta0, r, p, stderr = ss.linregress(data['log_Temp'], data['samples'])
#(assume beta1 and beta0 are estimates, i.e. beta1hat, beta0hat; we will almost never know the non-estimated betas)
print ("yhat = beta0 + beta1*x = %.2f + %.2fx" % (beta0, beta1))
#now plot the line:
xpoints = np.linspace(data['log_Temp'].min()*.90, data['log_Temp'].max()*1.1, 100)
plt.plot(xpoints, beta0 + beta1*xpoints, color='red', linewidth=2) #"beta0 + beta1*xpoints" is using vector algebra
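#As a usage sketch: predicting the sample count at an assumed temperature of 10 degrees
#(the value 10 is illustrative only; the model takes the log of temperature as input):
print ("predicted samples at temp=10: %.2f" % (beta0 + beta1 * np.log(10)))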
#Generate KDEs and final linear regression.
data['samples'].plot(kind='kde')
#print(data['samples'])
data.dropna(subset = ['samples'], inplace=True)
data.dropna(subset = ['log_Temp'], inplace=True)
#print(data['samples'])
print ("before logging SAMPLES; r = %.3f, p = %.5f" % ss.pearsonr(data['log_Temp'], data['samples']))
plt.show()
data['log_samples'] = np.log(data['samples']+1)
data['log_samples'].plot(kind='kde')#after transform
print ("after logging SAMPLES; r = %.3f, p = %.5f" % ss.pearsonr(data['log_Temp'], data['log_samples']))
data.plot(kind='scatter', x = 'log_Temp', y='log_samples', alpha=0.3)
beta1, beta0, r, p, stderr = ss.linregress(data['log_Temp'], data['log_samples'])# we will talk about r, p, and stderr next
xpoints = np.linspace(data['log_Temp'].min()*.90, data['log_Temp'].max()*1.1, 100)
plt.plot(xpoints, beta0 + beta1*xpoints, color='red', linewidth=2) #note: "beta0 + beta1*xpoints" is using vector algebra
plt.show()
print ("from linregress: r = %.3f, p = %.5f" % (r, p))
#After the log transform the points track the fitted line more closely, giving a larger r.
data.plot(kind='scatter', x = 'temp', y='samples', alpha=0.3)