diff --git a/climatehyptest2.py b/climatehyptest2.py
new file mode 100644
index 0000000000000000000000000000000000000000..adeb3bf3c7bc274cfdf2963240bc49fa7ef2e4b0
--- /dev/null
+++ b/climatehyptest2.py
@@ -0,0 +1,231 @@
+# -*- coding: utf-8 -*-
+"""climateHypTest2.ipynb
+
+Automatically generated by Colaboratory.
+
+Original file is located at
+    https://colab.research.google.com/drive/16nUPjaYYnWurxCPeCCBJVWPcAMj6sIUa
+
+Code for hypothesis testing of the datasets, to determine whether each of the
+previously stated null hypotheses can be rejected.
+Statistical tests carried out: p-values, histograms, kernel density estimates,
+t-tests and linear regressions.
+
+#***************************************************************************************
+# * Title: lab7_hypothesis-checkpoint
+# * Author: Dr Manal Helal
+# * Date: November 2019
+# * Code version: 1.02
+#***************************************************************************************
+"""
+
+import pandas as pd
+from scipy import stats
+import numpy as np
+import scipy.stats as ss
+from pprint import pprint
+
+# Import the dataset. `error_bad_lines` was deprecated in pandas 1.3 and
+# removed in 2.0; `on_bad_lines='skip'` is its replacement.
+df = pd.read_csv('climateUKRand.csv', engine='python', encoding='utf-8', on_bad_lines='skip')
+
+print(len(df))
+df = df.fillna(0)  # fillna returns a copy, so assign the result back
+print(df.mean(numeric_only=True))
+print(df.head())
+
+# Sort the dataset by temperature using quicksort, smallest values first.
+df = df.sort_values('temp', ascending=True, kind='quicksort', na_position='last')
+
+print(round(len(df)/2))
+
+# Split the dataset into two halves of (near) equal length.
+half = round(len(df)/2)
+df1 = df.iloc[:half]
+df2 = df.iloc[half:]
+
+# Get the t-test statistic and the p-value for the two halves.
+print(stats.ttest_ind(df1['TotalSamples'].dropna(), df2['TotalSamples'].dropna()))
+
+# Commented out IPython magic to ensure Python compatibility.
+# %matplotlib inline
+
+import matplotlib.pyplot as plt
+
+# Trim the set to the required fields only and drop zero readings.
+valsColumn = [currColumn for currColumn in df.columns
+              if "temp" in currColumn or "TotalSamples" in currColumn]
+df = df[df.TotalSamples != 0]
+df = df[df.temp != 0]
+
+print("\n(2) TOTAL SAMPLES IN FILE:")
+print("\t\t\t\t%d " % (len(df.index)))
+
+print(valsColumn)
+
+ValueHigherThanzero = [idx for idx, t in enumerate(df['temp']) if t > 0]
+print("\n(3) TOTAL with a value greater than zero:")
+print("\t\t\t\t%d " % (len(ValueHigherThanzero)))
+
+# Generate histograms for both of the fields that are being tested:
+# one set with the original values and one with the log values.
+png_file = 'histTemp.png'
+hist = df['temp'].hist(bins=60)
+hist.get_figure().savefig(png_file)
+plt.show()
+
+png_file = 'histSamples.png'
+hist = df['TotalSamples'].hist(bins=60)
+hist.get_figure().savefig(png_file)
+plt.show()
+
+# Non-positive temperatures give NaN/-inf under the log, so replace the
+# infinities and drop those rows (replace/dropna return copies; assign back).
+png_file = 'histlogTemp.png'
+df['log_Temp'] = np.log(df['temp'])
+df = df.replace([np.inf, -np.inf], np.nan).dropna(subset=['log_Temp'])
+lhist = df['log_Temp'].hist(bins=30)
+lhist.get_figure().savefig(png_file)
+plt.show()
+
+df1 = df.iloc[:round(len(df)/2)]
+df2 = df.iloc[round(len(df)/2):]
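+# Aside (not part of the original lab code): a minimal sketch checking how much
+# the log transform improves normality, assuming the `temp` and `log_Temp`
+# columns built above. scipy's D'Agostino-Pearson test (ss.normaltest) returns
+# a statistic and a p-value; a larger p-value gives less reason to reject
+# normality, which the t-tests below implicitly assume.
+for col in ('temp', 'log_Temp'):
+    norm_stat, norm_p = ss.normaltest(df[col].dropna())
+    print("normaltest %-9s statistic=%8.2f p=%.5f" % (col, norm_stat, norm_p))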
+# Generate a KDE for both tested fields, overlaying the two halves.
+png_file = 'KDE.png'
+df1['log_Temp'].plot(kind='kde')
+kde = df2['log_Temp'].plot(kind='kde')
+kde.get_figure().savefig(png_file)
+plt.show()
+
+png_file = 'histlogSamples.png'
+df['log_Sample'] = np.log(df['TotalSamples'])
+lhist = df['log_Sample'].hist(bins=30)
+lhist.get_figure().savefig(png_file)
+plt.show()
+
+df1 = df.iloc[:round(len(df)/2)]
+df2 = df.iloc[round(len(df)/2):]
+
+png_file = 'KDE2.png'
+df1['log_Sample'].plot(kind='kde')
+kde = df2['log_Sample'].plot(kind='kde')
+kde.get_figure().savefig(png_file)
+plt.show()
+
+def probGivenPop(RPOP_data, NRPOP_data, log_pop):
+    """Model each variable as a normal continuous random variable and find
+    the conditional probability of the two variables: P(value | sample).
+    P(RPOP = sample) = 0 and P(NRPOP = sample) = 0, since they are CRVs, so
+    an interval is needed to get non-zero values: compare
+    P(sample - .5 < RPOP < sample + .5) to P(sample - .5 < NRPOP < sample + .5)
+    and normalise by the total count of each."""
+
+    # Compute the probability of the interval under each population.
+    bw = .5  # bandwidth for the estimates
+    P_RPOP = ss.norm.cdf(log_pop + bw, RPOP_data.mean(), RPOP_data.std()) - \
+             ss.norm.cdf(log_pop - bw, RPOP_data.mean(), RPOP_data.std())
+    P_NRPOP = ss.norm.cdf(log_pop + bw, NRPOP_data.mean(), NRPOP_data.std()) - \
+              ss.norm.cdf(log_pop - bw, NRPOP_data.mean(), NRPOP_data.std())
+
+    # Normalise by the size of each group to estimate the number of samples
+    # with this value in each.
+    est_ranked_at_pop = P_RPOP * len(RPOP_data)
+    est_not_ranked_at_pop = P_NRPOP * len(NRPOP_data)
+
+    # Probability: this value / all values (in the given population).
+    return est_ranked_at_pop / (est_ranked_at_pop + est_not_ranked_at_pop)
+
+print("\t 55000: %.4f" % probGivenPop(df1['log_Temp'], df2['log_Temp'], np.log(55000)))
+print("\t 55000: %.4f" % probGivenPop(df1['log_Sample'], df2['log_Sample'], np.log(55000)))
+
+print("\n LIST MEAN AND STD_DEV PER COLUMN:")
+mean_sd = df[valsColumn].describe()[1:3]  # keep only the mean and std rows of describe()
+mean_sd_dict = dict([(c, (round(mean_sd[c]['mean'], 4), round(mean_sd[c]['std'], 4)))
+                     for c in mean_sd.columns])
+pprint(mean_sd_dict)
+
+# Sort values by sample count; sort_values returns a copy, so assign it back.
+df = df.sort_values(by=['TotalSamples'])
+
+print(len(df))
+index = int(0.5 * len(df.index))
+lowHalf, highHalf = df[:index], df[index:]
+
+# Split the dataset in two: the first 100 non-null sample counts of each half.
+lowHalfSAMPLES = lowHalf['TotalSamples'].dropna()[:100]
+highHalfSAMPLES = highHalf['TotalSamples'].dropna()[:100]
+
+dataSAMPLES = pd.concat([lowHalfSAMPLES, highHalfSAMPLES])
+
+# Generate a KDE of the combined sample counts.
+dataSAMPLES.plot(kind='kde')
+
+plt.axis([-500, 1500, 0, 0.0035])  # zoom in to these dimensions
+plt.show()
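+# Aside (not part of the original lab code): the pooled-variance t-test below
+# assumes roughly normal data with equal variances, and sample counts are
+# heavily right-skewed. As a sketch of a distribution-free alternative, the
+# Mann-Whitney U test from scipy is run on the same two subsets.
+u_stat, u_p = ss.mannwhitneyu(lowHalfSAMPLES, highHalfSAMPLES, alternative='two-sided')
+print("Mann-Whitney U = %.1f, p = %.5f" % (u_stat, u_p))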
+# Calculate the mean, standard deviation and length of each subset.
+mean1, mean2 = lowHalfSAMPLES.mean(), highHalfSAMPLES.mean()
+sd1, sd2 = lowHalfSAMPLES.std(), highHalfSAMPLES.std()  # standard deviation of each subset
+n1, n2 = len(lowHalfSAMPLES), len(highHalfSAMPLES)
+dof1, dof2 = (n1 - 1), (n2 - 1)  # degrees of freedom (renamed to avoid shadowing df1/df2)
+print("Mean of lower 50%%: %.1f (%d) \nMean of upper 50%%: %.1f (%d) \n " % (mean1, n1, mean2, n2))
+
+# Two-sample t-test, assuming equal variance.
+pooled_var = (dof1*sd1**2 + dof2*sd2**2) / (dof1 + dof2)  # pooled variance
+t = (mean1 - mean2) / np.sqrt(pooled_var * (1.0/n1 + 1.0/n2))
+print(t)
+p = 1 - ss.t.cdf(np.abs(t), dof1 + dof2)
+print("t: %.4f, df: %.1f, p: %.5f" % (t, dof1 + dof2, p))  # one tail (if you hypothesise a direction)
+print('t-statistic = %6.3f pvalue = %6.4f' % ss.ttest_ind(lowHalfSAMPLES, highHalfSAMPLES))  # two tails
+
+# Use the original data: drop NaNs and zeros from the sample counts.
+data = df[~df['TotalSamples'].isnull()]
+data = data[data['TotalSamples'] != 0].copy()  # .copy() avoids SettingWithCopyWarning below
+data['samples'] = data['TotalSamples']
+
+# Create a scatter plot of the data.
+data.dropna(subset=['temp'], inplace=True)
+data.dropna(subset=['log_Temp'], inplace=True)
+data.plot(kind='scatter', x='log_Temp', y='samples', alpha=0.3)
+
+# Generate a non-logged scatter as well.
+data.plot(kind='scatter', x='temp', y='samples', alpha=0.3)
+
+# Have scipy estimate the regression coefficients.
+beta1, beta0, r, p, stderr = ss.linregress(data['log_Temp'], data['samples'])
+# beta1 and beta0 are estimates (beta1hat, beta0hat); the non-estimated betas are almost never known.
+print("yhat = beta0 + beta1*x = %.2f + %.2fx" % (beta0, beta1))
+# Now plot the fitted line.
+xpoints = np.linspace(data['log_Temp'].min()*.90, data['log_Temp'].max()*1.1, 100)
+plt.plot(xpoints, beta0 + beta1*xpoints, color='red', linewidth=2)  # "beta0 + beta1*xpoints" uses vector algebra
+
+# Generate KDEs and the final linear regression.
+data['samples'].plot(kind='kde')
+data.dropna(subset=['samples'], inplace=True)
+data.dropna(subset=['log_Temp'], inplace=True)
+print("before logging SAMPLES; r = %.3f, p = %.5f" % ss.pearsonr(data['log_Temp'], data['samples']))
+plt.show()
+data['log_samples'] = np.log(data['samples'] + 1)  # +1 keeps zero counts finite under the log
+data['log_samples'].plot(kind='kde')  # after the transform
+print("after logging SAMPLES; r = %.3f, p = %.5f" % ss.pearsonr(data['log_Temp'], data['log_samples']))
+
+data.plot(kind='scatter', x='log_Temp', y='log_samples', alpha=0.3)
+beta1, beta0, r, p, stderr = ss.linregress(data['log_Temp'], data['log_samples'])
+xpoints = np.linspace(data['log_Temp'].min()*.90, data['log_Temp'].max()*1.1, 100)
+plt.plot(xpoints, beta0 + beta1*xpoints, color='red', linewidth=2)
+plt.show()
+print("from linregress: r = %.3f, p = %.5f" % (r, p))
+
+# The points now cluster more tightly around the fitted line, which is what
+# the larger r after the transform reflects.
+
+data.plot(kind='scatter', x='temp', y='samples', alpha=0.3)
+plt.show()
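+# Follow-up (not part of the original lab code): a sketch that turns the final
+# regression into an interval estimate, assuming the `data` frame built above.
+# It reuses scipy's linregress result attributes to report r-squared and a 95%
+# confidence interval for the slope from the slope's standard error.
+res = ss.linregress(data['log_Temp'], data['log_samples'])
+dof = len(data) - 2                     # two estimated parameters: slope and intercept
+t_crit = ss.t.ppf(0.975, dof)           # two-sided 95% critical value
+ci_lo = res.slope - t_crit * res.stderr
+ci_hi = res.slope + t_crit * res.stderr
+print("r^2 = %.3f, slope 95%% CI = [%.3f, %.3f]" % (res.rvalue**2, ci_lo, ci_hi))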