import numpy as np import matplotlib.pyplot as plt from ss_setPlotParams import setPlotParams from astropy.stats import median_absolute_deviation from statsmodels.distributions.empirical_distribution import ECDF from scipy.stats.mstats import winsorize def get_bloodfat_data(): file1 = 'Scottetal1978_bloodfat_1.dat' file2 = 'Scottetal1978_bloodfat_2.dat' chol1 = np.loadtxt(file1, usecols = (0), unpack = True) chol2 = np.loadtxt(file2, usecols = (0), unpack = True) return chol1, chol2 def get_ECDF_info(data1, data2, plot = True): #Compute the ECDFs. Plot them. Then find the medians and the pdf evaluated at the median. # The last quantity is approximately equal to the numerical derivative of the ECDF at the median. ecdf1 = ECDF(data1) ecdf2 = ECDF(data2) if plot: plt = setPlotParams() plt.hist(data1/np.max(data1), label = "PDF*{} (Control)".format(np.round(np.max(data1), decimals = 2)),\ histtype = "step") plt.hist(data2/np.max(data2), label = "PDF*{} (Control)".format(np.round(np.max(data2), decimals = 2)),\ histtype = "step") plt.plot(ecdf1.x, ecdf1.y, label = "ECDF (Control)", lw = 4) plt.plot(ecdf2.x, ecdf2.y, label = "ECDF (Test)", lw = 4) plt.title('Empirical distributions') plt.xlabel(r"Cholesterol level [mg dL$^{-1}$]") plt.legend(loc = 'best') plt.savefig('hw3_1.png') plt.close() k1 = np.argmin(np.abs(ecdf1.y - 0.5)) med1 = ecdf1.x[k1] prob_med1 = (ecdf1.y[k1 + 1] - ecdf1.y[k1 - 1])/(ecdf1.x[k1 + 1] - ecdf1.x[k1 - 1]) k2 = np.argmin(np.abs(ecdf2.y - 0.5)) med2 = ecdf2.x[k2] prob_med2 = (ecdf2.y[k2 + 2] - ecdf2.y[k2 - 2])/(ecdf2.x[k2 + 2] - ecdf2.x[k2 - 2]) return med1, prob_med1, med2, prob_med2 def get_bootstrap(data, Nsamples = 1000): Ndata = len(data) #Winsorize at the 95% level for the sample median distribution data_Winsorized = np.array(winsorize(data, limits = 0.025)) mean = median = std = mad = np.zeros(Nsamples) index = np.arange(Ndata) j = np.random.choice(index, size = [Ndata, Nsamples], replace = True) mean = data[j].mean(axis = 0) std = data[j].std(axis = 0, ddof = 1) median = np.median(data_Winsorized[j], axis = 0) mad = median_absolute_deviation(data_Winsorized[j], axis = 0) databoot = {"mean": mean, "median": median, "std": std, "mad": mad} return databoot def plotStuff(chol1mean, chol2mean, CI): plt = setPlotParams() n1, bins1, patches1 = plt.hist(chol1mean, histtype = 'step', fc = 'black') n2, bins2, patches2 = plt.hist(chol2mean, histtype = 'step', fc = 'orange') #Overlay the 95% confidence intervals for the population means yloc = 0.5*np.min(np.array([np.max(n1), np.max(n2)])) plt.plot(np.array([CI[0][0], CI[0][1]]), np.array([yloc, yloc]), c = 'black', lw = 4) plt.plot(np.array([CI[2][0], CI[2][1]]), np.array([yloc, yloc]), c = 'orange', lw = 4) plt.savefig('blooddata_bootstrap.png', bbox_inches = 'tight') plt.close() def get_CI(data, location = "mean"): #Get 95% CI assuming normality (95% then corresponds to 1.96-sigma). # The input object is a dictionary with the mean, median, and stds from bootstrap. # The location keyword can be set to "median" or "std" to compute CIs for those # statistics instead. mean = data[location].mean() std = data[location].std(ddof = 1) CI = mean + 1.96*std*np.array([-1, 1]) return CI def bloodfat(): chol1, chol2 = get_bloodfat_data() #Answer to part (a) -- plot the histograms and ECDFs. med1, prob_med1, med2, prob_med2 = get_ECDF_info(chol1, chol2) #Answer to part (b) -- find the medians. print("The medians computed from the ECDF are {} (control) and {} (test).".\ format(np.round(med1, decimals = 2), np.round(med2, decimals = 2))) #Answer to part (c) -- perform bootstrap and print out the mean and std of the sample means. chol1boot = get_bootstrap(chol1) chol2boot = get_bootstrap(chol2) print("The sample mean and std for the sample means are {}, {} (control) and {}, {} (test).".\ format(np.round(chol1boot["mean"].mean(), decimals = 2),\ np.round(chol1boot["mean"].std(ddof = 1), decimals = 2), \ np.round(chol2boot["mean"].mean(), decimals = 2),\ np.round(chol2boot["mean"].std(ddof = 1), decimals = 2))) #Answer to part (d) -- compute 95% CIs. chol1_mean_CI = get_CI(chol1boot) chol2_mean_CI = get_CI(chol2boot) print("95% CI for mean of control sample: {}".format(chol1_mean_CI)) print("95% CI for mean of test sample: {}".format(chol2_mean_CI)) flag1 = (chol1_mean_CI[0] <= chol2_mean_CI[0]) and (chol1_mean_CI[1] >= chol2_mean_CI[0]) flag2 = (chol2_mean_CI[0] <= chol1_mean_CI[0]) and (chol2_mean_CI[1] >= chol1_mean_CI[0]) if ((flag1) or (flag2)): print("The two CIs overlap, consistent with the means being indentical to within 95%.") else: print("No overlap between CIs, consistent with distinct means.") #Answer to part (e) -- mean and variance of \bar{x}_test - \bar{x}_control mean_diff = chol2boot["mean"].mean() - chol1boot["mean"].mean() std_diff = np.sqrt(chol2boot["mean"].std(ddof = 1)**2 + chol1boot["mean"].std(ddof = 1)**2) print("Mean and std for the difference in means: {} and {}".\ format(np.round(mean_diff, decimals = 2), np.round(std_diff, decimals = 2))) #Answer to part (f): sample distributions for the sample medians, then mean and std of these. # Then 95% CIs for these. #The bootstrap distributions for the medians have already been conputed for the Winsorized samples. chol1_med_mean = chol1boot["median"].mean() chol1_med_std = chol1boot["median"].std(ddof = 1) chol1_med_CI = get_CI(chol1boot, location = "median") chol2_med_mean = chol2boot["median"].mean() chol2_med_std = chol2boot["median"].std(ddof = 1) chol2_med_CI = get_CI(chol2boot, location = "median") print("Mean and std for the distribution of medians:\n") print("Control... mean: {}, std: {}\n Test... mean: {}, std: {}".\ format(np.round(chol1_med_mean, decimals = 2), np.round(chol1_med_std, decimals = 2),\ np.round(chol2_med_mean, decimals = 2), np.round(chol2_med_std, decimals = 2))) print("95% CIs for the population medians:\n") print("Control: {}, test: {}".format(np.round(chol1_med_CI, decimals = 2),\ np.round(chol2_med_CI, decimals = 2))) flag1 = (chol1_med_CI[0] <= chol2_med_CI[0]) and (chol1_med_CI[1] >= chol2_med_CI[0]) flag2 = (chol2_med_CI[0] <= chol1_med_CI[0]) and (chol2_med_CI[1] >= chol1_med_CI[0]) if ((flag1) or (flag2)): print("The two CIs overlap, consistent with the medians being indentical to within 95%.") else: print("No overlap between CIs, consistent with distinct medians.")