import numpy as np
import matplotlib.pyplot as plt
from ss_setPlotParams import setPlotParams
from astropy.stats import median_absolute_deviation
from statsmodels.distributions.empirical_distribution import ECDF
from scipy.stats.mstats import winsorize

def get_bloodfat_data():
    file1 = 'Scottetal1978_bloodfat_1.dat'
    file2 = 'Scottetal1978_bloodfat_2.dat'
    chol1 = np.loadtxt(file1, usecols = (0), unpack = True)
    chol2 = np.loadtxt(file2, usecols = (0), unpack = True)
    return chol1, chol2

def get_ECDF_info(data1, data2, plot = True):
    #Compute the ECDFs. Plot them. Then find the medians and the pdf evaluated at the median.
    #   The last quantity is approximately equal to the numerical derivative of the ECDF at the median.
    ecdf1 = ECDF(data1)
    ecdf2 = ECDF(data2)
    if plot:
        plt = setPlotParams()
        plt.hist(data1/np.max(data1), label = "PDF*{} (Control)".format(np.round(np.max(data1), decimals = 2)),\
                 histtype = "step")
        plt.hist(data2/np.max(data2), label = "PDF*{} (Control)".format(np.round(np.max(data2), decimals = 2)),\
                 histtype = "step")
        plt.plot(ecdf1.x, ecdf1.y, label = "ECDF (Control)", lw = 4)
        plt.plot(ecdf2.x, ecdf2.y, label = "ECDF (Test)", lw = 4)
        plt.title('Empirical distributions')
        plt.xlabel(r"Cholesterol level [mg dL$^{-1}$]")
        plt.legend(loc = 'best')
        plt.savefig('hw3_1.png')
        plt.close()

    k1 = np.argmin(np.abs(ecdf1.y - 0.5))
    med1 = ecdf1.x[k1]
    prob_med1 = (ecdf1.y[k1 + 1] - ecdf1.y[k1 - 1])/(ecdf1.x[k1 + 1] - ecdf1.x[k1 - 1])
    k2 = np.argmin(np.abs(ecdf2.y - 0.5))
    med2 = ecdf2.x[k2]
    prob_med2 = (ecdf2.y[k2 + 2] - ecdf2.y[k2 - 2])/(ecdf2.x[k2 + 2] - ecdf2.x[k2 - 2])

    return med1, prob_med1, med2, prob_med2

def get_bootstrap(data, Nsamples = 1000):
    Ndata = len(data)
    #Winsorize at the 95% level for the sample median distribution
    data_Winsorized = np.array(winsorize(data, limits = 0.025))
    mean = median = std = mad = np.zeros(Nsamples)
    index = np.arange(Ndata)
    j = np.random.choice(index, size = [Ndata, Nsamples], replace = True)
    mean = data[j].mean(axis = 0)
    std = data[j].std(axis = 0, ddof = 1)
    median = np.median(data_Winsorized[j], axis = 0)
    mad = median_absolute_deviation(data_Winsorized[j], axis = 0)
    databoot = {"mean": mean, "median": median, "std": std, "mad": mad}
    return databoot

def plotStuff(chol1mean, chol2mean, CI):
    plt = setPlotParams()
    n1, bins1, patches1 = plt.hist(chol1mean, histtype = 'step', fc = 'black')
    n2, bins2, patches2 = plt.hist(chol2mean, histtype = 'step', fc = 'orange')
    #Overlay the 95% confidence intervals for the population means
    yloc = 0.5*np.min(np.array([np.max(n1), np.max(n2)]))
    plt.plot(np.array([CI[0][0], CI[0][1]]), np.array([yloc, yloc]), c = 'black', lw = 4)
    plt.plot(np.array([CI[2][0], CI[2][1]]), np.array([yloc, yloc]), c = 'orange', lw = 4)
    plt.savefig('blooddata_bootstrap.png', bbox_inches = 'tight')
    plt.close()

def get_CI(data, location = "mean"):
    #Get 95% CI assuming normality (95% then corresponds to 1.96-sigma).
    #   The input object is a dictionary with the mean, median, and stds from bootstrap.
    #   The location keyword can be set to "median" or "std" to compute CIs for those
    #       statistics instead.
    mean = data[location].mean()
    std = data[location].std(ddof = 1)
    CI = mean + 1.96*std*np.array([-1, 1])

    return CI

def bloodfat():
    chol1, chol2 = get_bloodfat_data()

    #Answer to part (a) -- plot the histograms and ECDFs.
    med1, prob_med1, med2, prob_med2 = get_ECDF_info(chol1, chol2)
    #Answer to part (b) -- find the medians.
    print("The medians computed from the ECDF are {} (control) and {} (test).".\
          format(np.round(med1, decimals = 2), np.round(med2, decimals = 2)))
    #Answer to part (c) -- perform bootstrap and print out the mean and std of the sample means.
    chol1boot = get_bootstrap(chol1)
    chol2boot = get_bootstrap(chol2)
    print("The sample mean and std for the sample means are {}, {} (control) and {}, {} (test).".\
          format(np.round(chol1boot["mean"].mean(), decimals = 2),\
                 np.round(chol1boot["mean"].std(ddof = 1), decimals = 2), \
                 np.round(chol2boot["mean"].mean(), decimals = 2),\
                 np.round(chol2boot["mean"].std(ddof = 1), decimals = 2)))

    #Answer to part (d) -- compute 95% CIs.
    chol1_mean_CI = get_CI(chol1boot)
    chol2_mean_CI = get_CI(chol2boot)
    print("95% CI for mean of control sample: {}".format(chol1_mean_CI))
    print("95% CI for mean of test sample: {}".format(chol2_mean_CI))
    flag1 = (chol1_mean_CI[0] <= chol2_mean_CI[0]) and (chol1_mean_CI[1] >= chol2_mean_CI[0])
    flag2 = (chol2_mean_CI[0] <= chol1_mean_CI[0]) and (chol2_mean_CI[1] >= chol1_mean_CI[0])
    if ((flag1) or (flag2)):
        print("The two CIs overlap, consistent with the means being indentical to within 95%.")
    else:
        print("No overlap between CIs, consistent with distinct means.")

    #Answer to part (e) -- mean and variance of \bar{x}_test - \bar{x}_control
    mean_diff = chol2boot["mean"].mean() - chol1boot["mean"].mean()
    std_diff = np.sqrt(chol2boot["mean"].std(ddof = 1)**2 + chol1boot["mean"].std(ddof = 1)**2)
    print("Mean and std for the difference in means: {} and {}".\
          format(np.round(mean_diff, decimals = 2), np.round(std_diff, decimals = 2)))

    #Answer to part (f): sample distributions for the sample medians, then mean and std of these.
    #   Then 95% CIs for these.
    #The bootstrap distributions for the medians have already been conputed for the Winsorized samples.
    chol1_med_mean = chol1boot["median"].mean()
    chol1_med_std = chol1boot["median"].std(ddof = 1)
    chol1_med_CI = get_CI(chol1boot, location = "median")
    chol2_med_mean = chol2boot["median"].mean()
    chol2_med_std = chol2boot["median"].std(ddof = 1)
    chol2_med_CI = get_CI(chol2boot, location = "median")
    print("Mean and std for the distribution of medians:\n")
    print("Control... mean: {}, std: {}\n Test... mean: {}, std: {}".\
          format(np.round(chol1_med_mean, decimals = 2), np.round(chol1_med_std, decimals = 2),\
                 np.round(chol2_med_mean, decimals = 2), np.round(chol2_med_std, decimals = 2)))
    print("95% CIs for the population medians:\n")
    print("Control: {}, test: {}".format(np.round(chol1_med_CI, decimals = 2),\
                                         np.round(chol2_med_CI, decimals = 2)))
    flag1 = (chol1_med_CI[0] <= chol2_med_CI[0]) and (chol1_med_CI[1] >= chol2_med_CI[0])
    flag2 = (chol2_med_CI[0] <= chol1_med_CI[0]) and (chol2_med_CI[1] >= chol1_med_CI[0])
    if ((flag1) or (flag2)):
        print("The two CIs overlap, consistent with the medians being indentical to within 95%.")
    else:
        print("No overlap between CIs, consistent with distinct medians.")