%% Example 1 - Bootstrap Aspirin
%  Bootstrap confidence interval for the heart-attack risk ratio
%  (aspirin vs. placebo) used for the heart attack example in the slides.
%
%  Each group is encoded as a binary vector (1 = heart attack). Both
%  vectors are resampled with replacement n_boot times, the ratio of
%  heart-attack rates is recomputed for every resample, and the 2.5% and
%  97.5% order statistics of those ratios form a percentile 95% interval.
clear      % variables only; 'clear all' also flushes cached functions
close all
aspirin_heart = 104;
aspirin_total = 11037;

placebo_heart = 189;
placebo_total = 11034;

% like flipping a coin...
asp_data = zeros(aspirin_total,1); % 11037 total subjects, all zero
asp_data(1:aspirin_heart) = 1; % set 104 subjects to 1 (1 = heart attack)

placebo_data = zeros(placebo_total, 1); % 11034 total subjects, all zero
placebo_data(1:placebo_heart) = 1; % 189 had heart attacks

ratio_empirical = (aspirin_heart/aspirin_total)/(placebo_heart/placebo_total); % Statistic for the original sample
n_boot = 10000; % Number of times to perform resampling
ratio_boot = zeros(n_boot, 1);
for boot = 1:n_boot
    % The third argument is the documented logical "with replacement" flag
    boot_asp = randsample(asp_data, aspirin_total, true); % resample aspirin data
    boot_placebo = randsample(placebo_data, placebo_total, true); % resample placebo data
    n_boot_asp = sum(boot_asp);
    n_boot_placebo = sum(boot_placebo);
    ratio_boot(boot) = (n_boot_asp/aspirin_total)/(n_boot_placebo/placebo_total); % Statistic for resampled data
end

% Find 95% confidence interval
ratio_boot_sorted = sort(ratio_boot); % Sort bootstrap ratios from lowest to highest
% Clamp to 1 so a small n_boot cannot produce an invalid index of 0
lower_bound_ind = max(1, floor(0.025 * n_boot)); % Index of 2.5% value
upper_bound_ind = max(1, floor(0.975 * n_boot)); % Index of 97.5% value

lower_bound = ratio_boot_sorted(lower_bound_ind); % Value at the 2.5% position
upper_bound = ratio_boot_sorted(upper_bound_ind); % Value at the 97.5% position

% Histogram of the bootstrap ratios with the empirical ratio and the
% interval bounds drawn as dashed vertical lines.
figure
hold on
histogram(ratio_boot, 'edgecolor', 'none')
xlabel('ratio')
ylabel('count')
y_span = ylim; % draw the marker lines over the full height of the histogram
plot([1, 1]*ratio_empirical, y_span,'--')
plot([1, 1]*lower_bound, y_span,'--', 'Color', 'Black')
plot([1, 1]*upper_bound, y_span,'--', 'Color', 'Black')

%% Exercise 1 - Bootstrapped model fitting 1.1:
% Simulate the data set that the rest of Exercise 1 models: 20 points on a
% straight line (slope 1, intercept 2) with Gaussian noise (SD 3) added.
clear
rng(2019) % fix the seed of the MATLAB random number generator
n_pts = 20; % number of simulated observations
true_slope = 1;
true_intercept = 2;
noise_sd = 3;
X = transpose(1:n_pts); % predictor values as a column vector
Y = true_slope*X + true_intercept + noise_sd*randn(n_pts,1); % noisy responses

% Show the raw data in a fresh figure
close all
figure
scatter(1:n_pts, Y, 'k')
axis('square')

%% Exercise 1 - Bootstrapped model fitting 1.2:
% Fit a straight line to the original sample with regress(Y, model) and
% draw it over the data in a new figure.

% Design matrix: a column of ones (intercept) followed by the x values
model = horzcat(ones(n_pts,1), X);
beta_emp = regress(Y, model); % [intercept; slope] of the linear regression
emp_fit = model * beta_emp;   % fitted y value at every x

figure
scatter(1:n_pts, Y, 'k')
hold on
plot(X, emp_fit)
axis('square')

%% Exercise 1 - Bootstrapped model fitting 1.3:
% Bootstrap the regression 500 times. Store the beta values of every boot
% (column 1 = y-intercept, column 2 = slope) for the histograms below, and
% store the fitted line of every boot so it can be plotted afterwards.

n_boot = 500;
boot_beta = zeros(n_boot, 2);    % one row of fit coefficients per boot
boot_fit = zeros(n_boot, n_pts); % one row of fitted y values per boot


% -- CODE HERE --


% Overlay the fitted line of every bootstrap on the original data
figure
scatter(1:n_pts, Y, 'k') % original data
hold on
plot(boot_fit.', 'b') % each column of the transpose is one bootstrap fit
axis('square')

%% Exercise 1 - Bootstrapped model fitting 1.4:
% Show the original data together with the empirical fit (the fit to the
% original data set), using the spread of the bootstrap fits at each x as
% the error bar.

% Standard deviation across boots (rows) for every data point (column)
boot_fit_std = std(boot_fit, 0, 1);

figure
hold on
scatter(1:n_pts, Y, 'k')
errorbar(1:n_pts, emp_fit, boot_fit_std, 'CapSize', 0)
axis('square')

%% Exercise 1 - Bootstrapped model fitting 1.5:
% Histograms of the bootstrapped coefficients, side by side.
figure

% Left panel: second beta (slope)
subplot(1,2,1)
histogram(boot_beta(:,2))
title('slope')

% Right panel: first beta (y-intercept)
subplot(1,2,2)
histogram(boot_beta(:,1))
title('intercept')

%% Example 2 - Permutation test paired t-test
%  IQ example from the slides: every subject is measured before and after
%  taking a drug, and we ask whether the mean within-subject change is
%  larger than expected by chance.
%
%  The design is paired, so the null distribution is built by exchanging
%  the before/after labels WITHIN each subject. Pooling all 2*n_subj values
%  and shuffling across subjects would break the pairing and mix
%  between-subject variability into the null.
clear
close all

% Let's define some data that does have an effect of a drug
n_subj = 20; % number of participants
drug_before = 100 + 5*randn(n_subj,1); % IQ before drug
drug_after = drug_before + 3.2 + 2*randn(n_subj,1); % IQ after drug

% Let's visualize the data (one line per subject)
figure
plot([drug_before, drug_after]' ,'k' ,'linewidth', 2)
set(gca,'xtick',1:2, 'xticklabel',{'before','after'})
xlim([0.9,2.1])
ylabel('IQ')

% Calculate statistic that we can use to assess effect of drug on IQ
emp_mean_diff = mean(drug_after-drug_before);

% Perform permutation / randomization
n_perm = 10000;
all_data = [drug_before, drug_after]; % one row per subject: [before, after]
perm_diffs = zeros(n_perm,1);
for perm = 1:n_perm
   swap = rand(n_subj,1) < 0.5; % subjects whose labels are exchanged

   perm_before = all_data(:,1); % start from the true labelling ...
   perm_after = all_data(:,2);
   perm_before(swap) = all_data(swap,2); % ... and swap within the chosen subjects
   perm_after(swap) = all_data(swap,1);

   perm_diffs(perm) = mean(perm_after-perm_before); % calculate new statistic
end

% Plot the results (paired data of the last permutation)
figure
plot([perm_before perm_after]','k' ,'linewidth', 2)
set(gca,'xtick',1:2, 'xticklabel',{'before','after'})
xlim([0.9,2.1])
ylabel('IQ')

figure
hold on
histogram(perm_diffs,'edgecolor','none')
y_span = ylim; % draw the empirical marker over the full histogram height
plot([1 1]*emp_mean_diff, y_span,'r--','linewidth',3)
xlabel('Mean difference')
ylabel('Count')
legend('Null', 'Empirical')

% Compute p value (one-sided)
% Fraction of null values at least as large as the empirical value. The +1
% in numerator and denominator counts the observed labelling itself as one
% permutation, so the estimate can never be exactly zero.

p_val = (sum(perm_diffs >= emp_mean_diff) + 1) / (n_perm + 1);

%% Exercise 2 - Permutation test continuous data fitting 2.1
%  Simulate 20 points with a linear relationship plus noise, fit a linear
%  regression to them, and display both the fit and its coefficients.
clear
close all
rng(2019) % fix the seed of the MATLAB random number generator
n_pts = 20; % number of simulated observations
X = transpose(1:n_pts); % predictor values as a column vector
Y = X + 2 + 3*randn(n_pts,1); % slope 1, intercept 2, noise SD 3

% Linear regression: beta(1) is the y-intercept, beta(2) is the slope
model = horzcat(ones(n_pts,1), X);
beta = regress(Y, model);

% Data (one colour per point) with the fitted line on top
figure
hold on
cmap = parula(n_pts);
scatter(X, Y, 50, cmap, 'filled')
plot(X, model*beta)
legend('data','fit')
axis('square')
disp(beta)

%% Exercise 2 - Permutation test continuous data fitting 2.2
% Are the fitted coefficients (y-intercept and slope) different from
% chance? Run a permutation test and store the fit coefficients of every
% permutation in beta_perm (column 1 = intercept, column 2 = slope).
n_perm = 10000;
beta_perm = zeros(n_perm, 2);

% -- CODE HERE --

% Null histograms of the coefficients with the empirical value marked
figure

% Left panel: slope
subplot(1,2,1)
hold on
histogram(beta_perm(:,2), 'edgecolor', 'none')
plot([beta(2) beta(2)], [0 500], '--', 'linewidth', 3)
legend('10k shuff null', 'empirical')
ylim([0,800])
title('Slope')

% Right panel: intercept
subplot(1,2,2)
hold on
histogram(beta_perm(:,1), 'edgecolor', 'none')
plot([beta(1) beta(1)], [0 500], '--', 'linewidth', 3)
ylim([0,800])
title('Intercept')

%% Exercise 2 - Permutation test continuous data fitting 2.3
% How do we compute p-values?
% Find how many values of shuffled slope coeffs are higher/lower than true
% value

% -- CODE HERE --

%% Example 3 - Permutation test EEG data spectrogram
%  Pixel-wise permutation test on the matrix 'eeg' loaded from eeg.mat
%  (displayed with Time on the x axis and Frequency on the y axis).
%
%  A null distribution is built for every pixel by shuffling all pixel
%  values n_perm times. Pixels whose observed value falls below the 2.5th
%  or above the 97.5th percentile of their null are kept in the final
%  thresholded image; all other pixels are set to zero.
clear      % variables only; 'clear all' also flushes cached functions
close all
load eeg.mat

% Original spectrogram
imagesc(eeg)
set(gca,'xtick',[],'ytick',[])
xlabel('Time')
ylabel('Frequency')
h=colorbar;
axis square
ylabel(h,'Relative power')


n_perm = 5000;
perm_eeg = zeros(size(eeg,1),size(eeg,2),n_perm); % One shuffled image per permutation
n_eeg = numel(eeg);
for perm = 1:n_perm
    perm_ind = randperm(n_eeg,n_eeg); % random ordering of all pixel indices
    perm_eeg(:,:,perm) = reshape(eeg(perm_ind),size(eeg));
end

% Plot three examples
figure
for plt = 1:3
    subplot(1,3,plt)
    imagesc(perm_eeg(:,:,plt))
    set(gca,'ytick',[],'xtick',[])
    axis square
end

% Null distribution of a single pixel compared with its observed value
figure
hold on
histogram(perm_eeg(1,1,:)) % grab every null value at the first pixel
y_span = ylim; % draw the marker over the full histogram height
plot([1, 1]*eeg(1), y_span,'--','linewidth',2) % plot the actual value
legend('Null distribution for pixel 1','Empirical value')

% Find 2.5 and 97.5 percentile for each pixel
thresh_low = prctile(perm_eeg, 2.5, 3);   % 2.5th percentile along the 3rd (permutation) dimension
thresh_high = prctile(perm_eeg, 97.5, 3); % 97.5th percentile along the 3rd dimension

% New figure so the bound maps do not replace the pixel-1 histogram above
figure
subplot(121)
imagesc(thresh_low,[min(eeg(:)), max(eeg(:))])
colorbar
title('lower bound')
axis square
set(gca,'xtick',[],'ytick',[])

subplot(122)
imagesc(thresh_high, [min(eeg(:)), max(eeg(:))])
colorbar
title('upper bound')
axis square
set(gca,'xtick',[],'ytick',[])

% plot the thresholded image
% These are the values that are statistically different than the null
figure
mask = eeg<thresh_low | eeg>thresh_high; % true where the pixel is outside its null interval
thresholded = eeg .* mask; % pixels inside the interval become 0
imagesc(thresholded, [min(eeg(:)), max(eeg(:))])
h = colorbar; % handle of THIS figure's colorbar, so the label lands here
axis square
set(gca,'xtick',[],'ytick',[])
xlabel('Time')
ylabel('Frequency')
ylabel(h,'Relative power')

%% Exercise 3 - Cross-validation model fitting 3.1
%  regress1.mat holds 2D data (x, y) that we want to fit with polynomials;
%  the goal of this exercise is to find the polynomial order that fits
%  best. This first section only loads the data and shows it.

% Start from a clean workspace and load the data
clear all
close all
load('regress1.mat')

% Scatter plot of the raw data
figure
scatter(x, y)

%% Exercise 3 - Cross-validation model fitting 3.2
% Fit polynomial linear models of order 0 to 5 to the full data set,
% without cross validation. The result is the baseline against which the
% cross-validated choice of the 'best' model is compared later.

% Regressors: column k holds x.^(k-1); model 'mdl' uses the first mdl columns
n_pts = size(y, 1); % Number of data points
XX = horzcat(ones(n_pts,1), x, x.^2, x.^3, x.^4, x.^5);
n_models = size(XX, 2); % Number of regressors (should be 6)

fit_mat = zeros(n_models, n_pts); % predicted y of every model, one row each
mse_vec = zeros(n_models, 1);     % mean squared error of every model

for mdl = 1:n_models % Loop through each polynomial order
    regressors = XX(:, 1:mdl);       % columns needed for order mdl-1
    coeffs = regress(y, regressors); % least-squares coefficients
    y_hat = regressors * coeffs;     % predicted y at every data point
    fit_mat(mdl, :) = y_hat;
    mse_vec(mdl) = mean((y - y_hat).^2);
end

% Visualize fits - which one looks best?
figure
for mdl = 1:n_models
    subplot(2, 3, mdl)
    hold on
    scatter(x, y)
    plot(x, fit_mat(mdl, :), 'LineWidth', 2)
    title(['Order: ', int2str(mdl-1)])
end

% Visualize MSE for each fit - which one is best?
figure
order = 0:5;
plot(order, mse_vec, '-o', 'LineWidth', 2, 'Markersize', 10)
xlabel('Order')
ylabel('MSE')

%% Exercise 3 - Cross-validation model fitting 3.3
% Repeat the model fitting with leave-one-out cross validation and then
% re-evaluate the models.

% Same polynomial regressors as before (orders 0 to 5)
n_pts = size(y, 1); % Number of data points
XX = horzcat(ones(n_pts,1), x, x.^2, x.^3, x.^4, x.^5);
n_models = size(XX, 2); % Number of regressors (should be 6)
n_folds = n_pts; % Leave-one-out: one fold per data point

% MSE of every model (column) on every cross-validation fold (row)
mse_xval = zeros(n_folds, n_models);

% -- CODE HERE --

% Plot the MSE and error bars
% We want the model that minimizes error AND is the simplest.
% The simplest model that lies within 2 standard errors of the minimum MSE
% will be chosen as the 'best' one.

mse_mean = mean(mse_xval);               % mean MSE of every model
mse_2se = 2*sqrt(var(mse_xval)/n_pts);   % two standard errors of that mean

figure
errorbar(0:5, mse_mean, mse_2se, 'capsize', 0, 'linewidth', 3)
xlabel('Polynomial order')
ylabel('MSE')

% Which model looks best? Can you think of a way to statistically determine
% which order is best?

%% Exercise 3 - Cross-validation model fitting 3.4
% One way to statistically determine the best model is to find the lowest
% mean MSE (in our case, this is order 4), determine the 95% confidence 
% interval upper threshold of that MSE, and then find the lowest order
% model that has a confidence interval overlapping with this threshold

% Find LOWEST MSE

% -- CODE HERE --

% Then find 95% confidence interval upper threshold of that MSE. We can
% approximate the 95% confidence interval by using 2*sqrt(var/N)

% -- CODE HERE --

% Get the index of the smallest order polynomial that lies within
% thresh_mse

% -- CODE HERE --

% Which model is best?