# Code for “Don’t Overfit!” competition

Here you can find the code for my entry in the “Don’t Overfit!” prediction competition. For this competition I used Bayesian inference, implemented using Gibbs sampling. For details, see my earlier post.

## Gibbs sampler

```matlab
function [pred,sel]=doGibbs(ytrain,xtrain,xtest) %#codegen
% DOGIBBS Gibbs sampler for a Bayesian latent-index model with
% spike-and-slab variable selection, beta constrained to [0,1].
%
% Inputs:
%   ytrain - ntrain-by-1 logical (0/1) vector of training labels
%   xtrain - ntrain-by-k matrix of regressors; values exactly equal to
%            0 or 1 are treated as censored observations below
%   xtest  - ntest-by-k matrix of test regressors, same encoding
%
% Outputs:
%   pred - ntest-by-1 posterior mean probability that each test label is 1
%   sel  - k-by-1 posterior inclusion frequency of each regressor
%          (threshold these for a variable-selection submission)

% number of draws
nrburnin=1e4;   % burn-in sweeps (discarded)
nrsamp=1e5;     % sweeps that contribute to pred/sel

% dimensions
ntest=size(xtest,1);
ntrain=size(xtrain,1);
k=size(xtest,2);

% pre-allocate accumulators
pred=zeros(ntest,1);
sel=zeros(k,1);

% column means over the pooled sample (train + test)
xmean=mean([xtrain; xtest]);

% Rescale xtrain: centre around the pooled mean (note the sign flip:
% xtrain becomes xmean - xtrain).  Values exactly at the 0/1 boundary
% are censored and get wide sampling bounds below; interior values get
% a +/-5e-4 band -- presumably half the rounding step (1e-3) of the
% published data; NOTE(review): confirm against the competition data.
xisdown=(xtrain==1);
xisup=(xtrain==0);
xtrain=repmat(xmean,ntrain,1)-xtrain;
xtrain(xisdown)=xtrain(xisdown)+25e-5;
xtrain(xisup)=xtrain(xisup)-25e-5;

% per-element sampling bounds for the latent (un-rounded) x values
xRef=xtrain;
xtrainlb=xRef-5e-4;
xtrainlb(xisdown)=-0.5;                 % censored at 1: lower bound opens up
xtrainlb(xisup)=xRef(xisup)-25e-5;
xtrainub=xRef+5e-4;
xtrainub(xisup)=0.5;                    % censored at 0: upper bound opens up
xtrainub(xisdown)=xRef(xisdown)+25e-5;

% rescale xtest the same way (no bounds needed: xtest is never resampled)
xtisdown=(xtest==1);
xtisup=(xtest==0);
xtest=repmat(xmean,ntest,1)-xtest;
xtest(xtisdown)=xtest(xtisdown)+25e-5;
xtest(xtisup)=xtest(xtisup)-25e-5;

% start values
err=10;             % slack on the separation constraint, shrunk during burn-in
beta=rand(k,1);
isIncl=true(k,1);
ylat=xtrain*beta;   % latent index, kept in sync incrementally below

% Gibbs sweeps
for i=1:nrburnin+nrsamp

    % draw all uniforms for this sweep up front
    betarands=rand(k,2);
    xrands=rand(ntrain,k);

    % tighten the error margin towards perfect separation while burning in
    if err>0
        err=max(max(ylat.*~ytrain-ylat.*ytrain),0);
    end

    % ----- sample each beta_j conditional on everything else -----
    for j=1:k

        % subtract out the influence of this regressor
        ylat=ylat-xtrain(:,j)*beta(j);

        % find the interval [betaLow,betaUp] within [0,1] that keeps
        % every training point on the correct side of 0 (up to err slack)
        betaUp=1;
        betaLow=0;
        rat=-ylat./xtrain(:,j);
        for q=1:ntrain
            if xtrain(q,j)>0
                margin=err/xtrain(q,j);
                if ytrain(q)
                    % y = true: constraint binds from below
                    if rat(q)-margin>betaLow
                        betaLow=rat(q)-margin;
                    end
                else
                    % y = false: constraint binds from above
                    if rat(q)+margin<betaUp
                        betaUp=rat(q)+margin;
                    end
                end
            elseif xtrain(q,j)<0
                margin=-err/xtrain(q,j);
                if ytrain(q)
                    if rat(q)+margin<betaUp
                        betaUp=rat(q)+margin;
                    end
                else
                    if rat(q)-margin>betaLow
                        betaLow=rat(q)-margin;
                    end
                end
            end
        end

        % Sample new beta_j: when 0 is still feasible, place a point
        % mass there (eliminate the regressor) with probability
        % 1/(1+prob); otherwise draw uniformly on [betaLow,betaUp].
        prob=betaUp-betaLow;
        if betaLow==0 && betarands(j,1)<=1/(1+prob)
            beta(j)=0;
            isIncl(j)=false;
        else
            beta(j)=betaLow+betarands(j,2)*(betaUp-betaLow);
            isIncl(j)=true;
        end

        % restore this regressor's contribution to the latent index
        ylat=ylat+xtrain(:,j)*beta(j);
    end

    % ----- resample the latent (un-rounded) training x's -----
    for j=1:k
        % excluded regressors (beta_j = 0) do not move ylat; skip them
        if isIncl(j)
            margin=err/beta(j);
            for q=1:ntrain
                % subtract this element's contribution from the latent
                ylat(q)=ylat(q)-xtrain(q,j)*beta(j);

                % start from the rounding/censoring bounds, then tighten
                % with the separation constraint
                xUp=xtrainub(q,j);
                xLow=xtrainlb(q,j);
                xrat=-ylat(q)/beta(j);
                if ytrain(q)
                    if xrat-margin>xLow
                        xLow=xrat-margin;
                    end
                else
                    if xrat+margin<xUp
                        xUp=xrat+margin;
                    end
                end

                % draw uniformly within the bounds and restore the latent
                xtrain(q,j)=xLow+xrands(q,j)*(xUp-xLow);
                ylat(q)=ylat(q)+xtrain(q,j)*beta(j);
            end
        end
    end

    % ----- accumulate posterior draws after burn-in -----
    if i>nrburnin
        % The sum of independent rounding errors (uniform, width 1e-3,
        % variance 1e-6/12 each, scaled by beta) is approximately
        % normal, so integrate it out with the normal CDF via erf.
        stdv=sqrt((1e-6)*sum(beta.^2)/12);
        pred=pred+0.5+0.5*erf((xtest*beta)/(sqrt(2)*stdv));
        sel=sel+double(isIncl); % threshold these for submission
    end
end

% convert accumulated sums into posterior means / frequencies
pred=pred/nrsamp;
sel=sel/nrsamp;

end
```

Published with MATLAB® 7.12