# Overview

## Ridge regression

We try to minimize

$\sum_{i=1}^n \Big( Y_i - X_i \beta \Big)^2 + \lambda \sum_p \beta_p^2$

## checking the bias and variance

To study the bias and variance of the ridge estimator, we simulate many datasets from a sparse linear model and refit it over a grid of penalties $\lambda$:

set.seed(43215)

# let's consider a simple model with p independent regressors;
# only the first p of the p2 observed regressors have non-zero coefficients
p    = 10
beta = rev(sort(rnorm(p)))   # true coefficients, in decreasing order
n    = 20                    # sample size
p2   = 18                    # observed regressors (p2 - p are pure noise)
# noise standard deviation, used as sigma*rnorm(n) below.
# NOTE(review): sigma is used later but never defined in the extracted
# source; assuming 1 — confirm against the original document.
sigma = 1

# let's regularize it

# simulate 500 datasets for each of 20 values of lambda, fit ridge on each,
# and keep the estimated vs true coefficient for every regressor
rr = data.table(
  foreach (lambda = seq(0, 5, l = 20), .combine = rbind) %:%
    foreach (i = 1:500, .combine = rbind) %do% {
      X = array(rnorm(p2 * n), c(n, p2))
      Y = X %*% c(beta, rep(0, p2 - p)) + sigma * rnorm(n)
      fit = lm.ridge(Y ~ 0 + X, lambda = lambda)
      res = data.frame(value = coef(fit), value0 = c(beta, rep(0, p2 - p)))
      res$rep = i
      res$name = rownames(res)
      res$lambda = lambda
      res
    })

# bias^2, variance and MSE of each coefficient at each lambda
rs = rr[, list(bias = mean(value - value0)^2,
               var  = var(value - value0),
               mse  = mean((value - value0)^2)),
        list(name, lambda)]

# MSE (black), bias^2 (red) and variance (blue) for the first coefficient;
# the dashed vertical line marks the MSE-minimizing lambda
ggplot(rs[name == "X1"], aes(x = lambda, y = mse)) + geom_line() + theme_bw() +
  geom_line(aes(y = bias), color = "red") +
  geom_line(aes(y = var), color = "blue") +
  # scale_y_log10() +
  geom_vline(xintercept = rs[name == "X1"][which.min(mse), lambda], linetype = 2)

# sampling distribution of the first coefficient at a few lambda values
ls = unique(rr$lambda)[c(1, 5, 10, 15)]
ggplot(rr[name == "X1"][lambda %in% ls],
       aes(x = value, group = lambda, fill = lambda, color = lambda)) +
  geom_density(alpha = 0.3) +
  geom_vline(xintercept = beta[1], linetype = 2) + theme_bw() + xlim(-1, 3)
## Warning: Removed 76 rows containing non-finite values (stat_density).

# looking at the results in this case - extracting the best lambda
# (average MSE across the p truly non-zero coefficients, minimized over lambda)
r2 = rs[name %in% paste("X", 1:p, sep = ""), mean(mse), lambda]
lambda_star = r2[, {I = which.min(V1); lambda[I]}]
beta0 = c(beta, rep(0, p2 - p))

# draw a new noisy outcome from the true model
Y = X %*% beta0 + sigma * rnorm(n)
# ridge at best lambda
fit  = lm.ridge(Y ~ 0 + X, lambda = lambda_star)
# OLS (ridge with zero penalty)
fit2 = lm.ridge(Y ~ 0 + X, lambda = 0)
# remove noise to get the truth
Y = X %*% beta0
fit3 = lm(Y ~ 0 + X)

# combine the three fits into one wide table, one row per fit
rr2 = do.call(rbind, list(
  data.frame(as.list(coef(fit)),  name = "ridge"),
  data.frame(as.list(coef(fit2)), name = "ols"),
  data.frame(as.list(coef(fit3)), name = "true")))
rr2$name = paste(rr2$name)

# reshape to long format and plot coefficient index vs estimated value
rr3 = melt(rr2, id.vars = "name")
rr3$var = as.integer(str_replace(rr3$variable, "X", ""))
ggplot(rr3, aes(x = var, y = value, color = name)) +
  geom_point() + geom_line() + theme_bw()

## Running cross-validation

# draw a fresh sample from the same sparse linear model
X = array(rnorm(p2*n),c(n,p2));
Y = X %*% c(beta,rep(0,p2-p)) + sigma*rnorm(n)

# compute the ridge fit (alpha=0, no intercept) with cross-validation
# NOTE(review): the original comment said "Lasso", but alpha=0 is ridge
cvfit <- glmnet::cv.glmnet(X, Y,alpha=0,intercept=FALSE)
## Warning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations
## per fold
# CV-selected penalty and the corresponding coefficients
cvfit$lambda.min
## [1] 1.062757
plot(cvfit)
coef(cvfit)
## 19 x 1 sparse Matrix of class "dgCMatrix"
##                       1
## (Intercept)  .
## V1           0.46242929
## V2           0.04783239
## V3          -0.14408576
## V4          -0.36307332
## V5          -0.06893057
## V6          -0.23056429
## V7           0.05696741
## V8          -0.68938139
## V9          -1.03188955
## V10         -0.51811963
## V11          0.45414162
## V12          0.20980480
## V13          0.08733596
## V14         -0.20877095
## V15          0.15231189
## V16          0.23435175
## V17         -0.12264805
## V18         -0.38068093

## Lasso

We now minimize

$\sum_{i=1}^n \Big( Y_i - X_i \beta \Big)^2 + \lambda \sum_p | \beta_p |$

# compute the Lasso with cross-validation
cvfit <- glmnet::cv.glmnet(X, Y)
## Warning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations
## per fold

# attach the lasso coefficients (at lambda.1se, intercept dropped) as a
# fourth row of rr2
rr2 = rbind(rr2, rr2[1, ])
rr2[4, 1:p2] = as.matrix(coef(cvfit, s = "lambda.1se"))[2:(p2 + 1)]
rr2$name[4] <- "lasso"

# reshape to long format and compare ridge / ols / true / lasso coefficients
rr3 = data.table(melt(rr2, id.vars = "name"))
rr3 <- rr3[, var := as.integer(str_replace(rr3$variable, "X", ""))]
ggplot(rr3, aes(x = var, y = value, color = name)) +
  geom_line() + theme_bw() + geom_point()

# Applying to Hitter database

Borrowed from the book ISLR, Chapter 6 Lab 2: Ridge Regression and the Lasso

library(ISLR)
# remove NA
Hitters = Hitters[!is.na(Hitters$Salary), ]

library(corrgram)
# correlogram of the Hitters covariates.
# NOTE(review): the opening of this call was truncated in the extracted
# source; reconstructed from the standard corrgram example — confirm the
# first arguments against the original document.
corrgram(Hitters, order = TRUE, lower.panel = panel.shade,
         upper.panel = panel.pie, text.panel = panel.txt)

# prepare the data for glmnet
x = model.matrix(Salary ~ ., Hitters)[, -1]
y = Hitters$Salary

## Ridge Regression

# prepare a grid for lambda
grid = 10^seq(10, -2, length = 100)
ridge.mod = glmnet(x, y, alpha = 0, lambda = grid)
# dim(coef(ridge.mod)) # coef is 20x100

# show a subset of the coefficient paths (every 11th lambda), lambda on a
# log10 scale
coefs = t(as.array(coef(ridge.mod)[, seq(1, 100, 11)]))
dt = data.table(coefs)
dt$lambda = log(ridge.mod$lambda[seq(1, 100, 11)]) / log(10)
knitr::kable(dt, digits = 4) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = F)
## (rendered table: columns (Intercept), AtBat, Hits, ..., NewLeagueN, lambda;
##  as log10(lambda) falls from 10 to -2 the coefficients grow from ~0 toward
##  their OLS values — see the original document for the full table)

## Using cross-validation with training and testing sets

# split the sample in 2
set.seed(1)
train = sample(1:nrow(x), nrow(x)/2)
test = (-train)
y.test = y[test]

# compute the ridge regression using the train data
ridge.mod = glmnet(x[train, ], y[train], alpha = 0, lambda = grid, thresh = 1e-12)

# compute test/train MSE and coefficient norm along the whole grid
dd = data.frame()
for (l in grid) {
  cc = predict(ridge.mod, s = l, newx = x[test, ], type = "coefficients")
  cmse = sum(cc[2:20, ]^2)   # squared L2 norm of the slopes (intercept excluded)
  ridge.pred = predict(ridge.mod, s = l, newx = x[test, ])
  v1 = mean((ridge.pred - y.test)^2)
  y.train = predict(ridge.mod, s = l, newx = x[train, ])
  v2 = mean((y[train] - y.train)^2)
  # NOTE(review): the original stored i = i here, but no i is defined in this
  # loop — dropped to avoid an undefined-variable error
  dd = rbind(dd, data.frame(l = l, test = v1, train = v2, cmse = cmse))
}
ggplot(dd, aes(x = l, y = test))  + geom_line() + scale_x_log10()
ggplot(dd, aes(x = l, y = train)) + geom_line() + scale_x_log10()
ggplot(dd, aes(x = l, y = cmse))  + geom_line() + scale_x_log10()

# compare the MSE for ridge at s=0 (no penalty) to OLS
ridge.pred = predict(ridge.mod, s = 0, newx = x[test, ])
mean((ridge.pred - y.test)^2)
## [1] 114723.6
lm(y ~ x, subset = train)
## (OLS coefficients, e.g. (Intercept) 299.42849, AtBat -2.54027, ...;
##  very close to the s=0 ridge coefficients below)
predict(ridge.mod, s = 0, type = "coefficients")[1:20, ]
## (ridge coefficients at s=0, e.g. (Intercept) 299.44467, AtBat -2.53538, ...;
##  essentially the OLS fit)

# use the library cross-validation
set.seed(1)
cv.out = cv.glmnet(x[train, ], y[train], alpha = 0)
plot(cv.out)
# get best lambda
bestlam = cv.out$lambda.min
# CV-selected penalty
bestlam
## [1] 211.7416
# compute MSE at best
ridge.pred=predict(ridge.mod,s=bestlam,newx=x[test,])
mean((ridge.pred-y.test)^2)
## [1] 96015.51
# refit on the full sample at the CV-selected lambda and keep the
# 20 coefficients (intercept + 19 slopes) for later comparison
ridge_best = glmnet(x,y,alpha=0,lambda = bestlam)
ridge_best = predict(ridge_best,type="coefficients",s=bestlam)[1:20,]

## The Lasso

# plot coefficients for all values of lambda
lasso.mod=glmnet(x[train,],y[train],alpha=1,lambda=grid)
plot(lasso.mod)

# cross-validated MSE as a function of lambda
cv.out=cv.glmnet(x[train,],y[train],alpha=1)
plot(cv.out)

# extract the best lambda in terms of cross-validation
bestlam = cv.out$lambda.min
lasso.pred = predict(lasso.mod, s = bestlam, newx = x[test, ])
# compute the MSE
mean((lasso.pred - y.test)^2)
## [1] 102442.8

# show only coefs different from 0
out = glmnet(x, y, alpha = 1, lambda = grid)
lasso_best = predict(out, type = "coefficients", s = bestlam, alpha = 1)[1:20, ]
lasso_best[lasso_best != 0]
## (Intercept)        Hits       Walks       CRuns        CRBI   DivisionW
##  58.4539351   1.7103295   1.9950820   0.1910196   0.3856452 -69.8884607
##     PutOuts
##   0.1727048

Let's compare the different parameters

rrf = rbind(ridge_best, lasso_best)

# PCA

# the season/career stat columns used for the PCA (defined once rather than
# repeating the list)
vars = c('AtBat','Hits','HmRun','Runs','RBI','Walks','Years','CAtBat','CHits',
         'CHmRun','CRuns','CRBI','CWalks','PutOuts','Assists','Errors')
pca = prcomp(Hitters[, vars])
X = as.matrix(Hitters[, vars])
ee = eigen(t(X) %*% X)

# a synthetic 3d point cloud that is essentially 1-dimensional
set.seed(1)
A <- rnorm(500)
B <- -1 * A
C <- 0.2 * B - 1.5 * A
pts <- cbind(X = rnorm(500, A, .05), Y = rnorm(500, B, .05), Z = rnorm(500, C, .05))
pca2 = prcomp(pts)

# Kmeans

library(tripack)
library(RColorBrewer)
set.seed(1)
# five vertical bands of points, centered at x = .1, .3, .5, .7, .9
pts <- cbind(X = rnorm(500, rep(seq(1, 9, by = 2)/10, 100), .022),
             Y = rnorm(500, .5, .15))
plot(pts)
km1 <- kmeans(pts, centers = 5, nstart = 1, algorithm = "Lloyd", iter.max = 200)
CL5 <- brewer.pal(5, "Pastel1")
# Voronoi tessellation induced by the cluster centers
V <- voronoi.mosaic(km1$centers[, 1], km1$centers[, 2])
P <- voronoi.polygons(V)
plot(pts, pch = 19, xlim = 0:1, ylim = 0:1, xlab = "", ylab = "",
     col = CL5[km1$cluster])
points(km1$centers[, 1], km1$centers[, 2], pch = 3, cex = 1.5, lwd = 2)
plot(V, add = TRUE)

# five well-separated cluster centers on a 2d grid:
# (.2,.2), (.2,.8), (.5,.5), (.8,.2), (.8,.8), 100 points each
set.seed(1)
A <- rep(c(.2, .2, .5, .8, .8), each = 100)
B <- rep(c(.2, .8, .5, .2, .8), each = 100)
pts <- cbind(X = rnorm(500, A, .075), Y = rnorm(500, B, .075))

The nice thing about k-means is that it will adapt to the complexity of the problem.

# points scattered along a curved 1d manifold (y = 0.5 x^10)
set.seed(1)
A <- runif(500)
B <- 0.5 * A^10
pts <- cbind(X = rnorm(500, A, .05), Y = rnorm(500, B, .05))

km1 <- kmeans(pts, centers = 5, nstart = 1, algorithm = "Lloyd", iter.max = 200)
CL5 <- brewer.pal(5, "Pastel1")
# Voronoi tessellation induced by the cluster centers
V <- voronoi.mosaic(km1$centers[, 1], km1$centers[, 2])
P <- voronoi.polygons(V)
# NOTE(review): the original fused plot() and points() onto one line and
# contained a literal "\$" (a syntax error from extraction); split and fixed
plot(pts, pch = 19, xlab = "", ylab = "", col = CL5[km1$cluster])
points(km1$centers[, 1], km1$centers[, 2], pch = 3, cex = 1.5, lwd = 2)
plot(V, add = TRUE)

Fork me on GitHub