###################################################################### ### by Chuck Anderson for CS545, Fall 2009 ### http://www.cs.colostate.edu/~anderson/cs545 ###################################################################### ###################################################################### ### Function definitions - Start ###################################################################### ###################################################################### ### makeStandardize: returns function with mu and sigma bound makeStandardizeF <- function(X) { if (missing(X)) { cat("Usage: standardize <- makeStandardizeF(X) ## X is nSamples x nDimensions Xs <- standardize(X) X2s <- standardize(X2)\n") return(invisible()) } ## X is nSamples x nDimensions mu <- colMeans(X) sigma <- sd(X) ##sd should be named colSds function(newX) { nr <- nrow(newX) nc <- ncol(newX) (newX - matrix(mu,nr,nc,byrow=TRUE)) / matrix(sigma,nr,nc,byrow=TRUE) } } ###################################################################### ### Function definitions - End ###################################################################### ### Read mpg data from UCI mpg <-read.table("http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data") mpgnames <- c("mpg","cylinders","displacement","horsepower", "weight","acceleration","year","origin","name") colnames(mpg) <- mpgnames ### Remove all samples that have at least one "?" keepRows <- apply(mpg != "?", 1, all) mpg <- mpg[keepRows,] ### Randomly pick 80% of samples for training partition randorder <- sample(nrow(mpg)) nTrain <- round(nrow(mpg)*0.8) trainRows <- randorder[1:nTrain] ### Assemble X and T matrices X <- mpg[trainRows, 2:8] T <- mpg[trainRows, 1, drop=FALSE] ### Convert all values to numeric X <- apply(X,2,as.numeric) T <- apply(T,2,as.numeric) ### Create new last column, named "noisy origin" X <- cbind(X,0) colnames(X)[8] <- "noisy origin" ### Fill new column with noisy version of "origin", using a number of values ### of standard deviation for Gaussian noise. For each value, ### solve for linear model and collect resulting weights result <- c() for (stdevlog in seq(-5,-2,by=0.2)) { X[,8] <- -2*X[,7]+rnorm(nrow(X),0,10^stdevlog) standardize <- makeStandardizeF(X) Xs <- standardize(X) ## Add constant 1 to each sample X1 <- cbind(1,Xs) colnames(X1)[1] <- "bias" ### Solve for linear model w w <- solve(t(X1) %*% X1, t(X1) %*% T) result <- rbind(result, c(stdevlog, w)) } ### plot weights versus log of noise standard deviation matplot(result[,1],result[,-1],type="b", xlab=expression(paste("log ",sigma, " = degree of independence between 8 and 9")), ylab="weights") legend("topright",c("origin","noisy origin"),pch=c("8","9"), col=c("red","green"),lty=2)