source("gradientDescents.R") ###################################################################### ### General functions: makeStandardizeF <- function(X) { if (missing(X)) { cat("Usage: standardize <- makeStandardizeF(X) ## X is nSamples x nDimensions Xs <- standardize(X) X2s <- standardize(X2)\n") return(invisible()) } ## X is nSamples x nDimensions mu <- colMeans(X) sigma <- sd(X) ##sd should be named colSds function(newX) { nr <- nrow(newX) nc <- ncol(newX) (newX - matrix(mu,nr,nc,byrow=TRUE)) / matrix(sigma,nr,nc,byrow=TRUE) } } makeIndicatorVars <- function(Y) { if (!is.matrix(Y)) Y <- matrix(Y) classes <- unique(Y) N <- nrow(Y) K <- length(classes) logicalMatrix <- (matrix(Y,N,K) == matrix(classes,N,K,byrow=TRUE)) mode(logicalMatrix) <- "numeric" ## to convert to numbers 0, 1 logicalMatrix } ######### p(C=k|x) g(X,beta) g <- function(X,beta) { fs <- exp(X %*% beta) # N x K-1 denom <- 1 + rowSums(fs) N <- nrow(X) K1 <- ncol(beta) gs <- fs / matrix(rep(denom,K1),N,K1) cbind(gs,1/denom) } ###################################################################### ## functions to be used with steepest() or scg() logregGrad <- function(beta,X,Ti) { ## beta is nComponents (with bias) x nClasses-1 ## X is nSamples x nComponents (with bias) ## Ti is indicator variables of nSamples x nClasses gs <- g(X,beta) K <- ncol(gs) - t(X) %*% (Ti[,-K] - gs[,-K]) } logregEvalToBeMinimized <- function(beta,X,Ti) { ## beta is nComponents (with bias) x nClasses-1 ## X is nSamples x nComponents (with bias) ## Ti is indicator variables of nSamples x nClasses gs <- g(X,beta) loglike <- sum(Ti * log(gs)) loglike <- loglike / nrow(Ti) likelihood <- exp(loglike) -likelihood ## because this function is to be minimized } ###################################################################### ### Read and Prepare Parkinsons data data <- read.table("http://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/parkinsons.data",header=TRUE,sep=",") data <- as.matrix(data[,-1]) ## remove name and make numeric data <- data[sample(nrow(data)),] ## randomly rearrange data status <- data[,"status"] data <- data[, -which(colnames(data)=="status") ] ## remove status column from data dataHealthy <- data[status==0,] dataParks <- data[status==1,] nHealthy <- nrow(dataHealthy) nParks <- nrow(dataParks) ### Force equal sampling proportion of two classes trainf <- 0.8 Xtrain <- rbind(dataHealthy[1:floor(trainf*nHealthy),], dataParks[1:floor(trainf*nParks),]) Ttrain <- matrix(c(rep(1,floor(trainf*nHealthy)), rep(2,floor(trainf*nParks)))) Xtest <- rbind(dataHealthy[-(1:floor(trainf*nHealthy)),], dataParks[-(1:floor(trainf*nParks)),]) Ttest <- matrix(c(rep(1,nHealthy-floor(trainf*nHealthy)), rep(2,nParks-floor(trainf*nParks)))) standardize <- makeStandardizeF(Xtrain) Xtrains <- standardize(Xtrain) Xtests <- standardize(Xtest) ### Now stuff special to logistic regression TtrainI <- makeIndicatorVars(Ttrain) TtestI <- makeIndicatorVars(Ttest) ## Remove last column of indicator variables for Kth class #TtrainI <- TtrainI[,-ncol(TtrainI)] #TtestI <- TtestI[,-ncol(TtestI)] ### P(C=k|X) = g(X,beta) for logistic regression ###################################################################### Xtrains <- cbind(1,Xtrains) Xtests <- cbind(1,Xtests) graphicsflag <- TRUE methods <- c("steepest","scg","mysteepest") D <- ncol(Xtrain) + 1 K <- 2 alpha <- 0.01 ## for steepest descent, Scaled-Conjugate Gradient doesn't need step size results <- c() for (method in methods) { ## Initial beta beta <- matrix(0,D,K-1) ##beta <- matrix(rnorm(D*(K-1)), D, K-1) if (graphicsflag && method=="mysteepest") x11() if (method == "steepest") { res <- steepest(beta, logregEvalToBeMinimized, logregGrad, Xtrains, TtrainI, stepsize=alpha,fPrecision=0.00001,nIterations=1000000,ftracep=TRUE) results[[method]] <- list(beta=res$x, likelihood=-res$ftrace) beta <- res$x } else if (method == "scg") { res <- scg(beta, logregEvalToBeMinimized, logregGrad, Xtrains, TtrainI, fPrecision=0.00001, nIterations=1000000, ftracep=TRUE) results[[method]] <- list(beta=res$x, likelihood=-res$ftrace) beta <- res$x } else { ## my steepest descent step <- 1 likes <- c() accs <- c() accsTest <- c() if (graphicsflag) par(mfrow=c(1,2)) while (step < 1000) { gs <- g(Xtrains,beta) beta <- beta + alpha * t(Xtrains) %*% (TtrainI[,-K] - gs[,-K]) loglike <- sum(TtrainI * log(gs)) loglike <- loglike / nrow(TtrainI) likelihood <- exp(loglike) likes <- c(likes,likelihood) preds <- apply(gs,1,which.max) accs <- c(accs, sum(preds==Ttrain)/length(Ttrain)) predsTest <- apply(g(Xtests,beta),1,which.max) accsTest <- c(accsTest, sum(predsTest==Ttest)/length(Ttest)) if (graphicsflag && step %% 100 == 0) { plot(likes,type="l",xlab="Steps",ylab="Likelihood") matplot(cbind(accs,accsTest),type="l",xlab="Steps",ylab="Accuracies") } step <- step + 1 } results[[method]] <- list(beta=beta, likelihood=likes) } ## end of method cases ## for all three methods predsTrain <- apply(g(Xtrains,beta),1,which.max) accsTrain <- sum(predsTrain==Ttrain)/length(Ttrain) predsTest <- apply(g(Xtests,beta),1,which.max) accsTest <- sum(predsTest==Ttest)/length(Ttest) results[[method]] <- c(results[[method]], list(accuracies=c(accsTrain, accsTest))) cat("Method ",method," done\n") } resultsTable <- cbind(results$mysteepest$accuracies, results$steepest$accuracies, results$scg$accuracies) colnames(resultsTable) <- c("My Steepest", "Steepest", "SCG") rownames(resultsTable) <- c("Train","Test") print(resultsTable) if (graphicsflag) x11() plot(results$mysteepest$likelihood, type="l",ylim=c(0,1),bty="n",lwd=3,xlab="Steps",ylab="Likelihood") lines(results$steepest$likelihood, type="l",col=2,lwd=3) lines(results$scg$likelihood, type="l",col=3,lwd=3) legend("topleft",c("My Steepest","Steepest","SCG"),col=c(1,2,3),lty=1,lwd=3,bty="n")