/****************************************************************************** * This file contains a simulation of the cart and pole dynamic system * and a procedure for learning to balance the pole that uses multilayer * connectionist networks. Both are described in Anderson, "Strategy * Learning with Multilayer Connectionist Representations," GTE * Laboratories TR87-509.3, 1988. This is a corrected version of the * report published in the Proceedings of the Fourth International * Workshop on Machine Learning, Irvine, CA, 1987. * * main takes six arguments: * the maximum number of trials (balancing attempts), * the number of trials over which balancing steps are averaged * beta * beta for hidden units * rho * rho for hidden units * * Please send questions and comments to anderson@cs.colostate.edu *****************************************************************************/ #include #include #define Mc 1.0 #define Mp 0.1 #define l 0.5 #define g 9.8 #define tau 0.02 #define max_cart_pos 2.4 #define max_cart_vel 1.5 #define max_pole_pos 0.2094 #define max_pole_vel 2.01 #define Gamma 0.9 float Beta = 0.2; float Beta_h = 0.05; float Rho = 1.0; float Rho_h = 0.2; struct { double cart_pos; double cart_vel; double pole_pos; double pole_vel; } the_system_state; int start_state, failure; double a[5][5], b[5], c[5], d[5][5], e[5], f[5]; double x[5], x_old[5], y[5], y_old[5], v, v_old, z[5], p; double r_hat, push, unusualness; float acgrnd(); void init_acgrnd_by_time(); /****************************************************************/ main(argc, argv) int argc; char *argv[]; { int runtimes; init_acgrnd_by_time(); if (argc < 3) exit(-1); SetRandomWeights(); if (argc > 3) Beta = atof(argv[3]); if (argc > 4) Beta_h = atof(argv[4]); if (argc > 5) Rho = atof(argv[5]); if (argc > 6) Rho_h = atof(argv[6]); Run(atoi(argv[1]), atoi(argv[2])); } /****************************************************************/ SetRandomWeights() { int i,j; for(i = 0; i < 5; i++) { for(j = 0; j < 5; j++) { a[i][j] = acgrnd() * 0.2 - 0.1; d[i][j] = acgrnd() * 0.2 - 0.1; } b[i] = acgrnd() * 0.2 - 0.1; c[i] = acgrnd() * 0.2 - 0.1; e[i] = acgrnd() * 0.2 - 0.1; f[i] = acgrnd() * 0.2 - 0.1; } } /****************************************************************/ /* If init_flag is zero, then calculate state of cart-pole system at time t+1 by Euler's method, else set state of cart-pole system to random values. */ NextState(init_flag, push) int init_flag; double push; { register double pv, ca, pp, pa, common; double sin_pp, cos_pp; if (init_flag) { the_system_state.cart_pos = acgrnd() * 2 * max_cart_pos - max_cart_pos; the_system_state.cart_vel = acgrnd() * 2 * max_cart_vel - max_cart_vel; the_system_state.pole_pos = acgrnd() * 2 * max_pole_pos - max_pole_pos; the_system_state.pole_vel = acgrnd() * 2 * max_pole_vel - max_pole_vel; start_state = 1; SetInputValues(); failure = 0; } else { pv = the_system_state.pole_vel; pp = the_system_state.pole_pos; sin_pp = sin(pp); cos_pp = cos(pp); common = (push + Mp * l * pv * pv * sin_pp) / (Mc + Mp); pa = (g * sin_pp - cos_pp * common) / (l * (4.0 / 3.0 - Mp * cos_pp * cos_pp / (Mc + Mp))); ca = common - Mp * l * pa * cos_pp / (Mc + Mp); the_system_state.cart_pos += tau * the_system_state.cart_vel; the_system_state.cart_vel += tau * ca; the_system_state.pole_pos += tau * the_system_state.pole_vel; the_system_state.pole_vel += tau * pa; SetInputValues(); start_state = 0; if ((fabs(the_system_state.cart_pos) > max_cart_pos) || (fabs(the_system_state.pole_pos) > max_pole_pos)) failure = -1; else failure = 0; } } /****************************************************************/ SetInputValues() { x[0] = (the_system_state.cart_pos + max_cart_pos) / (2 * max_cart_pos); x[1] = (the_system_state.cart_vel + max_cart_vel) / (2 * max_cart_vel); x[2] = (the_system_state.pole_pos + max_pole_pos) / (2 * max_pole_pos); x[3] = (the_system_state.pole_vel + max_pole_vel) / (2 * max_pole_vel); x[4] = 0.5; } /****************************************************************/ Run(num_trials, sample_period) int num_trials, sample_period; { register int i, j, avg_length; NextState(1, 0.0); i = 0; j = 0; avg_length = 0; printf(" B= %g Bh= %g R= %g Rh= %g nt= %d bin= %d\n", Beta,Beta_h,Rho,Rho_h,num_trials,sample_period); while (i < num_trials && j < 180000) /* one hour at .02s per step */ { Cycle(1); j++; if (failure) { avg_length += j; i++; if (!(i % sample_period)) { printf("%6d %6d\n", i, avg_length / sample_period); avg_length = 0; } NextState(1, 0.0); j = 0; } } printf("Final trial %d balanced for %d steps (%f hours).\n", i, j, (j * tau)/3600.0); /* for (i = 0; i<5; i++ ) for (j = 0; j< 5; j++ ) printf( "%3.4f ", d[i][j] ); for ( i=0; i< 5; i++ ) printf ( "%3.4f ", e[i] ); for ( i=0; i< 5; i++ ) printf ( "%3.4f ", f[i] ); */ } /****************************************************************/ double sgn(x) double x; { if (x < 0.0) return -1.0; else if (x > 0.0) return 1.0; else return 0.0; } /****************************************************************/ Cycle(learn_flag) int learn_flag; { int i, j, k; double sum, factor1, factor2; extern double exp(); /* output: state evaluation */ for(i = 0; i < 5; i++) { sum = 0.0; for(j = 0; j < 5; j++) { sum += a[i][j] * x[j]; } y[i] = 1.0 / (1.0 + exp(-sum)); } sum = 0.0; for(i = 0; i < 5; i++) { sum += b[i] * x[i] + c[i] * y[i]; } v = sum; /* output: action */ for(i = 0; i < 5; i++) { sum = 0.0; for (j = 0; j < 5; j++) sum += d[i][j] * x[j]; z[i] = 1.0 / (1.0 + exp(-sum)); } sum = 0.0; for(i = 0; i < 5; i++) sum += e[i] * x[i] + f[i] * z[i]; p = 1.0 / (1.0 + exp(-sum)); push = (acgrnd() <= p) ? 10.0 : -10.0; unusualness = (push > 0) ? 1.0 - p : -p; /* unusualness = q - p */ /* preserve current activities in evaluation network. */ v_old = v; for (i = 0; i< 5; i++) { x_old[i] = x[i]; y_old[i] = y[i]; } /* Apply the push to the pole-cart */ NextState(0, push); /* Calculate evaluation of new state. */ for(i = 0; i < 5; i++) { sum = 0.0; for(j = 0; j < 5; j++) { sum += a[i][j] * x[j]; } y[i] = 1.0 / (1.0 + exp(-sum)); } sum = 0.0; for(i = 0; i < 5; i++) { sum += b[i] * x[i] + c[i] * y[i]; } v = sum; /* action evaluation */ if (start_state) r_hat = 0.0; else if (failure) r_hat = failure - v_old; else r_hat = failure + Gamma * v - v_old; /* modification */ if (learn_flag) { for(i = 0; i < 5; i++) { factor1 = Beta_h * r_hat * y_old[i] * (1.0 - y_old[i]) * sgn(c[i]); factor2 = Rho_h * r_hat * z[i] * (1.0 - z[i]) * sgn(f[i]) * unusualness; for(j = 0; j < 5; j++) { a[i][j] += factor1 * x_old[j]; d[i][j] += factor2 * x_old[j]; } b[i] += Beta * r_hat * x_old[i]; c[i] += Beta * r_hat * y_old[i]; e[i] += Rho * r_hat * unusualness * x_old[i]; f[i] += Rho * r_hat * unusualness * z[i]; } } } /**********************************************************************/ #include #include /* Addititive congruential random number generator adapted from Hill's Computer Graphics. Tests for independence of sequence (Big-R) and uniformity of distribution (Chi-square) are output. The histogram dist. of random numbers is also output (3/22/91). A floating point number between 0 - 1 is returned. */ /* global quantities for random number generator */ #define MODULUS 32767 #define TRUE 1 #define FALSE 0 #define LENGTH 17 int the_seed; int ranlist[LENGTH]; /* global list */ int i_ran, j_ran;/* global indices */ /*--------------------------------------------------------------*/ void init_acgrnd_by_time() /* loads random values into global array ranlist[] */ { int i; int all_even; the_seed = time(NULL); /* the_seed = 0;*/ all_even = TRUE; while ( all_even ) /* try until one or more values are odd */ for ( i=0; i < LENGTH; i++ ) /* load array */ { the_seed = (the_seed * 2743 + 5923 ) % MODULUS; /* adjust if value "overflowed" capacity of an integer */ if ( the_seed < 0 ) the_seed = the_seed + MODULUS; if ( the_seed % 2 != 0 ) all_even = FALSE; ranlist[i] = the_seed; } i_ran = LENGTH - 1; j_ran = 5; } /*-------------------------------------------------------------*/ float acgrnd() /* generate a random number in the range 0.0 to 1.0 */ /* use global ranlist, i_ran, and j_ran */ { int t, randnum; float rand; randnum = ( ranlist[i_ran] + ranlist[j_ran] ) % MODULUS; if ( randnum < 0 ) randnum = randnum + MODULUS; ranlist[i_ran] = randnum; i_ran = ( i_ran == 0 ) ? LENGTH - 1 : i_ran--; j_ran = ( j_ran == 0 ) ? LENGTH - 1 : j_ran--; rand = (float) randnum / MODULUS; return rand; } /**************************************************************************** ***************************************************************************** Example results. First argument is maximum number of trials (failures). Second argument is number of trails to average the balancing time (steps) over. Next four are beta, beta_h, rho, rho_h. on Sun IPC: >cc -O -o mpole multilayer-pole.c -lm >mpole 10000 1000 .2 .05 1 .2 B=0.2 Bh=0.05 R=1 Rh=0.2 nt=10000 bin=1000 1000 13 2000 17 3000 15 4000 28 Final trial 4638 balanced for 180000 steps (1.000000 hours). *****************************************************************************/