#ifndef __TOKENS_H__
#define __TOKENS_H__

/*
 * "Copyright (c) 2012 by Fritz Sieker."
 *
 * Permission to use, copy, modify, and distribute this software and its
 * documentation for any purpose, without fee, and without written
 * agreement is hereby granted, provided that the above copyright notice
 * and the following two paragraphs appear in all copies of this software,
 * that the files COPYING and NO_WARRANTY are included verbatim with
 * any distribution, and that the contents of the file README are included
 * verbatim as part of a file named README with any distribution.
 *
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE TO ANY PARTY FOR DIRECT,
 * INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT
 * OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE AUTHOR
 * HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * THE AUTHOR SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS"
 * BASIS, AND THE AUTHOR NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT,
 * UPDATES, ENHANCEMENTS, OR MODIFICATIONS."
 */

/** @file tokens.h
 *  @brief interface to functions to tokenize a line of LC3 source code
 *  @details One of the first steps in converting a "high" level language to
 *  some other form is to tokenize the input stream. This process is also
 *  known as lexical analysis. It involves breaking the input into a list
 *  of "words" that are significant in terms of the syntax of the language.
 *
 *  For a language such as C, this means identifying keywords, numbers, user
 *  defined names, and all the punctuation that is used (e.g. (){}+*-/,; ...).
 *  In languages like C, there can be multiple "statements" on a single line and
 *  "statements" can span multiple lines.
 *
 *  The LC3 assembly language is much simpler. Every statement is contained
 *  on a single line of the file. The only punctuation used is the comma used
 *  to separate multiple operands. The most complex statement is of the form:
 *
 *  @code
 *  label opcode operand1, operand2, operand3
 *  @endcode
 *
 *  This code is provided to reduce the work in completing your assembler
 *  project. For more details, see the description of lexical analysis on
 *  Wikipedia (https://en.wikipedia.org/wiki/Lexical_analysis). If you take
 *  a compiler class like cs453, you will learn a lot more about lexical
 *  analysis, and how to use tools that will generate the code for you from
 *  a language description in a text file.
 *
 *  @author Fritz Sieker
 */
/**
 * Upper bound on the length of one line of source code.
 */
#define MAX_LINE_LENGTH 8180
/**
 * Maximum number of tokens in one LC3 source line, with a few extra
 * so that lines containing bad syntax can be accommodated.
 */
#define MAX_TOKENS 10
/**
 * Initialize the tokenizer module.
 */
void tokens_init(void);
/**
 * Split one line of LC3 source code into a list of tokens and hand back
 * the first of them.
 *
 * The remaining tokens are fetched one at a time with next_token().
 * A semicolon starts an LC3 end-of-line comment, and the whole comment is
 * discarded. Whitespace and commas separate tokens; each comma is itself
 * returned as an entry in the token list.
 *
 * For the quoted string used by the .STRINGZ directive, the token keeps its
 * opening and closing quote marks, while every escape sequence inside the
 * quotes is converted into the actual character it stands for.
 *
 * @param line - the source code line to tokenize
 * @return the first token of the line, or NULL if there is none. The
 * returned value is a static variable whose contents change on every call,
 * so the caller must copy anything that has to be preserved across calls.
 */
char *tokenize_line(char *line);
/**
 * Fetch the following token from the list built by tokenize_line().
 *
 * @return the next token, or NULL once no tokens remain
 */
char *next_token(void);
/**
 * Debugging aid: print the tokens of the line.
 */
void print_tokens(void);
/**
 * Report how many tokens the current line contains.
 *
 * @return the number of tokens in the current line
 */
int token_count(void);
/**
 * Look up one particular token of the line.
 *
 * @param index - selects which token to return
 * @return the token at that index, or NULL
 *         (presumably when no token exists at the index - confirm
 *         against the implementation)
 */
char *get_token(int index);
/**
 * Shut down (terminate) the tokenizer module.
 */
void tokens_term(void);
#endif /* __TOKENS_H__ */