
//==============================================================================
//
// Copyright (c) 2015-
// Authors:
// * Joachim Klein <klein@tcs.inf.tu-dresden.de>
// * David Mueller <david.mueller@tcs.inf.tu-dresden.de>
//
//------------------------------------------------------------------------------
//
// This file is part of the cpphoafparser library,
// http://automata.tools/hoa/cpphoafparser/
//
// The cpphoafparser library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// The cpphoafparser library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
//
//==============================================================================
#ifndef CPPHOAFPARSER_HOALEXER_H
#define CPPHOAFPARSER_HOALEXER_H
#include <cstdio>
#include <istream>
#include <limits>
#include <map>
#include <ostream>
#include <stdexcept>
#include <string>
#include "cpphoafparser/parser/hoa_parser_exception.hh"
namespace cpphoafparser {
/** Lexer for tokenizing a HOA stream (used internally by HOAParser). */
class HOALexer {
public:
/** The type of the tokens in a HOA stream. */
enum TokenType {
  // Tokens that carry a value
  TOKEN_INT,          ///< unsigned integer (value in Token::vInteger)
  TOKEN_IDENT,        ///< identifier (text in Token::vString)
  TOKEN_STRING,       ///< quoted string (text in Token::vString)
  TOKEN_HEADER_NAME,  ///< header name that is not one of the known headers
  TOKEN_ALIAS_NAME,   ///< alias name, starting with '@'

  TOKEN_EOF,          ///< end of the input

  // Markers
  TOKEN_BODY,         ///< --BODY--
  TOKEN_END,          ///< --END--
  TOKEN_ABORT,        ///< --ABORT--

  // Known headers / definitions
  TOKEN_HOA,          ///< HOA:
  TOKEN_STATE,        ///< State:
  TOKEN_STATES,       ///< States:
  TOKEN_START,        ///< Start:
  TOKEN_AP,           ///< AP:
  TOKEN_ALIAS,        ///< Alias:
  TOKEN_ACCEPTANCE,   ///< Acceptance:
  TOKEN_ACCNAME,      ///< acc-name:
  TOKEN_TOOL,         ///< tool:
  TOKEN_NAME,         ///< name:
  TOKEN_PROPERTIES,   ///< properties:

  // Punctuation, etc.
  TOKEN_NOT,          ///< !
  TOKEN_AND,          ///< &
  TOKEN_OR,           ///< |
  TOKEN_LPARENTH,     ///< (
  TOKEN_RPARENTH,     ///< )
  TOKEN_LBRACKET,     ///< [
  TOKEN_RBRACKET,     ///< ]
  TOKEN_LCURLY,       ///< {
  TOKEN_RCURLY,       ///< }
  TOKEN_TRUE,         ///< t
  TOKEN_FALSE         ///< f
};

/**
 * A token in the HOA stream.
 *
 * Besides its kind, a token records the position (line, column) where it
 * started and, depending on the kind, a string or an integer payload.
 */
struct Token {
  /** The kind of the token. */
  TokenType kind;
  /** The string representation of this token (if applicable) */
  std::string vString;
  /** The integer representation of this token (if applicable) */
  unsigned int vInteger;
  /** The line where this token started */
  unsigned int line;
  /** The column where this token started */
  unsigned int col;

  /** EOF (end-of-file) constructor. */
  Token()
    : kind(TOKEN_EOF), vString(), vInteger(0), line(0), col(0) {}

  /** Constructor for a purely syntactic element (no payload). */
  Token(TokenType kind, unsigned int line, unsigned int col)
    : kind(kind), vString(), vInteger(0), line(line), col(col) {}

  /** Constructor for a token having variable string content (e.g., TOKEN_IDENT, TOKEN_ALIAS_NAME, TOKEN_STRING, ...) */
  Token(TokenType kind, const std::string& vString, unsigned int line, unsigned int col)
    : kind(kind), vString(vString), vInteger(0), line(line), col(col) {}

  /** Constructor for an unsigned integer token (the kind is TOKEN_INT). */
  Token(unsigned int vInteger, unsigned int line, unsigned int col)
    : kind(TOKEN_INT), vString(), vInteger(vInteger), line(line), col(col) {}

  /** Returns true if this token represents the end-of-file. */
  bool isEOF() const {return kind == TOKEN_EOF;}

  /**
   * Returns a string name for the given token type
   * (the enumerator name without the TOKEN_ prefix).
   * @throws std::logic_error if `kind` is not a valid enumerator
   */
  static std::string typeAsString(TokenType kind) {
    switch (kind) {
    case TOKEN_INT:         return "INT";
    case TOKEN_IDENT:       return "IDENT";
    case TOKEN_STRING:      return "STRING";
    case TOKEN_HEADER_NAME: return "HEADER_NAME";
    case TOKEN_ALIAS_NAME:  return "ALIAS_NAME";

    case TOKEN_EOF:         return "EOF";

    case TOKEN_BODY:        return "BODY";
    case TOKEN_END:         return "END";
    case TOKEN_ABORT:       return "ABORT";

    case TOKEN_HOA:         return "HOA";
    case TOKEN_STATE:       return "STATE";
    case TOKEN_STATES:      return "STATES";
    case TOKEN_START:       return "START";
    case TOKEN_AP:          return "AP";
    case TOKEN_ALIAS:       return "ALIAS";
    case TOKEN_ACCEPTANCE:  return "ACCEPTANCE";
    case TOKEN_ACCNAME:     return "ACCNAME";
    case TOKEN_TOOL:        return "TOOL";
    case TOKEN_NAME:        return "NAME";
    case TOKEN_PROPERTIES:  return "PROPERTIES";

    // Punctuation, etc.
    case TOKEN_NOT:         return "NOT";
    case TOKEN_AND:         return "AND";
    case TOKEN_OR:          return "OR";
    case TOKEN_LPARENTH:    return "LPARENTH";
    case TOKEN_RPARENTH:    return "RPARENTH";
    case TOKEN_LBRACKET:    return "LBRACKET";
    case TOKEN_RBRACKET:    return "RBRACKET";
    case TOKEN_LCURLY:      return "LCURLY";
    case TOKEN_RCURLY:      return "RCURLY";
    case TOKEN_TRUE:        return "TRUE";
    case TOKEN_FALSE:       return "FALSE";
    }
    // only reachable for values outside of the enumeration
    throw std::logic_error("Unhandled token type");
  }

  /**
   * Returns a string name for the given token type (for use in error messages),
   * i.e., a description of what was expected in the input.
   * @throws std::logic_error if `kind` is not a valid enumerator
   */
  static std::string forErrorMessage(TokenType kind) {
    switch (kind) {
    case TOKEN_INT:         return "INTEGER";
    case TOKEN_IDENT:       return "IDENTIFIER";
    case TOKEN_STRING:      return "STRING";
    case TOKEN_HEADER_NAME: return "HEADER_NAME";
    case TOKEN_ALIAS_NAME:  return "ALIAS_NAME";

    // same spelling as in forErrorMessage(const Token&)
    case TOKEN_EOF:         return "END-OF-FILE";

    case TOKEN_BODY:        return "--BODY--";
    case TOKEN_END:         return "--END--";
    case TOKEN_ABORT:       return "--ABORT--";

    case TOKEN_HOA:         return "HOA:";
    case TOKEN_STATE:       return "State:";
    case TOKEN_STATES:      return "States:";
    case TOKEN_START:       return "Start:";
    case TOKEN_AP:          return "AP:";
    case TOKEN_ALIAS:       return "Alias:";
    case TOKEN_ACCEPTANCE:  return "Acceptance:";
    case TOKEN_ACCNAME:     return "acc-name:";
    case TOKEN_TOOL:        return "tool:";
    case TOKEN_NAME:        return "name:";
    case TOKEN_PROPERTIES:  return "properties:";

    // Punctuation, etc.
    case TOKEN_NOT:         return "!";
    case TOKEN_AND:         return "&";
    case TOKEN_OR:          return "|";
    case TOKEN_LPARENTH:    return "(";
    case TOKEN_RPARENTH:    return ")";
    case TOKEN_LBRACKET:    return "[";
    case TOKEN_RBRACKET:    return "]";
    case TOKEN_LCURLY:      return "{";
    case TOKEN_RCURLY:      return "}";
    case TOKEN_TRUE:        return "t";
    case TOKEN_FALSE:       return "f";
    }
    // only reachable for values outside of the enumeration
    throw std::logic_error("Unhandled token type");
  }

  /**
   * Returns a string representation of a given token (for error messages),
   * including the token's payload for the value-carrying kinds.
   * @throws std::logic_error if the token's kind is not a valid enumerator
   */
  static std::string forErrorMessage(const Token& token) {
    switch (token.kind) {
    case TOKEN_INT:         return std::string("INTEGER ")+std::to_string(token.vInteger);
    case TOKEN_IDENT:       return std::string("IDENTIFIER ")+token.vString;
    case TOKEN_STRING:      return std::string("STRING ")+token.vString;
    case TOKEN_HEADER_NAME: return std::string("HEADER ")+token.vString;
    case TOKEN_ALIAS_NAME:  return std::string("ALIAS ")+token.vString;

    case TOKEN_EOF:         return "END-OF-FILE";

    case TOKEN_BODY:        return "--BODY--";
    case TOKEN_END:         return "--END--";
    case TOKEN_ABORT:       return "--ABORT--";

    case TOKEN_HOA:         return "HEADER HOA";
    case TOKEN_STATES:      return "HEADER States";
    case TOKEN_START:       return "HEADER Start";
    case TOKEN_AP:          return "HEADER AP";
    case TOKEN_ALIAS:       return "HEADER Alias";
    case TOKEN_ACCEPTANCE:  return "HEADER Acceptance";
    case TOKEN_ACCNAME:     return "HEADER acc-name";
    case TOKEN_TOOL:        return "HEADER tool";
    case TOKEN_NAME:        return "HEADER name";
    case TOKEN_PROPERTIES:  return "HEADER properties";

    case TOKEN_STATE:       return "DEFINITION State";

    // Punctuation, etc.
    case TOKEN_NOT:         return "!";
    case TOKEN_AND:         return "&";
    case TOKEN_OR:          return "|";
    case TOKEN_LPARENTH:    return "(";
    case TOKEN_RPARENTH:    return ")";
    case TOKEN_LBRACKET:    return "[";
    case TOKEN_RBRACKET:    return "]";
    case TOKEN_LCURLY:      return "{";
    case TOKEN_RCURLY:      return "}";
    case TOKEN_TRUE:        return "TRUE t";
    case TOKEN_FALSE:       return "FALSE f";
    }
    // only reachable for values outside of the enumeration
    throw std::logic_error("Unhandled token type");
  }

  /**
   * Output function for a given token.
   * Format: `<TYPE> payload (line,col)`, where the payload is the integer
   * value for TOKEN_INT and the (possibly empty) string value otherwise.
   */
  friend std::ostream& operator<<(std::ostream& out, const Token& token) {
    out << "<" << Token::typeAsString(token.kind) << "> ";
    if (token.kind == TOKEN_INT) {
      out << token.vInteger;
    } else {
      out << token.vString;
    }
    out << " (" << token.line << "," << token.col << ")";
    return out;
  }
};
/** Constructor for a lexer, reading from the given input stream. */
HOALexer(std::istream& in)
: in(in), line(1), col(0), ch(0) {
// The headers we know
knownHeaders["HOA:"] = TOKEN_HOA;
knownHeaders["State:"] = TOKEN_STATE;
knownHeaders["States:"] = TOKEN_STATES;
knownHeaders["Start:"] = TOKEN_START;
knownHeaders["AP:"] = TOKEN_AP;
knownHeaders["Alias:"] = TOKEN_ALIAS;
knownHeaders["Acceptance:"] = TOKEN_ACCEPTANCE;
knownHeaders["acc-name:"] = TOKEN_ACCNAME;
knownHeaders["tool:"] = TOKEN_TOOL;
knownHeaders["name:"] = TOKEN_NAME;
knownHeaders["properties:"] = TOKEN_PROPERTIES;
}
/**
 * Get the next token from the input stream.
 *
 * Leading whitespace and comments are skipped first. The returned token
 * carries the line/column at which it started. For quoted strings, the
 * token text includes the surrounding double quotes; for headers, it
 * includes the trailing ':'.
 *
 * @return the next token; a TOKEN_EOF token once the input is exhausted
 * @throws HOAParserException on a lexical error (illegal character,
 *         malformed `--XYZ--` marker, unterminated string, integer with a
 *         leading zero or too large for an unsigned int, premature end-of-file)
 */
Token nextToken() {
  // first, skip any whitespace (and comments)
  skip();
  if (ch == EOF) return Token(TOKEN_EOF, line, col);

  // handle the simple, single-character syntactic elements
  switch (ch) {
  case '!': return Token(TOKEN_NOT, line, col);
  case '&': return Token(TOKEN_AND, line, col);
  case '|': return Token(TOKEN_OR, line, col);
  case '(': return Token(TOKEN_LPARENTH, line, col);
  case ')': return Token(TOKEN_RPARENTH, line, col);
  case '[': return Token(TOKEN_LBRACKET, line, col);
  case ']': return Token(TOKEN_RBRACKET, line, col);
  case '{': return Token(TOKEN_LCURLY, line, col);
  case '}': return Token(TOKEN_RCURLY, line, col);
  }

  // remember where the token began
  const unsigned int lineStart = line;
  const unsigned int colStart = col;

  // handle --XYZ-- style markers
  if (ch == '-') {
    // The first '-' has already been consumed; match the remainder against
    // all three candidates in parallel, dropping each one on its first mismatch.
    unsigned int index=0;
    bool canBeAbort = true;
    bool canBeBody = true;
    bool canBeEnd = true;
    const std::string abort("-ABORT--");
    const std::string body("-BODY--");
    const std::string end("-END--");

    while (canBeAbort || canBeBody || canBeEnd) {
      nextChar();
      if (ch == EOF) {throw error("Premature end-of-file inside token", lineStart, colStart);}

      if (canBeAbort && ch == abort.at(index)) {
        if (index == abort.length()-1) {
          return Token(TOKEN_ABORT, lineStart, colStart);
        }
      } else {
        canBeAbort=false;
      }

      if (canBeBody && ch == body.at(index)) {
        if (index == body.length()-1) {
          return Token(TOKEN_BODY, lineStart, colStart);
        }
      } else {
        canBeBody=false;
      }

      if (canBeEnd && ch == end.at(index)) {
        if (index == end.length()-1) {
          return Token(TOKEN_END, lineStart, colStart);
        }
      } else {
        canBeEnd=false;
      }

      index++;
      // guard the .at(index) accesses of the next iteration
      if (index >= abort.length()) canBeAbort = false;
      if (index >= body.length()) canBeBody = false;
      if (index >= end.length()) canBeEnd = false;
    }
    throw error("Lexical error: For token starting with '-', expected either '--BODY--', '--END--' or '--ABORT--'", lineStart, colStart);
  }

  // handle quoted strings
  if (ch == '"') {
    std::string text(1, static_cast<char>(ch));
    // true if the previous character was an unescaped backslash, i.e.,
    // the current character is escaped and can not terminate the string
    bool lastWasBackslash = false;
    while (true) {
      nextChar();
      if (ch == EOF) {throw error("Premature end-of-file in quoted string", lineStart, colStart);}
      text+=static_cast<char>(ch);
      if (ch == '"' && !lastWasBackslash) break;
      lastWasBackslash = (ch == '\\' && !lastWasBackslash);
    }
    return Token(TOKEN_STRING, text, lineStart, colStart);
  }

  // handle integers
  if (ch >= '0' && ch <= '9') {
    std::string text(1, static_cast<char>(ch));
    while (true) {
      const int next = peekChar();
      if (next >= '0' && next <= '9') {
        nextChar();
        text+=static_cast<char>(ch);
      } else {
        break;
      }
    }

    if (text.at(0)=='0' && text.length() > 1) {
      throw error("Syntax error parsing integer, starts with 0: "+text, lineStart, colStart);
    }

    // Parse as unsigned long and range-check explicitly, so that the whole
    // range of unsigned int is accepted (a signed parse would reject the
    // values above INT_MAX).
    unsigned long parsed = 0;
    bool tooBig = false;
    try {
      parsed = std::stoul(text);
    } catch (const std::invalid_argument&) {
      throw error("Syntax error: "+text+" is not an integer", lineStart, colStart);
    } catch (const std::out_of_range&) {
      tooBig = true;
    }
    if (tooBig || parsed > std::numeric_limits<unsigned int>::max()) {
      throw error("Syntax error: integer "+text+" is too big to represent as an unsigned int", lineStart, colStart);
    }
    return Token(static_cast<unsigned int>(parsed), lineStart, colStart);
  } else if (ch == '@' || ch == '_' || (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z')) {
    // handle identifiers, @alias-names, headers, t and f
    std::string text(1, static_cast<char>(ch));
    const bool alias = (ch == '@');

    while (true) {
      const int next = peekChar();
      if (next == EOF) break;
      if (next == ':') {
        // an alias name never includes a ':'
        if (alias) break;
        // consume ':', it is part of (and terminates) a header name
        nextChar();
        text+=':';
        break;
      }
      if (next == '_' ||
          next == '-' ||
          (next >= 'a' && next <= 'z') ||
          (next >= 'A' && next <= 'Z') ||
          (next >= '0' && next <= '9')) {
        nextChar();
        text+=static_cast<char>(ch);
        continue;
      } else {
        break;
      }
    }

    if (alias) {
      return Token(TOKEN_ALIAS_NAME, text, lineStart, colStart);
    }

    if (text.back() == ':') {
      // known headers get their dedicated token type, all others are generic
      const auto it = knownHeaders.find(text);
      if (it != knownHeaders.end()) {
        return Token(it->second, text, lineStart, colStart);
      }
      return Token(TOKEN_HEADER_NAME, text, lineStart, colStart);
    }

    if (text == "t") {
      return Token(TOKEN_TRUE, text, lineStart, colStart);
    } else if (text == "f") {
      return Token(TOKEN_FALSE, text, lineStart, colStart);
    }
    return Token(TOKEN_IDENT, text, lineStart, colStart);
  }

  throw error("Syntax error, illegal character '"+std::string(1, static_cast<char>(ch))+"'", lineStart, colStart);
}
private:
/** Skip whitespace. */
void skip() {
while (true) {
nextChar();
if (ch == EOF) { // EOF
return;
}
if (ch == '/') {
skipComment();
continue;
}
if (ch == ' ' || ch == '\t') {
continue;
}
if (ch == '\n' || ch == '\r') {
line++;
col=0;
continue;
}
break;
}
}
/** Skip a comment */
void skipComment() {
nextChar();
if (ch != '*') {
throw error("Malformed start of comment", line, col);
}
bool last_was_slash = false;
bool last_was_star = false;
unsigned int nesting = 0;
while (true) {
nextChar();
if (ch == EOF) {throw error("End-of-file inside comment", line, col);}
if (ch == '\n' || ch == '\r') {
line++;
col=0;
last_was_slash = false;
last_was_star = false;
continue;
}
if (ch == '/') {
if (last_was_star) {
if (nesting == 0) {
return;
} else {
nesting--;
}
} else {
last_was_slash = true;
}
continue;
}
if (ch == '*') {
if (last_was_slash) {
nesting++;
} else {
last_was_star = true;
continue;
}
}
last_was_slash = false;
last_was_star = false;
}
}
/**
 * Consume one character from the input stream and store it in `ch`.
 * The column counter advances for every character actually read;
 * it is left untouched when the end of the stream is hit (`ch` is EOF then).
 */
void nextChar() {
  ch = in.get();
  const bool reachedEnd = (ch == EOF);
  col += reachedEnd ? 0 : 1;
}
/**
 * Look at the upcoming character of the input stream, leaving it in the
 * stream; neither `ch` nor the position counters are modified.
 * @return the result of peeking at the stream
 */
int peekChar() {
  const int upcoming = in.peek();
  return upcoming;
}
/**
 * Construct a HOAParserException for a lexer error.
 * The position is appended to the message and also passed on separately.
 * @param msg the error message
 * @param errLine the line number where the error occurred
 * @param errCol the column number where the error occurred
 * @return the exception object (to be thrown by the caller)
 */
HOAParserException error(const std::string& msg, unsigned int errLine, unsigned int errCol) {
  std::string fullMessage(msg);
  fullMessage += " (at line ";
  fullMessage += std::to_string(errLine);
  fullMessage += ", col ";
  fullMessage += std::to_string(errCol);
  fullMessage += ")";
  return HOAParserException(fullMessage, errLine, errCol);
}
private:
/** The input stream the characters are read from (referenced, not owned) */
std::istream& in;
/** The current line number (starts at 1, incremented at each line break) */
unsigned int line;
/** The current column number (incremented per character read, reset to 0 at each line break) */
unsigned int col;
/** The current character, i.e., the one most recently read by nextChar() (or EOF) */
int ch;
/** A map for mapping the known header names (including the trailing ':') to the corresponding token types */
std::map<std::string, TokenType> knownHeaders;
};
}
#endif