You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
534 lines
18 KiB
534 lines
18 KiB
//==============================================================================
//
//	Copyright (c) 2015-
//	Authors:
//	* Joachim Klein <klein@tcs.inf.tu-dresden.de>
//	* David Mueller <david.mueller@tcs.inf.tu-dresden.de>
//
//------------------------------------------------------------------------------
//
//	This file is part of the cpphoafparser library,
//	    http://automata.tools/hoa/cpphoafparser/
//
//	The cpphoafparser library is free software; you can redistribute it and/or
//	modify it under the terms of the GNU Lesser General Public
//	License as published by the Free Software Foundation; either
//	version 2.1 of the License, or (at your option) any later version.
//
//	The cpphoafparser library is distributed in the hope that it will be useful,
//	but WITHOUT ANY WARRANTY; without even the implied warranty of
//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
//	Lesser General Public License for more details.
//
//	You should have received a copy of the GNU Lesser General Public
//	License along with this library; if not, write to the Free Software
//	Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
//
//==============================================================================
|
|
|
|
#ifndef CPPHOAFPARSER_HOALEXER_H
|
|
#define CPPHOAFPARSER_HOALEXER_H
|
|
|
|
#include <map>
|
|
#include <string>
|
|
#include <stdexcept>
|
|
|
|
#include "cpphoafparser/parser/hoa_parser_exception.hh"
|
|
|
|
namespace cpphoafparser {
|
|
|
|
/** Lexer for tokenizing a HOA stream (used internally by HOAParser). */
|
|
class HOALexer {
|
|
public:
|
|
/**
 * The type of the tokens in a HOA stream.
 *
 * The enumerators are declared in their original order, so their
 * underlying integer values are unchanged.
 */
enum TokenType {
  // Tokens carrying a value
  TOKEN_INT,
  TOKEN_IDENT,
  TOKEN_STRING,
  TOKEN_HEADER_NAME,
  TOKEN_ALIAS_NAME,

  // End of input
  TOKEN_EOF,

  // Markers, known headers and the state definition keyword
  TOKEN_BODY,
  TOKEN_END,
  TOKEN_ABORT,
  TOKEN_HOA,
  TOKEN_STATE,
  TOKEN_STATES,
  TOKEN_START,
  TOKEN_AP,
  TOKEN_ALIAS,
  TOKEN_ACCEPTANCE,
  TOKEN_ACCNAME,
  TOKEN_TOOL,
  TOKEN_NAME,
  TOKEN_PROPERTIES,

  // Punctuation, etc.
  TOKEN_NOT,
  TOKEN_AND,
  TOKEN_OR,
  TOKEN_LPARENTH,
  TOKEN_RPARENTH,
  TOKEN_LBRACKET,
  TOKEN_RBRACKET,
  TOKEN_LCURLY,
  TOKEN_RCURLY,
  TOKEN_TRUE,
  TOKEN_FALSE
};
|
|
|
|
/** A token in the HOA stream. */
|
|
struct Token {
|
|
/** The kind of the token. */
|
|
TokenType kind;
|
|
/** The string representation of this token (if applicable) */
|
|
std::string vString;
|
|
/** The integer representation of this token (if applicable) */
|
|
unsigned int vInteger;
|
|
|
|
/** The line where this token started */
|
|
unsigned int line;
|
|
/** The column where this token started */
|
|
unsigned int col;
|
|
|
|
/** EOF (end-of-file) constructor. */
|
|
Token() : kind(TOKEN_EOF), vString(""), vInteger(0), line(0), col(0) {}
|
|
/** Constructor for syntactic element */
|
|
Token(TokenType kind, unsigned int line, unsigned int col) : kind(kind), vString(""), vInteger(0), line(line), col(col) {}
|
|
/** Constructor for a token having variable string content (e.g., TOKEN_IDENTIFIER, TOKEN_ALIAS, TOKEN_STRING, ...) */
|
|
Token(TokenType kind, const std::string vString, unsigned int line, unsigned int col) : kind(kind), vString(vString), vInteger(0), line(line), col(col) {}
|
|
/** Constructor for an unsigned integer token */
|
|
Token(unsigned int vInteger, unsigned int line, unsigned int col) : kind(TOKEN_INT), vString(""), vInteger(vInteger), line(line), col(col) {}
|
|
|
|
/** Returns true if this token represents the end-of-file. */
|
|
bool isEOF() const {return kind == TOKEN_EOF;}
|
|
|
|
/** Returns a string name for the given token type. */
|
|
static std::string typeAsString(TokenType kind) {
|
|
switch (kind) {
|
|
case TOKEN_INT: return std::string("INT");
|
|
case TOKEN_IDENT: return std::string("IDENT");
|
|
case TOKEN_STRING: return std::string("STRING");
|
|
case TOKEN_HEADER_NAME: return std::string("HEADER_NAME");
|
|
case TOKEN_ALIAS_NAME: return std::string("ALIAS_NAME");
|
|
|
|
case TOKEN_EOF: return std::string("EOF");
|
|
|
|
case TOKEN_BODY: return std::string("BODY");
|
|
case TOKEN_END: return std::string("END");
|
|
case TOKEN_ABORT: return std::string("ABORT");
|
|
case TOKEN_HOA: return std::string("HOA");
|
|
case TOKEN_STATE: return std::string("STATE");
|
|
case TOKEN_STATES: return std::string("STATES");
|
|
case TOKEN_START: return std::string("START");
|
|
case TOKEN_AP: return std::string("AP");
|
|
case TOKEN_ALIAS: return std::string("ALIAS");
|
|
case TOKEN_ACCEPTANCE: return std::string("ACCEPTANCE");
|
|
case TOKEN_ACCNAME: return std::string("ACCNAME");
|
|
case TOKEN_TOOL: return std::string("TOOL");
|
|
case TOKEN_NAME: return std::string("NAME");
|
|
case TOKEN_PROPERTIES: return std::string("PROPERTIES");
|
|
|
|
// Punctuation: etc.
|
|
case TOKEN_NOT: return std::string("NOT");
|
|
case TOKEN_AND: return std::string("AND");
|
|
case TOKEN_OR: return std::string("OR");
|
|
case TOKEN_LPARENTH: return std::string("LPARENTH");
|
|
case TOKEN_RPARENTH: return std::string("RPARENTH");
|
|
case TOKEN_LBRACKET: return std::string("LBRACKET");
|
|
case TOKEN_RBRACKET: return std::string("RBRACKET");
|
|
case TOKEN_LCURLY: return std::string("LCURLY");
|
|
case TOKEN_RCURLY: return std::string("RCURLY");
|
|
case TOKEN_TRUE: return std::string("TRUE");
|
|
case TOKEN_FALSE: return std::string("FALSE");
|
|
}
|
|
throw std::logic_error("Unhandled token type");
|
|
}
|
|
|
|
/** Returns a string name for the given token type (for use in error messages). */
|
|
static std::string forErrorMessage(TokenType kind) {
|
|
switch (kind) {
|
|
case TOKEN_INT: return std::string("INTEGER");
|
|
case TOKEN_IDENT: return std::string("IDENTIFIER");
|
|
case TOKEN_STRING: return std::string("STRING");
|
|
case TOKEN_HEADER_NAME: return std::string("HEADER_NAME");
|
|
case TOKEN_ALIAS_NAME: return std::string("ALIAS_NAME");
|
|
|
|
case TOKEN_EOF: return std::string("END-OF_FILE");
|
|
|
|
case TOKEN_BODY: return std::string("--BODY--");
|
|
case TOKEN_END: return std::string("--END--");
|
|
case TOKEN_ABORT: return std::string("--ABORT--");
|
|
case TOKEN_HOA: return std::string("HOA:");
|
|
case TOKEN_STATE: return std::string("State:");
|
|
case TOKEN_STATES: return std::string("States:");
|
|
case TOKEN_START: return std::string("Start:");
|
|
case TOKEN_AP: return std::string("AP:");
|
|
case TOKEN_ALIAS: return std::string("Alias:");
|
|
case TOKEN_ACCEPTANCE: return std::string("Acceptance:");
|
|
case TOKEN_ACCNAME: return std::string("acc-name:");
|
|
case TOKEN_TOOL: return std::string("tool:");
|
|
case TOKEN_NAME: return std::string("name:");
|
|
case TOKEN_PROPERTIES: return std::string("properties:");
|
|
|
|
// Punctuation: etc.
|
|
case TOKEN_NOT: return std::string("!");
|
|
case TOKEN_AND: return std::string("&");
|
|
case TOKEN_OR: return std::string("|");
|
|
case TOKEN_LPARENTH: return std::string("(");
|
|
case TOKEN_RPARENTH: return std::string(")");
|
|
case TOKEN_LBRACKET: return std::string("[");
|
|
case TOKEN_RBRACKET: return std::string("]");
|
|
case TOKEN_LCURLY: return std::string("{");
|
|
case TOKEN_RCURLY: return std::string("}");
|
|
case TOKEN_TRUE: return std::string("t");
|
|
case TOKEN_FALSE: return std::string("f");
|
|
}
|
|
throw std::logic_error("Unhandled token type");
|
|
}
|
|
|
|
/** Returns a string representation of a given token (for error messages). */
|
|
static std::string forErrorMessage(Token token) {
|
|
switch (token.kind) {
|
|
case TOKEN_INT: return std::string("INTEGER ")+std::to_string(token.vInteger);
|
|
case TOKEN_IDENT: return std::string("IDENTIFIER ")+token.vString;
|
|
case TOKEN_STRING: return std::string("STRING ")+token.vString;
|
|
case TOKEN_HEADER_NAME: return std::string("HEADER ")+token.vString;
|
|
case TOKEN_ALIAS_NAME: return std::string("ALIAS ")+token.vString;
|
|
|
|
case TOKEN_EOF: return std::string("END-OF-FILE");
|
|
|
|
case TOKEN_BODY: return std::string("--BODY--");
|
|
case TOKEN_END: return std::string("--END--");
|
|
case TOKEN_ABORT: return std::string("--ABORT--");
|
|
case TOKEN_HOA: return std::string("HEADER HOA");
|
|
case TOKEN_STATES: return std::string("HEADER States");
|
|
case TOKEN_START: return std::string("HEADERr Start");
|
|
case TOKEN_AP: return std::string("HEADER AP");
|
|
case TOKEN_ALIAS: return std::string("HEADER Alias");
|
|
case TOKEN_ACCEPTANCE: return std::string("HEADER Acceptance");
|
|
case TOKEN_ACCNAME: return std::string("HEADER acc-name");
|
|
case TOKEN_TOOL: return std::string("HEADER tool");
|
|
case TOKEN_NAME: return std::string("HEADER name");
|
|
case TOKEN_PROPERTIES: return std::string("HEADER properties");
|
|
|
|
case TOKEN_STATE: return std::string("DEFINITION State");
|
|
|
|
// Punctuation: etc.
|
|
case TOKEN_NOT: return std::string("!");
|
|
case TOKEN_AND: return std::string("&");
|
|
case TOKEN_OR: return std::string("|");
|
|
case TOKEN_LPARENTH: return std::string("(");
|
|
case TOKEN_RPARENTH: return std::string(")");
|
|
case TOKEN_LBRACKET: return std::string("[");
|
|
case TOKEN_RBRACKET: return std::string("]");
|
|
case TOKEN_LCURLY: return std::string("{");
|
|
case TOKEN_RCURLY: return std::string("}");
|
|
case TOKEN_TRUE: return std::string("TRUE t");
|
|
case TOKEN_FALSE: return std::string("FALSE f");
|
|
}
|
|
throw std::logic_error("Unhandled token type");
|
|
}
|
|
|
|
/** Output function for a given token. */
|
|
friend std::ostream& operator<<(std::ostream& out, const Token& token) {
|
|
out << "<" << token.typeAsString(token.kind) << "> ";
|
|
if (token.kind == TOKEN_INT) {
|
|
out << token.vInteger;
|
|
} else {
|
|
out << token.vString;
|
|
}
|
|
out << " (" << token.line << "," << token.col << ")";
|
|
return out;
|
|
}
|
|
};
|
|
|
|
/** Constructor for a lexer, reading from the given input stream. */
|
|
HOALexer(std::istream& in)
|
|
: in(in), line(1), col(0), ch(0) {
|
|
// The headers we know
|
|
knownHeaders["HOA:"] = TOKEN_HOA;
|
|
knownHeaders["State:"] = TOKEN_STATE;
|
|
knownHeaders["States:"] = TOKEN_STATES;
|
|
knownHeaders["Start:"] = TOKEN_START;
|
|
knownHeaders["AP:"] = TOKEN_AP;
|
|
knownHeaders["Alias:"] = TOKEN_ALIAS;
|
|
knownHeaders["Acceptance:"] = TOKEN_ACCEPTANCE;
|
|
knownHeaders["acc-name:"] = TOKEN_ACCNAME;
|
|
knownHeaders["tool:"] = TOKEN_TOOL;
|
|
knownHeaders["name:"] = TOKEN_NAME;
|
|
knownHeaders["properties:"] = TOKEN_PROPERTIES;
|
|
}
|
|
|
|
/** Get the next token from the input stream. */
|
|
Token nextToken() {
|
|
// first, skip any whitespace
|
|
skip();
|
|
if (ch == EOF) return Token(TOKEN_EOF, line, col);
|
|
|
|
// handle the simple syntactic elements
|
|
switch (ch) {
|
|
case '!': return Token(TOKEN_NOT, line, col);
|
|
case '&': return Token(TOKEN_AND, line, col);
|
|
case '|': return Token(TOKEN_OR, line, col);
|
|
case '(': return Token(TOKEN_LPARENTH, line, col);
|
|
case ')': return Token(TOKEN_RPARENTH, line, col);
|
|
case '[': return Token(TOKEN_LBRACKET, line, col);
|
|
case ']': return Token(TOKEN_RBRACKET, line, col);
|
|
case '{': return Token(TOKEN_LCURLY, line, col);
|
|
case '}': return Token(TOKEN_RCURLY, line, col);
|
|
}
|
|
|
|
// remember where the token began
|
|
unsigned int lineStart = line;
|
|
unsigned int colStart = col;
|
|
|
|
// handle --XYZ-- style markers
|
|
if (ch == '-') {
|
|
unsigned int index=0;
|
|
bool canBeAbort = true;
|
|
bool canBeBody = true;
|
|
bool canBeEnd = true;
|
|
std::string abort("-ABORT--");
|
|
std::string body("-BODY--");
|
|
std::string end("-END--");
|
|
|
|
while (canBeAbort || canBeBody || canBeEnd) {
|
|
nextChar();
|
|
if (ch == EOF) {throw error("Premature end-of-file inside token", lineStart, colStart);}
|
|
if (canBeAbort && ch == abort.at(index)) {
|
|
if (index == abort.length()-1) {
|
|
return Token(TOKEN_ABORT, lineStart, colStart);
|
|
}
|
|
} else {
|
|
canBeAbort=false;
|
|
}
|
|
if (canBeBody && ch == body.at(index)) {
|
|
if (index == body.length()-1) {
|
|
return Token(TOKEN_BODY, lineStart, colStart);
|
|
}
|
|
} else {
|
|
canBeBody=false;
|
|
}
|
|
if (canBeEnd && ch == end.at(index)) {
|
|
if (index == end.length()-1) {
|
|
return Token(TOKEN_END, lineStart, colStart);
|
|
}
|
|
} else {
|
|
canBeEnd=false;
|
|
}
|
|
|
|
index++;
|
|
if (index >= abort.length()) canBeAbort = false;
|
|
if (index >= body.length()) canBeBody = false;
|
|
if (index >= end.length()) canBeEnd = false;
|
|
}
|
|
throw error("Lexical error: For token starting with '-', expected either '--BODY--', '--END--' or '--ABORT--'", lineStart, colStart);
|
|
}
|
|
|
|
// handle quoted strings
|
|
if (ch == '"') {
|
|
std::string text(1, (char)ch);
|
|
bool last_was_quote = false;
|
|
while (true) {
|
|
nextChar();
|
|
if (ch == EOF) {throw error("Premature end-of-file in quoted string", lineStart, colStart);}
|
|
text+=(char)ch;
|
|
if (ch == '"' && !last_was_quote) break;
|
|
if (ch == '\\' && !last_was_quote) {
|
|
last_was_quote = true;
|
|
} else {
|
|
last_was_quote = false;
|
|
}
|
|
}
|
|
|
|
return Token(TOKEN_STRING, text, lineStart, colStart);
|
|
}
|
|
|
|
// handle integers
|
|
if (ch >= '0' && ch <= '9') {
|
|
std::string text(1, (char)ch);
|
|
while (true) {
|
|
int next = peekChar();
|
|
if (next >= '0' && next <= '9') {
|
|
nextChar();
|
|
text+=(char)ch;
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (text.at(0)=='0' && text.length() > 1) {
|
|
throw error("Syntax error parsing integer, starts with 0: "+text, lineStart, colStart);
|
|
}
|
|
|
|
try {
|
|
unsigned int vInteger = std::stoi(text);
|
|
return Token(vInteger, lineStart, colStart);
|
|
} catch (std::invalid_argument& e) {
|
|
throw error("Syntax error: "+text+" is not an integer", lineStart, colStart);
|
|
} catch (std::out_of_range& e) {
|
|
throw error("Syntax error: integer "+text+" is too big to represent as an unsigned int", lineStart, colStart);
|
|
}
|
|
|
|
} else if (ch == '@' || ch == '_' || (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z')) {
|
|
// handle identifiers, @alias-names, headers, t and f
|
|
std::string text(1, (char)ch);
|
|
|
|
bool alias = (ch == '@');
|
|
while (true) {
|
|
int next = peekChar();
|
|
if (next == EOF) break;
|
|
if (next == ':') {
|
|
if (alias) break;
|
|
// consume ':'
|
|
nextChar();
|
|
text+=':';
|
|
break;
|
|
}
|
|
if (next == '_' ||
|
|
next == '-' ||
|
|
(next >= 'a' && next <= 'z') ||
|
|
(next >= 'A' && next <= 'Z') ||
|
|
(next >= '0' && next <= '9')) {
|
|
nextChar();
|
|
text+=(char)ch;
|
|
continue;
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (alias) {
|
|
return Token(TOKEN_ALIAS_NAME, text, lineStart, colStart);
|
|
}
|
|
|
|
if (text.back() == ':') {
|
|
auto it = knownHeaders.find(text);
|
|
if (it != knownHeaders.end()) {
|
|
return Token((*it).second, text, lineStart, colStart);
|
|
}
|
|
return Token(TOKEN_HEADER_NAME, text, lineStart, colStart);
|
|
}
|
|
if (text == "t") {
|
|
return Token(TOKEN_TRUE, text, lineStart, colStart);
|
|
} else if (text == "f") {
|
|
return Token(TOKEN_FALSE, text, lineStart, colStart);
|
|
}
|
|
return Token(TOKEN_IDENT, text, lineStart, colStart);
|
|
}
|
|
|
|
throw error("Syntax error, illegal character '"+std::string(1, (char)ch)+"'", lineStart, colStart);
|
|
}
|
|
|
|
private:
|
|
|
|
/** Skip whitespace. */
|
|
void skip() {
|
|
while (true) {
|
|
nextChar();
|
|
if (ch == EOF) { // EOF
|
|
return;
|
|
}
|
|
if (ch == '/') {
|
|
skipComment();
|
|
continue;
|
|
}
|
|
if (ch == ' ' || ch == '\t') {
|
|
continue;
|
|
}
|
|
if (ch == '\n' || ch == '\r') {
|
|
line++;
|
|
col=0;
|
|
continue;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
/** Skip a comment */
|
|
void skipComment() {
|
|
nextChar();
|
|
if (ch != '*') {
|
|
throw error("Malformed start of comment", line, col);
|
|
}
|
|
bool last_was_slash = false;
|
|
bool last_was_star = false;
|
|
unsigned int nesting = 0;
|
|
while (true) {
|
|
nextChar();
|
|
if (ch == EOF) {throw error("End-of-file inside comment", line, col);}
|
|
if (ch == '\n' || ch == '\r') {
|
|
line++;
|
|
col=0;
|
|
last_was_slash = false;
|
|
last_was_star = false;
|
|
continue;
|
|
}
|
|
if (ch == '/') {
|
|
if (last_was_star) {
|
|
if (nesting == 0) {
|
|
return;
|
|
} else {
|
|
nesting--;
|
|
}
|
|
} else {
|
|
last_was_slash = true;
|
|
}
|
|
continue;
|
|
}
|
|
if (ch == '*') {
|
|
if (last_was_slash) {
|
|
nesting++;
|
|
} else {
|
|
last_was_star = true;
|
|
continue;
|
|
}
|
|
}
|
|
last_was_slash = false;
|
|
last_was_star = false;
|
|
}
|
|
}
|
|
|
|
/** Read the next char in the input stream, store in `ch` */
|
|
void nextChar() {
|
|
ch = in.get();
|
|
if (ch != EOF) {
|
|
col++;
|
|
}
|
|
}
|
|
|
|
/** Peek at the next char in the input stream without consuming */
|
|
int peekChar() {
|
|
return in.peek();
|
|
}
|
|
|
|
/**
|
|
* Construct a HOAParserExeption for a lexer error.
|
|
* @param msg the error message
|
|
* @param errLine the line number where the error occured
|
|
* @param errCol column number where the error occured
|
|
*/
|
|
HOAParserException error(const std::string& msg, unsigned int errLine, unsigned int errCol) {
|
|
return HOAParserException(msg+" (at line "+std::to_string(errLine)+", col "+std::to_string(errCol)+")", errLine, errCol);
|
|
}
|
|
|
|
private:
|
|
/** The input stream */
|
|
std::istream& in;
|
|
/** The current line number */
|
|
unsigned int line;
|
|
/** The current column number */
|
|
unsigned int col;
|
|
/** The current character (or EOF) */
|
|
int ch;
|
|
|
|
/** A map for mapping the known header names to the corresponding token types */
|
|
std::map<std::string, TokenType> knownHeaders;
|
|
};
|
|
|
|
}
|
|
|
|
#endif
|