// tempest/resources/3rdparty/cpphoafparser-0.99.2/include/cpphoafparser/parser/hoa_lexer.hh

//==============================================================================
//
//  Copyright (c) 2015-
//  Authors:
//  * Joachim Klein <klein@tcs.inf.tu-dresden.de>
//  * David Mueller <david.mueller@tcs.inf.tu-dresden.de>
//
//------------------------------------------------------------------------------
//
//  This file is part of the cpphoafparser library,
//      http://automata.tools/hoa/cpphoafparser/
//
//  The cpphoafparser library is free software; you can redistribute it and/or
//  modify it under the terms of the GNU Lesser General Public
//  License as published by the Free Software Foundation; either
//  version 2.1 of the License, or (at your option) any later version.
//
//  The cpphoafparser library is distributed in the hope that it will be useful,
//  but WITHOUT ANY WARRANTY; without even the implied warranty of
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
//  Lesser General Public License for more details.
//
//  You should have received a copy of the GNU Lesser General Public
//  License along with this library; if not, write to the Free Software
//  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
//
//==============================================================================

#ifndef CPPHOAFPARSER_HOALEXER_H
#define CPPHOAFPARSER_HOALEXER_H

#include <cstdio>
#include <istream>
#include <limits>
#include <map>
#include <ostream>
#include <stdexcept>
#include <string>

#include "cpphoafparser/parser/hoa_parser_exception.hh"

namespace cpphoafparser {

/** Lexer for tokenizing a HOA stream (used internally by HOAParser). */
class HOALexer {
public:
  /** The type of the tokens in a HOA stream. */
  enum TokenType {
    TOKEN_INT,
    TOKEN_IDENT,
    TOKEN_STRING,
    TOKEN_HEADER_NAME,
    TOKEN_ALIAS_NAME,

    TOKEN_EOF,

    TOKEN_BODY,
    TOKEN_END,
    TOKEN_ABORT,
    TOKEN_HOA,
    TOKEN_STATE,
    TOKEN_STATES,
    TOKEN_START,
    TOKEN_AP,
    TOKEN_ALIAS,
    TOKEN_ACCEPTANCE,
    TOKEN_ACCNAME,
    TOKEN_TOOL,
    TOKEN_NAME,
    TOKEN_PROPERTIES,

    // Punctuation, etc.
    TOKEN_NOT,
    TOKEN_AND,
    TOKEN_OR,
    TOKEN_LPARENTH,
    TOKEN_RPARENTH,
    TOKEN_LBRACKET,
    TOKEN_RBRACKET,
    TOKEN_LCURLY,
    TOKEN_RCURLY,
    TOKEN_TRUE,
    TOKEN_FALSE
  };

  /** A token in the HOA stream. */
  struct Token {
    /** The kind of the token. */
    TokenType kind;
    /** The string representation of this token (if applicable) */
    std::string vString;
    /** The integer representation of this token (if applicable) */
    unsigned int vInteger;

    /** The line where this token started */
    unsigned int line;
    /** The column where this token started */
    unsigned int col;

    /** EOF (end-of-file) constructor. */
    Token() : kind(TOKEN_EOF), vString(""), vInteger(0), line(0), col(0) {}
    /** Constructor for syntactic element */
    Token(TokenType kind, unsigned int line, unsigned int col) : kind(kind), vString(""), vInteger(0), line(line), col(col) {}
    /** Constructor for a token having variable string content (e.g., TOKEN_IDENTIFIER, TOKEN_ALIAS, TOKEN_STRING, ...) */
    Token(TokenType kind, const std::string vString, unsigned int line, unsigned int col) : kind(kind), vString(vString), vInteger(0), line(line), col(col) {}
    /** Constructor for an unsigned integer token */
    Token(unsigned int vInteger, unsigned int line, unsigned int col) : kind(TOKEN_INT), vString(""), vInteger(vInteger), line(line), col(col) {}

    /** Returns true if this token represents the end-of-file. */
    bool isEOF() const {return kind == TOKEN_EOF;}

    /** Returns a string name for the given token type. */
    static std::string typeAsString(TokenType kind) {
      switch (kind) {
      case TOKEN_INT: return std::string("INT");
      case TOKEN_IDENT: return std::string("IDENT");
      case TOKEN_STRING: return std::string("STRING");
      case TOKEN_HEADER_NAME: return std::string("HEADER_NAME");
      case TOKEN_ALIAS_NAME: return std::string("ALIAS_NAME");

      case TOKEN_EOF: return std::string("EOF");

      case TOKEN_BODY: return std::string("BODY");
      case TOKEN_END: return std::string("END");
      case TOKEN_ABORT: return std::string("ABORT");
      case TOKEN_HOA: return std::string("HOA");
      case TOKEN_STATE: return std::string("STATE");
      case TOKEN_STATES: return std::string("STATES");
      case TOKEN_START: return std::string("START");
      case TOKEN_AP: return std::string("AP");
      case TOKEN_ALIAS: return std::string("ALIAS");
      case TOKEN_ACCEPTANCE: return std::string("ACCEPTANCE");
      case TOKEN_ACCNAME: return std::string("ACCNAME");
      case TOKEN_TOOL: return std::string("TOOL");
      case TOKEN_NAME: return std::string("NAME");
      case TOKEN_PROPERTIES: return std::string("PROPERTIES");

      // Punctuation: etc.
      case TOKEN_NOT: return std::string("NOT");
      case TOKEN_AND: return std::string("AND");
      case TOKEN_OR: return std::string("OR");
      case TOKEN_LPARENTH: return std::string("LPARENTH");
      case TOKEN_RPARENTH: return std::string("RPARENTH");
      case TOKEN_LBRACKET: return std::string("LBRACKET");
      case TOKEN_RBRACKET: return std::string("RBRACKET");
      case TOKEN_LCURLY: return std::string("LCURLY");
      case TOKEN_RCURLY: return std::string("RCURLY");
      case TOKEN_TRUE: return std::string("TRUE");
      case TOKEN_FALSE: return std::string("FALSE");
      }
      throw std::logic_error("Unhandled token type");
    }

    /** Returns a string name for the given token type (for use in error messages). */
    static std::string forErrorMessage(TokenType kind) {
      switch (kind) {
      case TOKEN_INT: return std::string("INTEGER");
      case TOKEN_IDENT: return std::string("IDENTIFIER");
      case TOKEN_STRING: return std::string("STRING");
      case TOKEN_HEADER_NAME: return std::string("HEADER_NAME");
      case TOKEN_ALIAS_NAME: return std::string("ALIAS_NAME");

      case TOKEN_EOF: return std::string("END-OF_FILE");

      case TOKEN_BODY: return std::string("--BODY--");
      case TOKEN_END: return std::string("--END--");
      case TOKEN_ABORT: return std::string("--ABORT--");
      case TOKEN_HOA: return std::string("HOA:");
      case TOKEN_STATE: return std::string("State:");
      case TOKEN_STATES: return std::string("States:");
      case TOKEN_START: return std::string("Start:");
      case TOKEN_AP: return std::string("AP:");
      case TOKEN_ALIAS: return std::string("Alias:");
      case TOKEN_ACCEPTANCE: return std::string("Acceptance:");
      case TOKEN_ACCNAME: return std::string("acc-name:");
      case TOKEN_TOOL: return std::string("tool:");
      case TOKEN_NAME: return std::string("name:");
      case TOKEN_PROPERTIES: return std::string("properties:");

      // Punctuation: etc.
      case TOKEN_NOT: return std::string("!");
      case TOKEN_AND: return std::string("&");
      case TOKEN_OR: return std::string("|");
      case TOKEN_LPARENTH: return std::string("(");
      case TOKEN_RPARENTH: return std::string(")");
      case TOKEN_LBRACKET: return std::string("[");
      case TOKEN_RBRACKET: return std::string("]");
      case TOKEN_LCURLY: return std::string("{");
      case TOKEN_RCURLY: return std::string("}");
      case TOKEN_TRUE: return std::string("t");
      case TOKEN_FALSE: return std::string("f");
      }
      throw std::logic_error("Unhandled token type");
    }

    /** Returns a string representation of a given token (for error messages). */
    static std::string forErrorMessage(Token token) {
      switch (token.kind) {
      case TOKEN_INT: return std::string("INTEGER ")+std::to_string(token.vInteger);
      case TOKEN_IDENT: return std::string("IDENTIFIER ")+token.vString;
      case TOKEN_STRING: return std::string("STRING ")+token.vString;
      case TOKEN_HEADER_NAME: return std::string("HEADER ")+token.vString;
      case TOKEN_ALIAS_NAME: return std::string("ALIAS ")+token.vString;

      case TOKEN_EOF: return std::string("END-OF-FILE");

      case TOKEN_BODY: return std::string("--BODY--");
      case TOKEN_END: return std::string("--END--");
      case TOKEN_ABORT: return std::string("--ABORT--");
      case TOKEN_HOA: return std::string("HEADER HOA");
      case TOKEN_STATES: return std::string("HEADER States");
      case TOKEN_START: return std::string("HEADERr Start");
      case TOKEN_AP: return std::string("HEADER AP");
      case TOKEN_ALIAS: return std::string("HEADER Alias");
      case TOKEN_ACCEPTANCE: return std::string("HEADER Acceptance");
      case TOKEN_ACCNAME: return std::string("HEADER acc-name");
      case TOKEN_TOOL: return std::string("HEADER tool");
      case TOKEN_NAME: return std::string("HEADER name");
      case TOKEN_PROPERTIES: return std::string("HEADER properties");

      case TOKEN_STATE: return std::string("DEFINITION State");

      // Punctuation: etc.
      case TOKEN_NOT: return std::string("!");
      case TOKEN_AND: return std::string("&");
      case TOKEN_OR: return std::string("|");
      case TOKEN_LPARENTH: return std::string("(");
      case TOKEN_RPARENTH: return std::string(")");
      case TOKEN_LBRACKET: return std::string("[");
      case TOKEN_RBRACKET: return std::string("]");
      case TOKEN_LCURLY: return std::string("{");
      case TOKEN_RCURLY: return std::string("}");
      case TOKEN_TRUE: return std::string("TRUE t");
      case TOKEN_FALSE: return std::string("FALSE f");
      }
      throw std::logic_error("Unhandled token type");
    }

    /** Output function for a given token. */
    friend std::ostream& operator<<(std::ostream& out, const Token& token) {
      out << "<" << token.typeAsString(token.kind) << "> ";
      if (token.kind == TOKEN_INT) {
        out << token.vInteger;
      } else {
        out << token.vString;
      }
      out << "     (" << token.line << "," << token.col << ")";
      return out;
    }
  };

  /** Constructor for a lexer, reading from the given input stream. */
  HOALexer(std::istream& in) 
  : in(in), line(1), col(0), ch(0) {
    // The headers we know
    knownHeaders["HOA:"] = TOKEN_HOA;    
    knownHeaders["State:"] = TOKEN_STATE;
    knownHeaders["States:"] = TOKEN_STATES;
    knownHeaders["Start:"] = TOKEN_START;
    knownHeaders["AP:"] = TOKEN_AP;
    knownHeaders["Alias:"] = TOKEN_ALIAS;
    knownHeaders["Acceptance:"] = TOKEN_ACCEPTANCE;
    knownHeaders["acc-name:"] = TOKEN_ACCNAME;
    knownHeaders["tool:"] = TOKEN_TOOL;
    knownHeaders["name:"] = TOKEN_NAME;
    knownHeaders["properties:"] = TOKEN_PROPERTIES;
  }

  /** Get the next token from the input stream. */
  Token nextToken() {
    // first, skip any whitespace
    skip();
    if (ch == EOF) return Token(TOKEN_EOF, line, col);

    // handle the simple syntactic elements
    switch (ch) {
    case '!': return Token(TOKEN_NOT, line, col);
    case '&': return Token(TOKEN_AND, line, col);
    case '|': return Token(TOKEN_OR, line, col);
    case '(': return Token(TOKEN_LPARENTH, line, col);
    case ')': return Token(TOKEN_RPARENTH, line, col);
    case '[': return Token(TOKEN_LBRACKET, line, col);
    case ']': return Token(TOKEN_RBRACKET, line, col);
    case '{': return Token(TOKEN_LCURLY, line, col);
    case '}': return Token(TOKEN_RCURLY, line, col);
    }

    // remember where the token began
    unsigned int lineStart = line;
    unsigned int colStart = col;

    // handle --XYZ-- style markers
    if (ch == '-') {
      unsigned int index=0;
      bool canBeAbort = true;
      bool canBeBody  = true;
      bool canBeEnd   = true;
      std::string abort("-ABORT--");
      std::string body("-BODY--");
      std::string end("-END--");

      while (canBeAbort || canBeBody || canBeEnd) {
        nextChar();
        if (ch == EOF) {throw error("Premature end-of-file inside token", lineStart, colStart);}
        if (canBeAbort && ch == abort.at(index)) {
          if (index == abort.length()-1) {
            return Token(TOKEN_ABORT, lineStart, colStart);
          }
        } else {
          canBeAbort=false;
        }
        if (canBeBody && ch == body.at(index)) {
          if (index == body.length()-1) {
            return Token(TOKEN_BODY, lineStart, colStart);
          }
        } else {
          canBeBody=false;
        }
        if (canBeEnd && ch == end.at(index)) {
          if (index == end.length()-1) {
            return Token(TOKEN_END, lineStart, colStart);
          }
        } else {
          canBeEnd=false;
        }

        index++;
        if (index >= abort.length()) canBeAbort = false;
        if (index >= body.length()) canBeBody = false;
        if (index >= end.length()) canBeEnd = false;
      }
      throw error("Lexical error: For token starting with '-', expected either '--BODY--', '--END--' or '--ABORT--'", lineStart, colStart);
    }

    // handle quoted strings
    if (ch == '"') {
      std::string text(1, (char)ch);
      bool last_was_quote = false;
      while (true) {
        nextChar();
        if (ch == EOF) {throw error("Premature end-of-file in quoted string", lineStart, colStart);}
        text+=(char)ch;
        if (ch == '"' && !last_was_quote) break;
        if (ch == '\\' && !last_was_quote) {
          last_was_quote = true;
        } else {
          last_was_quote = false;
        }
      }

      return Token(TOKEN_STRING, text, lineStart, colStart);
    }

    // handle integers
    if (ch >= '0' && ch <= '9') {
      std::string text(1, (char)ch);
      while (true) {
        int next = peekChar();
        if (next >= '0' && next <= '9') {
          nextChar();
          text+=(char)ch;
        } else {
          break;
        }
      }

      if (text.at(0)=='0' && text.length() > 1) {
        throw error("Syntax error parsing integer, starts with 0: "+text, lineStart, colStart);
      }

      try {
        unsigned int vInteger = std::stoi(text);
        return Token(vInteger, lineStart, colStart);
      } catch (std::invalid_argument& e) {
        throw error("Syntax error: "+text+" is not an integer", lineStart, colStart);
      } catch (std::out_of_range& e) {
        throw error("Syntax error: integer "+text+" is too big to represent as an unsigned int", lineStart, colStart);
      }

    } else if (ch == '@' || ch == '_' || (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z')) {
      // handle identifiers, @alias-names, headers, t and f
      std::string text(1, (char)ch);

      bool alias = (ch == '@');
      while (true) {
        int next = peekChar();
        if (next == EOF) break;
        if (next == ':') {
          if (alias) break;
          // consume ':'
          nextChar();
          text+=':';
          break;
        }
        if (next == '_' ||
            next == '-' ||
            (next >= 'a' && next <= 'z') ||
            (next >= 'A' && next <= 'Z') ||
            (next >= '0' && next <= '9')) {
          nextChar();
          text+=(char)ch;
          continue;
        } else {
          break;
        }
      }

      if (alias) {
        return Token(TOKEN_ALIAS_NAME, text, lineStart, colStart);
      }

      if (text.back() == ':') {
        auto it = knownHeaders.find(text);
        if (it != knownHeaders.end()) {
          return Token((*it).second, text, lineStart, colStart);
        }
        return Token(TOKEN_HEADER_NAME, text, lineStart, colStart);
      }
      if (text == "t") {
        return Token(TOKEN_TRUE, text, lineStart, colStart);
      } else if (text == "f") {
        return Token(TOKEN_FALSE, text, lineStart, colStart);
      }
      return Token(TOKEN_IDENT, text, lineStart, colStart);
    }

    throw error("Syntax error, illegal character '"+std::string(1, (char)ch)+"'", lineStart, colStart);
  }

private:

  /** Skip whitespace. */
  void skip() {
    while (true) {
      nextChar();
      if (ch == EOF) { // EOF
        return;
      }
      if (ch == '/') {
        skipComment();
        continue;
      }
      if (ch == ' ' || ch == '\t') {
        continue;
      }
      if (ch == '\n' || ch == '\r') {
        line++;
        col=0;
        continue;
      }
      break;
    }
  }

  /** Skip a comment */
  void skipComment() {
    nextChar();
    if (ch != '*') {
      throw error("Malformed start of comment", line, col);
    }
    bool last_was_slash = false;
    bool last_was_star = false;
    unsigned int nesting = 0;
    while (true) {
      nextChar();
      if (ch == EOF) {throw error("End-of-file inside comment", line, col);}
      if (ch == '\n' || ch == '\r') {
        line++;
        col=0;
        last_was_slash = false;
        last_was_star = false;
        continue;
      }
      if (ch == '/') {
        if (last_was_star) {
          if (nesting == 0) {
            return;
          } else {
            nesting--;
          }
        } else {
          last_was_slash = true;
        }
        continue;
      }
      if (ch == '*') {
        if (last_was_slash) {
          nesting++;
        } else {
          last_was_star = true;
          continue;
        }
      }
      last_was_slash = false;
      last_was_star = false;
    }
  }

  /** Read the next char in the input stream, store in `ch` */
  void nextChar() {
    ch = in.get();
    if (ch != EOF) {
      col++;
    }
  }

  /** Peek at the next char in the input stream without consuming */
  int peekChar() {
    return in.peek();
  }

  /**
   * Construct a HOAParserExeption for a lexer error.
   * @param msg the error message
   * @param errLine the line number where the error occured
   * @param errCol column number where the error occured
   */
  HOAParserException error(const std::string& msg, unsigned int errLine, unsigned int errCol) {
    return HOAParserException(msg+" (at line "+std::to_string(errLine)+", col "+std::to_string(errCol)+")", errLine, errCol);
  }

private:
  /** The input stream */
  std::istream& in;
  /** The current line number */
  unsigned int line;
  /** The current column number */
  unsigned int col;
  /** The current character (or EOF) */
  int ch;

  /** A map for mapping the known header names to the corresponding token types */
  std::map<std::string, TokenType> knownHeaders;
};

}

#endif