You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

534 lines
18 KiB

//==============================================================================
//
// Copyright (c) 2015-
// Authors:
// * Joachim Klein <klein@tcs.inf.tu-dresden.de>
// * David Mueller <david.mueller@tcs.inf.tu-dresden.de>
//
//------------------------------------------------------------------------------
//
// This file is part of the cpphoafparser library,
// http://automata.tools/hoa/cpphoafparser/
//
// The cpphoafparser library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// The cpphoafparser library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
//
//==============================================================================
  28. #ifndef CPPHOAFPARSER_HOALEXER_H
  29. #define CPPHOAFPARSER_HOALEXER_H
#include <cstdio>
#include <istream>
#include <limits>
#include <map>
#include <ostream>
#include <stdexcept>
#include <string>

#include "cpphoafparser/parser/hoa_parser_exception.hh"
  34. namespace cpphoafparser {
  35. /** Lexer for tokenizing a HOA stream (used internally by HOAParser). */
  36. class HOALexer {
  37. public:
  38. /** The type of the tokens in a HOA stream. */
  39. enum TokenType {
  40. TOKEN_INT,
  41. TOKEN_IDENT,
  42. TOKEN_STRING,
  43. TOKEN_HEADER_NAME,
  44. TOKEN_ALIAS_NAME,
  45. TOKEN_EOF,
  46. TOKEN_BODY,
  47. TOKEN_END,
  48. TOKEN_ABORT,
  49. TOKEN_HOA,
  50. TOKEN_STATE,
  51. TOKEN_STATES,
  52. TOKEN_START,
  53. TOKEN_AP,
  54. TOKEN_ALIAS,
  55. TOKEN_ACCEPTANCE,
  56. TOKEN_ACCNAME,
  57. TOKEN_TOOL,
  58. TOKEN_NAME,
  59. TOKEN_PROPERTIES,
  60. // Punctuation, etc.
  61. TOKEN_NOT,
  62. TOKEN_AND,
  63. TOKEN_OR,
  64. TOKEN_LPARENTH,
  65. TOKEN_RPARENTH,
  66. TOKEN_LBRACKET,
  67. TOKEN_RBRACKET,
  68. TOKEN_LCURLY,
  69. TOKEN_RCURLY,
  70. TOKEN_TRUE,
  71. TOKEN_FALSE
  72. };
  73. /** A token in the HOA stream. */
  74. struct Token {
  75. /** The kind of the token. */
  76. TokenType kind;
  77. /** The string representation of this token (if applicable) */
  78. std::string vString;
  79. /** The integer representation of this token (if applicable) */
  80. unsigned int vInteger;
  81. /** The line where this token started */
  82. unsigned int line;
  83. /** The column where this token started */
  84. unsigned int col;
  85. /** EOF (end-of-file) constructor. */
  86. Token() : kind(TOKEN_EOF), vString(""), vInteger(0), line(0), col(0) {}
  87. /** Constructor for syntactic element */
  88. Token(TokenType kind, unsigned int line, unsigned int col) : kind(kind), vString(""), vInteger(0), line(line), col(col) {}
  89. /** Constructor for a token having variable string content (e.g., TOKEN_IDENTIFIER, TOKEN_ALIAS, TOKEN_STRING, ...) */
  90. Token(TokenType kind, const std::string vString, unsigned int line, unsigned int col) : kind(kind), vString(vString), vInteger(0), line(line), col(col) {}
  91. /** Constructor for an unsigned integer token */
  92. Token(unsigned int vInteger, unsigned int line, unsigned int col) : kind(TOKEN_INT), vString(""), vInteger(vInteger), line(line), col(col) {}
  93. /** Returns true if this token represents the end-of-file. */
  94. bool isEOF() const {return kind == TOKEN_EOF;}
  95. /** Returns a string name for the given token type. */
  96. static std::string typeAsString(TokenType kind) {
  97. switch (kind) {
  98. case TOKEN_INT: return std::string("INT");
  99. case TOKEN_IDENT: return std::string("IDENT");
  100. case TOKEN_STRING: return std::string("STRING");
  101. case TOKEN_HEADER_NAME: return std::string("HEADER_NAME");
  102. case TOKEN_ALIAS_NAME: return std::string("ALIAS_NAME");
  103. case TOKEN_EOF: return std::string("EOF");
  104. case TOKEN_BODY: return std::string("BODY");
  105. case TOKEN_END: return std::string("END");
  106. case TOKEN_ABORT: return std::string("ABORT");
  107. case TOKEN_HOA: return std::string("HOA");
  108. case TOKEN_STATE: return std::string("STATE");
  109. case TOKEN_STATES: return std::string("STATES");
  110. case TOKEN_START: return std::string("START");
  111. case TOKEN_AP: return std::string("AP");
  112. case TOKEN_ALIAS: return std::string("ALIAS");
  113. case TOKEN_ACCEPTANCE: return std::string("ACCEPTANCE");
  114. case TOKEN_ACCNAME: return std::string("ACCNAME");
  115. case TOKEN_TOOL: return std::string("TOOL");
  116. case TOKEN_NAME: return std::string("NAME");
  117. case TOKEN_PROPERTIES: return std::string("PROPERTIES");
  118. // Punctuation: etc.
  119. case TOKEN_NOT: return std::string("NOT");
  120. case TOKEN_AND: return std::string("AND");
  121. case TOKEN_OR: return std::string("OR");
  122. case TOKEN_LPARENTH: return std::string("LPARENTH");
  123. case TOKEN_RPARENTH: return std::string("RPARENTH");
  124. case TOKEN_LBRACKET: return std::string("LBRACKET");
  125. case TOKEN_RBRACKET: return std::string("RBRACKET");
  126. case TOKEN_LCURLY: return std::string("LCURLY");
  127. case TOKEN_RCURLY: return std::string("RCURLY");
  128. case TOKEN_TRUE: return std::string("TRUE");
  129. case TOKEN_FALSE: return std::string("FALSE");
  130. }
  131. throw std::logic_error("Unhandled token type");
  132. }
  133. /** Returns a string name for the given token type (for use in error messages). */
  134. static std::string forErrorMessage(TokenType kind) {
  135. switch (kind) {
  136. case TOKEN_INT: return std::string("INTEGER");
  137. case TOKEN_IDENT: return std::string("IDENTIFIER");
  138. case TOKEN_STRING: return std::string("STRING");
  139. case TOKEN_HEADER_NAME: return std::string("HEADER_NAME");
  140. case TOKEN_ALIAS_NAME: return std::string("ALIAS_NAME");
  141. case TOKEN_EOF: return std::string("END-OF_FILE");
  142. case TOKEN_BODY: return std::string("--BODY--");
  143. case TOKEN_END: return std::string("--END--");
  144. case TOKEN_ABORT: return std::string("--ABORT--");
  145. case TOKEN_HOA: return std::string("HOA:");
  146. case TOKEN_STATE: return std::string("State:");
  147. case TOKEN_STATES: return std::string("States:");
  148. case TOKEN_START: return std::string("Start:");
  149. case TOKEN_AP: return std::string("AP:");
  150. case TOKEN_ALIAS: return std::string("Alias:");
  151. case TOKEN_ACCEPTANCE: return std::string("Acceptance:");
  152. case TOKEN_ACCNAME: return std::string("acc-name:");
  153. case TOKEN_TOOL: return std::string("tool:");
  154. case TOKEN_NAME: return std::string("name:");
  155. case TOKEN_PROPERTIES: return std::string("properties:");
  156. // Punctuation: etc.
  157. case TOKEN_NOT: return std::string("!");
  158. case TOKEN_AND: return std::string("&");
  159. case TOKEN_OR: return std::string("|");
  160. case TOKEN_LPARENTH: return std::string("(");
  161. case TOKEN_RPARENTH: return std::string(")");
  162. case TOKEN_LBRACKET: return std::string("[");
  163. case TOKEN_RBRACKET: return std::string("]");
  164. case TOKEN_LCURLY: return std::string("{");
  165. case TOKEN_RCURLY: return std::string("}");
  166. case TOKEN_TRUE: return std::string("t");
  167. case TOKEN_FALSE: return std::string("f");
  168. }
  169. throw std::logic_error("Unhandled token type");
  170. }
  171. /** Returns a string representation of a given token (for error messages). */
  172. static std::string forErrorMessage(Token token) {
  173. switch (token.kind) {
  174. case TOKEN_INT: return std::string("INTEGER ")+std::to_string(token.vInteger);
  175. case TOKEN_IDENT: return std::string("IDENTIFIER ")+token.vString;
  176. case TOKEN_STRING: return std::string("STRING ")+token.vString;
  177. case TOKEN_HEADER_NAME: return std::string("HEADER ")+token.vString;
  178. case TOKEN_ALIAS_NAME: return std::string("ALIAS ")+token.vString;
  179. case TOKEN_EOF: return std::string("END-OF-FILE");
  180. case TOKEN_BODY: return std::string("--BODY--");
  181. case TOKEN_END: return std::string("--END--");
  182. case TOKEN_ABORT: return std::string("--ABORT--");
  183. case TOKEN_HOA: return std::string("HEADER HOA");
  184. case TOKEN_STATES: return std::string("HEADER States");
  185. case TOKEN_START: return std::string("HEADERr Start");
  186. case TOKEN_AP: return std::string("HEADER AP");
  187. case TOKEN_ALIAS: return std::string("HEADER Alias");
  188. case TOKEN_ACCEPTANCE: return std::string("HEADER Acceptance");
  189. case TOKEN_ACCNAME: return std::string("HEADER acc-name");
  190. case TOKEN_TOOL: return std::string("HEADER tool");
  191. case TOKEN_NAME: return std::string("HEADER name");
  192. case TOKEN_PROPERTIES: return std::string("HEADER properties");
  193. case TOKEN_STATE: return std::string("DEFINITION State");
  194. // Punctuation: etc.
  195. case TOKEN_NOT: return std::string("!");
  196. case TOKEN_AND: return std::string("&");
  197. case TOKEN_OR: return std::string("|");
  198. case TOKEN_LPARENTH: return std::string("(");
  199. case TOKEN_RPARENTH: return std::string(")");
  200. case TOKEN_LBRACKET: return std::string("[");
  201. case TOKEN_RBRACKET: return std::string("]");
  202. case TOKEN_LCURLY: return std::string("{");
  203. case TOKEN_RCURLY: return std::string("}");
  204. case TOKEN_TRUE: return std::string("TRUE t");
  205. case TOKEN_FALSE: return std::string("FALSE f");
  206. }
  207. throw std::logic_error("Unhandled token type");
  208. }
  209. /** Output function for a given token. */
  210. friend std::ostream& operator<<(std::ostream& out, const Token& token) {
  211. out << "<" << token.typeAsString(token.kind) << "> ";
  212. if (token.kind == TOKEN_INT) {
  213. out << token.vInteger;
  214. } else {
  215. out << token.vString;
  216. }
  217. out << " (" << token.line << "," << token.col << ")";
  218. return out;
  219. }
  220. };
  221. /** Constructor for a lexer, reading from the given input stream. */
  222. HOALexer(std::istream& in)
  223. : in(in), line(1), col(0), ch(0) {
  224. // The headers we know
  225. knownHeaders["HOA:"] = TOKEN_HOA;
  226. knownHeaders["State:"] = TOKEN_STATE;
  227. knownHeaders["States:"] = TOKEN_STATES;
  228. knownHeaders["Start:"] = TOKEN_START;
  229. knownHeaders["AP:"] = TOKEN_AP;
  230. knownHeaders["Alias:"] = TOKEN_ALIAS;
  231. knownHeaders["Acceptance:"] = TOKEN_ACCEPTANCE;
  232. knownHeaders["acc-name:"] = TOKEN_ACCNAME;
  233. knownHeaders["tool:"] = TOKEN_TOOL;
  234. knownHeaders["name:"] = TOKEN_NAME;
  235. knownHeaders["properties:"] = TOKEN_PROPERTIES;
  236. }
  237. /** Get the next token from the input stream. */
  238. Token nextToken() {
  239. // first, skip any whitespace
  240. skip();
  241. if (ch == EOF) return Token(TOKEN_EOF, line, col);
  242. // handle the simple syntactic elements
  243. switch (ch) {
  244. case '!': return Token(TOKEN_NOT, line, col);
  245. case '&': return Token(TOKEN_AND, line, col);
  246. case '|': return Token(TOKEN_OR, line, col);
  247. case '(': return Token(TOKEN_LPARENTH, line, col);
  248. case ')': return Token(TOKEN_RPARENTH, line, col);
  249. case '[': return Token(TOKEN_LBRACKET, line, col);
  250. case ']': return Token(TOKEN_RBRACKET, line, col);
  251. case '{': return Token(TOKEN_LCURLY, line, col);
  252. case '}': return Token(TOKEN_RCURLY, line, col);
  253. }
  254. // remember where the token began
  255. unsigned int lineStart = line;
  256. unsigned int colStart = col;
  257. // handle --XYZ-- style markers
  258. if (ch == '-') {
  259. unsigned int index=0;
  260. bool canBeAbort = true;
  261. bool canBeBody = true;
  262. bool canBeEnd = true;
  263. std::string abort("-ABORT--");
  264. std::string body("-BODY--");
  265. std::string end("-END--");
  266. while (canBeAbort || canBeBody || canBeEnd) {
  267. nextChar();
  268. if (ch == EOF) {throw error("Premature end-of-file inside token", lineStart, colStart);}
  269. if (canBeAbort && ch == abort.at(index)) {
  270. if (index == abort.length()-1) {
  271. return Token(TOKEN_ABORT, lineStart, colStart);
  272. }
  273. } else {
  274. canBeAbort=false;
  275. }
  276. if (canBeBody && ch == body.at(index)) {
  277. if (index == body.length()-1) {
  278. return Token(TOKEN_BODY, lineStart, colStart);
  279. }
  280. } else {
  281. canBeBody=false;
  282. }
  283. if (canBeEnd && ch == end.at(index)) {
  284. if (index == end.length()-1) {
  285. return Token(TOKEN_END, lineStart, colStart);
  286. }
  287. } else {
  288. canBeEnd=false;
  289. }
  290. index++;
  291. if (index >= abort.length()) canBeAbort = false;
  292. if (index >= body.length()) canBeBody = false;
  293. if (index >= end.length()) canBeEnd = false;
  294. }
  295. throw error("Lexical error: For token starting with '-', expected either '--BODY--', '--END--' or '--ABORT--'", lineStart, colStart);
  296. }
  297. // handle quoted strings
  298. if (ch == '"') {
  299. std::string text(1, (char)ch);
  300. bool last_was_quote = false;
  301. while (true) {
  302. nextChar();
  303. if (ch == EOF) {throw error("Premature end-of-file in quoted string", lineStart, colStart);}
  304. text+=(char)ch;
  305. if (ch == '"' && !last_was_quote) break;
  306. if (ch == '\\' && !last_was_quote) {
  307. last_was_quote = true;
  308. } else {
  309. last_was_quote = false;
  310. }
  311. }
  312. return Token(TOKEN_STRING, text, lineStart, colStart);
  313. }
  314. // handle integers
  315. if (ch >= '0' && ch <= '9') {
  316. std::string text(1, (char)ch);
  317. while (true) {
  318. int next = peekChar();
  319. if (next >= '0' && next <= '9') {
  320. nextChar();
  321. text+=(char)ch;
  322. } else {
  323. break;
  324. }
  325. }
  326. if (text.at(0)=='0' && text.length() > 1) {
  327. throw error("Syntax error parsing integer, starts with 0: "+text, lineStart, colStart);
  328. }
  329. try {
  330. unsigned int vInteger = std::stoi(text);
  331. return Token(vInteger, lineStart, colStart);
  332. } catch (std::invalid_argument& e) {
  333. throw error("Syntax error: "+text+" is not an integer", lineStart, colStart);
  334. } catch (std::out_of_range& e) {
  335. throw error("Syntax error: integer "+text+" is too big to represent as an unsigned int", lineStart, colStart);
  336. }
  337. } else if (ch == '@' || ch == '_' || (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z')) {
  338. // handle identifiers, @alias-names, headers, t and f
  339. std::string text(1, (char)ch);
  340. bool alias = (ch == '@');
  341. while (true) {
  342. int next = peekChar();
  343. if (next == EOF) break;
  344. if (next == ':') {
  345. if (alias) break;
  346. // consume ':'
  347. nextChar();
  348. text+=':';
  349. break;
  350. }
  351. if (next == '_' ||
  352. next == '-' ||
  353. (next >= 'a' && next <= 'z') ||
  354. (next >= 'A' && next <= 'Z') ||
  355. (next >= '0' && next <= '9')) {
  356. nextChar();
  357. text+=(char)ch;
  358. continue;
  359. } else {
  360. break;
  361. }
  362. }
  363. if (alias) {
  364. return Token(TOKEN_ALIAS_NAME, text, lineStart, colStart);
  365. }
  366. if (text.back() == ':') {
  367. auto it = knownHeaders.find(text);
  368. if (it != knownHeaders.end()) {
  369. return Token((*it).second, text, lineStart, colStart);
  370. }
  371. return Token(TOKEN_HEADER_NAME, text, lineStart, colStart);
  372. }
  373. if (text == "t") {
  374. return Token(TOKEN_TRUE, text, lineStart, colStart);
  375. } else if (text == "f") {
  376. return Token(TOKEN_FALSE, text, lineStart, colStart);
  377. }
  378. return Token(TOKEN_IDENT, text, lineStart, colStart);
  379. }
  380. throw error("Syntax error, illegal character '"+std::string(1, (char)ch)+"'", lineStart, colStart);
  381. }
  382. private:
  383. /** Skip whitespace. */
  384. void skip() {
  385. while (true) {
  386. nextChar();
  387. if (ch == EOF) { // EOF
  388. return;
  389. }
  390. if (ch == '/') {
  391. skipComment();
  392. continue;
  393. }
  394. if (ch == ' ' || ch == '\t') {
  395. continue;
  396. }
  397. if (ch == '\n' || ch == '\r') {
  398. line++;
  399. col=0;
  400. continue;
  401. }
  402. break;
  403. }
  404. }
  405. /** Skip a comment */
  406. void skipComment() {
  407. nextChar();
  408. if (ch != '*') {
  409. throw error("Malformed start of comment", line, col);
  410. }
  411. bool last_was_slash = false;
  412. bool last_was_star = false;
  413. unsigned int nesting = 0;
  414. while (true) {
  415. nextChar();
  416. if (ch == EOF) {throw error("End-of-file inside comment", line, col);}
  417. if (ch == '\n' || ch == '\r') {
  418. line++;
  419. col=0;
  420. last_was_slash = false;
  421. last_was_star = false;
  422. continue;
  423. }
  424. if (ch == '/') {
  425. if (last_was_star) {
  426. if (nesting == 0) {
  427. return;
  428. } else {
  429. nesting--;
  430. }
  431. } else {
  432. last_was_slash = true;
  433. }
  434. continue;
  435. }
  436. if (ch == '*') {
  437. if (last_was_slash) {
  438. nesting++;
  439. } else {
  440. last_was_star = true;
  441. continue;
  442. }
  443. }
  444. last_was_slash = false;
  445. last_was_star = false;
  446. }
  447. }
  448. /** Read the next char in the input stream, store in `ch` */
  449. void nextChar() {
  450. ch = in.get();
  451. if (ch != EOF) {
  452. col++;
  453. }
  454. }
  455. /** Peek at the next char in the input stream without consuming */
  456. int peekChar() {
  457. return in.peek();
  458. }
  459. /**
  460. * Construct a HOAParserExeption for a lexer error.
  461. * @param msg the error message
  462. * @param errLine the line number where the error occured
  463. * @param errCol column number where the error occured
  464. */
  465. HOAParserException error(const std::string& msg, unsigned int errLine, unsigned int errCol) {
  466. return HOAParserException(msg+" (at line "+std::to_string(errLine)+", col "+std::to_string(errCol)+")", errLine, errCol);
  467. }
  468. private:
  469. /** The input stream */
  470. std::istream& in;
  471. /** The current line number */
  472. unsigned int line;
  473. /** The current column number */
  474. unsigned int col;
  475. /** The current character (or EOF) */
  476. int ch;
  477. /** A map for mapping the known header names to the corresponding token types */
  478. std::map<std::string, TokenType> knownHeaders;
  479. };
  480. }
  481. #endif