jaulib v1.3.0
Jau Support Library (C++, Java, ..)
token_fsm.hpp
Go to the documentation of this file.
1/*
2 * Author: Sven Gothel <sgothel@jausoft.com>
3 * Copyright (c) 1992-2022 Gothel Software e.K.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining
6 * a copy of this software and associated documentation files (the
7 * "Software"), to deal in the Software without restriction, including
8 * without limitation the rights to use, copy, modify, merge, publish,
9 * distribute, sublicense, and/or sell copies of the Software, and to
10 * permit persons to whom the Software is furnished to do so, subject to
11 * the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be
14 * included in all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 *
24 * Original header of salvaged code from July 1992
25 *
26 * TOKEN_A.cpp --- Einfacher TOKEN-AUTOMAT fuer STRINGS 5. Juli 1993
27 * V1.0
28 *
29 * Sven Goethel - Stapenhorststr. 35a 4800 Bielefeld 1
30 *
31 *
32 * *************************************************************************
33 * * TOKEN AUTOMAT .... written 29.07.1992 by Sven Göthel *
34 * *************************************************************************
35 */
36#ifndef JAU_TOKEN_FSM_HPP_
37#define JAU_TOKEN_FSM_HPP_
38
39#include <vector>
40#include <string>
41#include <type_traits>
42
43#include <jau/int_types.hpp>
44#include <jau/darray.hpp>
45#include <jau/basic_algos.hpp>
46
47// #define JAU_DO_JAU_TRACE_PRINT 1
48#ifdef JAU_DO_JAU_TRACE_PRINT
49 #define JAU_TRACE_PRINT(...) fprintf(stderr, __VA_ARGS__);
50#else
51 #define JAU_TRACE_PRINT(...)
52#endif
53
54namespace jau::lang {
55 /** @defgroup Lang Languages
56 * Language functionality, programming and otherwise
57 *
58 * Supported
59 * - jau::lang::token_fsm A lexical analyzer (tokenizer) using a tabular finite-state-machine (FSM), aka `endlicher automat` (EA)
60 *
61 * For serious applications w/ regular expressions and more, as well as a `lex` C++ alternative to `flex`,
62 * consider using [Re-flex](https://github.com/Genivia/RE-flex).
63 *
64 * @{
65 */
66
67 /**
68 * Base Alphabet Specification providing the alphabet for token_fsm.
69 *
70 * Implementation delegates static code_point() function.
71 *
72 * @see token_fsm()
73 */
74 class alphabet {
75 public:
76 /**
77 * Unsigned int symbol for alphabet code-point type
78 */
79 typedef uint16_t code_point_t;
80
81 /**
82 * token_error value, denoting an invalid alphabet code-point.
83 */
85
86 typedef code_point_t (*code_point_func)(const char c) noexcept;
87
88 private:
89 std::string name_;
90 code_point_t base_;
92
93 public:
94 alphabet(std::string _name, code_point_t _base, code_point_func _cpf) noexcept
95 : name_( std::move(_name) ), base_(_base), cpf(_cpf) {}
96
97 /** Human readable name for this alphabet instance. */
98 constexpr const std::string& name() const noexcept { return name_; }
99
100 /** The fixed base used for this alphabet, i.e. number of token. */
101 constexpr code_point_t base() const noexcept { return base_; }
102
103 /** Returns the token of the given character or code_error if not element of this alphabet. */
104 constexpr code_point_t code_point(const char c) const noexcept { return cpf(c); }
105
106 std::string to_string() const noexcept {
107 std::string res("alphabet[");
108 res.append(name());
109 res.append(", base "+std::to_string(base())+"]");
110 return res;
111 }
112 };
113 inline std::string to_string(const alphabet& v) noexcept { return v.to_string(); }
114
115 inline bool operator!=(const alphabet& lhs, const alphabet& rhs ) noexcept {
116 return lhs.base() != rhs.base() || lhs.name() != rhs.name();
117 }
118
119 inline bool operator==(const alphabet& lhs, const alphabet& rhs ) noexcept {
120 return !( lhs != rhs );
121 }
122
123 /**
124 * Full ASCII base 95 alphabet with ASCII code-point sorting order.
125 *
126 * ### Properties
127 * - Base 95, i.e. full visible ASCII [32 .. 126]
128 * - 7-bit ASCII
129 * - Code page 437 compatible
130 * - Supporting ASCII code-point sorting.
131 * - Order: ` ` < `0` < `:` < `A` < `[` < `a` < `{` < `~`
132 */
133 class ascii95_alphabet : public alphabet {
134 private:
135 static code_point_t s_code_point(const char c) noexcept {
136 if( ' ' <= c && c <= '~' ) {
137 return c - ' ';
138 } else {
139 return code_error;
140 }
141 }
142
143 public:
145 : alphabet("ascii95", 95, s_code_point) {}
146 };
147
148 /**
149 * Case insensitive ASCII base 69 alphabet with ASCII code-point sorting order.
150 *
151 * ### Properties
152 * - Base 69, i.e. ASCII [32 .. 96] + [123 .. 126], merging lower- and capital-letters
153 * - 7-bit ASCII
154 * - Code page 437 compatible
155 * - Supporting ASCII code-point sorting.
156 * - Order: ` ` < `0` < `:` < `A` < `[` < `{` < `~`
157 */
158 class ascii69_alphabet : public alphabet {
159 private:
160 static code_point_t s_code_point(const char c) noexcept {
161 if( ' ' <= c && c < 'a' ) { // [ 0 .. 64 ]
162 return c - ' ';
163 } else if( 'a' <= c && c <= 'z' ) { // [ 33 .. 58 ]
164 return c - 'a' + 'A' - ' ';
165 } else if( '{' <= c && c <= '~' ) {
166 return c - '{' + 'a' - ' '; // [ 65 .. 68 ]
167 } else {
168 return code_error;
169 }
170 }
171
172 public:
174 : alphabet("ascii69", 69, s_code_point) {}
175 };
176
177 /**
178 * Case insensitive ASCII base 26 alphabet with ASCII code-point sorting order.
179 *
180 * ### Properties
181 * - Base 26, i.e. ASCII [65 .. 90], merging lower- and capital-letters
182 * - 7-bit ASCII
183 * - Code page 437 compatible
184 * - Supporting ASCII code-point sorting.
185 * - Order: `A` < `Z`
186 */
187 class ascii26_alphabet : public alphabet {
188 private:
189 static code_point_t s_code_point(const char c) noexcept {
190 if( 'A' <= c && c < 'Z' ) { // [ 0 .. 25 ]
191 return c - 'A';
192 } else if( 'a' <= c && c <= 'z' ) { // [ 0 .. 25 ]
193 return c - 'a';
194 } else {
195 return code_error;
196 }
197 }
198
199 public:
201 : alphabet("ascii26", 26, s_code_point) {}
202 };
203
204 /**
205 * A lexical analyzer (tokenizer) using a tabular finite-state-machine (FSM), aka `endlicher automat` (EA).
206 *
207 * Implemented initially by Sven Gothel in July 1992 using early C++ with and brought to a clean C++17 template.
208 *
209 * @tparam State_type used for token name and internal FSM, hence memory sensitive.
210 * Must be an unsigned integral type with minimum size of sizeof(alphabet::code_point_t), i.e. uint16_t.
211 */
212 template<typename State_type,
213 std::enable_if_t<std::is_integral_v<State_type> &&
214 std::is_unsigned_v<State_type> &&
215 sizeof(alphabet::code_point_t) <= sizeof(State_type), bool> = true>
216 class token_fsm {
217 public:
218 /**
219 * Unsigned int symbol for token-value type
220 */
221 typedef State_type uint_t;
222
223 /**
224 * token_error value, denoting an invalid token or alphabet code-point.
225 */
226 static inline constexpr const uint_t token_error = std::numeric_limits<uint_t>::max();
227
228 /**
229 * Terminal token name and ASCII string value pair, provided by user.
230 */
231 struct token_value_t {
232 /** Token numerical name, a terminal symbol. Value must be greater than zero and not equal to token_error. */
233 uint_t name;
234
235 /** Token ASCII string value to be tokenized. */
236 std::string_view value;
237
238 std::string to_string() const noexcept {
239 return "[ts "+std::to_string(name)+", value "+std::string(value)+"]";
240 }
241 };
242
243 /**
244 * Result type for token_fsm::find()
245 */
246 struct result_t {
247 /** Token numerical name (terminal symbol) if found, otherwise token_error */
248 uint_t token_name;
249
250 /** Position of first char of token in source */
251 size_t source_begin;
252
253 /** Last position in source after token. */
254 size_t source_last;
255
256 std::string to_string() const noexcept {
257 return "[ts "+std::to_string(token_name)+", pos["+std::to_string(source_begin)+".."+std::to_string(source_last)+")]";
258 }
259 };
260
261 token_fsm ( const token_fsm& src ) noexcept = default;
262 token_fsm ( token_fsm&& src ) noexcept = default;
263 token_fsm& operator=(const token_fsm& x) noexcept = default;
264 token_fsm& operator=(token_fsm&& x) noexcept = default;
265
266 uint_t state_count() const noexcept { return m_next_state-1; }
267 uint_t next_state() const noexcept { return m_next_state; }
268
269 bool empty() const noexcept { return 0 == state_count(); }
270
271 /** Returns true if this FSM containes the given token name */
272 bool contains(uint_t token_name) const noexcept {
273 return m_token_names.cend() != std::find(m_token_names.cbegin(), m_token_names.cend(), token_name);
274 }
275
276 /** Returns the number of contained token. */
277 size_t count() const noexcept { return m_token_names.size(); }
278
279 /** Returns true if the given char is listed as a separator. */
280 bool is_separator(const char c) const noexcept {
281 return m_separators.cend() != std::find(m_separators.cbegin(), m_separators.cend(), c);
282 }
283
284 private:
285 typedef jau::darray<uint_t, jau::nsize_t> matrix_t;
286
287 void grow(const uint_t required_sz) {
288 while( m_matrix.size() < required_sz ) {
289 uint_t line[m_row_len];
290 ::bzero(line, sizeof(line));
291 m_matrix.push_back(&line[0], &line[m_row_len]);
292 }
293 }
294
295 alphabet m_alphabet;
296 uint_t m_row_len;
297 uint_t m_end;
298 std::string m_separators;
299
300 matrix_t m_matrix;
301 uint_t m_next_state;
302 std::vector<uint_t> m_token_names;
303
304 public:
305
306 /**
307 * Clears the FSM. Afterwards, the FSM can be filled over again from scratch.
308 */
309 void clear() noexcept {
310 m_matrix.clear();
311 m_next_state = 1;
312 m_token_names.clear();
313 }
314
315 /**
316 * Constructs an empty instance.
317 * @param alphabet the used alphabet
318 * @param separators separator, defaults to SPACE, TAB, LF, CR
319 * @see add()
320 */
321 token_fsm (alphabet alphabet, const std::string_view separators = "\040\011\012\015")
322 : m_alphabet( std::move(alphabet) ),
323 m_row_len(m_alphabet.base()), m_end(m_row_len-1),
324 m_separators(separators),
325 m_matrix(), m_next_state(1), m_token_names()
326 { }
327
328 /**
329 * Constructs a new instance w/ given token_value_t name and value pairs.
330 *
331 * In case of an error, method will clear() and abort, user might validated via empty().
332 *
333 * Reasons for failures could be
334 * - invalid token name, e.g. 0
335 * - duplicate token name in input key_words
336 * - invalid token value
337 * - empty string
338 * - invalid character according to given alphabet or a separator
339 *
340 * @param alphabet the used alphabet
341 * @param key_words vector of to be added token_value_t name and values
342 * @param separators separator, defaults to SPACE, TAB, LF, CR
343 * @see add()
344 */
345 token_fsm ( const alphabet& alphabet, const std::vector<token_value_t>& key_words, const std::string_view separators = "\040\011\012\015")
346 : token_fsm(alphabet, separators)
347 {
348 const uint_t max_state = (uint_t) std::numeric_limits<uint_t>::max();
349
350 for( size_t word_num=0;
351 word_num < key_words.size() && m_next_state < max_state;
352 word_num++
353 )
354 {
355 if( !add( key_words[word_num] ) ) {
356 return;
357 }
358 }
359 }
360
361 /**
362 * Adds given token_value_t name and value pair.
363 *
364 * In case of an error, method will clear() and abort, user might validated via empty().
365 *
366 * Reasons for failures could be
367 * - invalid token name, e.g. 0 or token_error
368 * - duplicate token name in input key_words
369 * - invalid token value
370 * - empty string
371 * - invalid character according to given alphabet or a separator
372 *
373 * @param tkey_word the given token name and value pair
374 * @return true if successful, otherwise false
375 */
376 bool add(const token_value_t& tkey_word) noexcept {
377 if( 0 == tkey_word.name || token_error == tkey_word.name ) {
378 // invalid token name
379 return false;
380 }
381 if( contains( tkey_word.name ) ) {
382 // already contained -> ERROR
383 return false;
384 }
385 const std::string_view& key_word = tkey_word.value;
386 uint_t current_state = 0;
387 size_t char_num = 0;
388 uint_t c = token_error;
389 JAU_TRACE_PRINT("token_fsm::add: %s:\n", tkey_word.to_string().c_str());
390
391 const uint_t max_state = (uint_t) std::numeric_limits<uint_t>::max();
392 uint_t next_state = m_next_state;
393
394 for( ;
395 char_num < key_word.size() &&
396 next_state < max_state;
397 ++char_num
398 )
399 {
400 c = key_word[char_num];
401 JAU_TRACE_PRINT(" [%c, ", (char)c);
402
403 if( is_separator( c ) ) {
404 c = token_error;
405 break; // invalid character
406 }
407
408 const alphabet::code_point_t cp = m_alphabet.code_point(c);
409 if( alphabet::code_error == cp ) {
410 c = token_error;
411 break; // invalid character
412 } else {
413 c = cp;
414 }
415 const uint_t current_idx = m_row_len*current_state+c;
416 grow(current_idx+1);
417 JAU_TRACE_PRINT("c-off %zu, state %zu, idx %zu] ", (size_t)c, (size_t)current_state, (size_t)current_idx);
418
419 const uint_t current_token = m_matrix[current_idx];
420 if( !current_token ) {
421 m_matrix[current_idx] = next_state;
422 current_state = next_state++;
423 JAU_TRACE_PRINT("-> state %zu (new),\n", (size_t)current_state);
424 } else {
425 current_state = current_token;
426 JAU_TRACE_PRINT("-> state %zu (jmp),\n", (size_t)current_state);
427 }
428 }
429
430 if( char_num > 0 && c != token_error ) {
431 // token value exists (char_num) and is valid (c)
432 const uint_t current_idx = m_row_len*current_state+m_end;
433 grow(current_idx+1);
434
435 m_matrix[current_idx] = tkey_word.name;
436 m_token_names.push_back( tkey_word.name );
437 JAU_TRACE_PRINT(" -> terminal [c-off %zu, state %zu, idx %zu] = %zu\n", (size_t)m_end, (size_t)current_state, (size_t)current_idx, (size_t)tkey_word.name);
438 } else {
439 // abort on invalid char (c) or non-existing word.(char_nu,)
440 JAU_TRACE_PRINT(" -> error\n");
441 clear();
442 return false;
443 }
444
445 if( next_state >= max_state ) {
446 // FSM exceeded, abort
447 clear();
448 return false;
449 } else {
450 m_next_state = next_state;
451 return true;
452 }
453 }
454
455 /**
456 * Find a token within the given haystack, starting from given start position.
457 *
458 * This method reads over all characters until a token has been found or end-of-view.
459 *
460 * This method considers given separators.
461 *
462 * @param haystack string view to search for tokens
463 * @param start start position, allowing to reuse the view
464 * @return result_t denoting the found token, where result_t::token_name == token_error denotes not found.
465 * @see get()
466 */
467 result_t find(const std::string_view& haystack, int start=0) noexcept {
468 if( 0 == m_matrix.size() ) {
469 return token_fsm::result_t { .token_name = token_error, .source_begin = 0, .source_last = 0 };
470 }
471
472 /* Bis Zeilenende oder Gefundener Token durchsuchen */
473 uint_t c = 0;
474 jau::nsize_t i = start;
475 uint_t current_state = 0;
476 jau::nsize_t i2 = 0;
477 while( i < haystack.size() && !current_state ) {
478 i2=i++;
479 if( is_separator(haystack[i2-1]) || i2==0 ) {
480 do {
481 if( i2 == haystack.size() ) {
482 // position after token end
483 c = m_end;
484 } else if( is_separator( c = haystack[i2++] ) ) {
485 i2--; // position after token end
486 c = m_end;
487 } else {
488 const alphabet::code_point_t cp = m_alphabet.code_point(c);
489 if( alphabet::code_error == cp ) {
490 c = token_error;
491 current_state=0;
492 break; // invalid character
493 } else {
494 c = cp;
495 }
496 }
497 const uint_t current_idx = m_row_len*current_state+c;
498 if( current_idx >= m_matrix.size() ) {
499 /** end-of-matrix **/
500 break;
501 }
502 current_state = m_matrix[current_idx];
503 } while( current_state && c != m_end );
504 }
505 }
506
507 if( c == m_end && current_state ) {
508 return token_fsm::result_t { .token_name = current_state, .source_begin = i - 1, .source_last = i2 };
509 } else {
510 return token_fsm::result_t { .token_name = token_error, .source_begin = 0, .source_last = 0 };
511 }
512 }
513
514 /**
515 * Returns the token numerical name (terminal symbol) if found, otherwise token_error.
516 *
517 * This method does not consider given separators and expects given word to match a token 1:1.
518 *
519 * @param word the key word to lookup
520 * @see find()
521 */
522 uint_t get(const std::string_view& word) noexcept {
523 if( 0 == m_matrix.size() ) {
524 return 0;
525 }
526 JAU_TRACE_PRINT("token_fsm::get: %s:\n", std::string(word).c_str());
527
528 uint_t c = 0;
529 uint_t current_state = 0;
530 jau::nsize_t i2 = 0;
531 do {
532 if( i2 == word.size() ) {
533 c = m_end;
534 } else {
535 c = word[i2++];
536 const alphabet::code_point_t cp = m_alphabet.code_point(c);
537 if( alphabet::code_error == cp ) {
538 c = token_error;
539 current_state=0;
540 break; // invalid character
541 } else {
542 c = cp;
543 }
544 }
545 const uint_t current_idx = m_row_len*current_state+c;
546 JAU_TRACE_PRINT(" [c-off %zu, state %zu, idx %zu] ", (size_t)c, (size_t)current_state, (size_t)current_idx);
547 if( current_idx >= m_matrix.size() ) {
548 /** end-of-matrix **/
549 JAU_TRACE_PRINT("-> state %zu (eom),\n", (size_t)current_state);
550 break;
551 }
552 current_state = m_matrix[current_idx];
553 JAU_TRACE_PRINT("-> state %zu (ok),\n", (size_t)current_state);
554 } while( current_state && c != m_end );
555
556 if( c == m_end && current_state ) {
557 JAU_TRACE_PRINT(" -> final token %zu\n", (size_t)current_state);
558 return current_state;
559 } else {
560 JAU_TRACE_PRINT(" -> not found\n");
561 return token_error;
562 }
563 }
564
565 std::string fsm_to_string(const int token_per_row) const noexcept {
566 const uint_t sz = m_matrix.size();
567 const uint_t rows = sz / m_row_len;
568
569 std::string s = "token_fsm["+m_alphabet.to_string()+", "+std::to_string(count())+" token, sz "+
570 std::to_string(sz)+" cells / "+std::to_string(sz*sizeof(uint_t))+
571 " bytes, "+std::to_string(m_row_len)+"x"+std::to_string(rows)+
572 ", next_state "+std::to_string(m_next_state)+":";
573 char buf[80];
574 uint_t idx=0;
575 for(uint_t y=0; y<rows && idx<sz; ++y) {
576 snprintf(buf, sizeof(buf), "\n%3zu: ", (size_t)y);
577 s.append(buf);
578 for(uint_t x=0; x<m_row_len && idx<sz; ++x, ++idx) {
579 const uint_t t = m_matrix[m_row_len*y+x];
580 snprintf(buf, sizeof(buf), "%3zu, ", (size_t)t);
581 s.append(buf);
582 if( x < m_row_len-1 && ( x + 1 ) % token_per_row == 0 ) {
583 s.append("\n ");
584 }
585 }
586 }
587 s.append("]\n");
588 return s;
589 }
590
591 std::string to_string() const noexcept {
592 const uint_t sz = m_matrix.size();
593 const uint_t rows = sz / m_row_len;
594 return "token_fsm["+m_alphabet.to_string()+", "+std::to_string(count())+" token, sz "+
595 std::to_string(sz)+" cells / "+std::to_string(sz*sizeof(uint_t))
596 +" bytes, "+std::to_string(m_row_len)+"x"+std::to_string(rows)+
597 ", next_state "+std::to_string(m_next_state)+"]";
598 }
599 };
600
601 /**@}*/
602
603} // namespace jau::lexer
604
605
606
607#endif /* JAU_TOKEN_FSM_HPP_ */
Implementation of a dynamic linear array storage, aka vector.
Definition: darray.hpp:148
Base Alphabet Specification providing the alphabet for token_fsm.
Definition: token_fsm.hpp:74
Case insensitive ASCII base 26 alphabet with ASCII code-point sorting order.
Definition: token_fsm.hpp:187
Case insensitive ASCII base 69 alphabet with ASCII code-point sorting order.
Definition: token_fsm.hpp:158
Full ASCII base 95 alphabet with ASCII code-point sorting order.
Definition: token_fsm.hpp:133
constexpr bool contains(InputArray &array, const T &value)
Return true if value is contained in array.
constexpr InputIt find(InputIt first, InputIt last, const T &value)
Like std::find() of 'algorithm'.
uint_fast32_t nsize_t
Natural 'size_t' alternative using uint_fast32_t as its natural sized type.
Definition: int_types.hpp:53
constexpr T max(const T x, const T y) noexcept
Returns the maximum of two integrals (w/ branching) in O(1)
Definition: base_math.hpp:191
uint16_t code_point_t
Unsigned int symbol for alphabet code-point type.
Definition: token_fsm.hpp:79
bool operator==(const alphabet &lhs, const alphabet &rhs) noexcept
Definition: token_fsm.hpp:119
constexpr const std::string & name() const noexcept
Human readable name for this alphabet instance.
Definition: token_fsm.hpp:98
bool operator!=(const alphabet &lhs, const alphabet &rhs) noexcept
Definition: token_fsm.hpp:115
code_point_t(* code_point_func)(const char c) noexcept
Definition: token_fsm.hpp:86
std::string to_string() const noexcept
Definition: token_fsm.hpp:106
constexpr code_point_t base() const noexcept
The fixed base used for this alphabet, i.e.
Definition: token_fsm.hpp:101
constexpr code_point_t code_point(const char c) const noexcept
Returns the token of the given character or code_error if not element of this alphabet.
Definition: token_fsm.hpp:104
static constexpr const code_point_t code_error
token_error value, denoting an invalid alphabet code-point.
Definition: token_fsm.hpp:84
alphabet(std::string _name, code_point_t _base, code_point_func _cpf) noexcept
Definition: token_fsm.hpp:94
std::string to_string(const alphabet &v) noexcept
Definition: token_fsm.hpp:113
STL namespace.
#define JAU_TRACE_PRINT(...)
Definition: token_fsm.hpp:51