36#ifndef JAU_TOKEN_FSM_HPP_
37#define JAU_TOKEN_FSM_HPP_
48#ifdef JAU_DO_JAU_TRACE_PRINT
49 #define JAU_TRACE_PRINT(...) fprintf(stderr, __VA_ARGS__);
51 #define JAU_TRACE_PRINT(...)
95 : name_( std::move(_name) ), base_(_base), cpf(_cpf) {}
98 constexpr const std::string&
name() const noexcept {
return name_; }
107 std::string res(
"alphabet[");
116 return lhs.base() != rhs.base() || lhs.name() != rhs.name();
120 return !( lhs != rhs );
135 static code_point_t s_code_point(
const char c)
noexcept {
136 if(
' ' <= c && c <=
'~' ) {
145 :
alphabet("ascii95", 95, s_code_point) {}
160 static code_point_t s_code_point(
const char c)
noexcept {
161 if(
' ' <= c && c <
'a' ) {
163 }
else if(
'a' <= c && c <=
'z' ) {
164 return c -
'a' +
'A' -
' ';
165 }
else if(
'{' <= c && c <=
'~' ) {
166 return c -
'{' +
'a' -
' ';
174 :
alphabet("ascii69", 69, s_code_point) {}
189 static code_point_t s_code_point(
const char c)
noexcept {
190 if(
'A' <= c && c <
'Z' ) {
192 }
else if(
'a' <= c && c <=
'z' ) {
201 :
alphabet("ascii26", 26, s_code_point) {}
212 template<
typename State_type,
213 std::enable_if_t<std::is_integral_v<State_type> &&
214 std::is_unsigned_v<State_type> &&
221 typedef State_type uint_t;
231 struct token_value_t {
236 std::string_view value;
239 return "[ts "+
std::to_string(name)+
", value "+std::string(value)+
"]";
261 token_fsm (
const token_fsm& src )
noexcept =
default;
262 token_fsm ( token_fsm&& src )
noexcept =
default;
263 token_fsm& operator=(
const token_fsm& x)
noexcept =
default;
264 token_fsm& operator=(token_fsm&& x)
noexcept =
default;
266 uint_t state_count() const noexcept {
return m_next_state-1; }
267 uint_t next_state() const noexcept {
return m_next_state; }
269 bool empty() const noexcept {
return 0 == state_count(); }
272 bool contains(uint_t token_name)
const noexcept {
273 return m_token_names.cend() !=
std::find(m_token_names.cbegin(), m_token_names.cend(), token_name);
277 size_t count() const noexcept {
return m_token_names.size(); }
280 bool is_separator(
const char c)
const noexcept {
281 return m_separators.cend() !=
std::find(m_separators.cbegin(), m_separators.cend(), c);
287 void grow(
const uint_t required_sz) {
288 while( m_matrix.size() < required_sz ) {
289 uint_t line[m_row_len];
290 ::bzero(line,
sizeof(line));
291 m_matrix.push_back(&line[0], &line[m_row_len]);
298 std::string m_separators;
302 std::vector<uint_t> m_token_names;
309 void clear() noexcept {
312 m_token_names.clear();
321 token_fsm (alphabet alphabet,
const std::string_view separators =
"\040\011\012\015")
322 : m_alphabet(
std::
move(alphabet) ),
323 m_row_len(m_alphabet.base()), m_end(m_row_len-1),
324 m_separators(separators),
325 m_matrix(), m_next_state(1), m_token_names()
345 token_fsm (
const alphabet& alphabet,
const std::vector<token_value_t>& key_words,
const std::string_view separators =
"\040\011\012\015")
346 : token_fsm(alphabet, separators)
350 for(
size_t word_num=0;
351 word_num < key_words.size() && m_next_state < max_state;
355 if( !add( key_words[word_num] ) ) {
376 bool add(
const token_value_t& tkey_word)
noexcept {
377 if( 0 == tkey_word.name || token_error == tkey_word.name ) {
385 const std::string_view& key_word = tkey_word.value;
386 uint_t current_state = 0;
388 uint_t c = token_error;
389 JAU_TRACE_PRINT(
"token_fsm::add: %s:\n", tkey_word.to_string().c_str());
392 uint_t next_state = m_next_state;
395 char_num < key_word.size() &&
396 next_state < max_state;
400 c = key_word[char_num];
403 if( is_separator( c ) ) {
415 const uint_t current_idx = m_row_len*current_state+c;
417 JAU_TRACE_PRINT(
"c-off %zu, state %zu, idx %zu] ", (
size_t)c, (
size_t)current_state, (
size_t)current_idx);
419 const uint_t current_token = m_matrix[current_idx];
420 if( !current_token ) {
421 m_matrix[current_idx] = next_state;
422 current_state = next_state++;
425 current_state = current_token;
430 if( char_num > 0 && c != token_error ) {
432 const uint_t current_idx = m_row_len*current_state+m_end;
435 m_matrix[current_idx] = tkey_word.name;
436 m_token_names.push_back( tkey_word.name );
437 JAU_TRACE_PRINT(
" -> terminal [c-off %zu, state %zu, idx %zu] = %zu\n", (
size_t)m_end, (
size_t)current_state, (
size_t)current_idx, (
size_t)tkey_word.name);
445 if( next_state >= max_state ) {
450 m_next_state = next_state;
467 result_t
find(
const std::string_view& haystack,
int start=0) noexcept {
468 if( 0 == m_matrix.size() ) {
469 return token_fsm::result_t { .token_name = token_error, .source_begin = 0, .source_last = 0 };
475 uint_t current_state = 0;
477 while( i < haystack.size() && !current_state ) {
479 if( is_separator(haystack[i2-1]) || i2==0 ) {
481 if( i2 == haystack.size() ) {
484 }
else if( is_separator( c = haystack[i2++] ) ) {
497 const uint_t current_idx = m_row_len*current_state+c;
498 if( current_idx >= m_matrix.size() ) {
502 current_state = m_matrix[current_idx];
503 }
while( current_state && c != m_end );
507 if( c == m_end && current_state ) {
508 return token_fsm::result_t { .token_name = current_state, .source_begin = i - 1, .source_last = i2 };
510 return token_fsm::result_t { .token_name = token_error, .source_begin = 0, .source_last = 0 };
522 uint_t get(
const std::string_view& word)
noexcept {
523 if( 0 == m_matrix.size() ) {
529 uint_t current_state = 0;
532 if( i2 == word.size() ) {
545 const uint_t current_idx = m_row_len*current_state+c;
546 JAU_TRACE_PRINT(
" [c-off %zu, state %zu, idx %zu] ", (
size_t)c, (
size_t)current_state, (
size_t)current_idx);
547 if( current_idx >= m_matrix.size() ) {
552 current_state = m_matrix[current_idx];
554 }
while( current_state && c != m_end );
556 if( c == m_end && current_state ) {
558 return current_state;
565 std::string fsm_to_string(
const int token_per_row)
const noexcept {
566 const uint_t sz = m_matrix.size();
567 const uint_t rows = sz / m_row_len;
569 std::string s =
"token_fsm["+m_alphabet.to_string()+
", "+
std::to_string(count())+
" token, sz "+
575 for(uint_t y=0; y<rows && idx<sz; ++y) {
576 snprintf(buf,
sizeof(buf),
"\n%3zu: ", (
size_t)y);
578 for(uint_t x=0; x<m_row_len && idx<sz; ++x, ++idx) {
579 const uint_t t = m_matrix[m_row_len*y+x];
580 snprintf(buf,
sizeof(buf),
"%3zu, ", (
size_t)t);
582 if( x < m_row_len-1 && ( x + 1 ) % token_per_row == 0 ) {
592 const uint_t sz = m_matrix.size();
593 const uint_t rows = sz / m_row_len;
594 return "token_fsm["+m_alphabet.to_string()+
", "+
std::to_string(count())+
" token, sz "+
Implementation of a dynamic linear array storage, aka vector.
Base Alphabet Specification providing the alphabet for token_fsm.
Case insensitive ASCII base 26 alphabet with ASCII code-point sorting order.
Case insensitive ASCII base 69 alphabet with ASCII code-point sorting order.
Full ASCII base 95 alphabet with ASCII code-point sorting order.
constexpr bool contains(InputArray &array, const T &value)
Return true if value is contained in array.
constexpr InputIt find(InputIt first, InputIt last, const T &value)
Like std::find() of 'algorithm'.
uint_fast32_t nsize_t
Natural 'size_t' alternative using uint_fast32_t as its natural sized type.
constexpr T max(const T x, const T y) noexcept
Returns the maximum of two integrals (w/ branching) in O(1)
uint16_t code_point_t
Unsigned int symbol for alphabet code-point type.
ascii95_alphabet() noexcept
bool operator==(const alphabet &lhs, const alphabet &rhs) noexcept
constexpr const std::string & name() const noexcept
Human readable name for this alphabet instance.
bool operator!=(const alphabet &lhs, const alphabet &rhs) noexcept
code_point_t(* code_point_func)(const char c) noexcept
std::string to_string() const noexcept
constexpr code_point_t base() const noexcept
The fixed base used for this alphabet, i.e.
ascii26_alphabet() noexcept
constexpr code_point_t code_point(const char c) const noexcept
Returns the token of the given character or code_error if not element of this alphabet.
static constexpr const code_point_t code_error
token_error value, denoting an invalid alphabet code-point.
ascii69_alphabet() noexcept
alphabet(std::string _name, code_point_t _base, code_point_func _cpf) noexcept
std::string to_string(const alphabet &v) noexcept
#define JAU_TRACE_PRINT(...)