36#ifndef JAU_TOKEN_FSM_HPP_
37#define JAU_TOKEN_FSM_HPP_
49#ifdef JAU_DO_JAU_TRACE_PRINT
50 #define JAU_TRACE_PRINT(...) fprintf(stderr, __VA_ARGS__);
52 #define JAU_TRACE_PRINT(...)
96 : name_( std::move(_name) ), base_(_base), cpf(_cpf) {}
99 constexpr const std::string&
name() const noexcept {
return name_; }
108 std::string res(
"alphabet[");
110 res.append(
", base "+std::to_string(
base())+
"]");
117 return lhs.base() != rhs.base() || lhs.name() != rhs.name();
121 return !( lhs != rhs );
136 static code_point_t s_code_point(
const char c)
noexcept {
137 if(
' ' <= c && c <=
'~' ) {
146 :
alphabet(
"ascii95", 95, s_code_point) {}
161 static code_point_t s_code_point(
const char c)
noexcept {
162 if(
' ' <= c && c <
'a' ) {
164 }
else if(
'a' <= c && c <=
'z' ) {
165 return c -
'a' +
'A' -
' ';
166 }
else if(
'{' <= c && c <=
'~' ) {
167 return c -
'{' +
'a' -
' ';
175 :
alphabet(
"ascii69", 69, s_code_point) {}
190 static code_point_t s_code_point(
const char c)
noexcept {
191 if(
'A' <= c && c <
'Z' ) {
193 }
else if(
'a' <= c && c <=
'z' ) {
202 :
alphabet(
"ascii26", 26, s_code_point) {}
213 template<
typename State_type,
214 std::enable_if_t<std::is_integral_v<State_type> &&
215 std::is_unsigned_v<State_type> &&
229 constexpr static uint_t to_symbol(
char c)
noexcept {
return static_cast<unsigned char>(c); }
242 return "[ts "+std::to_string(
name)+
", value "+std::string(
value)+
"]";
276 return m_token_names.cend() != std::find(m_token_names.cbegin(), m_token_names.cend(), token_name);
280 size_t count() const noexcept {
return m_token_names.size(); }
284 return m_separators.cend() != std::find(m_separators.cbegin(), m_separators.cend(), c);
290 void grow(
const uint_t required_sz) {
291 m_matrix.
reserve( required_sz + 100 );
292 while( m_matrix.
size() < required_sz ) {
293 m_matrix.
resize(m_matrix.
size() + m_row_len, 0);
300 std::string m_separators;
304 darray_t m_token_names;
312 m_matrix.clear(
true);
314 m_token_names.clear(
true);
325 m_row_len(m_alphabet.base()), m_end(m_row_len-1),
326 m_separators(separators),
327 m_matrix(), m_next_state(1), m_token_names()
347 token_fsm (
const alphabet&
alphabet,
const std::vector<token_value_t>& key_words,
const std::string_view separators =
"\040\011\012\015")
350 const uint_t max_state = (
uint_t) std::numeric_limits<uint_t>::max();
352 for(
size_t word_num=0;
353 word_num < key_words.size() && m_next_state < max_state;
357 if( !
add( key_words[word_num] ) ) {
387 const std::string_view& key_word = tkey_word.
value;
393 const uint_t max_state = (
uint_t) std::numeric_limits<uint_t>::max();
397 char_num < key_word.size() &&
417 const uint_t current_idx = m_row_len*current_state+c;
419 JAU_TRACE_PRINT(
"c-off %zu, state %zu, idx %zu] ", (
size_t)c, (
size_t)current_state, (
size_t)current_idx);
421 const uint_t current_token = m_matrix[current_idx];
422 if( !current_token ) {
427 current_state = current_token;
434 const uint_t current_idx = m_row_len*current_state+m_end;
437 m_matrix[current_idx] = tkey_word.
name;
438 m_token_names.push_back( tkey_word.
name );
439 JAU_TRACE_PRINT(
" -> terminal [c-off %zu, state %zu, idx %zu] = %zu\n", (
size_t)m_end, (
size_t)current_state, (
size_t)current_idx, (
size_t)tkey_word.
name);
469 result_t find(
const std::string_view& haystack,
int start=0) noexcept {
470 if( 0 == m_matrix.size() ) {
479 while( i < haystack.size() && !current_state ) {
483 if( i2 == haystack.size() ) {
499 const uint_t current_idx = m_row_len*current_state+c;
500 if( current_idx >= m_matrix.size() ) {
504 current_state = m_matrix[current_idx];
505 }
while( current_state && c != m_end );
509 if( c == m_end && current_state ) {
510 return token_fsm::result_t { .token_name = current_state, .source_begin = i - 1, .source_last = i2 };
525 if( 0 == m_matrix.size() ) {
534 if( i2 == word.size() ) {
547 const uint_t current_idx = m_row_len*current_state+c;
548 JAU_TRACE_PRINT(
" [c-off %zu, state %zu, idx %zu] ", (
size_t)c, (
size_t)current_state, (
size_t)current_idx);
549 if( current_idx >= m_matrix.size() ) {
554 current_state = m_matrix[current_idx];
556 }
while( current_state && c != m_end );
558 if( c == m_end && current_state ) {
560 return current_state;
568 const uint_t sz = m_matrix.size();
569 const uint_t rows = sz / m_row_len;
571 std::string s =
"token_fsm["+m_alphabet.to_string()+
", "+std::to_string(
count())+
" token, sz "+
572 std::to_string(sz)+
" cells / "+std::to_string(sz*
sizeof(
uint_t))+
573 " bytes, "+std::to_string(m_row_len)+
"x"+std::to_string(rows)+
574 ", next_state "+std::to_string(m_next_state)+
":";
577 for(
uint_t y=0; y<rows && idx<sz; ++y) {
578 snprintf(buf,
sizeof(buf),
"\n%3zu: ", (
size_t)y);
580 for(
uint_t x=0; x<m_row_len && idx<sz; ++x, ++idx) {
581 const uint_t t = m_matrix[m_row_len*y+x];
582 snprintf(buf,
sizeof(buf),
"%3zu, ", (
size_t)t);
584 if( x < m_row_len-1 && ( x + 1 ) % token_per_row == 0 ) {
594 const uint_t sz = m_matrix.size();
595 const uint_t rows = sz / m_row_len;
596 return "token_fsm["+m_alphabet.to_string()+
", "+std::to_string(
count())+
" token, sz "+
597 std::to_string(sz)+
" cells / "+std::to_string(sz*
sizeof(
uint_t))
598 +
" bytes, "+std::to_string(m_row_len)+
"x"+std::to_string(rows)+
599 ", next_state "+std::to_string(m_next_state)+
"]";
Implementation of a dynamic linear array storage, aka vector, including relative positional access.
constexpr size_type size() const noexcept
Like std::vector::size().
constexpr self_t & reserve(size_type new_capacity)
Like std::vector::reserve(), increases this instance's capacity to new_capacity.
constexpr self_t & resize(size_type new_size, const value_type &val)
Like std::vector::resize(size_type, const value_type&)
Base Alphabet Specification providing the alphabet for token_fsm.
uint16_t code_point_t
Unsigned int symbol for alphabet code-point type.
constexpr const std::string & name() const noexcept
Human readable name for this alphabet instance.
code_point_t(* code_point_func)(const char c) noexcept
std::string to_string() const noexcept
constexpr code_point_t base() const noexcept
The fixed base used for this alphabet, i.e.
constexpr code_point_t code_point(const char c) const noexcept
Returns the token of the given character or code_error if not element of this alphabet.
static constexpr const code_point_t code_error
token_error value, denoting an invalid alphabet code-point.
alphabet(std::string _name, code_point_t _base, code_point_func _cpf) noexcept
ascii26_alphabet() noexcept
ascii69_alphabet() noexcept
ascii95_alphabet() noexcept
bool is_separator(const char c) const noexcept
Returns true if the given char is listed as a separator.
uint_t state_count() const noexcept
std::string to_string() const noexcept
uint_t get(const std::string_view &word) noexcept
Returns the token numerical name (terminal symbol) if found, otherwise token_error.
static constexpr const uint_t token_error
token_error value, denoting an invalid token or alphabet code-point.
State_type uint_t
Unsigned int symbol for token-value type.
size_t count() const noexcept
Returns the number of contained token.
token_fsm & operator=(const token_fsm &x) noexcept=default
uint_t next_state() const noexcept
bool empty() const noexcept
bool contains(uint_t token_name) const noexcept
Returns true if this FSM containes the given token name.
std::string fsm_to_string(const int token_per_row) const noexcept
token_fsm(const alphabet &alphabet, const std::vector< token_value_t > &key_words, const std::string_view separators="\040\011\012\015")
Constructs a new instance w/ given token_value_t name and value pairs.
bool add(const token_value_t &tkey_word)
Adds given token_value_t name and value pair.
static constexpr uint_t to_symbol(char c) noexcept
token_fsm & operator=(token_fsm &&x) noexcept=default
void clear() noexcept
Clears the FSM.
token_fsm(const token_fsm &src) noexcept=default
token_fsm(alphabet alphabet, const std::string_view separators="\040\011\012\015")
Constructs an empty instance.
result_t find(const std::string_view &haystack, int start=0) noexcept
Find a token within the given haystack, starting from given start position.
token_fsm(token_fsm &&src) noexcept=default
uint_fast32_t nsize_t
Natural 'size_t' alternative using uint_fast32_t as its natural sized type.
bool operator==(const alphabet &lhs, const alphabet &rhs) noexcept
bool operator!=(const alphabet &lhs, const alphabet &rhs) noexcept
std::string to_string(const alphabet &v) noexcept
Result type for token_fsm::find()
std::string to_string() const noexcept
size_t source_begin
Position of first char of token in source.
uint_t token_name
Token numerical name (terminal symbol) if found, otherwise token_error.
size_t source_last
Last position in source after token.
Terminal token name and ASCII string value pair, provided by user.
std::string_view value
Token ASCII string value to be tokenized.
uint_t name
Token numerical name, a terminal symbol.
std::string to_string() const noexcept
#define JAU_TRACE_PRINT(...)