jaulib v1.3.0
Jau Support Library (C++, Java, ..)
dfa_utf8_decode.cpp
Go to the documentation of this file.
1/*
2 * Author: Sven Gothel <sgothel@jausoft.com>
3 * Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de> (see details below)
4 * Copyright (c) 2020 Gothel Software e.K.
5 * Copyright (c) 2020 ZAFENA AB
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining
8 * a copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sublicense, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice shall be
16 * included in all copies or substantial portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
22 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 */
26
28
29using namespace jau;
30
31std::string jau::dfa_utf8_decode(const uint8_t *buffer, const size_t buffer_size) {
32 uint32_t codepoint;
33 uint32_t state = DFA_UTF8_ACCEPT;
34 size_t byte_count;
35 const uint8_t *ibuffer = buffer;
36
37 for( byte_count = 0; byte_count < buffer_size && *ibuffer; byte_count++ ) {
38 if ( DFA_UTF8_REJECT == dfa_utf8_decode(state, codepoint, *ibuffer++) ) {
39 break; // not a valid byte for a utf8 stream, end here!
40 } // else DFA_UTF8_ACCEPT -> valid_utf8_chars++
41 }
42 if( 0 < byte_count ) {
43 return std::string( (const char*)buffer, byte_count );
44 }
45 return std::string();
46}
47
48/************************************************************************/
49/************************************************************************/
50/************************************************************************/
51
52/**
53 * Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de>
54 * See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
55 *
56 * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
57 * to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute,
58 * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so,
59 * subject to the following conditions:
60 * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
61 *
62 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
63 * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
64 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
65 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
66 * OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
67 */
68
/**
 * Lookup data for the UTF-8 decoding automaton.
 *
 * Entries [0, 256) map an input byte to its character class (0..11).
 * Entries from 256 onwards form the transition table: it is indexed with
 * `256 + state + class`, where states are stored pre-multiplied as
 * multiples of 12, and yields the next state.
 */
static const uint8_t dfa_utf8d[] = {
    // The first part of the table maps bytes to character classes, which
    // reduces the size of the transition table and creates bitmasks.
     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
     7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
     8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,

    // The second part is a transition table that maps a combination
    // of a state of the automaton and a character class to a state.
     0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
    12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
    12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
    12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
    12,36,12,12,12,12,12,12,12,12,12,12,
};

// The table is indexed without bounds checks, so catch an accidentally
// dropped or duplicated entry at compile time: 256 byte classes plus
// 9 states times 12 character classes.
static_assert( sizeof(dfa_utf8d) == 256 + 9 * 12, "dfa_utf8d must hold 256 class entries and a 9x12 transition table" );
89
90uint32_t jau::dfa_utf8_decode(uint32_t & state, uint32_t & codep, const uint32_t byte_value) {
91 const uint32_t type = dfa_utf8d[byte_value];
92
93 codep = (state != DFA_UTF8_ACCEPT) ?
94 (byte_value & 0x3fu) | (codep << 6) :
95 (0xff >> type) & (byte_value);
96
97 state = dfa_utf8d[256 + state + type];
98 return state;
99}
100
101
static const uint8_t dfa_utf8d[]
Copyright (c) 2008-2010 Bjoern Hoehrmann bjoern@hoehrmann.de See http://bjoern.hoehrmann....
#define DFA_UTF8_ACCEPT
#define DFA_UTF8_REJECT
uint32_t dfa_utf8_decode(uint32_t &state, uint32_t &codep, const uint32_t byte_value)
__pack(...): Produces MSVC, clang and gcc compatible lead-in and -out macros.
Definition: backtrace.hpp:32