diff options
Diffstat (limited to 'src/unicode.c')
-rw-r--r-- | src/unicode.c | 55 |
1 files changed, 45 insertions, 10 deletions
diff --git a/src/unicode.c b/src/unicode.c index e1faa55..2f9e3e2 100644 --- a/src/unicode.c +++ b/src/unicode.c @@ -1,9 +1,45 @@ #include "unicode.h" +#include "unicode-data.h" -/* Branchless UTF-8 decoding and validation by Christopher Wellons. +#define RUNE_IS_GEN(fn, stg1, stg2, blksz) \ + bool fn(rune ch) \ + { \ + unsigned x = ch % blksz; \ + return stg2[stg1[ch / blksz]][x / 8] & (1 << (x % 8)); \ + } + +RUNE_IS_GEN(rune_is_pat_ws, pat_ws_stage1, pat_ws_stage2, 512) +RUNE_IS_GEN(rune_is_xids, xids_stage1, xids_stage2, 128) +RUNE_IS_GEN(rune_is_xidc, xidc_stage1, xidc_stage2, 128) - You can find the original source with comments at - https://github.com/skeeto/branchless-utf8. */ +/* + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + * For more information, please refer to <http://unlicense.org/> + * + * Source: https://github.com/skeeto/branchless-utf8 + */ static const char lengths[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0}; @@ -13,7 +49,7 @@ static const int shiftc[] = {0, 18, 12, 6, 0}; static const int shifte[] = {0, 6, 4, 2, 0}; rune -utf8_decode(const char **buf) +utf8_decode(const unsigned char **buf) { const unsigned char *s = *buf; int len = lengths[s[0] >> 3]; @@ -27,11 +63,10 @@ utf8_decode(const char **buf) } size_t -utf8_validate_off(const char *buf, size_t len) +utf8_validate_off(const unsigned char *s, size_t len) { - const char *start = buf, *end = start + len; - while (buf < end) { - const unsigned char *s = buf; + const unsigned char *start = s, *end = start + len; + while (s < end) { int len = lengths[s[0] >> 3]; const unsigned char *next = s + len + !len; @@ -51,8 +86,8 @@ utf8_validate_off(const char *buf, size_t len) e ^= 0x2A; e >>= shifte[len]; if (e != 0) - return buf - start + 1; - buf = next; + return s - start + 1; + s = next; } return 0; |