From 5160a52f2daba2a3b290c88b843b7ce1c64f134f Mon Sep 17 00:00:00 2001 From: Amaan Qureshi Date: Sun, 18 Jun 2023 14:09:06 -0400 Subject: [PATCH] feat: rewrite the scanner in C --- dialects/terraform/src/scanner.c | 436 ++++++++++++++++++++++++++++++ dialects/terraform/src/scanner.cc | 336 ----------------------- src/scanner.c | 436 ++++++++++++++++++++++++++++++ src/scanner.cc | 336 ----------------------- 4 files changed, 872 insertions(+), 672 deletions(-) create mode 100644 dialects/terraform/src/scanner.c delete mode 100644 dialects/terraform/src/scanner.cc create mode 100644 src/scanner.c delete mode 100644 src/scanner.cc diff --git a/dialects/terraform/src/scanner.c b/dialects/terraform/src/scanner.c new file mode 100644 index 0000000..c9f0938 --- /dev/null +++ b/dialects/terraform/src/scanner.c @@ -0,0 +1,436 @@ +#include +#include +#include +#include +#include +#include + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) + +#define VEC_RESIZE(vec, _cap) \ + void *tmp = realloc((vec).data, (_cap) * sizeof((vec).data[0])); \ + assert(tmp != NULL); \ + (vec).data = tmp; \ + (vec).cap = (_cap); + +#define VEC_PUSH(vec, el) \ + if ((vec).cap == (vec).len) { \ + VEC_RESIZE((vec), MAX(16, (vec).len * 2)); \ + } \ + (vec).data[(vec).len++] = (el); + +#define VEC_POP(vec) \ + { \ + STRING_FREE(VEC_BACK((vec)).heredoc_identifier); \ + (vec).len--; \ + } + +#define VEC_BACK(vec) ((vec).data[(vec).len - 1]) + +#define VEC_FREE(vec) \ + { \ + if ((vec).data != NULL) \ + free((vec).data); \ + } + +#define VEC_CLEAR(vec) \ + { \ + for (int i = 0; i < (vec).len; i++) { \ + STRING_FREE((vec).data[i].heredoc_identifier); \ + } \ + (vec).len = 0; \ + } + +#define STRING_RESIZE(vec, _cap) \ + void *tmp = realloc((vec).data, (_cap + 1) * sizeof((vec).data[0])); \ + assert(tmp != NULL); \ + (vec).data = tmp; \ + memset((vec).data + (vec).len, 0, \ + ((_cap + 1) - (vec).len) * sizeof((vec).data[0])); \ + (vec).cap = (_cap); + +#define STRING_GROW(vec, _cap) \ + if ((vec).cap < (_cap)) { \ + STRING_RESIZE((vec), (_cap)); \ + } + +#define STRING_PUSH(vec, el) \ + if ((vec).cap == (vec).len) { \ + STRING_RESIZE((vec), MAX(16, (vec).len * 2)); \ + } \ + (vec).data[(vec).len++] = (el); + +#define STRING_FREE(vec) \ + { \ + if ((vec).data != NULL) \ + free((vec).data); \ + } + +enum TokenType { + QUOTED_TEMPLATE_START, + QUOTED_TEMPLATE_END, + TEMPLATE_LITERAL_CHUNK, + TEMPLATE_INTERPOLATION_START, + TEMPLATE_INTERPOLATION_END, + TEMPLATE_DIRECTIVE_START, + TEMPLATE_DIRECTIVE_END, + HEREDOC_IDENTIFIER, +}; + +enum ContextType { + TEMPLATE_INTERPOLATION, + TEMPLATE_DIRECTIVE, + QUOTED_TEMPLATE, + HEREDOC_TEMPLATE, +}; + +typedef struct { + uint32_t cap; + uint32_t len; + char *data; +} String; + +String string_new() { + return (String){ + .cap = 16, + .len = 0, + .data = calloc(1, sizeof(char) * 17), + }; +} + +typedef struct { + enum ContextType type; + + // valid if type == HEREDOC_TEMPLATE + String heredoc_identifier; +} Context; + +Context context_new(enum ContextType type, const char *data) { + Context ctx = { + .type = type, + .heredoc_identifier = string_new(), + }; + ctx.heredoc_identifier.len = strlen(data); + ctx.heredoc_identifier.cap = strlen(data); + memcpy(ctx.heredoc_identifier.data, data, ctx.heredoc_identifier.len); + return ctx; +} + +typedef struct { + uint32_t len; + uint32_t cap; + Context *data; +} context_vec; + +typedef struct { + context_vec context_stack; +} Scanner; + +static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); } + +static inline void skip(TSLexer *lexer) { lexer->advance(lexer, true); } + +static unsigned serialize(Scanner *scanner, char *buf) { + unsigned size = 0; + + if (scanner->context_stack.len > CHAR_MAX) { + return 0; + } + + buf[size++] = (char)scanner->context_stack.len; + for (int i = 0; i < scanner->context_stack.len; i++) { + Context *context = &scanner->context_stack.data[i]; + if (size + 2 + context->heredoc_identifier.len >= + TREE_SITTER_SERIALIZATION_BUFFER_SIZE) { + return 0; + } + if (context->heredoc_identifier.len > CHAR_MAX) { + return 0; + } + buf[size++] = context->type; + buf[size++] = (char)context->heredoc_identifier.len; + memcpy(&buf[size], context->heredoc_identifier.data, + context->heredoc_identifier.len); + size += context->heredoc_identifier.len; + } + return size; +} + +static void deserialize(Scanner *scanner, const char *buffer, unsigned length) { + if (length == 0) { + return; + } + + VEC_CLEAR(scanner->context_stack); + unsigned size = 0; + uint8_t context_stack_size = buffer[size++]; + for (uint32_t j = 0; j < context_stack_size; j++) { + Context ctx = { + .type = (enum ContextType)buffer[size++], + .heredoc_identifier = string_new(), + }; + uint8_t heredoc_identifier_size = buffer[size++]; + STRING_GROW(ctx.heredoc_identifier, heredoc_identifier_size); + memcpy(ctx.heredoc_identifier.data, buffer + size, + heredoc_identifier_size); + ctx.heredoc_identifier.len = heredoc_identifier_size; + size += heredoc_identifier_size; + VEC_PUSH(scanner->context_stack, ctx); + } + assert(size == length); +} + +static inline bool accept_inplace(TSLexer *lexer, enum TokenType token) { + lexer->result_symbol = token; + return true; +} + +static inline bool accept_and_advance(TSLexer *lexer, enum TokenType token) { + advance(lexer); + return accept_inplace(lexer, token); +} + +static inline bool consume_wxdigit(TSLexer *lexer) { + advance(lexer); + return iswxdigit(lexer->lookahead); +} + +static inline bool skip_comment(TSLexer *lexer) { + while (iswspace(lexer->lookahead)) { + skip(lexer); + } + if (lexer->lookahead != '#') { + return false; + } + skip(lexer); + while (lexer->lookahead != '\n') { + skip(lexer); + if (lexer->eof(lexer)) { + return false; + } + } + return true; +} + +static inline bool in_context_type(Scanner *scanner, enum ContextType type) { + if (scanner->context_stack.len == 0) { + return false; + } + return VEC_BACK(scanner->context_stack).type == type; +} + +static inline bool in_quoted_context(Scanner *scanner) { + return in_context_type(scanner, QUOTED_TEMPLATE); +} + +static inline bool in_heredoc_context(Scanner *scanner) { + return in_context_type(scanner, HEREDOC_TEMPLATE); +} + +static inline bool in_template_context(Scanner *scanner) { + return in_quoted_context(scanner) || in_heredoc_context(scanner); +} + +static inline bool in_interpolation_context(Scanner *scanner) { + return in_context_type(scanner, TEMPLATE_INTERPOLATION); +} + +static inline bool in_directive_context(Scanner *scanner) { + return in_context_type(scanner, TEMPLATE_DIRECTIVE); +} + +static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols) { + bool has_leading_whitespace_with_newline = false; + while (iswspace(lexer->lookahead)) { + if (lexer->lookahead == '\n') { + has_leading_whitespace_with_newline = true; + } + skip(lexer); + } + if (lexer->lookahead == '\0') { + return false; + } + // manage quoted context + if (valid_symbols[QUOTED_TEMPLATE_START] && !in_quoted_context(scanner) && + lexer->lookahead == '"') { + Context ctx = context_new(QUOTED_TEMPLATE, ""); + VEC_PUSH(scanner->context_stack, ctx); + return accept_and_advance(lexer, QUOTED_TEMPLATE_START); + } + if (valid_symbols[QUOTED_TEMPLATE_END] && in_quoted_context(scanner) && + lexer->lookahead == '"') { + VEC_POP(scanner->context_stack); + return accept_and_advance(lexer, QUOTED_TEMPLATE_END); + } + + // manage template interpolations + if (valid_symbols[TEMPLATE_INTERPOLATION_START] && + valid_symbols[TEMPLATE_LITERAL_CHUNK] && + !in_interpolation_context(scanner) && lexer->lookahead == '$') { + advance(lexer); + if (lexer->lookahead == '{') { + Context ctx = context_new(TEMPLATE_INTERPOLATION, ""); + VEC_PUSH(scanner->context_stack, ctx); + return accept_and_advance(lexer, TEMPLATE_INTERPOLATION_START); + } + // try to scan escape sequence + if (lexer->lookahead == '$') { + advance(lexer); + if (lexer->lookahead == '{') { + // $${ + return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK); + } + } + return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK); + } + if (valid_symbols[TEMPLATE_INTERPOLATION_END] && + in_interpolation_context(scanner) && lexer->lookahead == '}') { + VEC_POP(scanner->context_stack); + return accept_and_advance(lexer, TEMPLATE_INTERPOLATION_END); + } + + // manage template directives + if (valid_symbols[TEMPLATE_DIRECTIVE_START] && + valid_symbols[TEMPLATE_LITERAL_CHUNK] && + !in_directive_context(scanner) && lexer->lookahead == '%') { + advance(lexer); + if (lexer->lookahead == '{') { + Context ctx = context_new(TEMPLATE_DIRECTIVE, ""); + VEC_PUSH(scanner->context_stack, ctx); + return accept_and_advance(lexer, TEMPLATE_DIRECTIVE_START); + } + // try to scan escape sequence + if (lexer->lookahead == '%') { + advance(lexer); + if (lexer->lookahead == '{') { + // $${ + return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK); + } + } + return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK); + } + if (valid_symbols[TEMPLATE_DIRECTIVE_END] && + in_directive_context(scanner) && lexer->lookahead == '}') { + VEC_POP(scanner->context_stack); + return accept_and_advance(lexer, TEMPLATE_DIRECTIVE_END); + } + + // manage heredoc context + if (valid_symbols[HEREDOC_IDENTIFIER] && !in_heredoc_context(scanner)) { + String identifier = string_new(); + // TODO: check that this is a valid identifier + while (iswalnum(lexer->lookahead) || lexer->lookahead == '_' || + lexer->lookahead == '-') { + STRING_PUSH(identifier, lexer->lookahead); + advance(lexer); + } + Context ctx = {HEREDOC_TEMPLATE, identifier}; + VEC_PUSH(scanner->context_stack, ctx); + return accept_inplace(lexer, HEREDOC_IDENTIFIER); + } + if (valid_symbols[HEREDOC_IDENTIFIER] && in_heredoc_context(scanner) && + has_leading_whitespace_with_newline) { + String expected_identifier = + VEC_BACK(scanner->context_stack).heredoc_identifier; + + for (size_t i = 0; i < expected_identifier.len; i++) { + if (lexer->lookahead == expected_identifier.data[i]) { + advance(lexer); + } else { + return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK); + } + } + // check if the identifier is on a line of its own + lexer->mark_end(lexer); + while (iswspace(lexer->lookahead) && lexer->lookahead != '\n') { + advance(lexer); + } + if (lexer->lookahead == '\n') { + VEC_POP(scanner->context_stack); + return accept_inplace(lexer, HEREDOC_IDENTIFIER); + } + advance(lexer); + lexer->mark_end(lexer); + return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK); + } + // manage template literal chunks + + // handle template literal chunks in quoted contexts + // + // they may not contain newlines and may contain escape sequences + if (valid_symbols[TEMPLATE_LITERAL_CHUNK] && in_quoted_context(scanner)) { + switch (lexer->lookahead) { + case '\\': + advance(lexer); + switch (lexer->lookahead) { + case '"': + case 'n': + case 'r': + case 't': + case '\\': + return accept_and_advance(lexer, + TEMPLATE_LITERAL_CHUNK); + case 'u': + for (int i = 0; i < 4; i++) { + if (!consume_wxdigit(lexer)) { + return false; + } + } + return accept_and_advance(lexer, + TEMPLATE_LITERAL_CHUNK); + case 'U': + for (int i = 0; i < 8; i++) { + if (!consume_wxdigit(lexer)) { + return false; + } + } + return accept_and_advance(lexer, + TEMPLATE_LITERAL_CHUNK); + default: + return false; + } + } + } + + // handle all other quoted template or string literal characters + if (valid_symbols[TEMPLATE_LITERAL_CHUNK] && in_template_context(scanner)) { + return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK); + } + + // probably not handled by the external scanner + return false; +} + +void *tree_sitter_terraform_external_scanner_create() { + Scanner *scanner = calloc(1, sizeof(Scanner)); + scanner->context_stack.data = calloc(1, sizeof(Context)); + return scanner; +} + +unsigned tree_sitter_terraform_external_scanner_serialize(void *payload, + char *buffer) { + Scanner *scanner = (Scanner *)payload; + return serialize(scanner, buffer); +} + +void tree_sitter_terraform_external_scanner_deserialize(void *payload, + const char *buffer, + unsigned length) { + Scanner *scanner = (Scanner *)payload; + deserialize(scanner, buffer, length); +} + +bool tree_sitter_terraform_external_scanner_scan(void *payload, TSLexer *lexer, + const bool *valid_symbols) { + Scanner *scanner = (Scanner *)payload; + return scan(scanner, lexer, valid_symbols); +} + +void tree_sitter_terraform_external_scanner_destroy(void *payload) { + Scanner *scanner = (Scanner *)payload; + for (int i = 0; i < scanner->context_stack.len; i++) { + STRING_FREE(scanner->context_stack.data[i].heredoc_identifier); + } + VEC_FREE(scanner->context_stack); + free(scanner); +} diff --git a/dialects/terraform/src/scanner.cc b/dialects/terraform/src/scanner.cc deleted file mode 100644 index 18c3892..0000000 --- a/dialects/terraform/src/scanner.cc +++ /dev/null @@ -1,336 +0,0 @@ -#include - -#include -#include -#include -#include -#include - -namespace { - -using std::string; -using std::vector; - -enum TokenType { - QUOTED_TEMPLATE_START, - QUOTED_TEMPLATE_END, - TEMPLATE_LITERAL_CHUNK, - TEMPLATE_INTERPOLATION_START, - TEMPLATE_INTERPOLATION_END, - TEMPLATE_DIRECTIVE_START, - TEMPLATE_DIRECTIVE_END, - HEREDOC_IDENTIFIER, -}; - -enum ContextType { - TEMPLATE_INTERPOLATION, - TEMPLATE_DIRECTIVE, - QUOTED_TEMPLATE, - HEREDOC_TEMPLATE, -}; - -struct Context { - ContextType type; - - // valid if type == HEREDOC_TEMPLATE - string heredoc_identifier; -}; - -struct Scanner { - -public: - unsigned serialize(char *buf) { - unsigned size = 0; - - if (context_stack.size() > CHAR_MAX) { - return 0; - } - - buf[size++] = context_stack.size(); - for (vector::iterator it = context_stack.begin(); - it != context_stack.end(); ++it) { - if (size + 2 + it->heredoc_identifier.size() >= - TREE_SITTER_SERIALIZATION_BUFFER_SIZE) { - return 0; - } - if (it->heredoc_identifier.size() > CHAR_MAX) { - return 0; - } - buf[size++] = it->type; - buf[size++] = it->heredoc_identifier.size(); - it->heredoc_identifier.copy(&buf[size], it->heredoc_identifier.size()); - size += it->heredoc_identifier.size(); - } - return size; - } - - void deserialize(const char *buf, unsigned n) { - context_stack.clear(); - - if (n == 0) { - return; - } - - unsigned size = 0; - uint8_t context_stack_size = buf[size++]; - for (unsigned j = 0; j < context_stack_size; j++) { - Context ctx; - ctx.type = static_cast(buf[size++]); - uint8_t heredoc_identifier_size = buf[size++]; - ctx.heredoc_identifier.assign(buf + size, - buf + size + heredoc_identifier_size); - size += heredoc_identifier_size; - context_stack.push_back(ctx); - } - assert(size == n); - } - - bool scan(TSLexer *lexer, const bool *valid_symbols) { - bool has_leading_whitespace_with_newline = false; - while (iswspace(lexer->lookahead)) { - if (lexer->lookahead == '\n') { - has_leading_whitespace_with_newline = true; - } - skip(lexer); - } - if (lexer->lookahead == '\0') { - return false; - } - // manage quoted context - if (valid_symbols[QUOTED_TEMPLATE_START] && !in_quoted_context() && - lexer->lookahead == '"') { - Context ctx = {QUOTED_TEMPLATE, ""}; - context_stack.push_back(ctx); - return accept_and_advance(lexer, QUOTED_TEMPLATE_START); - } - if (valid_symbols[QUOTED_TEMPLATE_END] && in_quoted_context() && - lexer->lookahead == '"') { - context_stack.pop_back(); - return accept_and_advance(lexer, QUOTED_TEMPLATE_END); - } - - // manage template interpolations - if (valid_symbols[TEMPLATE_INTERPOLATION_START] && - valid_symbols[TEMPLATE_LITERAL_CHUNK] && !in_interpolation_context() && - lexer->lookahead == '$') { - advance(lexer); - if (lexer->lookahead == '{') { - Context ctx = {TEMPLATE_INTERPOLATION, ""}; - context_stack.push_back(ctx); - return accept_and_advance(lexer, TEMPLATE_INTERPOLATION_START); - } - // try to scan escape sequence - if (lexer->lookahead == '$') { - advance(lexer); - if (lexer->lookahead == '{') { - // $${ - return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK); - } - } - return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK); - } - if (valid_symbols[TEMPLATE_INTERPOLATION_END] && - in_interpolation_context() && lexer->lookahead == '}') { - context_stack.pop_back(); - return accept_and_advance(lexer, TEMPLATE_INTERPOLATION_END); - } - - // manage template directives - if (valid_symbols[TEMPLATE_DIRECTIVE_START] && - valid_symbols[TEMPLATE_LITERAL_CHUNK] && !in_directive_context() && - lexer->lookahead == '%') { - advance(lexer); - if (lexer->lookahead == '{') { - Context ctx = {TEMPLATE_DIRECTIVE, ""}; - context_stack.push_back(ctx); - return accept_and_advance(lexer, TEMPLATE_DIRECTIVE_START); - } - // try to scan escape sequence - if (lexer->lookahead == '%') { - advance(lexer); - if (lexer->lookahead == '{') { - // $${ - return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK); - } - } - return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK); - } - if (valid_symbols[TEMPLATE_DIRECTIVE_END] && in_directive_context() && - lexer->lookahead == '}') { - context_stack.pop_back(); - return accept_and_advance(lexer, TEMPLATE_DIRECTIVE_END); - } - - // manage heredoc context - if (valid_symbols[HEREDOC_IDENTIFIER] && !in_heredoc_context()) { - string identifier; - // TODO: check that this is a valid identifier - while (iswalnum(lexer->lookahead) || lexer->lookahead == '_' || - lexer->lookahead == '-') { - identifier.push_back(lexer->lookahead); - advance(lexer); - } - Context ctx = {HEREDOC_TEMPLATE, identifier}; - context_stack.push_back(ctx); - return accept_inplace(lexer, HEREDOC_IDENTIFIER); - } - if (valid_symbols[HEREDOC_IDENTIFIER] && in_heredoc_context() && - has_leading_whitespace_with_newline) { - string expected_identifier = context_stack.back().heredoc_identifier; - - for (string::iterator it = expected_identifier.begin(); - it != expected_identifier.end(); ++it) { - if (lexer->lookahead == *it) { - advance(lexer); - } else { - return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK); - } - } - // check if the identifier is on a line of its own - lexer->mark_end(lexer); - while (iswspace(lexer->lookahead) && lexer->lookahead != '\n') { - advance(lexer); - } - if (lexer->lookahead == '\n') { - context_stack.pop_back(); - return accept_inplace(lexer, HEREDOC_IDENTIFIER); - } else { - advance(lexer); - lexer->mark_end(lexer); - return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK); - } - } - // manage template literal chunks - - // handle template literal chunks in quoted contexts - // - // they may not contain newlines and may contain escape sequences - if (valid_symbols[TEMPLATE_LITERAL_CHUNK] && in_quoted_context()) { - switch (lexer->lookahead) { - case '\\': - advance(lexer); - switch (lexer->lookahead) { - case '"': - case 'n': - case 'r': - case 't': - case '\\': - return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK); - case 'u': - for (int i = 0; i < 4; i++) { - if (!consume_wxdigit(lexer)) - return false; - } - return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK); - case 'U': - for (int i = 0; i < 8; i++) { - if (!consume_wxdigit(lexer)) - return false; - } - return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK); - default: - return false; - } - } - } - - // handle all other quoted template or string literal characters - if (valid_symbols[TEMPLATE_LITERAL_CHUNK] && in_template_context()) { - return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK); - } - - // probably not handled by the external scanner - return false; - } - -private: - vector context_stack; - - void advance(TSLexer *lexer) { lexer->advance(lexer, false); } - - void skip(TSLexer *lexer) { lexer->advance(lexer, true); } - - bool accept_inplace(TSLexer *lexer, TokenType token) { - lexer->result_symbol = token; - return true; - } - - bool accept_and_advance(TSLexer *lexer, TokenType token) { - advance(lexer); - return accept_inplace(lexer, token); - } - - bool consume_wxdigit(TSLexer *lexer) { - advance(lexer); - return iswxdigit(lexer->lookahead); - } - - bool skip_comment(TSLexer* lexer) { - while (iswspace(lexer->lookahead)) { - skip(lexer); - } - if (lexer->lookahead != '#') { - return false; - } - skip(lexer); - while (lexer->lookahead != '\n') { - skip(lexer); - if (lexer->eof(lexer)) { - return false; - } - } - return true; - } - - bool in_context_type(ContextType type) { - if (context_stack.empty()) { - return false; - } - return context_stack.back().type == type; - } - - bool in_quoted_context() { return in_context_type(QUOTED_TEMPLATE); } - - bool in_heredoc_context() { return in_context_type(HEREDOC_TEMPLATE); } - - bool in_template_context() { - return in_quoted_context() || in_heredoc_context(); - } - - bool in_interpolation_context() { - return in_context_type(TEMPLATE_INTERPOLATION); - } - - bool in_directive_context() { return in_context_type(TEMPLATE_DIRECTIVE); } -}; - -} // namespace - -extern "C" { - -// tree sitter callbacks -void *tree_sitter_terraform_external_scanner_create() { return new Scanner(); } - -void tree_sitter_terraform_external_scanner_destroy(void *p) { - Scanner *scanner = static_cast(p); - delete scanner; -} - -unsigned tree_sitter_terraform_external_scanner_serialize(void *p, char *b) { - Scanner *scanner = static_cast(p); - return scanner->serialize(b); -} - -void tree_sitter_terraform_external_scanner_deserialize(void *p, const char *b, - unsigned n) { - Scanner *scanner = static_cast(p); - return scanner->deserialize(b, n); -} - -bool tree_sitter_terraform_external_scanner_scan(void *p, TSLexer *lexer, - const bool *valid_symbols) { - Scanner *scanner = static_cast(p); - return scanner->scan(lexer, valid_symbols); -} - -} // extern "C" diff --git a/src/scanner.c b/src/scanner.c new file mode 100644 index 0000000..06821d3 --- /dev/null +++ b/src/scanner.c @@ -0,0 +1,436 @@ +#include +#include +#include +#include +#include +#include + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) + +#define VEC_RESIZE(vec, _cap) \ + void *tmp = realloc((vec).data, (_cap) * sizeof((vec).data[0])); \ + assert(tmp != NULL); \ + (vec).data = tmp; \ + (vec).cap = (_cap); + +#define VEC_PUSH(vec, el) \ + if ((vec).cap == (vec).len) { \ + VEC_RESIZE((vec), MAX(16, (vec).len * 2)); \ + } \ + (vec).data[(vec).len++] = (el); + +#define VEC_POP(vec) \ + { \ + STRING_FREE(VEC_BACK((vec)).heredoc_identifier); \ + (vec).len--; \ + } + +#define VEC_BACK(vec) ((vec).data[(vec).len - 1]) + +#define VEC_FREE(vec) \ + { \ + if ((vec).data != NULL) \ + free((vec).data); \ + } + +#define VEC_CLEAR(vec) \ + { \ + for (int i = 0; i < (vec).len; i++) { \ + STRING_FREE((vec).data[i].heredoc_identifier); \ + } \ + (vec).len = 0; \ + } + +#define STRING_RESIZE(vec, _cap) \ + void *tmp = realloc((vec).data, (_cap + 1) * sizeof((vec).data[0])); \ + assert(tmp != NULL); \ + (vec).data = tmp; \ + memset((vec).data + (vec).len, 0, \ + ((_cap + 1) - (vec).len) * sizeof((vec).data[0])); \ + (vec).cap = (_cap); + +#define STRING_GROW(vec, _cap) \ + if ((vec).cap < (_cap)) { \ + STRING_RESIZE((vec), (_cap)); \ + } + +#define STRING_PUSH(vec, el) \ + if ((vec).cap == (vec).len) { \ + STRING_RESIZE((vec), MAX(16, (vec).len * 2)); \ + } \ + (vec).data[(vec).len++] = (el); + +#define STRING_FREE(vec) \ + { \ + if ((vec).data != NULL) \ + free((vec).data); \ + } + +enum TokenType { + QUOTED_TEMPLATE_START, + QUOTED_TEMPLATE_END, + TEMPLATE_LITERAL_CHUNK, + TEMPLATE_INTERPOLATION_START, + TEMPLATE_INTERPOLATION_END, + TEMPLATE_DIRECTIVE_START, + TEMPLATE_DIRECTIVE_END, + HEREDOC_IDENTIFIER, +}; + +enum ContextType { + TEMPLATE_INTERPOLATION, + TEMPLATE_DIRECTIVE, + QUOTED_TEMPLATE, + HEREDOC_TEMPLATE, +}; + +typedef struct { + uint32_t cap; + uint32_t len; + char *data; +} String; + +String string_new() { + return (String){ + .cap = 16, + .len = 0, + .data = calloc(1, sizeof(char) * 17), + }; +} + +typedef struct { + enum ContextType type; + + // valid if type == HEREDOC_TEMPLATE + String heredoc_identifier; +} Context; + +Context context_new(enum ContextType type, const char *data) { + Context ctx = { + .type = type, + .heredoc_identifier = string_new(), + }; + ctx.heredoc_identifier.len = strlen(data); + ctx.heredoc_identifier.cap = strlen(data); + memcpy(ctx.heredoc_identifier.data, data, ctx.heredoc_identifier.len); + return ctx; +} + +typedef struct { + uint32_t len; + uint32_t cap; + Context *data; +} context_vec; + +typedef struct { + context_vec context_stack; +} Scanner; + +static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); } + +static inline void skip(TSLexer *lexer) { lexer->advance(lexer, true); } + +static unsigned serialize(Scanner *scanner, char *buf) { + unsigned size = 0; + + if (scanner->context_stack.len > CHAR_MAX) { + return 0; + } + + buf[size++] = (char)scanner->context_stack.len; + for (int i = 0; i < scanner->context_stack.len; i++) { + Context *context = &scanner->context_stack.data[i]; + if (size + 2 + context->heredoc_identifier.len >= + TREE_SITTER_SERIALIZATION_BUFFER_SIZE) { + return 0; + } + if (context->heredoc_identifier.len > CHAR_MAX) { + return 0; + } + buf[size++] = context->type; + buf[size++] = (char)context->heredoc_identifier.len; + memcpy(&buf[size], context->heredoc_identifier.data, + context->heredoc_identifier.len); + size += context->heredoc_identifier.len; + } + return size; +} + +static void deserialize(Scanner *scanner, const char *buffer, unsigned length) { + if (length == 0) { + return; + } + + VEC_CLEAR(scanner->context_stack); + unsigned size = 0; + uint8_t context_stack_size = buffer[size++]; + for (uint32_t j = 0; j < context_stack_size; j++) { + Context ctx = { + .type = (enum ContextType)buffer[size++], + .heredoc_identifier = string_new(), + }; + uint8_t heredoc_identifier_size = buffer[size++]; + STRING_GROW(ctx.heredoc_identifier, heredoc_identifier_size); + memcpy(ctx.heredoc_identifier.data, buffer + size, + heredoc_identifier_size); + ctx.heredoc_identifier.len = heredoc_identifier_size; + size += heredoc_identifier_size; + VEC_PUSH(scanner->context_stack, ctx); + } + assert(size == length); +} + +static inline bool accept_inplace(TSLexer *lexer, enum TokenType token) { + lexer->result_symbol = token; + return true; +} + +static inline bool accept_and_advance(TSLexer *lexer, enum TokenType token) { + advance(lexer); + return accept_inplace(lexer, token); +} + +static inline bool consume_wxdigit(TSLexer *lexer) { + advance(lexer); + return iswxdigit(lexer->lookahead); +} + +static inline bool skip_comment(TSLexer *lexer) { + while (iswspace(lexer->lookahead)) { + skip(lexer); + } + if (lexer->lookahead != '#') { + return false; + } + skip(lexer); + while (lexer->lookahead != '\n') { + skip(lexer); + if (lexer->eof(lexer)) { + return false; + } + } + return true; +} + +static inline bool in_context_type(Scanner *scanner, enum ContextType type) { + if (scanner->context_stack.len == 0) { + return false; + } + return VEC_BACK(scanner->context_stack).type == type; +} + +static inline bool in_quoted_context(Scanner *scanner) { + return in_context_type(scanner, QUOTED_TEMPLATE); +} + +static inline bool in_heredoc_context(Scanner *scanner) { + return in_context_type(scanner, HEREDOC_TEMPLATE); +} + +static inline bool in_template_context(Scanner *scanner) { + return in_quoted_context(scanner) || in_heredoc_context(scanner); +} + +static inline bool in_interpolation_context(Scanner *scanner) { + return in_context_type(scanner, TEMPLATE_INTERPOLATION); +} + +static inline bool in_directive_context(Scanner *scanner) { + return in_context_type(scanner, TEMPLATE_DIRECTIVE); +} + +static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols) { + bool has_leading_whitespace_with_newline = false; + while (iswspace(lexer->lookahead)) { + if (lexer->lookahead == '\n') { + has_leading_whitespace_with_newline = true; + } + skip(lexer); + } + if (lexer->lookahead == '\0') { + return false; + } + // manage quoted context + if (valid_symbols[QUOTED_TEMPLATE_START] && !in_quoted_context(scanner) && + lexer->lookahead == '"') { + Context ctx = context_new(QUOTED_TEMPLATE, ""); + VEC_PUSH(scanner->context_stack, ctx); + return accept_and_advance(lexer, QUOTED_TEMPLATE_START); + } + if (valid_symbols[QUOTED_TEMPLATE_END] && in_quoted_context(scanner) && + lexer->lookahead == '"') { + VEC_POP(scanner->context_stack); + return accept_and_advance(lexer, QUOTED_TEMPLATE_END); + } + + // manage template interpolations + if (valid_symbols[TEMPLATE_INTERPOLATION_START] && + valid_symbols[TEMPLATE_LITERAL_CHUNK] && + !in_interpolation_context(scanner) && lexer->lookahead == '$') { + advance(lexer); + if (lexer->lookahead == '{') { + Context ctx = context_new(TEMPLATE_INTERPOLATION, ""); + VEC_PUSH(scanner->context_stack, ctx); + return accept_and_advance(lexer, TEMPLATE_INTERPOLATION_START); + } + // try to scan escape sequence + if (lexer->lookahead == '$') { + advance(lexer); + if (lexer->lookahead == '{') { + // $${ + return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK); + } + } + return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK); + } + if (valid_symbols[TEMPLATE_INTERPOLATION_END] && + in_interpolation_context(scanner) && lexer->lookahead == '}') { + VEC_POP(scanner->context_stack); + return accept_and_advance(lexer, TEMPLATE_INTERPOLATION_END); + } + + // manage template directives + if (valid_symbols[TEMPLATE_DIRECTIVE_START] && + valid_symbols[TEMPLATE_LITERAL_CHUNK] && + !in_directive_context(scanner) && lexer->lookahead == '%') { + advance(lexer); + if (lexer->lookahead == '{') { + Context ctx = context_new(TEMPLATE_DIRECTIVE, ""); + VEC_PUSH(scanner->context_stack, ctx); + return accept_and_advance(lexer, TEMPLATE_DIRECTIVE_START); + } + // try to scan escape sequence + if (lexer->lookahead == '%') { + advance(lexer); + if (lexer->lookahead == '{') { + // $${ + return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK); + } + } + return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK); + } + if (valid_symbols[TEMPLATE_DIRECTIVE_END] && + in_directive_context(scanner) && lexer->lookahead == '}') { + VEC_POP(scanner->context_stack); + return accept_and_advance(lexer, TEMPLATE_DIRECTIVE_END); + } + + // manage heredoc context + if (valid_symbols[HEREDOC_IDENTIFIER] && !in_heredoc_context(scanner)) { + String identifier = string_new(); + // TODO: check that this is a valid identifier + while (iswalnum(lexer->lookahead) || lexer->lookahead == '_' || + lexer->lookahead == '-') { + STRING_PUSH(identifier, lexer->lookahead); + advance(lexer); + } + Context ctx = {HEREDOC_TEMPLATE, identifier}; + VEC_PUSH(scanner->context_stack, ctx); + return accept_inplace(lexer, HEREDOC_IDENTIFIER); + } + if (valid_symbols[HEREDOC_IDENTIFIER] && in_heredoc_context(scanner) && + has_leading_whitespace_with_newline) { + String expected_identifier = + VEC_BACK(scanner->context_stack).heredoc_identifier; + + for (size_t i = 0; i < expected_identifier.len; i++) { + if (lexer->lookahead == expected_identifier.data[i]) { + advance(lexer); + } else { + return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK); + } + } + // check if the identifier is on a line of its own + lexer->mark_end(lexer); + while (iswspace(lexer->lookahead) && lexer->lookahead != '\n') { + advance(lexer); + } + if (lexer->lookahead == '\n') { + VEC_POP(scanner->context_stack); + return accept_inplace(lexer, HEREDOC_IDENTIFIER); + } + advance(lexer); + lexer->mark_end(lexer); + return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK); + } + // manage template literal chunks + + // handle template literal chunks in quoted contexts + // + // they may not contain newlines and may contain escape sequences + if (valid_symbols[TEMPLATE_LITERAL_CHUNK] && in_quoted_context(scanner)) { + switch (lexer->lookahead) { + case '\\': + advance(lexer); + switch (lexer->lookahead) { + case '"': + case 'n': + case 'r': + case 't': + case '\\': + return accept_and_advance(lexer, + TEMPLATE_LITERAL_CHUNK); + case 'u': + for (int i = 0; i < 4; i++) { + if (!consume_wxdigit(lexer)) { + return false; + } + } + return accept_and_advance(lexer, + TEMPLATE_LITERAL_CHUNK); + case 'U': + for (int i = 0; i < 8; i++) { + if (!consume_wxdigit(lexer)) { + return false; + } + } + return accept_and_advance(lexer, + TEMPLATE_LITERAL_CHUNK); + default: + return false; + } + } + } + + // handle all other quoted template or string literal characters + if (valid_symbols[TEMPLATE_LITERAL_CHUNK] && in_template_context(scanner)) { + return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK); + } + + // probably not handled by the external scanner + return false; +} + +void *tree_sitter_hcl_external_scanner_create() { + Scanner *scanner = calloc(1, sizeof(Scanner)); + scanner->context_stack.data = calloc(1, sizeof(Context)); + return scanner; +} + +unsigned tree_sitter_hcl_external_scanner_serialize(void *payload, + char *buffer) { + Scanner *scanner = (Scanner *)payload; + return serialize(scanner, buffer); +} + +void tree_sitter_hcl_external_scanner_deserialize(void *payload, + const char *buffer, + unsigned length) { + Scanner *scanner = (Scanner *)payload; + deserialize(scanner, buffer, length); +} + +bool tree_sitter_hcl_external_scanner_scan(void *payload, TSLexer *lexer, + const bool *valid_symbols) { + Scanner *scanner = (Scanner *)payload; + return scan(scanner, lexer, valid_symbols); +} + +void tree_sitter_hcl_external_scanner_destroy(void *payload) { + Scanner *scanner = (Scanner *)payload; + for (int i = 0; i < scanner->context_stack.len; i++) { + STRING_FREE(scanner->context_stack.data[i].heredoc_identifier); + } + VEC_FREE(scanner->context_stack); + free(scanner); +} diff --git a/src/scanner.cc b/src/scanner.cc deleted file mode 100644 index 74296e6..0000000 --- a/src/scanner.cc +++ /dev/null @@ -1,336 +0,0 @@ -#include - -#include -#include -#include -#include -#include - -namespace { - -using std::string; -using std::vector; - -enum TokenType { - QUOTED_TEMPLATE_START, - QUOTED_TEMPLATE_END, - TEMPLATE_LITERAL_CHUNK, - TEMPLATE_INTERPOLATION_START, - TEMPLATE_INTERPOLATION_END, - TEMPLATE_DIRECTIVE_START, - TEMPLATE_DIRECTIVE_END, - HEREDOC_IDENTIFIER, -}; - -enum ContextType { - TEMPLATE_INTERPOLATION, - TEMPLATE_DIRECTIVE, - QUOTED_TEMPLATE, - HEREDOC_TEMPLATE, -}; - -struct Context { - ContextType type; - - // valid if type == HEREDOC_TEMPLATE - string heredoc_identifier; -}; - -struct Scanner { - -public: - unsigned serialize(char *buf) { - unsigned size = 0; - - if (context_stack.size() > CHAR_MAX) { - return 0; - } - - buf[size++] = context_stack.size(); - for (vector::iterator it = context_stack.begin(); - it != context_stack.end(); ++it) { - if (size + 2 + it->heredoc_identifier.size() >= - TREE_SITTER_SERIALIZATION_BUFFER_SIZE) { - return 0; - } - if (it->heredoc_identifier.size() > CHAR_MAX) { - return 0; - } - buf[size++] = it->type; - buf[size++] = it->heredoc_identifier.size(); - it->heredoc_identifier.copy(&buf[size], it->heredoc_identifier.size()); - size += it->heredoc_identifier.size(); - } - return size; - } - - void deserialize(const char *buf, unsigned n) { - context_stack.clear(); - - if (n == 0) { - return; - } - - unsigned size = 0; - uint8_t context_stack_size = buf[size++]; - for (unsigned j = 0; j < context_stack_size; j++) { - Context ctx; - ctx.type = static_cast(buf[size++]); - uint8_t heredoc_identifier_size = buf[size++]; - ctx.heredoc_identifier.assign(buf + size, - buf + size + heredoc_identifier_size); - size += heredoc_identifier_size; - context_stack.push_back(ctx); - } - assert(size == n); - } - - bool scan(TSLexer *lexer, const bool *valid_symbols) { - bool has_leading_whitespace_with_newline = false; - while (iswspace(lexer->lookahead)) { - if (lexer->lookahead == '\n') { - has_leading_whitespace_with_newline = true; - } - skip(lexer); - } - if (lexer->lookahead == '\0') { - return false; - } - // manage quoted context - if (valid_symbols[QUOTED_TEMPLATE_START] && !in_quoted_context() && - lexer->lookahead == '"') { - Context ctx = {QUOTED_TEMPLATE, ""}; - context_stack.push_back(ctx); - return accept_and_advance(lexer, QUOTED_TEMPLATE_START); - } - if (valid_symbols[QUOTED_TEMPLATE_END] && in_quoted_context() && - lexer->lookahead == '"') { - context_stack.pop_back(); - return accept_and_advance(lexer, QUOTED_TEMPLATE_END); - } - - // manage template interpolations - if (valid_symbols[TEMPLATE_INTERPOLATION_START] && - valid_symbols[TEMPLATE_LITERAL_CHUNK] && !in_interpolation_context() && - lexer->lookahead == '$') { - advance(lexer); - if (lexer->lookahead == '{') { - Context ctx = {TEMPLATE_INTERPOLATION, ""}; - context_stack.push_back(ctx); - return accept_and_advance(lexer, TEMPLATE_INTERPOLATION_START); - } - // try to scan escape sequence - if (lexer->lookahead == '$') { - advance(lexer); - if (lexer->lookahead == '{') { - // $${ - return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK); - } - } - return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK); - } - if (valid_symbols[TEMPLATE_INTERPOLATION_END] && - in_interpolation_context() && lexer->lookahead == '}') { - context_stack.pop_back(); - return accept_and_advance(lexer, TEMPLATE_INTERPOLATION_END); - } - - // manage template directives - if (valid_symbols[TEMPLATE_DIRECTIVE_START] && - valid_symbols[TEMPLATE_LITERAL_CHUNK] && !in_directive_context() && - lexer->lookahead == '%') { - advance(lexer); - if (lexer->lookahead == '{') { - Context ctx = {TEMPLATE_DIRECTIVE, ""}; - context_stack.push_back(ctx); - return accept_and_advance(lexer, TEMPLATE_DIRECTIVE_START); - } - // try to scan escape sequence - if (lexer->lookahead == '%') { - advance(lexer); - if (lexer->lookahead == '{') { - // $${ - return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK); - } - } - return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK); - } - if (valid_symbols[TEMPLATE_DIRECTIVE_END] && in_directive_context() && - lexer->lookahead == '}') { - context_stack.pop_back(); - return accept_and_advance(lexer, TEMPLATE_DIRECTIVE_END); - } - - // manage heredoc context - if (valid_symbols[HEREDOC_IDENTIFIER] && !in_heredoc_context()) { - string identifier; - // TODO: check that this is a valid identifier - while (iswalnum(lexer->lookahead) || lexer->lookahead == '_' || - lexer->lookahead == '-') { - identifier.push_back(lexer->lookahead); - advance(lexer); - } - Context ctx = {HEREDOC_TEMPLATE, identifier}; - context_stack.push_back(ctx); - return accept_inplace(lexer, HEREDOC_IDENTIFIER); - } - if (valid_symbols[HEREDOC_IDENTIFIER] && in_heredoc_context() && - has_leading_whitespace_with_newline) { - string expected_identifier = context_stack.back().heredoc_identifier; - - for (string::iterator it = expected_identifier.begin(); - it != expected_identifier.end(); ++it) { - if (lexer->lookahead == *it) { - advance(lexer); - } else { - return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK); - } - } - // check if the identifier is on a line of its own - lexer->mark_end(lexer); - while (iswspace(lexer->lookahead) && lexer->lookahead != '\n') { - advance(lexer); - } - if (lexer->lookahead == '\n') { - context_stack.pop_back(); - return accept_inplace(lexer, HEREDOC_IDENTIFIER); - } else { - advance(lexer); - lexer->mark_end(lexer); - return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK); - } - } - // manage template literal chunks - - // handle template literal chunks in quoted contexts - // - // they may not contain newlines and may contain escape sequences - if (valid_symbols[TEMPLATE_LITERAL_CHUNK] && in_quoted_context()) { - switch (lexer->lookahead) { - case '\\': - advance(lexer); - switch (lexer->lookahead) { - case '"': - case 'n': - case 'r': - case 't': - case '\\': - return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK); - case 'u': - for (int i = 0; i < 4; i++) { - if (!consume_wxdigit(lexer)) - return false; - } - return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK); - case 'U': - for (int i = 0; i < 8; i++) { - if (!consume_wxdigit(lexer)) - return false; - } - return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK); - default: - return false; - } - } - } - - // handle all other quoted template or string literal characters - if (valid_symbols[TEMPLATE_LITERAL_CHUNK] && in_template_context()) { - return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK); - } - - // probably not handled by the external scanner - return false; - } - -private: - vector context_stack; - - void advance(TSLexer *lexer) { lexer->advance(lexer, false); } - - void skip(TSLexer *lexer) { lexer->advance(lexer, true); } - - bool accept_inplace(TSLexer *lexer, TokenType token) { - lexer->result_symbol = token; - return true; - } - - bool accept_and_advance(TSLexer *lexer, TokenType token) { - advance(lexer); - return accept_inplace(lexer, token); - } - - bool consume_wxdigit(TSLexer *lexer) { - advance(lexer); - return iswxdigit(lexer->lookahead); - } - - bool skip_comment(TSLexer* lexer) { - while (iswspace(lexer->lookahead)) { - skip(lexer); - } - if (lexer->lookahead != '#') { - return false; - } - skip(lexer); - while (lexer->lookahead != '\n') { - skip(lexer); - if (lexer->eof(lexer)) { - return false; - } - } - return true; - } - - bool in_context_type(ContextType type) { - if (context_stack.empty()) { - return false; - } - return context_stack.back().type == type; - } - - bool in_quoted_context() { return in_context_type(QUOTED_TEMPLATE); } - - bool in_heredoc_context() { return in_context_type(HEREDOC_TEMPLATE); } - - bool in_template_context() { - return in_quoted_context() || in_heredoc_context(); - } - - bool in_interpolation_context() { - return in_context_type(TEMPLATE_INTERPOLATION); - } - - bool in_directive_context() { return in_context_type(TEMPLATE_DIRECTIVE); } -}; - -} // namespace - -extern "C" { - -// tree sitter callbacks -void *tree_sitter_hcl_external_scanner_create() { return new Scanner(); } - -void tree_sitter_hcl_external_scanner_destroy(void *p) { - Scanner *scanner = static_cast(p); - delete scanner; -} - -unsigned tree_sitter_hcl_external_scanner_serialize(void *p, char *b) { - Scanner *scanner = static_cast(p); - return scanner->serialize(b); -} - -void tree_sitter_hcl_external_scanner_deserialize(void *p, const char *b, - unsigned n) { - Scanner *scanner = static_cast(p); - return scanner->deserialize(b, n); -} - -bool tree_sitter_hcl_external_scanner_scan(void *p, TSLexer *lexer, - const bool *valid_symbols) { - Scanner *scanner = static_cast(p); - return scanner->scan(lexer, valid_symbols); -} - -} // extern "C"