diff --git a/.editorconfig b/.editorconfig index fdfd3d6..22114f7 100644 --- a/.editorconfig +++ b/.editorconfig @@ -1,6 +1,6 @@ root = true -[*.{c,txt,js}] +[*.{cc,txt,js}] indent_style = space indent_size = 2 tab_width = 8 diff --git a/src/scanner.c b/src/scanner.c deleted file mode 100644 index bc971da..0000000 --- a/src/scanner.c +++ /dev/null @@ -1,257 +0,0 @@ -#include - -#include -#include -#include -#include - -enum TokenType { - QUOTED_TEMPLATE_START, - QUOTED_TEMPLATE_END, - TEMPLATE_LITERAL_CHUNK, - TEMPLATE_INTERPOLATION_START, - TEMPLATE_INTERPOLATION_END, - HEREDOC_IDENTIFIER, -}; - -static void advance(TSLexer *lexer) { lexer->advance(lexer, false); } - -static void skip(TSLexer *lexer) { lexer->advance(lexer, true); } - -static bool accept_inplace(TSLexer *lexer, enum TokenType token) { - lexer->result_symbol = token; - return true; -} - -static bool accept_and_advance(TSLexer *lexer, enum TokenType token) { - advance(lexer); - return accept_inplace(lexer, token); -} - -static bool consume_wxdigit(TSLexer *lexer) { - advance(lexer); - return iswxdigit(lexer->lookahead); -} - -typedef struct Scanner { - bool in_template_interpolation; - bool in_quoted_context; - int template_interpolation_depth; - int quoted_context_depth; -} Scanner; - -void print_debug_info(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols) { - printf("\nDEBUG INFO START\n\n"); - printf("currently at: '%c'\n\n", lexer->lookahead); - printf("could be one of\n"); - printf("quoted_template_start: %x\n", valid_symbols[QUOTED_TEMPLATE_START]); - printf("quoted_template_end: %x\n", valid_symbols[QUOTED_TEMPLATE_END]); - printf("template_literal_chunk: %x\n", valid_symbols[TEMPLATE_LITERAL_CHUNK]); - printf("template_interpolation_start: %x\n", valid_symbols[TEMPLATE_INTERPOLATION_START]); - printf("template_interpolation_end: %x\n", valid_symbols[TEMPLATE_INTERPOLATION_END]); - printf("\n"); - printf("scanner state:\n"); - printf("in_template_interpolation %x\n", scanner->in_template_interpolation); - printf("in_quoted_context %x\n", scanner->in_quoted_context); - printf("template_interpolation_depth %x\n", scanner->template_interpolation_depth); - printf("quoted_context_depth %x\n", scanner->quoted_context_depth); - printf("\nDEBUG INFO END\n\n"); -} - -void scanner_enter_interpolation_context(Scanner *scanner) { - scanner->template_interpolation_depth++; - scanner->in_template_interpolation = true; - scanner->in_quoted_context = false; -} - -void scanner_exit_interpolation_context(Scanner *scanner) { - assert(scanner->template_interpolation_depth > 0); - scanner->template_interpolation_depth--; - scanner->in_template_interpolation = false; - if (scanner->quoted_context_depth > 0) { - scanner->in_quoted_context = true; - } -} - -void scanner_enter_quoted_context(Scanner *scanner) { - scanner->quoted_context_depth++; - scanner->in_quoted_context = true; - scanner->in_template_interpolation = false; -} - -void scanner_exit_quoted_context(Scanner *scanner) { - assert(scanner->quoted_context_depth > 0); - scanner->quoted_context_depth--; - scanner->in_quoted_context = false; - if (scanner->template_interpolation_depth > 0) { - scanner->in_template_interpolation = true; - } -} - -bool scanner_scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols) { - // print_debug_info(scanner, lexer, valid_symbols); - - while (iswspace(lexer->lookahead)) { - skip(lexer); - } - if (lexer->lookahead == '\0') return false; - - // manage quoted context - if ( - valid_symbols[QUOTED_TEMPLATE_START] && - !scanner->in_quoted_context && - lexer->lookahead == '"' - ) { - scanner_enter_quoted_context(scanner); - return accept_and_advance(lexer, QUOTED_TEMPLATE_START); - } - if ( - valid_symbols[QUOTED_TEMPLATE_END] && - scanner->in_quoted_context && - lexer->lookahead == '"' - ) { - scanner_exit_quoted_context(scanner); - return accept_and_advance(lexer, QUOTED_TEMPLATE_END); - } - - // manage template interpolations - if ( - valid_symbols[TEMPLATE_INTERPOLATION_START] && - lexer->lookahead == '$' - ) { - advance(lexer); - if (lexer->lookahead == '{') { - scanner_enter_interpolation_context(scanner); - return accept_and_advance(lexer, TEMPLATE_INTERPOLATION_START); - } - if (valid_symbols[TEMPLATE_LITERAL_CHUNK]) { - // try to scan escape sequence - if (lexer->lookahead == '$') { - advance(lexer); - if (lexer->lookahead == '{') { - // $${ - return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK); - } - } - return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK); - } - } - if ( - valid_symbols[TEMPLATE_INTERPOLATION_END] && - lexer->lookahead == '}' && - scanner->in_template_interpolation - ) { - scanner_exit_interpolation_context(scanner); - return accept_and_advance(lexer, TEMPLATE_INTERPOLATION_END); - } - - // handle template literal chunks - - // handle template literal chunks in quoted contexts - // - // they may not contain newlines and may contain escape sequences - - if (valid_symbols[TEMPLATE_LITERAL_CHUNK] && scanner->in_quoted_context) { - switch (lexer->lookahead) { - case '\\': - advance(lexer); - switch (lexer->lookahead) { - case '"': - case 'n': - case 'r': - case 't': - case '\\': - return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK); - case 'u': - for (int i = 0; i < 4; i++) { - if (!consume_wxdigit(lexer)) return false; - } - return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK); - case 'U': - for (int i = 0; i < 8; i++) { - if (!consume_wxdigit(lexer)) return false; - } - return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK); - default: - return false; - } - } - } - - // handle escaped template interpolations in string literals - if ( - valid_symbols[TEMPLATE_LITERAL_CHUNK] && - !valid_symbols[TEMPLATE_INTERPOLATION_START] && - scanner->in_quoted_context - ) { - // try to scan escaped template interpolation - switch (lexer->lookahead) { - case '$': - advance(lexer); - if (lexer->lookahead == '{') { - // unescaped template interpolation - return false; - } - if (lexer->lookahead == '$') { - advance(lexer); - if (lexer->lookahead == '{') { - // $${ - return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK); - } - return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK); - } - } - } - - // handle heredoc identifier - if (valid_symbols[HEREDOC_IDENTIFIER]) { - if (lexer->lookahead != 'E') { - if (valid_symbols[TEMPLATE_LITERAL_CHUNK]) return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK); - } - advance(lexer); - if (lexer->lookahead != 'O') { - if (valid_symbols[TEMPLATE_LITERAL_CHUNK]) return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK); - } - advance(lexer); - if (lexer->lookahead != 'F') { - if (valid_symbols[TEMPLATE_LITERAL_CHUNK]) return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK); - } - return accept_and_advance(lexer, HEREDOC_IDENTIFIER); - } - - // handle all other quoted template or string literal characters - if (valid_symbols[TEMPLATE_LITERAL_CHUNK]) { - return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK); - } - - // probably not handled by the external scanner - return false; -} - -void *tree_sitter_hcl_external_scanner_create() { - Scanner *scanner = (Scanner*)malloc(sizeof(Scanner)); - if (scanner) { - scanner->in_template_interpolation = false; - scanner->in_quoted_context = false; - scanner->template_interpolation_depth = 0; - scanner->quoted_context_depth = 0; - } - return scanner; -} - -void tree_sitter_hcl_external_scanner_destroy(void *p) { - free(p); -} - -unsigned tree_sitter_hcl_external_scanner_serialize(void *p, char *b) { - memcpy(b, p, sizeof(Scanner)); return sizeof(Scanner); -} - -void tree_sitter_hcl_external_scanner_deserialize(void *p, const char *b, unsigned n) { - memcpy(p, b, n); -} - -bool tree_sitter_hcl_external_scanner_scan(void *p, TSLexer *lexer, const bool *valid_symbols) { - return scanner_scan((Scanner*)p, lexer, valid_symbols); -} - diff --git a/src/scanner.cc b/src/scanner.cc new file mode 100644 index 0000000..52ba60a --- /dev/null +++ b/src/scanner.cc @@ -0,0 +1,255 @@ +#include + +#include +#include +#include +#include +#include + +namespace { + +using std::vector; +using std::string; + +enum TokenType { + QUOTED_TEMPLATE_START, + QUOTED_TEMPLATE_END, + TEMPLATE_LITERAL_CHUNK, + TEMPLATE_INTERPOLATION_START, + TEMPLATE_INTERPOLATION_END, + HEREDOC_IDENTIFIER, +}; + +enum ContextType { + TEMPLATE_INTERPOLATION, + QUOTED_TEMPLATE, + HEREDOC_TEMPLATE, +}; + +struct Context { + ContextType type; + + // valid if type == HEREDOC_TEMPLATE + char* identifier; + size_t identifier_size; +}; + +struct Scanner { + +public: + // TODO: implement properly + unsigned serialize(char* buf) { + return 0; + } + + // TODO: implement properly + void deserialize(const char* buf, unsigned n) { + } + + bool scan(TSLexer* lexer, const bool* valid_symbols) { + while (iswspace(lexer->lookahead)) { + skip(lexer); + } + if (lexer->lookahead == '\0') { + return false; + } + // manage quoted context + if (valid_symbols[QUOTED_TEMPLATE_START] && !in_quoted_context() && lexer->lookahead == '"') { + context_stack.push_back({ .type = QUOTED_TEMPLATE }); + return accept_and_advance(lexer, QUOTED_TEMPLATE_START); + } + if (valid_symbols[QUOTED_TEMPLATE_END] && in_quoted_context() && lexer->lookahead == '"') { + context_stack.pop_back(); + return accept_and_advance(lexer, QUOTED_TEMPLATE_END); + } + + // manage template interpolations + if (valid_symbols[TEMPLATE_INTERPOLATION_START] && !in_interpolation_context() && lexer->lookahead == '$') { + advance(lexer); + if (lexer->lookahead == '{') { + context_stack.push_back({ .type = TEMPLATE_INTERPOLATION}); + return accept_and_advance(lexer, TEMPLATE_INTERPOLATION_START); + } + if (valid_symbols[TEMPLATE_LITERAL_CHUNK]) { + // try to scan escape sequence + if (lexer->lookahead == '$') { + advance(lexer); + if (lexer->lookahead == '{') { + // $${ + return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK); + } + } + return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK); + } + } + if (valid_symbols[TEMPLATE_INTERPOLATION_END] && in_interpolation_context() && lexer->lookahead == '}') { + context_stack.pop_back(); + return accept_and_advance(lexer, TEMPLATE_INTERPOLATION_END); + } + + // handle template literal chunks + + // handle template literal chunks in quoted contexts + // + // they may not contain newlines and may contain escape sequences + + if (valid_symbols[TEMPLATE_LITERAL_CHUNK] && in_quoted_context()) { + switch (lexer->lookahead) { + case '\\': + advance(lexer); + switch (lexer->lookahead) { + case '"': + case 'n': + case 'r': + case 't': + case '\\': + return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK); + case 'u': + for (int i = 0; i < 4; i++) { + if (!consume_wxdigit(lexer)) return false; + } + return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK); + case 'U': + for (int i = 0; i < 8; i++) { + if (!consume_wxdigit(lexer)) return false; + } + return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK); + default: + return false; + } + } + } + + // handle escaped template interpolations in string literals + if (valid_symbols[TEMPLATE_LITERAL_CHUNK] && !valid_symbols[TEMPLATE_INTERPOLATION_START] && in_quoted_context()) { + // try to scan escaped template interpolation + switch (lexer->lookahead) { + case '$': + advance(lexer); + if (lexer->lookahead == '{') { + // unescaped template interpolation + return false; + } + if (lexer->lookahead == '$') { + advance(lexer); + if (lexer->lookahead == '{') { + // $${ + return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK); + } + return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK); + } + } + } + + // handle heredoc identifier + if (valid_symbols[HEREDOC_IDENTIFIER]) { + if (lexer->lookahead != 'E') { + if (valid_symbols[TEMPLATE_LITERAL_CHUNK]) { + return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK); + } else { + return false; + } + } + advance(lexer); + if (lexer->lookahead != 'O') { + if (valid_symbols[TEMPLATE_LITERAL_CHUNK]) { + return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK); + } else { + return false; + } + } + advance(lexer); + if (lexer->lookahead != 'F') { + if (valid_symbols[TEMPLATE_LITERAL_CHUNK]) { + return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK); + } else { + return false; + } + } + return accept_and_advance(lexer, HEREDOC_IDENTIFIER); + } + + // handle all other quoted template or string literal characters + if (valid_symbols[TEMPLATE_LITERAL_CHUNK] && in_template_context()) { + return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK); + } + + // probably not handled by the external scanner + return false; + } + +private: + vector context_stack; + vector heredoc_identifier_stack; + + void advance(TSLexer* lexer) { + lexer->advance(lexer, false); + } + + void skip(TSLexer* lexer) { lexer->advance(lexer, true); } + + bool accept_inplace(TSLexer* lexer, TokenType token) { + lexer->result_symbol = token; + return true; + } + + bool accept_and_advance(TSLexer* lexer, TokenType token) { + advance(lexer); + return accept_inplace(lexer, token); + } + bool consume_wxdigit(TSLexer* lexer) { + advance(lexer); + return iswxdigit(lexer->lookahead); + } + + bool in_context_type(ContextType type) { + if (context_stack.empty()) { + return false; + } + return context_stack.back().type == type; + } + + bool in_quoted_context() { + return in_context_type(QUOTED_TEMPLATE); + } + + bool in_template_context() { + return in_context_type(QUOTED_TEMPLATE) || in_context_type(HEREDOC_TEMPLATE); + } + + bool in_interpolation_context() { + return in_context_type(TEMPLATE_INTERPOLATION); + } + +}; + +} // namespace + +extern "C" { + +// tree sitter callbacks +void* tree_sitter_hcl_external_scanner_create() { + return new Scanner(); +} + +void tree_sitter_hcl_external_scanner_destroy(void* p) { + Scanner* scanner = static_cast(p); + delete scanner; +} + +unsigned tree_sitter_hcl_external_scanner_serialize(void* p, char* b) { + Scanner* scanner = static_cast(p); + return scanner->serialize(b); +} + +void tree_sitter_hcl_external_scanner_deserialize(void* p, const char* b, unsigned n) { + Scanner* scanner = static_cast(p); + return scanner->deserialize(b, n); +} + +bool tree_sitter_hcl_external_scanner_scan(void* p, TSLexer* lexer, const bool* valid_symbols) { + Scanner* scanner = static_cast(p); + return scanner->scan(lexer, valid_symbols); +} + +} // extern "C" diff --git a/test/corpus/collections.txt b/test/corpus/collections.txt index 9cce689..aa810e5 100644 --- a/test/corpus/collections.txt +++ b/test/corpus/collections.txt @@ -1,5 +1,5 @@ ================================================================================ -tuple +simple tuple ================================================================================ foo = [1, 2, "foo"]