grammar: fix structure of comments in block bodies

This commit is contained in:
Michael Hoffmann
2023-04-08 15:36:10 +02:00
parent 4e33af0710
commit 486488948b
9 changed files with 27177 additions and 26547 deletions

View File

@@ -4,6 +4,7 @@
feature feature
* add dialects so we can have different queries in `nvim-treesitter` * add dialects so we can have different queries in `nvim-treesitter`
* fix structure of comments in block bodies
housekeeping: housekeeping:
* reformat using LSPs, ditch editorconfig * reformat using LSPs, ditch editorconfig

View File

@@ -23,6 +23,28 @@
] ]
}, },
"body": { "body": {
"type": "CHOICE",
"members": [
{
"type": "SYMBOL",
"name": "_shim"
},
{
"type": "SEQ",
"members": [
{
"type": "CHOICE",
"members": [
{
"type": "SYMBOL",
"name": "_shim"
},
{
"type": "BLANK"
}
]
},
{
"type": "REPEAT1", "type": "REPEAT1",
"content": { "content": {
"type": "CHOICE", "type": "CHOICE",
@@ -37,6 +59,10 @@
} }
] ]
} }
}
]
}
]
}, },
"attribute": { "attribute": {
"type": "SEQ", "type": "SEQ",
@@ -1779,6 +1805,10 @@
{ {
"type": "SYMBOL", "type": "SYMBOL",
"name": "heredoc_identifier" "name": "heredoc_identifier"
},
{
"type": "SYMBOL",
"name": "_shim"
} }
], ],
"inline": [], "inline": [],

View File

@@ -139,7 +139,7 @@
"fields": {}, "fields": {},
"children": { "children": {
"multiple": true, "multiple": true,
"required": true, "required": false,
"types": [ "types": [
{ {
"type": "attribute", "type": "attribute",

File diff suppressed because it is too large Load Diff

View File

@@ -1,26 +1,341 @@
#include "../../../src/scanner.cc" #include <tree_sitter/parser.h>
#include <assert.h>
#include <climits>
#include <string>
#include <vector>
#include <wctype.h>
namespace {
using std::string;
using std::vector;
// Tokens this external scanner can hand back to the parser via
// `lexer->result_symbol`; also used to index `valid_symbols` in scan().
// NOTE(review): the enumerator order presumably has to mirror the grammar's
// external token list (where `_shim` follows `heredoc_identifier`) — confirm
// against the grammar before reordering.
enum TokenType {
QUOTED_TEMPLATE_START,
QUOTED_TEMPLATE_END,
TEMPLATE_LITERAL_CHUNK,
TEMPLATE_INTERPOLATION_START,
TEMPLATE_INTERPOLATION_END,
TEMPLATE_DIRECTIVE_START,
TEMPLATE_DIRECTIVE_END,
HEREDOC_IDENTIFIER,
// marker token: scan() emits it when, after any '#' comments, the next
// character is not '}'
SHIM,
};
// Kinds of nesting the scanner tracks on its context stack. The numeric value
// is written as-is into the serialization buffer by Scanner::serialize and
// cast back in Scanner::deserialize.
enum ContextType {
TEMPLATE_INTERPOLATION,
TEMPLATE_DIRECTIVE,
QUOTED_TEMPLATE,
HEREDOC_TEMPLATE,
};
// One entry of the scanner's context stack: which construct we are inside
// and, for heredocs, the identifier that terminates it.
struct Context {
ContextType type;
// valid if type == HEREDOC_TEMPLATE
// (the other context types are pushed with an empty string)
string heredoc_identifier;
};
// External scanner for the context-sensitive tokens of the grammar.
//
// The scanner keeps a stack of the template constructs it is currently inside
// (quoted template, heredoc, `${` interpolation, `%{` directive). The top of
// that stack decides which of the TokenType tokens may be produced next. The
// stack is the only state and is what serialize()/deserialize() persist.
struct Scanner {
public:
  // Writes the context stack into `buf` and returns the number of bytes used.
  //
  // Layout: one byte with the stack depth, then per context one byte for the
  // type, one byte for the heredoc identifier length and the identifier bytes.
  // Returns 0 (i.e. "no state") when the depth or an identifier length does
  // not fit into a char, or when the buffer would overflow.
  unsigned serialize(char *buf) {
    unsigned size = 0;

    if (context_stack.size() > CHAR_MAX) {
      return 0;
    }

    buf[size++] = context_stack.size();
    for (vector<Context>::iterator it = context_stack.begin();
         it != context_stack.end(); ++it) {
      // 2 = type byte + length byte written below
      if (size + 2 + it->heredoc_identifier.size() >=
          TREE_SITTER_SERIALIZATION_BUFFER_SIZE) {
        return 0;
      }
      if (it->heredoc_identifier.size() > CHAR_MAX) {
        return 0;
      }
      buf[size++] = it->type;
      buf[size++] = it->heredoc_identifier.size();
      it->heredoc_identifier.copy(&buf[size], it->heredoc_identifier.size());
      size += it->heredoc_identifier.size();
    }
    return size;
  }

  // Rebuilds the context stack from a buffer produced by serialize().
  //
  // `n` is the number of valid bytes in `buf`; n == 0 restores the empty
  // stack. The buffer is trusted to have the layout serialize() writes; the
  // final assert only checks that exactly `n` bytes were consumed.
  void deserialize(const char *buf, unsigned n) {
    context_stack.clear();

    if (n == 0) {
      return;
    }

    unsigned size = 0;
    uint8_t context_stack_size = buf[size++];
    for (unsigned j = 0; j < context_stack_size; j++) {
      Context ctx;
      ctx.type = static_cast<ContextType>(buf[size++]);
      uint8_t heredoc_identifier_size = buf[size++];
      ctx.heredoc_identifier.assign(buf + size,
                                    buf + size + heredoc_identifier_size);
      size += heredoc_identifier_size;
      context_stack.push_back(ctx);
    }
    assert(size == n);
  }

  // Tries to recognise one external token at the current lexer position.
  //
  // `valid_symbols` is indexed by TokenType and tells which tokens the parser
  // would accept here. On success the token is stored in
  // `lexer->result_symbol` and true is returned; false means "not handled by
  // the external scanner". The order of the checks below matters.
  bool scan(TSLexer *lexer, const bool *valid_symbols) {
    // Skip leading whitespace, remembering whether a newline was crossed; the
    // heredoc terminator is only looked for after a newline.
    bool has_leading_whitespace_with_newline = false;
    while (iswspace(lexer->lookahead)) {
      if (lexer->lookahead == '\n') {
        has_leading_whitespace_with_newline = true;
      }
      skip(lexer);
    }
    // '\0' is treated as end of input: nothing to scan.
    if (lexer->lookahead == '\0') {
      return false;
    }

    if (valid_symbols[SHIM]) {
      // Mark the token end before looking past the comments, then peek over
      // any '#' comments. The shim is produced unless the block closes right
      // after them.
      lexer->mark_end(lexer);
      while (skip_comment(lexer))
        ;
      if (lexer->lookahead != '}') {
        return accept_inplace(lexer, SHIM);
      }
    }

    // manage quoted context
    if (valid_symbols[QUOTED_TEMPLATE_START] && !in_quoted_context() &&
        lexer->lookahead == '"') {
      Context ctx = {QUOTED_TEMPLATE, ""};
      context_stack.push_back(ctx);
      return accept_and_advance(lexer, QUOTED_TEMPLATE_START);
    }
    if (valid_symbols[QUOTED_TEMPLATE_END] && in_quoted_context() &&
        lexer->lookahead == '"') {
      context_stack.pop_back();
      return accept_and_advance(lexer, QUOTED_TEMPLATE_END);
    }

    // manage template interpolations
    if (valid_symbols[TEMPLATE_INTERPOLATION_START] &&
        valid_symbols[TEMPLATE_LITERAL_CHUNK] && !in_interpolation_context() &&
        lexer->lookahead == '$') {
      advance(lexer);
      if (lexer->lookahead == '{') {
        Context ctx = {TEMPLATE_INTERPOLATION, ""};
        context_stack.push_back(ctx);
        return accept_and_advance(lexer, TEMPLATE_INTERPOLATION_START);
      }
      // try to scan escape sequence
      if (lexer->lookahead == '$') {
        advance(lexer);
        if (lexer->lookahead == '{') {
          // $${
          return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
        }
      }
      // a lone '$' (or "$$" not followed by '{') is plain literal text
      return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK);
    }
    if (valid_symbols[TEMPLATE_INTERPOLATION_END] &&
        in_interpolation_context() && lexer->lookahead == '}') {
      context_stack.pop_back();
      return accept_and_advance(lexer, TEMPLATE_INTERPOLATION_END);
    }

    // manage template directives
    if (valid_symbols[TEMPLATE_DIRECTIVE_START] &&
        valid_symbols[TEMPLATE_LITERAL_CHUNK] && !in_directive_context() &&
        lexer->lookahead == '%') {
      advance(lexer);
      if (lexer->lookahead == '{') {
        Context ctx = {TEMPLATE_DIRECTIVE, ""};
        context_stack.push_back(ctx);
        return accept_and_advance(lexer, TEMPLATE_DIRECTIVE_START);
      }
      // try to scan escape sequence
      if (lexer->lookahead == '%') {
        advance(lexer);
        if (lexer->lookahead == '{') {
          // %%{
          return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
        }
      }
      // a lone '%' (or "%%" not followed by '{') is plain literal text
      return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK);
    }
    if (valid_symbols[TEMPLATE_DIRECTIVE_END] && in_directive_context() &&
        lexer->lookahead == '}') {
      context_stack.pop_back();
      return accept_and_advance(lexer, TEMPLATE_DIRECTIVE_END);
    }

    // manage heredoc context
    // opening identifier: remember it so the terminator can be matched later
    if (valid_symbols[HEREDOC_IDENTIFIER] && !in_heredoc_context()) {
      string identifier;
      // TODO: check that this is a valid identifier
      while (iswalnum(lexer->lookahead) || lexer->lookahead == '_' ||
             lexer->lookahead == '-') {
        identifier.push_back(lexer->lookahead);
        advance(lexer);
      }
      Context ctx = {HEREDOC_TEMPLATE, identifier};
      context_stack.push_back(ctx);
      return accept_inplace(lexer, HEREDOC_IDENTIFIER);
    }
    // closing identifier: only considered at the start of a line
    if (valid_symbols[HEREDOC_IDENTIFIER] && in_heredoc_context() &&
        has_leading_whitespace_with_newline) {
      string expected_identifier = context_stack.back().heredoc_identifier;

      for (string::iterator it = expected_identifier.begin();
           it != expected_identifier.end(); ++it) {
        if (lexer->lookahead == *it) {
          advance(lexer);
        } else {
          // mismatch: what was consumed so far is ordinary heredoc content
          return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK);
        }
      }
      // check if the identifier is on a line of its own
      lexer->mark_end(lexer);
      while (iswspace(lexer->lookahead) && lexer->lookahead != '\n') {
        advance(lexer);
      }
      if (lexer->lookahead == '\n') {
        context_stack.pop_back();
        return accept_inplace(lexer, HEREDOC_IDENTIFIER);
      } else {
        // something else follows on the line: treat it all as content
        advance(lexer);
        lexer->mark_end(lexer);
        return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK);
      }
    }

    // manage template literal chunks

    // handle template literal chunks in quoted contexts
    //
    // they may not contain newlines and may contain escape sequences
    if (valid_symbols[TEMPLATE_LITERAL_CHUNK] && in_quoted_context()) {
      switch (lexer->lookahead) {
      case '\\':
        advance(lexer);
        switch (lexer->lookahead) {
        case '"':
        case 'n':
        case 'r':
        case 't':
        case '\\':
          return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
        case 'u':
          // \uXXXX: four hex digits
          for (int i = 0; i < 4; i++) {
            if (!consume_wxdigit(lexer))
              return false;
          }
          return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
        case 'U':
          // \UXXXXXXXX: eight hex digits
          for (int i = 0; i < 8; i++) {
            if (!consume_wxdigit(lexer))
              return false;
          }
          return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
        default:
          return false;
        }
      }
    }

    // handle all other quoted template or string literal characters
    if (valid_symbols[TEMPLATE_LITERAL_CHUNK] && in_template_context()) {
      return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
    }

    // probably not handled by the external scanner
    return false;
  }

private:
  // Innermost context is at the back.
  vector<Context> context_stack;

  // Consume the lookahead character as part of the current token.
  void advance(TSLexer *lexer) { lexer->advance(lexer, false); }

  // Move past the lookahead character with the lexer's `skip` flag set.
  void skip(TSLexer *lexer) { lexer->advance(lexer, true); }

  // Report `token` without consuming the current lookahead character.
  bool accept_inplace(TSLexer *lexer, TokenType token) {
    lexer->result_symbol = token;
    return true;
  }

  // Consume the current lookahead character, then report `token`.
  bool accept_and_advance(TSLexer *lexer, TokenType token) {
    advance(lexer);
    return accept_inplace(lexer, token);
  }

  // Advance one character and tell whether the new lookahead is a hex digit.
  bool consume_wxdigit(TSLexer *lexer) {
    advance(lexer);
    return iswxdigit(lexer->lookahead);
  }

  // Skips whitespace and, if a '#' line comment follows, the comment up to
  // (not including) its terminating newline. Returns true when a comment was
  // skipped, false when the next non-whitespace character is not '#'.
  bool skip_comment(TSLexer *lexer) {
    while (iswspace(lexer->lookahead)) {
      skip(lexer);
    }
    if (lexer->lookahead != '#') {
      return false;
    }

    skip(lexer);
    // Also stop at '\0' (what scan() treats as end of input): a comment on
    // the last line without a trailing newline would otherwise never see
    // '\n' and this loop would not terminate.
    while (lexer->lookahead != '\n' && lexer->lookahead != '\0') {
      skip(lexer);
    }
    return true;
  }

  // True when the innermost context has the given type; false on an empty
  // stack.
  bool in_context_type(ContextType type) {
    if (context_stack.empty()) {
      return false;
    }
    return context_stack.back().type == type;
  }

  bool in_quoted_context() { return in_context_type(QUOTED_TEMPLATE); }

  bool in_heredoc_context() { return in_context_type(HEREDOC_TEMPLATE); }

  // Either kind of template whose body is made of literal chunks.
  bool in_template_context() {
    return in_quoted_context() || in_heredoc_context();
  }

  bool in_interpolation_context() {
    return in_context_type(TEMPLATE_INTERPOLATION);
  }

  bool in_directive_context() { return in_context_type(TEMPLATE_DIRECTIVE); }
};
} // namespace
extern "C" { extern "C" {
// tree sitter callbacks // tree sitter callbacks
void* tree_sitter_terraform_external_scanner_create() { void *tree_sitter_terraform_external_scanner_create() { return new Scanner(); }
return tree_sitter_hcl_external_scanner_create();
void tree_sitter_terraform_external_scanner_destroy(void *p) {
Scanner *scanner = static_cast<Scanner *>(p);
delete scanner;
} }
void tree_sitter_terraform_external_scanner_destroy(void* p) { unsigned tree_sitter_terraform_external_scanner_serialize(void *p, char *b) {
return tree_sitter_hcl_external_scanner_destroy(p); Scanner *scanner = static_cast<Scanner *>(p);
return scanner->serialize(b);
} }
unsigned tree_sitter_terraform_external_scanner_serialize(void* p, char* b) { void tree_sitter_terraform_external_scanner_deserialize(void *p, const char *b,
return tree_sitter_hcl_external_scanner_serialize(p, b); unsigned n) {
Scanner *scanner = static_cast<Scanner *>(p);
return scanner->deserialize(b, n);
} }
void tree_sitter_terraform_external_scanner_deserialize(void* p, const char* b, unsigned n) { bool tree_sitter_terraform_external_scanner_scan(void *p, TSLexer *lexer,
return tree_sitter_hcl_external_scanner_deserialize(p, b, n); const bool *valid_symbols) {
} Scanner *scanner = static_cast<Scanner *>(p);
return scanner->scan(lexer, valid_symbols);
bool tree_sitter_terraform_external_scanner_scan(void* p, TSLexer* lexer, const bool* valid_symbols) {
return tree_sitter_hcl_external_scanner_scan(p, lexer, valid_symbols);
} }
} // extern "C" } // extern "C"

View File

@@ -23,6 +23,28 @@
] ]
}, },
"body": { "body": {
"type": "CHOICE",
"members": [
{
"type": "SYMBOL",
"name": "_shim"
},
{
"type": "SEQ",
"members": [
{
"type": "CHOICE",
"members": [
{
"type": "SYMBOL",
"name": "_shim"
},
{
"type": "BLANK"
}
]
},
{
"type": "REPEAT1", "type": "REPEAT1",
"content": { "content": {
"type": "CHOICE", "type": "CHOICE",
@@ -37,6 +59,10 @@
} }
] ]
} }
}
]
}
]
}, },
"attribute": { "attribute": {
"type": "SEQ", "type": "SEQ",
@@ -1779,6 +1805,10 @@
{ {
"type": "SYMBOL", "type": "SYMBOL",
"name": "heredoc_identifier" "name": "heredoc_identifier"
},
{
"type": "SYMBOL",
"name": "_shim"
} }
], ],
"inline": [], "inline": [],

View File

@@ -139,7 +139,7 @@
"fields": {}, "fields": {},
"children": { "children": {
"multiple": true, "multiple": true,
"required": true, "required": false,
"types": [ "types": [
{ {
"type": "attribute", "type": "attribute",

26558
src/parser.c

File diff suppressed because it is too large Load Diff

146
test/corpus/comments.txt Normal file
View File

@@ -0,0 +1,146 @@
================================================================================
comment in empty block body
================================================================================
block {
# foo
}
--------------------------------------------------------------------------------
(config_file
(body
(block
(identifier)
(block_start)
(comment)
(block_end))))
================================================================================
multiline comment in empty block body
================================================================================
block {
/*
foo
*/
}
--------------------------------------------------------------------------------
(config_file
(body
(block
(identifier)
(block_start)
(body)
(comment)
(block_end))))
================================================================================
multiline comment above attribute in block body
================================================================================
block {
/*
foo
*/
foo = bar
}
--------------------------------------------------------------------------------
(config_file
(body
(block
(identifier)
(block_start)
(body
(comment)
(attribute
(identifier)
(expression
(variable_expr
(identifier)))))
(block_end))))
================================================================================
comment above first attribute in block body
================================================================================
block {
# foo
foo = bar
}
--------------------------------------------------------------------------------
(config_file
(body
(block
(identifier)
(block_start)
(body
(comment)
(attribute
(identifier)
(expression
(variable_expr
(identifier)))))
(block_end))))
================================================================================
comment after last attribute in block body
================================================================================
block {
foo = bar
# foo
}
--------------------------------------------------------------------------------
(config_file
(body
(block
(identifier)
(block_start)
(body
(attribute
(identifier)
(expression
(variable_expr
(identifier)))))
(comment)
(block_end))))
================================================================================
comment between attributes in block body
================================================================================
block {
foo = bar
# foo
baz = quz
}
--------------------------------------------------------------------------------
(config_file
(body
(block
(identifier)
(block_start)
(body
(attribute
(identifier)
(expression
(variable_expr
(identifier))))
(comment)
(attribute
(identifier)
(expression
(variable_expr
(identifier)))))
(block_end))))