grammar: fix structure of comments in block bodies

2023-04-08 15:36:10 +02:00
parent 4e33af0710
commit 486488948b
9 changed files with 27177 additions and 26547 deletions
@@ -4,6 +4,7 @@

 feature
 * add dialects so we can have different queries in `nvim-treesitter`
+* fix structure of comments in block bodies

 housekeeping:
 * reformat using LSPs, ditch editorconfig
@@ -23,6 +23,28 @@
      ]
    },
    "body": {
+      "type": "CHOICE",
+      "members": [
+        {
+          "type": "SYMBOL",
+          "name": "_shim"
+        },
+        {
+          "type": "SEQ",
+          "members": [
+            {
+              "type": "CHOICE",
+              "members": [
+                {
+                  "type": "SYMBOL",
+                  "name": "_shim"
+                },
+                {
+                  "type": "BLANK"
+                }
+              ]
+            },
+            {
              "type": "REPEAT1",
              "content": {
                "type": "CHOICE",
@@ -37,6 +59,10 @@
                  }
                ]
              }
+            }
+          ]
+        }
+      ]
    },
    "attribute": {
      "type": "SEQ",
@@ -1779,6 +1805,10 @@
    {
      "type": "SYMBOL",
      "name": "heredoc_identifier"
+    },
+    {
+      "type": "SYMBOL",
+      "name": "_shim"
    }
  ],
  "inline": [],
@@ -139,7 +139,7 @@
    "fields": {},
    "children": {
      "multiple": true,
-      "required": true,
+      "required": false,
      "types": [
        {
          "type": "attribute",
@@ -1,26 +1,341 @@
-#include "../../../src/scanner.cc"
+#include <tree_sitter/parser.h>
+
+#include <assert.h>
+#include <climits>
+#include <string>
+#include <vector>
+#include <wctype.h>
+
+namespace {
+
+using std::string;
+using std::vector;
+
+// Token kinds this external scanner can emit. scan() indexes valid_symbols
+// with these values and stores the chosen one in lexer->result_symbol.
+// NOTE(review): the order presumably has to match the grammar's "externals"
+// list — confirm against the grammar definition.
+enum TokenType {
+  QUOTED_TEMPLATE_START,
+  QUOTED_TEMPLATE_END,
+  TEMPLATE_LITERAL_CHUNK,
+  TEMPLATE_INTERPOLATION_START,
+  TEMPLATE_INTERPOLATION_END,
+  TEMPLATE_DIRECTIVE_START,
+  TEMPLATE_DIRECTIVE_END,
+  HEREDOC_IDENTIFIER,
+  SHIM,
+};
+
+// Kinds of nested constructs that scan() pushes onto and pops from the
+// scanner's context stack.
+enum ContextType {
+  TEMPLATE_INTERPOLATION,
+  TEMPLATE_DIRECTIVE,
+  QUOTED_TEMPLATE,
+  HEREDOC_TEMPLATE,
+};
+
+// One entry of the scanner's context stack: the kind of construct that was
+// entered plus, for heredocs, the identifier that terminates it.
+struct Context {
+  ContextType type;
+
+  // valid if type == HEREDOC_TEMPLATE
+  string heredoc_identifier;
+};
+
+struct Scanner {
+
+public:
+  // Writes the context stack into `buf` and returns the number of bytes used.
+  //
+  // Layout: one byte holding the number of contexts, then for every context a
+  // type byte, an identifier-length byte and the raw identifier characters.
+  // Returns 0 when the stack depth or an identifier length exceeds CHAR_MAX,
+  // or when an entry would not fit within
+  // TREE_SITTER_SERIALIZATION_BUFFER_SIZE.
+  unsigned serialize(char *buf) {
+    if (context_stack.size() > CHAR_MAX) {
+      return 0;
+    }
+
+    unsigned written = 0;
+    buf[written++] = context_stack.size();
+
+    for (vector<Context>::size_type idx = 0; idx < context_stack.size();
+         ++idx) {
+      const Context &ctx = context_stack[idx];
+      const string &identifier = ctx.heredoc_identifier;
+
+      // type byte + length byte + identifier must still fit in the buffer
+      if (written + 2 + identifier.size() >=
+          TREE_SITTER_SERIALIZATION_BUFFER_SIZE) {
+        return 0;
+      }
+      // the length is stored in a single char
+      if (identifier.size() > CHAR_MAX) {
+        return 0;
+      }
+
+      buf[written++] = ctx.type;
+      buf[written++] = identifier.size();
+      identifier.copy(&buf[written], identifier.size());
+      written += identifier.size();
+    }
+    return written;
+  }
+
+  // Rebuilds the context stack from bytes previously produced by serialize().
+  //
+  // buf: serialized state; n: number of valid bytes in buf. An empty buffer
+  // leaves the stack empty. Every read is checked against n, so a truncated
+  // or inconsistent buffer resets the scanner to an empty stack instead of
+  // reading past the end of buf (the trailing assert is compiled out under
+  // NDEBUG and cannot provide that protection).
+  void deserialize(const char *buf, unsigned n) {
+    context_stack.clear();
+
+    if (n == 0) {
+      return;
+    }
+
+    unsigned size = 0;
+    uint8_t context_stack_size = buf[size++];
+    for (unsigned j = 0; j < context_stack_size; j++) {
+      // every entry starts with a type byte and a length byte
+      if (n - size < 2) {
+        context_stack.clear();
+        return;
+      }
+      Context ctx;
+      ctx.type = static_cast<ContextType>(buf[size++]);
+      uint8_t heredoc_identifier_size = buf[size++];
+      // the identifier bytes must lie inside the buffer as well
+      if (n - size < heredoc_identifier_size) {
+        context_stack.clear();
+        return;
+      }
+      ctx.heredoc_identifier.assign(buf + size,
+                                    buf + size + heredoc_identifier_size);
+      size += heredoc_identifier_size;
+      context_stack.push_back(ctx);
+    }
+    // a well-formed buffer is consumed exactly
+    assert(size == n);
+  }
+
+  // Entry point invoked to recognize the next external token.
+  //
+  // lexer: tree-sitter lexer positioned at the next character.
+  // valid_symbols: indexed by TokenType; true for tokens accepted here.
+  // Returns true after storing the recognized token in lexer->result_symbol,
+  // false when none of the branches below matches.
+  //
+  // The branches are tried in this order: SHIM, quoted template delimiters,
+  // interpolation delimiters, directive delimiters, heredoc identifiers and
+  // finally template literal chunks.
+  bool scan(TSLexer *lexer, const bool *valid_symbols) {
+    // skip leading whitespace, remembering whether a newline was crossed
+    bool has_leading_whitespace_with_newline = false;
+    while (iswspace(lexer->lookahead)) {
+      if (lexer->lookahead == '\n') {
+        has_leading_whitespace_with_newline = true;
+      }
+      skip(lexer);
+    }
+    // a NUL lookahead is treated as "nothing left to scan"
+    if (lexer->lookahead == '\0') {
+      return false;
+    }
+    // SHIM: the token end is marked before any '#' comments are skipped, and
+    // the token is only emitted when something other than '}' follows them
+    if (valid_symbols[SHIM]) {
+      lexer->mark_end(lexer);
+      while(skip_comment(lexer));
+      if (lexer->lookahead != '}') {
+        return accept_inplace(lexer, SHIM);
+      }
+    }
+    // manage quoted context
+    if (valid_symbols[QUOTED_TEMPLATE_START] && !in_quoted_context() &&
+        lexer->lookahead == '"') {
+      Context ctx = {QUOTED_TEMPLATE, ""};
+      context_stack.push_back(ctx);
+      return accept_and_advance(lexer, QUOTED_TEMPLATE_START);
+    }
+    if (valid_symbols[QUOTED_TEMPLATE_END] && in_quoted_context() &&
+        lexer->lookahead == '"') {
+      context_stack.pop_back();
+      return accept_and_advance(lexer, QUOTED_TEMPLATE_END);
+    }
+
+    // manage template interpolations
+    if (valid_symbols[TEMPLATE_INTERPOLATION_START] &&
+        valid_symbols[TEMPLATE_LITERAL_CHUNK] && !in_interpolation_context() &&
+        lexer->lookahead == '$') {
+      advance(lexer);
+      if (lexer->lookahead == '{') {
+        Context ctx = {TEMPLATE_INTERPOLATION, ""};
+        context_stack.push_back(ctx);
+        return accept_and_advance(lexer, TEMPLATE_INTERPOLATION_START);
+      }
+      // try to scan escape sequence
+      if (lexer->lookahead == '$') {
+        advance(lexer);
+        if (lexer->lookahead == '{') {
+          // $${
+          return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
+        }
+      }
+      // a '$' (or '$$') not followed by '{' is ordinary literal text
+      return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK);
+    }
+    if (valid_symbols[TEMPLATE_INTERPOLATION_END] &&
+        in_interpolation_context() && lexer->lookahead == '}') {
+      context_stack.pop_back();
+      return accept_and_advance(lexer, TEMPLATE_INTERPOLATION_END);
+    }
+
+    // manage template directives
+    if (valid_symbols[TEMPLATE_DIRECTIVE_START] &&
+        valid_symbols[TEMPLATE_LITERAL_CHUNK] && !in_directive_context() &&
+        lexer->lookahead == '%') {
+      advance(lexer);
+      if (lexer->lookahead == '{') {
+        Context ctx = {TEMPLATE_DIRECTIVE, ""};
+        context_stack.push_back(ctx);
+        return accept_and_advance(lexer, TEMPLATE_DIRECTIVE_START);
+      }
+      // try to scan escape sequence
+      if (lexer->lookahead == '%') {
+        advance(lexer);
+        if (lexer->lookahead == '{') {
+          // %%{
+          return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
+        }
+      }
+      // a '%' (or '%%') not followed by '{' is ordinary literal text
+      return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK);
+    }
+    if (valid_symbols[TEMPLATE_DIRECTIVE_END] && in_directive_context() &&
+        lexer->lookahead == '}') {
+      context_stack.pop_back();
+      return accept_and_advance(lexer, TEMPLATE_DIRECTIVE_END);
+    }
+
+    // manage heredoc context
+    // not inside a heredoc yet: read the identifier and open a heredoc context
+    if (valid_symbols[HEREDOC_IDENTIFIER] && !in_heredoc_context()) {
+      string identifier;
+      // TODO: check that this is a valid identifier
+      while (iswalnum(lexer->lookahead) || lexer->lookahead == '_' ||
+             lexer->lookahead == '-') {
+        identifier.push_back(lexer->lookahead);
+        advance(lexer);
+      }
+      Context ctx = {HEREDOC_TEMPLATE, identifier};
+      context_stack.push_back(ctx);
+      return accept_inplace(lexer, HEREDOC_IDENTIFIER);
+    }
+    // inside a heredoc and a newline was just skipped: try to match the
+    // terminating identifier, otherwise emit the consumed text as a chunk
+    if (valid_symbols[HEREDOC_IDENTIFIER] && in_heredoc_context() &&
+        has_leading_whitespace_with_newline) {
+      string expected_identifier = context_stack.back().heredoc_identifier;
+
+      for (string::iterator it = expected_identifier.begin();
+           it != expected_identifier.end(); ++it) {
+        if (lexer->lookahead == *it) {
+          advance(lexer);
+        } else {
+          return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK);
+        }
+      }
+      // check if the identifier is on a line of its own
+      lexer->mark_end(lexer);
+      while (iswspace(lexer->lookahead) && lexer->lookahead != '\n') {
+        advance(lexer);
+      }
+      if (lexer->lookahead == '\n') {
+        context_stack.pop_back();
+        return accept_inplace(lexer, HEREDOC_IDENTIFIER);
+      } else {
+        // more text follows on the same line: extend the token over it and
+        // emit a literal chunk instead of closing the heredoc
+        advance(lexer);
+        lexer->mark_end(lexer);
+        return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK);
+      }
+    }
+    // manage template literal chunks
+
+    // handle template literal chunks in quoted contexts
+    //
+    // they may not contain newlines and may contain escape sequences
+    if (valid_symbols[TEMPLATE_LITERAL_CHUNK] && in_quoted_context()) {
+      switch (lexer->lookahead) {
+      case '\\':
+        advance(lexer);
+        switch (lexer->lookahead) {
+        case '"':
+        case 'n':
+        case 'r':
+        case 't':
+        case '\\':
+          return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
+        case 'u':
+          // \u must be followed by four hexadecimal digits
+          for (int i = 0; i < 4; i++) {
+            if (!consume_wxdigit(lexer))
+              return false;
+          }
+          return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
+        case 'U':
+          // \U must be followed by eight hexadecimal digits
+          for (int i = 0; i < 8; i++) {
+            if (!consume_wxdigit(lexer))
+              return false;
+          }
+          return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
+        default:
+          return false;
+        }
+      }
+    }
+
+    // handle all other quoted template or string literal characters
+    if (valid_symbols[TEMPLATE_LITERAL_CHUNK] && in_template_context()) {
+      return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
+    }
+
+    // probably not handled by the external scanner
+    return false;
+  }
+
+private:
+  vector<Context> context_stack;
+
+  // Moves the lexer one character forward, passing false as the skip flag.
+  void advance(TSLexer *lexer) {
+    const bool skip_flag = false;
+    lexer->advance(lexer, skip_flag);
+  }
+
+  // Moves the lexer one character forward, passing true as the skip flag.
+  void skip(TSLexer *lexer) {
+    const bool skip_flag = true;
+    lexer->advance(lexer, skip_flag);
+  }
+
+  // Records `token` as the scan result without consuming any further input.
+  // Always returns true so callers can write `return accept_inplace(...)`.
+  bool accept_inplace(TSLexer *lexer, TokenType token) {
+    lexer->result_symbol = token;
+    return true;
+  }
+
+  // Consumes the current character, then records `token` as the scan result.
+  // Always returns true.
+  bool accept_and_advance(TSLexer *lexer, TokenType token) {
+    lexer->advance(lexer, false);
+    lexer->result_symbol = token;
+    return true;
+  }
+
+  // Consumes the current character and reports whether the character that
+  // becomes the new lookahead is a hexadecimal digit.
+  bool consume_wxdigit(TSLexer *lexer) {
+    lexer->advance(lexer, false);
+    if (iswxdigit(lexer->lookahead)) {
+      return true;
+    }
+    return false;
+  }
+
+  // Skips leading whitespace and, if present, one '#' line comment.
+  //
+  // Returns true when a comment was skipped (the lookahead is then the
+  // terminating newline, or NUL when the input ended first) and false when
+  // the next non-whitespace character does not start a '#' comment.
+  bool skip_comment(TSLexer* lexer) {
+    while (iswspace(lexer->lookahead)) {
+      skip(lexer);
+    }
+    if (lexer->lookahead != '#') {
+      return false;
+    }
+    skip(lexer);
+    // Stop at a NUL lookahead as well: scan() treats it as end of input, and
+    // without this check a comment on the last line of a file that lacks a
+    // trailing newline would never leave this loop.
+    while (lexer->lookahead != '\n' && lexer->lookahead != '\0') {
+      skip(lexer);
+    }
+    return true;
+  }
+
+  // True when the context stack is non-empty and its top entry has `type`.
+  bool in_context_type(ContextType type) {
+    return !context_stack.empty() && context_stack.back().type == type;
+  }
+
+  // True when the innermost context is a quoted template.
+  bool in_quoted_context() {
+    const bool top_is_quoted = in_context_type(QUOTED_TEMPLATE);
+    return top_is_quoted;
+  }
+
+  // True when the innermost context is a heredoc template.
+  bool in_heredoc_context() {
+    const bool top_is_heredoc = in_context_type(HEREDOC_TEMPLATE);
+    return top_is_heredoc;
+  }
+
+  // True when the innermost context is either kind of template: quoted or
+  // heredoc.
+  bool in_template_context() {
+    if (in_quoted_context()) {
+      return true;
+    }
+    return in_heredoc_context();
+  }
+
+  // True when the innermost context is a template interpolation.
+  bool in_interpolation_context() {
+    const bool top_is_interpolation = in_context_type(TEMPLATE_INTERPOLATION);
+    return top_is_interpolation;
+  }
+
+  // True when the innermost context is a template directive.
+  bool in_directive_context() {
+    const bool top_is_directive = in_context_type(TEMPLATE_DIRECTIVE);
+    return top_is_directive;
+  }
+};
+
+} // namespace

 extern "C" {

 // tree sitter callbacks
-void* tree_sitter_terraform_external_scanner_create() {
-  return tree_sitter_hcl_external_scanner_create();
-}
+// Allocates a new Scanner and hands it back as an untyped pointer; the other
+// callbacks cast that pointer back to Scanner *.
+void *tree_sitter_terraform_external_scanner_create() { return new Scanner(); }

 void tree_sitter_terraform_external_scanner_destroy(void *p) {
-  return tree_sitter_hcl_external_scanner_destroy(p);
+  // p is cast back to Scanner * and deleted
+  Scanner *scanner = static_cast<Scanner *>(p);
+  delete scanner;
 }

 unsigned tree_sitter_terraform_external_scanner_serialize(void *p, char *b) {
-  return tree_sitter_hcl_external_scanner_serialize(p, b);
+  // forwards to Scanner::serialize, which returns the number of bytes written
+  Scanner *scanner = static_cast<Scanner *>(p);
+  return scanner->serialize(b);
 }

-void tree_sitter_terraform_external_scanner_deserialize(void* p, const char* b, unsigned n) {
-  return tree_sitter_hcl_external_scanner_deserialize(p, b, n);
+// Forwards to Scanner::deserialize with the n bytes stored in b.
+void tree_sitter_terraform_external_scanner_deserialize(void *p, const char *b,
+                                                  unsigned n) {
+  Scanner *scanner = static_cast<Scanner *>(p);
+  return scanner->deserialize(b, n);
 }

-bool tree_sitter_terraform_external_scanner_scan(void* p, TSLexer* lexer, const bool* valid_symbols) {
-  return tree_sitter_hcl_external_scanner_scan(p, lexer, valid_symbols);
+// Forwards to Scanner::scan; the result is true when a token was recognized.
+bool tree_sitter_terraform_external_scanner_scan(void *p, TSLexer *lexer,
+                                           const bool *valid_symbols) {
+  Scanner *scanner = static_cast<Scanner *>(p);
+  return scanner->scan(lexer, valid_symbols);
 }

 } // extern "C"
@@ -23,6 +23,28 @@
      ]
    },
    "body": {
+      "type": "CHOICE",
+      "members": [
+        {
+          "type": "SYMBOL",
+          "name": "_shim"
+        },
+        {
+          "type": "SEQ",
+          "members": [
+            {
+              "type": "CHOICE",
+              "members": [
+                {
+                  "type": "SYMBOL",
+                  "name": "_shim"
+                },
+                {
+                  "type": "BLANK"
+                }
+              ]
+            },
+            {
              "type": "REPEAT1",
              "content": {
                "type": "CHOICE",
@@ -37,6 +59,10 @@
                  }
                ]
              }
+            }
+          ]
+        }
+      ]
    },
    "attribute": {
      "type": "SEQ",
@@ -1779,6 +1805,10 @@
    {
      "type": "SYMBOL",
      "name": "heredoc_identifier"
+    },
+    {
+      "type": "SYMBOL",
+      "name": "_shim"
    }
  ],
  "inline": [],
@@ -139,7 +139,7 @@
    "fields": {},
    "children": {
      "multiple": true,
-      "required": true,
+      "required": false,
      "types": [
        {
          "type": "attribute",
@@ -0,0 +1,146 @@
+================================================================================
+comment in empty block body
+================================================================================
+
+block {
+  # foo
+}
+
+--------------------------------------------------------------------------------
+
+(config_file
+  (body
+    (block
+      (identifier)
+      (block_start)
+      (comment)
+      (block_end))))
+
+================================================================================
+multiline comment in empty block body
+================================================================================
+
+block {
+  /*
+    foo
+  */
+}
+
+--------------------------------------------------------------------------------
+
+(config_file
+  (body
+    (block
+      (identifier)
+      (block_start)
+      (body)
+      (comment)
+      (block_end))))
+
+================================================================================
+multiline comment above attribute in block body
+================================================================================
+
+block {
+  /*
+    foo
+  */
+  foo = bar
+}
+
+--------------------------------------------------------------------------------
+
+(config_file
+  (body
+    (block
+      (identifier)
+      (block_start)
+      (body
+        (comment)
+        (attribute
+          (identifier)
+          (expression
+            (variable_expr
+              (identifier)))))
+      (block_end))))
+
+================================================================================
+comment above first attribute in block body
+================================================================================
+
+block {
+  # foo
+  foo = bar
+}
+
+--------------------------------------------------------------------------------
+
+(config_file
+  (body
+    (block
+      (identifier)
+      (block_start)
+      (body
+        (comment)
+        (attribute
+          (identifier)
+          (expression
+            (variable_expr
+              (identifier)))))
+      (block_end))))
+
+================================================================================
+comment after last attribute in block body
+================================================================================
+
+block {
+  foo = bar
+  # foo
+}
+
+--------------------------------------------------------------------------------
+
+(config_file
+  (body
+    (block
+      (identifier)
+      (block_start)
+      (body
+        (attribute
+          (identifier)
+          (expression
+            (variable_expr
+              (identifier)))))
+      (comment)
+      (block_end))))
+
+================================================================================
+comment between attributes in block body
+================================================================================
+
+block {
+  foo = bar
+  # foo
+  baz = quz
+}
+
+--------------------------------------------------------------------------------
+
+(config_file
+  (body
+    (block
+      (identifier)
+      (block_start)
+      (body
+        (attribute
+          (identifier)
+          (expression
+            (variable_expr
+              (identifier))))
+        (comment)
+        (attribute
+          (identifier)
+          (expression
+            (variable_expr
+              (identifier)))))
+      (block_end))))