feat: rewrite the scanner in C
This commit is contained in:
committed by
Michael Hoffmann
parent
636dbe7030
commit
5160a52f2d
436 lines added — dialects/terraform/src/scanner.c (new file)
@@ -0,0 +1,436 @@
|
||||
#include <assert.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <tree_sitter/parser.h>
#include <wctype.h>
|
||||
|
||||
// NOTE: evaluates both arguments twice; only pass side-effect-free
// expressions.
#define MAX(a, b) ((a) > (b) ? (a) : (b))

// Grow `vec` to hold `_cap` elements. Aborts (assert) on allocation failure.
// All statement-like macros below are wrapped in do { } while (0) so they
// behave as a single statement (safe after an unbraced `if`, before `else`).
#define VEC_RESIZE(vec, _cap)                                                  \
    do {                                                                       \
        void *tmp = realloc((vec).data, (_cap) * sizeof((vec).data[0]));       \
        assert(tmp != NULL);                                                   \
        (vec).data = tmp;                                                      \
        (vec).cap = (_cap);                                                    \
    } while (0)

// Append `el`, doubling capacity (min 16) when full.
#define VEC_PUSH(vec, el)                                                      \
    do {                                                                       \
        if ((vec).cap == (vec).len) {                                          \
            VEC_RESIZE((vec), MAX(16, (vec).len * 2));                         \
        }                                                                      \
        (vec).data[(vec).len++] = (el);                                        \
    } while (0)

// Remove the top Context, freeing the String it owns.
#define VEC_POP(vec)                                                           \
    do {                                                                       \
        STRING_FREE(VEC_BACK((vec)).heredoc_identifier);                       \
        (vec).len--;                                                           \
    } while (0)

// Last element; only valid when (vec).len > 0.
#define VEC_BACK(vec) ((vec).data[(vec).len - 1])

#define VEC_FREE(vec)                                                          \
    do {                                                                       \
        if ((vec).data != NULL) {                                              \
            free((vec).data);                                                  \
        }                                                                      \
    } while (0)

// Free every element's identifier and reset the length (capacity is kept).
#define VEC_CLEAR(vec)                                                         \
    do {                                                                       \
        for (uint32_t _i = 0; _i < (vec).len; _i++) {                          \
            STRING_FREE((vec).data[_i].heredoc_identifier);                    \
        }                                                                      \
        (vec).len = 0;                                                         \
    } while (0)

// Grow a String to `_cap` usable bytes plus one extra byte that is kept
// zeroed, so the contents stay NUL-terminated.
#define STRING_RESIZE(vec, _cap)                                               \
    do {                                                                       \
        void *tmp =                                                            \
            realloc((vec).data, ((_cap) + 1) * sizeof((vec).data[0]));         \
        assert(tmp != NULL);                                                   \
        (vec).data = tmp;                                                      \
        memset((vec).data + (vec).len, 0,                                      \
               (((_cap) + 1) - (vec).len) * sizeof((vec).data[0]));            \
        (vec).cap = (_cap);                                                    \
    } while (0)

#define STRING_GROW(vec, _cap)                                                 \
    do {                                                                       \
        if ((vec).cap < (_cap)) {                                              \
            STRING_RESIZE((vec), (_cap));                                      \
        }                                                                      \
    } while (0)

#define STRING_PUSH(vec, el)                                                   \
    do {                                                                       \
        if ((vec).cap == (vec).len) {                                          \
            STRING_RESIZE((vec), MAX(16, (vec).len * 2));                      \
        }                                                                      \
        (vec).data[(vec).len++] = (el);                                        \
    } while (0)

#define STRING_FREE(vec)                                                       \
    do {                                                                       \
        if ((vec).data != NULL) {                                              \
            free((vec).data);                                                  \
        }                                                                      \
    } while (0)
|
||||
|
||||
// External tokens produced by this scanner.
// NOTE(review): with tree-sitter, the order here must match the `externals`
// list in the grammar — confirm against grammar.js.
enum TokenType {
    QUOTED_TEMPLATE_START,        // opening `"` of a quoted template
    QUOTED_TEMPLATE_END,          // closing `"` of a quoted template
    TEMPLATE_LITERAL_CHUNK,       // run of literal template text / escape
    TEMPLATE_INTERPOLATION_START, // `${`
    TEMPLATE_INTERPOLATION_END,   // `}` closing an interpolation
    TEMPLATE_DIRECTIVE_START,     // `%{`
    TEMPLATE_DIRECTIVE_END,       // `}` closing a directive
    HEREDOC_IDENTIFIER,           // heredoc opening or closing identifier
};

// Kinds of nesting the scanner tracks on its context stack.
enum ContextType {
    TEMPLATE_INTERPOLATION, // inside `${ ... }`
    TEMPLATE_DIRECTIVE,     // inside `%{ ... }`
    QUOTED_TEMPLATE,        // inside `"..."`
    HEREDOC_TEMPLATE,       // inside `<<ID ... ID`
};
|
||||
|
||||
// Growable byte buffer. `data` always holds at least one byte more than
// `cap`, kept zeroed, so the contents are NUL-terminated.
typedef struct {
    uint32_t cap;
    uint32_t len;
    char *data;
} String;

// Creates an empty String with 16 usable bytes of zeroed storage (17
// allocated, so data[0..16] are all '\0').
// Fix over the original: assert the allocation, matching the file's
// convention in VEC_RESIZE/STRING_RESIZE, instead of returning a String
// with a NULL data pointer on OOM.
String string_new() {
    char *data = calloc(1, sizeof(char) * 17);
    assert(data != NULL);
    return (String){
        .cap = 16,
        .len = 0,
        .data = data,
    };
}
|
||||
|
||||
typedef struct {
|
||||
enum ContextType type;
|
||||
|
||||
// valid if type == HEREDOC_TEMPLATE
|
||||
String heredoc_identifier;
|
||||
} Context;
|
||||
|
||||
Context context_new(enum ContextType type, const char *data) {
|
||||
Context ctx = {
|
||||
.type = type,
|
||||
.heredoc_identifier = string_new(),
|
||||
};
|
||||
ctx.heredoc_identifier.len = strlen(data);
|
||||
ctx.heredoc_identifier.cap = strlen(data);
|
||||
memcpy(ctx.heredoc_identifier.data, data, ctx.heredoc_identifier.len);
|
||||
return ctx;
|
||||
}
|
||||
|
||||
// Dynamic array of Context values (managed via the VEC_* macros).
typedef struct {
    uint32_t len;
    uint32_t cap;
    Context *data;
} context_vec;

// Whole scanner state: a stack of currently-open template contexts.
typedef struct {
    context_vec context_stack;
} Scanner;
|
||||
|
||||
// Consume the lookahead character and include it in the current token.
static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); }

// Consume the lookahead character WITHOUT including it in the token.
static inline void skip(TSLexer *lexer) { lexer->advance(lexer, true); }
|
||||
|
||||
// Flattens the context stack into `buf` for tree-sitter's incremental
// re-parsing. Layout: [stack len][type][id len][id bytes...]...
// Returns the number of bytes written, or 0 when the state does not fit
// (tree-sitter then treats the state as empty).
static unsigned serialize(Scanner *scanner, char *buf) {
    unsigned size = 0;

    // The stack length is stored in a single byte.
    if (scanner->context_stack.len > CHAR_MAX) {
        return 0;
    }

    buf[size++] = (char)scanner->context_stack.len;
    // Fix: use an unsigned index — the original `int i` was compared
    // against the uint32_t `len` (signed/unsigned mismatch).
    for (uint32_t i = 0; i < scanner->context_stack.len; i++) {
        Context *context = &scanner->context_stack.data[i];
        // 2 bytes of header (type + identifier length) per entry.
        if (size + 2 + context->heredoc_identifier.len >=
            TREE_SITTER_SERIALIZATION_BUFFER_SIZE) {
            return 0;
        }
        // The identifier length is stored in a single byte too.
        if (context->heredoc_identifier.len > CHAR_MAX) {
            return 0;
        }
        buf[size++] = (char)context->type;
        buf[size++] = (char)context->heredoc_identifier.len;
        memcpy(&buf[size], context->heredoc_identifier.data,
               context->heredoc_identifier.len);
        size += context->heredoc_identifier.len;
    }
    return size;
}
|
||||
|
||||
// Rebuilds the context stack from a buffer written by serialize().
// BUG FIX: the stack must be cleared BEFORE the length==0 early return —
// tree-sitter calls deserialize with length 0 to reset the scanner, and
// the original C rewrite left stale contexts behind in that case (the
// deleted C++ version cleared first; this restores that behavior).
static void deserialize(Scanner *scanner, const char *buffer, unsigned length) {
    VEC_CLEAR(scanner->context_stack);

    if (length == 0) {
        return;
    }

    unsigned size = 0;
    uint8_t context_stack_size = buffer[size++];
    for (uint32_t j = 0; j < context_stack_size; j++) {
        Context ctx = {
            .type = (enum ContextType)buffer[size++],
            .heredoc_identifier = string_new(),
        };
        uint8_t heredoc_identifier_size = buffer[size++];
        STRING_GROW(ctx.heredoc_identifier, heredoc_identifier_size);
        memcpy(ctx.heredoc_identifier.data, buffer + size,
               heredoc_identifier_size);
        ctx.heredoc_identifier.len = heredoc_identifier_size;
        size += heredoc_identifier_size;
        VEC_PUSH(scanner->context_stack, ctx);
    }
    // Everything serialize() wrote must have been consumed.
    assert(size == length);
}
|
||||
|
||||
// Marks `token` as the scan result without consuming the lookahead.
static inline bool accept_inplace(TSLexer *lexer, enum TokenType token) {
    lexer->result_symbol = token;
    return true;
}

// Consumes the current lookahead, then marks `token` as the result.
static inline bool accept_and_advance(TSLexer *lexer, enum TokenType token) {
    advance(lexer);
    return accept_inplace(lexer, token);
}

// Advances past the current character and reports whether the NEW
// lookahead is a hex digit (used to validate \uXXXX / \UXXXXXXXX).
static inline bool consume_wxdigit(TSLexer *lexer) {
    advance(lexer);
    return iswxdigit(lexer->lookahead);
}
|
||||
|
||||
// Skips whitespace followed by a `#` line comment, leaving the terminating
// newline unconsumed. Returns false when there is no comment or when the
// comment runs into EOF.
// NOTE(review): this helper appears to be unused in this file (it was also
// unused in the previous C++ scanner) — candidate for removal; confirm no
// other translation unit relies on it before deleting.
static inline bool skip_comment(TSLexer *lexer) {
    while (iswspace(lexer->lookahead)) {
        skip(lexer);
    }
    if (lexer->lookahead != '#') {
        return false;
    }
    skip(lexer);
    while (lexer->lookahead != '\n') {
        skip(lexer);
        if (lexer->eof(lexer)) {
            return false;
        }
    }
    return true;
}
|
||||
|
||||
// True when the innermost (top-of-stack) context is of `type`.
static inline bool in_context_type(Scanner *scanner, enum ContextType type) {
    if (scanner->context_stack.len == 0) {
        return false;
    }
    return VEC_BACK(scanner->context_stack).type == type;
}

// Convenience predicates over the top of the context stack.
static inline bool in_quoted_context(Scanner *scanner) {
    return in_context_type(scanner, QUOTED_TEMPLATE);
}

static inline bool in_heredoc_context(Scanner *scanner) {
    return in_context_type(scanner, HEREDOC_TEMPLATE);
}

// Quoted templates and heredocs both carry literal template content.
static inline bool in_template_context(Scanner *scanner) {
    return in_quoted_context(scanner) || in_heredoc_context(scanner);
}

static inline bool in_interpolation_context(Scanner *scanner) {
    return in_context_type(scanner, TEMPLATE_INTERPOLATION);
}

static inline bool in_directive_context(Scanner *scanner) {
    return in_context_type(scanner, TEMPLATE_DIRECTIVE);
}
|
||||
|
||||
// Central entry point of the external scanner. Recognizes template
// delimiters, heredoc identifiers, and literal chunks, depending on which
// symbols the parser currently allows (`valid_symbols`) and on the context
// stack this scanner maintains across calls.
static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols) {
    // Skip leading whitespace, remembering whether a newline was crossed:
    // a heredoc terminator is only recognized at the start of a line.
    bool has_leading_whitespace_with_newline = false;
    while (iswspace(lexer->lookahead)) {
        if (lexer->lookahead == '\n') {
            has_leading_whitespace_with_newline = true;
        }
        skip(lexer);
    }
    if (lexer->lookahead == '\0') {
        return false;
    }
    // manage quoted context
    if (valid_symbols[QUOTED_TEMPLATE_START] && !in_quoted_context(scanner) &&
        lexer->lookahead == '"') {
        Context ctx = context_new(QUOTED_TEMPLATE, "");
        VEC_PUSH(scanner->context_stack, ctx);
        return accept_and_advance(lexer, QUOTED_TEMPLATE_START);
    }
    if (valid_symbols[QUOTED_TEMPLATE_END] && in_quoted_context(scanner) &&
        lexer->lookahead == '"') {
        VEC_POP(scanner->context_stack);
        return accept_and_advance(lexer, QUOTED_TEMPLATE_END);
    }

    // manage template interpolations
    if (valid_symbols[TEMPLATE_INTERPOLATION_START] &&
        valid_symbols[TEMPLATE_LITERAL_CHUNK] &&
        !in_interpolation_context(scanner) && lexer->lookahead == '$') {
        advance(lexer);
        if (lexer->lookahead == '{') {
            Context ctx = context_new(TEMPLATE_INTERPOLATION, "");
            VEC_PUSH(scanner->context_stack, ctx);
            return accept_and_advance(lexer, TEMPLATE_INTERPOLATION_START);
        }
        // try to scan escape sequence
        if (lexer->lookahead == '$') {
            advance(lexer);
            if (lexer->lookahead == '{') {
                // $${ is the escape for a literal "${"
                return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
            }
        }
        // a lone '$' (or "$$" not followed by '{') is plain literal text
        return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK);
    }
    if (valid_symbols[TEMPLATE_INTERPOLATION_END] &&
        in_interpolation_context(scanner) && lexer->lookahead == '}') {
        VEC_POP(scanner->context_stack);
        return accept_and_advance(lexer, TEMPLATE_INTERPOLATION_END);
    }

    // manage template directives (mirrors the interpolation logic for '%')
    if (valid_symbols[TEMPLATE_DIRECTIVE_START] &&
        valid_symbols[TEMPLATE_LITERAL_CHUNK] &&
        !in_directive_context(scanner) && lexer->lookahead == '%') {
        advance(lexer);
        if (lexer->lookahead == '{') {
            Context ctx = context_new(TEMPLATE_DIRECTIVE, "");
            VEC_PUSH(scanner->context_stack, ctx);
            return accept_and_advance(lexer, TEMPLATE_DIRECTIVE_START);
        }
        // try to scan escape sequence
        if (lexer->lookahead == '%') {
            advance(lexer);
            if (lexer->lookahead == '{') {
                // %%{ is the escape for a literal "%{"
                return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
            }
        }
        return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK);
    }
    if (valid_symbols[TEMPLATE_DIRECTIVE_END] &&
        in_directive_context(scanner) && lexer->lookahead == '}') {
        VEC_POP(scanner->context_stack);
        return accept_and_advance(lexer, TEMPLATE_DIRECTIVE_END);
    }

    // manage heredoc context
    if (valid_symbols[HEREDOC_IDENTIFIER] && !in_heredoc_context(scanner)) {
        // Opening identifier: collect [alnum_-]+ and push a heredoc context
        // that stores it for later terminator matching.
        // NOTE(review): lexer->lookahead is a wide character; pushing it
        // into a char String truncates non-ASCII identifier characters —
        // confirm identifiers are ASCII-only.
        String identifier = string_new();
        // TODO: check that this is a valid identifier
        while (iswalnum(lexer->lookahead) || lexer->lookahead == '_' ||
               lexer->lookahead == '-') {
            STRING_PUSH(identifier, lexer->lookahead);
            advance(lexer);
        }
        // NOTE(review): an empty identifier is accepted here; presumably
        // the grammar only requests HEREDOC_IDENTIFIER after `<<` — confirm.
        Context ctx = {HEREDOC_TEMPLATE, identifier};
        VEC_PUSH(scanner->context_stack, ctx);
        return accept_inplace(lexer, HEREDOC_IDENTIFIER);
    }
    if (valid_symbols[HEREDOC_IDENTIFIER] && in_heredoc_context(scanner) &&
        has_leading_whitespace_with_newline) {
        // Possible closing identifier at the start of a line: compare the
        // input character by character against the stored identifier.
        String expected_identifier =
            VEC_BACK(scanner->context_stack).heredoc_identifier;

        for (size_t i = 0; i < expected_identifier.len; i++) {
            if (lexer->lookahead == expected_identifier.data[i]) {
                advance(lexer);
            } else {
                // Mismatch: what was consumed so far is ordinary content.
                return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK);
            }
        }
        // check if the identifier is on a line of its own
        lexer->mark_end(lexer);
        while (iswspace(lexer->lookahead) && lexer->lookahead != '\n') {
            advance(lexer);
        }
        if (lexer->lookahead == '\n') {
            VEC_POP(scanner->context_stack);
            return accept_inplace(lexer, HEREDOC_IDENTIFIER);
        }
        // Not alone on its line: extend the token one character past the
        // previously marked end and emit it as literal text instead.
        advance(lexer);
        lexer->mark_end(lexer);
        return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK);
    }
    // manage template literal chunks

    // handle template literal chunks in quoted contexts
    //
    // they may not contain newlines and may contain escape sequences
    if (valid_symbols[TEMPLATE_LITERAL_CHUNK] && in_quoted_context(scanner)) {
        switch (lexer->lookahead) {
        case '\\':
            advance(lexer);
            switch (lexer->lookahead) {
            // Simple escapes: \" \n \r \t \\ form a one-escape chunk.
            case '"':
            case 'n':
            case 'r':
            case 't':
            case '\\':
                return accept_and_advance(lexer,
                                          TEMPLATE_LITERAL_CHUNK);
            case 'u':
                // \uXXXX — exactly 4 hex digits required.
                for (int i = 0; i < 4; i++) {
                    if (!consume_wxdigit(lexer)) {
                        return false;
                    }
                }
                return accept_and_advance(lexer,
                                          TEMPLATE_LITERAL_CHUNK);
            case 'U':
                // \UXXXXXXXX — exactly 8 hex digits required.
                for (int i = 0; i < 8; i++) {
                    if (!consume_wxdigit(lexer)) {
                        return false;
                    }
                }
                return accept_and_advance(lexer,
                                          TEMPLATE_LITERAL_CHUNK);
            default:
                // Unknown escape: reject and let the parser recover.
                return false;
            }
        }
    }

    // handle all other quoted template or string literal characters
    if (valid_symbols[TEMPLATE_LITERAL_CHUNK] && in_template_context(scanner)) {
        return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
    }

    // probably not handled by the external scanner
    return false;
}
|
||||
|
||||
void *tree_sitter_terraform_external_scanner_create() {
|
||||
Scanner *scanner = calloc(1, sizeof(Scanner));
|
||||
scanner->context_stack.data = calloc(1, sizeof(Context));
|
||||
return scanner;
|
||||
}
|
||||
|
||||
// tree-sitter callback: persist the context stack into `buffer`;
// returns the number of bytes written (0 when it does not fit).
unsigned tree_sitter_terraform_external_scanner_serialize(void *payload,
                                                          char *buffer) {
    Scanner *scanner = (Scanner *)payload;
    return serialize(scanner, buffer);
}

// tree-sitter callback: restore the context stack from `buffer`.
void tree_sitter_terraform_external_scanner_deserialize(void *payload,
                                                        const char *buffer,
                                                        unsigned length) {
    Scanner *scanner = (Scanner *)payload;
    deserialize(scanner, buffer, length);
}

// tree-sitter callback: attempt to scan one external token.
bool tree_sitter_terraform_external_scanner_scan(void *payload, TSLexer *lexer,
                                                 const bool *valid_symbols) {
    Scanner *scanner = (Scanner *)payload;
    return scan(scanner, lexer, valid_symbols);
}
|
||||
|
||||
void tree_sitter_terraform_external_scanner_destroy(void *payload) {
|
||||
Scanner *scanner = (Scanner *)payload;
|
||||
for (int i = 0; i < scanner->context_stack.len; i++) {
|
||||
STRING_FREE(scanner->context_stack.data[i].heredoc_identifier);
|
||||
}
|
||||
VEC_FREE(scanner->context_stack);
|
||||
free(scanner);
|
||||
}
|
||||
@@ -1,336 +0,0 @@
|
||||
#include <tree_sitter/parser.h>
|
||||
|
||||
#include <assert.h>
|
||||
#include <climits>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <wctype.h>
|
||||
|
||||
namespace {
|
||||
|
||||
using std::string;
|
||||
using std::vector;
|
||||
|
||||
enum TokenType {
|
||||
QUOTED_TEMPLATE_START,
|
||||
QUOTED_TEMPLATE_END,
|
||||
TEMPLATE_LITERAL_CHUNK,
|
||||
TEMPLATE_INTERPOLATION_START,
|
||||
TEMPLATE_INTERPOLATION_END,
|
||||
TEMPLATE_DIRECTIVE_START,
|
||||
TEMPLATE_DIRECTIVE_END,
|
||||
HEREDOC_IDENTIFIER,
|
||||
};
|
||||
|
||||
enum ContextType {
|
||||
TEMPLATE_INTERPOLATION,
|
||||
TEMPLATE_DIRECTIVE,
|
||||
QUOTED_TEMPLATE,
|
||||
HEREDOC_TEMPLATE,
|
||||
};
|
||||
|
||||
struct Context {
|
||||
ContextType type;
|
||||
|
||||
// valid if type == HEREDOC_TEMPLATE
|
||||
string heredoc_identifier;
|
||||
};
|
||||
|
||||
struct Scanner {
|
||||
|
||||
public:
|
||||
unsigned serialize(char *buf) {
|
||||
unsigned size = 0;
|
||||
|
||||
if (context_stack.size() > CHAR_MAX) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
buf[size++] = context_stack.size();
|
||||
for (vector<Context>::iterator it = context_stack.begin();
|
||||
it != context_stack.end(); ++it) {
|
||||
if (size + 2 + it->heredoc_identifier.size() >=
|
||||
TREE_SITTER_SERIALIZATION_BUFFER_SIZE) {
|
||||
return 0;
|
||||
}
|
||||
if (it->heredoc_identifier.size() > CHAR_MAX) {
|
||||
return 0;
|
||||
}
|
||||
buf[size++] = it->type;
|
||||
buf[size++] = it->heredoc_identifier.size();
|
||||
it->heredoc_identifier.copy(&buf[size], it->heredoc_identifier.size());
|
||||
size += it->heredoc_identifier.size();
|
||||
}
|
||||
return size;
|
||||
}
|
||||
|
||||
void deserialize(const char *buf, unsigned n) {
|
||||
context_stack.clear();
|
||||
|
||||
if (n == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
unsigned size = 0;
|
||||
uint8_t context_stack_size = buf[size++];
|
||||
for (unsigned j = 0; j < context_stack_size; j++) {
|
||||
Context ctx;
|
||||
ctx.type = static_cast<ContextType>(buf[size++]);
|
||||
uint8_t heredoc_identifier_size = buf[size++];
|
||||
ctx.heredoc_identifier.assign(buf + size,
|
||||
buf + size + heredoc_identifier_size);
|
||||
size += heredoc_identifier_size;
|
||||
context_stack.push_back(ctx);
|
||||
}
|
||||
assert(size == n);
|
||||
}
|
||||
|
||||
bool scan(TSLexer *lexer, const bool *valid_symbols) {
|
||||
bool has_leading_whitespace_with_newline = false;
|
||||
while (iswspace(lexer->lookahead)) {
|
||||
if (lexer->lookahead == '\n') {
|
||||
has_leading_whitespace_with_newline = true;
|
||||
}
|
||||
skip(lexer);
|
||||
}
|
||||
if (lexer->lookahead == '\0') {
|
||||
return false;
|
||||
}
|
||||
// manage quoted context
|
||||
if (valid_symbols[QUOTED_TEMPLATE_START] && !in_quoted_context() &&
|
||||
lexer->lookahead == '"') {
|
||||
Context ctx = {QUOTED_TEMPLATE, ""};
|
||||
context_stack.push_back(ctx);
|
||||
return accept_and_advance(lexer, QUOTED_TEMPLATE_START);
|
||||
}
|
||||
if (valid_symbols[QUOTED_TEMPLATE_END] && in_quoted_context() &&
|
||||
lexer->lookahead == '"') {
|
||||
context_stack.pop_back();
|
||||
return accept_and_advance(lexer, QUOTED_TEMPLATE_END);
|
||||
}
|
||||
|
||||
// manage template interpolations
|
||||
if (valid_symbols[TEMPLATE_INTERPOLATION_START] &&
|
||||
valid_symbols[TEMPLATE_LITERAL_CHUNK] && !in_interpolation_context() &&
|
||||
lexer->lookahead == '$') {
|
||||
advance(lexer);
|
||||
if (lexer->lookahead == '{') {
|
||||
Context ctx = {TEMPLATE_INTERPOLATION, ""};
|
||||
context_stack.push_back(ctx);
|
||||
return accept_and_advance(lexer, TEMPLATE_INTERPOLATION_START);
|
||||
}
|
||||
// try to scan escape sequence
|
||||
if (lexer->lookahead == '$') {
|
||||
advance(lexer);
|
||||
if (lexer->lookahead == '{') {
|
||||
// $${
|
||||
return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
|
||||
}
|
||||
}
|
||||
return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK);
|
||||
}
|
||||
if (valid_symbols[TEMPLATE_INTERPOLATION_END] &&
|
||||
in_interpolation_context() && lexer->lookahead == '}') {
|
||||
context_stack.pop_back();
|
||||
return accept_and_advance(lexer, TEMPLATE_INTERPOLATION_END);
|
||||
}
|
||||
|
||||
// manage template directives
|
||||
if (valid_symbols[TEMPLATE_DIRECTIVE_START] &&
|
||||
valid_symbols[TEMPLATE_LITERAL_CHUNK] && !in_directive_context() &&
|
||||
lexer->lookahead == '%') {
|
||||
advance(lexer);
|
||||
if (lexer->lookahead == '{') {
|
||||
Context ctx = {TEMPLATE_DIRECTIVE, ""};
|
||||
context_stack.push_back(ctx);
|
||||
return accept_and_advance(lexer, TEMPLATE_DIRECTIVE_START);
|
||||
}
|
||||
// try to scan escape sequence
|
||||
if (lexer->lookahead == '%') {
|
||||
advance(lexer);
|
||||
if (lexer->lookahead == '{') {
|
||||
// $${
|
||||
return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
|
||||
}
|
||||
}
|
||||
return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK);
|
||||
}
|
||||
if (valid_symbols[TEMPLATE_DIRECTIVE_END] && in_directive_context() &&
|
||||
lexer->lookahead == '}') {
|
||||
context_stack.pop_back();
|
||||
return accept_and_advance(lexer, TEMPLATE_DIRECTIVE_END);
|
||||
}
|
||||
|
||||
// manage heredoc context
|
||||
if (valid_symbols[HEREDOC_IDENTIFIER] && !in_heredoc_context()) {
|
||||
string identifier;
|
||||
// TODO: check that this is a valid identifier
|
||||
while (iswalnum(lexer->lookahead) || lexer->lookahead == '_' ||
|
||||
lexer->lookahead == '-') {
|
||||
identifier.push_back(lexer->lookahead);
|
||||
advance(lexer);
|
||||
}
|
||||
Context ctx = {HEREDOC_TEMPLATE, identifier};
|
||||
context_stack.push_back(ctx);
|
||||
return accept_inplace(lexer, HEREDOC_IDENTIFIER);
|
||||
}
|
||||
if (valid_symbols[HEREDOC_IDENTIFIER] && in_heredoc_context() &&
|
||||
has_leading_whitespace_with_newline) {
|
||||
string expected_identifier = context_stack.back().heredoc_identifier;
|
||||
|
||||
for (string::iterator it = expected_identifier.begin();
|
||||
it != expected_identifier.end(); ++it) {
|
||||
if (lexer->lookahead == *it) {
|
||||
advance(lexer);
|
||||
} else {
|
||||
return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK);
|
||||
}
|
||||
}
|
||||
// check if the identifier is on a line of its own
|
||||
lexer->mark_end(lexer);
|
||||
while (iswspace(lexer->lookahead) && lexer->lookahead != '\n') {
|
||||
advance(lexer);
|
||||
}
|
||||
if (lexer->lookahead == '\n') {
|
||||
context_stack.pop_back();
|
||||
return accept_inplace(lexer, HEREDOC_IDENTIFIER);
|
||||
} else {
|
||||
advance(lexer);
|
||||
lexer->mark_end(lexer);
|
||||
return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK);
|
||||
}
|
||||
}
|
||||
// manage template literal chunks
|
||||
|
||||
// handle template literal chunks in quoted contexts
|
||||
//
|
||||
// they may not contain newlines and may contain escape sequences
|
||||
if (valid_symbols[TEMPLATE_LITERAL_CHUNK] && in_quoted_context()) {
|
||||
switch (lexer->lookahead) {
|
||||
case '\\':
|
||||
advance(lexer);
|
||||
switch (lexer->lookahead) {
|
||||
case '"':
|
||||
case 'n':
|
||||
case 'r':
|
||||
case 't':
|
||||
case '\\':
|
||||
return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
|
||||
case 'u':
|
||||
for (int i = 0; i < 4; i++) {
|
||||
if (!consume_wxdigit(lexer))
|
||||
return false;
|
||||
}
|
||||
return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
|
||||
case 'U':
|
||||
for (int i = 0; i < 8; i++) {
|
||||
if (!consume_wxdigit(lexer))
|
||||
return false;
|
||||
}
|
||||
return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// handle all other quoted template or string literal characters
|
||||
if (valid_symbols[TEMPLATE_LITERAL_CHUNK] && in_template_context()) {
|
||||
return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
|
||||
}
|
||||
|
||||
// probably not handled by the external scanner
|
||||
return false;
|
||||
}
|
||||
|
||||
private:
|
||||
vector<Context> context_stack;
|
||||
|
||||
void advance(TSLexer *lexer) { lexer->advance(lexer, false); }
|
||||
|
||||
void skip(TSLexer *lexer) { lexer->advance(lexer, true); }
|
||||
|
||||
bool accept_inplace(TSLexer *lexer, TokenType token) {
|
||||
lexer->result_symbol = token;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool accept_and_advance(TSLexer *lexer, TokenType token) {
|
||||
advance(lexer);
|
||||
return accept_inplace(lexer, token);
|
||||
}
|
||||
|
||||
bool consume_wxdigit(TSLexer *lexer) {
|
||||
advance(lexer);
|
||||
return iswxdigit(lexer->lookahead);
|
||||
}
|
||||
|
||||
bool skip_comment(TSLexer* lexer) {
|
||||
while (iswspace(lexer->lookahead)) {
|
||||
skip(lexer);
|
||||
}
|
||||
if (lexer->lookahead != '#') {
|
||||
return false;
|
||||
}
|
||||
skip(lexer);
|
||||
while (lexer->lookahead != '\n') {
|
||||
skip(lexer);
|
||||
if (lexer->eof(lexer)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool in_context_type(ContextType type) {
|
||||
if (context_stack.empty()) {
|
||||
return false;
|
||||
}
|
||||
return context_stack.back().type == type;
|
||||
}
|
||||
|
||||
bool in_quoted_context() { return in_context_type(QUOTED_TEMPLATE); }
|
||||
|
||||
bool in_heredoc_context() { return in_context_type(HEREDOC_TEMPLATE); }
|
||||
|
||||
bool in_template_context() {
|
||||
return in_quoted_context() || in_heredoc_context();
|
||||
}
|
||||
|
||||
bool in_interpolation_context() {
|
||||
return in_context_type(TEMPLATE_INTERPOLATION);
|
||||
}
|
||||
|
||||
bool in_directive_context() { return in_context_type(TEMPLATE_DIRECTIVE); }
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
extern "C" {
|
||||
|
||||
// tree sitter callbacks
|
||||
void *tree_sitter_terraform_external_scanner_create() { return new Scanner(); }
|
||||
|
||||
void tree_sitter_terraform_external_scanner_destroy(void *p) {
|
||||
Scanner *scanner = static_cast<Scanner *>(p);
|
||||
delete scanner;
|
||||
}
|
||||
|
||||
unsigned tree_sitter_terraform_external_scanner_serialize(void *p, char *b) {
|
||||
Scanner *scanner = static_cast<Scanner *>(p);
|
||||
return scanner->serialize(b);
|
||||
}
|
||||
|
||||
void tree_sitter_terraform_external_scanner_deserialize(void *p, const char *b,
|
||||
unsigned n) {
|
||||
Scanner *scanner = static_cast<Scanner *>(p);
|
||||
return scanner->deserialize(b, n);
|
||||
}
|
||||
|
||||
bool tree_sitter_terraform_external_scanner_scan(void *p, TSLexer *lexer,
|
||||
const bool *valid_symbols) {
|
||||
Scanner *scanner = static_cast<Scanner *>(p);
|
||||
return scanner->scan(lexer, valid_symbols);
|
||||
}
|
||||
|
||||
} // extern "C"
|
||||
436
src/scanner.c
Normal file
436
src/scanner.c
Normal file
@@ -0,0 +1,436 @@
|
||||
#include <assert.h>
|
||||
#include <limits.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <tree_sitter/parser.h>
|
||||
#include <wctype.h>
|
||||
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
|
||||
#define VEC_RESIZE(vec, _cap) \
|
||||
void *tmp = realloc((vec).data, (_cap) * sizeof((vec).data[0])); \
|
||||
assert(tmp != NULL); \
|
||||
(vec).data = tmp; \
|
||||
(vec).cap = (_cap);
|
||||
|
||||
#define VEC_PUSH(vec, el) \
|
||||
if ((vec).cap == (vec).len) { \
|
||||
VEC_RESIZE((vec), MAX(16, (vec).len * 2)); \
|
||||
} \
|
||||
(vec).data[(vec).len++] = (el);
|
||||
|
||||
#define VEC_POP(vec) \
|
||||
{ \
|
||||
STRING_FREE(VEC_BACK((vec)).heredoc_identifier); \
|
||||
(vec).len--; \
|
||||
}
|
||||
|
||||
#define VEC_BACK(vec) ((vec).data[(vec).len - 1])
|
||||
|
||||
#define VEC_FREE(vec) \
|
||||
{ \
|
||||
if ((vec).data != NULL) \
|
||||
free((vec).data); \
|
||||
}
|
||||
|
||||
#define VEC_CLEAR(vec) \
|
||||
{ \
|
||||
for (int i = 0; i < (vec).len; i++) { \
|
||||
STRING_FREE((vec).data[i].heredoc_identifier); \
|
||||
} \
|
||||
(vec).len = 0; \
|
||||
}
|
||||
|
||||
#define STRING_RESIZE(vec, _cap) \
|
||||
void *tmp = realloc((vec).data, (_cap + 1) * sizeof((vec).data[0])); \
|
||||
assert(tmp != NULL); \
|
||||
(vec).data = tmp; \
|
||||
memset((vec).data + (vec).len, 0, \
|
||||
((_cap + 1) - (vec).len) * sizeof((vec).data[0])); \
|
||||
(vec).cap = (_cap);
|
||||
|
||||
#define STRING_GROW(vec, _cap) \
|
||||
if ((vec).cap < (_cap)) { \
|
||||
STRING_RESIZE((vec), (_cap)); \
|
||||
}
|
||||
|
||||
#define STRING_PUSH(vec, el) \
|
||||
if ((vec).cap == (vec).len) { \
|
||||
STRING_RESIZE((vec), MAX(16, (vec).len * 2)); \
|
||||
} \
|
||||
(vec).data[(vec).len++] = (el);
|
||||
|
||||
#define STRING_FREE(vec) \
|
||||
{ \
|
||||
if ((vec).data != NULL) \
|
||||
free((vec).data); \
|
||||
}
|
||||
|
||||
enum TokenType {
|
||||
QUOTED_TEMPLATE_START,
|
||||
QUOTED_TEMPLATE_END,
|
||||
TEMPLATE_LITERAL_CHUNK,
|
||||
TEMPLATE_INTERPOLATION_START,
|
||||
TEMPLATE_INTERPOLATION_END,
|
||||
TEMPLATE_DIRECTIVE_START,
|
||||
TEMPLATE_DIRECTIVE_END,
|
||||
HEREDOC_IDENTIFIER,
|
||||
};
|
||||
|
||||
enum ContextType {
|
||||
TEMPLATE_INTERPOLATION,
|
||||
TEMPLATE_DIRECTIVE,
|
||||
QUOTED_TEMPLATE,
|
||||
HEREDOC_TEMPLATE,
|
||||
};
|
||||
|
||||
// Growable byte buffer. `data` always holds at least one byte more than
// `cap`, kept zeroed, so the contents are NUL-terminated.
typedef struct {
    uint32_t cap;
    uint32_t len;
    char *data;
} String;

// Creates an empty String with 16 usable bytes of zeroed storage (17
// allocated, so data[0..16] are all '\0').
// Fix over the original: assert the allocation, matching the file's
// convention in VEC_RESIZE/STRING_RESIZE, instead of returning a String
// with a NULL data pointer on OOM.
String string_new() {
    char *data = calloc(1, sizeof(char) * 17);
    assert(data != NULL);
    return (String){
        .cap = 16,
        .len = 0,
        .data = data,
    };
}
|
||||
|
||||
typedef struct {
|
||||
enum ContextType type;
|
||||
|
||||
// valid if type == HEREDOC_TEMPLATE
|
||||
String heredoc_identifier;
|
||||
} Context;
|
||||
|
||||
Context context_new(enum ContextType type, const char *data) {
|
||||
Context ctx = {
|
||||
.type = type,
|
||||
.heredoc_identifier = string_new(),
|
||||
};
|
||||
ctx.heredoc_identifier.len = strlen(data);
|
||||
ctx.heredoc_identifier.cap = strlen(data);
|
||||
memcpy(ctx.heredoc_identifier.data, data, ctx.heredoc_identifier.len);
|
||||
return ctx;
|
||||
}
|
||||
|
||||
typedef struct {
|
||||
uint32_t len;
|
||||
uint32_t cap;
|
||||
Context *data;
|
||||
} context_vec;
|
||||
|
||||
typedef struct {
|
||||
context_vec context_stack;
|
||||
} Scanner;
|
||||
|
||||
static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); }
|
||||
|
||||
static inline void skip(TSLexer *lexer) { lexer->advance(lexer, true); }
|
||||
|
||||
static unsigned serialize(Scanner *scanner, char *buf) {
|
||||
unsigned size = 0;
|
||||
|
||||
if (scanner->context_stack.len > CHAR_MAX) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
buf[size++] = (char)scanner->context_stack.len;
|
||||
for (int i = 0; i < scanner->context_stack.len; i++) {
|
||||
Context *context = &scanner->context_stack.data[i];
|
||||
if (size + 2 + context->heredoc_identifier.len >=
|
||||
TREE_SITTER_SERIALIZATION_BUFFER_SIZE) {
|
||||
return 0;
|
||||
}
|
||||
if (context->heredoc_identifier.len > CHAR_MAX) {
|
||||
return 0;
|
||||
}
|
||||
buf[size++] = context->type;
|
||||
buf[size++] = (char)context->heredoc_identifier.len;
|
||||
memcpy(&buf[size], context->heredoc_identifier.data,
|
||||
context->heredoc_identifier.len);
|
||||
size += context->heredoc_identifier.len;
|
||||
}
|
||||
return size;
|
||||
}
|
||||
|
||||
// Rebuilds the context stack from a buffer written by serialize().
// BUG FIX: the stack must be cleared BEFORE the length==0 early return —
// tree-sitter calls deserialize with length 0 to reset the scanner, and
// the original C rewrite left stale contexts behind in that case (the
// previous C++ version cleared first; this restores that behavior).
static void deserialize(Scanner *scanner, const char *buffer, unsigned length) {
    VEC_CLEAR(scanner->context_stack);

    if (length == 0) {
        return;
    }

    unsigned size = 0;
    uint8_t context_stack_size = buffer[size++];
    for (uint32_t j = 0; j < context_stack_size; j++) {
        Context ctx = {
            .type = (enum ContextType)buffer[size++],
            .heredoc_identifier = string_new(),
        };
        uint8_t heredoc_identifier_size = buffer[size++];
        STRING_GROW(ctx.heredoc_identifier, heredoc_identifier_size);
        memcpy(ctx.heredoc_identifier.data, buffer + size,
               heredoc_identifier_size);
        ctx.heredoc_identifier.len = heredoc_identifier_size;
        size += heredoc_identifier_size;
        VEC_PUSH(scanner->context_stack, ctx);
    }
    // Everything serialize() wrote must have been consumed.
    assert(size == length);
}
|
||||
|
||||
static inline bool accept_inplace(TSLexer *lexer, enum TokenType token) {
|
||||
lexer->result_symbol = token;
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline bool accept_and_advance(TSLexer *lexer, enum TokenType token) {
|
||||
advance(lexer);
|
||||
return accept_inplace(lexer, token);
|
||||
}
|
||||
|
||||
static inline bool consume_wxdigit(TSLexer *lexer) {
|
||||
advance(lexer);
|
||||
return iswxdigit(lexer->lookahead);
|
||||
}
|
||||
|
||||
static inline bool skip_comment(TSLexer *lexer) {
|
||||
while (iswspace(lexer->lookahead)) {
|
||||
skip(lexer);
|
||||
}
|
||||
if (lexer->lookahead != '#') {
|
||||
return false;
|
||||
}
|
||||
skip(lexer);
|
||||
while (lexer->lookahead != '\n') {
|
||||
skip(lexer);
|
||||
if (lexer->eof(lexer)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline bool in_context_type(Scanner *scanner, enum ContextType type) {
|
||||
if (scanner->context_stack.len == 0) {
|
||||
return false;
|
||||
}
|
||||
return VEC_BACK(scanner->context_stack).type == type;
|
||||
}
|
||||
|
||||
static inline bool in_quoted_context(Scanner *scanner) {
|
||||
return in_context_type(scanner, QUOTED_TEMPLATE);
|
||||
}
|
||||
|
||||
static inline bool in_heredoc_context(Scanner *scanner) {
|
||||
return in_context_type(scanner, HEREDOC_TEMPLATE);
|
||||
}
|
||||
|
||||
static inline bool in_template_context(Scanner *scanner) {
|
||||
return in_quoted_context(scanner) || in_heredoc_context(scanner);
|
||||
}
|
||||
|
||||
static inline bool in_interpolation_context(Scanner *scanner) {
|
||||
return in_context_type(scanner, TEMPLATE_INTERPOLATION);
|
||||
}
|
||||
|
||||
// True while inside a template directive (%{ ... }).
static inline bool in_directive_context(Scanner *scanner) {
    return in_context_type(scanner, TEMPLATE_DIRECTIVE);
}
|
||||
|
||||
// External scanner entry point.
//
// Dispatches on `valid_symbols` (the tokens the parser will accept at this
// position) and on the scanner's context stack, which tracks the nesting of
// quoted templates, heredoc templates, template interpolations ("${...}")
// and template directives ("%{...}").
//
// Returns true (with lexer->result_symbol set) when a token was recognized,
// false when the character is not handled by the external scanner.
static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols) {
    // Skip leading whitespace, remembering whether it contained a newline:
    // a heredoc end marker is only valid at the start of a line.
    bool has_leading_whitespace_with_newline = false;
    while (iswspace(lexer->lookahead)) {
        if (lexer->lookahead == '\n') {
            has_leading_whitespace_with_newline = true;
        }
        skip(lexer);
    }
    // Use the lexer's eof callback rather than comparing lookahead against
    // '\0': a NUL byte embedded in the source text is not end-of-input.
    if (lexer->eof(lexer)) {
        return false;
    }

    // manage quoted context
    if (valid_symbols[QUOTED_TEMPLATE_START] && !in_quoted_context(scanner) &&
        lexer->lookahead == '"') {
        Context ctx = context_new(QUOTED_TEMPLATE, "");
        VEC_PUSH(scanner->context_stack, ctx);
        return accept_and_advance(lexer, QUOTED_TEMPLATE_START);
    }
    if (valid_symbols[QUOTED_TEMPLATE_END] && in_quoted_context(scanner) &&
        lexer->lookahead == '"') {
        VEC_POP(scanner->context_stack);
        return accept_and_advance(lexer, QUOTED_TEMPLATE_END);
    }

    // manage template interpolations
    if (valid_symbols[TEMPLATE_INTERPOLATION_START] &&
        valid_symbols[TEMPLATE_LITERAL_CHUNK] &&
        !in_interpolation_context(scanner) && lexer->lookahead == '$') {
        advance(lexer);
        if (lexer->lookahead == '{') {
            Context ctx = context_new(TEMPLATE_INTERPOLATION, "");
            VEC_PUSH(scanner->context_stack, ctx);
            return accept_and_advance(lexer, TEMPLATE_INTERPOLATION_START);
        }
        // "$${" is the escape sequence for a literal "${"
        if (lexer->lookahead == '$') {
            advance(lexer);
            if (lexer->lookahead == '{') {
                // $${
                return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
            }
        }
        // a lone '$' (or "$$" not followed by '{') is just literal text
        return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK);
    }
    if (valid_symbols[TEMPLATE_INTERPOLATION_END] &&
        in_interpolation_context(scanner) && lexer->lookahead == '}') {
        VEC_POP(scanner->context_stack);
        return accept_and_advance(lexer, TEMPLATE_INTERPOLATION_END);
    }

    // manage template directives
    if (valid_symbols[TEMPLATE_DIRECTIVE_START] &&
        valid_symbols[TEMPLATE_LITERAL_CHUNK] &&
        !in_directive_context(scanner) && lexer->lookahead == '%') {
        advance(lexer);
        if (lexer->lookahead == '{') {
            Context ctx = context_new(TEMPLATE_DIRECTIVE, "");
            VEC_PUSH(scanner->context_stack, ctx);
            return accept_and_advance(lexer, TEMPLATE_DIRECTIVE_START);
        }
        // "%%{" is the escape sequence for a literal "%{"
        if (lexer->lookahead == '%') {
            advance(lexer);
            if (lexer->lookahead == '{') {
                // %%{
                return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
            }
        }
        return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK);
    }
    if (valid_symbols[TEMPLATE_DIRECTIVE_END] &&
        in_directive_context(scanner) && lexer->lookahead == '}') {
        VEC_POP(scanner->context_stack);
        return accept_and_advance(lexer, TEMPLATE_DIRECTIVE_END);
    }

    // manage heredoc context
    if (valid_symbols[HEREDOC_IDENTIFIER] && !in_heredoc_context(scanner)) {
        // scan the identifier that opens the heredoc (after "<<" / "<<-")
        String identifier = string_new();
        // TODO: check that this is a valid identifier
        while (iswalnum(lexer->lookahead) || lexer->lookahead == '_' ||
               lexer->lookahead == '-') {
            STRING_PUSH(identifier, lexer->lookahead);
            advance(lexer);
        }
        // ownership of `identifier` moves into the context stack; it is
        // released by VEC_POP when the heredoc closes (or by destroy())
        Context ctx = {HEREDOC_TEMPLATE, identifier};
        VEC_PUSH(scanner->context_stack, ctx);
        return accept_inplace(lexer, HEREDOC_IDENTIFIER);
    }
    if (valid_symbols[HEREDOC_IDENTIFIER] && in_heredoc_context(scanner) &&
        has_leading_whitespace_with_newline) {
        String expected_identifier =
            VEC_BACK(scanner->context_stack).heredoc_identifier;

        // try to match the closing identifier character by character; any
        // mismatch means this line is ordinary template content
        for (size_t i = 0; i < expected_identifier.len; i++) {
            if (lexer->lookahead == expected_identifier.data[i]) {
                advance(lexer);
            } else {
                return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK);
            }
        }
        // check if the identifier is on a line of its own: only trailing
        // whitespace may follow it before the newline
        lexer->mark_end(lexer);
        while (iswspace(lexer->lookahead) && lexer->lookahead != '\n') {
            advance(lexer);
        }
        if (lexer->lookahead == '\n') {
            VEC_POP(scanner->context_stack);
            return accept_inplace(lexer, HEREDOC_IDENTIFIER);
        }
        advance(lexer);
        lexer->mark_end(lexer);
        return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK);
    }

    // manage template literal chunks

    // handle template literal chunks in quoted contexts
    //
    // they may not contain newlines and may contain escape sequences,
    // which are validated here
    if (valid_symbols[TEMPLATE_LITERAL_CHUNK] && in_quoted_context(scanner)) {
        switch (lexer->lookahead) {
            case '\\':
                advance(lexer);
                switch (lexer->lookahead) {
                    case '"':
                    case 'n':
                    case 'r':
                    case 't':
                    case '\\':
                        return accept_and_advance(lexer,
                                                  TEMPLATE_LITERAL_CHUNK);
                    case 'u': // \uXXXX
                        for (int i = 0; i < 4; i++) {
                            if (!consume_wxdigit(lexer)) {
                                return false;
                            }
                        }
                        return accept_and_advance(lexer,
                                                  TEMPLATE_LITERAL_CHUNK);
                    case 'U': // \UXXXXXXXX
                        for (int i = 0; i < 8; i++) {
                            if (!consume_wxdigit(lexer)) {
                                return false;
                            }
                        }
                        return accept_and_advance(lexer,
                                                  TEMPLATE_LITERAL_CHUNK);
                    default:
                        // invalid escape sequence: reject the token
                        return false;
                }
        }
    }

    // handle all other quoted template or string literal characters:
    // a single character of ordinary template content
    if (valid_symbols[TEMPLATE_LITERAL_CHUNK] && in_template_context(scanner)) {
        return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
    }

    // probably not handled by the external scanner
    return false;
}
|
||||
|
||||
void *tree_sitter_hcl_external_scanner_create() {
|
||||
Scanner *scanner = calloc(1, sizeof(Scanner));
|
||||
scanner->context_stack.data = calloc(1, sizeof(Context));
|
||||
return scanner;
|
||||
}
|
||||
|
||||
unsigned tree_sitter_hcl_external_scanner_serialize(void *payload,
|
||||
char *buffer) {
|
||||
Scanner *scanner = (Scanner *)payload;
|
||||
return serialize(scanner, buffer);
|
||||
}
|
||||
|
||||
void tree_sitter_hcl_external_scanner_deserialize(void *payload,
|
||||
const char *buffer,
|
||||
unsigned length) {
|
||||
Scanner *scanner = (Scanner *)payload;
|
||||
deserialize(scanner, buffer, length);
|
||||
}
|
||||
|
||||
// tree-sitter callback: scan one external token at the current position.
bool tree_sitter_hcl_external_scanner_scan(void *payload, TSLexer *lexer,
                                           const bool *valid_symbols) {
    return scan((Scanner *)payload, lexer, valid_symbols);
}
|
||||
|
||||
void tree_sitter_hcl_external_scanner_destroy(void *payload) {
|
||||
Scanner *scanner = (Scanner *)payload;
|
||||
for (int i = 0; i < scanner->context_stack.len; i++) {
|
||||
STRING_FREE(scanner->context_stack.data[i].heredoc_identifier);
|
||||
}
|
||||
VEC_FREE(scanner->context_stack);
|
||||
free(scanner);
|
||||
}
|
||||
336
src/scanner.cc
336
src/scanner.cc
@@ -1,336 +0,0 @@
|
||||
#include <tree_sitter/parser.h>
|
||||
|
||||
#include <assert.h>
|
||||
#include <climits>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <wctype.h>
|
||||
|
||||
namespace {
|
||||
|
||||
using std::string;
|
||||
using std::vector;
|
||||
|
||||
enum TokenType {
|
||||
QUOTED_TEMPLATE_START,
|
||||
QUOTED_TEMPLATE_END,
|
||||
TEMPLATE_LITERAL_CHUNK,
|
||||
TEMPLATE_INTERPOLATION_START,
|
||||
TEMPLATE_INTERPOLATION_END,
|
||||
TEMPLATE_DIRECTIVE_START,
|
||||
TEMPLATE_DIRECTIVE_END,
|
||||
HEREDOC_IDENTIFIER,
|
||||
};
|
||||
|
||||
enum ContextType {
|
||||
TEMPLATE_INTERPOLATION,
|
||||
TEMPLATE_DIRECTIVE,
|
||||
QUOTED_TEMPLATE,
|
||||
HEREDOC_TEMPLATE,
|
||||
};
|
||||
|
||||
struct Context {
|
||||
ContextType type;
|
||||
|
||||
// valid if type == HEREDOC_TEMPLATE
|
||||
string heredoc_identifier;
|
||||
};
|
||||
|
||||
struct Scanner {
|
||||
|
||||
public:
|
||||
unsigned serialize(char *buf) {
|
||||
unsigned size = 0;
|
||||
|
||||
if (context_stack.size() > CHAR_MAX) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
buf[size++] = context_stack.size();
|
||||
for (vector<Context>::iterator it = context_stack.begin();
|
||||
it != context_stack.end(); ++it) {
|
||||
if (size + 2 + it->heredoc_identifier.size() >=
|
||||
TREE_SITTER_SERIALIZATION_BUFFER_SIZE) {
|
||||
return 0;
|
||||
}
|
||||
if (it->heredoc_identifier.size() > CHAR_MAX) {
|
||||
return 0;
|
||||
}
|
||||
buf[size++] = it->type;
|
||||
buf[size++] = it->heredoc_identifier.size();
|
||||
it->heredoc_identifier.copy(&buf[size], it->heredoc_identifier.size());
|
||||
size += it->heredoc_identifier.size();
|
||||
}
|
||||
return size;
|
||||
}
|
||||
|
||||
void deserialize(const char *buf, unsigned n) {
|
||||
context_stack.clear();
|
||||
|
||||
if (n == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
unsigned size = 0;
|
||||
uint8_t context_stack_size = buf[size++];
|
||||
for (unsigned j = 0; j < context_stack_size; j++) {
|
||||
Context ctx;
|
||||
ctx.type = static_cast<ContextType>(buf[size++]);
|
||||
uint8_t heredoc_identifier_size = buf[size++];
|
||||
ctx.heredoc_identifier.assign(buf + size,
|
||||
buf + size + heredoc_identifier_size);
|
||||
size += heredoc_identifier_size;
|
||||
context_stack.push_back(ctx);
|
||||
}
|
||||
assert(size == n);
|
||||
}
|
||||
|
||||
bool scan(TSLexer *lexer, const bool *valid_symbols) {
|
||||
bool has_leading_whitespace_with_newline = false;
|
||||
while (iswspace(lexer->lookahead)) {
|
||||
if (lexer->lookahead == '\n') {
|
||||
has_leading_whitespace_with_newline = true;
|
||||
}
|
||||
skip(lexer);
|
||||
}
|
||||
if (lexer->lookahead == '\0') {
|
||||
return false;
|
||||
}
|
||||
// manage quoted context
|
||||
if (valid_symbols[QUOTED_TEMPLATE_START] && !in_quoted_context() &&
|
||||
lexer->lookahead == '"') {
|
||||
Context ctx = {QUOTED_TEMPLATE, ""};
|
||||
context_stack.push_back(ctx);
|
||||
return accept_and_advance(lexer, QUOTED_TEMPLATE_START);
|
||||
}
|
||||
if (valid_symbols[QUOTED_TEMPLATE_END] && in_quoted_context() &&
|
||||
lexer->lookahead == '"') {
|
||||
context_stack.pop_back();
|
||||
return accept_and_advance(lexer, QUOTED_TEMPLATE_END);
|
||||
}
|
||||
|
||||
// manage template interpolations
|
||||
if (valid_symbols[TEMPLATE_INTERPOLATION_START] &&
|
||||
valid_symbols[TEMPLATE_LITERAL_CHUNK] && !in_interpolation_context() &&
|
||||
lexer->lookahead == '$') {
|
||||
advance(lexer);
|
||||
if (lexer->lookahead == '{') {
|
||||
Context ctx = {TEMPLATE_INTERPOLATION, ""};
|
||||
context_stack.push_back(ctx);
|
||||
return accept_and_advance(lexer, TEMPLATE_INTERPOLATION_START);
|
||||
}
|
||||
// try to scan escape sequence
|
||||
if (lexer->lookahead == '$') {
|
||||
advance(lexer);
|
||||
if (lexer->lookahead == '{') {
|
||||
// $${
|
||||
return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
|
||||
}
|
||||
}
|
||||
return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK);
|
||||
}
|
||||
if (valid_symbols[TEMPLATE_INTERPOLATION_END] &&
|
||||
in_interpolation_context() && lexer->lookahead == '}') {
|
||||
context_stack.pop_back();
|
||||
return accept_and_advance(lexer, TEMPLATE_INTERPOLATION_END);
|
||||
}
|
||||
|
||||
// manage template directives
|
||||
if (valid_symbols[TEMPLATE_DIRECTIVE_START] &&
|
||||
valid_symbols[TEMPLATE_LITERAL_CHUNK] && !in_directive_context() &&
|
||||
lexer->lookahead == '%') {
|
||||
advance(lexer);
|
||||
if (lexer->lookahead == '{') {
|
||||
Context ctx = {TEMPLATE_DIRECTIVE, ""};
|
||||
context_stack.push_back(ctx);
|
||||
return accept_and_advance(lexer, TEMPLATE_DIRECTIVE_START);
|
||||
}
|
||||
// try to scan escape sequence
|
||||
if (lexer->lookahead == '%') {
|
||||
advance(lexer);
|
||||
if (lexer->lookahead == '{') {
|
||||
// $${
|
||||
return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
|
||||
}
|
||||
}
|
||||
return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK);
|
||||
}
|
||||
if (valid_symbols[TEMPLATE_DIRECTIVE_END] && in_directive_context() &&
|
||||
lexer->lookahead == '}') {
|
||||
context_stack.pop_back();
|
||||
return accept_and_advance(lexer, TEMPLATE_DIRECTIVE_END);
|
||||
}
|
||||
|
||||
// manage heredoc context
|
||||
if (valid_symbols[HEREDOC_IDENTIFIER] && !in_heredoc_context()) {
|
||||
string identifier;
|
||||
// TODO: check that this is a valid identifier
|
||||
while (iswalnum(lexer->lookahead) || lexer->lookahead == '_' ||
|
||||
lexer->lookahead == '-') {
|
||||
identifier.push_back(lexer->lookahead);
|
||||
advance(lexer);
|
||||
}
|
||||
Context ctx = {HEREDOC_TEMPLATE, identifier};
|
||||
context_stack.push_back(ctx);
|
||||
return accept_inplace(lexer, HEREDOC_IDENTIFIER);
|
||||
}
|
||||
if (valid_symbols[HEREDOC_IDENTIFIER] && in_heredoc_context() &&
|
||||
has_leading_whitespace_with_newline) {
|
||||
string expected_identifier = context_stack.back().heredoc_identifier;
|
||||
|
||||
for (string::iterator it = expected_identifier.begin();
|
||||
it != expected_identifier.end(); ++it) {
|
||||
if (lexer->lookahead == *it) {
|
||||
advance(lexer);
|
||||
} else {
|
||||
return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK);
|
||||
}
|
||||
}
|
||||
// check if the identifier is on a line of its own
|
||||
lexer->mark_end(lexer);
|
||||
while (iswspace(lexer->lookahead) && lexer->lookahead != '\n') {
|
||||
advance(lexer);
|
||||
}
|
||||
if (lexer->lookahead == '\n') {
|
||||
context_stack.pop_back();
|
||||
return accept_inplace(lexer, HEREDOC_IDENTIFIER);
|
||||
} else {
|
||||
advance(lexer);
|
||||
lexer->mark_end(lexer);
|
||||
return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK);
|
||||
}
|
||||
}
|
||||
// manage template literal chunks
|
||||
|
||||
// handle template literal chunks in quoted contexts
|
||||
//
|
||||
// they may not contain newlines and may contain escape sequences
|
||||
if (valid_symbols[TEMPLATE_LITERAL_CHUNK] && in_quoted_context()) {
|
||||
switch (lexer->lookahead) {
|
||||
case '\\':
|
||||
advance(lexer);
|
||||
switch (lexer->lookahead) {
|
||||
case '"':
|
||||
case 'n':
|
||||
case 'r':
|
||||
case 't':
|
||||
case '\\':
|
||||
return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
|
||||
case 'u':
|
||||
for (int i = 0; i < 4; i++) {
|
||||
if (!consume_wxdigit(lexer))
|
||||
return false;
|
||||
}
|
||||
return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
|
||||
case 'U':
|
||||
for (int i = 0; i < 8; i++) {
|
||||
if (!consume_wxdigit(lexer))
|
||||
return false;
|
||||
}
|
||||
return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// handle all other quoted template or string literal characters
|
||||
if (valid_symbols[TEMPLATE_LITERAL_CHUNK] && in_template_context()) {
|
||||
return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
|
||||
}
|
||||
|
||||
// probably not handled by the external scanner
|
||||
return false;
|
||||
}
|
||||
|
||||
private:
|
||||
vector<Context> context_stack;
|
||||
|
||||
void advance(TSLexer *lexer) { lexer->advance(lexer, false); }
|
||||
|
||||
void skip(TSLexer *lexer) { lexer->advance(lexer, true); }
|
||||
|
||||
bool accept_inplace(TSLexer *lexer, TokenType token) {
|
||||
lexer->result_symbol = token;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool accept_and_advance(TSLexer *lexer, TokenType token) {
|
||||
advance(lexer);
|
||||
return accept_inplace(lexer, token);
|
||||
}
|
||||
|
||||
bool consume_wxdigit(TSLexer *lexer) {
|
||||
advance(lexer);
|
||||
return iswxdigit(lexer->lookahead);
|
||||
}
|
||||
|
||||
bool skip_comment(TSLexer* lexer) {
|
||||
while (iswspace(lexer->lookahead)) {
|
||||
skip(lexer);
|
||||
}
|
||||
if (lexer->lookahead != '#') {
|
||||
return false;
|
||||
}
|
||||
skip(lexer);
|
||||
while (lexer->lookahead != '\n') {
|
||||
skip(lexer);
|
||||
if (lexer->eof(lexer)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool in_context_type(ContextType type) {
|
||||
if (context_stack.empty()) {
|
||||
return false;
|
||||
}
|
||||
return context_stack.back().type == type;
|
||||
}
|
||||
|
||||
bool in_quoted_context() { return in_context_type(QUOTED_TEMPLATE); }
|
||||
|
||||
bool in_heredoc_context() { return in_context_type(HEREDOC_TEMPLATE); }
|
||||
|
||||
bool in_template_context() {
|
||||
return in_quoted_context() || in_heredoc_context();
|
||||
}
|
||||
|
||||
bool in_interpolation_context() {
|
||||
return in_context_type(TEMPLATE_INTERPOLATION);
|
||||
}
|
||||
|
||||
bool in_directive_context() { return in_context_type(TEMPLATE_DIRECTIVE); }
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
extern "C" {
|
||||
|
||||
// tree sitter callbacks
|
||||
void *tree_sitter_hcl_external_scanner_create() { return new Scanner(); }
|
||||
|
||||
void tree_sitter_hcl_external_scanner_destroy(void *p) {
|
||||
Scanner *scanner = static_cast<Scanner *>(p);
|
||||
delete scanner;
|
||||
}
|
||||
|
||||
unsigned tree_sitter_hcl_external_scanner_serialize(void *p, char *b) {
|
||||
Scanner *scanner = static_cast<Scanner *>(p);
|
||||
return scanner->serialize(b);
|
||||
}
|
||||
|
||||
void tree_sitter_hcl_external_scanner_deserialize(void *p, const char *b,
|
||||
unsigned n) {
|
||||
Scanner *scanner = static_cast<Scanner *>(p);
|
||||
return scanner->deserialize(b, n);
|
||||
}
|
||||
|
||||
bool tree_sitter_hcl_external_scanner_scan(void *p, TSLexer *lexer,
|
||||
const bool *valid_symbols) {
|
||||
Scanner *scanner = static_cast<Scanner *>(p);
|
||||
return scanner->scan(lexer, valid_symbols);
|
||||
}
|
||||
|
||||
} // extern "C"
|
||||
Reference in New Issue
Block a user