feat: rewrite the scanner in C

This commit is contained in:
Amaan Qureshi
2023-06-18 14:09:06 -04:00
committed by Michael Hoffmann
parent 636dbe7030
commit 5160a52f2d
4 changed files with 872 additions and 672 deletions

View File

@@ -0,0 +1,436 @@
#include <assert.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <tree_sitter/parser.h>
#include <wctype.h>
// Maximum of two values; arguments are evaluated more than once, so
// callers must not pass expressions with side effects.
#define MAX(a, b) ((a) > (b) ? (a) : (b))

// Grow `vec`'s backing storage to `_cap` elements. The realloc result
// goes through `tmp` so the old pointer is not lost on failure, and an
// assert aborts on OOM (tree-sitter scanners have no error channel).
#define VEC_RESIZE(vec, _cap)                                                  \
    do {                                                                       \
        void *tmp = realloc((vec).data, (_cap) * sizeof((vec).data[0]));       \
        assert(tmp != NULL);                                                   \
        (vec).data = tmp;                                                      \
        (vec).cap = (_cap);                                                    \
    } while (0)

// Append `el`, doubling capacity (minimum 16) when full. Wrapped in
// do/while(0) so the macro acts as one statement (else-safe).
#define VEC_PUSH(vec, el)                                                      \
    do {                                                                       \
        if ((vec).cap == (vec).len) {                                          \
            VEC_RESIZE((vec), MAX(16, (vec).len * 2));                         \
        }                                                                      \
        (vec).data[(vec).len++] = (el);                                        \
    } while (0)

// Remove the top Context, releasing its heredoc identifier first.
#define VEC_POP(vec)                                                           \
    do {                                                                       \
        STRING_FREE(VEC_BACK((vec)).heredoc_identifier);                       \
        (vec).len--;                                                           \
    } while (0)

// Last element; only valid when (vec).len > 0.
#define VEC_BACK(vec) ((vec).data[(vec).len - 1])

// Release the backing storage (free(NULL) is a no-op, so no guard).
#define VEC_FREE(vec)                                                          \
    do {                                                                       \
        free((vec).data);                                                      \
    } while (0)

// Drop all Contexts, freeing each heredoc identifier string.
#define VEC_CLEAR(vec)                                                         \
    do {                                                                       \
        for (uint32_t _i = 0; _i < (vec).len; _i++) {                          \
            STRING_FREE((vec).data[_i].heredoc_identifier);                    \
        }                                                                      \
        (vec).len = 0;                                                         \
    } while (0)

// Grow a String to hold `_cap` bytes plus a terminating NUL; the new
// tail (including the NUL slot) is zero-filled.
#define STRING_RESIZE(vec, _cap)                                               \
    do {                                                                       \
        void *tmp = realloc((vec).data, ((_cap) + 1) * sizeof((vec).data[0])); \
        assert(tmp != NULL);                                                   \
        (vec).data = tmp;                                                      \
        memset((vec).data + (vec).len, 0,                                      \
               (((_cap) + 1) - (vec).len) * sizeof((vec).data[0]));            \
        (vec).cap = (_cap);                                                    \
    } while (0)

// Ensure capacity for at least `_cap` bytes.
#define STRING_GROW(vec, _cap)                                                 \
    do {                                                                       \
        if ((vec).cap < (_cap)) {                                              \
            STRING_RESIZE((vec), (_cap));                                      \
        }                                                                      \
    } while (0)

// Append one byte, doubling capacity (minimum 16) when full.
#define STRING_PUSH(vec, el)                                                   \
    do {                                                                       \
        if ((vec).cap == (vec).len) {                                          \
            STRING_RESIZE((vec), MAX(16, (vec).len * 2));                      \
        }                                                                      \
        (vec).data[(vec).len++] = (el);                                        \
    } while (0)

// Release a String's buffer (free(NULL) is a no-op).
#define STRING_FREE(vec)                                                       \
    do {                                                                       \
        free((vec).data);                                                      \
    } while (0)
// Token types produced by this scanner; must stay in the same order as
// the `externals` list in the grammar.
enum TokenType {
    QUOTED_TEMPLATE_START,
    QUOTED_TEMPLATE_END,
    TEMPLATE_LITERAL_CHUNK,
    TEMPLATE_INTERPOLATION_START,
    TEMPLATE_INTERPOLATION_END,
    TEMPLATE_DIRECTIVE_START,
    TEMPLATE_DIRECTIVE_END,
    HEREDOC_IDENTIFIER,
};

// Kinds of nested lexical context the scanner tracks on its stack.
enum ContextType {
    TEMPLATE_INTERPOLATION,
    TEMPLATE_DIRECTIVE,
    QUOTED_TEMPLATE,
    HEREDOC_TEMPLATE,
};

// Growable byte buffer; `data` always has room for a trailing NUL.
typedef struct {
    uint32_t cap;  // allocated capacity, excluding the NUL slot
    uint32_t len;  // bytes currently in use
    char *data;
} String;

// Fresh empty string with capacity 16 (17 zeroed bytes allocated).
String string_new(void) {
    return (String){
        .cap = 16,
        .len = 0,
        .data = calloc(17, sizeof(char)),
    };
}

typedef struct {
    enum ContextType type;
    // valid if type == HEREDOC_TEMPLATE
    String heredoc_identifier;
} Context;

// Build a Context whose heredoc identifier is a copy of `data`.
// The buffer is sized from strlen(data), so identifiers longer than
// 16 bytes no longer overflow string_new's fixed-size allocation.
Context context_new(enum ContextType type, const char *data) {
    uint32_t len = (uint32_t)strlen(data);
    uint32_t cap = len < 16 ? 16 : len;
    Context ctx;
    ctx.type = type;
    ctx.heredoc_identifier.len = len;
    ctx.heredoc_identifier.cap = cap;
    ctx.heredoc_identifier.data = calloc(cap + 1, sizeof(char));
    assert(ctx.heredoc_identifier.data != NULL);
    memcpy(ctx.heredoc_identifier.data, data, len);
    return ctx;
}

// Stack of active lexical contexts.
typedef struct {
    uint32_t len;
    uint32_t cap;
    Context *data;
} context_vec;

// Scanner state persisted across scan calls and serialized by
// tree-sitter between parses.
typedef struct {
    context_vec context_stack;
} Scanner;
// Consume the current character and include it in the token.
static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); }
// Consume the current character without including it in the token.
static inline void skip(TSLexer *lexer) { lexer->advance(lexer, true); }
// Serialize the context stack into `buf` for tree-sitter to snapshot.
// Layout: [stack_len] then, per context, [type][id_len][id bytes...].
// Returns bytes written, or 0 when the state cannot be represented
// (tree-sitter treats 0 as "empty state").
static unsigned serialize(Scanner *scanner, char *buf) {
    unsigned size = 0;
    if (scanner->context_stack.len > CHAR_MAX) {
        return 0;
    }
    buf[size++] = (char)scanner->context_stack.len;
    for (uint32_t i = 0; i < scanner->context_stack.len; i++) {
        Context *context = &scanner->context_stack.data[i];
        // +2 accounts for the type and length bytes written below.
        if (size + 2 + context->heredoc_identifier.len >=
            TREE_SITTER_SERIALIZATION_BUFFER_SIZE) {
            return 0;
        }
        if (context->heredoc_identifier.len > CHAR_MAX) {
            return 0;
        }
        buf[size++] = (char)context->type;
        buf[size++] = (char)context->heredoc_identifier.len;
        memcpy(&buf[size], context->heredoc_identifier.data,
               context->heredoc_identifier.len);
        size += context->heredoc_identifier.len;
    }
    return size;
}
// Restore the context stack from a buffer produced by `serialize`.
// An empty buffer means "reset to no contexts", so the stack must be
// cleared *before* the early return — the previous code only cleared
// on a non-empty buffer, leaving stale contexts behind (the removed
// C++ implementation cleared unconditionally).
static void deserialize(Scanner *scanner, const char *buffer, unsigned length) {
    VEC_CLEAR(scanner->context_stack);
    if (length == 0) {
        return;
    }
    unsigned size = 0;
    uint8_t context_stack_size = (uint8_t)buffer[size++];
    for (uint32_t j = 0; j < context_stack_size; j++) {
        Context ctx = {
            .type = (enum ContextType)buffer[size++],
            .heredoc_identifier = string_new(),
        };
        uint8_t heredoc_identifier_size = (uint8_t)buffer[size++];
        STRING_GROW(ctx.heredoc_identifier, heredoc_identifier_size);
        memcpy(ctx.heredoc_identifier.data, buffer + size,
               heredoc_identifier_size);
        ctx.heredoc_identifier.len = heredoc_identifier_size;
        size += heredoc_identifier_size;
        VEC_PUSH(scanner->context_stack, ctx);
    }
    assert(size == length);
}
// Mark `token` as the scan result without consuming more input.
static inline bool accept_inplace(TSLexer *lexer, enum TokenType token) {
    lexer->result_symbol = token;
    return true;
}
// Consume the current lookahead character, then accept `token`.
static inline bool accept_and_advance(TSLexer *lexer, enum TokenType token) {
    advance(lexer);
    return accept_inplace(lexer, token);
}
// Advance once and report whether the new lookahead is a hex digit.
static inline bool consume_wxdigit(TSLexer *lexer) {
    advance(lexer);
    return iswxdigit(lexer->lookahead);
}
// Skip whitespace and one '#' line comment; returns false when no
// comment starts here or EOF is hit mid-comment.
// NOTE(review): currently unused by `scan` — carried over from the
// C++ scanner.
static inline bool skip_comment(TSLexer *lexer) {
    while (iswspace(lexer->lookahead)) {
        skip(lexer);
    }
    if (lexer->lookahead != '#') {
        return false;
    }
    skip(lexer);
    while (lexer->lookahead != '\n') {
        skip(lexer);
        if (lexer->eof(lexer)) {
            return false;
        }
    }
    return true;
}
// True when the innermost (top-of-stack) context has the given type.
static inline bool in_context_type(Scanner *scanner, enum ContextType type) {
    if (scanner->context_stack.len == 0) {
        return false;
    }
    return VEC_BACK(scanner->context_stack).type == type;
}
// Convenience predicates over the innermost context.
static inline bool in_quoted_context(Scanner *scanner) {
    return in_context_type(scanner, QUOTED_TEMPLATE);
}
static inline bool in_heredoc_context(Scanner *scanner) {
    return in_context_type(scanner, HEREDOC_TEMPLATE);
}
static inline bool in_template_context(Scanner *scanner) {
    return in_quoted_context(scanner) || in_heredoc_context(scanner);
}
static inline bool in_interpolation_context(Scanner *scanner) {
    return in_context_type(scanner, TEMPLATE_INTERPOLATION);
}
static inline bool in_directive_context(Scanner *scanner) {
    return in_context_type(scanner, TEMPLATE_DIRECTIVE);
}
// External scan entry point. Dispatches on `valid_symbols` (what the
// parser will accept at this position) and on the context stack to
// produce at most one token per call. Returns false when no external
// token applies here.
static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols) {
    bool has_leading_whitespace_with_newline = false;
    // Skip leading whitespace, remembering whether a newline was seen;
    // a heredoc terminator is only recognized at the start of a line.
    while (iswspace(lexer->lookahead)) {
        if (lexer->lookahead == '\n') {
            has_leading_whitespace_with_newline = true;
        }
        skip(lexer);
    }
    if (lexer->lookahead == '\0') {
        return false;
    }
    // manage quoted context
    if (valid_symbols[QUOTED_TEMPLATE_START] && !in_quoted_context(scanner) &&
        lexer->lookahead == '"') {
        Context ctx = context_new(QUOTED_TEMPLATE, "");
        VEC_PUSH(scanner->context_stack, ctx);
        return accept_and_advance(lexer, QUOTED_TEMPLATE_START);
    }
    if (valid_symbols[QUOTED_TEMPLATE_END] && in_quoted_context(scanner) &&
        lexer->lookahead == '"') {
        VEC_POP(scanner->context_stack);
        return accept_and_advance(lexer, QUOTED_TEMPLATE_END);
    }
    // manage template interpolations
    if (valid_symbols[TEMPLATE_INTERPOLATION_START] &&
        valid_symbols[TEMPLATE_LITERAL_CHUNK] &&
        !in_interpolation_context(scanner) && lexer->lookahead == '$') {
        advance(lexer);
        if (lexer->lookahead == '{') {
            Context ctx = context_new(TEMPLATE_INTERPOLATION, "");
            VEC_PUSH(scanner->context_stack, ctx);
            return accept_and_advance(lexer, TEMPLATE_INTERPOLATION_START);
        }
        // try to scan escape sequence
        if (lexer->lookahead == '$') {
            advance(lexer);
            if (lexer->lookahead == '{') {
                // $${ is the escape for a literal ${
                return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
            }
        }
        // a lone '$' is just literal text
        return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK);
    }
    if (valid_symbols[TEMPLATE_INTERPOLATION_END] &&
        in_interpolation_context(scanner) && lexer->lookahead == '}') {
        VEC_POP(scanner->context_stack);
        return accept_and_advance(lexer, TEMPLATE_INTERPOLATION_END);
    }
    // manage template directives
    if (valid_symbols[TEMPLATE_DIRECTIVE_START] &&
        valid_symbols[TEMPLATE_LITERAL_CHUNK] &&
        !in_directive_context(scanner) && lexer->lookahead == '%') {
        advance(lexer);
        if (lexer->lookahead == '{') {
            Context ctx = context_new(TEMPLATE_DIRECTIVE, "");
            VEC_PUSH(scanner->context_stack, ctx);
            return accept_and_advance(lexer, TEMPLATE_DIRECTIVE_START);
        }
        // try to scan escape sequence
        if (lexer->lookahead == '%') {
            advance(lexer);
            if (lexer->lookahead == '{') {
                // %%{ is the escape for a literal %{
                return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
            }
        }
        // a lone '%' is just literal text
        return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK);
    }
    if (valid_symbols[TEMPLATE_DIRECTIVE_END] &&
        in_directive_context(scanner) && lexer->lookahead == '}') {
        VEC_POP(scanner->context_stack);
        return accept_and_advance(lexer, TEMPLATE_DIRECTIVE_END);
    }
    // manage heredoc context
    // Opening: read the heredoc delimiter and remember it on the stack.
    if (valid_symbols[HEREDOC_IDENTIFIER] && !in_heredoc_context(scanner)) {
        String identifier = string_new();
        // TODO: check that this is a valid identifier
        while (iswalnum(lexer->lookahead) || lexer->lookahead == '_' ||
               lexer->lookahead == '-') {
            STRING_PUSH(identifier, lexer->lookahead);
            advance(lexer);
        }
        Context ctx = {HEREDOC_TEMPLATE, identifier};
        VEC_PUSH(scanner->context_stack, ctx);
        return accept_inplace(lexer, HEREDOC_IDENTIFIER);
    }
    // Closing: after a newline, try to match the saved delimiter.
    if (valid_symbols[HEREDOC_IDENTIFIER] && in_heredoc_context(scanner) &&
        has_leading_whitespace_with_newline) {
        String expected_identifier =
            VEC_BACK(scanner->context_stack).heredoc_identifier;
        for (size_t i = 0; i < expected_identifier.len; i++) {
            if (lexer->lookahead == expected_identifier.data[i]) {
                advance(lexer);
            } else {
                // mismatch: what was consumed is ordinary template text
                return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK);
            }
        }
        // check if the identifier is on a line of its own
        lexer->mark_end(lexer);
        while (iswspace(lexer->lookahead) && lexer->lookahead != '\n') {
            advance(lexer);
        }
        if (lexer->lookahead == '\n') {
            VEC_POP(scanner->context_stack);
            return accept_inplace(lexer, HEREDOC_IDENTIFIER);
        }
        advance(lexer);
        lexer->mark_end(lexer);
        return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK);
    }
    // manage template literal chunks
    // handle template literal chunks in quoted contexts
    //
    // they may not contain newlines and may contain escape sequences
    if (valid_symbols[TEMPLATE_LITERAL_CHUNK] && in_quoted_context(scanner)) {
        switch (lexer->lookahead) {
            case '\\':
                advance(lexer);
                switch (lexer->lookahead) {
                    case '"':
                    case 'n':
                    case 'r':
                    case 't':
                    case '\\':
                        return accept_and_advance(lexer,
                                                  TEMPLATE_LITERAL_CHUNK);
                    case 'u':
                        // \uXXXX: exactly four hex digits
                        for (int i = 0; i < 4; i++) {
                            if (!consume_wxdigit(lexer)) {
                                return false;
                            }
                        }
                        return accept_and_advance(lexer,
                                                  TEMPLATE_LITERAL_CHUNK);
                    case 'U':
                        // \UXXXXXXXX: exactly eight hex digits
                        for (int i = 0; i < 8; i++) {
                            if (!consume_wxdigit(lexer)) {
                                return false;
                            }
                        }
                        return accept_and_advance(lexer,
                                                  TEMPLATE_LITERAL_CHUNK);
                    default:
                        return false;
                }
        }
    }
    // handle all other quoted template or string literal characters
    if (valid_symbols[TEMPLATE_LITERAL_CHUNK] && in_template_context(scanner)) {
        return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
    }
    // probably not handled by the external scanner
    return false;
}
// Called by tree-sitter to create the scanner's persistent state.
void *tree_sitter_terraform_external_scanner_create(void) {
    Scanner *scanner = calloc(1, sizeof(Scanner));
    scanner->context_stack.data = calloc(1, sizeof(Context));
    return scanner;
}

// Snapshot scanner state into `buffer`; returns bytes written.
unsigned tree_sitter_terraform_external_scanner_serialize(void *payload,
                                                          char *buffer) {
    Scanner *scanner = (Scanner *)payload;
    return serialize(scanner, buffer);
}

// Restore scanner state from a buffer produced by `..._serialize`.
void tree_sitter_terraform_external_scanner_deserialize(void *payload,
                                                        const char *buffer,
                                                        unsigned length) {
    Scanner *scanner = (Scanner *)payload;
    deserialize(scanner, buffer, length);
}

// Main scanning callback; see `scan` above.
bool tree_sitter_terraform_external_scanner_scan(void *payload, TSLexer *lexer,
                                                 const bool *valid_symbols) {
    Scanner *scanner = (Scanner *)payload;
    return scan(scanner, lexer, valid_symbols);
}

// Free every heredoc identifier, then the stack, then the scanner.
void tree_sitter_terraform_external_scanner_destroy(void *payload) {
    Scanner *scanner = (Scanner *)payload;
    for (uint32_t i = 0; i < scanner->context_stack.len; i++) {
        STRING_FREE(scanner->context_stack.data[i].heredoc_identifier);
    }
    VEC_FREE(scanner->context_stack);
    free(scanner);
}

View File

@@ -1,336 +0,0 @@
#include <tree_sitter/parser.h>
#include <assert.h>
#include <climits>
#include <string>
#include <vector>
#include <wctype.h>
namespace {
using std::string;
using std::vector;
enum TokenType {
QUOTED_TEMPLATE_START,
QUOTED_TEMPLATE_END,
TEMPLATE_LITERAL_CHUNK,
TEMPLATE_INTERPOLATION_START,
TEMPLATE_INTERPOLATION_END,
TEMPLATE_DIRECTIVE_START,
TEMPLATE_DIRECTIVE_END,
HEREDOC_IDENTIFIER,
};
enum ContextType {
TEMPLATE_INTERPOLATION,
TEMPLATE_DIRECTIVE,
QUOTED_TEMPLATE,
HEREDOC_TEMPLATE,
};
struct Context {
ContextType type;
// valid if type == HEREDOC_TEMPLATE
string heredoc_identifier;
};
struct Scanner {
public:
unsigned serialize(char *buf) {
unsigned size = 0;
if (context_stack.size() > CHAR_MAX) {
return 0;
}
buf[size++] = context_stack.size();
for (vector<Context>::iterator it = context_stack.begin();
it != context_stack.end(); ++it) {
if (size + 2 + it->heredoc_identifier.size() >=
TREE_SITTER_SERIALIZATION_BUFFER_SIZE) {
return 0;
}
if (it->heredoc_identifier.size() > CHAR_MAX) {
return 0;
}
buf[size++] = it->type;
buf[size++] = it->heredoc_identifier.size();
it->heredoc_identifier.copy(&buf[size], it->heredoc_identifier.size());
size += it->heredoc_identifier.size();
}
return size;
}
void deserialize(const char *buf, unsigned n) {
context_stack.clear();
if (n == 0) {
return;
}
unsigned size = 0;
uint8_t context_stack_size = buf[size++];
for (unsigned j = 0; j < context_stack_size; j++) {
Context ctx;
ctx.type = static_cast<ContextType>(buf[size++]);
uint8_t heredoc_identifier_size = buf[size++];
ctx.heredoc_identifier.assign(buf + size,
buf + size + heredoc_identifier_size);
size += heredoc_identifier_size;
context_stack.push_back(ctx);
}
assert(size == n);
}
bool scan(TSLexer *lexer, const bool *valid_symbols) {
bool has_leading_whitespace_with_newline = false;
while (iswspace(lexer->lookahead)) {
if (lexer->lookahead == '\n') {
has_leading_whitespace_with_newline = true;
}
skip(lexer);
}
if (lexer->lookahead == '\0') {
return false;
}
// manage quoted context
if (valid_symbols[QUOTED_TEMPLATE_START] && !in_quoted_context() &&
lexer->lookahead == '"') {
Context ctx = {QUOTED_TEMPLATE, ""};
context_stack.push_back(ctx);
return accept_and_advance(lexer, QUOTED_TEMPLATE_START);
}
if (valid_symbols[QUOTED_TEMPLATE_END] && in_quoted_context() &&
lexer->lookahead == '"') {
context_stack.pop_back();
return accept_and_advance(lexer, QUOTED_TEMPLATE_END);
}
// manage template interpolations
if (valid_symbols[TEMPLATE_INTERPOLATION_START] &&
valid_symbols[TEMPLATE_LITERAL_CHUNK] && !in_interpolation_context() &&
lexer->lookahead == '$') {
advance(lexer);
if (lexer->lookahead == '{') {
Context ctx = {TEMPLATE_INTERPOLATION, ""};
context_stack.push_back(ctx);
return accept_and_advance(lexer, TEMPLATE_INTERPOLATION_START);
}
// try to scan escape sequence
if (lexer->lookahead == '$') {
advance(lexer);
if (lexer->lookahead == '{') {
// $${
return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
}
}
return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK);
}
if (valid_symbols[TEMPLATE_INTERPOLATION_END] &&
in_interpolation_context() && lexer->lookahead == '}') {
context_stack.pop_back();
return accept_and_advance(lexer, TEMPLATE_INTERPOLATION_END);
}
// manage template directives
if (valid_symbols[TEMPLATE_DIRECTIVE_START] &&
valid_symbols[TEMPLATE_LITERAL_CHUNK] && !in_directive_context() &&
lexer->lookahead == '%') {
advance(lexer);
if (lexer->lookahead == '{') {
Context ctx = {TEMPLATE_DIRECTIVE, ""};
context_stack.push_back(ctx);
return accept_and_advance(lexer, TEMPLATE_DIRECTIVE_START);
}
// try to scan escape sequence
if (lexer->lookahead == '%') {
advance(lexer);
if (lexer->lookahead == '{') {
// $${
return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
}
}
return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK);
}
if (valid_symbols[TEMPLATE_DIRECTIVE_END] && in_directive_context() &&
lexer->lookahead == '}') {
context_stack.pop_back();
return accept_and_advance(lexer, TEMPLATE_DIRECTIVE_END);
}
// manage heredoc context
if (valid_symbols[HEREDOC_IDENTIFIER] && !in_heredoc_context()) {
string identifier;
// TODO: check that this is a valid identifier
while (iswalnum(lexer->lookahead) || lexer->lookahead == '_' ||
lexer->lookahead == '-') {
identifier.push_back(lexer->lookahead);
advance(lexer);
}
Context ctx = {HEREDOC_TEMPLATE, identifier};
context_stack.push_back(ctx);
return accept_inplace(lexer, HEREDOC_IDENTIFIER);
}
if (valid_symbols[HEREDOC_IDENTIFIER] && in_heredoc_context() &&
has_leading_whitespace_with_newline) {
string expected_identifier = context_stack.back().heredoc_identifier;
for (string::iterator it = expected_identifier.begin();
it != expected_identifier.end(); ++it) {
if (lexer->lookahead == *it) {
advance(lexer);
} else {
return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK);
}
}
// check if the identifier is on a line of its own
lexer->mark_end(lexer);
while (iswspace(lexer->lookahead) && lexer->lookahead != '\n') {
advance(lexer);
}
if (lexer->lookahead == '\n') {
context_stack.pop_back();
return accept_inplace(lexer, HEREDOC_IDENTIFIER);
} else {
advance(lexer);
lexer->mark_end(lexer);
return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK);
}
}
// manage template literal chunks
// handle template literal chunks in quoted contexts
//
// they may not contain newlines and may contain escape sequences
if (valid_symbols[TEMPLATE_LITERAL_CHUNK] && in_quoted_context()) {
switch (lexer->lookahead) {
case '\\':
advance(lexer);
switch (lexer->lookahead) {
case '"':
case 'n':
case 'r':
case 't':
case '\\':
return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
case 'u':
for (int i = 0; i < 4; i++) {
if (!consume_wxdigit(lexer))
return false;
}
return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
case 'U':
for (int i = 0; i < 8; i++) {
if (!consume_wxdigit(lexer))
return false;
}
return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
default:
return false;
}
}
}
// handle all other quoted template or string literal characters
if (valid_symbols[TEMPLATE_LITERAL_CHUNK] && in_template_context()) {
return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
}
// probably not handled by the external scanner
return false;
}
private:
vector<Context> context_stack;
void advance(TSLexer *lexer) { lexer->advance(lexer, false); }
void skip(TSLexer *lexer) { lexer->advance(lexer, true); }
bool accept_inplace(TSLexer *lexer, TokenType token) {
lexer->result_symbol = token;
return true;
}
bool accept_and_advance(TSLexer *lexer, TokenType token) {
advance(lexer);
return accept_inplace(lexer, token);
}
bool consume_wxdigit(TSLexer *lexer) {
advance(lexer);
return iswxdigit(lexer->lookahead);
}
bool skip_comment(TSLexer* lexer) {
while (iswspace(lexer->lookahead)) {
skip(lexer);
}
if (lexer->lookahead != '#') {
return false;
}
skip(lexer);
while (lexer->lookahead != '\n') {
skip(lexer);
if (lexer->eof(lexer)) {
return false;
}
}
return true;
}
bool in_context_type(ContextType type) {
if (context_stack.empty()) {
return false;
}
return context_stack.back().type == type;
}
bool in_quoted_context() { return in_context_type(QUOTED_TEMPLATE); }
bool in_heredoc_context() { return in_context_type(HEREDOC_TEMPLATE); }
bool in_template_context() {
return in_quoted_context() || in_heredoc_context();
}
bool in_interpolation_context() {
return in_context_type(TEMPLATE_INTERPOLATION);
}
bool in_directive_context() { return in_context_type(TEMPLATE_DIRECTIVE); }
};
} // namespace
extern "C" {
// tree sitter callbacks
void *tree_sitter_terraform_external_scanner_create() { return new Scanner(); }
void tree_sitter_terraform_external_scanner_destroy(void *p) {
Scanner *scanner = static_cast<Scanner *>(p);
delete scanner;
}
unsigned tree_sitter_terraform_external_scanner_serialize(void *p, char *b) {
Scanner *scanner = static_cast<Scanner *>(p);
return scanner->serialize(b);
}
void tree_sitter_terraform_external_scanner_deserialize(void *p, const char *b,
unsigned n) {
Scanner *scanner = static_cast<Scanner *>(p);
return scanner->deserialize(b, n);
}
bool tree_sitter_terraform_external_scanner_scan(void *p, TSLexer *lexer,
const bool *valid_symbols) {
Scanner *scanner = static_cast<Scanner *>(p);
return scanner->scan(lexer, valid_symbols);
}
} // extern "C"

436
src/scanner.c Normal file
View File

@@ -0,0 +1,436 @@
#include <assert.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <tree_sitter/parser.h>
#include <wctype.h>
// Maximum of two values; arguments are evaluated more than once, so
// callers must not pass expressions with side effects.
#define MAX(a, b) ((a) > (b) ? (a) : (b))

// Grow `vec`'s backing storage to `_cap` elements. The realloc result
// goes through `tmp` so the old pointer is not lost on failure, and an
// assert aborts on OOM (tree-sitter scanners have no error channel).
#define VEC_RESIZE(vec, _cap)                                                  \
    do {                                                                       \
        void *tmp = realloc((vec).data, (_cap) * sizeof((vec).data[0]));       \
        assert(tmp != NULL);                                                   \
        (vec).data = tmp;                                                      \
        (vec).cap = (_cap);                                                    \
    } while (0)

// Append `el`, doubling capacity (minimum 16) when full. Wrapped in
// do/while(0) so the macro acts as one statement (else-safe).
#define VEC_PUSH(vec, el)                                                      \
    do {                                                                       \
        if ((vec).cap == (vec).len) {                                          \
            VEC_RESIZE((vec), MAX(16, (vec).len * 2));                         \
        }                                                                      \
        (vec).data[(vec).len++] = (el);                                        \
    } while (0)

// Remove the top Context, releasing its heredoc identifier first.
#define VEC_POP(vec)                                                           \
    do {                                                                       \
        STRING_FREE(VEC_BACK((vec)).heredoc_identifier);                       \
        (vec).len--;                                                           \
    } while (0)

// Last element; only valid when (vec).len > 0.
#define VEC_BACK(vec) ((vec).data[(vec).len - 1])

// Release the backing storage (free(NULL) is a no-op, so no guard).
#define VEC_FREE(vec)                                                          \
    do {                                                                       \
        free((vec).data);                                                      \
    } while (0)

// Drop all Contexts, freeing each heredoc identifier string.
#define VEC_CLEAR(vec)                                                         \
    do {                                                                       \
        for (uint32_t _i = 0; _i < (vec).len; _i++) {                          \
            STRING_FREE((vec).data[_i].heredoc_identifier);                    \
        }                                                                      \
        (vec).len = 0;                                                         \
    } while (0)

// Grow a String to hold `_cap` bytes plus a terminating NUL; the new
// tail (including the NUL slot) is zero-filled.
#define STRING_RESIZE(vec, _cap)                                               \
    do {                                                                       \
        void *tmp = realloc((vec).data, ((_cap) + 1) * sizeof((vec).data[0])); \
        assert(tmp != NULL);                                                   \
        (vec).data = tmp;                                                      \
        memset((vec).data + (vec).len, 0,                                      \
               (((_cap) + 1) - (vec).len) * sizeof((vec).data[0]));            \
        (vec).cap = (_cap);                                                    \
    } while (0)

// Ensure capacity for at least `_cap` bytes.
#define STRING_GROW(vec, _cap)                                                 \
    do {                                                                       \
        if ((vec).cap < (_cap)) {                                              \
            STRING_RESIZE((vec), (_cap));                                      \
        }                                                                      \
    } while (0)

// Append one byte, doubling capacity (minimum 16) when full.
#define STRING_PUSH(vec, el)                                                   \
    do {                                                                       \
        if ((vec).cap == (vec).len) {                                          \
            STRING_RESIZE((vec), MAX(16, (vec).len * 2));                      \
        }                                                                      \
        (vec).data[(vec).len++] = (el);                                        \
    } while (0)

// Release a String's buffer (free(NULL) is a no-op).
#define STRING_FREE(vec)                                                       \
    do {                                                                       \
        free((vec).data);                                                      \
    } while (0)
// Token types produced by this scanner; must stay in the same order as
// the `externals` list in the grammar.
enum TokenType {
    QUOTED_TEMPLATE_START,
    QUOTED_TEMPLATE_END,
    TEMPLATE_LITERAL_CHUNK,
    TEMPLATE_INTERPOLATION_START,
    TEMPLATE_INTERPOLATION_END,
    TEMPLATE_DIRECTIVE_START,
    TEMPLATE_DIRECTIVE_END,
    HEREDOC_IDENTIFIER,
};

// Kinds of nested lexical context the scanner tracks on its stack.
enum ContextType {
    TEMPLATE_INTERPOLATION,
    TEMPLATE_DIRECTIVE,
    QUOTED_TEMPLATE,
    HEREDOC_TEMPLATE,
};

// Growable byte buffer; `data` always has room for a trailing NUL.
typedef struct {
    uint32_t cap;  // allocated capacity, excluding the NUL slot
    uint32_t len;  // bytes currently in use
    char *data;
} String;

// Fresh empty string with capacity 16 (17 zeroed bytes allocated).
String string_new(void) {
    return (String){
        .cap = 16,
        .len = 0,
        .data = calloc(17, sizeof(char)),
    };
}

typedef struct {
    enum ContextType type;
    // valid if type == HEREDOC_TEMPLATE
    String heredoc_identifier;
} Context;

// Build a Context whose heredoc identifier is a copy of `data`.
// The buffer is sized from strlen(data), so identifiers longer than
// 16 bytes no longer overflow string_new's fixed-size allocation.
Context context_new(enum ContextType type, const char *data) {
    uint32_t len = (uint32_t)strlen(data);
    uint32_t cap = len < 16 ? 16 : len;
    Context ctx;
    ctx.type = type;
    ctx.heredoc_identifier.len = len;
    ctx.heredoc_identifier.cap = cap;
    ctx.heredoc_identifier.data = calloc(cap + 1, sizeof(char));
    assert(ctx.heredoc_identifier.data != NULL);
    memcpy(ctx.heredoc_identifier.data, data, len);
    return ctx;
}

// Stack of active lexical contexts.
typedef struct {
    uint32_t len;
    uint32_t cap;
    Context *data;
} context_vec;

// Scanner state persisted across scan calls and serialized by
// tree-sitter between parses.
typedef struct {
    context_vec context_stack;
} Scanner;
// Consume the current character and include it in the token.
static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); }
// Consume the current character without including it in the token.
static inline void skip(TSLexer *lexer) { lexer->advance(lexer, true); }
// Serialize the context stack into `buf` for tree-sitter to snapshot.
// Layout: [stack_len] then, per context, [type][id_len][id bytes...].
// Returns bytes written, or 0 when the state cannot be represented
// (tree-sitter treats 0 as "empty state").
static unsigned serialize(Scanner *scanner, char *buf) {
    unsigned size = 0;
    if (scanner->context_stack.len > CHAR_MAX) {
        return 0;
    }
    buf[size++] = (char)scanner->context_stack.len;
    for (uint32_t i = 0; i < scanner->context_stack.len; i++) {
        Context *context = &scanner->context_stack.data[i];
        // +2 accounts for the type and length bytes written below.
        if (size + 2 + context->heredoc_identifier.len >=
            TREE_SITTER_SERIALIZATION_BUFFER_SIZE) {
            return 0;
        }
        if (context->heredoc_identifier.len > CHAR_MAX) {
            return 0;
        }
        buf[size++] = (char)context->type;
        buf[size++] = (char)context->heredoc_identifier.len;
        memcpy(&buf[size], context->heredoc_identifier.data,
               context->heredoc_identifier.len);
        size += context->heredoc_identifier.len;
    }
    return size;
}
// Restore the context stack from a buffer produced by `serialize`.
// An empty buffer means "reset to no contexts", so the stack must be
// cleared *before* the early return — the previous code only cleared
// on a non-empty buffer, leaving stale contexts behind (the removed
// C++ implementation cleared unconditionally).
static void deserialize(Scanner *scanner, const char *buffer, unsigned length) {
    VEC_CLEAR(scanner->context_stack);
    if (length == 0) {
        return;
    }
    unsigned size = 0;
    uint8_t context_stack_size = (uint8_t)buffer[size++];
    for (uint32_t j = 0; j < context_stack_size; j++) {
        Context ctx = {
            .type = (enum ContextType)buffer[size++],
            .heredoc_identifier = string_new(),
        };
        uint8_t heredoc_identifier_size = (uint8_t)buffer[size++];
        STRING_GROW(ctx.heredoc_identifier, heredoc_identifier_size);
        memcpy(ctx.heredoc_identifier.data, buffer + size,
               heredoc_identifier_size);
        ctx.heredoc_identifier.len = heredoc_identifier_size;
        size += heredoc_identifier_size;
        VEC_PUSH(scanner->context_stack, ctx);
    }
    assert(size == length);
}
// Mark `token` as the scan result without consuming more input.
static inline bool accept_inplace(TSLexer *lexer, enum TokenType token) {
    lexer->result_symbol = token;
    return true;
}
// Consume the current lookahead character, then accept `token`.
static inline bool accept_and_advance(TSLexer *lexer, enum TokenType token) {
    advance(lexer);
    return accept_inplace(lexer, token);
}
// Advance once and report whether the new lookahead is a hex digit.
static inline bool consume_wxdigit(TSLexer *lexer) {
    advance(lexer);
    return iswxdigit(lexer->lookahead);
}
// Skip whitespace and one '#' line comment; returns false when no
// comment starts here or EOF is hit mid-comment.
// NOTE(review): currently unused by `scan` — carried over from the
// C++ scanner.
static inline bool skip_comment(TSLexer *lexer) {
    while (iswspace(lexer->lookahead)) {
        skip(lexer);
    }
    if (lexer->lookahead != '#') {
        return false;
    }
    skip(lexer);
    while (lexer->lookahead != '\n') {
        skip(lexer);
        if (lexer->eof(lexer)) {
            return false;
        }
    }
    return true;
}
// True when the innermost (top-of-stack) context has the given type.
static inline bool in_context_type(Scanner *scanner, enum ContextType type) {
    if (scanner->context_stack.len == 0) {
        return false;
    }
    return VEC_BACK(scanner->context_stack).type == type;
}
// Convenience predicates over the innermost context.
static inline bool in_quoted_context(Scanner *scanner) {
    return in_context_type(scanner, QUOTED_TEMPLATE);
}
static inline bool in_heredoc_context(Scanner *scanner) {
    return in_context_type(scanner, HEREDOC_TEMPLATE);
}
static inline bool in_template_context(Scanner *scanner) {
    return in_quoted_context(scanner) || in_heredoc_context(scanner);
}
static inline bool in_interpolation_context(Scanner *scanner) {
    return in_context_type(scanner, TEMPLATE_INTERPOLATION);
}
static inline bool in_directive_context(Scanner *scanner) {
    return in_context_type(scanner, TEMPLATE_DIRECTIVE);
}
// External scan entry point. Dispatches on `valid_symbols` (what the
// parser will accept at this position) and on the context stack to
// produce at most one token per call. Returns false when no external
// token applies here.
static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols) {
    bool has_leading_whitespace_with_newline = false;
    // Skip leading whitespace, remembering whether a newline was seen;
    // a heredoc terminator is only recognized at the start of a line.
    while (iswspace(lexer->lookahead)) {
        if (lexer->lookahead == '\n') {
            has_leading_whitespace_with_newline = true;
        }
        skip(lexer);
    }
    if (lexer->lookahead == '\0') {
        return false;
    }
    // manage quoted context
    if (valid_symbols[QUOTED_TEMPLATE_START] && !in_quoted_context(scanner) &&
        lexer->lookahead == '"') {
        Context ctx = context_new(QUOTED_TEMPLATE, "");
        VEC_PUSH(scanner->context_stack, ctx);
        return accept_and_advance(lexer, QUOTED_TEMPLATE_START);
    }
    if (valid_symbols[QUOTED_TEMPLATE_END] && in_quoted_context(scanner) &&
        lexer->lookahead == '"') {
        VEC_POP(scanner->context_stack);
        return accept_and_advance(lexer, QUOTED_TEMPLATE_END);
    }
    // manage template interpolations
    if (valid_symbols[TEMPLATE_INTERPOLATION_START] &&
        valid_symbols[TEMPLATE_LITERAL_CHUNK] &&
        !in_interpolation_context(scanner) && lexer->lookahead == '$') {
        advance(lexer);
        if (lexer->lookahead == '{') {
            Context ctx = context_new(TEMPLATE_INTERPOLATION, "");
            VEC_PUSH(scanner->context_stack, ctx);
            return accept_and_advance(lexer, TEMPLATE_INTERPOLATION_START);
        }
        // try to scan escape sequence
        if (lexer->lookahead == '$') {
            advance(lexer);
            if (lexer->lookahead == '{') {
                // $${ is the escape for a literal ${
                return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
            }
        }
        // a lone '$' is just literal text
        return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK);
    }
    if (valid_symbols[TEMPLATE_INTERPOLATION_END] &&
        in_interpolation_context(scanner) && lexer->lookahead == '}') {
        VEC_POP(scanner->context_stack);
        return accept_and_advance(lexer, TEMPLATE_INTERPOLATION_END);
    }
    // manage template directives
    if (valid_symbols[TEMPLATE_DIRECTIVE_START] &&
        valid_symbols[TEMPLATE_LITERAL_CHUNK] &&
        !in_directive_context(scanner) && lexer->lookahead == '%') {
        advance(lexer);
        if (lexer->lookahead == '{') {
            Context ctx = context_new(TEMPLATE_DIRECTIVE, "");
            VEC_PUSH(scanner->context_stack, ctx);
            return accept_and_advance(lexer, TEMPLATE_DIRECTIVE_START);
        }
        // try to scan escape sequence
        if (lexer->lookahead == '%') {
            advance(lexer);
            if (lexer->lookahead == '{') {
                // %%{ is the escape for a literal %{
                return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
            }
        }
        // a lone '%' is just literal text
        return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK);
    }
    if (valid_symbols[TEMPLATE_DIRECTIVE_END] &&
        in_directive_context(scanner) && lexer->lookahead == '}') {
        VEC_POP(scanner->context_stack);
        return accept_and_advance(lexer, TEMPLATE_DIRECTIVE_END);
    }
    // manage heredoc context
    // Opening: read the heredoc delimiter and remember it on the stack.
    if (valid_symbols[HEREDOC_IDENTIFIER] && !in_heredoc_context(scanner)) {
        String identifier = string_new();
        // TODO: check that this is a valid identifier
        while (iswalnum(lexer->lookahead) || lexer->lookahead == '_' ||
               lexer->lookahead == '-') {
            STRING_PUSH(identifier, lexer->lookahead);
            advance(lexer);
        }
        Context ctx = {HEREDOC_TEMPLATE, identifier};
        VEC_PUSH(scanner->context_stack, ctx);
        return accept_inplace(lexer, HEREDOC_IDENTIFIER);
    }
    // Closing: after a newline, try to match the saved delimiter.
    if (valid_symbols[HEREDOC_IDENTIFIER] && in_heredoc_context(scanner) &&
        has_leading_whitespace_with_newline) {
        String expected_identifier =
            VEC_BACK(scanner->context_stack).heredoc_identifier;
        for (size_t i = 0; i < expected_identifier.len; i++) {
            if (lexer->lookahead == expected_identifier.data[i]) {
                advance(lexer);
            } else {
                // mismatch: what was consumed is ordinary template text
                return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK);
            }
        }
        // check if the identifier is on a line of its own
        lexer->mark_end(lexer);
        while (iswspace(lexer->lookahead) && lexer->lookahead != '\n') {
            advance(lexer);
        }
        if (lexer->lookahead == '\n') {
            VEC_POP(scanner->context_stack);
            return accept_inplace(lexer, HEREDOC_IDENTIFIER);
        }
        advance(lexer);
        lexer->mark_end(lexer);
        return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK);
    }
    // manage template literal chunks
    // handle template literal chunks in quoted contexts
    //
    // they may not contain newlines and may contain escape sequences
    if (valid_symbols[TEMPLATE_LITERAL_CHUNK] && in_quoted_context(scanner)) {
        switch (lexer->lookahead) {
            case '\\':
                advance(lexer);
                switch (lexer->lookahead) {
                    case '"':
                    case 'n':
                    case 'r':
                    case 't':
                    case '\\':
                        return accept_and_advance(lexer,
                                                  TEMPLATE_LITERAL_CHUNK);
                    case 'u':
                        // \uXXXX: exactly four hex digits
                        for (int i = 0; i < 4; i++) {
                            if (!consume_wxdigit(lexer)) {
                                return false;
                            }
                        }
                        return accept_and_advance(lexer,
                                                  TEMPLATE_LITERAL_CHUNK);
                    case 'U':
                        // \UXXXXXXXX: exactly eight hex digits
                        for (int i = 0; i < 8; i++) {
                            if (!consume_wxdigit(lexer)) {
                                return false;
                            }
                        }
                        return accept_and_advance(lexer,
                                                  TEMPLATE_LITERAL_CHUNK);
                    default:
                        return false;
                }
        }
    }
    // handle all other quoted template or string literal characters
    if (valid_symbols[TEMPLATE_LITERAL_CHUNK] && in_template_context(scanner)) {
        return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
    }
    // probably not handled by the external scanner
    return false;
}
// Called by tree-sitter to create the scanner's persistent state.
void *tree_sitter_hcl_external_scanner_create(void) {
    Scanner *scanner = calloc(1, sizeof(Scanner));
    scanner->context_stack.data = calloc(1, sizeof(Context));
    return scanner;
}

// Snapshot scanner state into `buffer`; returns bytes written.
unsigned tree_sitter_hcl_external_scanner_serialize(void *payload,
                                                    char *buffer) {
    Scanner *scanner = (Scanner *)payload;
    return serialize(scanner, buffer);
}

// Restore scanner state from a buffer produced by `..._serialize`.
void tree_sitter_hcl_external_scanner_deserialize(void *payload,
                                                  const char *buffer,
                                                  unsigned length) {
    Scanner *scanner = (Scanner *)payload;
    deserialize(scanner, buffer, length);
}

// Main scanning callback; see `scan` above.
bool tree_sitter_hcl_external_scanner_scan(void *payload, TSLexer *lexer,
                                           const bool *valid_symbols) {
    Scanner *scanner = (Scanner *)payload;
    return scan(scanner, lexer, valid_symbols);
}

// Free every heredoc identifier, then the stack, then the scanner.
void tree_sitter_hcl_external_scanner_destroy(void *payload) {
    Scanner *scanner = (Scanner *)payload;
    for (uint32_t i = 0; i < scanner->context_stack.len; i++) {
        STRING_FREE(scanner->context_stack.data[i].heredoc_identifier);
    }
    VEC_FREE(scanner->context_stack);
    free(scanner);
}

View File

@@ -1,336 +0,0 @@
#include <tree_sitter/parser.h>
#include <assert.h>
#include <climits>
#include <string>
#include <vector>
#include <wctype.h>
namespace {
using std::string;
using std::vector;
enum TokenType {
QUOTED_TEMPLATE_START,
QUOTED_TEMPLATE_END,
TEMPLATE_LITERAL_CHUNK,
TEMPLATE_INTERPOLATION_START,
TEMPLATE_INTERPOLATION_END,
TEMPLATE_DIRECTIVE_START,
TEMPLATE_DIRECTIVE_END,
HEREDOC_IDENTIFIER,
};
enum ContextType {
TEMPLATE_INTERPOLATION,
TEMPLATE_DIRECTIVE,
QUOTED_TEMPLATE,
HEREDOC_TEMPLATE,
};
struct Context {
ContextType type;
// valid if type == HEREDOC_TEMPLATE
string heredoc_identifier;
};
// Legacy C++ implementation of the HCL external scanner (this commit replaces
// it with the C version above).  Maintains a stack of template contexts so
// that quoted templates, interpolations (${...}), directives (%{...}) and
// heredocs can nest.
struct Scanner {
public:
  // Serialize the context stack into `buf`.
  // Layout: [stack size] then, per context, [type byte][identifier length
  // byte][identifier bytes].  Returns 0 (i.e. "no state") if the stack depth,
  // an identifier length, or the total size would not fit.
  unsigned serialize(char *buf) {
    unsigned size = 0;
    if (context_stack.size() > CHAR_MAX) {
      return 0;
    }
    buf[size++] = context_stack.size();
    for (vector<Context>::iterator it = context_stack.begin();
         it != context_stack.end(); ++it) {
      // +2 accounts for the type byte and the length byte written below.
      if (size + 2 + it->heredoc_identifier.size() >=
          TREE_SITTER_SERIALIZATION_BUFFER_SIZE) {
        return 0;
      }
      if (it->heredoc_identifier.size() > CHAR_MAX) {
        return 0;
      }
      buf[size++] = it->type;
      buf[size++] = it->heredoc_identifier.size();
      it->heredoc_identifier.copy(&buf[size], it->heredoc_identifier.size());
      size += it->heredoc_identifier.size();
    }
    return size;
  }

  // Rebuild the context stack from a buffer produced by serialize().
  // `n` is the byte count tree-sitter hands back; an n of 0 means no state.
  void deserialize(const char *buf, unsigned n) {
    context_stack.clear();
    if (n == 0) {
      return;
    }
    unsigned size = 0;
    uint8_t context_stack_size = buf[size++];
    for (unsigned j = 0; j < context_stack_size; j++) {
      Context ctx;
      ctx.type = static_cast<ContextType>(buf[size++]);
      uint8_t heredoc_identifier_size = buf[size++];
      ctx.heredoc_identifier.assign(buf + size,
                                    buf + size + heredoc_identifier_size);
      size += heredoc_identifier_size;
      context_stack.push_back(ctx);
    }
    // Every byte we serialized must be consumed.
    assert(size == n);
  }

  // Main scanning entry point.  Tries each valid external token in turn,
  // pushing/popping contexts as delimiters are seen.  Returns true and sets
  // lexer->result_symbol when a token is recognized.
  bool scan(TSLexer *lexer, const bool *valid_symbols) {
    // Skip leading whitespace, remembering whether a newline was crossed —
    // a heredoc closing identifier is only valid at the start of a line.
    bool has_leading_whitespace_with_newline = false;
    while (iswspace(lexer->lookahead)) {
      if (lexer->lookahead == '\n') {
        has_leading_whitespace_with_newline = true;
      }
      skip(lexer);
    }
    if (lexer->lookahead == '\0') {
      return false;
    }
    // manage quoted context
    if (valid_symbols[QUOTED_TEMPLATE_START] && !in_quoted_context() &&
        lexer->lookahead == '"') {
      Context ctx = {QUOTED_TEMPLATE, ""};
      context_stack.push_back(ctx);
      return accept_and_advance(lexer, QUOTED_TEMPLATE_START);
    }
    if (valid_symbols[QUOTED_TEMPLATE_END] && in_quoted_context() &&
        lexer->lookahead == '"') {
      context_stack.pop_back();
      return accept_and_advance(lexer, QUOTED_TEMPLATE_END);
    }
    // manage template interpolations
    if (valid_symbols[TEMPLATE_INTERPOLATION_START] &&
        valid_symbols[TEMPLATE_LITERAL_CHUNK] && !in_interpolation_context() &&
        lexer->lookahead == '$') {
      advance(lexer);
      if (lexer->lookahead == '{') {
        Context ctx = {TEMPLATE_INTERPOLATION, ""};
        context_stack.push_back(ctx);
        return accept_and_advance(lexer, TEMPLATE_INTERPOLATION_START);
      }
      // try to scan escape sequence
      if (lexer->lookahead == '$') {
        advance(lexer);
        if (lexer->lookahead == '{') {
          // $${ escapes a literal ${ inside a template
          return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
        }
      }
      // A lone '$' (or "$$" not followed by '{') is just literal text.
      return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK);
    }
    if (valid_symbols[TEMPLATE_INTERPOLATION_END] &&
        in_interpolation_context() && lexer->lookahead == '}') {
      context_stack.pop_back();
      return accept_and_advance(lexer, TEMPLATE_INTERPOLATION_END);
    }
    // manage template directives
    if (valid_symbols[TEMPLATE_DIRECTIVE_START] &&
        valid_symbols[TEMPLATE_LITERAL_CHUNK] && !in_directive_context() &&
        lexer->lookahead == '%') {
      advance(lexer);
      if (lexer->lookahead == '{') {
        Context ctx = {TEMPLATE_DIRECTIVE, ""};
        context_stack.push_back(ctx);
        return accept_and_advance(lexer, TEMPLATE_DIRECTIVE_START);
      }
      // try to scan escape sequence
      if (lexer->lookahead == '%') {
        advance(lexer);
        if (lexer->lookahead == '{') {
          // %%{ escapes a literal %{ inside a template
          return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
        }
      }
      return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK);
    }
    if (valid_symbols[TEMPLATE_DIRECTIVE_END] && in_directive_context() &&
        lexer->lookahead == '}') {
      context_stack.pop_back();
      return accept_and_advance(lexer, TEMPLATE_DIRECTIVE_END);
    }
    // manage heredoc context
    // Opening identifier: collect the delimiter word and push a heredoc
    // context so the closing identifier can be matched against it later.
    if (valid_symbols[HEREDOC_IDENTIFIER] && !in_heredoc_context()) {
      string identifier;
      // TODO: check that this is a valid identifier
      while (iswalnum(lexer->lookahead) || lexer->lookahead == '_' ||
             lexer->lookahead == '-') {
        identifier.push_back(lexer->lookahead);
        advance(lexer);
      }
      Context ctx = {HEREDOC_TEMPLATE, identifier};
      context_stack.push_back(ctx);
      return accept_inplace(lexer, HEREDOC_IDENTIFIER);
    }
    // Closing identifier: only considered at the start of a line (newline
    // crossed in the leading whitespace above).
    if (valid_symbols[HEREDOC_IDENTIFIER] && in_heredoc_context() &&
        has_leading_whitespace_with_newline) {
      string expected_identifier = context_stack.back().heredoc_identifier;
      for (string::iterator it = expected_identifier.begin();
           it != expected_identifier.end(); ++it) {
        if (lexer->lookahead == *it) {
          advance(lexer);
        } else {
          // Mismatch: whatever was consumed is ordinary template text.
          return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK);
        }
      }
      // check if the identifier is on a line of its own
      lexer->mark_end(lexer);
      while (iswspace(lexer->lookahead) && lexer->lookahead != '\n') {
        advance(lexer);
      }
      if (lexer->lookahead == '\n') {
        context_stack.pop_back();
        return accept_inplace(lexer, HEREDOC_IDENTIFIER);
      } else {
        // Trailing non-space content: treat the whole run as literal text.
        advance(lexer);
        lexer->mark_end(lexer);
        return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK);
      }
    }
    // manage template literal chunks
    // handle template literal chunks in quoted contexts
    //
    // they may not contain newlines and may contain escape sequences
    if (valid_symbols[TEMPLATE_LITERAL_CHUNK] && in_quoted_context()) {
      switch (lexer->lookahead) {
      case '\\':
        advance(lexer);
        switch (lexer->lookahead) {
        case '"':
        case 'n':
        case 'r':
        case 't':
        case '\\':
          return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
        case 'u':
          // \uXXXX — exactly four hex digits
          for (int i = 0; i < 4; i++) {
            if (!consume_wxdigit(lexer))
              return false;
          }
          return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
        case 'U':
          // \UXXXXXXXX — exactly eight hex digits
          for (int i = 0; i < 8; i++) {
            if (!consume_wxdigit(lexer))
              return false;
          }
          return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
        default:
          // Unknown escape sequence: reject.
          return false;
        }
      }
    }
    // handle all other quoted template or string literal characters
    if (valid_symbols[TEMPLATE_LITERAL_CHUNK] && in_template_context()) {
      return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
    }
    // probably not handled by the external scanner
    return false;
  }

private:
  vector<Context> context_stack;

  // Consume the lookahead character as part of the current token.
  void advance(TSLexer *lexer) { lexer->advance(lexer, false); }
  // Consume the lookahead character as whitespace (excluded from the token).
  void skip(TSLexer *lexer) { lexer->advance(lexer, true); }
  // Accept `token` ending at the current position.
  bool accept_inplace(TSLexer *lexer, TokenType token) {
    lexer->result_symbol = token;
    return true;
  }
  // Consume one more character, then accept `token`.
  bool accept_and_advance(TSLexer *lexer, TokenType token) {
    advance(lexer);
    return accept_inplace(lexer, token);
  }
  // Advance and report whether the new lookahead is a hex digit.
  bool consume_wxdigit(TSLexer *lexer) {
    advance(lexer);
    return iswxdigit(lexer->lookahead);
  }
  // NOTE(review): appears unused — no call sites in this file.
  bool skip_comment(TSLexer* lexer) {
    while (iswspace(lexer->lookahead)) {
      skip(lexer);
    }
    if (lexer->lookahead != '#') {
      return false;
    }
    skip(lexer);
    while (lexer->lookahead != '\n') {
      skip(lexer);
      if (lexer->eof(lexer)) {
        return false;
      }
    }
    return true;
  }
  // True if the innermost (top-of-stack) context has the given type.
  bool in_context_type(ContextType type) {
    if (context_stack.empty()) {
      return false;
    }
    return context_stack.back().type == type;
  }
  bool in_quoted_context() { return in_context_type(QUOTED_TEMPLATE); }
  bool in_heredoc_context() { return in_context_type(HEREDOC_TEMPLATE); }
  bool in_template_context() {
    return in_quoted_context() || in_heredoc_context();
  }
  bool in_interpolation_context() {
    return in_context_type(TEMPLATE_INTERPOLATION);
  }
  bool in_directive_context() { return in_context_type(TEMPLATE_DIRECTIVE); }
};
} // namespace
extern "C" {
// tree sitter callbacks
// C ABI entry points required by the tree-sitter runtime; each forwards to
// the Scanner instance carried in the opaque payload pointer.
void *tree_sitter_hcl_external_scanner_create() { return new Scanner(); }
void tree_sitter_hcl_external_scanner_destroy(void *p) {
  Scanner *scanner = static_cast<Scanner *>(p);
  delete scanner;
}
unsigned tree_sitter_hcl_external_scanner_serialize(void *p, char *b) {
  Scanner *scanner = static_cast<Scanner *>(p);
  return scanner->serialize(b);
}
void tree_sitter_hcl_external_scanner_deserialize(void *p, const char *b,
                                                  unsigned n) {
  Scanner *scanner = static_cast<Scanner *>(p);
  return scanner->deserialize(b, n);
}
bool tree_sitter_hcl_external_scanner_scan(void *p, TSLexer *lexer,
                                           const bool *valid_symbols) {
  Scanner *scanner = static_cast<Scanner *>(p);
  return scanner->scan(lexer, valid_symbols);
}
} // extern "C"