wip move to cpp external scanner

This commit is contained in:
mhoffm
2021-06-22 23:40:31 +02:00
parent 5a58000ba1
commit b9a95e5b48
4 changed files with 257 additions and 259 deletions

View File

@@ -1,6 +1,6 @@
root = true
[*.{c,txt,js}]
[*.{cc,txt,js}]
indent_style = space
indent_size = 2
tab_width = 8

View File

@@ -1,257 +0,0 @@
#include <tree_sitter/parser.h>
#include <wctype.h>
#include <string.h>
#include <stdio.h>
#include <assert.h>
enum TokenType {
QUOTED_TEMPLATE_START,
QUOTED_TEMPLATE_END,
TEMPLATE_LITERAL_CHUNK,
TEMPLATE_INTERPOLATION_START,
TEMPLATE_INTERPOLATION_END,
HEREDOC_IDENTIFIER,
};
static void advance(TSLexer *lexer) { lexer->advance(lexer, false); }
static void skip(TSLexer *lexer) { lexer->advance(lexer, true); }
static bool accept_inplace(TSLexer *lexer, enum TokenType token) {
lexer->result_symbol = token;
return true;
}
static bool accept_and_advance(TSLexer *lexer, enum TokenType token) {
advance(lexer);
return accept_inplace(lexer, token);
}
static bool consume_wxdigit(TSLexer *lexer) {
advance(lexer);
return iswxdigit(lexer->lookahead);
}
typedef struct Scanner {
bool in_template_interpolation;
bool in_quoted_context;
int template_interpolation_depth;
int quoted_context_depth;
} Scanner;
void print_debug_info(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols) {
printf("\nDEBUG INFO START\n\n");
printf("currently at: '%c'\n\n", lexer->lookahead);
printf("could be one of\n");
printf("quoted_template_start: %x\n", valid_symbols[QUOTED_TEMPLATE_START]);
printf("quoted_template_end: %x\n", valid_symbols[QUOTED_TEMPLATE_END]);
printf("template_literal_chunk: %x\n", valid_symbols[TEMPLATE_LITERAL_CHUNK]);
printf("template_interpolation_start: %x\n", valid_symbols[TEMPLATE_INTERPOLATION_START]);
printf("template_interpolation_end: %x\n", valid_symbols[TEMPLATE_INTERPOLATION_END]);
printf("\n");
printf("scanner state:\n");
printf("in_template_interpolation %x\n", scanner->in_template_interpolation);
printf("in_quoted_context %x\n", scanner->in_quoted_context);
printf("template_interpolation_depth %x\n", scanner->template_interpolation_depth);
printf("quoted_context_depth %x\n", scanner->quoted_context_depth);
printf("\nDEBUG INFO END\n\n");
}
void scanner_enter_interpolation_context(Scanner *scanner) {
scanner->template_interpolation_depth++;
scanner->in_template_interpolation = true;
scanner->in_quoted_context = false;
}
void scanner_exit_interpolation_context(Scanner *scanner) {
assert(scanner->template_interpolation_depth > 0);
scanner->template_interpolation_depth--;
scanner->in_template_interpolation = false;
if (scanner->quoted_context_depth > 0) {
scanner->in_quoted_context = true;
}
}
void scanner_enter_quoted_context(Scanner *scanner) {
scanner->quoted_context_depth++;
scanner->in_quoted_context = true;
scanner->in_template_interpolation = false;
}
void scanner_exit_quoted_context(Scanner *scanner) {
assert(scanner->quoted_context_depth > 0);
scanner->quoted_context_depth--;
scanner->in_quoted_context = false;
if (scanner->template_interpolation_depth > 0) {
scanner->in_template_interpolation = true;
}
}
bool scanner_scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols) {
// print_debug_info(scanner, lexer, valid_symbols);
while (iswspace(lexer->lookahead)) {
skip(lexer);
}
if (lexer->lookahead == '\0') return false;
// manage quoted context
if (
valid_symbols[QUOTED_TEMPLATE_START] &&
!scanner->in_quoted_context &&
lexer->lookahead == '"'
) {
scanner_enter_quoted_context(scanner);
return accept_and_advance(lexer, QUOTED_TEMPLATE_START);
}
if (
valid_symbols[QUOTED_TEMPLATE_END] &&
scanner->in_quoted_context &&
lexer->lookahead == '"'
) {
scanner_exit_quoted_context(scanner);
return accept_and_advance(lexer, QUOTED_TEMPLATE_END);
}
// manage template interpolations
if (
valid_symbols[TEMPLATE_INTERPOLATION_START] &&
lexer->lookahead == '$'
) {
advance(lexer);
if (lexer->lookahead == '{') {
scanner_enter_interpolation_context(scanner);
return accept_and_advance(lexer, TEMPLATE_INTERPOLATION_START);
}
if (valid_symbols[TEMPLATE_LITERAL_CHUNK]) {
// try to scan escape sequence
if (lexer->lookahead == '$') {
advance(lexer);
if (lexer->lookahead == '{') {
// $${
return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
}
}
return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK);
}
}
if (
valid_symbols[TEMPLATE_INTERPOLATION_END] &&
lexer->lookahead == '}' &&
scanner->in_template_interpolation
) {
scanner_exit_interpolation_context(scanner);
return accept_and_advance(lexer, TEMPLATE_INTERPOLATION_END);
}
// handle template literal chunks
// handle template literal chunks in quoted contexts
//
// they may not contain newlines and may contain escape sequences
if (valid_symbols[TEMPLATE_LITERAL_CHUNK] && scanner->in_quoted_context) {
switch (lexer->lookahead) {
case '\\':
advance(lexer);
switch (lexer->lookahead) {
case '"':
case 'n':
case 'r':
case 't':
case '\\':
return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
case 'u':
for (int i = 0; i < 4; i++) {
if (!consume_wxdigit(lexer)) return false;
}
return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
case 'U':
for (int i = 0; i < 8; i++) {
if (!consume_wxdigit(lexer)) return false;
}
return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
default:
return false;
}
}
}
// handle escaped template interpolations in string literals
if (
valid_symbols[TEMPLATE_LITERAL_CHUNK] &&
!valid_symbols[TEMPLATE_INTERPOLATION_START] &&
scanner->in_quoted_context
) {
// try to scan escaped template interpolation
switch (lexer->lookahead) {
case '$':
advance(lexer);
if (lexer->lookahead == '{') {
// unescaped template interpolation
return false;
}
if (lexer->lookahead == '$') {
advance(lexer);
if (lexer->lookahead == '{') {
// $${
return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
}
return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK);
}
}
}
// handle heredoc identifier
if (valid_symbols[HEREDOC_IDENTIFIER]) {
if (lexer->lookahead != 'E') {
if (valid_symbols[TEMPLATE_LITERAL_CHUNK]) return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
}
advance(lexer);
if (lexer->lookahead != 'O') {
if (valid_symbols[TEMPLATE_LITERAL_CHUNK]) return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
}
advance(lexer);
if (lexer->lookahead != 'F') {
if (valid_symbols[TEMPLATE_LITERAL_CHUNK]) return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
}
return accept_and_advance(lexer, HEREDOC_IDENTIFIER);
}
// handle all other quoted template or string literal characters
if (valid_symbols[TEMPLATE_LITERAL_CHUNK]) {
return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
}
// probably not handled by the external scanner
return false;
}
void *tree_sitter_hcl_external_scanner_create() {
Scanner *scanner = (Scanner*)malloc(sizeof(Scanner));
if (scanner) {
scanner->in_template_interpolation = false;
scanner->in_quoted_context = false;
scanner->template_interpolation_depth = 0;
scanner->quoted_context_depth = 0;
}
return scanner;
}
void tree_sitter_hcl_external_scanner_destroy(void *p) {
free(p);
}
unsigned tree_sitter_hcl_external_scanner_serialize(void *p, char *b) {
memcpy(b, p, sizeof(Scanner)); return sizeof(Scanner);
}
void tree_sitter_hcl_external_scanner_deserialize(void *p, const char *b, unsigned n) {
memcpy(p, b, n);
}
bool tree_sitter_hcl_external_scanner_scan(void *p, TSLexer *lexer, const bool *valid_symbols) {
return scanner_scan((Scanner*)p, lexer, valid_symbols);
}

255
src/scanner.cc Normal file
View File

@@ -0,0 +1,255 @@
#include <tree_sitter/parser.h>
#include <vector>
#include <string>
#include <wctype.h>
#include <assert.h>
#include <stdio.h>
namespace {
using std::vector;
using std::string;
enum TokenType {
QUOTED_TEMPLATE_START,
QUOTED_TEMPLATE_END,
TEMPLATE_LITERAL_CHUNK,
TEMPLATE_INTERPOLATION_START,
TEMPLATE_INTERPOLATION_END,
HEREDOC_IDENTIFIER,
};
enum ContextType {
TEMPLATE_INTERPOLATION,
QUOTED_TEMPLATE,
HEREDOC_TEMPLATE,
};
struct Context {
ContextType type;
// valid if type == HEREDOC_TEMPLATE
char* identifier;
size_t identifier_size;
};
struct Scanner {
public:
// TODO: implement properly
unsigned serialize(char* buf) {
return 0;
}
// TODO: implement properly
void deserialize(const char* buf, unsigned n) {
}
bool scan(TSLexer* lexer, const bool* valid_symbols) {
while (iswspace(lexer->lookahead)) {
skip(lexer);
}
if (lexer->lookahead == '\0') {
return false;
}
// manage quoted context
if (valid_symbols[QUOTED_TEMPLATE_START] && !in_quoted_context() && lexer->lookahead == '"') {
context_stack.push_back({ .type = QUOTED_TEMPLATE });
return accept_and_advance(lexer, QUOTED_TEMPLATE_START);
}
if (valid_symbols[QUOTED_TEMPLATE_END] && in_quoted_context() && lexer->lookahead == '"') {
context_stack.pop_back();
return accept_and_advance(lexer, QUOTED_TEMPLATE_END);
}
// manage template interpolations
if (valid_symbols[TEMPLATE_INTERPOLATION_START] && !in_interpolation_context() && lexer->lookahead == '$') {
advance(lexer);
if (lexer->lookahead == '{') {
context_stack.push_back({ .type = TEMPLATE_INTERPOLATION});
return accept_and_advance(lexer, TEMPLATE_INTERPOLATION_START);
}
if (valid_symbols[TEMPLATE_LITERAL_CHUNK]) {
// try to scan escape sequence
if (lexer->lookahead == '$') {
advance(lexer);
if (lexer->lookahead == '{') {
// $${
return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
}
}
return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK);
}
}
if (valid_symbols[TEMPLATE_INTERPOLATION_END] && in_interpolation_context() && lexer->lookahead == '}') {
context_stack.pop_back();
return accept_and_advance(lexer, TEMPLATE_INTERPOLATION_END);
}
// handle template literal chunks
// handle template literal chunks in quoted contexts
//
// they may not contain newlines and may contain escape sequences
if (valid_symbols[TEMPLATE_LITERAL_CHUNK] && in_quoted_context()) {
switch (lexer->lookahead) {
case '\\':
advance(lexer);
switch (lexer->lookahead) {
case '"':
case 'n':
case 'r':
case 't':
case '\\':
return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
case 'u':
for (int i = 0; i < 4; i++) {
if (!consume_wxdigit(lexer)) return false;
}
return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
case 'U':
for (int i = 0; i < 8; i++) {
if (!consume_wxdigit(lexer)) return false;
}
return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
default:
return false;
}
}
}
// handle escaped template interpolations in string literals
if (valid_symbols[TEMPLATE_LITERAL_CHUNK] && !valid_symbols[TEMPLATE_INTERPOLATION_START] && in_quoted_context()) {
// try to scan escaped template interpolation
switch (lexer->lookahead) {
case '$':
advance(lexer);
if (lexer->lookahead == '{') {
// unescaped template interpolation
return false;
}
if (lexer->lookahead == '$') {
advance(lexer);
if (lexer->lookahead == '{') {
// $${
return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
}
return accept_inplace(lexer, TEMPLATE_LITERAL_CHUNK);
}
}
}
// handle heredoc identifier
if (valid_symbols[HEREDOC_IDENTIFIER]) {
if (lexer->lookahead != 'E') {
if (valid_symbols[TEMPLATE_LITERAL_CHUNK]) {
return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
} else {
return false;
}
}
advance(lexer);
if (lexer->lookahead != 'O') {
if (valid_symbols[TEMPLATE_LITERAL_CHUNK]) {
return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
} else {
return false;
}
}
advance(lexer);
if (lexer->lookahead != 'F') {
if (valid_symbols[TEMPLATE_LITERAL_CHUNK]) {
return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
} else {
return false;
}
}
return accept_and_advance(lexer, HEREDOC_IDENTIFIER);
}
// handle all other quoted template or string literal characters
if (valid_symbols[TEMPLATE_LITERAL_CHUNK] && in_template_context()) {
return accept_and_advance(lexer, TEMPLATE_LITERAL_CHUNK);
}
// probably not handled by the external scanner
return false;
}
private:
vector<Context> context_stack;
vector<string> heredoc_identifier_stack;
void advance(TSLexer* lexer) {
lexer->advance(lexer, false);
}
void skip(TSLexer* lexer) { lexer->advance(lexer, true); }
bool accept_inplace(TSLexer* lexer, TokenType token) {
lexer->result_symbol = token;
return true;
}
bool accept_and_advance(TSLexer* lexer, TokenType token) {
advance(lexer);
return accept_inplace(lexer, token);
}
bool consume_wxdigit(TSLexer* lexer) {
advance(lexer);
return iswxdigit(lexer->lookahead);
}
bool in_context_type(ContextType type) {
if (context_stack.empty()) {
return false;
}
return context_stack.back().type == type;
}
bool in_quoted_context() {
return in_context_type(QUOTED_TEMPLATE);
}
bool in_template_context() {
return in_context_type(QUOTED_TEMPLATE) || in_context_type(HEREDOC_TEMPLATE);
}
bool in_interpolation_context() {
return in_context_type(TEMPLATE_INTERPOLATION);
}
};
} // namespace
extern "C" {
// tree sitter callbacks
void* tree_sitter_hcl_external_scanner_create() {
return new Scanner();
}
void tree_sitter_hcl_external_scanner_destroy(void* p) {
Scanner* scanner = static_cast<Scanner*>(p);
delete scanner;
}
unsigned tree_sitter_hcl_external_scanner_serialize(void* p, char* b) {
Scanner* scanner = static_cast<Scanner*>(p);
return scanner->serialize(b);
}
void tree_sitter_hcl_external_scanner_deserialize(void* p, const char* b, unsigned n) {
Scanner* scanner = static_cast<Scanner*>(p);
return scanner->deserialize(b, n);
}
bool tree_sitter_hcl_external_scanner_scan(void* p, TSLexer* lexer, const bool* valid_symbols) {
Scanner* scanner = static_cast<Scanner*>(p);
return scanner->scan(lexer, valid_symbols);
}
} // extern "C"

View File

@@ -1,5 +1,5 @@
================================================================================
tuple
simple tuple
================================================================================
foo = [1, 2, "foo"]