[Universal Parser] Reduce dependence on RArray in parse.y

- Introduce `rb_parser_ary_t` structure to partly eliminate RArray from parse.y
  - In this change, `parser_params->tokens` and `parser_params->ast->node_buffer->tokens` are now `rb_parser_ary_t *`
  - Instead, `ast_node_all_tokens()` internally creates a Ruby Array object from the `rb_parser_ary_t`
  - Also, delete `rb_ast_tokens()` and `rb_ast_set_tokens()` in node.c
- Implement `rb_parser_str_escape()`
  - This is a port of the `rb_str_escape()` function in string.c
  - `rb_parser_str_escape()` does not depend on `VALUE` (RString)
  - Instead, it uses `rb_parser_string_t *`
  - This function works when the --dump=y option is passed
- Because the universal parser is still a work in progress, similar functions like `rb_parser_tokens_free()` exist in both node.c and parse.y. Refactoring them may be needed in some way in the future
- Although we considered redesigning the structure from `ast->node_buffer->tokens` to `ast->tokens`, we leave it as it is because `rb_ast_t` is an imemo. (We will address it in the future)
author: HASUMI Hitoshi <[email protected]> 2024-02-16 17:45:22 +0900
committer: Yuichiro Kaneko <[email protected]> 2024-03-12 17:17:52 +0900
commit: 9a19cfd4cd1a16528cc997e3a510c3046b83cdec ()
tree: 1d061cebf79d8aee39be26887b539f12b48a6f9f
parent: f42164e03700469a7000b4f00148a8ca01d75044 (diff)
7 files changed, 429 insertions, 215 deletions
@@ -774,10 +774,35 @@ ast_node_last_column(rb_execution_context_t *ec, VALUE self)
 static VALUE
 ast_node_all_tokens(rb_execution_context_t *ec, VALUE self)
 {
 struct ASTNodeData *data;
 TypedData_Get_Struct(self, struct ASTNodeData, &rb_node_type, data);
- return rb_ast_tokens(data->ast);
 }
 static VALUE
@@ -69,7 +69,7 @@ rb_node_buffer_new(void)
 init_node_buffer_list(&nb->unmarkable, (node_buffer_elem_t*)&nb[1], ruby_xmalloc);
 init_node_buffer_list(&nb->markable, (node_buffer_elem_t*)((size_t)nb->unmarkable.head + bucket_size), ruby_xmalloc);
 nb->local_tables = 0;
- nb->tokens = Qnil;
 #ifdef UNIVERSAL_PARSER
 nb->config = config;
 #endif
@@ -177,6 +177,24 @@ parser_string_free(rb_ast_t *ast, rb_parser_string_t *str)
 }
 static void
 free_ast_value(rb_ast_t *ast, void *ctx, NODE *node)
 {
 switch (nd_type(node)) {
@@ -228,6 +246,9 @@ free_ast_value(rb_ast_t *ast, void *ctx, NODE *node)
 static void
 rb_node_buffer_free(rb_ast_t *ast, node_buffer_t *nb)
 {
 iterate_node_values(ast, &nb->unmarkable, free_ast_value, NULL);
 node_buffer_list_free(ast, &nb->unmarkable);
 node_buffer_list_free(ast, &nb->markable);
@@ -388,8 +409,6 @@ void
 rb_ast_mark_and_move(rb_ast_t *ast, bool reference_updating)
 {
 if (ast->node_buffer) {
- rb_gc_mark_and_move(&ast->node_buffer->tokens);
-
 node_buffer_t *nb = ast->node_buffer;
 iterate_node_values(ast, &nb->markable, mark_and_move_ast_value, NULL);
@@ -439,18 +458,6 @@ rb_ast_dispose(rb_ast_t *ast)
 }
 VALUE
-rb_ast_tokens(rb_ast_t *ast)
-{
- return ast->node_buffer->tokens;
-}
-
-void
-rb_ast_set_tokens(rb_ast_t *ast, VALUE tokens)
-{
- RB_OBJ_WRITE(ast, &ast->node_buffer->tokens, tokens);
-}
-
-VALUE
 rb_node_set_type(NODE *n, enum node_type t)
 {
 #if RUBY_DEBUG
@@ -40,7 +40,7 @@ struct node_buffer_struct {
 // - text of token
 // - location info
 // Array, whose entry is array
- VALUE tokens;
 #ifdef UNIVERSAL_PARSER
 const rb_parser_config_t *config;
 #endif
@@ -55,7 +55,6 @@ rb_ast_t *rb_ast_new(void);
 #endif
 size_t rb_ast_memsize(const rb_ast_t*);
 void rb_ast_dispose(rb_ast_t*);
-VALUE rb_ast_tokens(rb_ast_t *ast);
 #if RUBY_DEBUG
 void rb_ast_node_type_change(NODE *n, enum node_type type);
 #endif
@@ -65,7 +64,6 @@ void rb_node_init(NODE *n, enum node_type type);
 void rb_ast_mark_and_move(rb_ast_t *ast, bool reference_updating);
 void rb_ast_update_references(rb_ast_t*);
 void rb_ast_free(rb_ast_t*);
-void rb_ast_set_tokens(rb_ast_t*, VALUE);
 NODE *rb_ast_newnode(rb_ast_t*, enum node_type type, size_t size, size_t alignment);
 void rb_ast_delete_node(rb_ast_t*, NODE *n);
 rb_ast_id_table_t *rb_ast_new_local_table(rb_ast_t*, int);
@@ -634,7 +634,7 @@ struct parser_params {
 /* id for terms */
 int token_id;
 /* Array for term tokens */
- VALUE tokens;
 #else
 /* Ripper only */
@@ -875,170 +875,170 @@ peek_end_expect_token_locations(struct parser_params *p)
 return p->end_expect_token_locations;
 }
-static ID
-parser_token2id(struct parser_params *p, enum yytokentype tok)
 {
 switch ((int) tok) {
-#define TOKEN2ID(tok) case tok: return rb_intern(#tok);
-#define TOKEN2ID2(tok, name) case tok: return rb_intern(name);
- TOKEN2ID2(' ', "words_sep")
- TOKEN2ID2('!', "!")
- TOKEN2ID2('%', "%");
- TOKEN2ID2('&', "&");
- TOKEN2ID2('*', "*");
- TOKEN2ID2('+', "+");
- TOKEN2ID2('-', "-");
- TOKEN2ID2('/', "/");
- TOKEN2ID2('<', "<");
- TOKEN2ID2('=', "=");
- TOKEN2ID2('>', ">");
- TOKEN2ID2('?', "?");
- TOKEN2ID2('^', "^");
- TOKEN2ID2('|', "|");
- TOKEN2ID2('~', "~");
- TOKEN2ID2(':', ":");
- TOKEN2ID2(',', ",");
- TOKEN2ID2('.', ".");
- TOKEN2ID2(';', ";");
- TOKEN2ID2('`', "`");
- TOKEN2ID2('\n', "nl");
- TOKEN2ID2('{', "{");
- TOKEN2ID2('}', "}");
- TOKEN2ID2('[', "[");
- TOKEN2ID2(']', "]");
- TOKEN2ID2('(', "(");
- TOKEN2ID2(')', ")");
- TOKEN2ID2('\\', "backslash");
- TOKEN2ID(keyword_class);
- TOKEN2ID(keyword_module);
- TOKEN2ID(keyword_def);
- TOKEN2ID(keyword_undef);
- TOKEN2ID(keyword_begin);
- TOKEN2ID(keyword_rescue);
- TOKEN2ID(keyword_ensure);
- TOKEN2ID(keyword_end);
- TOKEN2ID(keyword_if);
- TOKEN2ID(keyword_unless);
- TOKEN2ID(keyword_then);
- TOKEN2ID(keyword_elsif);
- TOKEN2ID(keyword_else);
- TOKEN2ID(keyword_case);
- TOKEN2ID(keyword_when);
- TOKEN2ID(keyword_while);
- TOKEN2ID(keyword_until);
- TOKEN2ID(keyword_for);
- TOKEN2ID(keyword_break);
- TOKEN2ID(keyword_next);
- TOKEN2ID(keyword_redo);
- TOKEN2ID(keyword_retry);
- TOKEN2ID(keyword_in);
- TOKEN2ID(keyword_do);
- TOKEN2ID(keyword_do_cond);
- TOKEN2ID(keyword_do_block);
- TOKEN2ID(keyword_do_LAMBDA);
- TOKEN2ID(keyword_return);
- TOKEN2ID(keyword_yield);
- TOKEN2ID(keyword_super);
- TOKEN2ID(keyword_self);
- TOKEN2ID(keyword_nil);
- TOKEN2ID(keyword_true);
- TOKEN2ID(keyword_false);
- TOKEN2ID(keyword_and);
- TOKEN2ID(keyword_or);
- TOKEN2ID(keyword_not);
- TOKEN2ID(modifier_if);
- TOKEN2ID(modifier_unless);
- TOKEN2ID(modifier_while);
- TOKEN2ID(modifier_until);
- TOKEN2ID(modifier_rescue);
- TOKEN2ID(keyword_alias);
- TOKEN2ID(keyword_defined);
- TOKEN2ID(keyword_BEGIN);
- TOKEN2ID(keyword_END);
- TOKEN2ID(keyword__LINE__);
- TOKEN2ID(keyword__FILE__);
- TOKEN2ID(keyword__ENCODING__);
- TOKEN2ID(tIDENTIFIER);
- TOKEN2ID(tFID);
- TOKEN2ID(tGVAR);
- TOKEN2ID(tIVAR);
- TOKEN2ID(tCONSTANT);
- TOKEN2ID(tCVAR);
- TOKEN2ID(tLABEL);
- TOKEN2ID(tINTEGER);
- TOKEN2ID(tFLOAT);
- TOKEN2ID(tRATIONAL);
- TOKEN2ID(tIMAGINARY);
- TOKEN2ID(tCHAR);
- TOKEN2ID(tNTH_REF);
- TOKEN2ID(tBACK_REF);
- TOKEN2ID(tSTRING_CONTENT);
- TOKEN2ID(tREGEXP_END);
- TOKEN2ID(tDUMNY_END);
- TOKEN2ID(tSP);
- TOKEN2ID(tUPLUS);
- TOKEN2ID(tUMINUS);
- TOKEN2ID(tPOW);
- TOKEN2ID(tCMP);
- TOKEN2ID(tEQ);
- TOKEN2ID(tEQQ);
- TOKEN2ID(tNEQ);
- TOKEN2ID(tGEQ);
- TOKEN2ID(tLEQ);
- TOKEN2ID(tANDOP);
- TOKEN2ID(tOROP);
- TOKEN2ID(tMATCH);
- TOKEN2ID(tNMATCH);
- TOKEN2ID(tDOT2);
- TOKEN2ID(tDOT3);
- TOKEN2ID(tBDOT2);
- TOKEN2ID(tBDOT3);
- TOKEN2ID(tAREF);
- TOKEN2ID(tASET);
- TOKEN2ID(tLSHFT);
- TOKEN2ID(tRSHFT);
- TOKEN2ID(tANDDOT);
- TOKEN2ID(tCOLON2);
- TOKEN2ID(tCOLON3);
- TOKEN2ID(tOP_ASGN);
- TOKEN2ID(tASSOC);
- TOKEN2ID(tLPAREN);
- TOKEN2ID(tLPAREN_ARG);
- TOKEN2ID(tRPAREN);
- TOKEN2ID(tLBRACK);
- TOKEN2ID(tLBRACE);
- TOKEN2ID(tLBRACE_ARG);
- TOKEN2ID(tSTAR);
- TOKEN2ID(tDSTAR);
- TOKEN2ID(tAMPER);
- TOKEN2ID(tLAMBDA);
- TOKEN2ID(tSYMBEG);
- TOKEN2ID(tSTRING_BEG);
- TOKEN2ID(tXSTRING_BEG);
- TOKEN2ID(tREGEXP_BEG);
- TOKEN2ID(tWORDS_BEG);
- TOKEN2ID(tQWORDS_BEG);
- TOKEN2ID(tSYMBOLS_BEG);
- TOKEN2ID(tQSYMBOLS_BEG);
- TOKEN2ID(tSTRING_END);
- TOKEN2ID(tSTRING_DEND);
- TOKEN2ID(tSTRING_DBEG);
- TOKEN2ID(tSTRING_DVAR);
- TOKEN2ID(tLAMBEG);
- TOKEN2ID(tLABEL_END);
- TOKEN2ID(tIGNORED_NL);
- TOKEN2ID(tCOMMENT);
- TOKEN2ID(tEMBDOC_BEG);
- TOKEN2ID(tEMBDOC);
- TOKEN2ID(tEMBDOC_END);
- TOKEN2ID(tHEREDOC_BEG);
- TOKEN2ID(tHEREDOC_END);
- TOKEN2ID(k__END__);
- TOKEN2ID(tLOWEST);
- TOKEN2ID(tUMINUS_NUM);
- TOKEN2ID(tLAST_TOKEN);
-#undef TOKEN2ID
-#undef TOKEN2ID2
 }
 rb_bug("parser_token2id: unknown token %d", tok);
@@ -2565,8 +2565,8 @@ rb_parser_str_resize(struct parser_params *p, rb_parser_string_t *str, long len)
 return str;
 }
-#ifndef UNIVERSAL_PARSER
 #ifndef RIPPER
 # define PARSER_ENC_STRING_GETMEM(str, ptrvar, lenvar, encvar) \
 ((ptrvar) = str->ptr, \
 (lenvar) = str->len, \
@@ -2587,7 +2587,73 @@ rb_parser_string_hash_cmp(rb_parser_string_t *str1, rb_parser_string_t *str2)
 memcmp(ptr1, ptr2, len1) != 0);
 }
 #endif
-#endif
 %}
 %expect 0
@@ -7035,35 +7101,100 @@ parser_has_token(struct parser_params *p)
 return pcur > ptok;
 }
-static VALUE
-code_loc_to_ary(struct parser_params *p, const rb_code_location_t *loc)
 {
- VALUE ary = rb_ary_new_from_args(4,
- INT2NUM(loc->beg_pos.lineno), INT2NUM(loc->beg_pos.column),
- INT2NUM(loc->end_pos.lineno), INT2NUM(loc->end_pos.column));
- rb_obj_freeze(ary);
-
- return ary;
 }
-static void
-parser_append_tokens(struct parser_params *p, VALUE str, enum yytokentype t, int line)
 {
- VALUE ary;
- int token_id;
- ary = rb_ary_new2(4);
- token_id = p->token_id;
- rb_ary_push(ary, INT2FIX(token_id));
- rb_ary_push(ary, ID2SYM(parser_token2id(p, t)));
- rb_ary_push(ary, str);
- rb_ary_push(ary, code_loc_to_ary(p, p->yylloc));
- rb_obj_freeze(ary);
- rb_ary_push(p->tokens, ary);
 p->token_id++;
 if (p->debug) {
- rb_parser_printf(p, "Append tokens (line: %d) %"PRIsVALUE"\n", line, ary);
 }
 }
@@ -7077,7 +7208,7 @@ parser_dis_scan_event(struct parser_params *p, enum yytokentype t, int line
 RUBY_SET_YYLLOC(*p->yylloc);
 if (p->keep_tokens) {
- VALUE str = STR_NEW(p->lex.ptok, p->lex.pcur - p->lex.ptok);
 parser_append_tokens(p, str, t, line);
 }
@@ -7095,7 +7226,8 @@ parser_dis_delayed_token(struct parser_params *p, enum yytokentype t, int l
 RUBY_SET_YYLLOC_OF_DELAYED_TOKEN(*p->yylloc);
 if (p->keep_tokens) {
- parser_append_tokens(p, p->delayed.token, t, line);
 }
 p->delayed.token = Qnil;
@@ -7607,7 +7739,7 @@ yycompile0(VALUE arg)
 tree = NEW_NIL(&NULL_LOC);
 }
 else {
- VALUE tokens = p->tokens;
 NODE *prelude;
 NODE *body = parser_append_options(p, RNODE_SCOPE(tree)->nd_body);
 prelude = block_append(p, p->eval_tree_begin, body);
@@ -7615,8 +7747,8 @@ yycompile0(VALUE arg)
 p->ast->body.frozen_string_literal = p->frozen_string_literal;
 p->ast->body.coverage_enabled = cov;
 if (p->keep_tokens) {
- rb_obj_freeze(tokens);
- rb_ast_set_tokens(p->ast, tokens);
 }
 }
 p->ast->body.root = tree;
@@ -9230,7 +9362,7 @@ parser_dis_heredoc_end(struct parser_params *p, int line)
 dis_delayed_token(p, tSTRING_CONTENT);
 if (p->keep_tokens) {
- VALUE str = STR_NEW(p->lex.ptok, p->lex.pend - p->lex.ptok);
 RUBY_SET_YYLLOC_OF_HEREDOC_END(*p->yylloc);
 parser_append_tokens(p, str, tHEREDOC_END, line);
 }
@@ -15973,7 +16105,7 @@ parser_initialize(struct parser_params *p)
 p->error_buffer = Qfalse;
 p->end_expect_token_locations = NULL;
 p->token_id = 0;
- p->tokens = Qnil;
 #else
 p->result = Qnil;
 p->parsing_thread = Qnil;
@@ -16006,7 +16138,6 @@ rb_ruby_parser_mark(void *ptr)
 #ifndef RIPPER
 rb_gc_mark(p->debug_lines);
 rb_gc_mark(p->error_buffer);
- rb_gc_mark(p->tokens);
 #else
 rb_gc_mark(p->value);
 rb_gc_mark(p->result);
@@ -16028,6 +16159,12 @@ rb_ruby_parser_free(void *ptr)
 struct parser_params *p = (struct parser_params*)ptr;
 struct local_vars *local, *prev;
 if (p->tokenbuf) {
 ruby_sized_xfree(p->tokenbuf, p->toksiz);
 }
@@ -16145,8 +16282,7 @@ void
 rb_ruby_parser_keep_tokens(rb_parser_t *p)
 {
 p->keep_tokens = 1;
- // TODO
- p->tokens = rb_ary_new();
 }
 #ifndef UNIVERSAL_PARSER
@@ -461,6 +461,27 @@ str_coderange_scan_restartable(const char *s, const char *e, void *enc, int *cr)
 return rb_str_coderange_scan_restartable(s, e, (rb_encoding *)enc, cr);
 }
 VALUE rb_io_gets_internal(VALUE io);
 extern VALUE rb_eArgError;
 extern VALUE rb_mRubyVMFrozenCore;
@@ -596,6 +617,10 @@ static const rb_parser_config_t rb_global_parser_config = {
 .encoding_set = encoding_set,
 .encoding_is_ascii8bit = encoding_is_ascii8bit,
 .usascii_encoding = usascii_encoding,
 .ractor_make_shareable = rb_ractor_make_shareable,
@@ -189,6 +189,22 @@ typedef struct rb_code_location_struct {
 rb_code_position_t end_pos;
 } rb_code_location_t;
 /* Header part of AST Node */
 typedef struct RNode {
 VALUE flags;
@@ -1340,6 +1356,10 @@ typedef struct rb_parser_config_struct {
 void (*encoding_set)(VALUE obj, int encindex);
 int (*encoding_is_ascii8bit)(VALUE obj);
 rb_encoding *(*usascii_encoding)(void);
 /* Ractor */
 VALUE (*ractor_make_shareable)(VALUE obj);
@@ -293,6 +293,9 @@ struct rb_imemo_tmpbuf_struct {
 #define rb_mRubyVMFrozenCore p->config->mRubyVMFrozenCore()
 #undef rb_long2int
 #define rb_long2int p->config->long2int
 #define rb_node_case_when_optimizable_literal p->config->node_case_when_optimizable_literal
author	HASUMI Hitoshi <[email protected]>	2024-02-16 17:45:22 +0900
committer	Yuichiro Kaneko <[email protected]>	2024-03-12 17:17:52 +0900
commit	9a19cfd4cd1a16528cc997e3a510c3046b83cdec ()
tree	1d061cebf79d8aee39be26887b539f12b48a6f9f
parent	f42164e03700469a7000b4f00148a8ca01d75044 (diff)