From 1ea2557f7edd321ff4ef1c5fb8c1fd6302fa5e19 Mon Sep 17 00:00:00 2001 From: Ian Piumarta Date: Thu, 20 Jun 2024 02:34:32 +0900 Subject: [PATCH] Fix digitValue() and reading of numerical escaped characters in strings. Indexing a string always returns an unsigned char value. Add PEG VM and corresponding primitive __match__(). --- minproto.leg | 534 +++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 496 insertions(+), 38 deletions(-) diff --git a/minproto.leg b/minproto.leg index 520ab14..20867bd 100644 --- a/minproto.leg +++ b/minproto.leg @@ -1,9 +1,9 @@ # minproto.leg -- minimal prototype langauge for semantic experiments # -# last edited: 2024-06-13 14:47:05 by piumarta on zora +# last edited: 2024-06-20 02:30:46 by piumarta on m1mbp %{ -; + ; //#define YY_DEBUG 1 #ifndef GC @@ -46,6 +46,10 @@ # define PROFILE 0 #endif +#ifndef PEGVM // include parsing expression grammar VM +# define PEGVM 1 +#endif + #include #include #include @@ -139,7 +143,7 @@ doTypes(makeProto); doProperties(declareProp); #undef declareProp -#define doSymbols(_) _(t) _(name) _(expr) _(function) _(arguments) _(object) _(index) _(key) _(value) _(self) _(method) _(parameters) _(body) _(lambda) _(environment) _(operation) _(full) _(condition) _(consequent) _(alternate) _(expression) _(identifier) _(initialise) _(update) _(first) _(last) _(fixed) _(keyvals) _(__namespaces__) _(O) _(d) _(p) _(v) _(statement) _(handler) _(kind) _(message) _(operand1) _(operand2) _(profile) _(parent) _(count) _(stamp) _(time) _(start) _(stop) +#define doSymbols(_) _(t) _(name) _(expr) _(function) _(arguments) _(object) _(index) _(key) _(value) _(self) _(method) _(parameters) _(body) _(lambda) _(environment) _(operation) _(full) _(condition) _(consequent) _(alternate) _(expression) _(identifier) _(initialise) _(update) _(first) _(last) _(fixed) _(keyvals) _(__namespaces__) _(O) _(d) _(p) _(v) _(statement) _(handler) _(kind) _(message) _(operand1) _(operand2) _(profile) _(parent) _(count) _(stamp) _(time) _(start) _(stop) _($$) _(yytext) _(yyleng) #define declareSym(NAME) oop sym_##NAME = 0; doSymbols(declareSym); @@ -168,11 +172,11 @@ int nnlrs = 0; int maxnlrs = 0; oop valnlr = 0; -#define nlrPush() ({ \ - if (++nnlrs >= maxnlrs) nlrs = realloc(nlrs, sizeof(struct NLR) * ++maxnlrs); \ - nlrs[nnlrs - 1].ntrace = _get(trace, Object,isize); \ - setjmp(nlrs[nnlrs - 1].env); \ - }) +#define nlrPush() ({ \ + if (++nnlrs >= maxnlrs) nlrs = realloc(nlrs, sizeof(struct NLR) * ++maxnlrs); \ + nlrs[nnlrs - 1].ntrace = _get(trace, Object,isize); \ + setjmp(nlrs[nnlrs - 1].env); \ + }) #define nlrReturn(VAL, TYPE) { \ valnlr = VAL; \ @@ -366,26 +370,6 @@ oop newString(char *value) #define isString(obj) is(String, obj) -int digitValue(int digit, int base) -{ - if ('a' <= digit && digit <= 'z') digit -= 'a' - 10; - else if ('A' <= digit && digit <= 'Z') digit -= 'A' - 10; - else if (digit < '0' || digit > '9') return -1; - return (digit < base) ? digit : -1; -} - -int readCharValue(char **stringp, int base, int limit) -{ - char *string = *stringp; - int value = 0, d = 0; - while (limit-- && *string && (d = digitValue(*string, base)) >= 0) { - ++string; - value = value * base + d; - } - *stringp = string; - return value; -} - int String_length(oop str) { return get(str, String,length); } oop String_reset (oop str) { set(str, String,length, 0); return str; } @@ -470,6 +454,27 @@ oop String_repeat(oop s, int n) return result; } +int digitValue(int digit, int base) +{ + if ('0' <= digit && digit <= '9') digit -= '0'; + else if ('a' <= digit && digit <= 'z') digit -= 'a' - 10; + else if ('A' <= digit && digit <= 'Z') digit -= 'A' - 10; + else return -1; + return (digit < base) ? digit : -1; +} + +int readCharValue(char **stringp, int base, int limit) +{ + char *string = *stringp; + int value = 0, d = 0; + while (limit-- && *string && (d = digitValue(*string, base)) >= 0) { + ++string; + value = value * base + d; + } + *stringp = string; + return value; +} + oop newStringUnescaped(char *string) { oop buf = newStringLen(0, 0); @@ -489,9 +494,9 @@ oop newStringUnescaped(char *string) case 't' : c = '\t'; break; case 'v' : c = '\v'; break; case 'X' : - case 'x' : c = readCharValue(&string, 16, -1); break; - case '0' : c = readCharValue(&string, 8, 3); break; - default : warning("illegal character escape sequence: \\%c", c); break; + case 'x' : c = readCharValue(&string, 16, -1); break; + case '0'...'7': --string; c = readCharValue(&string, 8, 3); break; + default : warning("illegal character escape sequence: \\%c", c); break; } } String_append(buf, c); @@ -525,7 +530,6 @@ oop String_escaped(oop obj) String_format(buf, "\\%c", c); } } - return buf; } @@ -1436,7 +1440,7 @@ oop storeOn(oop buf, oop obj, int indent) char *str = _get(obj, String,value); int len = _get(obj, String,length); for (int i = 0; i < len; ++i) { - int c = str[i]; + int c = (unsigned char)str[i]; switch (c) { case '\a': String_appendAll(buf, "\\a"); break; case '\b': String_appendAll(buf, "\\b"); break; @@ -1448,7 +1452,7 @@ oop storeOn(oop buf, oop obj, int indent) case '"': String_appendAll(buf, "\\\""); break; case '\\': String_appendAll(buf, "\\\\"); break; default: - if (c < ' ' || c > '~') String_format(buf, "\\%04o", c); + if (c < ' ' || c > '~') String_format(buf, "\\%03o", c); else String_append(buf, c); break; } @@ -1997,13 +2001,13 @@ oop GetArray_eval(oop exp, oop env) if (isInteger(ind)) { int index = _integerValue(ind); switch (getType(obj)) { - case String: return newInteger(*String_aref(obj, index)); - case Symbol: return newInteger(*Symbol_aref(obj, index)); + case String: return newInteger(*(unsigned char *)String_aref(obj, index)); + case Symbol: return newInteger(*(unsigned char *)Symbol_aref(obj, index)); case Object: return *Object_aref(obj, index); default: typeError("[]", "non-indexable object", obj); } } - if (!is(Object, obj)) typeError("[]", "non-indexable object", obj); + if (!is(Object, obj)) typeError("[]", "non-associative object", obj); return Object_getLocal(obj, ind); } @@ -2039,7 +2043,7 @@ oop SetArray_eval(oop exp, oop env) } return val; } - if (!is(Object, obj)) typeError("[]=", "non-indexable object", obj); + if (!is(Object, obj)) typeError("[]=", "non-associative object", obj); return Object_put(obj, ind, val); } @@ -4686,6 +4690,459 @@ oop prim_externalCall(oop func, oop self, oop args, oop env) return 0; } +#if PEGVM + +oop applyThunkIn(oop func, oop env) +{ + int functype = getType(func); + if (Primitive == functype) { +# if PROFILE + oop profile = nil; + if (opt_p) { + profile = _get(func, Primitive,profile); + if (nil == profile) profile = _set(func, Primitive,profile, profileInit(func)); + profileTick(profile); + } +# endif + oop result = _get(func, Primitive,function)(func, nil, nil, env); +# if PROFILE + if (opt_p) profileTock(profile); +# endif + return result; + } +#if PRIMCLOSURE + if (Closure != functype) valueError("()", "cannot apply", func); + oop lambda = _get(func, Closure,function); + oop body = _get(lambda, Lambda,body); +#else + if (Object != functype || pClosure != _getDelegate(func)) + valueError(nil == self ? "()" : ".()", "cannot apply", func); + oop lambda = Object_get(func, sym_function); + oop parameters = Object_get(lambda, sym_parameters); +# if PROFILE + oop profile = nil; + if (opt_p) { + profile = Object_getLocal(lambda, sym_profile); + if (nil == profile) profile = Object_put(lambda, sym_profile, profileInit(lambda)); + profileTick(profile); + } +# endif +#endif + oop *exprs = get(body, Object,indexed); + int size = _get(body, Object,isize); + oop result = nil; + oop args = new(pObject); + // inherit from closure's captured environment + _setDelegate(args, env); +# if NONLOCAL + switch (nlrPush()) { + case NLR_CONTINUE: syntaxError("continue outside loop"); + case NLR_BREAK: syntaxError("break outside loop"); + case NLR_RETURN: return nlrPop(); + case NLR_RAISE: nlrReturn(nlrPop(), NLR_RAISE); + } +# endif + for (int i = 0; i < size; ++i) + result = eval(exprs[i], args); +# if NONLOCAL + nlrPop(); +# endif +# if PROFILE + if (opt_p) profileTock(profile); +# endif + return result; +} + +typedef unsigned char byte; + +typedef enum op_t { + PUSH, DROP, POP, DOT, CLASS, STRING, RULE, CALL, + SUCCEED, FAIL, ACTION, BEGIN, END, UNEND, SET, +} op_t; + +char *op_n[] = { + "PUSH", "DROP", "POP", "DOT", "CLASS", "STRING", "RULE", "CALL", + "SUCCEED", "FAIL", "ACTION", "BEGIN", "END", "UNEND", "SET", +}; + +typedef struct vmInsn vmInsn; + +struct vmInsn { + union { + char *str; + oop obj; + vmInsn *code; + } arg; + unsigned short op, arglen, ok, ko; +}; + +typedef struct vmState +{ + oop result; + oop variables; +} vmState; + +#define VM_STATE_INITIALISER { nil, nil } + +void vmEnter(vmState *state, oop obj, char *yytext, int yyleng) +{ + state->variables = new(state->variables); + Object_put(state->variables, sym_$$, state->result); +} + +void vmSet(vmState *state, oop obj, char *yytext, int yyleng) +{ + Object_put(state->variables, obj, state->result); +} + +void vmAction(vmState *state, oop obj, char *yytext, int yyleng) +{ + oop text = yyleng ? newStringLen(yytext, yyleng) : nil; + Object_put(state->variables, sym_yytext, text); + Object_put(state->variables, sym_yyleng, newInteger(yyleng)); + applyThunkIn(obj, state->variables); +} + +void vmLeave(vmState *state, oop obj, char *yytext, int yyleng) +{ + state->result = Object_getLocal(state->variables, sym_$$); + state->variables = _getDelegate(state->variables); +} + +void vmDisassemble(vmInsn *code, int pc) +{ + vmInsn *i = &code[pc]; + printf("%p ", code); + switch (i->op) { + case CLASS: + case STRING: + printf("%03d %-7s \"%s\" %2d %2d %2d\n", + pc, op_n[i->op], i->arg.str, i->arglen, i->ok, i->ko); + break; + case CALL: + printf("%03d %-7s %p %2d %2d %2d\n", + pc, op_n[i->op], i->arg.code, i->arglen, i->ok, i->ko); + break; + default: + printf("%03d %-7s %s %2d %2d %2d\n", + pc, op_n[i->op], codeString(i->arg.obj, 0), i->arglen, i->ok, i->ko); + break; + } +} + +oop vmCache = 0; + +void vmCachePut(oop grammar, oop symbol, vmInsn *code) +{ + oop *ref = Object_refLocal(vmCache, grammar); + if (UNDEFINED == ref) { + Object_put(vmCache, grammar, new(pObject)); + ref = _refvar(vmCache, grammar); + } + oop line = *ref; + ref = Object_refLocal(line, symbol); + if (UNDEFINED != ref) fatal("vm cache collision"); + Object_put(line, symbol, mkptr(code)); +} + +vmInsn *vmCompile(oop grammar, oop symbol) +{ + oop program = Object_get(grammar, symbol); + if (!is(Object, program)) valueError("__match__", "program is not an object", program); + oop *prog = _get(program, Object,indexed); + int plen = _get(program, Object,isize); + if (plen % 5 != 0) valueError("__match__", "program length is not a multiple of five", program); + vmInsn *code = calloc(plen / 5, sizeof(*code)); + vmCachePut(grammar, symbol, code); + oop env = nil; + int ppc = 0; + int cpc = 0; + while (ppc < plen) { + int op = integerValue(prog[ppc++], "__match__"); + oop arg = prog[ppc++]; + int arglen = integerValue(prog[ppc++], "__match__"); + int ok = integerValue(prog[ppc++], "__match__"); + int ko = integerValue(prog[ppc++], "__match__"); + code[cpc] = (vmInsn){ .arg.obj = arg, .op = op, .arglen = arglen, .ok = ok, .ko = ko }; + switch (op) { + case CLASS: + case STRING: { + code[cpc].arg.str = stringValue(code[cpc].arg.obj, "__match__"); + break; + } + case RULE: { + if (!is(Symbol, code[cpc].arg.obj)) valueError("__match__", "RULE argument must be a symbol", program); + break; + } + case CALL: { + valueError("__match__", "program contains CALL opcode", program); + break; + } + case ACTION: { + if (!isClosure(code[cpc].arg.obj)) valueError("__match__", "ACTION argument is not a function", program); + break; + } + case SET: { + if (!is(Symbol, code[cpc].arg.obj)) valueError("__match__", "SET argument is not a symbol", program); + if (nil == env) env = new(pObject); + Object_put(env, code[cpc].arg.obj, nil); + break; + } + default: { + break; + } + } + ++cpc; + } +#if 0 + if (!cpc || code[0].op != ENTER) valueError("__match__", "program does not begin with ENTER", program); + code[0].arg.obj = env; // clone this to make a list of local variables for the rule +#endif + + if (opt_d) { + printf("---- BEGIN "); println(symbol, 0); + for (int pc = 0; pc < cpc; ++pc) vmDisassemble(code, pc); + printf("---- END "); println(symbol, 0); + } + + return code; +} + +vmInsn *vmCacheGet(oop grammar, oop symbol) +{ + oop *ref = Object_refLocal(vmCache, grammar); + if (UNDEFINED != ref) { + oop line = *ref; + ref = Object_refLocal(line, symbol); + if (UNDEFINED != ref) + return (vmInsn *)(intptr_t)_integerValue(*ref); + } + return vmCompile(grammar, symbol); +} + +int vmRun(oop grammar, oop symbol, char *text, int start, int length) +{ + vmCache = new(pObject); + + vmInsn *code = vmCacheGet(grammar, symbol); + + struct Action { + void (*function)(vmState *state, oop object, char *yytext, int yyleng); + oop object; + int textbeg, textlen; + } *actions = 0; + + struct Context { // for back-tracking + int position; + int nactions; + } *cstack, context; + + int csp = 0, ncstack = 32; + cstack = malloc(sizeof(*cstack) * ncstack); + context.position = start; + context.nactions = 0; + + vmState state = VM_STATE_INITIALISER; + +# define saveAction(ACT, OBJ, BEG, LEN) { \ + actions = realloc(actions, sizeof(*actions) * (context.nactions + 1)); \ + actions[context.nactions++] = (struct Action){ ACT, OBJ, BEG, LEN }; \ + } + + struct Frame { + oop symbol; + vmInsn *code; + int pc; + int nactions; + } *rstack, frame; + + int rsp = 0, nrstack = 32; + rstack = malloc(sizeof(*rstack) * nrstack); + frame.symbol = symbol; + frame.code = vmCacheGet(grammar, symbol); + frame.pc = 0; + frame.nactions = context.nactions; + + + int textbeg = 0, textend = 0; + int result = 0; + +#define push(C, X) { \ + if (C##sp >= n##C##stack) C##stack = realloc(C##stack, sizeof(*C##stack) * (n##C##stack *= 2)); \ + C##stack[C##sp++] = (X); \ + } + +#define drop(C) { \ + assert(C##sp); \ + --C##sp; \ + } + +#define pop(C) ({ \ + assert(C##sp); \ + C##stack[--C##sp]; \ + }) + + for (;;) { + if (opt_d) vmDisassemble(frame.code, frame.pc); + vmInsn *i = frame.code + frame.pc++; + switch (i->op) { + case PUSH: push(c, context); frame.pc = i->ok; continue; + case DROP: drop(c); frame.pc = i->ok; continue; + case POP: context = pop(c); frame.pc = i->ok; continue; + case DOT: { + if (context.position < length) { + context.position++; + frame.pc = i->ok; + continue; + } + frame.pc = i->ko; + continue; + } + case CLASS: { + if (context.position < length) { + byte c = text[context.position]; + if ((((byte *)i->arg.str)[c/8] >> (c % 8)) & 1) { + context.position++; + frame.pc = i->ok; + continue; + } + } + frame.pc = i->ko; + continue; + } + case STRING: { + if (context.position + i->arglen <= length) { + if (0 == memcmp(text + context.position, i->arg.str, i->arglen)) { + context.position += i->arglen; + frame.pc = i->ok; + continue; + } + } + frame.pc = i->ko; + continue; + } + case RULE: { + // frame.pc--; // save pc of call insn + // push(r, frame); + // frame.code = vmCacheGet(grammar, i->arg.obj); + // frame.pc = 0; + // if (((Node *)(i->arg))->Symbol.nvars) { + // saveAction(enter, i->arg, 0, 0); + // frame.nactions = context.nactions; + // } + // continue; + //i->op = CALL; + //i->arglen = ((Node *)i->arg)->Symbol.nvars; + //i->arg = ((Node *)i->arg)->Symbol.code; assert(i->arg); + i->op = CALL; + i->arg.code = vmCacheGet(grammar, i->arg.obj); + goto doCall; // (just in case they are not consecutive ;-)) + } + case CALL: { doCall: + frame.pc--; // save pc of call insn + push(r, frame); + frame.code = i->arg.code; + frame.pc = 0; + saveAction(vmEnter, nil, 0, 0); + frame.nactions = context.nactions; + continue; + } + case SUCCEED: { + if (frame.nactions == context.nactions) // no actions were added + context.nactions--; // remove the enter action + else + saveAction(vmLeave, nil, 0, 0); + if (rsp) { + frame = pop(r); + i = frame.code + frame.pc; + frame.pc = i->ok; + continue; + } + result = context.position - start; + break; + } + case FAIL: { + if (rsp) { + frame = pop(r); + i = frame.code + frame.pc; + frame.pc = i->ko; + continue; + } + result = -1; + break; + } + case ACTION: { + // printf("--> ACTION %d [%d %d]\n", context.nactions, textbeg, textend); + saveAction(vmAction, i->arg.obj, textbeg, textend - textbeg); + frame.pc = i->ok; + continue; + } + case BEGIN: { + textbeg = textend = context.position; + frame.pc = i->ok; + continue; + } + case END: { + textend = context.position; + frame.pc = i->ok; + continue; + } + case UNEND: { + textbeg = textend = 0; + frame.pc = i->ok; + continue; + } + case SET: { + saveAction(vmSet, i->arg.obj, 0, 0); + frame.pc = i->ok; + continue; + } + default: { + fatal("this cannot happen"); + break; + } + } + break; + } + + saveAction(vmLeave, nil, 0, 0); + +#undef pop +#undef drop +#undef push + + free(cstack); + free(rstack); + + for (int i = 0; i < context.nactions; ++i) { + char *yytext = text + actions[i].textbeg; + int yyleng = actions[i].textlen; + // printf("==> ACTION %d [%d %d]\n", i, actions[i].textbeg, actions[i].textbeg + actions[i].textlen); + actions[i].function(&state, actions[i].object, yytext, yyleng); + } + + vmCache = nil; + return result; +} + +oop prim_match(oop func, oop self, oop args, oop env) +{ + oop grammar = getArg (args, 0, "__match__"); + oop symbol = getArgType(args, 1, Symbol, "__match__"); + oop string = getArgType(args, 2, String, "__match__"); + int start = _integerValue(getArgType(args, 3, Integer, "__match__")); + + char *text = get(string, String,value); + int length = get(string, String,length); + oop program = Object_get(grammar, symbol); + + int result = vmRun(grammar, symbol, text, start, length); + + return newInteger(result); +} + +#endif // PEGVM + oop replFile(FILE *in) { int oldline = lineno; @@ -4855,6 +5312,7 @@ int main(int argc, char **argv) prim(error , prim_error); prim(defined , prim_defined); prim(__extern__ , prim_extern); + prim(__match__ , prim_match); # undef prim